## Building a Baseline Model 

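### Going to use Linear Regression
*Note: the pipeline defined below currently fits an RBF-kernel `SVR` rather than `LinearRegression`.*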
#### Steps:
1. One-hot encoding of the categorical columns
2. Scaling of the numeric columns
3. Log transformation of the `price` (output) column

In [33]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [94]:
# temp = pd.read_csv("Datasets/gurgaon_properties_post_feature_selection.csv")
# temp

In [95]:
# Load the post-feature-selection property data and preview the first five rows
df = pd.read_csv(filepath_or_buffer="gurgaon_properties_post_feature_selection.csv")
df.head(n=5)

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category,price
0,0.0,36.0,3.0,2.0,2.0,1.0,850.0,0.0,0.0,0.0,1.0,1.0,0.82
1,0.0,95.0,2.0,2.0,2.0,1.0,1226.0,1.0,0.0,0.0,1.0,2.0,0.95
2,0.0,103.0,2.0,2.0,1.0,1.0,1000.0,0.0,0.0,0.0,1.0,0.0,0.32
3,0.0,99.0,3.0,4.0,4.0,3.0,1615.0,1.0,0.0,1.0,0.0,2.0,1.6
4,0.0,5.0,2.0,2.0,1.0,3.0,582.0,0.0,1.0,0.0,0.0,2.0,0.48


In [174]:
# Cast the columns below to int64 in a single astype call
df = df.astype(
    {
        code_col: "int64"
        for code_col in ("sector", "property_type", "luxury_category", "floor_category")
    }
)

In [175]:
# Step 1 (one-hot encoding) will target these columns:
# sector, balcony, agePossession, furnishing type, luxury category, floor category

# Separate the label y (price) from the feature matrix X (every other column)
y = df["price"]
X = df.drop(columns="price")

In [176]:
# import all the necessary libraries 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

In [177]:
# Feature columns that are one-hot encoded by the preprocessing step
columns_to_encode = [
    "sector",
    "balcony",
    "agePossession",
    "furnishing_type",
    "luxury_category",
    "floor_category",
]

In [178]:
# Apply log1p to the target variable (the price column is right-skewed).
# Everything fit on y_transformed therefore predicts on the log1p scale; the
# evaluation cells map values back with np.expm1 before computing the error.
y_transformed = np.log1p(y)

In [184]:
# Column transformer for preprocessing: standardise the numeric columns and
# one-hot encode the columns listed in columns_to_encode.
# handle_unknown="ignore" makes a category that was absent from the training
# split (e.g. a rare sector in one CV fold) encode as all zeros instead of
# raising at transform time, which would otherwise fail that fold / prediction.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "num",
            StandardScaler(),
            ['property_type', 'bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room'],
        ),
        (
            "cat",
            OneHotEncoder(drop="first", handle_unknown="ignore"),
            columns_to_encode,
        ),
    ],
    # Any column not named in a transformer above is forwarded unchanged
    remainder="passthrough",
)

In [185]:
# Bundle preprocessing and the RBF-kernel SVR into one estimator so the
# preprocessing is fit as part of every call to fit()
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", SVR(kernel="rbf")),
    ]
)

In [186]:
# 10-fold cross-validation (shuffled, fixed seed), scored with R^2 on the
# log-transformed target
kfold = KFold(n_splits=10, random_state=42, shuffle=True)
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y_transformed,
    scoring="r2",
    cv=kfold,
)

In [187]:
# Average R^2 across the cross-validation folds
np.mean(scores)

0.8872972455527514

In [188]:
# Spread (population standard deviation) of R^2 across the folds
np.std(scores)

0.016622291957156702

In [189]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows (fixed seed); the targets stay on the log1p scale
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_transformed,
    random_state=42,
    test_size=0.2,
)

In [190]:
# Fit preprocessing + regressor on the training split (log1p-scale targets)
pipeline.fit(X=X_train, y=y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num', StandardScaler(),
                                                  ['property_type', 'bedRoom',
                                                   'bathroom', 'built_up_area',
                                                   'servant room',
                                                   'store room']),
                                                 ('cat',
                                                  OneHotEncoder(drop='first'),
                                                  ['sector', 'balcony',
                                                   'agePossession',
                                                   'furnishing_type',
                                                   'luxury_category',
                                                   'floor_category'])])),
                ('regressor', SVR())])

In [191]:
# Predict on the held-out rows; the model was fit on log1p(price), so these
# values are on the log1p scale
y_pred = pipeline.predict(X=X_test)

In [192]:
# Map the predictions from the log1p scale back to the price scale.
# Predict afresh instead of transforming y_pred in place: the original
# `y_pred = np.expm1(y_pred)` was not idempotent, so re-running the cell
# applied expm1 a second time to already back-transformed values.
y_pred = np.expm1(pipeline.predict(X_test))

In [193]:
from sklearn.metrics import mean_absolute_error

# Undo the log1p on the held-out targets so the MAE is reported on the price scale
mean_absolute_error(y_true=np.expm1(y_test), y_pred=y_pred)

0.4757702198395371