In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('gurgaon_properties_post_feature_selection.csv')

In [3]:
df.head()

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category,price
0,1.0,33.0,4,4,0.0,2.0,3240.0,0,0,0,1.0,2.0,4.5
1,0.0,74.0,3,3,2.0,1.0,1950.0,1,0,2,2.0,0.0,2.8
2,0.0,6.0,3,3,3.0,4.0,1365.0,0,0,1,2.0,0.0,1.9
3,1.0,57.0,5,5,3.0,0.0,3240.0,1,0,0,1.0,2.0,10.56
4,1.0,13.0,4,3,2.0,2.0,1800.0,0,0,0,1.0,1.0,2.2


In [4]:
# one hot encode -> sector, balcony, agePossession, furnishing type, luxury category, floor category
# why ? because linear regression using so we cannot use ordinal encoding because it will judge them on their assign numbers
X=df.drop(columns=['price'])
y=df['price']

In [5]:
X.head()

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,1.0,33.0,4,4,0.0,2.0,3240.0,0,0,0,1.0,2.0
1,0.0,74.0,3,3,2.0,1.0,1950.0,1,0,2,2.0,0.0
2,0.0,6.0,3,3,3.0,4.0,1365.0,0,0,1,2.0,0.0
3,1.0,57.0,5,5,3.0,0.0,3240.0,1,0,0,1.0,2.0
4,1.0,13.0,4,3,2.0,2.0,1800.0,0,0,0,1.0,1.0


In [6]:
y.head()

0     4.50
1     2.80
2     1.90
3    10.56
4     2.20
Name: price, dtype: float64

# Linear Regression - Baseline model

In [7]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

In [8]:
column_to_encode = ['sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [9]:
# Applying the log1p transformation to the target variable ? why because right skewed so we are making more normal 
y_transformed = np.log1p(y)

In [16]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers = [
        ( 'num', StandardScaler(), ['property_type', 'bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room'] ),
        ( 'cat', OneHotEncoder(drop='first'), column_to_encode )
    ],
    remainder='passthrough'
)

In [19]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression() )
])

In [20]:
# k-fold cross validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
score = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

Traceback (most recent call last):
  File "C:\Users\ritik\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 136, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "C:\Users\ritik\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ritik\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ritik\anaconda3\Lib\site-packages\sklearn\utils\_response.py", line 109, in _get_response_values
    y_pred, pos_label = estimator.predict(X), None
                        ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ritik\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 507, in predict
    Xt = transform.transform(Xt)
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ritik\anaconda3\Lib\site-packages\skl

In [21]:
score.mean()

nan

In [22]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers = [
        ( 'num', StandardScaler(), ['property_type', 'bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room'] ),
        # The fix is adding handle_unknown='ignore'
        ( 'cat', OneHotEncoder(drop='first', handle_unknown='ignore'), column_to_encode )
    ],
    remainder='passthrough'
)

In [23]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression() )
])

In [24]:
# k-fold cross validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
score = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [25]:
score.mean()

0.8574929578620696

In [26]:
score.std()

0.020825209924223133

In [27]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [28]:
pipeline.fit(X_train,y_train)

In [29]:
y_pred=pipeline.predict(X_test)



In [30]:
y_pred= np.expm1(y_pred)

In [31]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(np.expm1(y_test),y_pred)

0.640367837631421

In [32]:
# trying svr

In [33]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR(kernel='rbf') )
])

In [34]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [35]:
scores.mean()

0.885224504885409

In [36]:
scores.std()

0.018025373297622126

In [37]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [38]:
pipeline.fit(X_train,y_train)

In [39]:
y_pred = pipeline.predict(X_test)



In [40]:
y_pred = np.expm1(y_pred)

In [41]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(np.expm1(y_test),y_pred)

0.528500583903019