In [267]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [268]:
df = pd.read_csv('../data/flats_post_feature_selection.csv')

In [269]:
df.head()

Unnamed: 0,price,sector,built_up_area,bedRoom,bathroom,balcony,extra_rooms,agePossession,furnishing_type,luxury_category,floor_category
0,0.45,sector 7,1000.0,2,2,1,0,Relatively New,0,Low,Mid Floor
1,0.5,sector 3,722.0,2,2,1,0,Old Property,2,Low,Low Floor
2,0.4,sohna road,661.0,2,2,3,0,New Property,0,Low,High Floor
3,1.47,sector 61,1333.0,2,2,2,1,Under Construction,0,Medium,Low Floor
4,0.7,sector 92,1217.0,2,2,3,1,Under Construction,0,Low,Mid Floor


In [270]:
df.isnull().sum()

price              0
sector             0
built_up_area      0
bedRoom            0
bathroom           0
balcony            0
extra_rooms        0
agePossession      0
furnishing_type    0
luxury_category    0
floor_category     0
dtype: int64

In [271]:
df['furnishing_type'].value_counts()

furnishing_type
0    1882
2     803
1     147
Name: count, dtype: int64

Comments: Since we want to convert all the ordinal encoding, One hot encoding inside pipeline, we want to convert 'furnishing_type' column into categorical column again. Bcoz this pipeline will be useful while prediction also to do all these processes

In [272]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

In [273]:
df.head()

Unnamed: 0,price,sector,built_up_area,bedRoom,bathroom,balcony,extra_rooms,agePossession,furnishing_type,luxury_category,floor_category
0,0.45,sector 7,1000.0,2,2,1,0,Relatively New,unfurnished,Low,Mid Floor
1,0.5,sector 3,722.0,2,2,1,0,Old Property,furnished,Low,Low Floor
2,0.4,sohna road,661.0,2,2,3,0,New Property,unfurnished,Low,High Floor
3,1.47,sector 61,1333.0,2,2,2,1,Under Construction,unfurnished,Medium,Low Floor
4,0.7,sector 92,1217.0,2,2,3,1,Under Construction,unfurnished,Low,Mid Floor


In [274]:
df['sector'].value_counts()

sector
sohna road                   147
sector 85                    107
sector 102                   106
sector 92                     96
sector 69                     90
                            ... 
sector 11                      1
saroop garden sector 105       1
bptp                           1
sector 9                       1
godrej aria , sector 79 ,      1
Name: count, Length: 138, dtype: int64

In [275]:
df.shape

(2832, 11)

Comments: Many sector has only 1-2 ocurrence. It'll cause problem while k-fold cross validation. Since if 1 occurrence, the single point if goes into testing data only and no ocurrence in train data, then test data won't get encoding, will raise error. 

In [276]:
a = df['sector'].value_counts()[df['sector'].value_counts() >= 3]
df = df[df['sector'].isin(a.index)]

In [277]:
df['sector'].value_counts()

sector
sohna road    147
sector 85     107
sector 102    106
sector 92      96
sector 69      90
             ... 
sector 15       3
sector 31       3
sector 21       3
sector 6        3
sector 105      3
Name: count, Length: 92, dtype: int64

In [278]:
df.shape

(2771, 11)

In [279]:
df.to_csv('../data/flats_final_model_selection.csv', index=False)

In [280]:
df = pd.read_csv("../data/flats_final_model_selection.csv")

In [281]:
X = df.drop(columns=['price'])
y = df['price']

In [282]:
# Applying the log1p transformation to the target variable to make target column normally distributed
y_transformed = np.log1p(y)

Approach 1: Ordinal Encoding for categorical and StandardScaling for Numerical
Convert all categorical(object) columns to number with Ordinal Encoding and experiment with different type of algorithms

In [283]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2771 entries, 0 to 2770
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   sector           2771 non-null   object 
 1   built_up_area    2771 non-null   float64
 2   bedRoom          2771 non-null   int64  
 3   bathroom         2771 non-null   int64  
 4   balcony          2771 non-null   object 
 5   extra_rooms      2771 non-null   int64  
 6   agePossession    2771 non-null   object 
 7   furnishing_type  2771 non-null   object 
 8   luxury_category  2771 non-null   object 
 9   floor_category   2771 non-null   object 
dtypes: float64(1), int64(3), object(6)
memory usage: 216.6+ KB


In [284]:
columns_to_encode = ['sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [285]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'extra_rooms']),
        ('cat', OrdinalEncoder(), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [286]:
# Creating a sklearn pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [287]:
# K-fold cross-validation and find out mean 'r2' score
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [288]:
scores.mean()

np.float64(0.6680069704931234)

In [289]:
scores.std()

np.float64(0.0329413273716765)

In [290]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [291]:
pipeline.fit(X_train,y_train)

In [292]:
y_pred = pipeline.predict(X_test)

In [293]:
y_pred = np.expm1(y_pred)

In [294]:
mean_absolute_error(np.expm1(y_test),y_pred)

np.float64(0.55602671368083)

Writing pipeline, score predictor in a function

In [301]:
def scorer(preprocessor, model_name, model):
    
    output = []
    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [302]:
! pip install xgboost



In [303]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from tqdm import tqdm

In [304]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'extra_rooms']),
        ('cat', OrdinalEncoder(), ['sector', 'agePossession', 'balcony', 'furnishing_type', 'luxury_category', 'floor_category'])
    ], 
    remainder='passthrough'
)

In [305]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost': XGBRegressor()
}

In [306]:
model_output = []
for model_name, model in tqdm(model_dict.items()):
    model_output.append(scorer(preprocessor, model_name, model))

100%|██████████| 11/11 [00:35<00:00,  3.18s/it]


In [307]:
model_df_OE = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [308]:
model_df_OE.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.889529,0.279135
5,random forest,0.869029,0.284126
6,extra trees,0.859779,0.31191
7,gradient boosting,0.862803,0.317774
4,decision tree,0.749156,0.430793
9,mlp,0.755497,0.450996
8,adaboost,0.747424,0.457904
1,svr,0.739781,0.461152
2,ridge,0.668014,0.555839
0,linear_reg,0.668007,0.556027


Approach 2: Ordinal Encoding & One Hot Encoding for Categorical and Standard Scaler for Numerical
Ordinal Encoding cols: ['balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']
One Hot Encoding Cols: ['sector']
Standard Scaling Cols: ['bedRoom', 'bathroom', 'built_up_area', 'extra_rooms']

In [309]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'extra_rooms']),
        ('cat', OrdinalEncoder(), ['agePossession', 'balcony', 'furnishing_type', 'luxury_category', 'floor_category']),
        ('cat1',OneHotEncoder(drop='first', sparse_output=False),['sector'])
    ], 
    remainder='passthrough'
)

In [310]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [311]:
model_output = []
for model_name,model in tqdm(model_dict.items()):
    model_output.append(scorer(preprocessor, model_name, model))

100%|██████████| 11/11 [01:08<00:00,  6.23s/it]


In [312]:
model_output

[['linear_reg',
  np.float64(0.8596097793459683),
  np.float64(0.3132723971941038)],
 ['svr', np.float64(0.868073245272587), np.float64(0.3193719246959997)],
 ['ridge', np.float64(0.8592399339136094), np.float64(0.3214242604677093)],
 ['LASSO', np.float64(-0.004562332666279811), np.float64(0.7911464737993283)],
 ['decision tree',
  np.float64(0.781273819142573),
  np.float64(0.3661639566841625)],
 ['random forest',
  np.float64(0.859747310191886),
  np.float64(0.3200773005954461)],
 ['extra trees',
  np.float64(0.882638926667729),
  np.float64(0.27390746095038304)],
 ['gradient boosting',
  np.float64(0.8351910547966777),
  np.float64(0.3537467970428676)],
 ['adaboost', np.float64(0.6686985796132346), np.float64(0.5525098172980539)],
 ['mlp', np.float64(0.8818314746887947), np.float64(0.2800242271474937)],
 ['xgboost', np.float64(0.8896834196349314), np.float64(0.274301013748925)]]

In [313]:
model_df_OE_OHE = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [314]:
model_df_OE_OHE.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.882639,0.273907
10,xgboost,0.889683,0.274301
9,mlp,0.881831,0.280024
0,linear_reg,0.85961,0.313272
1,svr,0.868073,0.319372
5,random forest,0.859747,0.320077
2,ridge,0.85924,0.321424
7,gradient boosting,0.835191,0.353747
4,decision tree,0.781274,0.366164
8,adaboost,0.668699,0.55251


Approach 3: Ordinal Encoding & One Hot Encoding for Categorical and Standard Scaler for Numerical and then PCA for dimensionality reduction
Ordinal Encoding cols: ['balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']
One Hot Encoding Cols: ['sector']
Standard Scaling Cols: ['bedRoom', 'bathroom', 'built_up_area', 'extra_rooms']

In [315]:
def scorer_with_PCA(_n_components, preprocessor, model_name, model):
    
    output = []
    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=_n_components)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [316]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'extra_rooms']),
        ('cat', OrdinalEncoder(), ['agePossession', 'balcony', 'furnishing_type', 'luxury_category', 'floor_category']),
        ('cat1',OneHotEncoder(drop='first', sparse_output=False),['sector'])
    ], 
    remainder='passthrough'
)

In [317]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [324]:
model_output = []
for model_name,model in tqdm(model_dict.items()):
    model_output.append(scorer_with_PCA(0.95, preprocessor, model_name, model))

100%|██████████| 11/11 [02:31<00:00, 13.74s/it]


In [325]:
model_df_OE_OHE_PCA = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [326]:
model_df_OE_OHE_PCA.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.81668,0.368983
5,random forest,0.804808,0.382658
10,xgboost,0.796715,0.391398
7,gradient boosting,0.792959,0.403959
9,mlp,0.760486,0.409426
1,svr,0.78005,0.426017
4,decision tree,0.62529,0.522881
8,adaboost,0.666672,0.531424
2,ridge,0.712796,0.54763
0,linear_reg,0.712715,0.548726


Remarks: <br>
From 3 approaches, Approach 1 and Approach 2 are fine. But We'll go with Approach 2(OE, OHE, Scaling). Also extra trees is performing best. But XgBoost is also performing neck to neck with extra trees. So, we'll hyperparamter tune for XgBoost to get optimal results.<br>
#### MAE: 0.273907 crores<br>
Also will train with full data now. Since experiments done.

XgBoost with Hyperparameter Tuning (Approach 2 (OE, OHE, Scaling) ). Also will train with full data now. Since experiments done.

In [327]:
from sklearn.model_selection import GridSearchCV

In [372]:
param_grid = {
    'regressor__learning_rate': [0.05, 0.1],
    'regressor__max_depth': [3, 5, 7, 10],
    'regressor__min_child_weight': [1, 3, 5],
    'regressor__subsample': [0.5, 0.7],
    'regressor__colsample_bytree': [0.5, 0.7],
    'regressor__n_estimators' : [100, 200, 500, 700, 900],
    'regressor__objective': ['reg:squarederror']
}

In [373]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'extra_rooms']),
        ('cat', OrdinalEncoder(), ['agePossession', 'balcony', 'furnishing_type', 'luxury_category', 'floor_category']),
        ('cat1',OneHotEncoder(drop='first', sparse_output=False),['sector'])
    ], 
    remainder='passthrough'
)

In [374]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor())
])

In [375]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [376]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [377]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 480 candidates, totalling 4800 fits
[CV 2/10] END regressor__colsample_bytree=0.5, regressor__learning_rate=0.05, regressor__max_depth=3, regressor__min_child_weight=1, regressor__n_estimators=100, regressor__objective=reg:squarederror, regressor__subsample=0.5;, score=0.754 total time=   0.1s
[CV 3/10] END regressor__colsample_bytree=0.5, regressor__learning_rate=0.05, regressor__max_depth=3, regressor__min_child_weight=1, regressor__n_estimators=100, regressor__objective=reg:squarederror, regressor__subsample=0.5;, score=0.821 total time=   0.1s
[CV 1/10] END regressor__colsample_bytree=0.5, regressor__learning_rate=0.05, regressor__max_depth=3, regressor__min_child_weight=1, regressor__n_estimators=100, regressor__objective=reg:squarederror, regressor__subsample=0.5;, score=0.778 total time=   0.1s
[CV 8/10] END regressor__colsample_bytree=0.5, regressor__learning_rate=0.05, regressor__max_depth=3, regressor__min_child_weight=1, regressor__n_estimators=1

[CV 7/10] END regressor__colsample_bytree=0.5, regressor__learning_rate=0.05, regressor__max_depth=3, regressor__min_child_weight=1, regressor__n_estimators=100, regressor__objective=reg:squarederror, regressor__subsample=0.5;, score=0.778 total time=   0.3s
[CV 10/10] END regressor__colsample_bytree=0.5, regressor__learning_rate=0.05, regressor__max_depth=3, regressor__min_child_weight=1, regressor__n_estimators=100, regressor__objective=reg:squarederror, regressor__subsample=0.5;, score=0.771 total time=   0.2s
[CV 1/10] END regressor__colsample_bytree=0.5, regressor__learning_rate=0.05, regressor__max_depth=3, regressor__min_child_weight=1, regressor__n_estimators=100, regressor__objective=reg:squarederror, regressor__subsample=0.7;, score=0.782 total time=   0.2s
[CV 9/10] END regressor__colsample_bytree=0.5, regressor__learning_rate=0.05, regressor__max_depth=3, regressor__min_child_weight=1, regressor__n_estimators=100, regressor__objective=reg:squarederror, regressor__subsample=

In [379]:
final_pipe = search.best_estimator_

In [380]:
search.best_params_

{'regressor__colsample_bytree': 0.7,
 'regressor__learning_rate': 0.05,
 'regressor__max_depth': 7,
 'regressor__min_child_weight': 1,
 'regressor__n_estimators': 900,
 'regressor__objective': 'reg:squarederror',
 'regressor__subsample': 0.7}

In [381]:
search.best_score_

np.float64(0.9067912826134471)

In [382]:
final_pipe.fit(X,y_transformed)

Exporting the Model

In [383]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'extra_rooms']),
        ('cat', OrdinalEncoder(), ['agePossession', 'balcony', 'furnishing_type', 'luxury_category', 'floor_category']),
        ('cat1',OneHotEncoder(drop='first', sparse_output=False),['sector'])
    ], 
    remainder='passthrough'
)

In [384]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(
        regressor__colsample_bytree = 0.7,
        regressor__learning_rate = 0.05,
        regressor__max_depth = 7,
        regressor__min_child_weight = 1,
        regressor__n_estimators = 900,
        regressor__subsample = 0.7))
])

In [385]:
pipeline.fit(X,y_transformed)

Parameters: { "regressor__colsample_bytree", "regressor__learning_rate", "regressor__max_depth", "regressor__min_child_weight", "regressor__n_estimators", "regressor__subsample" } are not used.



In [386]:
import pickle

with open('../pickle_files/pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)
with open('../pickle_files/df.pkl', 'wb') as file:
    pickle.dump(X, file)

Trying out the predictions

In [387]:
X.columns

Index(['sector', 'built_up_area', 'bedRoom', 'bathroom', 'balcony',
       'extra_rooms', 'agePossession', 'furnishing_type', 'luxury_category',
       'floor_category'],
      dtype='object')

In [388]:
X.iloc[0].values

array(['sector 7', np.float64(1000.0), np.int64(2), np.int64(2), '1',
       np.int64(0), 'Relatively New', 'unfurnished', 'Low', 'Mid Floor'],
      dtype=object)

In [391]:
data = [[ 'sector 102', 4, 3, '3+', 'New Property', 2750, 1, 'unfurnished', 'Low', 'Low Floor']]
columns = ['sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'extra_rooms',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)
one_df

Unnamed: 0,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,extra_rooms,furnishing_type,luxury_category,floor_category
0,sector 102,4,3,3+,New Property,2750,1,unfurnished,Low,Low Floor


In [392]:
price = np.expm1(pipeline.predict(one_df))
print("Price: ", price)

Price:  [2.7576747]
