In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [2]:
df = pd.read_csv('gurgaon_properties_post_feature_selection_v2.csv')

In [3]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,0.0,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,0.0,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,0.0,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,1.0,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,0.0,High,Mid Floor


In [4]:
df['furnishing_type'].value_counts()

furnishing_type
0.0    2349
1.0    1018
2.0     187
Name: count, dtype: int64

In [5]:
df['furnishing_type'] = df['furnishing_type'].replace({0.0: 'unfurnised',1.0:'Semi furnised' ,2.0 : 'furnised'})

In [6]:
df['furnishing_type'].value_counts()

furnishing_type
unfurnised       2349
Semi furnised    1018
furnised          187
Name: count, dtype: int64

In [7]:
X = df.drop(columns = ['price'])
y = df['price']

In [8]:
# Applying yLog1 transformation for the target variable

In [9]:
y_transformed = np.log1p(y)

### Ordinal Encoding 

In [10]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [11]:
#Creating a column transformer for the preprocessing 

In [12]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(),['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OneHotEncoder(), columns_to_encode)
    ],
remainder = 'passthrough'
)

In [13]:
#Creating a pipeline

In [14]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor',LinearRegression())
])

In [15]:
# K fild cross validation

In [16]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline , X, y_transformed , cv= kfold, scoring = 'r2')

In [17]:
scores.mean()

0.8558121780523218

In [18]:
scores.std()

0.015558139114247284

In [19]:
from sklearn.model_selection import train_test_split
X_train,X_test ,y_train ,y_test = train_test_split(X ,y_transformed, test_size=0.2 ,random_state= 42)

In [20]:
pipeline.fit(X_train ,y_train)

In [21]:
y_pred = pipeline.predict(X_test)

In [22]:

y_pred = np.expm1(y_pred)

In [23]:

mean_absolute_error(np.expm1(y_test),y_pred)

0.648382759135605

In [24]:
def scorer(model_name, model):

    output = []
    output.append(model_name)

    pipeline = Pipeline([
        ('proprocessor', preprocessor),
        ('regressor' , model)
    ])
            
    #Kfold cross validation

    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    pipeline.fit(X_train,y_train)
    y_pred = pipeline.predict(X_test)
    y_pred = np.expm1(y_pred)
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    return output
    

In [25]:
!pip install xgboost




[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor
)
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

In [27]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [28]:
model_output = []

for model_name, model in model_dict.items():
    model_output.append(scorer(model_name,model))

In [29]:
model_output

[['linear_reg', 0.8558121780523218, 0.648382759135605],
 ['svr', 0.8841759954308485, 0.5253869609150191],
 ['ridge', 0.8562100443065963, 0.652505790668771],
 ['LASSO', -0.003597657277664901, 1.576391507281618],
 ['decision tree', 0.7947625176218647, 0.6377955316672044],
 ['random forest', 0.8700613425456852, 0.5312195448228679],
 ['extra trees', 0.8867683597522303, 0.4788358627856486],
 ['gradient boosting', 0.8552569472343482, 0.5995955984095471],
 ['adaboost', 0.7346202178211161, 0.8951309492710626],
 ['mlp', 0.8836632924276341, 0.5198679219480601],
 ['xgboost', 0.8853574412425637, 0.504988461551787]]

In [30]:
model_df = pd.DataFrame(model_output , columns = ['name','r2','mae'])

In [31]:
model_df

Unnamed: 0,name,r2,mae
0,linear_reg,0.855812,0.648383
1,svr,0.884176,0.525387
2,ridge,0.85621,0.652506
3,LASSO,-0.003598,1.576392
4,decision tree,0.794763,0.637796
5,random forest,0.870061,0.53122
6,extra trees,0.886768,0.478836
7,gradient boosting,0.855257,0.599596
8,adaboost,0.73462,0.895131
9,mlp,0.883663,0.519868


### One hot Encoding with PCA

In [32]:
#Creating a column tarnsfom=rmer for the preprocessing 

preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat',OrdinalEncoder(), columns_to_encode),
         ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ],
    remainder = 'passthrough'
)

In [33]:
#Craeting a PIPELINE

In [34]:
pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('pca' , PCA(n_components=0.95)),
    ('regressor',LinearRegression())
]
    
)

In [35]:
#Kfold cross validation

In [36]:
kfold = KFold(n_splits=10,shuffle= True, random_state=42)
scores = cross_val_score(pipeline , X, y_transformed, cv = kfold, scoring = 'r2')

In [37]:
scores.mean()

0.062253550927164295

In [38]:
scores.std()

0.01986065906756916

In [39]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [40]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [41]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [42]:
model_output

[['linear_reg', 0.062253550927164295, 1.5267067155841032],
 ['svr', 0.21803618296055954, 1.3611435118486082],
 ['ridge', 0.06225355177447787, 1.526706714533807],
 ['LASSO', 0.05967732802304122, 1.5287385878187565],
 ['decision tree', 0.694315471898961, 0.7661578935718582],
 ['random forest', 0.7624474806201534, 0.6649130796260749],
 ['extra trees', 0.7341476128527172, 0.7016090915841757],
 ['gradient boosting', 0.6125982265607116, 0.9843169038138077],
 ['adaboost', 0.31039791450340704, 1.4349632701585044],
 ['mlp', 0.2068470700523906, 1.4303482755795665],
 ['xgboost', 0.6223958945692608, 0.9674709433719579]]

In [43]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [44]:
model_df

Unnamed: 0,name,r2,mae
0,linear_reg,0.062254,1.526707
1,svr,0.218036,1.361144
2,ridge,0.062254,1.526707
3,LASSO,0.059677,1.528739
4,decision tree,0.694315,0.766158
5,random forest,0.762447,0.664913
6,extra trees,0.734148,0.701609
7,gradient boosting,0.612598,0.984317
8,adaboost,0.310398,1.434963
9,mlp,0.206847,1.430348


### Target Encoder

In [45]:
!pip install category_encoders




[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [46]:
import category_encoders as ce

In [47]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [48]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
scores.mean(),scores.std()

(0.8294401705020924, 0.018379538415979193)

In [49]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [50]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [51]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [52]:

model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.762447,0.664913
6,extra trees,0.734148,0.701609
4,decision tree,0.694315,0.766158
10,xgboost,0.622396,0.967471
7,gradient boosting,0.612598,0.984317
1,svr,0.218036,1.361144
9,mlp,0.206847,1.430348
8,adaboost,0.310398,1.434963
2,ridge,0.062254,1.526707
0,linear_reg,0.062254,1.526707


### Hyperparameter tuning 

In [53]:

from sklearn.model_selection import GridSearchCV

In [54]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}


In [55]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [56]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [57]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [58]:
search = GridSearchCV(pipeline , param_grid,  cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [59]:
search.fit(X,y_transformed)

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


640 fits failed out of a total of 1280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
329 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\MB511WS\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\MB511WS\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\MB511WS\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 660, in fit
    self._final_estimator.fit(Xt, y

In [60]:

final_pipe = search.best_estimator_

In [61]:
search.best_params_

{'regressor__max_depth': 30,
 'regressor__max_features': 'sqrt',
 'regressor__max_samples': 1.0,
 'regressor__n_estimators': 300}

In [62]:
search.best_score_

0.9022390294207376

In [63]:
final_pipe.fit(X,y_transformed)

### Exporting the model

In [64]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [65]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=500))
])

In [66]:

pipeline.fit(X,y_transformed)

In [67]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [68]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [69]:
X

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnised,Low,Low Floor
1,flat,sector 89,2.0,2.0,2,New Property,1226.0,1.0,0.0,unfurnised,Low,Mid Floor
2,flat,sohna road,2.0,2.0,1,New Property,1000.0,0.0,0.0,unfurnised,Low,High Floor
3,flat,sector 92,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,Semi furnised,High,Mid Floor
4,flat,sector 102,2.0,2.0,1,Relatively New,582.0,0.0,1.0,unfurnised,High,Mid Floor
...,...,...,...,...,...,...,...,...,...,...,...,...
3549,flat,sector 84,2.0,2.0,1,Relatively New,532.0,0.0,0.0,unfurnised,Medium,Mid Floor
3550,house,sector 109,5.0,5.0,3+,Relatively New,6228.0,1.0,1.0,unfurnised,High,Low Floor
3551,flat,sector 2,1.0,1.0,1,Moderately Old,665.0,0.0,0.0,Semi furnised,Medium,Mid Floor
3552,house,sector 43,5.0,6.0,3,Moderately Old,5490.0,1.0,1.0,unfurnised,Medium,Mid Floor
