In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score,train_test_split
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler,OrdinalEncoder
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA

In [2]:
df=pd.read_csv('gurgaon_properties_post_feature_selection_v2.csv')

In [3]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,0.0,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,0.0,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,0.0,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,1.0,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,0.0,High,Mid Floor


In [4]:
df['furnishing_type'].value_counts()

0.0    2349
1.0    1018
2.0     187
Name: furnishing_type, dtype: int64

In [5]:
# 0-> unfurnished
# 1-> semifurnished
# 2-> furnished
df['furnishing_type']=df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

In [6]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,unfurnished,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,unfurnished,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,semifurnished,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,unfurnished,High,Mid Floor


In [7]:
X=df.drop(columns=['price'])
y=df['price']

In [8]:
# Applying the log1p transformation to the target variable
y_transformed=np.log1p(y)

In [9]:
columns_to_encode=df.select_dtypes(include='object').columns.to_list()

In [10]:
columns_to_encode

['property_type',
 'sector',
 'balcony',
 'agePossession',
 'furnishing_type',
 'luxury_category',
 'floor_category']

In [11]:
num_col_encode=X.select_dtypes(include=['int','float']).columns.to_list()

In [12]:
num_col_encode

['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']

In [13]:
# Creating a column transformer for preprocessing
preprocessor=ColumnTransformer(
    transformers=[
        ('num',StandardScaler() , num_col_encode),
        ('cat',OrdinalEncoder(), columns_to_encode)
    ],
    remainder='passthrough'
)

In [14]:
# Creating a pipeline
pipeline=Pipeline([
    ('preprocessor',preprocessor),
    ('regressor', LinearRegression())
])

In [15]:
# K-fold cross validation
kfold=KFold(n_splits=10,shuffle=True,random_state=42)
scores=cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')

In [16]:
scores.mean(),scores.std()

(0.7363096633436828, 0.03238005754429932)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [18]:
pipeline.fit(X_train,y_train)

In [19]:
y_pred=pipeline.predict(X_test)

In [20]:
y_pred=np.expm1(y_pred)

In [21]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.9463822160089357

In [22]:
def scorer(model_name,model):
    
    output=[]
    
    output.append(model_name)
    pipeline = Pipeline([
        ('preprocessor',preprocessor),
        ('regressor',model)
    ])
    
    # k-fold cross validation
    kfold=KFold(n_splits=10,shuffle=True,random_state=42)
    scores=cross_val_score(pipeline,X,y_transformed, cv=kfold, scoring='r2')
    output.append(scores.mean())
    
    X_train,X_test,y_train,y_test=train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred=pipeline.predict(X_test)
    
    y_pred=np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [23]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest': RandomForestRegressor(),
    'adaboost': AdaBoostRegressor(),
    'gradient boosting': GradientBoostingRegressor()
}

In [24]:
model_output=[]
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name,model))

In [25]:
model_output

[['linear_reg', 0.7363096633436828, 0.9463822160089357],
 ['svr', 0.7642012011196353, 0.8472636473483951],
 ['ridge', 0.7363125343993552, 0.946338774185337],
 ['LASSO', 0.05943378064493573, 1.528905986892753],
 ['decision tree', 0.7716205134129188, 0.7377271148064151],
 ['random forest', 0.8807442266582456, 0.530157181853693],
 ['adaboost', 0.7544455034069303, 0.8335995583654843],
 ['gradient boosting', 0.8725269166504607, 0.577026805539884]]

In [26]:
model_df=pd.DataFrame(model_output,columns=['name','r2','mae'])
model_df

Unnamed: 0,name,r2,mae
0,linear_reg,0.73631,0.946382
1,svr,0.764201,0.847264
2,ridge,0.736313,0.946339
3,LASSO,0.059434,1.528906
4,decision tree,0.771621,0.737727
5,random forest,0.880744,0.530157
6,adaboost,0.754446,0.8336
7,gradient boosting,0.872527,0.577027


In [27]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.880744,0.530157
7,gradient boosting,0.872527,0.577027
4,decision tree,0.771621,0.737727
6,adaboost,0.754446,0.8336
1,svr,0.764201,0.847264
2,ridge,0.736313,0.946339
0,linear_reg,0.73631,0.946382
3,LASSO,0.059434,1.528906


## OneHotEncoding

In [28]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_col_encode),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first'),['sector','agePossession','furnishing_type'])
    ], 
    remainder='passthrough'
)

In [29]:
# creating a pipeline
pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',LinearRegression())
])

In [30]:
# k-fold cross validation
kfold=KFold(n_splits=10,shuffle=True,random_state=42)
scores=cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')

In [31]:
scores.mean(),scores.std()

(0.8546054073648314, 0.01599847663314007)

In [32]:
X_train,X_test,y_train,y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [33]:
pipeline.fit(X_train,y_train)

In [34]:
y_pred=pipeline.predict(X_test)

In [35]:
y_pred=np.expm1(y_pred)

In [36]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.6497382874070646

In [37]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [38]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest': RandomForestRegressor(),
    'adaboost': AdaBoostRegressor(),
    'gradient boosting': GradientBoostingRegressor()
}

In [39]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [40]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [41]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.890537,0.505949
7,gradient boosting,0.876675,0.569091
0,linear_reg,0.854605,0.649738
2,ridge,0.854673,0.652982
4,decision tree,0.809738,0.694911
6,adaboost,0.755673,0.814615
1,svr,0.769741,0.834124
3,LASSO,0.059434,1.528906


## OneHotEncoding With PCA

In [42]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first', sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [43]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [44]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [45]:
scores.mean(),scores.std()

(0.062252014314511384, 0.01986059407164018)

In [46]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [47]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest': RandomForestRegressor(),
    'adaboost': AdaBoostRegressor(),
    'gradient boosting': GradientBoostingRegressor()
}

In [48]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [49]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [50]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.762566,0.661197
4,decision tree,0.696182,0.75729
7,gradient boosting,0.610604,0.987906
6,adaboost,0.300862,1.320387
1,svr,0.218073,1.361163
2,ridge,0.062252,1.526707
0,linear_reg,0.062252,1.526707
3,LASSO,0.059676,1.528739


## Target Encoder

In [51]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [52]:
#!pip install category_encoders

In [53]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [54]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [55]:
scores.mean(),scores.std()

(0.829521918225536, 0.0183844633791229)

In [56]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [57]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest': RandomForestRegressor(),
    'adaboost': AdaBoostRegressor(),
    'gradient boosting': GradientBoostingRegressor()
}

In [58]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [59]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [60]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.901014,0.454615
7,gradient boosting,0.889288,0.5095
4,decision tree,0.833691,0.545762
6,adaboost,0.818342,0.697948
0,linear_reg,0.829522,0.713011
2,ridge,0.829536,0.713523
1,svr,0.782917,0.818851
3,LASSO,0.059434,1.528906


In [61]:
print(k)

NameError: name 'k' is not defined

## Hyperparameter Tuning

In [62]:
from sklearn.model_selection import GridSearchCV

In [82]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}

In [83]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [84]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [85]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [86]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [87]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


640 fits failed out of a total of 1280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
337 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Lenovo\AppData\Local\anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Lenovo\AppData\Local\anaconda\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Lenovo\AppData\Local\anaconda\lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\Lenovo\AppData\Local\anaconda\lib\site-packages\sklearn\ba

## Exporting the model

In [71]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [72]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=200))
])

In [73]:
pipeline.fit(X,y_transformed)

In [74]:
import pickle

with open('pipelines.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [75]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [None]:
X

## Trying out the predictions

In [None]:
X.columns

In [None]:
X.iloc[0].values

In [None]:
data = [['house', 'sector 102', 17, 3, '3+', 'New Property', 2750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df

In [None]:
np.expm1(pipeline.predict(one_df))

In [None]:
X.dtypes

In [None]:
sorted(X['sector'].unique().tolist())