In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

In [2]:
df=pd.read_csv("/content/Gurgoan_Wrangled_Data_2.csv")

In [3]:
df.sample(7)

Unnamed: 0,sector,price,area,noOfOpenSides,possession,price_per_sqft,Maintenance_Staff,Water_Storage,Vaastu_Compliant,Rain_Water_Harvesting,Rating,Metro,Park,School,College,Hospital,Shrine,Mall,Airport
1588,sector 67a,3.65,2431.76,1.0,undefined,0.001501,1.0,0.0,1.0,1.0,4.5,1,1,0,0,0,0,1,1
1180,sector 43,7.5,2700.76,3.0,undefined,0.002777,1.0,1.0,1.0,1.0,5.0,1,1,1,1,1,1,0,0
478,sector 67a,4.45,2969.76,1.0,undefined,0.001498,1.0,1.0,1.0,1.0,4.5,1,1,0,0,0,0,1,1
436,sector 102,3.89,2248.84,1.0,immediate,0.00173,1.0,1.0,1.0,1.0,4.125,1,1,1,1,1,0,1,1
858,sector 50,4.24,2388.72,2.0,undefined,0.001775,1.0,1.0,0.0,1.0,4.75,0,0,1,1,1,0,1,0
12,sector 70a,8.0,2603.92,1.0,undefined,0.003072,0.0,0.0,1.0,1.0,4.25,1,1,1,1,1,0,1,1
502,sector 45,8.25,4519.2,1.0,immediate,0.001826,1.0,1.0,1.0,1.0,4.25,1,1,1,1,1,0,0,0


## Feature Selection :

In [4]:
df.isnull().sum()

Unnamed: 0,0
sector,0
price,0
area,0
noOfOpenSides,0
possession,0
price_per_sqft,0
Maintenance_Staff,0
Water_Storage,0
Vaastu_Compliant,0
Rain_Water_Harvesting,0


In [5]:
training_df=df

In [6]:
from sklearn.preprocessing import LabelEncoder
l_encoder=LabelEncoder()
training_df['possession']=l_encoder.fit_transform(training_df['possession'])
training_df['sector']=l_encoder.fit_transform(training_df['sector'])
training_df['noOfOpenSides']=l_encoder.fit_transform(training_df['noOfOpenSides'])

## 1- Correlation

In [7]:
fi_df1=training_df.corr()['price'].iloc[1:].to_frame().reset_index().rename(columns={'index':'feature','price':'corr_coeff'})
fi_df1

Unnamed: 0,feature,corr_coeff
0,price,1.0
1,area,0.089681
2,noOfOpenSides,-0.022198
3,possession,-0.051502
4,price_per_sqft,0.196289
5,Maintenance_Staff,0.023464
6,Water_Storage,0.009157
7,Vaastu_Compliant,0.004249
8,Rain_Water_Harvesting,-0.025389
9,Rating,0.21045


## RFE :

In [8]:
x_label=training_df.drop(['price'],axis=1)
y_label=training_df['price']

In [9]:
x_label.isnull().sum()

Unnamed: 0,0
sector,0
area,0
noOfOpenSides,0
possession,0
price_per_sqft,0
Maintenance_Staff,0
Water_Storage,0
Vaastu_Compliant,0
Rain_Water_Harvesting,0
Rating,0


In [10]:
from sklearn.ensemble import RandomForestRegressor
rf_label=RandomForestRegressor(n_estimators=100, random_state=20)
rf_label.fit(x_label,y_label)

fi_df2 = pd.DataFrame({'feature':x_label.columns,'rf_importance':rf_label.feature_importances_}).sort_values(by='rf_importance',ascending=False)

fi_df2

Unnamed: 0,feature,rf_importance
1,area,0.617562
4,price_per_sqft,0.361313
0,sector,0.006784
9,Rating,0.002824
2,noOfOpenSides,0.002562
7,Vaastu_Compliant,0.001603
3,possession,0.001014
15,Shrine,0.001006
10,Metro,0.000794
8,Rain_Water_Harvesting,0.000762


## Gradient Boosting Regressor :

In [11]:
y_label.isnull().sum(),x_label.isnull().sum()

(0,
 sector                   0
 area                     0
 noOfOpenSides            0
 possession               0
 price_per_sqft           0
 Maintenance_Staff        0
 Water_Storage            0
 Vaastu_Compliant         0
 Rain_Water_Harvesting    0
 Rating                   0
 Metro                    0
 Park                     0
 School                   0
 College                  0
 Hospital                 0
 Shrine                   0
 Mall                     0
 Airport                  0
 dtype: int64)

In [12]:
from sklearn.ensemble import GradientBoostingRegressor
gb_label=GradientBoostingRegressor()
gb_label.fit(x_label,y_label)

fi_df3 = pd.DataFrame({'feature':x_label.columns,'gb_importance':gb_label.feature_importances_}).sort_values(by='gb_importance',ascending=False)
fi_df3

Unnamed: 0,feature,gb_importance
1,area,0.628239
4,price_per_sqft,0.363802
9,Rating,0.002064
2,noOfOpenSides,0.001494
15,Shrine,0.001322
10,Metro,0.001031
8,Rain_Water_Harvesting,0.000895
17,Airport,0.000491
0,sector,0.000409
5,Maintenance_Staff,7e-05


## Permutation Importance

In [13]:
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

x_train_label,x_test_label,y_train_label,y_test_label=train_test_split(x_label,y_label,test_size=0.2,random_state=3)


rf_label=RandomForestRegressor(n_estimators=100,random_state=34)
rf_label.fit(x_train_label,y_train_label)

perm_importance=permutation_importance(rf_label,x_test_label,y_test_label,n_repeats=30,random_state=5)

fi_df4=pd.DataFrame({'feature':x_label.columns,'perm_importance':perm_importance.importances_mean}).sort_values(by='perm_importance',ascending=False)

## Lasso

In [14]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()
x_scaled=scaler.fit_transform(x_label)
lasso=Lasso(alpha=0.01,random_state=8)
lasso.fit(x_scaled,y_label)

fi_df5=pd.DataFrame({'feature':x_label.columns,'lasso_coeff':lasso.coef_}).sort_values(by='lasso_coeff',ascending=False)
fi_df5

Unnamed: 0,feature,lasso_coeff
10,Metro,0.6455521
4,price_per_sqft,0.5792091
9,Rating,0.5657634
1,area,0.3838589
15,Shrine,0.3403139
13,College,0.2538696
5,Maintenance_Staff,0.2059649
8,Rain_Water_Harvesting,0.0684823
7,Vaastu_Compliant,0.06304399
2,noOfOpenSides,0.05581547


#RFE

In [15]:
from sklearn.feature_selection import RFE

# Initialize the base estimator
estimator = RandomForestRegressor()

# Apply RFE on the label-encoded and standardized training data
selector_label = RFE(estimator, n_features_to_select=x_label.shape[1], step=1)
selector_label = selector_label.fit(x_label, y_label)

# Get the selected features based on RFE
selected_features = x_label.columns[selector_label.support_]

# Extract the coefficients for the selected features from the underlying linear regression model
selected_coefficients = selector_label.estimator_.feature_importances_

# Organize the results into a DataFrame
fi_df6 = pd.DataFrame({
    "feature": selected_features,
    "rfe_score": selected_coefficients
}).sort_values(by="rfe_score", ascending=False)

fi_df6

Unnamed: 0,feature,rfe_score
1,area,0.617737
4,price_per_sqft,0.361736
0,sector,0.007342
2,noOfOpenSides,0.003538
9,Rating,0.001821
17,Airport,0.001286
7,Vaastu_Compliant,0.001125
3,possession,0.000763
16,Mall,0.000701
14,Hospital,0.000654


# Linear Regression

In [16]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(x_scaled, y_label)

# Extract coefficients
fi_df7 = pd.DataFrame({
    "feature": x_label.columns,
    "reg_coeffs": lin_reg.coef_
}).sort_values(by="reg_coeffs", ascending=False)

fi_df7

Unnamed: 0,feature,reg_coeffs
4,price_per_sqft,0.588128
9,Rating,0.571852
1,area,0.39498
15,Shrine,0.342495
10,Metro,0.326999
11,Park,0.326999
5,Maintenance_Staff,0.23738
14,Hospital,0.139057
13,College,0.139057
8,Rain_Water_Harvesting,0.0864


## SHAP

In [17]:
! pip install shap



In [18]:
import shap


# Assuming X_label and y_label are defined elsewhere in your code

# Compute SHAP values using the trained Random Forest model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(x_label, y_label)

explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(x_label)

# Summing the absolute SHAP values across all samples to get an overall measure of feature importance
shap_sum = np.abs(shap_values).mean(axis=0)

shap_values

array([[ 6.07695608e-03, -3.52479395e-01, -6.72402403e-04, ...,
         1.53476586e-03, -1.51799280e-04, -5.76026720e-04],
       [-9.35878925e-03, -6.21445399e+00,  8.03745615e-04, ...,
         7.09734811e-04, -5.57260293e-03, -4.87391406e-03],
       [-5.12607754e-03,  1.78988025e+00, -6.36878510e-04, ...,
         1.24992507e-03,  8.31922513e-03, -6.88999020e-03],
       ...,
       [ 1.19722272e-03, -9.22579490e-01,  2.67330264e-05, ...,
         6.78100074e-04,  1.28838067e-03,  5.56129559e-04],
       [-1.28866027e-02, -1.18162068e+00, -4.33194531e-03, ...,
        -8.08542673e-04,  4.23762710e-04, -2.08000593e-03],
       [-5.04258308e-03,  1.22857973e+00,  3.57070192e-04, ...,
        -6.79363673e-05,  4.87180939e-05,  1.14245813e-03]])

In [19]:
fi_df8=pd.DataFrame({'feature':x_label.columns,'SHAP_score':np.abs(shap_values).mean(axis=0)}).sort_values(by='SHAP_score',ascending=False)

In [20]:
merged_feature_selection_results=pd.merge(fi_df1,fi_df2,on='feature').merge(fi_df3,on='feature').merge(fi_df4,on='feature').merge(fi_df5,on='feature').merge(fi_df6,on='feature').merge(fi_df7,on='feature').merge(fi_df8,on='feature').set_index('feature')

In [21]:
merged_feature_selection_results

Unnamed: 0_level_0,corr_coeff,rf_importance,gb_importance,perm_importance,lasso_coeff,rfe_score,reg_coeffs,SHAP_score
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
area,0.089681,0.617562,0.628239,1.643432,0.3838589,0.617737,0.39498,2.099487
noOfOpenSides,-0.022198,0.002562,0.001494,-0.000597,0.05581547,0.003538,0.070628,0.005326
possession,-0.051502,0.001014,4.5e-05,-0.000251,-0.06460123,0.000763,-0.075995,0.002818
price_per_sqft,0.196289,0.361313,0.363802,1.243602,0.5792091,0.361736,0.588128,1.839085
Maintenance_Staff,0.023464,0.000343,7e-05,-0.000429,0.2059649,0.000313,0.23738,0.002163
Water_Storage,0.009157,0.000392,0.0,-0.000528,-0.05433874,0.000476,-0.0851,0.002743
Vaastu_Compliant,0.004249,0.001603,3e-06,-0.000356,0.06304399,0.001125,0.066787,0.002637
Rain_Water_Harvesting,-0.025389,0.000762,0.000895,-0.000304,0.0684823,0.000453,0.0864,0.002619
Rating,0.21045,0.002824,0.002064,0.000679,0.5657634,0.001821,0.571852,0.008009
Metro,0.250304,0.000794,0.001031,0.000114,0.6455521,0.000606,0.326999,0.001773


In [22]:
merged_feature_selection_results=merged_feature_selection_results.divide(merged_feature_selection_results.sum(axis=0),axis=1)

In [23]:
merged_feature_selection_results[['rf_importance'	,'gb_importance','perm_importance','rfe_score','SHAP_score']].mean(axis=1).sort_values(ascending=True)

Unnamed: 0_level_0,0
feature,Unnamed: 1_level_1
Maintenance_Staff,0.000225
Park,0.000234
College,0.000254
Water_Storage,0.000276
Hospital,0.00032
School,0.000343
Mall,0.000383
possession,0.000491
Rain_Water_Harvesting,0.000534
Metro,0.000585


In [24]:
x_label

Unnamed: 0,sector,area,noOfOpenSides,possession,price_per_sqft,Maintenance_Staff,Water_Storage,Vaastu_Compliant,Rain_Water_Harvesting,Rating,Metro,Park,School,College,Hospital,Shrine,Mall,Airport
0,39,2248.84,0,1,0.001668,1.0,1.0,1.0,1.0,3.000,1,1,1,0,0,0,1,1
1,87,182.92,0,1,0.012191,1.0,1.0,0.0,0.0,4.000,0,0,0,1,1,0,1,1
2,65,3077.36,1,1,0.002843,1.0,1.0,1.0,1.0,3.000,1,1,1,1,1,0,1,1
3,29,451.92,1,0,0.001992,1.0,1.0,0.0,0.0,4.500,1,1,1,1,1,1,1,0
4,49,2690.00,0,0,0.003346,0.0,0.0,1.0,1.0,4.000,1,1,1,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2051,89,1614.00,1,0,0.001425,0.0,0.0,0.0,0.0,4.375,0,0,1,1,1,0,1,0
2052,95,1398.80,0,0,0.000357,0.0,0.0,0.0,0.0,4.250,0,0,1,1,1,0,0,1
2053,26,1592.48,0,1,0.000502,0.0,0.0,0.0,0.0,3.500,0,0,1,1,1,0,1,0
2054,72,1796.92,0,1,0.002226,0.0,0.0,0.0,0.0,4.000,1,1,1,1,1,0,1,1


## Features vs Models

In [25]:
from sklearn.model_selection import cross_val_score

rf= RandomForestRegressor(n_estimators=100,random_state=6)

score=cross_val_score(rf,x_label,y_label,cv=5,scoring='r2')

In [26]:
score.mean() # R_2 Score-1

0.9394612888763273

In [27]:
x_label.columns

Index(['sector', 'area', 'noOfOpenSides', 'possession', 'price_per_sqft',
       'Maintenance_Staff', 'Water_Storage', 'Vaastu_Compliant',
       'Rain_Water_Harvesting', 'Rating', 'Metro', 'Park', 'School', 'College',
       'Hospital', 'Shrine', 'Mall', 'Airport'],
      dtype='object')

# After deselecting non important columns

rf=RandomForestRegressor(n_estimators=100,random_state=3)

scores=cross_val_score(rf,x_label.drop(columns=['Maintenance_Staff', 'Water_Storage', 'Vaastu_Compliant',
       'Rain_Water_Harvesting']),y_label,cv=5,scoring='r2')
scores.mean()


export_df_2=x_label.drop(columns=['Maintenance_Staff', 'Water_Storage', 'Vaastu_Compliant',
       'Rain_Water_Harvesting'])

In [28]:
# After deselecting non important columns

rf=RandomForestRegressor(n_estimators=100,random_state=3)

scores=cross_val_score(rf,x_label.drop(columns=[
       'Maintenance_Staff', 'Water_Storage', 'Vaastu_Compliant',
       'Rain_Water_Harvesting','Park', 'School', 'College'

    ]),y_label,cv=5,scoring='r2')
#.drop(columns=[  'Vaastu_Compliant', 'College','Mall','Hospital','School']





In [29]:
scores.mean() # R_2 Score-2

0.9400122629505161

In [30]:
#export_df_2=x_label.drop(columns=['possession', 'Maintenance_Staff','Water_Storage', 'Vaastu_Compliant', 'Rain_Water_Harvesting', 'College', 'Hospital','Airport','Mall','Metro'])

In [32]:
df2=pd.read_csv("/content/Gurgoan_Wrangled_Data_2.csv")
df2.columns

Index(['sector', 'price', 'area', 'noOfOpenSides', 'possession',
       'price_per_sqft', 'Maintenance_Staff', 'Water_Storage',
       'Vaastu_Compliant', 'Rain_Water_Harvesting', 'Rating', 'Metro', 'Park',
       'School', 'College', 'Hospital', 'Shrine', 'Mall', 'Airport'],
      dtype='object')

In [33]:
#df2=df2.drop(columns=['Maintenance_Staff', 'Water_Storage', 'Vaastu_Compliant','Rain_Water_Harvesting', 'Mall','Metro','Park' ,'College','Hospital', 'Airport'],axis=1)

In [34]:
x_label=df2.drop(['price','School','College','Park','Maintenance_Staff', 'Water_Storage',
       'Vaastu_Compliant', 'Rain_Water_Harvesting'],axis=1)
y_label=df2['price']

In [35]:
from sklearn.model_selection import KFold,cross_val_score
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline

In [36]:
y_transferred=np.log1p(y_label)

In [37]:
columns_to_encode=['sector',	'noOfOpenSides',	'possession',		'Metro'	,		'Hospital',	'Shrine',	'Mall',	'Airport']

In [38]:
preprocessor=ColumnTransformer(transformers=[('num',StandardScaler(),['area','Rating','price_per_sqft' ]),('cat',OneHotEncoder(handle_unknown='infrequent_if_exist'),columns_to_encode)],remainder='passthrough')

In [39]:
pipeline=Pipeline([('preprocessor',preprocessor),('regressor',LinearRegression())])

In [40]:
pipeline_2=Pipeline([('preprocessor',preprocessor),('regressor',SVR(kernel='rbf'))])

In [41]:
kfold=KFold(n_splits=10,shuffle=True,random_state=3)
score=cross_val_score(pipeline,x_label,y_transferred,cv=kfold,scoring='r2')
score_2=cross_val_score(pipeline_2,x_label,y_transferred,cv=kfold,scoring='r2')

In [42]:
score.mean(),score_2.mean() # R_2 Score-3

(0.5975075696306067, 0.715846053425163)

In [43]:
from sklearn.model_selection import train_test_split

In [44]:
x_train,x_test,y_train,y_test=train_test_split(x_label,y_transferred,test_size=0.2,random_state=5)

In [45]:
pipeline_2.fit(x_train,y_train)

In [46]:
y_pred=pipeline_2.predict(x_test)

In [47]:
y_pred=np.log1p(y_pred)
y_test=np.log1p(y_test)

In [48]:
from sklearn.metrics import mean_absolute_error


In [49]:
print(mean_absolute_error(y_test,y_pred)) # Error-1

0.09803069462521906


## Ordinal Encoding

In [50]:
from sklearn.preprocessing import OrdinalEncoder

In [51]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['area','Rating','price_per_sqft']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1), columns_to_encode)
    ],
    remainder='passthrough'
)

In [52]:

# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [53]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, x_label, y_transferred, cv=kfold, scoring='r2')

In [54]:

scores.mean(),scores.std() #Error-2

(0.13293838977260586, 0.05973219956122348)

In [55]:
X_train, X_test, y_train, y_test = train_test_split(x_label,y_transferred,test_size=0.2,random_state=42)

In [56]:
pipeline.fit(X_train,y_train)

In [57]:
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)



In [58]:
mean_absolute_error(np.expm1(y_test),y_pred) # Error-3

2.366167749186563

In [59]:
def scorer(model_name, model):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, x_label, y_transferred, cv=kfold, scoring='r2')

    output.append(scores.mean())

    X_train, X_test, y_train, y_test = train_test_split(x_label,y_transferred,test_size=0.2,random_state=42)

    pipeline.fit(X_train,y_train)

    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output

In [60]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

In [61]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [62]:

model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [63]:

model_output

[['linear_reg', 0.13293838977260586, 2.366167749186563],
 ['svr', 0.29028767391219806, 2.0906644009037456],
 ['ridge', 0.1329777895488137, 2.3662144079366985],
 ['LASSO', 0.009564728822201107, 2.5641284524389274],
 ['decision tree', 0.9706694740755436, 0.2133737864077669],
 ['random forest', 0.9838972587025315, 0.1276253185325768],
 ['extra trees', 0.9582015582377632, 0.3454227777416535],
 ['gradient boosting', 0.9843482745164988, 0.20363390093712938],
 ['adaboost', 0.8469922562514511, 0.9935563919761794],
 ['mlp', 0.7914656374509553, 0.9309678699940639],
 ['xgboost', 0.9808262367730615, 0.14089577410550944]]

In [64]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [65]:

model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.983897,0.127625
10,xgboost,0.980826,0.140896
7,gradient boosting,0.984348,0.203634
4,decision tree,0.970669,0.213374
6,extra trees,0.958202,0.345423
9,mlp,0.791466,0.930968
8,adaboost,0.846992,0.993556
1,svr,0.290288,2.090664
0,linear_reg,0.132938,2.366168
2,ridge,0.132978,2.366214


## OneHotEncoding & Ordinal Encoding

In [66]:
df.columns

Index(['sector', 'price', 'area', 'noOfOpenSides', 'possession',
       'price_per_sqft', 'Maintenance_Staff', 'Water_Storage',
       'Vaastu_Compliant', 'Rain_Water_Harvesting', 'Rating', 'Metro', 'Park',
       'School', 'College', 'Hospital', 'Shrine', 'Mall', 'Airport'],
      dtype='object')

In [67]:
columns_to_encode=['Metro'	,		'Hospital',	'Shrine',	'Mall',	'Airport']

In [68]:

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['area','Rating','price_per_sqft']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(handle_unknown='infrequent_if_exist'),['sector','noOfOpenSides','possession'])
    ],
    remainder='passthrough'
)

In [69]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [70]:
df2.columns

Index(['sector', 'price', 'area', 'noOfOpenSides', 'possession',
       'price_per_sqft', 'Maintenance_Staff', 'Water_Storage',
       'Vaastu_Compliant', 'Rain_Water_Harvesting', 'Rating', 'Metro', 'Park',
       'School', 'College', 'Hospital', 'Shrine', 'Mall', 'Airport'],
      dtype='object')

In [71]:
X=df2.drop([ 'Maintenance_Staff', 'Water_Storage',
       'Vaastu_Compliant', 'Rain_Water_Harvesting',
       'School', 'College','Park','price'],axis=1)

In [72]:
y_transformed=y_transferred

In [73]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transferred, cv=kfold, scoring='r2')

In [74]:
scores.mean() # R_2 Score-4

0.5988399176468795

In [75]:

scores.std()

0.06879794986302631

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [77]:

pipeline.fit(X_train,y_train)

In [78]:

y_pred = pipeline.predict(X_test)

In [79]:
y_pred = np.expm1(y_pred)

In [80]:

mean_absolute_error(np.expm1(y_test),y_pred) #Error-4

1.5882827691657224

In [81]:
def scorer(model_name, model):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())

    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

    pipeline.fit(X_train,y_train)

    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output

In [82]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [83]:

model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [84]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [85]:

model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.982929,0.128877
10,xgboost,0.981336,0.159165
4,decision tree,0.968192,0.203374
7,gradient boosting,0.982952,0.216124
6,extra trees,0.960367,0.365233
9,mlp,0.84532,0.962708
8,adaboost,0.84375,1.069996
1,svr,0.74085,1.409429
0,linear_reg,0.59884,1.588283
2,ridge,0.599579,1.613575


## OneHotEncoding With PCA





preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['area','Rating']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(handle_unknown='infrequent_if_exist'),['sector','noOfOpenSides'])
    ],
    remainder='passthrough'
)




from sklearn.decomposition import PCA




pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])




kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
scores.mean()   


def scorer(model_name, model):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])

    
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())

    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

    pipeline.fit(X_train,y_train)

    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output   






model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}



model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))




model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_df.sort_values(['mae'])







## Target Encoder

!pip install category_encoders


import category_encoders as ce




preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['areaWithType','Rating']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(handle_unknown='infrequent_if_exist'),['noOfOpenSides']),
        ('target_enc', ce.TargetEncoder(handle_unknown='value'), ['sector'])
    ],
    remainder='passthrough'
)   


scores.mean(),scores.std()



pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])   


kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



def scorer(model_name, model):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())

    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

    pipeline.fit(X_train,y_train)

    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output  



model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}  



model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))




model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_df.sort_values(['mae'])

## Hyperparameter Tuning

In [86]:
from sklearn.model_selection import GridSearchCV

In [87]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}

In [88]:
! pip install category_encoders


Collecting category_encoders
  Downloading category_encoders-2.8.0-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.0-py3-none-any.whl (85 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.0


In [89]:
import category_encoders as ce

In [90]:
columns_to_encode=['Metro',
       'Hospital', 'Shrine', 'Mall', 'Airport']

In [91]:


# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['area','Rating','price_per_sqft']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(handle_unknown='infrequent_if_exist'),['noOfOpenSides','possession']),
        ('target_enc', ce.TargetEncoder(handle_unknown='value'), ['sector'])
    ],
    remainder='passthrough'
)

In [92]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [93]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [94]:
from sklearn.model_selection import GridSearchCV

In [95]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [96]:

search.fit(X, y_transformed)

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


640 fits failed out of a total of 1280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
640 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382,

In [97]:

final_pipe = search.best_estimator_

In [98]:
search.best_params_

{'regressor__max_depth': 20,
 'regressor__max_features': 'sqrt',
 'regressor__max_samples': 1.0,
 'regressor__n_estimators': 200}

In [99]:

search.best_score_

0.9320246554312556

In [100]:
final_pipe.fit(X,y_transformed)

## Expoting the model

In [101]:
columns_to_encode=['Metro'	,		'Hospital',	'Shrine',	'Mall',	'Airport']

In [102]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['area','Rating','price_per_sqft']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(handle_unknown='infrequent_if_exist'),['sector','noOfOpenSides','possession'])
    ],
    remainder='passthrough'
)

In [103]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=500))
])

In [104]:

pipeline.fit(X,y_transformed)

In [105]:

import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [106]:
with open('df.pkl', 'wb') as file:
         pickle.dump(X, file)

In [107]:

X

Unnamed: 0,sector,area,noOfOpenSides,possession,price_per_sqft,Rating,Metro,Hospital,Shrine,Mall,Airport
0,sector 33,2248.84,1.0,undefined,0.001668,3.000,1,0,0,1,1
1,sector 93,182.92,1.0,undefined,0.012191,4.000,0,1,0,1,1
2,sector 63,3077.36,2.0,undefined,0.002843,3.000,1,1,0,1,1
3,sector 22,451.92,2.0,immediate,0.001992,4.500,1,1,1,1,0
4,sector 43,2690.00,1.0,immediate,0.003346,4.000,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
2051,sector 99,1614.00,2.0,immediate,0.001425,4.375,0,1,0,1,0
2052,sohna road,1398.80,1.0,immediate,0.000357,4.250,0,1,0,0,1
2053,sector 17,1592.48,1.0,undefined,0.000502,3.500,0,1,0,1,0
2054,sector 7,1796.92,1.0,undefined,0.002226,4.000,1,1,0,1,1


Trying out the predictions

In [108]:

X.columns

Index(['sector', 'area', 'noOfOpenSides', 'possession', 'price_per_sqft',
       'Rating', 'Metro', 'Hospital', 'Shrine', 'Mall', 'Airport'],
      dtype='object')

In [109]:
X.iloc[0].values

array(['sector 33', 2248.84, 1.0, 'undefined', 0.0016675263691503, 3.0, 1,
       0, 0, 1, 1], dtype=object)

In [110]:
data = [['sector 50', 90.84, 1.0, 'undefined', 0.005,
       4.0, 1, 1, 1, 1, 1
]]
columns = ['sector',  'area', 'noOfOpenSides', 'possession',
       'price_per_sqft', 'Rating', 'Metro', 'Hospital', 'Shrine', 'Mall',
       'Airport']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df

Unnamed: 0,sector,area,noOfOpenSides,possession,price_per_sqft,Rating,Metro,Hospital,Shrine,Mall,Airport
0,sector 50,90.84,1.0,undefined,0.005,4.0,1,1,1,1,1


In [111]:

np.expm1(pipeline.predict(one_df))

array([0.54956929])

In [112]:
X.dtypes

Unnamed: 0,0
sector,object
area,float64
noOfOpenSides,float64
possession,object
price_per_sqft,float64
Rating,float64
Metro,int64
Hospital,int64
Shrine,int64
Mall,int64


In [113]:
sorted(X['sector'].unique().tolist())

['badshahpur',
 'defence colony',
 'dwarka expressway',
 'farrukh nagar',
 'farukhnagar',
 'golf course road',
 'gwal pahari',
 'hilalpur',
 'mayfield garden',
 'new sector 2',
 'new sector 2 phase 1',
 'new sector 2 phase 2',
 'pataudi',
 'patodi',
 'rajiv chowk',
 'sector 10',
 'sector 102',
 'sector 105',
 'sector 106',
 'sector 108',
 'sector 10a',
 'sector 11',
 'sector 110',
 'sector 112',
 'sector 12',
 'sector 15',
 'sector 17',
 'sector 2',
 'sector 21',
 'sector 22',
 'sector 22b',
 'sector 23',
 'sector 24',
 'sector 25',
 'sector 26',
 'sector 27',
 'sector 28',
 'sector 3',
 'sector 30',
 'sector 33',
 'sector 35',
 'sector 37',
 'sector 37d',
 'sector 38',
 'sector 39',
 'sector 4',
 'sector 40',
 'sector 41',
 'sector 42',
 'sector 43',
 'sector 45',
 'sector 46',
 'sector 47',
 'sector 48',
 'sector 49',
 'sector 5',
 'sector 50',
 'sector 51',
 'sector 52',
 'sector 54',
 'sector 55',
 'sector 56',
 'sector 57',
 'sector 60',
 'sector 62',
 'sector 63',
 'sector 63a',


In [114]:
from google.colab import files

In [115]:
files.download("df.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [116]:
files.download("pipeline.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>