In [37]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [38]:
df = pd.read_csv('gurgaon_properties_post_feature_selection_v2.csv')

In [39]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,study room,servant room,store room,furnishing_type,floor_category
0,house,sector 109,8.75,5.0,5.0,3+,New Property,3600.0,1.0,1.0,1.0,2.0,Medium floor
1,house,sector 48,6.4,4.0,4.0,3,Moderately Old,2700.0,0.0,1.0,1.0,2.0,Medium floor
2,flats,sector 49,2.9,4.0,4.0,3+,Moderately Old,2383.0,0.0,1.0,0.0,0.0,Medium floor
3,flats,manesar,0.9,3.0,3.0,3+,Moderately Old,2089.0,0.0,1.0,0.0,0.0,Medium floor
4,flats,sector 112,3.4,3.0,3.0,2,Relatively New,2225.56,0.0,0.0,0.0,2.0,High floor


In [40]:
df['furnishing_type'].value_counts()

furnishing_type
0.0    2371
2.0     994
1.0     185
Name: count, dtype: int64

In [41]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

In [42]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,study room,servant room,store room,furnishing_type,floor_category
0,house,sector 109,8.75,5.0,5.0,3+,New Property,3600.0,1.0,1.0,1.0,furnished,Medium floor
1,house,sector 48,6.4,4.0,4.0,3,Moderately Old,2700.0,0.0,1.0,1.0,furnished,Medium floor
2,flats,sector 49,2.9,4.0,4.0,3+,Moderately Old,2383.0,0.0,1.0,0.0,unfurnished,Medium floor
3,flats,manesar,0.9,3.0,3.0,3+,Moderately Old,2089.0,0.0,1.0,0.0,unfurnished,Medium floor
4,flats,sector 112,3.4,3.0,3.0,2,Relatively New,2225.56,0.0,0.0,0.0,furnished,High floor


In [43]:
X = df.drop(columns=['price'])
y = df['price']

In [44]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

# Ordinal Encoding

In [45]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'floor_category']

In [46]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), columns_to_encode)
    ])

In [47]:

# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [48]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [49]:
scores.mean(),scores.std()

(0.842899072800518, 0.03364002838939808)

# OneHotEncoding

In [105]:
columns_to_encode = ['property_type', 'balcony','floor_category', 'furnishing_type']

In [106]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(handle_unknown='ignore'),['sector','agePossession','furnishing_type'])
        # ('cat1',OneHotEncoder(drop='first'),['sector','agePossession','furnishing_type'])
    ], 
    remainder='passthrough'
)

In [107]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [108]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [109]:
scores.mean()

0.8417734330848055

In [110]:
scores.std()

0.03497611855086834

In [111]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [112]:
pipeline.fit(X_train,y_train)

In [113]:
y_pred = pipeline.predict(X_test)

In [114]:
y_pred = np.expm1(y_pred)

In [115]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.6565480484522284

In [116]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [117]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

In [118]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [119]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [120]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [121]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.882372,0.537767
6,extra trees,0.871511,0.54228
5,random forest,0.863304,0.565641
9,mlp,0.86694,0.570026
1,svr,0.871073,0.590056
7,gradient boosting,0.851064,0.631512
0,linear_reg,0.841773,0.656548
2,ridge,0.842461,0.65675
4,decision tree,0.771211,0.74702
8,adaboost,0.696314,0.932085


# OneHotEncoding With PCA

In [122]:
# columns_to_encode = ['property_type', 'balcony','floor_category']

In [126]:
# Creating a column transformer for preprocessing
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
#         ('cat', OrdinalEncoder(), columns_to_encode),
#         ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
#     ], 
#     remainder='passthrough'
# )


# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['sector', 'agePossession'])
    ],
    remainder='passthrough'
)

In [127]:
# # Creating a pipeline
# pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('pca', PCA(n_components=0.95)),
#     ('regressor', LinearRegression())
# ])

# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [128]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [129]:
scores.mean()

0.7483004571159259

In [130]:
scores.std()

0.032810416328537796

In [131]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [132]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [134]:
model_output = [] 
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

