In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA
from sklearn.ensemble import HistGradientBoostingRegressor

In [2]:
df = pd.read_csv('../../prepared_data/post_feature_selection.csv')

In [3]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 53,7.4,4,4,3,Under Construction,2477.0,0,0,0,Low,High Floor
1,flat,sector 33,1.5,3,3,3,Under Construction,1403.0,0,0,0,Medium,Mid Floor
2,flat,sector 33,1.15,2,2,2,Under Construction,103.0,0,0,0,Low,High Floor
3,house,sector 3,3.5,1,1,0,Moderately Old,3800.0,0,0,0,Low,Low Floor
4,flat,sector 106,1.15,3,4,4,Relatively New,1186.0,1,0,1,High,High Floor


In [4]:
df['furnishing_type'].value_counts()

furnishing_type
0    2479
1    1066
2     208
Name: count, dtype: int64

In [5]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

In [6]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 53,7.4,4,4,3,Under Construction,2477.0,0,0,unfurnished,Low,High Floor
1,flat,sector 33,1.5,3,3,3,Under Construction,1403.0,0,0,unfurnished,Medium,Mid Floor
2,flat,sector 33,1.15,2,2,2,Under Construction,103.0,0,0,unfurnished,Low,High Floor
3,house,sector 3,3.5,1,1,0,Moderately Old,3800.0,0,0,unfurnished,Low,Low Floor
4,flat,sector 106,1.15,3,4,4,Relatively New,1186.0,1,0,semifurnished,High,High Floor


In [7]:
df.to_csv('properties.csv',index=False)

In [7]:
## one hot encoding 
# Import necessary libraries
# from sklearn.preprocessing import OneHotEncoder

# Create the encoder
encoder = OneHotEncoder(sparse=False)

# Assuming that 'df' is your DataFrame and 'column_to_encode' is the column you want to encode
encoded_columns = encoder.fit_transform(df[['property_type','sector']])

# The result is a numpy array of encoded columns
print(encoded_columns)


[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 1.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]




In [8]:
# Assuming that 'df' is your DataFrame and 'encoded_columns' is the one-hot encoded numpy array
encoded_df = pd.DataFrame(encoded_columns, columns=encoder.get_feature_names_out())

# Concatenate the original DataFrame and the encoded DataFrame
df = pd.concat([df, encoded_df], axis=1)


In [9]:
df = df.drop(columns=['property_type','sector'])

In [10]:
df.head()

Unnamed: 0,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,...,sector_sector 88a,sector_sector 89,sector_sector 9,sector_sector 90,sector_sector 91,sector_sector 92,sector_sector 93,sector_sector 95,sector_sector 99,sector_sohna road
0,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,unfurnished,Low,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,unfurnished,Low,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,semifurnished,High,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,unfurnished,High,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
X = df.drop(columns=['price'])
y = df['price']

In [12]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

### Ordinal Encoding

In [13]:
columns_to_encode = ['balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [14]:
# # i want to see if prices of avg price of flats are higher than houses
# import seaborn as sns
# import matplotlib.pyplot as plt

# fig = plt.figure(figsize=(15,10))
# sns.boxplot(x='floor_category',y='price',data=df,hue='property_type')
# plt.title('Boxplot of Prices by Property Type')
# plt.xlabel('Property Type')
# plt.ylabel('Price')
# plt.show()


In [15]:
# df.groupby('property_type')['price'].describe()

In [16]:
# df.groupby('floor_category')['price'].describe()

In [17]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [18]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('gradientBooster', HistGradientBoostingRegressor())
])

In [19]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [20]:
scores.mean(),scores.std()

(0.8706278085564264, 0.01841169538561265)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [22]:
pipeline.fit(X_train,y_train)

In [23]:
y_pred = pipeline.predict(X_test)

In [24]:
y_pred = np.expm1(y_pred)

In [25]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.5662725789036734

In [26]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [27]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor

model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreeRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
#     'mlp': MLPRegressor()
}

In [28]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [29]:
model_output

[['linear_reg', 0.8539809451682288, 0.6505938722522753],
 ['svr', 0.8788616412879398, 0.5415244037316037],
 ['ridge', 0.8543397479712402, 0.6548406810402876],
 ['LASSO', -0.003597657277664901, 1.576391507281618],
 ['decision tree', 0.7987677660441206, 0.66345369066855],
 ['random forest', 0.8708669462164706, 0.5352955294412802],
 ['extra trees', 0.7752554657623165, 0.5729642886500702],
 ['gradient boosting', 0.8572116278722479, 0.6016000203911945],
 ['adaboost', 0.7322892195141641, 0.9141825220348269]]

In [30]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [31]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.870867,0.535296
1,svr,0.878862,0.541524
6,extra trees,0.775255,0.572964
7,gradient boosting,0.857212,0.6016
0,linear_reg,0.853981,0.650594
2,ridge,0.85434,0.654841
4,decision tree,0.798768,0.663454
8,adaboost,0.732289,0.914183
3,LASSO,-0.003598,1.576392


### OneHotEncoding

In [33]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first'),['agePossession'])
    ], 
    remainder='passthrough'
)

In [34]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SGDRegressor())
])

In [35]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [36]:
scores.mean()

0.7948166767077091

In [37]:
scores.std()

0.028543901637684603

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [39]:
pipeline.fit(X_train,y_train)

In [40]:
y_pred = pipeline.predict(X_test)

In [41]:
y_pred = np.expm1(y_pred)

In [42]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.8431686868723942

In [43]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [47]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    #'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    #'mlp': MLPRegressor(),
    #'xgboost':XGBRegressor()
}

In [48]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [49]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [50]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.869806,0.534562
1,svr,0.880594,0.535347
6,gradient boosting,0.857705,0.602483
0,linear_reg,0.854731,0.649188
2,ridge,0.855109,0.652966
4,decision tree,0.800477,0.688925
7,adaboost,0.724451,0.903705
3,LASSO,-0.003598,1.576392


### OneHotEncoding With PCA

In [52]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession'])
    ], 
    remainder='passthrough'
)

In [53]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [54]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [55]:
scores.mean()

0.7616489727969493

In [56]:
scores.std()

0.029832211223006024

In [57]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [59]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    #'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    #'mlp': MLPRegressor(),
    #'xgboost':XGBRegressor()
}

In [60]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [61]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [62]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.822672,0.646112
1,svr,0.814237,0.681276
6,gradient boosting,0.816085,0.704731
7,adaboost,0.699911,0.911507
2,ridge,0.761667,0.913432
0,linear_reg,0.761649,0.913483
4,decision tree,0.615288,0.924797
3,LASSO,-0.003598,1.576392


### Target Encoder

In [None]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [None]:
!pip install category_encoders

In [None]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [None]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [None]:
scores.mean(),scores.std()

In [None]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [None]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [None]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [None]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [None]:
model_df.sort_values(['mae'])

### Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}

In [None]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [None]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [None]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [None]:
search.fit(X, y_transformed)

In [None]:
final_pipe = search.best_estimator_

In [None]:
search.best_params_

In [None]:
search.best_score_

In [None]:
final_pipe.fit(X,y_transformed)

### Exporting the model

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [None]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=500))
])

In [None]:
pipeline.fit(X,y_transformed)

In [None]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [None]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [None]:
X

### Trying out the predictions

In [None]:
X.columns

In [None]:
X.iloc[0].values

In [None]:
data = [['house', 'sector 102', 4, 3, '3+', 'New Property', 2750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df


In [None]:
np.expm1(pipeline.predict(one_df))

In [None]:
X.dtypes

In [None]:
sorted(X['sector'].unique().tolist())