In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

ModuleNotFoundError: No module named 'xgboost'

In [32]:
#!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.1/99.8 MB 1.3 MB/s eta 0:01:16
   ---------------------------------------- 0.2/99.8 MB 1.5 MB/s eta 0:01:07
   ---------------------------------------- 0.2/99.8 MB 1.4 MB/s eta 0:01:09
   ---------------------------------------- 0.3/99.8 MB 1.5 MB/s eta 0:01:06
   ---------------------------------------- 0.4/99.8 MB 1.7 MB/s eta 0:01:00
   ---------------------------------------- 0.5/99.8 MB 1.7 MB/s eta 0:01:00
   ---------------------------------------- 0.6/99.8 MB 1.7 MB/s eta 0:01:00
   ---------------------------------------- 0.7/99.8 MB 1.7 MB/s eta 0:00:59
   ---------------------------------------- 0.8/99.8 MB 1.7 MB/s eta 0:00:58
   -----------------



In [99]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('cleaned_data_v8_1.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'cleaned_data_v8_1.csv'

In [101]:
df.head()

Unnamed: 0,PROPERTY_TYPE,CITY,BEDROOM_NUM,AREA,BALCONY_NUM,Location,price,luxury_category,floor_category,age_category,Landmark_Category
0,Residential Apartment,Kolkata South,2.0,570.5,1.0,Amtala,0.191,Low,Low Floor,Old Property,Few Landmarks
1,Residential Apartment,Kolkata South,3.0,1115.5,1.0,EM Bypass,1.175,Low,Low Floor,Old Property,Several Landmarks
2,Residential Apartment,Kolkata South,3.0,1446.0,1.0,Garia,1.285,Low,Low Floor,Old Property,Several Landmarks
3,Residential Apartment,Kolkata South,3.0,1295.0,1.0,Joka,0.675,Low,Low Floor,Old Property,Many Landmarks
4,Residential Apartment,Kolkata South,2.0,920.0,1.0,Joka,0.47,Low,Low Floor,Old Property,Many Landmarks


In [102]:
X = df.drop(columns=['price'])
y = df['price']

In [103]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

In [7]:
# Log-compress the AREA feature with log1p.
# NOTE(review): this overwrites X['AREA'] in place, so re-running the cell
# applies the transform a second time. Any frame later passed to a fitted
# pipeline's predict() must carry AREA on the same scale as the training data.
# The saved outputs further down (X.iloc[0] shows AREA=570.5) suggest the final
# session did not have this transform applied -- confirm which scale the
# exported model actually expects.
X['AREA'] = np.log1p(X['AREA'])

## Ordinal Encoding

In [8]:
columns_to_encode = ['CITY', 'age_category', 'luxury_category', 'floor_category','PROPERTY_TYPE','Landmark_Category']

In [9]:
# Column-wise preprocessing for the "ordinal encoding" experiment:
#   num  -> standardise the three numeric features
#   cat  -> ordinal codes for every column listed in `columns_to_encode`
#   cat1 -> one-hot encode `Location` (first level dropped, unseen levels ignored)
# Columns not named in any transformer are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'num',
            StandardScaler(),
            ['BEDROOM_NUM', 'AREA', 'BALCONY_NUM'],
        ),
        (
            'cat',
            OrdinalEncoder(),
            columns_to_encode,
        ),
        (
            'cat1',
            OneHotEncoder(drop='first', handle_unknown='ignore'),
            ['Location'],
        ),
    ],
    remainder='passthrough',
)

In [10]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [11]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [12]:
scores.mean(),scores.std()

(0.8160093861151114, 0.017053775350858377)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [14]:
pipeline.fit(X_train,y_train)

In [15]:
y_pred = pipeline.predict(X_test)

In [16]:
y_pred = np.expm1(y_pred)

In [17]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.26477891511719576

In [30]:
def scorer(model_name, model):
    """Score ``model`` behind the notebook-level ``preprocessor``.

    Uses the notebook globals ``X`` and ``y_transformed`` (the log1p-transformed
    target) and reports two figures:

    * the mean R^2 of a shuffled 10-fold cross-validation, and
    * the mean absolute error on a fixed 20 % hold-out split, computed after
      mapping both predictions and targets back with ``np.expm1``.

    Parameters
    ----------
    model_name
        Label stored as the first element of the result.
    model
        Estimator used as the ``'regressor'`` step of the pipeline.

    Returns
    -------
    list
        ``[model_name, mean_cv_r2, holdout_mae]``.
    """
    model_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model),
    ])

    # Cross-validated R^2 on the transformed target.
    cv_splitter = KFold(n_splits=10, shuffle=True, random_state=42)
    fold_r2 = cross_val_score(model_pipeline, X, y_transformed, cv=cv_splitter, scoring='r2')
    mean_r2 = fold_r2.mean()

    # Hold-out MAE, reported on the original target scale.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_transformed, test_size=0.2, random_state=42
    )
    model_pipeline.fit(X_train, y_train)
    predicted = np.expm1(model_pipeline.predict(X_test))
    holdout_mae = mean_absolute_error(np.expm1(y_test), predicted)

    return [model_name, mean_r2, holdout_mae]

In [34]:
# Candidate regressors, keyed by the label shown in the results table.
# Every estimator that draws random numbers is pinned to random_state=42 (the
# seed this notebook already uses for KFold and train_test_split) so the scores
# are repeatable between runs and comparable across the encoding experiments.
# LinearRegression and SVR take no seed; Ridge and Lasso do not use one with
# their default solver settings, so they are left as they were.
model_dict = {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(random_state=42),
    'random forest': RandomForestRegressor(random_state=42),
    'extra trees': ExtraTreesRegressor(random_state=42),
    'gradient boosting': GradientBoostingRegressor(random_state=42),
    'adaboost': AdaBoostRegressor(random_state=42),
    'mlp': MLPRegressor(random_state=42),
    'xgboost': XGBRegressor(random_state=42)
}

In [35]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))









In [36]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [38]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.876084,0.185103
5,random forest,0.867207,0.194675
9,mlp,0.887584,0.19774
10,xgboost,0.883096,0.202123
1,svr,0.878342,0.208164
4,decision tree,0.784206,0.232815
7,gradient boosting,0.846358,0.235777
0,linear_reg,0.816009,0.264779
2,ridge,0.821005,0.265261
8,adaboost,0.613426,0.384104


## One Hot Encoding

In [18]:
columns_to_encode = ['CITY', 'age_category', 'luxury_category', 'floor_category','PROPERTY_TYPE','Landmark_Category','Location']

In [19]:
# Column-wise preprocessing for the "one-hot encoding" experiment:
#   num -> standardise the three numeric features
#   cat -> one-hot encode every column in `columns_to_encode`
#          (first level dropped, unseen levels ignored)
# Columns not named in any transformer are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'num',
            StandardScaler(),
            ['BEDROOM_NUM', 'AREA', 'BALCONY_NUM'],
        ),
        (
            'cat',
            OneHotEncoder(drop='first', handle_unknown='ignore'),
            columns_to_encode,
        ),
    ],
    remainder='passthrough',
)

In [20]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [21]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [22]:
scores.mean()

0.8095763999438976

In [23]:
scores.std()

0.01919081228256809

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [25]:
pipeline.fit(X_train,y_train)

In [26]:
y_pred = pipeline.predict(X_test)

In [27]:
y_pred = np.expm1(y_pred)

In [28]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.26456677869128614

In [29]:
def scorer(model_name, model):
    """Return ``[model_name, mean CV R^2, hold-out MAE]`` for ``model``.

    The model is wrapped in a pipeline behind the notebook-level
    ``preprocessor`` and evaluated on the globals ``X`` / ``y_transformed``
    (the log1p-transformed target):

    * R^2 is averaged over a shuffled 10-fold cross-validation.
    * MAE is measured on a fixed 20 % hold-out split after undoing the
      log1p transform on both predictions and targets with ``np.expm1``.

    Parameters
    ----------
    model_name
        Label placed first in the returned list.
    model
        Estimator used as the ``'regressor'`` pipeline step.
    """
    candidate = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model),
    ])

    # Cross-validation on the transformed target.
    folds = KFold(n_splits=10, shuffle=True, random_state=42)
    r2_per_fold = cross_val_score(candidate, X, y_transformed, cv=folds, scoring='r2')
    r2_mean = r2_per_fold.mean()

    # Single hold-out evaluation on the original target scale.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_transformed, test_size=0.2, random_state=42
    )
    candidate.fit(X_train, y_train)
    y_hat = np.expm1(candidate.predict(X_test))
    mae = mean_absolute_error(np.expm1(y_test), y_hat)

    return [model_name, r2_mean, mae]

In [30]:
# Candidate regressors, keyed by the label shown in the results table.
# Estimators that use randomness are seeded with random_state=42 (matching the
# KFold / train_test_split seed used in this notebook) so that repeated runs
# give the same scores. LinearRegression and SVR have no seed parameter; Ridge
# and Lasso do not use one with their default solver settings.
model_dict = {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(random_state=42),
    'random forest': RandomForestRegressor(random_state=42),
    'extra trees': ExtraTreesRegressor(random_state=42),
    'gradient boosting': GradientBoostingRegressor(random_state=42),
    'adaboost': AdaBoostRegressor(random_state=42),
    'mlp': MLPRegressor(random_state=42),
    'xgboost': XGBRegressor(random_state=42)
}

In [34]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [35]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [36]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.87827,0.193709
5,random forest,0.866696,0.197757
10,xgboost,0.879531,0.202463
1,svr,0.885099,0.203454
9,mlp,0.891293,0.203495
7,gradient boosting,0.842457,0.236536
4,decision tree,0.790031,0.244423
2,ridge,0.831342,0.256068
0,linear_reg,0.809576,0.264567
8,adaboost,0.693552,0.379739


## OneHotEncoding With PCA

In [43]:
# Preprocessing used ahead of the PCA step:
#   num  -> standardise the three numeric features
#   cat  -> ordinal codes for the columns in `columns_to_encode`
#   cat1 -> one-hot encode `Location` as a dense array
#           (first level dropped, unseen levels ignored)
# Columns not named in any transformer are passed through unchanged.
columns_to_encode = [
    'CITY',
    'age_category',
    'luxury_category',
    'floor_category',
    'PROPERTY_TYPE',
    'Landmark_Category',
]
preprocessor = ColumnTransformer(
    transformers=[
        (
            'num',
            StandardScaler(),
            ['BEDROOM_NUM', 'AREA', 'BALCONY_NUM'],
        ),
        (
            'cat',
            OrdinalEncoder(),
            columns_to_encode,
        ),
        (
            'cat1',
            OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False),
            ['Location'],
        ),
    ],
    remainder='passthrough',
)

In [44]:
def scorer(model_name, model):
    """Score ``model`` behind ``preprocessor`` followed by a PCA step.

    The pipeline is preprocessor -> ``PCA(n_components=0.95)`` -> ``model``.
    It is evaluated on the notebook globals ``X`` / ``y_transformed`` (the
    log1p-transformed target):

    * mean R^2 over a shuffled 10-fold cross-validation, and
    * mean absolute error on a fixed 20 % hold-out split, computed after both
      predictions and targets are mapped back with ``np.expm1``.

    Parameters
    ----------
    model_name
        Label stored as the first element of the result.
    model
        Estimator used as the ``'regressor'`` pipeline step.

    Returns
    -------
    list
        ``[model_name, mean_cv_r2, holdout_mae]``.
    """
    reduced_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model),
    ])

    # Cross-validated R^2 on the transformed target.
    splitter = KFold(n_splits=10, shuffle=True, random_state=42)
    cv_r2 = cross_val_score(reduced_pipeline, X, y_transformed, cv=splitter, scoring='r2')
    mean_r2 = cv_r2.mean()

    # Hold-out MAE on the original target scale.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_transformed, test_size=0.2, random_state=42
    )
    reduced_pipeline.fit(X_train, y_train)
    predictions = np.expm1(reduced_pipeline.predict(X_test))
    holdout_mae = mean_absolute_error(np.expm1(y_test), predictions)

    return [model_name, mean_r2, holdout_mae]

In [45]:
# Candidate regressors, keyed by the label shown in the results table.
# Estimators that use randomness are seeded with random_state=42 (matching the
# KFold / train_test_split seed used in this notebook) so that repeated runs
# give the same scores. LinearRegression and SVR have no seed parameter; Ridge
# and Lasso do not use one with their default solver settings.
model_dict = {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(random_state=42),
    'random forest': RandomForestRegressor(random_state=42),
    'extra trees': ExtraTreesRegressor(random_state=42),
    'gradient boosting': GradientBoostingRegressor(random_state=42),
    'adaboost': AdaBoostRegressor(random_state=42),
    'mlp': MLPRegressor(random_state=42),
    'xgboost': XGBRegressor(random_state=42)
}

In [46]:
# Candidate regressors, keyed by the label shown in the results table.
# Estimators that use randomness are seeded with random_state=42 (matching the
# KFold / train_test_split seed used in this notebook) so that repeated runs
# give the same scores. LinearRegression and SVR have no seed parameter; Ridge
# and Lasso do not use one with their default solver settings.
model_dict = {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(random_state=42),
    'random forest': RandomForestRegressor(random_state=42),
    'extra trees': ExtraTreesRegressor(random_state=42),
    'gradient boosting': GradientBoostingRegressor(random_state=42),
    'adaboost': AdaBoostRegressor(random_state=42),
    'mlp': MLPRegressor(random_state=42),
    'xgboost': XGBRegressor(random_state=42)
}

In [47]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [48]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [49]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.85718,0.195354
6,extra trees,0.864384,0.201289
5,random forest,0.852634,0.210771
1,svr,0.854957,0.218478
9,mlp,0.857167,0.220136
7,gradient boosting,0.843554,0.226541
4,decision tree,0.710345,0.307036
2,ridge,0.747409,0.328563
0,linear_reg,0.747393,0.328595
8,adaboost,0.561325,0.439671


## Target Encoder

In [50]:
#!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
   ---------------------------------------- 0.0/81.9 kB ? eta -:--:--
   --------------- ------------------------ 30.7/81.9 kB ? eta -:--:--
   --------------- ------------------------ 30.7/81.9 kB ? eta -:--:--
   ---------------------------------------- 81.9/81.9 kB 657.0 kB/s eta 0:00:00
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3




In [51]:
import category_encoders as ce

# Preprocessing for the "target encoder" experiment:
#   num        -> standardise the three numeric features
#   cat        -> ordinal codes for the columns in `columns_to_encode`
#   cat1       -> dense one-hot for `age_category`
#                 (first level dropped, unseen levels ignored)
#   target_enc -> category_encoders TargetEncoder on `Location` and `CITY`
# Columns not named in any transformer are passed through unchanged.
columns_to_encode = [
    'luxury_category',
    'floor_category',
    'PROPERTY_TYPE',
    'Landmark_Category',
]
preprocessor = ColumnTransformer(
    transformers=[
        (
            'num',
            StandardScaler(),
            ['BEDROOM_NUM', 'AREA', 'BALCONY_NUM'],
        ),
        (
            'cat',
            OrdinalEncoder(),
            columns_to_encode,
        ),
        (
            'cat1',
            OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False),
            ['age_category'],
        ),
        (
            'target_enc',
            ce.TargetEncoder(),
            ['Location', 'CITY'],
        ),
    ],
    remainder='passthrough',
)

In [52]:
def scorer(model_name, model):
    """Evaluate ``model`` behind the notebook-level ``preprocessor``.

    Works on the notebook globals ``X`` / ``y_transformed`` (the
    log1p-transformed target) and produces:

    * the mean R^2 of a shuffled 10-fold cross-validation, and
    * the mean absolute error on a fixed 20 % hold-out split, after both
      predictions and targets are converted back with ``np.expm1``.

    Parameters
    ----------
    model_name
        Label stored as the first element of the result.
    model
        Estimator used as the ``'regressor'`` pipeline step.

    Returns
    -------
    list
        ``[model_name, mean_cv_r2, holdout_mae]``.
    """
    estimator = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model),
    ])

    # Cross-validated R^2 on the transformed target.
    cv = KFold(n_splits=10, shuffle=True, random_state=42)
    r2_scores = cross_val_score(estimator, X, y_transformed, cv=cv, scoring='r2')
    average_r2 = r2_scores.mean()

    # Hold-out MAE on the original target scale.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_transformed, test_size=0.2, random_state=42
    )
    estimator.fit(X_train, y_train)
    price_pred = np.expm1(estimator.predict(X_test))
    price_mae = mean_absolute_error(np.expm1(y_test), price_pred)

    return [model_name, average_r2, price_mae]

In [53]:
# Candidate regressors, keyed by the label shown in the results table.
# Estimators that use randomness are seeded with random_state=42 (matching the
# KFold / train_test_split seed used in this notebook) so that repeated runs
# give the same scores. LinearRegression and SVR have no seed parameter; Ridge
# and Lasso do not use one with their default solver settings.
model_dict = {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(random_state=42),
    'random forest': RandomForestRegressor(random_state=42),
    'extra trees': ExtraTreesRegressor(random_state=42),
    'gradient boosting': GradientBoostingRegressor(random_state=42),
    'adaboost': AdaBoostRegressor(random_state=42),
    'mlp': MLPRegressor(random_state=42),
    'xgboost': XGBRegressor(random_state=42)
}

In [54]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [55]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [56]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.891581,0.179391
10,xgboost,0.882264,0.187359
5,random forest,0.877818,0.193348
7,gradient boosting,0.871812,0.21589
1,svr,0.859694,0.221446
9,mlp,0.855695,0.228363
4,decision tree,0.800789,0.235162
0,linear_reg,0.78476,0.301785
2,ridge,0.784798,0.301822
8,adaboost,0.652385,0.391002


## Hyperparameter Tuning

In [57]:
from sklearn.model_selection import GridSearchCV

In [72]:
# Grid-search space for the 'regressor' step of the pipeline (an
# ExtraTreesRegressor). Keys use the `<step>__<param>` convention so that
# GridSearchCV routes each value to the right pipeline step.
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300, 500],
    'regressor__max_depth': [None, 10, 20, 30],
    # Fraction of rows drawn for each tree; requires the forest to be built
    # with bootstrap=True.
    'regressor__max_samples': [0.1, 0.25, 0.5, 1.0],
    # 1.0 means "consider every feature at each split". It replaces the former
    # 'auto' option, which meant the same thing for regressors but was
    # deprecated in scikit-learn 1.1 and removed in 1.3, where it makes every
    # fit of that candidate fail.
    'regressor__max_features': [1.0, 'sqrt'],
}

In [73]:
# Preprocessing used for hyper-parameter tuning:
#   num        -> standardise the three numeric features
#   cat        -> ordinal codes for the columns in `columns_to_encode`
#   cat1       -> dense one-hot for `age_category`
#                 (first level dropped, unseen levels ignored)
#   target_enc -> category_encoders TargetEncoder on `Location` and `CITY`
# Columns not named in any transformer are passed through unchanged.
columns_to_encode = [
    'luxury_category',
    'floor_category',
    'PROPERTY_TYPE',
    'Landmark_Category',
]
preprocessor = ColumnTransformer(
    transformers=[
        (
            'num',
            StandardScaler(),
            ['BEDROOM_NUM', 'AREA', 'BALCONY_NUM'],
        ),
        (
            'cat',
            OrdinalEncoder(),
            columns_to_encode,
        ),
        (
            'cat1',
            OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False),
            ['age_category'],
        ),
        (
            'target_enc',
            ce.TargetEncoder(),
            ['Location', 'CITY'],
        ),
    ],
    remainder='passthrough',
)

In [74]:
# Pipeline handed to the grid search.
# bootstrap=True is needed because the search space varies `max_samples`,
# which only applies to bootstrapped forests.
# random_state=42 (the seed used for the CV splitter) makes each candidate's
# score repeatable, so the ranking of hyper-parameter settings does not
# change from run to run.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', ExtraTreesRegressor(bootstrap=True, random_state=42))
])

In [75]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [76]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [77]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 160 candidates, totalling 1600 fits


In [78]:
final_pipe = search.best_estimator_

In [79]:
search.best_params_

{'regressor__max_depth': None,
 'regressor__max_features': 'sqrt',
 'regressor__max_samples': 1.0,
 'regressor__n_estimators': 500}

In [80]:
search.best_score_

0.8901644349226535

In [82]:
final_pipe.fit(X,y_transformed)

In [104]:
# Preprocessing for the final model:
#   num        -> standardise the three numeric features
#   cat        -> ordinal codes for the columns in `columns_to_encode`
#   cat1       -> dense one-hot for `age_category`
#                 (first level dropped, unseen levels ignored)
#   target_enc -> category_encoders TargetEncoder on `Location` and `CITY`
# Columns not named in any transformer are passed through unchanged.
columns_to_encode = [
    'luxury_category',
    'floor_category',
    'PROPERTY_TYPE',
    'Landmark_Category',
]
preprocessor = ColumnTransformer(
    transformers=[
        (
            'num',
            StandardScaler(),
            ['BEDROOM_NUM', 'AREA', 'BALCONY_NUM'],
        ),
        (
            'cat',
            OrdinalEncoder(),
            columns_to_encode,
        ),
        (
            'cat1',
            OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False),
            ['age_category'],
        ),
        (
            'target_enc',
            ce.TargetEncoder(),
            ['Location', 'CITY'],
        ),
    ],
    remainder='passthrough',
)

In [105]:
# Final model: preprocessor + 500-tree ExtraTreesRegressor. This is the object
# that is later pickled to 'pipeline.pkl', so the forest is seeded
# (random_state=42, the seed used elsewhere in this notebook) to make the
# exported artefact and the reported MAE reproducible.
# NOTE(review): the grid search above reported max_features='sqrt' (with
# bootstrap=True) as the best setting, but only n_estimators=500 is carried
# over here -- confirm that keeping the other defaults is intentional.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', ExtraTreesRegressor(n_estimators=500, random_state=42))
])

# Fixed 80/20 split, identical to the one used when comparing models.
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [106]:
pipeline.fit(X_train,y_train)

In [107]:
y_pred = pipeline.predict(X_test)

In [108]:
y_pred = np.expm1(y_pred)

In [109]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.1795084198629146

## Trying Out Predictions

In [110]:
X.columns

Index(['PROPERTY_TYPE', 'CITY', 'BEDROOM_NUM', 'AREA', 'BALCONY_NUM',
       'Location', 'luxury_category', 'floor_category', 'age_category',
       'Landmark_Category'],
      dtype='object')

In [111]:
X.iloc[0].values

array(['Residential Apartment', 'Kolkata South', 2.0, 570.5, 1.0,
       'Amtala', 'Low', 'Low Floor', 'Old Property', 'Few Landmarks'],
      dtype=object)

In [123]:
# Hand-written single listing used to smoke-test the fitted pipeline.
# The column order mirrors X.columns so the frame lines up with the training data.
columns = [
    'PROPERTY_TYPE',
    'CITY',
    'BEDROOM_NUM',
    'AREA',
    'BALCONY_NUM',
    'Location',
    'luxury_category',
    'floor_category',
    'age_category',
    'Landmark_Category',
]
data = [
    [
        'Residential Apartment',
        'Kolkata South',
        3.0,
        1750,
        3.0,
        'Amtala',
        'Low',
        'Low Floor',
        'Old Property',
        'Few Landmarks',
    ],
]

# One-row DataFrame with the same schema as the training features.
one_df = pd.DataFrame(data=data, columns=columns)

one_df

Unnamed: 0,PROPERTY_TYPE,CITY,BEDROOM_NUM,AREA,BALCONY_NUM,Location,luxury_category,floor_category,age_category,Landmark_Category
0,Residential Apartment,Kolkata South,3.0,1750,3.0,Amtala,Low,Low Floor,Old Property,Few Landmarks


In [124]:
np.expm1(pipeline.predict(one_df))

array([0.84376046])

In [125]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [126]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [4]:
import pickle

In [5]:
with open('df.pkl','rb') as file:
    df = pickle.load(file)

In [16]:
# Distinct Location values recorded for the 'Kolkata North' city, in order of first appearance.
df.loc[df['CITY'] == 'Kolkata North', 'Location'].unique().tolist()



['Madhyamgram',
 'Narkeldanga',
 'BT Road',
 'Bangur',
 'Dum Dum',
 'Kestopur',
 'Dum Dum Park',
 'Barasat',
 'Belgharia',
 'Ultadanga',
 'Salt Lake',
 'Nager Bazar',
 'Baguiati',
 'Dunlop',
 'Durganagar',
 'Kanchrapara',
 'Noapara',
 'Maniktala',
 'New Barrakpur',
 'Sodepur',
 'Kaikhali',
 'Badu',
 'Lake Town',
 'Sinthi More',
 'Sector 3 Salt Lake',
 'Birati',
 'Nalta',
 'Shobhabazar',
 'Beliaghata',
 'Kankurgachi',
 'Jyangra',
 'Amarpalli',
 'Deshpriya Nagar',
 'Hanapara',
 'Ichapore',
 'Debpukur',
 'Airport',
 'Jessore road Ganga Nagar',
 'Barrackpore',
 'Barat Colony',
 'Badamtala',
 'Raghunathpur',
 'Paikpara',
 'Rathtala',
 'Srirampur',
 'Teghoria',
 'Kalindi',
 'Dum Dum Cantt',
 'VIP Haldiram',
 'EM Bypass',
 'Jessore Road',
 'Sukchar',
 'Girish Park',
 'Khardah',
 'Phoolbagan',
 'Ananya',
 '5/1B MM Feeder Road.Kolkata-700057',
 'Garpar',
 'Bhatpara',
 'Jheel Bagan',
 'Khalisha Kota',
 'Sector 1 Salt Lake',
 'Nimta',
 'South Sinthee',
 'Shyamnagar 24 pgs north',
 'Sinthee',
 'Na