In [1]:
import pandas as pd 
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor

df = pd.read_csv('../../prepared_data/post_feature_selection.csv')

df = df[~(df['floor_category'].isnull())]

# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

X = df.drop(columns=['price'])
y = df['price']

# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

columns_to_encode = ['property_type', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom','balcony', 'built_up_area', 'servant room', 'store room']),
        ('ord', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector'])
    ], 
    remainder='passthrough'
)

# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('RandomForestRegressor', RandomForestRegressor())

])

pipeline.fit(X,y_transformed)

  from pandas.core import (


In [2]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3752 entries, 0 to 3752
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   property_type    3752 non-null   object 
 1   sector           3752 non-null   object 
 2   bedRoom          3752 non-null   int64  
 3   bathroom         3752 non-null   int64  
 4   balcony          3752 non-null   int64  
 5   agePossession    3752 non-null   object 
 6   built_up_area    3752 non-null   float64
 7   servant room     3752 non-null   int64  
 8   store room       3752 non-null   int64  
 9   furnishing_type  3752 non-null   object 
 10  luxury_category  3752 non-null   object 
 11  floor_category   3752 non-null   object 
dtypes: float64(1), int64(5), object(6)
memory usage: 381.1+ KB


In [3]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA
from sklearn.ensemble import HistGradientBoostingRegressor

def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [4]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor

model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
#     'xgboost':XGBRegressor()
}

In [5]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

Traceback (most recent call last):
  File "C:\Users\RISHABH AGRAWAL\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 139, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "C:\Users\RISHABH AGRAWAL\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 371, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "C:\Users\RISHABH AGRAWAL\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\RISHABH AGRAWAL\anaconda3\Lib\site-packages\sklearn\utils\_response.py", line 239, in _get_response_values
    y_pred, pos_label = prediction_method(X), None
                        ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\RISHABH AGRAWAL\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 603, in predict
    Xt = transform.transform(Xt)
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\RISHABH AGRAWAL\anaconda3\Lib\site-

Traceback (most recent call last):
  File "C:\Users\RISHABH AGRAWAL\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 139, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "C:\Users\RISHABH AGRAWAL\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 371, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "C:\Users\RISHABH AGRAWAL\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\RISHABH AGRAWAL\anaconda3\Lib\site-packages\sklearn\utils\_response.py", line 239, in _get_response_values
    y_pred, pos_label = prediction_method(X), None
                        ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\RISHABH AGRAWAL\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 603, in predict
    Xt = transform.transform(Xt)
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\RISHABH AGRAWAL\anaconda3\Lib\site-

Traceback (most recent call last):
  File "C:\Users\RISHABH AGRAWAL\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 139, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "C:\Users\RISHABH AGRAWAL\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 371, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "C:\Users\RISHABH AGRAWAL\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\RISHABH AGRAWAL\anaconda3\Lib\site-packages\sklearn\utils\_response.py", line 239, in _get_response_values
    y_pred, pos_label = prediction_method(X), None
                        ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\RISHABH AGRAWAL\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 603, in predict
    Xt = transform.transform(Xt)
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\RISHABH AGRAWAL\anaconda3\Lib\site-

In [8]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [9]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
7,adaboost,,0.571896
5,random forest,,0.57369
1,svr,,0.573713
6,gradient boosting,,0.575525
3,LASSO,,0.576648
0,linear_reg,,0.577567
2,ridge,,0.58082
4,decision tree,,0.586323
