In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold,cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier, XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA

In [3]:

df = pd.read_csv('gurgaon_properties_post_feature_selection_v2.csv')

In [5]:
df.sample(5)

Unnamed: 0,sector,property_type,price,bedRoom,bathroom,balcony,floor_category,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category
1543,sector 107,flat,1.15,2,2,3,Low floor,Relatively new property,1281.0,0,0,1,Low
666,sector 37c,flat,0.9,3,3,3,Low floor,Under Construction,1500.0,0,0,1,Medium
1620,sector 53,flat,6.44,4,4,3,Mid floor,Moderately New Property,3068.0,1,0,0,Medium
949,sector 69,house,2.95,4,3,2,Mid floor,New Property,1799.99,0,0,2,Medium
837,sector 6,house,0.75,2,4,1,Mid floor,Moderately New Property,713.0,0,0,1,Medium


In [7]:
df['furnishing_type'].value_counts()

furnishing_type
1    2374
0     995
2     185
Name: count, dtype: int64

In [9]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
df['furnishing_type'] = df['furnishing_type'].replace({0 : "unfurnished", 1 : "semifurnished", 2 : " furnished" })

In [11]:
df.head()

Unnamed: 0,sector,property_type,price,bedRoom,bathroom,balcony,floor_category,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category
0,sector 95,flat,0.45,2,2,2,High floor,New Property,695.0,0,0,semifurnished,Low
1,sector 78,flat,1.2,3,3,2,Mid floor,Moderately New Property,2045.0,0,0,furnished,Low
2,manesar,flat,0.91,3,3,3+,Mid floor,Moderately New Property,1900.0,1,0,semifurnished,Low
3,sector 108,flat,3.35,3,4,2,High floor,New Property,2525.0,0,0,semifurnished,Medium
4,sector 108,flat,1.9,2,2,2,Mid floor,Relatively new property,1250.0,0,0,semifurnished,Medium


In [13]:
x = df.drop(columns = ["price"])
y = df["price"]

In [15]:
x.sample(16)

Unnamed: 0,sector,property_type,bedRoom,bathroom,balcony,floor_category,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category
2539,sector 109,house,5,6,3+,Low floor,Relatively new property,3600.0,1,1,unfurnished,Medium
1047,sector 22,flat,4,6,3+,Mid floor,New Property,2778.0,1,0,furnished,Low
2949,sector 37d,flat,3,3,3,Mid floor,Relatively new property,1639.0,0,0,semifurnished,Medium
3506,sector 72,flat,4,4,3+,Low floor,New Property,4000.0,1,1,unfurnished,High
176,sector 84,flat,4,4,3+,Mid floor,Relatively new property,2998.0,1,0,semifurnished,Medium
2470,sohna road,flat,2,2,2,Mid floor,Relatively new property,640.0,0,0,semifurnished,Low
2301,sohna road,flat,2,2,2,Mid floor,Relatively new property,789.0,0,0,furnished,Low
2844,sector 92,flat,3,3,1,High floor,New Property,711.0,0,0,furnished,Medium
546,sector 37c,flat,3,4,3,Mid floor,Relatively new property,1660.0,1,0,semifurnished,Low
2424,sector 108,flat,2,2,2,High floor,New Property,1250.0,0,0,unfurnished,Medium


In [17]:
y.head(4)

0    0.45
1    1.20
2    0.91
3    3.35
Name: price, dtype: float64

In [19]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

## Ordinal Encoding

In [22]:
columns_to_encode = ['property_type', 'balcony',  'furnishing_type', 'luxury_category', 'floor_category','sector','agePossession']

In [24]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [26]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [28]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, x, y_transformed, cv=kfold, scoring='r2')

In [29]:
scores.mean(),scores.std()

(0.7313228676389815, 0.02727766817361159)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(x,y_transformed,test_size=0.2,random_state=42)

In [34]:
pipeline.fit(X_train,y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [36]:
y_pred = pipeline.predict(X_test)

In [38]:
y_pred = np.expm1(y_pred)

In [40]:
mean_absolute_error(np.expm1(y_test),y_pred)   # model does the 0.94 cr of mistake

0.9476743135753907

In [42]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, x, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(x,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [44]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [46]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [47]:
model_output

[['linear_reg', 0.7313228676389815, 0.9476743135753907],
 ['svr', 0.7564684650028098, 0.9417246668314785],
 ['ridge', 0.7313262108040692, 0.94765969553837],
 ['LASSO', 0.0544403340091963, 1.7449421903045472],
 ['decision tree', 0.7916938214964003, 0.680417679352207],
 ['random forest', 0.8832588068168168, 0.5606262401909247],
 ['extra trees', 0.8713897894648985, 0.5802679610403502],
 ['gradient boosting', 0.876221910676245, 0.5960394881396677],
 ['adaboost', 0.7648860276510635, 0.8841617465262086],
 ['mlp', 0.8006794657947344, 0.8003058904407983],
 ['xgboost', 0.8936007324322597, 0.5446517783755491]]

In [48]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [49]:
model_df.sort_values(['mae'])    # xgboost performing the best

Unnamed: 0,name,r2,mae
10,xgboost,0.893601,0.544652
5,random forest,0.883259,0.560626
6,extra trees,0.87139,0.580268
7,gradient boosting,0.876222,0.596039
4,decision tree,0.791694,0.680418
9,mlp,0.800679,0.800306
8,adaboost,0.764886,0.884162
1,svr,0.756468,0.941725
2,ridge,0.731326,0.94766
0,linear_reg,0.731323,0.947674


## OneHotEncoding

In [51]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',handle_unknown='ignore'),['sector','agePossession','furnishing_type'])
    ], 
    remainder='passthrough'
)

In [52]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [53]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, x, y_transformed, cv=kfold, scoring='r2')



In [54]:
scores.mean()

0.8549614404199269

In [55]:
scores.std()

0.022089434038714998

In [56]:
X_train, X_test, y_train, y_test = train_test_split(x,y_transformed,test_size=0.2,random_state=42)


In [57]:
pipeline.fit(X_train,y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [69]:
y_pred = pipeline.predict(X_test)

In [71]:
y_pred = np.expm1(y_pred)

In [73]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.7101056770256825

In [75]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, x, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(x,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [77]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}


In [79]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [80]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [81]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.89346,0.539757
6,extra trees,0.894348,0.548
10,xgboost,0.897083,0.556017
9,mlp,0.865674,0.586713
7,gradient boosting,0.876918,0.611779
4,decision tree,0.801276,0.66573
0,linear_reg,0.854961,0.710106
2,ridge,0.855117,0.712586
8,adaboost,0.764902,0.905338
1,svr,0.760562,0.93797


## OneHotEncoding With PCA

In [508]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',handle_unknown='ignore'),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [524]:
# Creating a pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), ['sector','agePossession'])
    ], 
    remainder='passthrough'
)


In [526]:
# K-fold cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, x, y_transformed, cv=kfold, scoring='r2')

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Lenovo\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Lenovo\anaconda3\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Lenovo\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Lenovo\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 406, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Lenovo\anaconda3\Lib\site-packages\joblib\memory.py", line 326, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Lenovo\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 1310, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Lenovo\anaconda3\Lib\site-packages\sklearn\utils\_set_output.py", line 313, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Lenovo\anaconda3\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Lenovo\anaconda3\Lib\site-packages\sklearn\decomposition\_pca.py", line 474, in fit_transform
    U, S, _, X, x_is_centered, xp = self._fit(X)
                                    ^^^^^^^^^^^^
  File "C:\Users\Lenovo\anaconda3\Lib\site-packages\sklearn\decomposition\_pca.py", line 495, in _fit
    raise TypeError(
TypeError: PCA only support sparse inputs with the "arpack" and "covariance_eigh" solvers, while "full" was passed. See TruncatedSVD for a possible alternative.


In [528]:
scores.mean()

0.8549567922869696

In [530]:
scores.std()

0.8549567922869696

In [83]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, x, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(x,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [None]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [536]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [537]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])


In [538]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.842111,0.691337
10,xgboost,0.834909,0.70769
7,gradient boosting,0.8262,0.720864
9,mlp,0.818426,0.724637
5,random forest,0.829764,0.729813
1,svr,0.821669,0.752033
2,ridge,0.759767,0.909685
0,linear_reg,0.759704,0.91017
8,adaboost,0.697061,0.976764
4,decision tree,0.632053,0.990512


## Target Encoder

In [85]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [86]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])


In [109]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, x, y_transformed, cv=kfold, scoring='r2')
scores.mean(),scores.std()


Traceback (most recent call last):
  File "C:\Users\Lenovo\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 152, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "C:\Users\Lenovo\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 400, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "C:\Users\Lenovo\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Lenovo\anaconda3\Lib\site-packages\sklearn\utils\_response.py", line 242, in _get_response_values
    y_pred, pos_label = prediction_method(X), None
                        ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Lenovo\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 788, in predict
    Xt = transform.transform(Xt)
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Lenovo\anaconda3\Lib\site-packages\sklearn\utils\_set_output.py", line 316, in w

(nan, nan)

In [None]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, x, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(x,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)

    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [None]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}



In [None]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [None]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_df.sort_values(['mae'])

In [None]:
## Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}

In [None]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [None]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])



In [None]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [None]:
search.fit(x, y_transformed)

In [None]:
final_pipe = search.best_estimator_


In [None]:
search.best_params_

In [None]:
search.best_score_

In [None]:
final_pipe.fit(x,y_transformed)

## Exporting the model

In [101]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)


In [103]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=500))
])


In [105]:
pipeline.fit(x,y_transformed)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,500
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [106]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)


In [107]:
with open('df.pkl', 'wb') as file:
    pickle.dump(x, file)


In [111]:
x

Unnamed: 0,sector,property_type,bedRoom,bathroom,balcony,floor_category,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category
0,sector 95,flat,2,2,2,High floor,New Property,695.0,0,0,semifurnished,Low
1,sector 78,flat,3,3,2,Mid floor,Moderately New Property,2045.0,0,0,furnished,Low
2,manesar,flat,3,3,3+,Mid floor,Moderately New Property,1900.0,1,0,semifurnished,Low
3,sector 108,flat,3,4,2,High floor,New Property,2525.0,0,0,semifurnished,Medium
4,sector 108,flat,2,2,2,Mid floor,Relatively new property,1250.0,0,0,semifurnished,Medium
...,...,...,...,...,...,...,...,...,...,...,...,...
3549,sector 102,flat,3,3,2,Mid floor,Relatively new property,1493.0,1,0,semifurnished,High
3550,sector 76,flat,2,2,2,High floor,Under Construction,665.0,0,0,semifurnished,Medium
3551,sector 65,flat,3,3,2,High floor,Moderately New Property,4278.0,0,1,semifurnished,Medium
3552,sector 86,flat,3,3,1,High floor,Relatively new property,1746.0,0,0,semifurnished,Medium


In [None]:
## Trying out the predictions

In [None]:
X.columns

In [None]:
X.iloc[0].values

In [None]:
data = [['house', 'sector 102', 4, 3, '3+', 'New Property', 2750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df

In [None]:
np.expm1(pipeline.predict(one_df))

In [17]:
x.dtypes

sector              object
property_type       object
bedRoom              int64
bathroom             int64
balcony             object
floor_category      object
agePossession       object
built_up_area      float64
servant room         int64
store room           int64
furnishing_type     object
luxury_category     object
dtype: object

In [None]:
sorted(x['sector'].unique().tolist())