In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [2]:
df = pd.read_csv('gurgaon_v6.csv')

In [3]:
df = df.drop(columns=['study room' , 'pooja room' , 'others'])
df.head()

Unnamed: 0,property_type,sector,price,area,bedRoom,bathroom,balcony,floor,agePossession,furnish,features,servant room,store room
0,flats,sector 113,2.0,1665.0,3.0,3.0,2.0,High Floor,under construction,unfurnished,mid,0.0,0.0
1,flats,sector 88a,1.29,1654.0,3.0,3.0,3.0,Mid Floor,new property,unfurnished,mid,0.0,0.0
2,flats,sector 104,0.2,301.0,1.0,1.0,1.0,Mid Floor,relative new,unfurnished,low,0.0,0.0
3,houses,sector 8,0.32,450.0,2.0,3.0,1.0,Low Floor,relative new,unfurnished,low,0.0,0.0
4,flats,sector 69,1.7,2013.0,4.0,4.0,2.0,Mid Floor,relative new,unfurnished,high,0.0,0.0


In [7]:
df.bedRoom.unique()

array([ 3.,  1.,  2.,  4.,  5.,  8.,  6.,  7., 12., 10.,  9., 21., 16.,
       14., 13., 19., 18., 20., 11.])

In [186]:
X = df.drop(columns=['price'])
y = df['price']

In [187]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

### Ordinal Encoding

In [188]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnish', 'features', 'floor']

In [189]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value' , unknown_value=-1), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [190]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [191]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [192]:
scores.mean(),scores.std()

(0.7239402144846412, 0.03428447031642899)

In [193]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [194]:
pipeline.fit(X_train,y_train)

In [195]:
y_pred = pipeline.predict(X_test)

In [196]:
y_pred = np.expm1(y_pred)

In [197]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.8942146552196732

In [198]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [199]:
from sklearn.linear_model import Ridge , Lasso
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import ExtraTreesRegressor , GradientBoostingRegressor , RandomForestRegressor , AdaBoostRegressor
from xgboost import XGBRegressor


model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'xgboost':XGBRegressor()
}

In [200]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [201]:
model_output

[['linear_reg', 0.7239402144846412, 0.8942146552196732],
 ['svr', 0.740523439779592, 0.9004521331824762],
 ['ridge', 0.7239420843857526, 0.8944214005626053],
 ['LASSO', 0.05338403030232257, 1.5136978660995508],
 ['decision tree', 0.7745304219336834, 0.7488333168586765],
 ['random forest', 0.8720007852923658, 0.5139506952069474],
 ['extra trees', 0.8597018435915453, 0.5404160303491252],
 ['gradient boosting', 0.8551503969260448, 0.627150937412268],
 ['adaboost', 0.6372968661550346, 0.9930371272725249],
 ['xgboost', 0.8806258535913025, 0.5208762764014945]]

In [202]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [203]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.872001,0.513951
9,xgboost,0.880626,0.520876
6,extra trees,0.859702,0.540416
7,gradient boosting,0.85515,0.627151
4,decision tree,0.77453,0.748833
0,linear_reg,0.72394,0.894215
2,ridge,0.723942,0.894421
1,svr,0.740523,0.900452
8,adaboost,0.637297,0.993037
3,LASSO,0.053384,1.513698


### OneHotEncoding

In [204]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value' , unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first'),['sector','agePossession','furnish'])
    ], 
    remainder='passthrough'
)

In [205]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [206]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

Traceback (most recent call last):
  File "c:\Users\pc\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_scorer.py", line 137, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "c:\Users\pc\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_scorer.py", line 345, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "c:\Users\pc\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_scorer.py", line 87, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\pc\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\_response.py", line 238, in _get_response_values
    y_pred, pos_label = prediction_method(X), None
                        ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\pc\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 602, in predict
    Xt = transform.transform(Xt)


In [207]:
scores.mean()

nan

In [208]:
scores.std()

nan

In [209]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [210]:
pipeline.fit(X_train,y_train)

In [211]:
y_pred = pipeline.predict(X_test)

In [212]:
y_pred = np.expm1(y_pred)

In [213]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.6797162114945865

In [214]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [215]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'xgboost':XGBRegressor()
}

In [216]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

Traceback (most recent call last):
  File "c:\Users\pc\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_scorer.py", line 137, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "c:\Users\pc\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_scorer.py", line 345, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "c:\Users\pc\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_scorer.py", line 87, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\pc\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\_response.py", line 238, in _get_response_values
    y_pred, pos_label = prediction_method(X), None
                        ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\pc\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 602, in predict
    Xt = transform.transform(Xt)


In [217]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [218]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,,0.464656
5,random forest,,0.505719
9,xgboost,,0.532687
7,gradient boosting,,0.608345
4,decision tree,,0.669981
0,linear_reg,,0.679716
2,ridge,,0.680777
1,svr,,0.890795
8,adaboost,,0.93214
3,LASSO,,1.513698


### OneHotEncoding With PCA

In [219]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value' , unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [220]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [221]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

Traceback (most recent call last):
  File "c:\Users\pc\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_scorer.py", line 137, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "c:\Users\pc\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_scorer.py", line 345, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "c:\Users\pc\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_scorer.py", line 87, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\pc\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\_response.py", line 238, in _get_response_values
    y_pred, pos_label = prediction_method(X), None
                        ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\pc\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 602, in predict
    Xt = transform.transform(Xt)


In [222]:
scores.mean()

nan

In [223]:
scores.std()

nan

In [224]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [225]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'xgboost':XGBRegressor()
}

In [226]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

Traceback (most recent call last):
  File "c:\Users\pc\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_scorer.py", line 137, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "c:\Users\pc\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_scorer.py", line 345, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "c:\Users\pc\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_scorer.py", line 87, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\pc\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\_response.py", line 238, in _get_response_values
    y_pred, pos_label = prediction_method(X), None
                        ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\pc\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 602, in predict
    Xt = transform.transform(Xt)


In [227]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [228]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,,0.732697
6,extra trees,,0.741188
4,decision tree,,0.786838
9,xgboost,,0.991357
7,gradient boosting,,1.019475
1,svr,,1.34858
8,adaboost,,1.417098
3,LASSO,,1.513642
2,ridge,,1.520272
0,linear_reg,,1.520272


### Target Encoder

In [229]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnish', 'features', 'floor']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value' , unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [230]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [231]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [232]:
scores.mean(),scores.std()

(0.8123162405654096, 0.023097067508391265)

In [233]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [234]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'xgboost':XGBRegressor()
}

In [235]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [236]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [237]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.89176,0.447824
9,xgboost,0.887666,0.487203
5,random forest,0.890133,0.495513
7,gradient boosting,0.869699,0.5905
4,decision tree,0.806903,0.625298
0,linear_reg,0.812316,0.732268
2,ridge,0.812373,0.732627
8,adaboost,0.70484,0.822826
1,svr,0.760161,0.870103
3,LASSO,0.053384,1.513698


### Hyperparameter Tuning

In [238]:
from sklearn.model_selection import GridSearchCV

In [239]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}

In [240]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnish', 'features', 'floor']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value' , unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [241]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [242]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [243]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [244]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


640 fits failed out of a total of 1280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
322 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\pc\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\pc\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\pc\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_p

In [245]:
final_pipe = search.best_estimator_

In [246]:
search.best_params_

{'regressor__max_depth': None,
 'regressor__max_features': 'sqrt',
 'regressor__max_samples': 1.0,
 'regressor__n_estimators': 300}

In [272]:
search.best_score_

0.8936343126045699

In [248]:
final_pipe.fit(X,y_transformed)

### Exporting the model

In [249]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [273]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=300))
])

In [274]:
pipeline.fit(X,y_transformed)

In [275]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [276]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [277]:
X

Unnamed: 0,property_type,sector,area,bedRoom,bathroom,balcony,floor,agePossession,furnish,features,servant room,store room
0,flats,sector 113,1665.0,3.0,3.0,2.0,High Floor,under construction,unfurnished,mid,0.0,0.0
1,flats,sector 88a,1654.0,3.0,3.0,3.0,Mid Floor,new property,unfurnished,mid,0.0,0.0
2,flats,sector 104,301.0,1.0,1.0,1.0,Mid Floor,relative new,unfurnished,low,0.0,0.0
3,houses,sector 8,450.0,2.0,3.0,1.0,Low Floor,relative new,unfurnished,low,0.0,0.0
4,flats,sector 69,2013.0,4.0,4.0,2.0,Mid Floor,relative new,unfurnished,high,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3768,houses,sector 108,7331.0,5.0,4.0,3.0,Low Floor,relative new,unfurnished,low,0.0,0.0
3769,flats,sector 69,1538.0,3.0,3.0,1.0,High Floor,relative new,unfurnished,high,0.0,0.0
3770,flats,sector 110,3830.0,5.0,5.0,3.0,Mid Floor,relative new,semifurnished,high,1.0,0.0
3771,flats,sector 85,2408.0,3.0,4.0,3.0,Mid Floor,relative new,semifurnished,high,0.0,0.0


### Trying out the predictions

In [278]:
X.columns

Index(['property_type', 'sector', 'area', 'bedRoom', 'bathroom', 'balcony',
       'floor', 'agePossession', 'furnish', 'features', 'servant room',
       'store room'],
      dtype='object')

In [279]:
X.iloc[0].values

array(['flats', 'sector 113', 1665.0, 3.0, 3.0, 2.0, 'High Floor',
       'under construction', 'unfurnished', 'mid', 0.0, 0.0], dtype=object)

In [284]:
data = [['flats', 'sector 113', 1665.0, 3.0, 3.0, 2.0, 'High Floor',
       'under construction', 'unfurnished', 'mid', 0.0, 0.0]]
columns = ['property_type', 'sector', 'area', 'bedRoom', 'bathroom', 'balcony',
       'floor', 'agePossession', 'furnish', 'features', 'servant room',
       'store room']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df


Unnamed: 0,property_type,sector,area,bedRoom,bathroom,balcony,floor,agePossession,furnish,features,servant room,store room
0,flats,sector 113,1665.0,3.0,3.0,2.0,High Floor,under construction,unfurnished,mid,0.0,0.0


In [285]:
np.expm1(pipeline.predict(one_df))

array([2.0812708])

In [295]:
x = df.drop(columns=['price'])
y = df['price']
y_trans = np.log1p(y)

In [300]:
x_train , x_test , y_train , y_test = train_test_split(x,y_trans , test_size=0.2 , random_state=42) 

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=300))
])

pipeline.fit(x_train , y_train)

y_pred = pipeline.predict(x_test)

err = mean_absolute_error(y_test , y_pred)
print(err)

0.11964001911888916


<bound method Pipeline.score of Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num', StandardScaler(),
                                                  ['bedRoom', 'bathroom',
                                                   'area', 'servant room',
                                                   'store room']),
                                                 ('cat', OrdinalEncoder(),
                                                  ['property_type', 'sector',
                                                   'balcony', 'agePossession',
                                                   'furnish', 'features',
                                                   'floor']),
                                                 ('cat1',
                                                  OneHotEncoder(drop='first',
                                                                sparse_output=Fa

ValueError: X does not contain any features, but ColumnTransformer is expecting 12 features