In [1]:
# import the necessary libraries 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import PCA

In [2]:
df = pd.read_csv("Datasets/gurgaon_properties_post_feature_selection_v2.csv")
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,0.0,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,0.0,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,0.0,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,1.0,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,0.0,High,Mid Floor


In [51]:
df["furnishing_type"].value_counts()

# 0: unfurnished 
# 1: semifurnished
# 2: furnished 

0.0    2349
1.0    1018
2.0     187
Name: furnishing_type, dtype: int64

In [52]:
# convert the furnishing_type column to again categorical (as we can done the encoding into ml pipeline also)
df["furnishing_type"] = df["furnishing_type"].replace({0.0: "unfurnished", 1.0: "semifurnished", 2.0: "furnished"})

In [53]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,unfurnished,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,unfurnished,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,semifurnished,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,unfurnished,High,Mid Floor


In [54]:
# Split the data into X and y 
X = df.drop(columns=["price"])
y = df["price"]

In [55]:
# as the output column(price) is right skewed, apply the log transformation 
y_transformed = np.log1p(y)

## Encoding Techniques 
#### Converting categorical columns into numerical columns 
* 1. Ordinal Encoding 
    * Works well with tree based models (as there is no concept of order(weight) i.e 0 < 1 so 1 has higher order)
* 2. One Hot Encoding  
    * generally works well with linear models 
* 3. Target Encoding 
    * works well with tree based models

### Ordinal Encoding 
* assign the integer value to each category 
    * for ex.: there is sector column in our dataset, so it assigns the integer value to each of the sector like 0, 1, 2, etc.
    

In [56]:
df.columns

Index(['property_type', 'sector', 'price', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [57]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [58]:
# Creating a column transformer for preprocessing 
preprocessor = ColumnTransformer(
    transformers=[
        # scaling down the numerical columns
        ("num", StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        # applying the ordinal encoding to the categorical columns
        ("cat", OrdinalEncoder(), columns_to_encode)
    ],
    # parameters that are not mention in the columntransformer will pass through without any transformation
    remainder = "passthrough" 
)

preprocessor

ColumnTransformer(remainder='passthrough',
                  transformers=[('num', StandardScaler(),
                                 ['bedRoom', 'bathroom', 'built_up_area',
                                  'servant room', 'store room']),
                                ('cat', OrdinalEncoder(),
                                 ['property_type', 'sector', 'balcony',
                                  'agePossession', 'furnishing_type',
                                  'luxury_category', 'floor_category'])])

In [59]:
# Creating the pipeline 
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression())
])

pipeline

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num', StandardScaler(),
                                                  ['bedRoom', 'bathroom',
                                                   'built_up_area',
                                                   'servant room',
                                                   'store room']),
                                                 ('cat', OrdinalEncoder(),
                                                  ['property_type', 'sector',
                                                   'balcony', 'agePossession',
                                                   'furnishing_type',
                                                   'luxury_category',
                                                   'floor_category'])])),
                ('regressor', LinearRegression())])

In [60]:
# Apply the K-fold cross validation 
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring="r2")

In [61]:
scores

array([0.7112841 , 0.75780656, 0.75338985, 0.75435482, 0.7495126 ,
       0.68068362, 0.79641346, 0.72883971, 0.69340419, 0.73740773])

In [62]:
scores.mean(), scores.std()

(0.7363096633436828, 0.03238005754429927)

In [63]:
# Split the data into train and test sets 
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y_transformed,
                                                    test_size=0.2, random_state=42)

In [64]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2843, 12), (711, 12), (2843,), (711,))

In [65]:
# fit the traning data to the pipeline 
pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num', StandardScaler(),
                                                  ['bedRoom', 'bathroom',
                                                   'built_up_area',
                                                   'servant room',
                                                   'store room']),
                                                 ('cat', OrdinalEncoder(),
                                                  ['property_type', 'sector',
                                                   'balcony', 'agePossession',
                                                   'furnishing_type',
                                                   'luxury_category',
                                                   'floor_category'])])),
                ('regressor', LinearRegression())])

In [66]:
# predict the result on the test data 
y_preds = pipeline.predict(X_test)

# convert to the exponential again (as we normalize the data(price column) above)
y_preds = np.expm1(y_preds)

In [67]:
# Calculate the error (MAE)
mean_absolute_error(np.expm1(y_test), y_preds)

0.9463822160089355

#### Observation
* As we are using linear regression, the encoding that we have done is ordinal encoding i.e the this technique assigns the integer value to each of the category, so the linear regressor treating these integer values by giving them order for ex. highet the integer value more the order to the particular category and thats why the error is coming to be about 94 lacks. 
* for ex. if the price of the property is 2 crores our above model will predict the price of the property as 2.94 crores.
* We have to improve the model as this is significantly larger error value.

In [68]:
# Check for other models as well (Using Ordinal Encoding)
# create a function to which will give the MAE score and r2_score 

def give_score1(model_name, model):
    result = []
    result.append(model_name)
    
    pipeline = Pipeline([
            ("preprocessor", preprocessor),
            ("regressor", model)
    ])
    
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring="r2")
    
    # append the scores(cross_val_score) (mean value) to the result
    result.append(scores.mean())
    
    # split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y_transformed, 
                                                        test_size=0.2, random_state=42)
    
    pipeline.fit(X_train, y_train)
    y_preds = pipeline.predict(X_test)
    y_preds = np.expm1(y_preds)
    # calculate the r2 MAE 
    result.append(mean_absolute_error(np.expm1(y_test), y_preds))
    
    return result

In [69]:
models = {
    "linear_regression": LinearRegression(),
    "svr": SVR(),
    "ridge": Ridge(),
    "lasso": Lasso(),
    "decision tree": DecisionTreeRegressor(),
    "random forest": RandomForestRegressor(),
    "extra tree": ExtraTreesRegressor(),
    "gradient boosting": GradientBoostingRegressor(),
    "adaboost": AdaBoostRegressor(),
    "xgboost": XGBRegressor(),
    "mlp": MLPRegressor()
}

In [70]:
model_output = []
for model_name, model in models.items():
    model_output.append(give_score1(model_name, model))

In [71]:
model_output

[['linear_regression', 0.7363096633436828, 0.9463822160089355],
 ['svr', 0.7642012011196353, 0.8472636473483951],
 ['ridge', 0.7363125343993552, 0.946338774185337],
 ['lasso', 0.05943378064493573, 1.528905986892753],
 ['decision tree', 0.7690293420054966, 0.7261947547175063],
 ['random forest', 0.8809957609626086, 0.5341362925969813],
 ['extra tree', 0.8668294569963821, 0.5525688692131147],
 ['gradient boosting', 0.8724594234220306, 0.5763533095613873],
 ['adaboost', 0.7503199757879004, 0.8201034814865114],
 ['xgboost', 0.8917010012719994, 0.5113240614244203],
 ['mlp', 0.8043842375467223, 0.7142654442365062]]

In [72]:
model_scores_df1 = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_scores_df1

Unnamed: 0,name,r2,mae
0,linear_regression,0.73631,0.946382
1,svr,0.764201,0.847264
2,ridge,0.736313,0.946339
3,lasso,0.059434,1.528906
4,decision tree,0.769029,0.726195
5,random forest,0.880996,0.534136
6,extra tree,0.866829,0.552569
7,gradient boosting,0.872459,0.576353
8,adaboost,0.75032,0.820103
9,xgboost,0.891701,0.511324


In [73]:
model_scores_df1.sort_values(["mae"])

Unnamed: 0,name,r2,mae
9,xgboost,0.891701,0.511324
5,random forest,0.880996,0.534136
6,extra tree,0.866829,0.552569
7,gradient boosting,0.872459,0.576353
10,mlp,0.804384,0.714265
4,decision tree,0.769029,0.726195
8,adaboost,0.75032,0.820103
1,svr,0.764201,0.847264
2,ridge,0.736313,0.946339
0,linear_regression,0.73631,0.946382


### One Hot Encoding 

In [74]:
# Creating the columntransformer for preprocessing 
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(),  ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ("cat", OrdinalEncoder(), columns_to_encode),
        # number of dimensions will increase in this case 
        ("cat1", OneHotEncoder(drop="first"), ['sector','agePossession','furnishing_type'])
    ],
    remainder="passthrough"
)

In [75]:
# creating the pipeline 
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression())
])

In [76]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring="r2")

In [77]:
scores.mean(), scores.std()

(0.8546151830349675, 0.01599250037818318)

In [78]:
# function to get the r2_score and MAE scores of other models (Using One Hot Encoding)
def get_score2(model_name, model):
    result = []
    result.append(model_name)
    
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", model)
    ])
    
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring="r2")
    
    result.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y_transformed, 
                                                        test_size=0.2, random_state=42)
    
    pipeline.fit(X_train, y_train)
    y_preds = pipeline.predict(X_test)
    y_preds = np.expm1(y_preds)
    
    result.append(mean_absolute_error(np.expm1(y_test), y_preds))
    
    return result

In [79]:
models = {
    "linear_regression": LinearRegression(),
    "svr": SVR(),
    "ridge": Ridge(),
    "lasso": Lasso(),
    "decision tree": DecisionTreeRegressor(),
    "random forest": RandomForestRegressor(),
    "extra tree": ExtraTreesRegressor(),
    "gradient boosting": GradientBoostingRegressor(),
    "adaboost": AdaBoostRegressor(),
    "xgboost": XGBRegressor(),
    "mlp": MLPRegressor()
}

In [80]:
model_scores = []
for model_name, model in models.items():
    model_scores.append(get_score2(model_name, model))

In [81]:
model_scores_df2 = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_scores_df2.sort_values(["mae"])

Unnamed: 0,name,r2,mae
9,xgboost,0.891701,0.511324
5,random forest,0.880996,0.534136
6,extra tree,0.866829,0.552569
7,gradient boosting,0.872459,0.576353
10,mlp,0.804384,0.714265
4,decision tree,0.769029,0.726195
8,adaboost,0.75032,0.820103
1,svr,0.764201,0.847264
2,ridge,0.736313,0.946339
0,linear_regression,0.73631,0.946382


### One Hot Encoding (using PCA)

In [82]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [83]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [84]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [85]:
scores.mean(), scores.std()

(0.06225201431451136, 0.01986059407164015)

In [86]:
def scorer(model_name, model):
    
    result = []
    
    result.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    result.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    result.append(mean_absolute_error(np.expm1(y_test), y_pred))
    
    return result

In [87]:
models = {
    "linear_regression": LinearRegression(),
    "svr": SVR(),
    "ridge": Ridge(),
    "lasso": Lasso(),
    "decision tree": DecisionTreeRegressor(),
    "random forest": RandomForestRegressor(),
    "extra tree": ExtraTreesRegressor(),
    "gradient boosting": GradientBoostingRegressor(),
    "adaboost": AdaBoostRegressor(),
    "xgboost": XGBRegressor(),
    "mlp": MLPRegressor()
}

In [88]:
model_output = []
for model_name,model in models.items():
    model_output.append(scorer(model_name, model))

In [89]:
model_output

[['linear_regression', 0.06225201431451136, 1.5267074088549337],
 ['svr', 0.21807348496172357, 1.3611626793047285],
 ['ridge', 0.062252015161791484, 1.5267074078044667],
 ['lasso', 0.059675784467370006, 1.5287392557835464],
 ['decision tree', 0.6961824748125831, 0.7572895569516725],
 ['random forest', 0.7629111315851719, 0.6539335664318153],
 ['extra tree', 0.7397182679246979, 0.7025480450308214],
 ['gradient boosting', 0.610603787945885, 0.9879063301936336],
 ['adaboost', 0.31050419435027565, 1.419457187504457],
 ['xgboost', 0.7698872722858927, 0.679861871046859],
 ['mlp', 0.21062861955584128, 1.4305097831234863]]

In [90]:
model_scores_df3 = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_scores_df3.sort_values(["mae"])

Unnamed: 0,name,r2,mae
5,random forest,0.762911,0.653934
9,xgboost,0.769887,0.679862
6,extra tree,0.739718,0.702548
4,decision tree,0.696182,0.75729
7,gradient boosting,0.610604,0.987906
1,svr,0.218073,1.361163
8,adaboost,0.310504,1.419457
10,mlp,0.210629,1.43051
2,ridge,0.062252,1.526707
0,linear_regression,0.062252,1.526707


### Target Encoding 
* also called as mean encoding 

In [91]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category',
                     'floor_category']


preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ]
)

In [92]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [93]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [94]:
scores.mean(),scores.std()

(0.829521918225536, 0.018384463379122876)

In [95]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [96]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [97]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [98]:
model_output

[['linear_reg', 0.829521918225536, 0.7130109838896391],
 ['svr', 0.7829174051174261, 0.8188507474317219],
 ['ridge', 0.8295359700269425, 0.7135228301064968],
 ['LASSO', 0.05943378064493573, 1.528905986892753],
 ['decision tree', 0.835153787783369, 0.5414629892990243],
 ['random forest', 0.9017725151980518, 0.4506988250215617],
 ['extra trees', 0.9018301190445073, 0.45657646852259665],
 ['gradient boosting', 0.8890891506862595, 0.5108715043147913],
 ['adaboost', 0.8141501872609236, 0.6698293917424857],
 ['mlp', 0.849694649212265, 0.6169847308308207],
 ['xgboost', 0.9006427855783141, 0.48340860167971467]]

In [99]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [100]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.901773,0.450699
6,extra trees,0.90183,0.456576
10,xgboost,0.900643,0.483409
7,gradient boosting,0.889089,0.510872
4,decision tree,0.835154,0.541463
9,mlp,0.849695,0.616985
8,adaboost,0.81415,0.669829
0,linear_reg,0.829522,0.713011
2,ridge,0.829536,0.713523
1,svr,0.782917,0.818851


### Hyperparameter Tuning

In [101]:
from sklearn.model_selection import GridSearchCV

In [102]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}

In [103]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [104]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [105]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [106]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [107]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


GridSearchCV(cv=KFold(n_splits=10, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['bedRoom',
                                                                          'bathroom',
                                                                          'built_up_area',
                                                                          'servant '
                                                                          'room',
                                                                          'store '
                                                                          'room']),
                           

In [108]:
final_pipe = search.best_estimator_

In [109]:
search.best_params_

{'regressor__max_depth': None,
 'regressor__max_features': 'sqrt',
 'regressor__max_samples': 1.0,
 'regressor__n_estimators': 200}

In [110]:
search.best_score_

0.9028958808239503

In [111]:
final_pipe.fit(X, y_transformed)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num', StandardScaler(),
                                                  ['bedRoom', 'bathroom',
                                                   'built_up_area',
                                                   'servant room',
                                                   'store room']),
                                                 ('cat', OrdinalEncoder(),
                                                  ['property_type', 'sector',
                                                   'balcony', 'agePossession',
                                                   'furnishing_type',
                                                   'luxury_category',
                                                   'floor_category']),
                                                 ('cat1',
                                                  OneHo

In [157]:
df.columns

Index(['property_type', 'sector', 'price', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [158]:
columns_to_encode

['property_type',
 'sector',
 'balcony',
 'agePossession',
 'furnishing_type',
 'luxury_category',
 'floor_category']

### Exporting the model 

In [179]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [184]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=500))
])

In [None]:
pipeline.fit(X, y_transformed)

In [166]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [167]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [134]:
X

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,Low Floor
1,flat,sector 89,2.0,2.0,2,New Property,1226.0,1.0,0.0,unfurnished,Low,Mid Floor
2,flat,sohna road,2.0,2.0,1,New Property,1000.0,0.0,0.0,unfurnished,Low,High Floor
3,flat,sector 92,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,semifurnished,High,Mid Floor
4,flat,sector 102,2.0,2.0,1,Relatively New,582.0,0.0,1.0,unfurnished,High,Mid Floor
...,...,...,...,...,...,...,...,...,...,...,...,...
3549,flat,sector 84,2.0,2.0,1,Relatively New,532.0,0.0,0.0,unfurnished,Medium,Mid Floor
3550,house,sector 109,5.0,5.0,3+,Relatively New,6228.0,1.0,1.0,unfurnished,High,Low Floor
3551,flat,sector 2,1.0,1.0,1,Moderately Old,665.0,0.0,0.0,semifurnished,Medium,Mid Floor
3552,house,sector 43,5.0,6.0,3,Moderately Old,5490.0,1.0,1.0,unfurnished,Medium,Mid Floor


In [163]:
X.iloc[0].values

array(['flat', 'sector 36', 3.0, 2.0, '2', 'New Property', 850.0, 0.0,
       0.0, 'unfurnished', 'Low', 'Low Floor'], dtype=object)

In [164]:
data = [['house', 'sector 102', 5, 3, '3+', 'New Property', 4000, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,sector 102,5,3,3+,New Property,4000,0,0,unfurnished,Low,Low Floor


In [165]:
np.expm1(pipeline.predict(one_df))

array([3.81439886])