<a href="https://colab.research.google.com/github/Shahbaz894/meachine-learning-/blob/main/model_selection_by_shahbaz.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA


In [2]:
df=pd.read_csv('/content/sample_data/gurgaon_properties_post_feature_selection_v2.csv')

In [3]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,0.0,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,0.0,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,0.0,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,1.0,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,0.0,High,Mid Floor


In [4]:
df['furnishing_type']=df['furnishing_type'].replace({0.0 :"unfurnished",1.0 : 'semifurnished',2.0:'furnished'})

In [5]:
X=df.drop(columns=['price'])
y=df['price']

In [6]:
#since our price column is write skewed so for normalized the data we used log transform function

y_transformed=np.log1p(y)

# **Ordinal Encoding**
the problem with ordinal encoding is that it assign a numer 0,1,2,3 ,when we apply a linear regression model on it give weighted according to number value it gives more value to 3

In [7]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']


In [8]:
#creating a column transform
preprocessor =ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room',]),
        ('cat',OrdinalEncoder(),columns_to_encode)
    ],
    remainder='passthrough'
)

In [9]:
#creating a pipeline
pipeline=Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',LinearRegression())

])

In [10]:
#k fold cross validation
kfold=KFold (n_splits=10,shuffle=True, random_state=42)
scores=cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')

In [11]:
scores.mean(),scores.std()

(0.7363096633436828, 0.0323800575442993)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)


In [13]:
pipeline.fit(X_train,y_train)

In [14]:
y_pred=pipeline.predict(X_test)

In [15]:
y_pred=np.expm1(y_pred)

In [16]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.9463822160089363

In [17]:
from os import pipe

def scorer(model_name, model):
    # creating a column transform
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room', ]),
            ('cat', OrdinalEncoder(), columns_to_encode)
        ],
        remainder='passthrough'
    )

    output = []
    output.append(model_name)
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    output.append(scores.mean())

    X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    y_pred = np.expm1(y_pred)
    output.append(mean_absolute_error(np.expm1(y_test), y_pred))

    return output


In [18]:


from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor


model_dict={
     'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()


}

In [19]:

model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [20]:
model_output

[['linear_reg', 0.7363096633436828, 0.9463822160089363],
 ['svr', 0.7642021216646014, 0.8472636473483917],
 ['ridge', 0.7363125343993554, 0.9463387741853387],
 ['LASSO', 0.05943378064493573, 1.528905986892753],
 ['decision tree', 0.7758231907706923, 0.7406639834421024],
 ['random forest', 0.8819934544902102, 0.5330225432379374],
 ['extra trees', 0.8674534052343915, 0.5495038012383477],
 ['gradient boosting', 0.8725466254480926, 0.5764173101053007],
 ['adaboost', 0.7540917480032704, 0.8516004733620272],
 ['mlp', 0.8146315808628287, 0.7520664738538148],
 ['xgboost', 0.8894876835260124, 0.5040475127230885]]

In [21]:
model_df=pd.DataFrame(model_output,columns=['name','r2','mae'])

In [22]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.889488,0.504048
5,random forest,0.881993,0.533023
6,extra trees,0.867453,0.549504
7,gradient boosting,0.872547,0.576417
4,decision tree,0.775823,0.740664
9,mlp,0.814632,0.752066
1,svr,0.764202,0.847264
8,adaboost,0.754092,0.8516
2,ridge,0.736313,0.946339
0,linear_reg,0.73631,0.946382


# **One Hot Ecncoding With PCA**

In [23]:
#creating a column transformer for processing
preprocessor = ColumnTransformer(
    transformers=[
       ( 'num',StandardScaler(),['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
       ('cat', OrdinalEncoder(),columns_to_encode),
       ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ],
    remainder='passthrough'
)

In [24]:
#creating pipline
pipeline=Pipeline([
    ('preprocessor',preprocessor),
    ('pca',PCA(n_components=0.95)),
    ('regressor',LinearRegression())
])

In [25]:
#kfold clustring validation
kfold=KFold(n_splits=10,shuffle=True,random_state=42)
scores=cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')

In [26]:
scores.mean()

0.06225201431451136

In [27]:
scores.std()

0.019860594071640144

In [28]:
def scorer(model_name,model):
  output=[]
  output.append(model_name)
  pipeline=Pipeline([
      ('preprocessor',preprocessor),
      ('pca',PCA(n_components=0.95)),
      ('regressor',model)
  ])
  #kfold cross validation
  kfold=KFold(n_splits=10,shuffle=True,random_state=42)
  scores=cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')
  output.append(scores.mean())
  X_train,X_test,y_train,y_test=train_test_split(X,y_transformed,test_size=0.2,random_state=42)
  pipeline.fit(X_train,y_train)
  y_pred=pipeline.predict(X_test)
  y_pred=np.expm1(y_pred)
  output.append(mean_absolute_error(np.expm1(y_test),y_pred))

  return output






In [29]:
model_dict={
     'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [30]:

model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [31]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [32]:

model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.762906,0.652967
6,extra trees,0.73888,0.701458
4,decision tree,0.696182,0.75729
10,xgboost,0.620664,0.948597
7,gradient boosting,0.610604,0.987906
1,svr,0.218073,1.361198
8,adaboost,0.315332,1.370135
9,mlp,0.215344,1.421858
2,ridge,0.062252,1.526707
0,linear_reg,0.062252,1.526707





# **Target Encoder**

In [33]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m71.7/81.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [34]:

import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ],
    remainder='passthrough'
)

In [35]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [36]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [37]:
scores.mean(),scores.std()

(0.829521918225536, 0.01838446337912283)

In [38]:
def scorer(model_name, model):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())

    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

    pipeline.fit(X_train,y_train)

    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output

In [39]:

model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [40]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [41]:

model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [42]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.904798,0.447518
5,random forest,0.901189,0.455512
6,extra trees,0.901186,0.461288
7,gradient boosting,0.88929,0.507504
4,decision tree,0.82807,0.549231
9,mlp,0.855459,0.614033
8,adaboost,0.818542,0.68756
0,linear_reg,0.829522,0.713011
2,ridge,0.829536,0.713523
1,svr,0.782917,0.818851


In [43]:
from sklearn.model_selection import GridSearchCV

In [44]:

param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}

In [45]:

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ],
    remainder='passthrough'
)

In [46]:

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [47]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [48]:

search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [49]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


  warn(


In [50]:
final_pipe = search.best_estimator_

In [51]:

search.best_params_

{'regressor__max_depth': 20,
 'regressor__max_features': 'auto',
 'regressor__max_samples': 1.0,
 'regressor__n_estimators': 100}

In [52]:

search.best_score_

0.902602127688187

In [53]:
final_pipe.fit(X,y_transformed)

  warn(


In [54]:

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ],
    remainder='passthrough'
)

In [55]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=500))
])

In [56]:
pipeline.fit(X,y_transformed)

In [57]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [58]:

with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [59]:

X.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [60]:

X.iloc[0].values

array(['flat', 'sector 36', 3.0, 2.0, '2', 'New Property', 850.0, 0.0,
       0.0, 'unfurnished', 'Low', 'Low Floor'], dtype=object)

In [61]:

data = [['house', 'sector 102', 4, 3, '3+', 'New Property', 2750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,sector 102,4,3,3+,New Property,2750,0,0,unfurnished,Low,Low Floor


In [62]:

np.expm1(pipeline.predict(one_df))

array([3.19033957])

In [63]:
sorted(X['sector'].unique().tolist())

['dwarka expressway',
 'gwal pahari',
 'manesar',
 'sector 1',
 'sector 10',
 'sector 102',
 'sector 103',
 'sector 104',
 'sector 105',
 'sector 106',
 'sector 107',
 'sector 108',
 'sector 109',
 'sector 11',
 'sector 110',
 'sector 111',
 'sector 112',
 'sector 113',
 'sector 12',
 'sector 13',
 'sector 14',
 'sector 15',
 'sector 17',
 'sector 2',
 'sector 21',
 'sector 22',
 'sector 23',
 'sector 24',
 'sector 25',
 'sector 26',
 'sector 27',
 'sector 28',
 'sector 3',
 'sector 30',
 'sector 31',
 'sector 33',
 'sector 36',
 'sector 37',
 'sector 37d',
 'sector 38',
 'sector 39',
 'sector 4',
 'sector 40',
 'sector 41',
 'sector 43',
 'sector 45',
 'sector 46',
 'sector 47',
 'sector 48',
 'sector 49',
 'sector 5',
 'sector 50',
 'sector 51',
 'sector 52',
 'sector 53',
 'sector 54',
 'sector 55',
 'sector 56',
 'sector 57',
 'sector 58',
 'sector 59',
 'sector 6',
 'sector 60',
 'sector 61',
 'sector 62',
 'sector 63',
 'sector 63a',
 'sector 65',
 'sector 66',
 'sector 67',
 'se