In [78]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
import pickle

In [79]:
data = pd.read_csv('final_data.csv')
data.head()

Unnamed: 0,type,locality,latitude,longitude,lease_type,gym,lift,swimming_pool,negotiable,furnishing,...,PARK,RWH,STP,HK,PB,VP,activation_day,activation_month,activation_year,Total_amenities
0,1,336,12.929557,77.67228,0,0,1,0,1,2,...,1,1,1,0,1,1,10,9,2017,8
1,2,1521,12.98287,80.262012,3,0,1,0,0,2,...,1,0,0,0,1,1,6,12,2018,6
2,0,195,12.955991,77.531634,3,0,0,0,1,2,...,0,0,0,0,0,0,20,7,2018,0
3,2,953,12.963903,77.649446,3,0,0,0,1,2,...,1,0,0,0,0,1,31,5,2018,3
4,2,1196,12.967144,77.750662,0,1,1,1,1,0,...,1,1,1,1,1,1,7,12,2018,18


In [80]:
X = data.drop(columns=['rent'], axis=1)
y = data['rent']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [81]:
def regression(df, algorithm):
    X = data.drop(columns=['rent'], axis=1)
    y = data['rent']
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
    model = algorithm().fit(X_train, y_train)
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    mae = mean_absolute_error(y_train,train_pred)
    mse = mean_squared_error(y_train,train_pred)
    rmse = np.sqrt(mean_squared_error(y_train,train_pred))
    r2_train = r2_score(y_train, train_pred)
    
    
    mae_test = mean_absolute_error(y_test,test_pred)
    mse_test = mean_squared_error(y_test,test_pred)
    rmse_test = np.sqrt(mean_squared_error(y_test,test_pred))
    r2_test = r2_score(y_test,test_pred)
    
    metrics1 = {'algorithm': algorithm.__name__,
               'MAE':mae,
               'MSE':mse,
               'RMSE':rmse,
               'R2_train':r2_train}
    metrics2 = {'algorithm': algorithm.__name__,
               'MAE':mae_test,
               'MSE':mse_test,
               'RMSE':rmse_test,
               'R2_train':r2_test}

    return metrics1,metrics2

In [82]:
print(regression(data, DecisionTreeRegressor))
print('=========================================================')
print(regression(data, ExtraTreesRegressor))
print('=========================================================')
print(regression(data, RandomForestRegressor))
print('=========================================================')
print(regression(data, AdaBoostRegressor))
print('=========================================================')
print(regression(data, GradientBoostingRegressor))
print('=========================================================')
print(regression(data, XGBRegressor))

({'algorithm': 'DecisionTreeRegressor', 'MAE': 0.0, 'MSE': 0.0, 'RMSE': 0.0, 'R2_train': 1.0}, {'algorithm': 'DecisionTreeRegressor', 'MAE': 3747.2254556143444, 'MSE': 27726613.919753086, 'RMSE': 5265.606700063449, 'R2_train': 0.6118882480393413})
({'algorithm': 'ExtraTreesRegressor', 'MAE': 0.0, 'MSE': 0.0, 'RMSE': 0.0, 'R2_train': 1.0}, {'algorithm': 'ExtraTreesRegressor', 'MAE': 2894.174276895944, 'MSE': 16299187.03240203, 'RMSE': 4037.2251649371788, 'R2_train': 0.7718471482674168})
({'algorithm': 'RandomForestRegressor', 'MAE': 983.8894767399133, 'MSE': 1983594.3726771146, 'RMSE': 1408.401353548453, 'R2_train': 0.9708414493948679}, {'algorithm': 'RandomForestRegressor', 'MAE': 2674.6124397413287, 'MSE': 14204096.963877015, 'RMSE': 3768.8323077416185, 'R2_train': 0.8011738117887539})
({'algorithm': 'AdaBoostRegressor', 'MAE': 4699.436045196204, 'MSE': 31626722.200989787, 'RMSE': 5623.76405986149, 'R2_train': 0.5350917544057148}, {'algorithm': 'AdaBoostRegressor', 'MAE': 4729.1046156

After evaluating all the Models, the Random Forest and XGBoost regressor have better R2 score. Hence performing hyper parameter tuning in both the models.

# HyperParameter Tuning -XGBoost Regressor

In [83]:
params = {
    "learning_rate" : [0.05,0.10,0.15,0.20,0.25,0.30],
    "max_depth"     : [ 3, 4, 5, 6, 8, 10, 12,15],
    "min_child_weight":[1, 3, 5, 7],
    "gamma" : [0.0, 0.1, 0.2, 0.3,0.4],
    "colsample_bytree" : [0.3, 0.4, 0.5, 0.7]
}

### Xgboost tuning using Random Search CV

In [84]:
from sklearn.model_selection import RandomizedSearchCV

In [85]:
model = XGBRegressor()

In [86]:
xg_random = RandomizedSearchCV(model,param_distributions=params,n_iter=5,scoring='neg_mean_squared_error',n_jobs=1,cv=5,verbose=3)
xg_random.fit(X_train,y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END colsample_bytree=0.3, gamma=0.4, learning_rate=0.2, max_depth=10, min_child_weight=3;, score=-14655471.725 total time=   1.2s
[CV 2/5] END colsample_bytree=0.3, gamma=0.4, learning_rate=0.2, max_depth=10, min_child_weight=3;, score=-12832861.883 total time=   1.1s
[CV 3/5] END colsample_bytree=0.3, gamma=0.4, learning_rate=0.2, max_depth=10, min_child_weight=3;, score=-13874171.986 total time=   1.2s
[CV 4/5] END colsample_bytree=0.3, gamma=0.4, learning_rate=0.2, max_depth=10, min_child_weight=3;, score=-13306469.340 total time=   1.1s
[CV 5/5] END colsample_bytree=0.3, gamma=0.4, learning_rate=0.2, max_depth=10, min_child_weight=3;, score=-14924209.617 total time=   1.1s
[CV 1/5] END colsample_bytree=0.4, gamma=0.0, learning_rate=0.1, max_depth=12, min_child_weight=5;, score=-14622790.174 total time=   1.6s
[CV 2/5] END colsample_bytree=0.4, gamma=0.0, learning_rate=0.1, max_depth=12, min_child_weight=5;, score=

In [87]:
xg_random.best_params_

{'min_child_weight': 3,
 'max_depth': 4,
 'learning_rate': 0.3,
 'gamma': 0.0,
 'colsample_bytree': 0.5}

In [88]:
xg_random.best_estimator_

In [89]:
predictions = xg_random.predict(X_test)

In [90]:
print("Evaluating Test Data")
print("-----------------------")
print('R2 score:',r2_score(y_test,predictions))
print("MAE :",mean_absolute_error(y_test,predictions))
print("MSE :",mean_squared_error(y_test,predictions))
print('RMSE :', np.sqrt(mean_squared_error(y_test,predictions)))

Evaluating Test Data
-----------------------
R2 score: 0.8111841469926642
MAE : 2605.904536279211
MSE : 13488960.928948948
RMSE : 3672.732079657996


### Random Forest Tuning

In [24]:
rf = RandomForestRegressor()

In [22]:
n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=12)]
max_features = ['log2', 'sqrt']
max_depth = [int(x) for x in np.linspace(start= 5, stop= 30, num= 6)]
min_samples_split = [2,5,10,15,100]
min_samples_leaf = [1,2,5,10]

In [23]:
random_grid= {'n_estimators': n_estimators, 
              'max_features' : max_features,
              'max_depth' : max_depth,
              'min_samples_split' : min_samples_split,
              'min_samples_leaf' : min_samples_leaf}

In [25]:
rf_random = RandomizedSearchCV(estimator=  rf, param_distributions=  random_grid, scoring= 'neg_mean_squared_error',
                                      n_iter = 10, cv=5, verbose = 2, random_state=42, n_jobs=1,error_score='raise')
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=  28.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=  29.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=  30.7s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=  24.8s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=  24.4s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time=  44.7s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time=  44.9s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimator

In [26]:
rf_random.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 25}

In [27]:
rf_random.best_estimator_

In [28]:
rf_predictions = rf_random.predict(X_test)

In [29]:
print("Evaluating Test Data")
print("-----------------------")
print('R2 score:',r2_score(y_test,rf_predictions))
print("MAE :",mean_absolute_error(y_test,rf_predictions))
print("MSE :",mean_squared_error(y_test,rf_predictions))
print('RMSE :', np.sqrt(mean_squared_error(y_test,rf_predictions)))

Evaluating Test Data
-----------------------
R2 score: 0.7771773496891766
MAE : 2867.9259856453905
MSE : 15918398.67392266
RMSE : 3989.786795547183


# Saving the model as pickle file

In [91]:
with open('xg_model.pkl','wb') as file:
    pickle.dump(xg_random,file)

In [93]:
ip = [[1.0,1098.0,12.95984305,77.70160289,1.0,0.0,0.0,0.0,1.0,2.0,1.0,750.0,4.0,1.0,1.0,2.0,2.0,2.0,1.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,
       0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,15.0,5.0,2018.0,1]]
round(xg_random.predict(np.array(ip))[0],2)

14477.58

In [94]:
X_test.columns

Index(['type', 'locality', 'latitude', 'longitude', 'lease_type', 'gym',
       'lift', 'swimming_pool', 'negotiable', 'furnishing', 'parking',
       'property_size', 'property_age', 'bathroom', 'facing', 'cup_board',
       'floor', 'total_floor', 'water_supply', 'building_type', 'balconies',
       'INTERNET', 'AC', 'CLUB', 'INTERCOM', 'CPA', 'FS', 'SERVANT',
       'SECURITY', 'SC', 'GP', 'PARK', 'RWH', 'STP', 'HK', 'PB', 'VP',
       'activation_day', 'activation_month', 'activation_year',
       'Total_amenities'],
      dtype='object')

In [77]:
data['cup_board'].sort_values()

13357     0
14159     0
6309      0
6314      0
10391     0
         ..
15099    20
1616     20
16070    20
6341     20
14686    40
Name: cup_board, Length: 17009, dtype: int64