# Model Experimentation

1. Random Forest Regressor
2. XGBoost Regressor
3. LightGBM Regressor
4. <b>Support Vector Regressor</b>

## Import Libraries

In [14]:
# dataframe packages
import pandas as pd
import numpy as np
from skopt.space import Categorical, Integer, Real
from skopt.utils import use_named_args
from skopt import gp_minimize
import joblib

# statistical packages
import math
from scipy.stats import uniform
from math import sqrt

# modeling packages
from sklearn.ensemble import RandomForestRegressor
#import lightgbm as lgb
#from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR

# evaluation packages
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score,mean_squared_error, mean_squared_log_error
from sklearn.model_selection import cross_val_score, RepeatedKFold, train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import TimeSeriesSplit

# scaling packages
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import time
# visualisation packages
import seaborn as sns
import matplotlib.pyplot as plt
import shap

## Load Dataset

In [2]:
# Load the prepared modelling dataset (CSV in the notebook's working directory).
model_df = pd.read_csv('modelling_dataset.csv')

In [3]:
# Preview the first rows to sanity-check the loaded columns.
model_df.head()

Unnamed: 0,Area (SQM),Unit Price ($ PSM),Floor Number,PPI,Average Cases Per Year,Nearest Primary School,nearest_station_distance,Remaining Lease,Ang Mo Kio,Bedok,...,Yishun,BLUE,BROWN,GREEN,LRT,PURPLE,RED,YELLOW,Apartment,Executive Condominium
0,95.0,6316.0,1.0,124.3,33,705.752731,1207.822015,87.0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,138.0,13833.0,1.0,124.3,58,1233.947139,768.529003,88.0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,99.0,8990.0,10.0,124.3,50,1039.586179,816.818037,80.0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,111.0,6306.0,6.0,124.3,33,509.516515,501.364218,80.0,0,0,...,1,0,0,0,0,0,1,0,0,0
4,122.0,13934.0,10.0,124.3,58,1253.73326,554.491114,88.0,0,0,...,0,0,0,0,0,1,0,0,0,0


## Train Test Split

1. Training set - 0.7 of the full dataset, further split into:
    - 80% train
    - 20% validation
2. Test set - 0.3 of the full dataset

The output variable will be <b>Unit Price ($ PSM)</b>

In [4]:
# Separate the target column from the predictors.
y = model_df['Unit Price ($ PSM)']
X = model_df.drop(columns=["Unit Price ($ PSM)"])

# Report the dimensions of the feature matrix and the target vector.
print('Shape of X is:', X.shape)
print('Shape of Y is:', y.shape)

Shape of X is: (54674, 52)
Shape of Y is: (54674,)


In [5]:
# Hold out 30% of the rows as the test set; the fixed seed keeps the
# shuffled split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

# Report the size of every partition.
print('Shape of X_train is:', X_train.shape)
print('Shape of y_train is:', y_train.shape)
print('Shape of X_test is:', X_test.shape)
print('Shape of y_test is:', y_test.shape)

Shape of X_train is: (38271, 52)
Shape of y_train is: (38271,)
Shape of X_test is: (16403, 52)
Shape of y_test is: (16403,)


## Scaling

In [6]:
# Every predictor column, in the order it appears in the training frame.
all_features = X_train.columns.tolist()

# Columns that will be standardised with StandardScaler.
standardScale_vars = [
    'Area (SQM)',
    'Floor Number',
    'PPI',
    'Average Cases Per Year',
    'Nearest Primary School',
    'nearest_station_distance',
]

# Columns that will be rescaled with MinMaxScaler.
minMax_vars = ['Remaining Lease']

# Everything else is passed through without scaling.
remaining_features = [
    col
    for col in all_features
    if not (col in standardScale_vars or col in minMax_vars)
]

In [7]:
# Fit both scalers on the training portion only; the fitted scalers are
# reused later to transform the test set.
# NOTE: this cell rebinds X_train to its scaled version, so re-running it
# without first re-running the split cell would scale already-scaled data.
s_scaler = StandardScaler()
mm_scaler = MinMaxScaler()

s_scaled = pd.DataFrame(
    s_scaler.fit_transform(X_train[standardScale_vars]),
    index=X_train.index,
    columns=standardScale_vars,
)
mm_scaled = pd.DataFrame(
    mm_scaler.fit_transform(X_train[minMax_vars]),
    index=X_train.index,
    columns=minMax_vars,
)

# Reassemble: standardised columns, min-max columns, then untouched columns.
X_train = pd.concat(
    [s_scaled, mm_scaled, X_train[remaining_features]],
    axis=1,
)
X_train

Unnamed: 0,Area (SQM),Floor Number,PPI,Average Cases Per Year,Nearest Primary School,nearest_station_distance,Remaining Lease,Ang Mo Kio,Bedok,Bishan,...,Yishun,BLUE,BROWN,GREEN,LRT,PURPLE,RED,YELLOW,Apartment,Executive Condominium
11187,0.291530,-0.109176,-0.236229,0.243812,-0.771624,1.226898,0.745098,0,0,0,...,0,0,0,0,0,0,0,0,0,1
18593,-1.125873,0.371998,0.210453,0.125051,-0.570640,1.231468,0.725490,0,1,0,...,0,0,0,0,0,0,0,0,0,0
27013,-1.238366,5.785202,-0.452365,0.995966,1.801408,-1.015899,0.764706,0,0,0,...,0,1,0,1,0,0,1,1,1,0
39116,-0.248433,0.973465,0.109589,0.243812,-0.861865,-0.889309,0.764706,0,0,0,...,0,0,0,0,1,1,0,0,0,0
6042,-0.113442,3.259040,-1.187229,0.006290,-0.716828,-0.836027,0.784314,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44732,-0.203436,-0.830937,0.628316,-0.943799,-1.218061,-0.237225,0.509804,1,0,0,...,0,0,0,0,0,0,1,0,0,0
54343,-1.125873,-0.951230,1.795452,0.164638,-0.556898,0.062359,0.784314,0,0,0,...,0,0,1,0,0,0,1,0,0,0
38158,-0.945886,-0.830937,0.109589,3.252428,0.204740,-0.494623,0.568627,0,0,0,...,0,1,0,0,0,1,0,0,0,0
860,-0.563412,0.492291,-2.383183,2.896144,-0.041901,-0.749697,0.705882,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [8]:
# Apply the scalers fitted on the training data to the test set
# (transform only, no refitting).
# NOTE: this cell rebinds X_test to its scaled version, so it should not be
# re-run on its own.
s_scaled_test = pd.DataFrame(
    s_scaler.transform(X_test[standardScale_vars]),
    index=X_test.index,
    columns=standardScale_vars,
)
mm_scaled_test = pd.DataFrame(
    mm_scaler.transform(X_test[minMax_vars]),
    index=X_test.index,
    columns=minMax_vars,
)

# Same column layout as the scaled training frame.
X_test = pd.concat(
    [s_scaled_test, mm_scaled_test, X_test[remaining_features]],
    axis=1,
)
X_test

Unnamed: 0,Area (SQM),Floor Number,PPI,Average Cases Per Year,Nearest Primary School,nearest_station_distance,Remaining Lease,Ang Mo Kio,Bedok,Bishan,...,Yishun,BLUE,BROWN,GREEN,LRT,PURPLE,RED,YELLOW,Apartment,Executive Condominium
26109,0.134041,0.011118,-0.149775,0.243812,-0.629690,-0.453873,0.666667,0,0,0,...,0,0,0,0,1,1,0,0,0,1
51446,-0.765898,-0.830937,1.147043,-0.864625,-1.058519,-0.912692,0.823529,0,0,0,...,0,1,0,1,0,0,0,0,0,0
21609,0.224035,-0.349763,1.031770,2.896144,2.509617,0.157013,0.705882,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6390,-0.045947,-0.951230,-1.187229,-0.033297,0.062520,-0.976248,0.843137,0,0,0,...,0,0,0,0,1,0,1,0,0,0
40798,5.983642,3.018453,0.757998,-1.102147,0.138606,-0.501497,0.862745,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7350,1.056478,0.612585,-0.841411,-0.389580,0.018348,0.334854,0.509804,0,0,0,...,0,0,0,1,0,0,0,0,0,0
24268,-0.248433,0.011118,0.440998,-0.349993,0.014873,1.248442,0.627451,0,0,0,...,0,0,0,0,0,0,0,0,1,0
29622,0.089044,-0.470056,-0.855820,0.402161,-0.675015,1.227770,0.647059,0,0,0,...,0,0,0,0,0,0,0,0,0,1
50030,-1.575843,0.492291,1.132634,0.243812,-0.740679,-0.669763,0.882353,0,0,0,...,0,0,0,0,1,0,0,0,0,0


## Model Tuning

In [9]:
# Carve a 20% validation (eval) set out of the scaled training portion.
# NOTE: X_train / y_train are rebound to the smaller training subset, so
# re-running this cell alone would shrink the training data again.
X_train, X_eval, y_train, y_eval = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Report the size of the resulting partitions.
print('Shape of X_train is:', X_train.shape)
print('Shape of y_train is:', y_train.shape)
print('Shape of X_eval is:', X_eval.shape)
print('Shape of y_eval is:', y_eval.shape)

Shape of X_train is: (30616, 52)
Shape of y_train is: (30616,)
Shape of X_eval is: (7655, 52)
Shape of y_eval is: (7655,)


### Support Vector Regressor 

Bayesian Optimisation

In [10]:
# Base SVR estimator; the search objective overwrites its hyperparameters via set_params.
regressor = SVR()

In [11]:
# Bayesian optimisation of the SVR hyperparameters with scikit-optimize.
# Kernels tested were poly and linear with the other params remaining the same.
## Trained separately
startTime = time.time()

# SVR requires C to be strictly positive, so the lower bound must be > 0;
# a sampled C of exactly 0 would make every CV fit fail.
# NOTE(review): the recorded best run sits on the upper bound of both C and
# gamma, which suggests these ranges may be too narrow -- consider widening.
search_space = [Real(low=1e-3, high=1, name='C'),
                Real(low=0.1, high=1, name='gamma'),
                Real(low=0, high=0.1, name='epsilon'),
                Categorical(categories=['poly'], name='kernel')]

@use_named_args(search_space)
def evaluate_model(**params):
    """Objective for gp_minimize: mean 5-fold CV RMSE of the SVR.

    Parameters
    ----------
    **params
        Hyperparameters named as in ``search_space`` (C, gamma, epsilon,
        kernel); they are applied to the module-level ``regressor``.

    Returns
    -------
    float
        Mean RMSE over the 5 folds on the training split (lower is better).
    """
    regressor.set_params(**params)

    # The scorer returns negated RMSE (higher is better); flip the sign so
    # the optimiser minimises the actual RMSE.
    result = cross_val_score(regressor, X_train, y_train, cv=5, n_jobs=-1, scoring='neg_root_mean_squared_error')

    estimate = -np.mean(result)

    return estimate

# Fixed seed so the sequence of evaluated candidates is repeatable.
rmse_result = gp_minimize(evaluate_model, search_space, random_state=42)

print('Best RMSE: %.2f' % rmse_result.fun)
print('Best Parameters: C=%.2f, gamma=%.2f, epsilon=%.2f, kernel=%s' % (rmse_result.x[0], rmse_result.x[1],  rmse_result.x[2], rmse_result.x[3]))
executionTime = (time.time() - startTime)
print('Execution time in seconds: ' + str(executionTime))



Best RMSE: 1749.07
Best Parameters: C=1.00, gamma=1.00, epsilon=0.00, kernel=poly
Execution time in seconds: 33910.38332486153


In [15]:
# Refit an SVR with the best hyperparameters found by the search and score it
# on the held-out eval split and on the test split.
best = SVR(C = rmse_result.x[0], gamma = rmse_result.x[1], epsilon =  rmse_result.x[2], kernel = rmse_result.x[3])
best.fit(X_train, y_train)
y_eval_pred = best.predict(X_eval)
y_test_pred = best.predict(X_test)

# These metrics are computed on the eval (validation) split, not on the data
# the model was fitted on, so they are named and labelled "eval".
eval_RMSE = math.sqrt(mean_squared_error(y_eval, y_eval_pred))
test_RMSE = math.sqrt(mean_squared_error(y_test, y_test_pred))
# MAPE is returned as a fraction; multiply by 100 to report a percentage.
eval_MAPE = mean_absolute_percentage_error(y_eval, y_eval_pred) * 100
test_MAPE = mean_absolute_percentage_error(y_test, y_test_pred) * 100

# Backward-compatible aliases: the logging cell reads these names.
train_RMSE = eval_RMSE
train_MAPE = eval_MAPE

print('Eval RMSE : ' + str(eval_RMSE))
print('Test RMSE : ' + str(test_RMSE))
print('Eval MAPE : ' + str(eval_MAPE))
print('Test MAPE : ' + str(test_MAPE))

Train RMSE : 1541.6898717423471
Test RMSE : 6066.051002025376
Train MAPE : 7.734403854921021
Test MAPE : 7.985621213520582


In [35]:
# Show the complete parameter set of the fitted model.
best.get_params()

{'C': 1.0,
 'cache_size': 200,
 'coef0': 0.0,
 'degree': 3,
 'epsilon': 0.0,
 'gamma': 1.0,
 'kernel': 'poly',
 'max_iter': -1,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [21]:
# Collect one record describing this model run and its error metrics.
df_error_log = []

# Read the tuned hyperparameters from the fitted model instead of hard-coding
# them, so the log cannot drift out of sync when the search is re-run.
tuned_keys = ('C', 'gamma', 'epsilon', 'kernel')
best_params = best.get_params()

error_log = {
    'model_type': 'SVR',
    'search_type': 'bayesian',
    'params': [f'{key}={best_params[key]}' for key in tuned_keys],
    # train_RMSE / train_MAPE hold the scores computed on the eval split.
    'eval_rmse': train_RMSE,
    'test_rmse': test_RMSE,
    'eval_mape': train_MAPE,
    'test_mape': test_MAPE
}

df_error_log.append(error_log)

In [22]:
# Convert the collected log records into a DataFrame and display it.
df_error_log = pd.DataFrame(df_error_log)
df_error_log

Unnamed: 0,model_type,search_type,params,train_rmse,test_rmse,train_mape,test_mape
0,SVR,bayesian,"{'C': 1.0, 'cache_size': 200, 'coef0': 0.0, 'd...",1541.689872,6066.051002,7.734404,7.985621


In [18]:
# Persist the error log to disk; index=False leaves the row index out of the CSV.
df_error_log.to_csv('svr_error_log.csv', index=False)