# Modeling

## 1.) Import necessary packages

In [15]:
# Importing necessary packages

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from prettytable import PrettyTable

## 2.) Load & Prepare The Data

In [16]:
# Read back the featurised arrays saved by Feature_Engineering.ipynb
# (training features, training target, test features — loaded in that order).
train_data, y_train, test_data = (
    np.load(npy_file)
    for npy_file in ('featurised_train_data.npy', 'target_variable.npy', 'featurised_test_data.npy')
)

# The raw test CSV is kept around because its 'ID' column goes into the submission files.
test_raw = pd.read_csv('Data/test_data/test.csv')

## 3.) Random Forest

### 3.1) Grid Search 

In [17]:
# Exhaustive hyperparameter search for the random forest, scored by R^2 with
# 4-fold cross-validation (5 forest sizes x 6 depths x 3 split sizes = 90 candidates).

parameters = {
    'n_estimators': [5, 10, 20, 30, 40],
    'max_depth': [2, 3, 5, 8, 10, 20],
    'min_samples_split': [2, 3, 4],
}
rf_reg = RandomForestRegressor(random_state=3)

Grid = GridSearchCV(
    estimator=rf_reg,
    param_grid=parameters,
    cv=4,
    scoring='r2',
    return_train_score=True,
    verbose=1,
)
# Left as the last expression so the fitted search object is displayed as the cell output.
Grid.fit(train_data, y_train)

Fitting 4 folds for each of 90 candidates, totalling 360 fits


GridSearchCV(cv=4, estimator=RandomForestRegressor(random_state=3),
             param_grid={'max_depth': [2, 3, 5, 8, 10, 20],
                         'min_samples_split': [2, 3, 4],
                         'n_estimators': [5, 10, 20, 30, 40]},
             return_train_score=True, scoring='r2', verbose=1)

In [18]:
# Get the best hyperparameters of the random-forest grid search.
# NOTE: the name `Grid` is rebound by the XGBoost grid search further down this
# notebook, so this cell must run before that one. `best_params1` is what the
# RF training cell reads.

best_params1 = Grid.best_params_
print(best_params1)

{'max_depth': 2, 'min_samples_split': 2, 'n_estimators': 30}


### 3.2) Train RF Model

In [19]:
# Build the final random forest from the grid-search winners.
# `best_params1` holds exactly the three tuned keys (n_estimators, max_depth,
# min_samples_split), so it is unpacked straight into the constructor.
# NOTE: this model uses random_state=12, while the grid-search estimator used random_state=3.

rf = RandomForestRegressor(random_state=12, **best_params1)
rf.fit(train_data, y_train)

# In-sample R^2 of the fitted model
train_predictions = rf.predict(train_data)
r2_training = round(r2_score(y_train, train_predictions), 4)
print('R-squared for training data when using RF: ', r2_training)

# Mean R^2 across a 5-fold cross-validation of the same estimator configuration
fold_scores = cross_val_score(rf, train_data, y_train, cv=5, scoring='r2')
r2_cv = round(np.mean(fold_scores), 4)
print('R-squared cross validation  when using RF: ', r2_cv)

R-squared for training data when using RF:  0.5869
R-squared cross validation  when using RF:  0.5838


### 3.3) Get predictions For Test Data

In [11]:
# Predict the target for the test set with the trained random forest
y_pred_test = rf.predict(test_data)

# Save the predictions, later to be uploaded to Kaggle.
# index=False keeps the DataFrame's row index out of the file, so the CSV holds
# only the 'ID' and 'y' columns — the same layout as the XGBoost submission file.
submission_df = pd.DataFrame({'ID': test_raw['ID'], 'y': y_pred_test})
submission_df.to_csv('Submission1_RF.csv', index=False)

## 4.) XGBoost

### 4.1) Grid Search

In [20]:
# Exhaustive hyperparameter search for XGBRegressor, scored by R^2 with 4-fold
# cross-validation (4 learning rates x 5 depths x 4 child weights x 3 gammas
# x 4 column-sampling ratios = 960 candidates).

parameters = {
    "learning_rate": (0.05, 0.10, 0.15, 0.2),
    "max_depth": [2, 3, 4, 5, 6],
    "min_child_weight": [3, 5, 7, 10],
    "gamma": [0.0, 0.05, 0.1],
    "colsample_bytree": [0.1, 0.2, 0.3, 0.4],
}
xgb_reg = XGBRegressor(random_state=3)

Grid = GridSearchCV(
    estimator=xgb_reg,
    param_grid=parameters,
    cv=4,
    scoring='r2',
    return_train_score=True,
    verbose=1,
)
# Left as the last expression so the fitted search object is displayed as the cell output.
Grid.fit(train_data, y_train)

Fitting 4 folds for each of 960 candidates, totalling 3840 fits


GridSearchCV(cv=4,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=...
                                    num_parallel_tree=None, random_state=3,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_parameter

In [22]:
# Get the best hyperparameters of the XGBoost grid search.
# NOTE: `Grid` must be the XGBoost search fitted in the preceding cell — the same
# name was used earlier for the random-forest search. `best_params2` is what the
# XGBRegressor training cell reads.

best_params2 = Grid.best_params_
print(best_params2)

{'colsample_bytree': 0.4, 'gamma': 0.0, 'learning_rate': 0.05, 'max_depth': 2, 'min_child_weight': 7}


### 4.2) Train XGBRegressor

In [23]:
# Build the final XGBRegressor from the grid-search winners.
# `best_params2` holds exactly the five tuned keys (learning_rate, max_depth,
# min_child_weight, gamma, colsample_bytree), so it is unpacked into the constructor.
xgb_reg = XGBRegressor(random_state=3, **best_params2)
xgb_reg.fit(train_data, y_train)

# In-sample R^2 of the fitted model
train_predictions = xgb_reg.predict(train_data)
r2_training = round(r2_score(y_train, train_predictions), 4)
print('R-squared for training data when using XGB: ', r2_training)

# Mean R^2 across a 5-fold cross-validation of the same estimator configuration
fold_scores = cross_val_score(xgb_reg, train_data, y_train, cv=5, scoring='r2')
r2_cv = round(np.mean(fold_scores), 4)
print('R-squared cross validation  when using XGB: ', r2_cv)

R-squared for training data when using XGB:  0.6012
R-squared cross validation  when using XGB:  0.5907


### 4.3) Get predictions For Test Data

In [24]:
# Predict the target for the test set with the trained XGBRegressor
y_pred_test2 = xgb_reg.predict(test_data)

# Write the Kaggle submission file: one row per test ID with its predicted y,
# without the DataFrame's row index.
submission_df2 = pd.DataFrame(
    {
        'ID': test_raw['ID'],
        'y': y_pred_test2,
    }
)
submission_df2.to_csv('Submission1_XGB.csv', index=False)