# 10. Trial of Training Continuous Model

- Trains a selection of predictive models with default hyperparameter values to gauge the time required to train-score one model, and also get a preview of the R^2 scores
- notebook follows Initialise, Train, Predict, Get Accuracy Score steps

In [9]:
import pandas as pd
import numpy as np

# Import Data

In [2]:
Train_data_X = pd.read_csv('../data/curated/ModelBuilding/Continuous/XTrain_16-1_16-5.csv')
Train_data_y = pd.read_csv('../data/curated/ModelBuilding/Continuous/yTrain_16-1_16-5.csv')
XTrain = Train_data_X.drop(['datetime'], axis=1)
yTrain = Train_data_y['count']

In [3]:
Val_data_X = pd.read_csv('../data/curated/ModelBuilding/Continuous/XVal_16-5_16-6.csv')
Val_data_y = pd.read_csv('../data/curated/ModelBuilding/Continuous/yVal_16-5_16-6.csv')
XVal = Val_data_X.drop(['datetime'], axis=1)
yVal = Val_data_y['count']

In [4]:
Test_data_X = pd.read_csv('../data/curated/ModelBuilding/Continuous/XTest_16-6_16-6.csv')
Test_data_y = pd.read_csv('../data/curated/ModelBuilding/Continuous/yTest_16-6_16-6.csv')
XTest = Test_data_X.drop(['datetime'], axis=1)
yTest = Test_data_y['count']

# Extra One Hot Encoding (on the fly)

In [5]:
from sklearn.preprocessing import OneHotEncoder

# OHE for DOLocationID
ohe = OneHotEncoder(handle_unknown='ignore')

Train_data_to_ohe = XTrain[['DOLocationID']]
Train_data_ohe = ohe.fit_transform(Train_data_to_ohe).toarray()

Train_data_ohe = pd.DataFrame(Train_data_ohe,
                              columns=list(ohe.get_feature_names_out(['DOLocationID'])))

XTrain = Train_data_X.drop(['DOLocationID'], axis=1)

for col in Train_data_ohe.columns:
    new_col = Train_data_ohe[col]
    new_col.index = range(len(new_col))

    XTrain[str(col)] = new_col



Val_data_to_ohe = XVal[['DOLocationID']]
Val_data_ohe = ohe.transform(Val_data_to_ohe).toarray()

Val_data_ohe = pd.DataFrame(Val_data_ohe,
                            columns=list(ohe.get_feature_names_out(['DOLocationID'])))

XVal = XVal.drop(['DOLocationID'], axis=1)

for col in Val_data_ohe.columns:
    new_col = Val_data_ohe[col]
    new_col.index = range(len(new_col))

    XVal[str(col)] = new_col



Test_data_to_ohe = XTest[['DOLocationID']]
Test_data_ohe = ohe.transform(Test_data_to_ohe).toarray()

Test_data_ohe = pd.DataFrame(Test_data_ohe,
                             columns=list(ohe.get_feature_names_out(['DOLocationID'])))

XTest = XTest.drop(['DOLocationID'], axis=1)

for col in Test_data_ohe.columns:
    new_col = Test_data_ohe[col]
    new_col.index = range(len(new_col))

    XTest[str(col)] = new_col

  XTrain[str(col)] = new_col
  XVal[str(col)] = new_col
  XTest[str(col)] = new_col


In [6]:
# OHE for PULocationID
ohe = OneHotEncoder(handle_unknown='ignore')

Train_data_to_ohe = XTrain[['PULocationID']]
Train_data_ohe = ohe.fit_transform(Train_data_to_ohe).toarray()

Train_data_ohe = pd.DataFrame(Train_data_ohe,
                              columns=list(ohe.get_feature_names_out(['PULocationID'])))

XTrain = XTrain.drop(['PULocationID'], axis=1)

for col in Train_data_ohe.columns:
    new_col = Train_data_ohe[col]
    new_col.index = range(len(new_col))

    XTrain[str(col)] = new_col



Val_data_to_ohe = XVal[['PULocationID']]
Val_data_ohe = ohe.transform(Val_data_to_ohe).toarray()

Val_data_ohe = pd.DataFrame(Val_data_ohe,
                            columns=list(ohe.get_feature_names_out(['PULocationID'])))

XVal = XVal.drop(['PULocationID'], axis=1)

for col in Val_data_ohe.columns:
    new_col = Val_data_ohe[col]
    new_col.index = range(len(new_col))

    XVal[str(col)] = new_col



Test_data_to_ohe = XTest[['PULocationID']]
Test_data_ohe = ohe.transform(Test_data_to_ohe).toarray()

Test_data_ohe = pd.DataFrame(Test_data_ohe,
                             columns=list(ohe.get_feature_names_out(['PULocationID'])))

XTest = XTest.drop(['PULocationID'], axis=1)

for col in Test_data_ohe.columns:
    new_col = Test_data_ohe[col]
    new_col.index = range(len(new_col))

    XTest[str(col)] = new_col

  XTrain[str(col)] = new_col
  XVal[str(col)] = new_col
  XTest[str(col)] = new_col


In [7]:
XTrain = XTrain.drop('datetime', axis=1)

# Models
## 0R (manual - alwasy predict mean)

In [17]:
# get mean
mean_y = np.mean(yTrain)
mean_y

26.353318895400452

In [20]:
# get the MSE and sd of null model (always predict mean)
ZeroR_MSE = sum((yTrain-mean_y)**2)/len(yTrain)
print(ZeroR_MSE)
print(np.sqrt(ZeroR_MSE))

875.406278659021
29.5872654812678


In [22]:
# Validation R^2
ZeroR_MSE_val = sum((yVal - mean_y) ** 2) / len(yVal)
print(ZeroR_MSE_val)
print(np.sqrt(ZeroR_MSE_val))

772.1266870677043
27.78716766904652


In [21]:
# Test R^2
ZeroR_MSE_test = sum((yTest-mean_y)**2)/len(yTest)
print(ZeroR_MSE_test)
print(np.sqrt(ZeroR_MSE_test))

742.2973997260035
27.2451353405705


## RFRegressor

In [7]:
from sklearn.ensemble import RandomForestRegressor

In [1]:
# Initialise and Fit
RF = RandomForestRegressor(n_estimators = 100, max_depth=16)
RF.fit(XTrain, yTrain)

In [None]:
# Predict and score for train
train_pred_RF = RF.predict(XTrain)
RF.score(XTrain, yTrain)

In [None]:
# Predict and score for validation
val_pred_RF = RF.predict(XVal)
RF.score(XVal, yVal)

In [None]:
# Predict and score for test
test_pred_RF = RF.predict(XTest)
RF.score(XTest, yTest)