In [1]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns

# Allows plots to appear directly in the notebook.
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

import pickle
import bz2

In [177]:
# Load the route data set from 'v9_route_9.csv' into a DataFrame.
rt = pd.read_csv(
    'v9_route_9.csv',
    sep=',',
    skipinitialspace=True,
    keep_default_na=True,
)

In [178]:
rt.shape

(2681064, 27)

## Arrival Difference Column

In [179]:
# Actual minus planned arrival: positive values mean the actual arrival
# was later than planned, negative values mean it was earlier.
rt['arr_diff'] = rt['act_stp_arr'].sub(rt['PLANNEDTIME_ARR'])

In [180]:
# Store the arrival difference as a 64-bit integer column.
rt['arr_diff'] = rt['arr_diff'].astype(np.int64)

In [181]:
rt.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,VEHICLEID,year,month,day,id,...,stop_actARR_hour,wthr_tr_lt_id,temp,humidity,wind_speed,weather_id,weather_description,dos_trpID_stpID_prgNum,act_stp_arr,arr_diff
0,2018-01-01,5957793,1,7132,32400,2172314,2018,1,1,20180101595779301,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5957793-7132-1,32608.0,208
1,2018-01-01,5957793,2,6230,32470,2172314,2018,1,1,20180101595779302,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5957793-6230-2,32672.0,202
2,2018-01-01,5957793,3,6228,32513,2172314,2018,1,1,20180101595779303,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5957793-6228-3,32696.0,183
3,2018-01-01,5957793,4,4788,32569,2172314,2018,1,1,20180101595779304,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5957793-4788-4,32744.0,175
4,2018-01-01,5957793,5,6273,32615,2172314,2018,1,1,20180101595779305,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5957793-6273-5,32803.0,188


## Hour Column

In [182]:
# Derive an hour value from the planned arrival time by dividing by 3600
# and rounding to the nearest whole number (e.g. 32400 -> 9).
# NOTE(review): this rounds rather than truncates, so times past the half
# hour are assigned to the *next* hour -- confirm that is intended.
rt['hour'] = rt['PLANNEDTIME_ARR'].div(3600).round()

In [183]:
# Store the rounded hour as a 16-bit integer column.
rt['hour'] = rt['hour'].astype(np.int16)

In [184]:
rt.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,VEHICLEID,year,month,day,id,...,wthr_tr_lt_id,temp,humidity,wind_speed,weather_id,weather_description,dos_trpID_stpID_prgNum,act_stp_arr,arr_diff,hour
0,2018-01-01,5957793,1,7132,32400,2172314,2018,1,1,20180101595779301,...,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5957793-7132-1,32608.0,208,9
1,2018-01-01,5957793,2,6230,32470,2172314,2018,1,1,20180101595779302,...,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5957793-6230-2,32672.0,202,9
2,2018-01-01,5957793,3,6228,32513,2172314,2018,1,1,20180101595779303,...,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5957793-6228-3,32696.0,183,9
3,2018-01-01,5957793,4,4788,32569,2172314,2018,1,1,20180101595779304,...,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5957793-4788-4,32744.0,175,9
4,2018-01-01,5957793,5,6273,32615,2172314,2018,1,1,20180101595779305,...,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5957793-6273-5,32803.0,188,9


## Just Route 9 Trips going in Direction '2'

In [185]:
# Keep only the rows whose DIRECTION value equals 2.
rt_39a1 = rt.loc[rt['DIRECTION'] == 2]

In [186]:
rt_39a1.shape

(1376542, 29)

## Shuffling Data

In [187]:
# Row shuffle inspired from Geeks for Geeks: https://www.geeksforgeeks.org/pandas-how-to-shuffle-a-dataframe-rows/
# frac=1 returns every row in random order. The seed is fixed so that the
# shuffled order -- and therefore the train/test split and the unshuffled
# cross-validation folds built from `shuf` -- is the same on every run.
shuf = rt_39a1.sample(frac=1, random_state=42)
shuf.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,VEHICLEID,year,month,day,id,...,wthr_tr_lt_id,temp,humidity,wind_speed,weather_id,weather_description,dos_trpID_stpID_prgNum,act_stp_arr,arr_diff,hour
2530580,2018-12-07,8154375,65,4369,77177,2868369,2018,12,7,20181207815437565,...,2018120721,8.39,76,8.8,500,light rain,2018-12-07-8154375-4369-65,78763.0,1586,21
1861316,2018-09-11,8081394,18,2441,61509,3265710,2018,9,11,20180911808139418,...,2018091117,15.39,63,5.7,803,broken clouds,2018-09-11-8081394-2441-18,61521.0,12,17
1495752,2018-07-20,7175726,45,148,32689,2868320,2018,7,20,20180720717572645,...,2018072009,14.39,93,3.6,500,light rain,2018-07-20-7175726-148-45,33152.0,463,9
388850,2018-02-20,6264688,12,2435,48303,2693287,2018,2,20,20180220626468812,...,2018022013,9.46,71,6.7,801,few clouds,2018-02-20-6264688-2435-12,48208.0,-95,13
2299143,2018-11-07,8100623,40,197,57960,3303848,2018,11,7,20181107810062340,...,2018110716,8.39,87,5.7,500,light rain,2018-11-07-8100623-197-40,58016.0,56,16


## Splitting Shuffled Data: Train (70%) & Test (30%)

In [188]:
# `shuf` is already shuffled; train_test_split shuffles once more (harmless)
# and holds out 30% of the rows as the test set.
train, test = train_test_split(
    shuf,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

## Random Forest

## Training

In [189]:
# Predictor columns and target (arrival difference) taken from the training split.
X = train.loc[:, ['month', 'dayOfWeek', 'rushHour', 'hour', 'temp', 'wind_speed']]
y = train['arr_diff']

In [190]:
# Random forest regressor with 20 trees and a fixed seed, fitted on the
# training predictors and target.
random_forest = RandomForestRegressor(random_state=42, n_estimators=20)
random_forest.fit(X, y)

In [191]:
# Pair each predictor column with the importance reported by the fitted
# forest, then show the table with the most important feature first.
feature_importance = pd.DataFrame(
    {
        'feature': X.columns,
        'importance': random_forest.feature_importances_,
    }
)
feature_importance.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
4,temp,0.234347
3,hour,0.214139
5,wind_speed,0.206011
0,month,0.156864
1,dayOfWeek,0.115881
2,rushHour,0.072759


<h2>Prediction & Evaluation on Training Data</h2>

In [192]:
# Predict on the same rows the model was trained on.
train_rf_predictions = random_forest.predict(X)

# Place the predictions next to the actual values, sharing y's index.
train_pred_series = pd.Series(train_rf_predictions, index=y.index, name='Pred_arr_diff')
train_actual_vs_pred_rf = pd.concat([y, train_pred_series], axis=1)
train_actual_vs_pred_rf.head(10)

Unnamed: 0,arr_diff,Pred_arr_diff
2235686,-86,-34.596767
66808,-161,-237.208769
1921431,717,577.285963
611262,-101,59.322038
1203528,337,293.847522
2503084,-1,118.146633
2072746,645,275.13164
2225450,23,-51.698781
2298209,62,134.504612
1489401,164,310.617906


In [193]:
# Function to output evaluation metrics
def printMetrics(testActualVal, predictions):
    """Print regression error metrics for a set of predictions.

    Prints, one per line, the mean absolute error (MAE), mean squared
    error (MSE), root mean squared error (RMSE, the square root of the
    MSE) and the R2 score, all computed with ``sklearn.metrics``.

    Parameters
    ----------
    testActualVal : array-like
        The actual target values.
    predictions : array-like
        The predicted target values, in the same order as ``testActualVal``.

    Returns
    -------
    None
        The metrics are printed only.
    """
    # Regression evaluation measures. The MSE is computed once and reused
    # for the RMSE instead of being recalculated.
    mse = metrics.mean_squared_error(testActualVal, predictions)
    print("MAE: ", metrics.mean_absolute_error(testActualVal, predictions))
    print("MSE: ", mse)
    print("RMSE: ", mse**0.5)
    print("R2: ", metrics.r2_score(testActualVal, predictions))

In [194]:
printMetrics(y, train_rf_predictions)

MAE:  235.27000278217105
MSE:  103473.10257640052
RMSE:  321.67235283188467
R2:  0.358670071227141


<h2>Prediction & Evaluation on Testing Data</h2>

In [195]:
# Same predictor columns and target, taken from the held-out test split.
X_test = test.loc[:, ['month', 'dayOfWeek', 'rushHour', 'hour', 'temp', 'wind_speed']]
y_test = test['arr_diff']

In [196]:
# Predict on the held-out test rows.
test_rf_predictions = random_forest.predict(X_test)

# Place the predictions next to the actual values, sharing y_test's index.
test_pred_series = pd.Series(test_rf_predictions, index=y_test.index, name='Pred_arr_diff')
test_actual_vs_pred_rf = pd.concat([y_test, test_pred_series], axis=1)
test_actual_vs_pred_rf.head(10)

Unnamed: 0,arr_diff,Pred_arr_diff
1171102,-185,20.341614
1555970,1315,622.059457
1475516,100,232.535689
1931597,-192,176.690204
2014985,648,225.268069
1079947,1136,159.515603
517282,83,-113.333518
2228963,-473,-129.640035
390252,967,469.149132
465112,660,417.296811


In [197]:
printMetrics(y_test, test_rf_predictions)

MAE:  237.04862125060941
MSE:  104912.8466875292
RMSE:  323.9025265222999
R2:  0.347143828333905


<h2>Prediction & Evaluation on Full Data (3-Fold Cross-Validation):</h2>

In [198]:
# Predictors and target from the full shuffled data set, used for cross-validation.
X_fold = shuf.loc[:, ['month', 'dayOfWeek', 'rushHour', 'hour', 'temp', 'wind_speed']]
y_fold = shuf['arr_diff']

<h3>3-Fold Cross-Validation Metrics:</h3>

In [199]:
# Run the 3-fold cross-validation once and collect every metric from the
# same fitted models. cross_validate accepts several scorers at a time, so
# the forest is trained 3 times in total instead of 3 times per metric.
cv_scoring = [
    'neg_mean_absolute_error',
    'neg_mean_squared_error',
    'neg_root_mean_squared_error',
    'r2',
]
cv_results = cross_validate(
    RandomForestRegressor(n_estimators=20, random_state=42),
    X_fold,
    y_fold,
    scoring=cv_scoring,
    cv=3,
)

# Per-fold score arrays; cross_validate stores them under 'test_<scorer name>'.
mae = cv_results['test_neg_mean_absolute_error']
mse = cv_results['test_neg_mean_squared_error']
rmse = cv_results['test_neg_root_mean_squared_error']
r2 = cv_results['test_r2']

# The 'neg_*' scorers return negated errors, so flip the sign for display.
print("MAE: ", -mae.mean())
print("MSE: ", -mse.mean())
print("RMSE: ", -rmse.mean())
print("R2: ", r2.mean())

MAE:  237.15690981916472
MSE:  105200.3425584901
RMSE:  324.34563731259783
R2:  0.34718493342140455


## Pickle the Model

In [200]:
# Serialize the trained model to '9_2dir_rf_model.pkl' on disk using pickle
# (highest available pickle protocol).
with open('9_2dir_rf_model.pkl', 'wb') as model_file:
    pickle.dump(random_forest, model_file, protocol=pickle.HIGHEST_PROTOCOL)

In [201]:
# Serialize the same model to '9_2dir_rf_model_sm.pkl', writing the pickle
# through a bz2-compressed file handle.
with bz2.open('9_2dir_rf_model_sm.pkl', 'wb') as compressed_file:
    pickle.dump(random_forest, compressed_file, protocol=pickle.HIGHEST_PROTOCOL)