In [1]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns

# Allows plots to appear directly in the notebook.
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

import pickle

In [2]:
# Load the route 39A CSV export into a DataFrame.
rt = pd.read_csv(
    'v9_route_39A.csv',
    sep=',',
    skipinitialspace=True,
    keep_default_na=True,
)

In [3]:
# (rows, columns) of the frame as loaded from the CSV.
rt.shape

(3417494, 27)

## Arrival Difference Column

In [4]:
# Arrival difference per stop: actual arrival minus planned arrival, so a positive
# value means the actual arrival was later than planned.
# (presumably both columns are seconds since midnight - TODO confirm)
rt['arr_diff'] = rt['act_stp_arr'].sub(rt['PLANNEDTIME_ARR'])

In [5]:
# Store the difference as 64-bit integers (act_stp_arr is shown as a float in the preview).
# NOTE(review): an integer cast fails on missing values - presumably none remain at this
# point; confirm against the upstream cleaning step.
rt['arr_diff'] = rt['arr_diff'].astype('int64')

In [6]:
# Preview the first rows to check the new arr_diff column.
rt.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,VEHICLEID,year,month,day,id,...,stop_actARR_hour,wthr_tr_lt_id,temp,humidity,wind_speed,weather_id,weather_description,dos_trpID_stpID_prgNum,act_stp_arr,arr_diff
0,2018-01-01,5959380,1,767,32400,2172316,2018,1,1,20180101595938001,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-767-1,32323.0,-77
1,2018-01-01,5959380,2,768,32503,2172316,2018,1,1,20180101595938002,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-768-2,32461.0,-42
2,2018-01-01,5959380,3,769,32527,2172316,2018,1,1,20180101595938003,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-769-3,32481.0,-46
3,2018-01-01,5959380,4,770,32565,2172316,2018,1,1,20180101595938004,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-770-4,32510.0,-55
4,2018-01-01,5959380,5,771,32581,2172316,2018,1,1,20180101595938005,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-771-5,32520.0,-61


## Hour Column

In [7]:
# Planned arrival expressed in hours. Note that .round() rounds to the NEAREST hour
# (it does not truncate), so a time in the second half of an hour maps to the next hour.
# (assumes PLANNEDTIME_ARR is in seconds - TODO confirm)
rt['hour'] = rt['PLANNEDTIME_ARR'].div(3600).round()

In [8]:
# Convert the rounded hour from float to a 16-bit integer column.
rt['hour'] = rt['hour'].astype('int16')

In [9]:
# Preview the first rows to check the new hour column.
rt.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,VEHICLEID,year,month,day,id,...,wthr_tr_lt_id,temp,humidity,wind_speed,weather_id,weather_description,dos_trpID_stpID_prgNum,act_stp_arr,arr_diff,hour
0,2018-01-01,5959380,1,767,32400,2172316,2018,1,1,20180101595938001,...,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-767-1,32323.0,-77,9
1,2018-01-01,5959380,2,768,32503,2172316,2018,1,1,20180101595938002,...,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-768-2,32461.0,-42,9
2,2018-01-01,5959380,3,769,32527,2172316,2018,1,1,20180101595938003,...,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-769-3,32481.0,-46,9
3,2018-01-01,5959380,4,770,32565,2172316,2018,1,1,20180101595938004,...,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-770-4,32510.0,-55,9
4,2018-01-01,5959380,5,771,32581,2172316,2018,1,1,20180101595938005,...,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-771-5,32520.0,-61,9


## Just 39A Trips going in Direction '1'

In [10]:
# Keep only the rows whose DIRECTION value equals 1.
rt_39a1 = rt.loc[rt['DIRECTION'].eq(1)]

In [11]:
# (rows, columns) remaining after the direction filter.
rt_39a1.shape

(1585989, 29)

## Shuffling Data

In [12]:
# Row shuffle inspired from Geeks for Geeks: https://www.geeksforgeeks.org/pandas-how-to-shuffle-a-dataframe-rows/
# frac=1 returns every row in random order. The fixed random_state makes that order
# repeatable across kernel restarts, so everything derived from `shuf` (the train/test
# split and the cross-validation folds) is reproducible as well.
shuf = rt_39a1.sample(frac=1, random_state=42)
shuf.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,VEHICLEID,year,month,day,id,...,wthr_tr_lt_id,temp,humidity,wind_speed,weather_id,weather_description,dos_trpID_stpID_prgNum,act_stp_arr,arr_diff,hour
235114,2018-01-24,6226318,2,768,41516,1000617,2018,1,24,20180124622631802,...,2018012412,8.39,70,12.9,801,few clouds,2018-01-24-6226318-768-2,41741.0,225,12
3135394,2018-11-27,8137688,72,7047,76589,1000603,2018,11,27,20181127813768872,...,2018112721,8.39,75,3.1,803,broken clouds,2018-11-27-8137688-7047-72,77105.0,516,21
3277850,2018-12-13,8177200,10,776,44306,1000944,2018,12,13,20181213817720010,...,2018121312,6.39,75,10.3,500,light rain,2018-12-13-8177200-776-10,44312.0,6,12
1611956,2018-06-21,7019600,48,7025,35320,1000936,2018,6,21,20180621701960048,...,2018062110,15.39,51,7.2,802,scattered clouds,2018-06-21-7019600-7025-48,35746.0,426,10
92722,2018-01-10,6101484,10,776,24450,2534866,2018,1,10,20180110610148410,...,2018011007,4.39,93,3.6,801,few clouds,2018-01-10-6101484-776-10,24826.0,376,7


## Splitting Shuffled Data: Train (70%) & Test (30%)

In [13]:
# Hold out 30% of the rows for testing. train_test_split shuffles on its own
# (shuffle=True), so shuffling `shuf` beforehand is redundant but harmless.
train, test = train_test_split(
    shuf,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

## Random Forest

## Training

In [41]:
# Training inputs (five feature columns) and the target (arrival difference).
X = train.loc[:, ['dayOfWeek', 'rushHour', 'hour', 'temp', 'wind_speed']]
y = train['arr_diff']

In [42]:
# A 20-tree random forest regressor; the fixed seed makes the fitted trees repeatable.
random_forest = RandomForestRegressor(
    n_estimators=20,
    random_state=42,
)
random_forest.fit(X, y)

In [43]:
# Pair each feature name with its importance from the fitted forest,
# then display the features from most to least important.
feature_importance = pd.DataFrame(
    {
        'feature': X.columns,
        'importance': random_forest.feature_importances_,
    }
)
feature_importance.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
3,temp,0.33445
4,wind_speed,0.280164
2,hour,0.221407
0,dayOfWeek,0.122783
1,rushHour,0.041195


<h2>Prediction & Evaluation on Training Data</h2>

In [44]:
# Predict on the same rows the forest was fitted on.
train_rf_predictions = random_forest.predict(X)

# Actual and predicted arrival difference side by side, keyed by the original row index.
train_actual_vs_pred_rf = pd.concat(
    [y, pd.Series(train_rf_predictions, index=y.index, name='Pred_arr_diff')],
    axis=1,
)
train_actual_vs_pred_rf.head(10)

Unnamed: 0,arr_diff,Pred_arr_diff
1995140,588,-44.363823
2739755,213,60.680289
3328529,-135,86.980016
1738814,-38,259.740924
1841835,139,373.975387
1910159,36,71.199592
2478931,1110,506.033612
625673,115,358.741196
1506902,5,200.917088
1066228,147,44.578667


In [45]:
# Function to output evaluation metrics
def printMetrics(testActualVal, predictions):
    """Print regression error metrics for a set of predictions.

    Prints MAE, MSE, RMSE and R2, one per line, computed with sklearn.metrics.

    Parameters
    ----------
    testActualVal : array-like
        The true target values.
    predictions : array-like
        The predicted values, in the same order as ``testActualVal``.

    Returns
    -------
    None
        The metrics are written to stdout only.
    """
    # Regression evaluation measures (these are not classification metrics).
    # MSE is computed once and reused for RMSE instead of scanning the data twice.
    mse = metrics.mean_squared_error(testActualVal, predictions)
    print("MAE: ", metrics.mean_absolute_error(testActualVal, predictions))
    print("MSE: ", mse)
    print("RMSE: ", mse**0.5)
    print("R2: ", metrics.r2_score(testActualVal, predictions))

In [46]:
# Metrics on the training rows, i.e. the same rows the forest was fitted on.
printMetrics(y, train_rf_predictions)

MAE:  286.25971719166137
MSE:  146218.25595566403
RMSE:  382.38495780517314
R2:  0.3261320327483602


<h2>Prediction & Evaluation on Testing Data</h2>

In [47]:
# Held-out inputs and target, using the same five feature columns as training.
X_test = test.loc[:, ['dayOfWeek', 'rushHour', 'hour', 'temp', 'wind_speed']]
y_test = test['arr_diff']

In [48]:
# Predict on the held-out rows.
test_rf_predictions = random_forest.predict(X_test)

# Actual and predicted arrival difference side by side, keyed by the original row index.
test_actual_vs_pred_rf = pd.concat(
    [y_test, pd.Series(test_rf_predictions, index=y_test.index, name='Pred_arr_diff')],
    axis=1,
)
test_actual_vs_pred_rf.head(10)

Unnamed: 0,arr_diff,Pred_arr_diff
2473862,49,277.179
181524,78,-105.23512
1825333,39,231.862947
1189404,615,259.593937
1589438,69,91.415046
2887442,-76,435.577741
1488991,157,284.043625
1890589,1018,274.403831
2302013,-177,79.541082
1573928,209,67.135067


In [49]:
# Metrics on the held-out test rows.
printMetrics(y_test, test_rf_predictions)

MAE:  288.6778156984142
MSE:  148575.87117683134
RMSE:  385.45540750757584
R2:  0.3177014064832876


<h2>Prediction & Evaluation on Full Data (3-Fold Cross-Validation):</h2>

In [None]:
# Cross-validate on the same five features the fitted (and pickled) forest is trained on,
# so the cross-validation scores describe that model rather than a different feature set.
X_fold = shuf[['dayOfWeek', 'rushHour', 'hour', 'temp', 'wind_speed']]
y_fold = shuf.arr_diff

<h3>3-Fold Cross-Validation Metrics:</h3>

In [None]:
# 3-fold cross-validation of the forest configuration.
# cross_validate fits one forest per fold and scores every metric on that fit, so the
# forest is trained 3 times instead of 12 (one cross_val_score call per metric refits
# all folds). With cv=3 and a seeded forest the per-fold scores are the same as before.
cv_scores = cross_validate(
    RandomForestRegressor(n_estimators=20, random_state=42),
    X_fold,
    y_fold,
    scoring={
        'mae': 'neg_mean_absolute_error',
        'mse': 'neg_mean_squared_error',
        'rmse': 'neg_root_mean_squared_error',
        'r2': 'r2',
    },
    cv=3,
)

# Per-fold score arrays. The three error scorers are negated by scikit-learn
# ("greater is better"), hence the sign flip when printing.
mae = cv_scores['test_mae']
mse = cv_scores['test_mse']
rmse = cv_scores['test_rmse']
r2 = cv_scores['test_r2']

print("MAE: ", -mae.mean())
print("MSE: ", -mse.mean())
print("RMSE: ", -rmse.mean())
print("R2: ", r2.mean())

## Pickle the Model

In [None]:
# Serialize the fitted forest to '39A_1dir_rf_model.pkl' on disk, using the highest
# pickle protocol available to this interpreter.
with open('39A_1dir_rf_model.pkl', 'wb') as model_file:
    pickle.dump(random_forest, model_file, protocol=pickle.HIGHEST_PROTOCOL)