In [1]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns

# Allows plots to appear directly in the notebook.
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

import pickle

In [2]:
# Load the route CSV into a DataFrame.
rt = pd.read_csv(
    'v9_route_39A.csv',
    delimiter=',',
    keep_default_na=True,
    skipinitialspace=True,
)

In [3]:
# (rows, columns) of the loaded frame.
rt.shape

(3417494, 27)

## Arrival Difference Column

In [4]:
# act_stp_arr minus PLANNEDTIME_ARR: a positive value means act_stp_arr is the
# larger of the two (presumably a late arrival - confirm column semantics).
rt['arr_diff'] = rt['act_stp_arr'].sub(rt['PLANNEDTIME_ARR'])

In [5]:
# Preview the first rows, which now carry the arr_diff column.
rt.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,VEHICLEID,year,month,day,id,...,stop_actARR_hour,wthr_tr_lt_id,temp,humidity,wind_speed,weather_id,weather_description,dos_trpID_stpID_prgNum,act_stp_arr,arr_diff
0,2018-01-01,5959380,1,767,32400,2172316,2018,1,1,20180101595938001,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-767-1,32323.0,-77.0
1,2018-01-01,5959380,2,768,32503,2172316,2018,1,1,20180101595938002,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-768-2,32461.0,-42.0
2,2018-01-01,5959380,3,769,32527,2172316,2018,1,1,20180101595938003,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-769-3,32481.0,-46.0
3,2018-01-01,5959380,4,770,32565,2172316,2018,1,1,20180101595938004,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-770-4,32510.0,-55.0
4,2018-01-01,5959380,5,771,32581,2172316,2018,1,1,20180101595938005,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-771-5,32520.0,-61.0


## Hour Column

In [6]:
# PLANNEDTIME_ARR / 3600, rounded to the nearest whole number
# (presumably seconds -> hour of day).
# NOTE(review): round() goes to the nearest hour instead of truncating, so
# values past the half hour land in the following hour - confirm intended.
rt['hour'] = rt['PLANNEDTIME_ARR'].div(3600).round()

In [7]:
# Store the rounded hour as a 16-bit integer instead of a float.
rt['hour'] = rt['hour'].astype(np.int16)

In [8]:
# Preview the first rows to check the integer hour column.
rt.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,VEHICLEID,year,month,day,id,...,wthr_tr_lt_id,temp,humidity,wind_speed,weather_id,weather_description,dos_trpID_stpID_prgNum,act_stp_arr,arr_diff,hour
0,2018-01-01,5959380,1,767,32400,2172316,2018,1,1,20180101595938001,...,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-767-1,32323.0,-77.0,9
1,2018-01-01,5959380,2,768,32503,2172316,2018,1,1,20180101595938002,...,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-768-2,32461.0,-42.0,9
2,2018-01-01,5959380,3,769,32527,2172316,2018,1,1,20180101595938003,...,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-769-3,32481.0,-46.0,9
3,2018-01-01,5959380,4,770,32565,2172316,2018,1,1,20180101595938004,...,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-770-4,32510.0,-55.0,9
4,2018-01-01,5959380,5,771,32581,2172316,2018,1,1,20180101595938005,...,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-771-5,32520.0,-61.0,9


## Shuffling Data

In [9]:
# Row shuffle inspired from Geeks for Geeks: https://www.geeksforgeeks.org/pandas-how-to-shuffle-a-dataframe-rows/
shuf = rt.sample(frac = 1)
shuf.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,VEHICLEID,year,month,day,id,...,wthr_tr_lt_id,temp,humidity,wind_speed,weather_id,weather_description,dos_trpID_stpID_prgNum,act_stp_arr,arr_diff,hour
2298223,2018-08-30,7512573,2,768,57133,1000607,2018,8,30,20180830751257302,...,2018083016,15.39,67,4.6,803,broken clouds,2018-08-30-7512573-768-2,57221.0,88.0,16
2100929,2018-08-11,7320279,39,1661,34637,2406871,2018,8,11,20180811732027939,...,2018081110,17.39,72,5.1,803,broken clouds,2018-08-11-7320279-1661-39,34945.0,308.0,10
2503242,2018-09-29,7762129,19,1863,35353,2868372,2018,9,29,20180929776212919,...,2018092910,11.46,71,4.6,800,sky is clear,2018-09-29-7762129-1863-19,35796.0,443.0,10
606729,2018-03-07,6388328,58,751,40460,1000936,2018,3,7,20180307638832858,...,2018030711,6.39,75,9.3,802,scattered clouds,2018-03-07-6388328-751-58,41450.0,990.0,11
2742648,2018-10-24,8049473,74,7162,35173,1000938,2018,10,24,20181024804947374,...,2018102410,10.39,82,7.7,803,broken clouds,2018-10-24-8049473-7162-74,36294.0,1121.0,10


## Splitting Shuffled Data: Train (70%) & Test (30%)

In [10]:
# Hold out 30% of the rows for testing. train_test_split shuffles on its own,
# so this is a second shuffle on top of the one already applied to `shuf`.
train, test = train_test_split(
    shuf,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

## Random Forest

## Training

In [11]:
# Predictor columns and target, both taken from the training split.
X = train[
    [
        'month',
        'dayOfWeek',
        'rushHour',
        'hour',
        'temp',
        'wind_speed',
    ]
]
y = train['arr_diff']

In [12]:
# 20-tree random forest regressor with a fixed seed, fitted on the training
# features X against the target y.
random_forest = RandomForestRegressor(
    n_estimators=20,
    random_state=42,
)
random_forest.fit(X, y)

In [13]:
# Compute the importance of each feature based on the trained random forest regressor
feature_importance = pd.DataFrame({'feature': X.columns, 'importance':random_forest.feature_importances_})
feature_importance.sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
3,hour,0.239407
4,temp,0.229453
0,month,0.187995
5,wind_speed,0.179137
1,dayOfWeek,0.128348
2,rushHour,0.035659


<h2>Prediction & Evaluation on Training Data</h2>

In [14]:
# Predict on the same rows the forest was fitted on.
train_rf_predictions = random_forest.predict(X)

# Put the actual and predicted values side by side, aligned on y's index.
train_actual_vs_pred_rf = pd.concat(
    [y, pd.Series(train_rf_predictions, index=y.index, name='Pred_arr_diff')],
    axis=1,
)
train_actual_vs_pred_rf.head(10)

Unnamed: 0,arr_diff,Pred_arr_diff
1043466,-113.0,164.155427
3262681,6.0,381.832261
2698502,416.0,10.834283
1275129,92.0,216.734198
2739887,112.0,103.765933
3243436,198.0,276.991174
898330,35.0,159.334907
2183256,90.0,298.331193
3138848,-282.0,128.858562
3101080,307.0,657.814476


In [15]:
# Function to output evaluation metrics
def printMetrics(testActualVal, predictions):
    """Print MAE, MSE, RMSE and R2 for a set of regression predictions.

    Parameters
    ----------
    testActualVal : array-like
        Observed target values.
    predictions : array-like
        Predicted values, aligned element-wise with ``testActualVal``.

    Returns
    -------
    None
        The four scores are written to stdout, one per line.
    """
    # Regression evaluation measures (the target is continuous).
    print("MAE: ", metrics.mean_absolute_error(testActualVal, predictions))
    # Compute the MSE once and reuse it for the RMSE line.
    mse = metrics.mean_squared_error(testActualVal, predictions)
    print("MSE: ", mse)
    print("RMSE: ", mse**0.5)
    print("R2: ", metrics.r2_score(testActualVal, predictions))

In [16]:
# Scores on the same rows the forest was fitted on (training data).
printMetrics(y, train_rf_predictions)

MAE:  269.75320356190343
MSE:  132394.49474398317
RMSE:  363.8605429886334
R2:  0.23864383274523016


<h2>Prediction & Evaluation on Testing Data</h2>

In [17]:
# Predictor columns and target for the held-out test split.
X_test = test[
    [
        'month',
        'dayOfWeek',
        'rushHour',
        'hour',
        'temp',
        'wind_speed',
    ]
]
y_test = test['arr_diff']

In [18]:
# Predict on the held-out rows with the forest fitted on the training split.
test_rf_predictions = random_forest.predict(X_test)

# Put the actual and predicted values side by side, aligned on y_test's index.
test_actual_vs_pred_rf = pd.concat(
    [y_test, pd.Series(test_rf_predictions, index=y_test.index, name='Pred_arr_diff')],
    axis=1,
)
test_actual_vs_pred_rf.head(10)

Unnamed: 0,arr_diff,Pred_arr_diff
1684653,173.0,237.023164
3135746,-17.0,-21.129498
2283412,167.0,24.2332
1250018,83.0,17.011545
555944,-277.0,75.188713
2717272,603.0,280.614076
1786510,-35.0,157.009956
209833,20.0,195.504283
1823751,1109.0,306.495377
413702,65.0,136.273288


In [19]:
# Scores on the held-out test split.
printMetrics(y_test, test_rf_predictions)

MAE:  270.9493849772164
MSE:  133524.1981535044
RMSE:  365.4096306250075
R2:  0.23269798145344434


<h2>Prediction & Evaluation on Full Data (3-Fold Cross-Validation):</h2>

In [None]:
# Predictor columns and target from the full shuffled frame, for cross-validation.
X_fold = shuf[
    [
        'month',
        'dayOfWeek',
        'rushHour',
        'hour',
        'temp',
        'wind_speed',
    ]
]
y_fold = shuf['arr_diff']

<h3>3-Fold Cross-Validation Metrics:</h3>

In [None]:
# Fit each of the 3 folds once and score it on all four metrics. Separate
# cross_val_score calls would refit identical forests once per metric
# (12 fits instead of 3) and return the same numbers.
cv_scoring = {
    'mae': 'neg_mean_absolute_error',
    'mse': 'neg_mean_squared_error',
    'rmse': 'neg_root_mean_squared_error',
    'r2': 'r2',
}
cv_results = cross_validate(
    RandomForestRegressor(n_estimators=20, random_state=42),
    X_fold,
    y_fold,
    scoring=cv_scoring,
    cv=3,
)

# Per-fold score arrays, kept under the names used previously.
mae = cv_results['test_mae']
mse = cv_results['test_mse']
rmse = cv_results['test_rmse']
r2 = cv_results['test_r2']

# The error scorers are negated by scikit-learn ("greater is better"
# convention), so flip the sign back before reporting the fold averages.
print("MAE: ", -mae.mean())
print("MSE: ", -mse.mean())
print("RMSE: ", -rmse.mean())
print("R2: ", r2.mean())