In [1]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns

# Allows plots to appear directly in the notebook.
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

import pickle
import bz2

In [177]:
# Load the route data from CSV into a DataFrame.
rt = pd.read_csv(
    'v9_route_9.csv',
    delimiter=',',
    skipinitialspace=True,
    keep_default_na=True,
)

In [178]:
# Show the dimensions of the loaded frame as (rows, columns).
rt.shape

(2681064, 27)

## Arrival Difference Column

In [179]:
# Arrival difference = actual stop arrival minus planned arrival
# (positive when the actual arrival is later than planned).
rt['arr_diff'] = rt['act_stp_arr'].sub(rt['PLANNEDTIME_ARR'])

In [180]:
# Store the arrival difference as 64-bit integers.
rt['arr_diff'] = rt['arr_diff'].astype(np.int64)

In [181]:
# Preview the first rows to check the new 'arr_diff' column.
rt.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,VEHICLEID,year,month,day,id,...,stop_actARR_hour,wthr_tr_lt_id,temp,humidity,wind_speed,weather_id,weather_description,dos_trpID_stpID_prgNum,act_stp_arr,arr_diff
0,2018-01-01,5957793,1,7132,32400,2172314,2018,1,1,20180101595779301,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5957793-7132-1,32608.0,208
1,2018-01-01,5957793,2,6230,32470,2172314,2018,1,1,20180101595779302,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5957793-6230-2,32672.0,202
2,2018-01-01,5957793,3,6228,32513,2172314,2018,1,1,20180101595779303,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5957793-6228-3,32696.0,183
3,2018-01-01,5957793,4,4788,32569,2172314,2018,1,1,20180101595779304,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5957793-4788-4,32744.0,175
4,2018-01-01,5957793,5,6273,32615,2172314,2018,1,1,20180101595779305,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5957793-6273-5,32803.0,188


## Hour Column

In [182]:
# Convert the planned arrival time to hours (divide by 3600) and round to the
# nearest whole hour, so times in the second half of an hour map to the next hour.
rt['hour'] = rt['PLANNEDTIME_ARR'].div(3600).round()

In [183]:
# Store the rounded hour as 16-bit integers.
rt['hour'] = rt['hour'].astype(np.int16)

In [184]:
# Preview the first rows to check the new 'hour' column.
rt.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,VEHICLEID,year,month,day,id,...,wthr_tr_lt_id,temp,humidity,wind_speed,weather_id,weather_description,dos_trpID_stpID_prgNum,act_stp_arr,arr_diff,hour
0,2018-01-01,5957793,1,7132,32400,2172314,2018,1,1,20180101595779301,...,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5957793-7132-1,32608.0,208,9
1,2018-01-01,5957793,2,6230,32470,2172314,2018,1,1,20180101595779302,...,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5957793-6230-2,32672.0,202,9
2,2018-01-01,5957793,3,6228,32513,2172314,2018,1,1,20180101595779303,...,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5957793-6228-3,32696.0,183,9
3,2018-01-01,5957793,4,4788,32569,2172314,2018,1,1,20180101595779304,...,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5957793-4788-4,32744.0,175,9
4,2018-01-01,5957793,5,6273,32615,2172314,2018,1,1,20180101595779305,...,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5957793-6273-5,32803.0,188,9


## Just Route 9 Trips going in Direction '1'

In [185]:
# Keep only the rows whose DIRECTION value is 1.
rt_39a1 = rt.loc[rt['DIRECTION'] == 1]

In [186]:
# Show the dimensions of the direction-1 subset as (rows, columns).
rt_39a1.shape

(1304522, 29)

## Shuffling Data

In [187]:
# Row shuffle inspired from Geeks for Geeks: https://www.geeksforgeeks.org/pandas-how-to-shuffle-a-dataframe-rows/
# The shuffle is seeded so the row order is identical on every run. The
# cross-validation further down uses cv=3 (consecutive, unshuffled folds), so
# its fold membership and reported metrics depend directly on this order.
shuf = rt_39a1.sample(frac=1, random_state=42)
shuf.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,VEHICLEID,year,month,day,id,...,wthr_tr_lt_id,temp,humidity,wind_speed,weather_id,weather_description,dos_trpID_stpID_prgNum,act_stp_arr,arr_diff,hour
1778830,2018-08-29,7505651,36,1282,47789,3265680,2018,8,29,20180829750565136,...,2018082913,17.39,55,6.7,802,scattered clouds,2018-08-29-7505651-1282-36,47729.0,-60,13
2261346,2018-09-18,8090414,35,1278,65341,2172308,2018,9,18,20180918809041435,...,2018091818,18.39,56,7.7,500,light rain,2018-09-18-8090414-1278-35,65827.0,486,18
2342044,2018-11-12,8129705,38,1284,81370,2868370,2018,11,12,20181112812970538,...,2018111223,8.39,87,5.1,803,broken clouds,2018-11-12-8129705-1284-38,81535.0,165,23
2190035,2018-10-29,8075045,28,189,61685,2534815,2018,10,29,20181029807504528,...,2018102917,5.46,70,3.6,803,broken clouds,2018-10-29-8075045-189-28,61938.0,253,17
1853092,2018-09-09,8081541,56,2417,42226,2172312,2018,9,9,20180909808154156,...,2018090912,16.39,68,7.2,802,scattered clouds,2018-09-09-8081541-2417-56,41932.0,-294,12


## Splitting Shuffled Data: Train (70%) & Test (30%)

In [188]:
# Hold out 30% of the shuffled rows for testing. train_test_split shuffles the
# rows again by default; that is harmless here and is kept explicit.
train, test = train_test_split(
    shuf,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

## Random Forest

## Training

In [189]:
# Training inputs: the six feature columns; target: the arrival difference.
X = train.loc[:, ['month', 'dayOfWeek', 'rushHour', 'hour', 'temp', 'wind_speed']]
y = train['arr_diff']

In [190]:
# Fit a seeded 20-tree random forest regressor on the training data.
random_forest = RandomForestRegressor(
    random_state=42,
    n_estimators=20,
)
random_forest.fit(X, y)

In [191]:
# Pair each training feature with the importance the fitted forest assigns to it,
# then display the table from most to least important.
feature_importance = pd.DataFrame(
    {
        'feature': X.columns,
        'importance': random_forest.feature_importances_,
    }
)
feature_importance.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
3,hour,0.237919
4,temp,0.234026
5,wind_speed,0.216703
0,month,0.138562
1,dayOfWeek,0.116803
2,rushHour,0.055987


<h2>Prediction & Evaluation on Training Data</h2>

In [192]:
# Predict on the same rows the forest was trained on.
train_rf_predictions = random_forest.predict(X)

# Put the predictions next to the actual values, aligned on the original row index.
train_actual_vs_pred_rf = pd.concat(
    [y, pd.Series(train_rf_predictions, index=y.index, name='Pred_arr_diff')],
    axis=1,
)
train_actual_vs_pred_rf.head(10)

Unnamed: 0,arr_diff,Pred_arr_diff
1325541,704,448.837584
1337252,865,264.725792
2400284,333,162.145329
1147122,-53,-63.399506
684016,128,25.668007
1700318,428,171.870027
59911,74,15.694895
1409440,288,288.556655
1895850,388,133.414961
2392761,244,13.644661


In [193]:
# Function to output evaluation metrics
def printMetrics(testActualVal, predictions):
    """Print MAE, MSE, RMSE and R2 for a set of regression predictions.

    Args:
        testActualVal: Actual target values.
        predictions: Predicted values, in the same order as ``testActualVal``.

    Returns:
        None. The four metrics are written to standard output.
    """
    # Regression evaluation measures. MSE is computed once and reused for RMSE
    # instead of being recalculated over the whole data set.
    mse = metrics.mean_squared_error(testActualVal, predictions)
    print("MAE: ", metrics.mean_absolute_error(testActualVal, predictions))
    print("MSE: ", mse)
    print("RMSE: ", mse**0.5)
    print("R2: ", metrics.r2_score(testActualVal, predictions))

In [194]:
# Report the error metrics of the forest's predictions on the training rows.
printMetrics(y, train_rf_predictions)

MAE:  232.90868426691142
MSE:  101152.23631593672
RMSE:  318.04439362443844
R2:  0.31574024796679023


<h2>Prediction & Evaluation on Testing Data</h2>

In [195]:
# Test inputs and target, using the same six feature columns as in training.
X_test = test.loc[:, ['month', 'dayOfWeek', 'rushHour', 'hour', 'temp', 'wind_speed']]
y_test = test['arr_diff']

In [196]:
# Predict on the held-out test rows.
test_rf_predictions = random_forest.predict(X_test)

# Put the predictions next to the actual values, aligned on the original row index.
test_actual_vs_pred_rf = pd.concat(
    [y_test, pd.Series(test_rf_predictions, index=y_test.index, name='Pred_arr_diff')],
    axis=1,
)
test_actual_vs_pred_rf.head(10)

Unnamed: 0,arr_diff,Pred_arr_diff
2410218,230,501.495766
2302993,747,239.08039
1315570,43,42.24018
704566,377,462.397735
2525388,500,238.665606
2304260,218,290.41226
2581137,203,38.211893
1780786,509,139.36544
240762,490,212.650917
1143430,99,97.248896


In [197]:
# Report the error metrics of the forest's predictions on the held-out test rows.
printMetrics(y_test, test_rf_predictions)

MAE:  234.40415711010513
MSE:  102462.26950718531
RMSE:  320.09728131801637
R2:  0.30684665703156744


<h2>Prediction & Evaluation on Full Data (3-Fold Cross-Validation):</h2>

In [198]:
# Cross-validation inputs and target, taken from the full shuffled data set.
X_fold = shuf.loc[:, ['month', 'dayOfWeek', 'rushHour', 'hour', 'temp', 'wind_speed']]
y_fold = shuf['arr_diff']

<h3>3-Fold Cross-Validation Metrics:</h3>

In [199]:
# 3-fold cross-validation of a seeded 20-tree forest on the full shuffled data.
# cross_validate scores all four metrics from the same three fits; calling
# cross_val_score once per metric would refit the identical forest 12 times.
cv_results = cross_validate(
    RandomForestRegressor(n_estimators=20, random_state=42),
    X_fold,
    y_fold,
    scoring=[
        'neg_mean_absolute_error',
        'neg_mean_squared_error',
        'neg_root_mean_squared_error',
        'r2',
    ],
    cv=3,
)

# Per-fold score arrays, kept under the names the notebook already uses.
mae = cv_results['test_neg_mean_absolute_error']
mse = cv_results['test_neg_mean_squared_error']
rmse = cv_results['test_neg_root_mean_squared_error']
r2 = cv_results['test_r2']

# The error scorers are negated by scikit-learn (higher is better), so flip the
# sign back before reporting the fold averages.
print("MAE: ", -mae.mean())
print("MSE: ", -mse.mean())
print("RMSE: ", -rmse.mean())
print("R2: ", r2.mean())

MAE:  234.77811959655313
MSE:  102796.12248331761
RMSE:  320.61754783193396
R2:  0.304608720608325


## Pickle the Model

In [200]:
# Write the trained forest to '9_1dir_rf_model.pkl' using pickle's highest protocol.
with open('9_1dir_rf_model.pkl', 'wb') as model_file:
    pickle.dump(random_forest, model_file, protocol=pickle.HIGHEST_PROTOCOL)

In [201]:
# Write the same forest to '9_1dir_rf_model_sm.pkl' through a bz2-compressed
# file object, again using pickle's highest protocol.
with bz2.open('9_1dir_rf_model_sm.pkl', 'wb') as compressed_file:
    pickle.dump(random_forest, compressed_file, protocol=pickle.HIGHEST_PROTOCOL)