In [1]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns

# Allows plots to appear directly in the notebook.
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

import pickle

In [2]:
# Load the route 39A records from CSV into a DataFrame.
rt = pd.read_csv(
    'v9_route_39A.csv',
    sep=',',
    keep_default_na=True,
    skipinitialspace=True,
)

In [3]:
# Sanity-check the size of the loaded frame as (rows, columns).
rt.shape

(3417494, 27)

## Arrival Difference Column

In [4]:
# Regression target: actual arrival minus planned arrival at each stop
# (positive = later than planned, negative = earlier).
# NOTE(review): both columns look like seconds since midnight — confirm units.
rt['arr_diff'] = rt['act_stp_arr'].sub(rt['PLANNEDTIME_ARR'])

In [5]:
# Preview the first rows; arr_diff should now appear as the last column.
rt.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,VEHICLEID,year,month,day,id,...,stop_actARR_hour,wthr_tr_lt_id,temp,humidity,wind_speed,weather_id,weather_description,dos_trpID_stpID_prgNum,act_stp_arr,arr_diff
0,2018-01-01,5959380,1,767,32400,2172316,2018,1,1,20180101595938001,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-767-1,32323.0,-77.0
1,2018-01-01,5959380,2,768,32503,2172316,2018,1,1,20180101595938002,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-768-2,32461.0,-42.0
2,2018-01-01,5959380,3,769,32527,2172316,2018,1,1,20180101595938003,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-769-3,32481.0,-46.0
3,2018-01-01,5959380,4,770,32565,2172316,2018,1,1,20180101595938004,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-770-4,32510.0,-55.0
4,2018-01-01,5959380,5,771,32581,2172316,2018,1,1,20180101595938005,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-771-5,32520.0,-61.0


## Shuffling Data

In [6]:
# Row shuffle inspired from Geeks for Geeks: https://www.geeksforgeeks.org/pandas-how-to-shuffle-a-dataframe-rows/
# frac=1 samples every row exactly once, i.e. a full permutation of the frame.
# The shuffle is seeded so that the row order of `shuf` (and anything that depends on that
# order, such as position-based splits or folds) is identical after a kernel restart.
shuf = rt.sample(frac=1, random_state=42)
shuf.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,VEHICLEID,year,month,day,id,...,stop_actARR_hour,wthr_tr_lt_id,temp,humidity,wind_speed,weather_id,weather_description,dos_trpID_stpID_prgNum,act_stp_arr,arr_diff
384285,2018-02-08,6258363,44,1807,34227,1000942,2018,2,8,20180208625836344,...,10,2018020810,8.39,93,5.7,300,light intensity drizzle,2018-02-08-6258363-1807-44,34686.0,459.0
651339,2018-03-12,6388330,48,1479,51422,1000615,2018,3,12,20180312638833048,...,14,2018031214,9.39,81,4.6,500,light rain,2018-03-12-6388330-1479-48,51347.0,-75.0
1225044,2018-05-11,6736086,18,785,53168,1000934,2018,5,11,20180511673608618,...,15,2018051115,10.39,76,4.6,500,light rain,2018-05-11-6736086-785-18,53160.0,-8.0
2527244,2018-10-02,7780348,19,1863,28813,2868310,2018,10,2,20181002778034819,...,8,2018100208,13.46,100,6.7,310,light intensity drizzle rain,2018-10-02-7780348-1863-19,28950.0,137.0
2914295,2018-11-04,8098776,65,758,78106,1000615,2018,11,4,20181104809877665,...,22,2018110422,9.39,100,2.1,500,light rain,2018-11-04-8098776-758-65,78487.0,381.0


## Splitting Shuffled Data: Train (70%) & Test (30%)

In [7]:
# Hold out 30% of the rows for testing; the fixed seed makes this split repeatable
# for a given input order. train_test_split shuffles on its own (shuffle=True), so the
# earlier manual shuffle is not required for this step, but it does no harm either.
train, test = train_test_split(
    shuf,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

## Decision Tree

## Training

In [8]:
# Predictor columns and regression target taken from the training split.
X = train.loc[
    :,
    ['DIRECTION', 'PROGRNUMBER', 'month', 'dayOfWeek', 'rushHour', 'temp', 'wind_speed'],
]
y = train['arr_diff']

In [9]:
# Shallow regression tree (at most 3 levels of splits) fitted on the training split.
# random_state is pinned because scikit-learn permutes the candidate features at every
# split, so ties between equally good splits could otherwise be broken differently from
# run to run. The seed matches the one used for the cross-validated trees in this notebook.
dtr = DecisionTreeRegressor(max_depth=3, random_state=1)
dtr.fit(X, y)

In [10]:
# Compute the importance of each feature based on the trained decision tree regressor
feature_importance = pd.DataFrame({'feature': X.columns, 'importance':dtr.feature_importances_})
feature_importance.sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
1,PROGRNUMBER,0.851856
2,month,0.081129
0,DIRECTION,0.067015
3,dayOfWeek,0.0
4,rushHour,0.0
5,temp,0.0
6,wind_speed,0.0


<h2>Prediction & Evaluation on Training Data</h2>

In [11]:
# Predict on the training rows themselves (in-sample fit).
train_dtr_predictions = dtr.predict(X)

# Put the true target and the prediction side by side, keeping the original row labels.
train_actual_vs_pred_dtr = y.to_frame().assign(Pred_arr_diff=train_dtr_predictions)
train_actual_vs_pred_dtr.head(10)

Unnamed: 0,arr_diff,Pred_arr_diff
1299166,1274.0,350.138034
493767,267.0,90.407998
3126889,-442.0,373.024569
1436985,545.0,350.138034
2249030,-178.0,223.466933
1739569,134.0,37.556329
576938,2304.0,223.466933
3222261,-97.0,37.556329
1816192,186.0,90.407998
1190867,844.0,350.138034


In [12]:
# Function to output evaluation metrics
def printMetrics(testActualVal, predictions):
    """Print MAE, MSE, RMSE and R2 for a set of regression predictions.

    Args:
        testActualVal: Ground-truth target values (array-like).
        predictions: Predicted values, aligned element-wise with ``testActualVal``.

    Returns:
        None. The four metrics are written to stdout, one per line.
    """
    # Regression evaluation measures.
    mse = metrics.mean_squared_error(testActualVal, predictions)
    print("MAE: ", metrics.mean_absolute_error(testActualVal, predictions))
    print("MSE: ", mse)
    # RMSE reuses the MSE computed above instead of making a second pass over the data.
    print("RMSE: ", mse ** 0.5)
    print("R2: ", metrics.r2_score(testActualVal, predictions))

In [13]:
# Error metrics on the same rows the tree was fitted on (in-sample performance).
printMetrics(y, train_dtr_predictions)

MAE:  268.92159401590465
MSE:  150688.91781663193
RMSE:  388.186704842698
R2:  0.13320972186407276


<h2>Prediction & Evaluation on Testing Data</h2>

In [14]:
# Same predictor columns and target as used for training, taken from the held-out split.
X_test = test.loc[
    :,
    ['DIRECTION', 'PROGRNUMBER', 'month', 'dayOfWeek', 'rushHour', 'temp', 'wind_speed'],
]
y_test = test['arr_diff']

In [15]:
# Predict on rows the tree has not seen during fitting.
test_dtr_predictions = dtr.predict(X_test)

# Put the true target and the prediction side by side, keeping the original row labels.
test_actual_vs_pred_dtr = y_test.to_frame().assign(Pred_arr_diff=test_dtr_predictions)
test_actual_vs_pred_dtr.head(10)

Unnamed: 0,arr_diff,Pred_arr_diff
3122729,845.0,350.138034
1982833,115.0,90.407998
1462810,403.0,223.466933
206756,377.0,50.080518
374541,191.0,90.407998
839636,-196.0,223.466933
1397142,7.0,90.407998
953402,382.0,223.466933
719971,132.0,90.407998
1215070,-10.0,37.556329


In [16]:
# Error metrics on the held-out test split (out-of-sample performance).
printMetrics(y_test, test_dtr_predictions)

MAE:  269.07956414349
MSE:  151080.2487116237
RMSE:  388.6904278621017
R2:  0.13234592468809736


<h2>Prediction & Evaluation on Full Data (5-Fold Cross-Validation):</h2>

In [17]:
# Predictors and target from the full shuffled frame, used as input to cross-validation.
X_fold = shuf.loc[
    :,
    ['DIRECTION', 'PROGRNUMBER', 'month', 'dayOfWeek', 'rushHour', 'temp', 'wind_speed'],
]
y_fold = shuf['arr_diff']

<h3>5-Fold Cross-Validation Metrics:</h3>

In [18]:
# Score one depth-3 tree per fold on all four metrics in a single cross-validation pass.
# cross_validate fits the model once per fold (5 fits), whereas one cross_val_score call
# per metric refits the same model on the same folds for every metric (20 fits).
# The estimator is seeded and cv=5 uses consecutive, unshuffled folds for a regressor,
# so the per-fold scores are the same as with separate cross_val_score calls.
cv_scoring = {
    'mae': 'neg_mean_absolute_error',
    'mse': 'neg_mean_squared_error',
    'rmse': 'neg_root_mean_squared_error',
    'r2': 'r2',
}
cv_results = cross_validate(
    DecisionTreeRegressor(max_depth=3, random_state=1),
    X_fold,
    y_fold,
    scoring=cv_scoring,
    cv=5,
)

# Per-fold score arrays, kept under the original names.
mae = cv_results['test_mae']
mse = cv_results['test_mse']
rmse = cv_results['test_rmse']
r2 = cv_results['test_r2']

# The error scorers are negated by scikit-learn ("higher is better"), so flip the sign back.
print("MAE: ", -mae.mean())
print("MSE: ", -mse.mean())
print("RMSE: ", -rmse.mean())
print("R2: ", r2.mean())

MAE:  268.985839660453
MSE:  150827.17492757135
RMSE:  388.36447525472624
R2:  0.13283063591736782
