In [1]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns

# Allows plots to appear directly in the notebook.
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

import pickle

In [2]:
# Read CSV file into Data Frame:
rt = pd.read_csv('v9_route_39A.csv', keep_default_na=True, delimiter=',', skipinitialspace=True)

In [3]:
rt.shape

(3417494, 27)

## Arrival Difference Column

In [4]:
rt['arr_diff'] = rt['act_stp_arr'] - rt['PLANNEDTIME_ARR']

In [5]:
rt.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,VEHICLEID,year,month,day,id,...,stop_actARR_hour,wthr_tr_lt_id,temp,humidity,wind_speed,weather_id,weather_description,dos_trpID_stpID_prgNum,act_stp_arr,arr_diff
0,2018-01-01,5959380,1,767,32400,2172316,2018,1,1,20180101595938001,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-767-1,32323.0,-77.0
1,2018-01-01,5959380,2,768,32503,2172316,2018,1,1,20180101595938002,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-768-2,32461.0,-42.0
2,2018-01-01,5959380,3,769,32527,2172316,2018,1,1,20180101595938003,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-769-3,32481.0,-46.0
3,2018-01-01,5959380,4,770,32565,2172316,2018,1,1,20180101595938004,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-770-4,32510.0,-55.0
4,2018-01-01,5959380,5,771,32581,2172316,2018,1,1,20180101595938005,...,9,2018010109,4.39,86,7.7,802,scattered clouds,2018-01-01-5959380-771-5,32520.0,-61.0


## Shuffling Data

In [6]:
# Row shuffle inspired from Geeks for Geeks: https://www.geeksforgeeks.org/pandas-how-to-shuffle-a-dataframe-rows/
shuf = rt.sample(frac = 1)
shuf.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,VEHICLEID,year,month,day,id,...,stop_actARR_hour,wthr_tr_lt_id,temp,humidity,wind_speed,weather_id,weather_description,dos_trpID_stpID_prgNum,act_stp_arr,arr_diff
2416562,2018-09-17,8086345,8,7021,73490,1000936,2018,9,17,20180917808634508,...,20,2018091720,17.59,77,5.66,803,broken clouds,2018-09-17-8086345-7021-8,73573.0,83.0
3208296,2018-12-04,8162639,55,1874,69752,1000931,2018,12,4,20181204816263955,...,19,2018120419,4.92,84,0.0,500,light rain,2018-12-04-8162639-1874-55,71049.0,1297.0
1234899,2018-05-12,6735667,19,786,69391,2868316,2018,5,12,20180512673566719,...,19,2018051219,10.39,76,3.6,803,broken clouds,2018-05-12-6735667-786-19,69403.0,12.0
2654970,2018-10-15,8024540,17,784,28535,1000612,2018,10,15,20181015802454017,...,8,2018101508,2.92,96,2.1,802,scattered clouds,2018-10-15-8024540-784-17,28642.0,107.0
649303,2018-03-11,6400011,41,1908,50421,2868376,2018,3,11,20180311640001141,...,14,2018031114,9.39,81,6.7,803,broken clouds,2018-03-11-6400011-1908-41,50507.0,86.0


## Splitting Shuffled Data: Train (70%) & Test (30%)

In [7]:
# train_test_split already includes a shuffle method, but no harm to shuffle again
train, test = train_test_split(shuf, test_size=0.3, random_state=42, shuffle=True)

## Multiple Linear Regression

## Training

In [8]:
X = train[['DIRECTION', 'PROGRNUMBER', 'month', 'dayOfWeek', 'rushHour', 'temp', 'wind_speed']]
y = train.arr_diff

In [9]:
linreg = LinearRegression().fit(X, y)

# Weights for each Feature
print("Features: \n", X)
print("Coeficients: \n", linreg.coef_)
print("\nIntercept: \n", linreg.intercept_)

feature_importance = pd.DataFrame({'feature': ['DIRECTION', 'PROGRNUMBER', 'month', 'dayOfWeek', 'rushHour', 'temp', 'wind_speed'], 'importance':linreg.coef_})
feature_importance.sort_values('importance', ascending=False)

Features: 
          DIRECTION  PROGRNUMBER  month  dayOfWeek  rushHour   temp  wind_speed
848551           1            1      4          0         0   7.46        2.60
2898206          2           12      9          3         0   8.46        7.20
189104           2           43      1          4         0   4.39        8.80
1627807          2           55      6          4         0  18.39        3.60
871206           2           17      4          2         0   5.39        9.80
...            ...          ...    ...        ...       ...    ...         ...
1795330          2           33      7          1         0  13.39        2.10
1282504          2           66      5          3         0  13.39        5.70
3061721          1            8     11          0         1   9.49        6.69
2098732          2           16      8          4         0  16.39        5.70
1340653          2           16      5          2         0   6.39        1.50

[2392245 rows x 7 columns]
Coeficients:

Unnamed: 0,feature,importance
4,rushHour,92.457016
3,dayOfWeek,19.194264
2,month,10.496878
0,DIRECTION,7.017979
1,PROGRNUMBER,6.623768
5,temp,1.114019
6,wind_speed,-2.475168


<h2>Prediction & Evaluation on Training Data</h2>

In [10]:
train_linreg_predictions = linreg.predict(X)

train_actual_vs_pred_linreg = pd.concat([y, pd.DataFrame(train_linreg_predictions, columns=['pred_duration_diff'], index=y.index)], axis=1)
train_actual_vs_pred_linreg.head(10)

Unnamed: 0,arr_diff,pred_duration_diff
848551,60.0,-116.832229
2898206,-359.0,62.842624
189104,651.0,194.904341
1627807,175.0,355.341082
871206,22.0,14.427335
2560756,28.0,450.152461
852936,-546.0,222.847157
1565065,923.0,417.812761
2651533,-153.0,320.614757
3255963,20.0,235.422319


In [11]:
# Function to output evaluation metrics
def printMetrics(testActualVal, predictions):
    #classification evaluation measures
    print("MAE: ", metrics.mean_absolute_error(testActualVal, predictions))
    print("MSE: ", metrics.mean_squared_error(testActualVal, predictions))
    print("RMSE: ", metrics.mean_squared_error(testActualVal, predictions)**0.5)
    print("R2: ", metrics.r2_score(testActualVal, predictions))

In [12]:
printMetrics(y, train_linreg_predictions)

MAE:  271.61508780619045
MSE:  151247.4448529077
RMSE:  388.9054446172073
R2:  0.13158417352458085


<h2>Prediction & Evaluation on Testing Data</h2>

In [13]:
X_test = test[['DIRECTION', 'PROGRNUMBER', 'month', 'dayOfWeek', 'rushHour', 'temp', 'wind_speed']]
y_test = test.arr_diff

In [14]:
test_linreg_predictions = linreg.predict(X_test)

test_actual_vs_pred_linreg = pd.concat([y_test, pd.DataFrame(test_linreg_predictions, columns=['pred_duration_diff'], index=y_test.index)], axis=1)
test_actual_vs_pred_linreg.head(10)

Unnamed: 0,arr_diff,pred_duration_diff
146516,-448.0,108.57531
389544,1648.0,341.895642
111550,-246.0,364.844261
1908331,336.0,459.321103
2962643,1157.0,479.766778
612807,64.0,228.440583
3404453,380.0,356.00539
1067967,235.0,154.553742
1042636,-75.0,26.199877
2594838,1516.0,401.58275


In [15]:
printMetrics(y_test, test_linreg_predictions)

MAE:  271.07450186209735
MSE:  150487.78475105236
RMSE:  387.92755090487236
R2:  0.13205320164895284
