In [1]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns

# Allows plots to appear directly in the notebook.
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

In [2]:
# Read CSV file into Data Frame:
trips = pd.read_csv('cleaned_trips.csv', keep_default_na=True, delimiter=',', skipinitialspace=True)

In [3]:
trips.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,...,month,day,dayOfWeek,arrival_diff,departure_diff,planned_duration,actual_duration,duration_diff,weekend,rushHour
0,2018-02-07,6253783,68,68_80,1,87245,84600,87524,84600,28-FEB-18 12:05:11,...,2,7,2,279,0,2645,2924,279,0,0
1,2018-02-07,6262138,25B,25B_271,2,30517,26460,32752,26460,28-FEB-18 12:05:11,...,2,7,2,2235,0,4057,6292,2235,0,0
2,2018-02-07,6254942,45A,45A_70,2,35512,32100,36329,32082,28-FEB-18 12:05:11,...,2,7,2,817,-18,3412,4247,835,0,0
3,2018-02-07,6259460,25A,25A_273,1,57261,54420,58463,54443,28-FEB-18 12:05:11,...,2,7,2,1202,23,2841,4020,1179,0,0
4,2018-02-07,6253175,14,14_15,1,85383,81600,84682,81608,28-FEB-18 12:05:11,...,2,7,2,-701,8,3783,3074,-709,0,0


In [4]:
trips.dtypes

DAYOFSERVICE        object
TRIPID               int64
LINEID              object
ROUTEID             object
DIRECTION            int64
PLANNEDTIME_ARR      int64
PLANNEDTIME_DEP      int64
ACTUALTIME_ARR       int64
ACTUALTIME_DEP       int64
LASTUPDATE          object
NOTE                object
planDep_time         int64
month                int64
day                  int64
dayOfWeek            int64
arrival_diff         int64
departure_diff       int64
planned_duration     int64
actual_duration      int64
duration_diff        int64
weekend              int64
rushHour             int64
dtype: object

## Just 39A Trips

In [5]:
trips_39a = trips [trips['LINEID'] == '39A']

In [6]:
trips_39a.shape

(49506, 22)

In [7]:
trips_39a.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,...,month,day,dayOfWeek,arrival_diff,departure_diff,planned_duration,actual_duration,duration_diff,weekend,rushHour
18,2018-02-07,6246028,39A,39A_40,1,49276,44400,50038,44382,28-FEB-18 12:05:11,...,2,7,2,762,-18,4876,5656,780,0,0
19,2018-02-07,6261203,39A,39A_43,2,39535,34800,40249,34984,28-FEB-18 12:05:11,...,2,7,2,714,184,4735,5265,530,0,0
31,2018-02-07,6251013,39A,39A_40,1,71287,66600,72534,66602,28-FEB-18 12:05:11,...,2,7,2,1247,2,4687,5932,1245,0,0
491,2018-06-26,7104333,39A,39A_43,2,40074,35400,40759,35333,04-JUL-18 12:21:00,...,6,26,1,685,-67,4674,5426,752,0,0
492,2018-06-26,7104336,39A,39A_40,1,58316,53400,58367,53393,04-JUL-18 12:21:00,...,6,26,1,51,-7,4916,4974,58,0,0


## Just 39A Trips going in Direction '2'

In [8]:
trips_39a2 = trips_39a [trips_39a['DIRECTION'] == 2]

In [9]:
trips_39a2.shape

(26894, 22)

## Shuffling 39A Direction 2 Dataset:

In [10]:
# Row shuffle inspired from Geeks for Geeks: https://www.geeksforgeeks.org/pandas-how-to-shuffle-a-dataframe-rows/
shuf_39a2 = trips_39a2.sample(frac = 1)
shuf_39a2.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,...,month,day,dayOfWeek,arrival_diff,departure_diff,planned_duration,actual_duration,duration_diff,weekend,rushHour
784822,2018-02-03,6232754,39A,39A_43,2,77221,73200,77558,73106,12-FEB-18 08:21:25,...,2,3,5,337,-94,4021,4452,431,1,0
88164,2018-03-23,6400155,39A,39A_43,2,85112,81600,85987,81911,04-APR-18 09:27:55,...,3,23,4,875,311,3512,4076,564,0,0
1048603,2018-09-04,7651061,39A,39A_43,2,28278,24000,29059,24018,12-SEP-18 16:17:32,...,9,4,1,781,18,4278,5041,763,0,0
661116,2018-09-23,7759552,39A,39A_43,2,61923,57600,62129,57506,22-OCT-18 14:56:01,...,9,23,6,206,-94,4323,4623,300,1,0
1188626,2018-10-01,7785178,39A,39A_43,2,59542,54600,59454,54547,22-OCT-18 21:50:42,...,10,1,0,-88,-53,4942,4907,-35,0,0


## Splitting Shuffled Data: Train (70%) & Test (30%)

In [11]:
# train_test_split already includes a shuffle method, but no harm to shuffle again
train, test = train_test_split(shuf_39a2, test_size=0.3, random_state=42, shuffle=True)

In [12]:
train

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,...,month,day,dayOfWeek,arrival_diff,departure_diff,planned_duration,actual_duration,duration_diff,weekend,rushHour
111137,2018-09-19,8089756,39A,39A_43,2,44889,40200,45344,40187,30-OCT-18 16:37:58,...,9,19,2,455,-13,4689,5157,468,0,0
1452736,2018-09-18,8086339,39A,39A_43,2,39296,34200,39557,34247,02-NOV-18 12:08:16,...,9,18,1,261,47,5096,5310,214,0,0
1217054,2018-11-24,8116131,39A,39A_43,2,42012,37200,42491,37217,04-DEC-18 08:21:59,...,11,24,5,479,17,4812,5274,462,1,0
575724,2018-10-15,8030084,39A,39A_43,2,34265,28800,34385,28805,09-NOV-18 19:04:42,...,10,15,0,120,5,5465,5580,115,0,1
662513,2018-08-01,7318704,39A,39A_43,2,40540,36000,42019,35969,18-AUG-18 17:59:42,...,8,1,2,1479,-31,4540,6050,1510,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1984443,2018-11-05,8110014,39A,39A_43,2,32666,27000,33292,26994,13-NOV-18 16:52:57,...,11,5,0,626,-6,5666,6298,632,0,1
645585,2018-09-07,7648305,39A,39A_43,2,42661,37800,43011,37795,09-OCT-18 13:19:04,...,9,7,4,350,-5,4861,5216,355,0,0
89112,2018-02-06,6261201,39A,39A_43,2,27438,22800,27894,22780,28-FEB-18 11:48:21,...,2,6,1,456,-20,4638,5114,476,0,0
1935638,2018-05-09,6742964,39A,39A_43,2,42079,37200,42193,37235,26-JUN-18 08:08:03,...,5,9,2,114,35,4879,4958,79,0,0


In [13]:
test

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,...,month,day,dayOfWeek,arrival_diff,departure_diff,planned_duration,actual_duration,duration_diff,weekend,rushHour
2014550,2018-05-28,6842751,39A,39A_43,2,37459,32400,37714,32418,14-JUN-18 13:31:37,...,5,28,0,255,18,5059,5296,237,0,0
1845300,2018-08-21,7323467,39A,39A_43,2,84971,81600,85561,81725,30-AUG-18 09:35:43,...,8,21,1,590,125,3371,3836,465,0,0
1755187,2018-10-30,8072940,39A,39A_43,2,66688,61800,66861,61767,09-NOV-18 23:36:22,...,10,30,1,173,-33,4888,5094,206,0,1
339143,2018-04-30,6652866,39A,39A_42,2,86581,84600,86712,84586,25-JUN-18 13:48:12,...,4,30,0,131,-14,1981,2126,145,0,0
793587,2018-07-19,7183111,39A,39A_43,2,34181,29400,34889,29388,28-JUL-18 10:09:47,...,7,19,3,708,-12,4781,5501,720,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1407607,2018-01-19,6113448,39A,39A_43,2,68738,63600,68606,63562,26-JAN-18 21:36:42,...,1,19,4,-132,-38,5138,5044,-94,0,0
1488769,2018-12-11,8176796,39A,39A_43,2,24897,20400,24467,20390,19-DEC-18 09:35:43,...,12,11,1,-430,-10,4497,4077,-420,0,0
1696280,2018-06-03,6848592,39A,39A_43,2,76788,72900,76719,72914,14-JUN-18 14:37:35,...,6,3,6,-69,14,3888,3805,-83,1,0
1486172,2018-07-13,7153250,39A,39A_43,2,26468,22200,26803,22215,24-JUL-18 14:23:17,...,7,13,4,335,15,4268,4588,320,0,0


In [14]:
train.dtypes

DAYOFSERVICE        object
TRIPID               int64
LINEID              object
ROUTEID             object
DIRECTION            int64
PLANNEDTIME_ARR      int64
PLANNEDTIME_DEP      int64
ACTUALTIME_ARR       int64
ACTUALTIME_DEP       int64
LASTUPDATE          object
NOTE                object
planDep_time         int64
month                int64
day                  int64
dayOfWeek            int64
arrival_diff         int64
departure_diff       int64
planned_duration     int64
actual_duration      int64
duration_diff        int64
weekend              int64
rushHour             int64
dtype: object

## Multiple Linear Regression

<h3>Training</h3>

In [15]:
X = train[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID']]
y = train.duration_diff

In [16]:
linreg = LinearRegression().fit(X, y)

# Weights for each Feature
print("Features: \n", X)
print("Coeficients: \n", linreg.coef_)
print("\nIntercept: \n", linreg.intercept_)

feature_importance = pd.DataFrame({'feature': ['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID'], 'importance':linreg.coef_})
feature_importance.sort_values('importance', ascending=False)

Features: 
          dayOfWeek  weekend  rushHour  planDep_time  planned_duration   TRIPID
111137           2        0         0            11              4689  8089756
1452736          1        0         0            10              5096  8086339
1217054          5        1         0            10              4812  8116131
575724           0        0         1             8              5465  8030084
662513           2        0         0            10              4540  7318704
...            ...      ...       ...           ...               ...      ...
1984443          0        0         1             8              5666  8110014
645585           4        0         0            10              4861  7648305
89112            1        0         0             6              4638  6261201
1935638          2        0         0            10              4879  6742964
332850           2        0         1             8              5293  7785555

[18825 rows x 6 columns]
Coeficients: 


Unnamed: 0,feature,importance
2,rushHour,220.404785
0,dayOfWeek,53.177244
3,planDep_time,0.250141
5,TRIPID,8.6e-05
4,planned_duration,-0.108954
1,weekend,-208.897336


<h2>Prediction & Evaluation on Training Data</h2>

In [17]:
train_linreg_predictions = linreg.predict(X)

train_actual_vs_pred_linreg = pd.concat([y, pd.DataFrame(train_linreg_predictions, columns=['pred_duration_diff'], index=y.index)], axis=1)
train_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
111137,468,427.99018
1452736,214,329.924107
1217054,462,367.245588
575724,115,451.600356
662513,1510,377.539603
1043401,525,361.432569
1509354,22,347.531233
1154592,570,474.897937
956168,148,444.251127
700836,435,421.764222


In [18]:
# Function to output evaluation metrics
def printMetrics(testActualVal, predictions):
    #classification evaluation measures
    print("MAE: ", metrics.mean_absolute_error(testActualVal, predictions))
    print("MSE: ", metrics.mean_squared_error(testActualVal, predictions))
    print("RMSE: ", metrics.mean_squared_error(testActualVal, predictions)**0.5)
    print("R2: ", metrics.r2_score(testActualVal, predictions))

In [19]:
printMetrics(y, train_linreg_predictions)

MAE:  416.8483021170839
MSE:  286420.8608647116
RMSE:  535.1830162334298
R2:  0.05368637275100274


<h2>Prediction & Evaluation on Testing Data</h2>

In [20]:
X_test = test[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID']]
y_test = test.duration_diff

In [21]:
test_linreg_predictions = linreg.predict(X_test)

test_actual_vs_pred_linreg = pd.concat([y_test, pd.DataFrame(test_linreg_predictions, columns=['pred_duration_diff'], index=y_test.index)], axis=1)
test_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
2014550,237,173.379281
1845300,465,455.391798
1755187,206,573.587839
339143,145,490.12774
793587,720,612.680613
979340,569,583.207893
111457,736,646.441761
221457,999,535.87242
1836528,733,477.209179
875286,639,328.577632


In [22]:
printMetrics(y_test, test_linreg_predictions)

MAE:  411.12485536175075
MSE:  276161.9670154366
RMSE:  525.5111483264999
R2:  0.05575636128750927


<h2>Prediction & Evaluation on Full Data (5-Fold Cross-Validation):</h2>

<h3>1st Fold</h3>

In [23]:
cv_lin1, cv_lin2, cv_lin3, cv_lin4, cv_lin5 = np.array_split(shuf_39a2, 5)

In [24]:
cvlin = cv_lin2
cvlin = cvlin.append(cv_lin3)
cvlin = cvlin.append(cv_lin4)
cvlin = cvlin.append(cv_lin5)
cvlin.shape

  cvlin = cvlin.append(cv_lin3)
  cvlin = cvlin.append(cv_lin4)
  cvlin = cvlin.append(cv_lin5)


(21515, 22)

In [25]:
X_cvlin = cvlin[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID']]
y_cvlin = cvlin.duration_diff

In [26]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
74388,623,369.130825
528545,237,319.275271
1309892,448,516.338088
567606,841,280.198436
935358,530,385.592072
311511,693,404.752314
979143,709,335.80895
1225768,220,650.4095
870332,281,608.82334
1552991,1149,519.314416


In [27]:
X_cvlin = cv_lin1[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID']]
y_cvlin = cv_lin1.duration_diff

In [28]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
784822,431,293.656034
88164,564,520.007565
1048603,763,380.794082
661116,300,444.478992
1188626,-35,268.828162
593788,742,583.448595
1540532,877,196.912943
879306,706,465.498817
1204158,243,812.819712
556919,-583,424.537536


In [29]:
fold1_MAE = metrics.mean_absolute_error(y_cvlin, full_linreg_predictions)
fold1_MSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)
fold1_RMSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)**0.5
fold1_R2 = metrics.r2_score(y_cvlin, full_linreg_predictions)

print('Fold 1 MAE: ', fold1_MAE)
print('Fold 1 MSE: ', fold1_MSE)
print('Fold 1 RMSE: ', fold1_RMSE)
print('Fold 1 R2: ', fold1_R2)

Fold 1 MAE:  414.1371666673027
Fold 1 MSE:  280345.30589582503
Fold 1 RMSE:  529.4764450812
Fold 1 R2:  0.05294994387542307


<h3>2nd Fold</h3>

In [30]:
cvlin2 = cv_lin1
cvlin2 = cvlin2.append(cv_lin3)
cvlin2 = cvlin2.append(cv_lin4)
cvlin2 = cvlin2.append(cv_lin5)
cvlin2.shape

  cvlin2 = cvlin2.append(cv_lin3)
  cvlin2 = cvlin2.append(cv_lin4)
  cvlin2 = cvlin2.append(cv_lin5)


(21515, 22)

In [31]:
X_cvlin = cvlin2[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID']]
y_cvlin = cvlin2.duration_diff

In [32]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
784822,431,293.656034
88164,564,520.007565
1048603,763,380.794082
661116,300,444.478992
1188626,-35,268.828162
593788,742,583.448595
1540532,877,196.912943
879306,706,465.498817
1204158,243,812.819712
556919,-583,424.537536


In [33]:
X_cvlin = cv_lin2[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID']]
y_cvlin = cv_lin2.duration_diff

In [34]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
74388,623,369.130825
528545,237,319.275271
1309892,448,516.338088
567606,841,280.198436
935358,530,385.592072
311511,693,404.752314
979143,709,335.80895
1225768,220,650.4095
870332,281,608.82334
1552991,1149,519.314416


In [35]:
fold2_MAE = metrics.mean_absolute_error(y_cvlin, full_linreg_predictions)
fold2_MSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)
fold2_RMSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)**0.5
fold2_R2 = metrics.r2_score(y_cvlin, full_linreg_predictions)

print('Fold 2 MAE: ', fold2_MAE)
print('Fold 2 MSE: ', fold2_MSE)
print('Fold 2 RMSE: ', fold2_RMSE)
print('Fold 2 R2: ', fold2_R2)

Fold 2 MAE:  415.89895088677076
Fold 2 MSE:  286933.3652146926
Fold 2 RMSE:  535.6616144682131
Fold 2 R2:  0.042208158992544154


<h3>3rd Fold</h3>

In [36]:
cvlin3 = cv_lin1
cvlin3 = cvlin3.append(cv_lin2)
cvlin3 = cvlin3.append(cv_lin4)
cvlin3 = cvlin3.append(cv_lin5)
cvlin3.shape

  cvlin3 = cvlin3.append(cv_lin2)
  cvlin3 = cvlin3.append(cv_lin4)
  cvlin3 = cvlin3.append(cv_lin5)


(21515, 22)

In [37]:
X_cvlin = cvlin3[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID']]
y_cvlin = cvlin3.duration_diff

In [38]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
784822,431,293.656034
88164,564,520.007565
1048603,763,380.794082
661116,300,444.478992
1188626,-35,268.828162
593788,742,583.448595
1540532,877,196.912943
879306,706,465.498817
1204158,243,812.819712
556919,-583,424.537536


In [39]:
X_cvlin = cv_lin3[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID']]
y_cvlin = cv_lin3.duration_diff

In [40]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
1888465,546,296.020058
4902,84,446.579549
1925776,-262,215.994152
856987,812,334.825777
952933,949,368.688749
263998,45,431.444664
444829,63,316.433755
1079772,304,578.20053
1676261,582,387.874494
1286251,99,133.709975


In [41]:
fold3_MAE = metrics.mean_absolute_error(y_cvlin, full_linreg_predictions)
fold3_MSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)
fold3_RMSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)**0.5
fold3_R2 = metrics.r2_score(y_cvlin, full_linreg_predictions)

print('Fold 3 MAE: ', fold3_MAE)
print('Fold 3 MSE: ', fold3_MSE)
print('Fold 3 RMSE: ', fold3_RMSE)
print('Fold 3 R2: ', fold3_R2)

Fold 3 MAE:  415.6040524722808
Fold 3 MSE:  283141.6235559375
Fold 3 RMSE:  532.1105369713491
Fold 3 R2:  0.06008540804100693


<h3>4th Fold</h3>

In [42]:
cvlin4 = cv_lin1
cvlin4 = cvlin4.append(cv_lin2)
cvlin4 = cvlin4.append(cv_lin3)
cvlin4 = cvlin4.append(cv_lin5)
cvlin4.shape

  cvlin4 = cvlin4.append(cv_lin2)
  cvlin4 = cvlin4.append(cv_lin3)
  cvlin4 = cvlin4.append(cv_lin5)


(21515, 22)

In [43]:
X_cvlin = cvlin4[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID']]
y_cvlin = cvlin4.duration_diff

In [44]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
784822,431,293.656034
88164,564,520.007565
1048603,763,380.794082
661116,300,444.478992
1188626,-35,268.828162
593788,742,583.448595
1540532,877,196.912943
879306,706,465.498817
1204158,243,812.819712
556919,-583,424.537536


In [45]:
X_cvlin = cv_lin4[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID']]
y_cvlin = cv_lin4.duration_diff

In [46]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
352357,750,427.244269
1301426,698,251.694445
456640,-158,403.122406
1025036,468,434.664441
780488,691,373.089899
696532,-759,239.70193
897792,688,516.515363
1534048,573,282.195112
103513,-503,450.290967
644416,70,684.006517


In [47]:
fold4_MAE = metrics.mean_absolute_error(y_cvlin, full_linreg_predictions)
fold4_MSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)
fold4_RMSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)**0.5
fold4_R2 = metrics.r2_score(y_cvlin, full_linreg_predictions)

print('Fold 4 MAE: ', fold4_MAE)
print('Fold 4 MSE: ', fold4_MSE)
print('Fold 4 RMSE: ', fold4_RMSE)
print('Fold 4 R2: ', fold4_R2)

Fold 4 MAE:  416.70397456887787
Fold 4 MSE:  285814.05558031844
Fold 4 RMSE:  534.6158018430791
Fold 4 R2:  0.058421427102713586


<h3>5th Fold</h3>

In [48]:
cvlin5 = cv_lin1
cvlin5 = cvlin5.append(cv_lin2)
cvlin5 = cvlin5.append(cv_lin3)
cvlin5 = cvlin5.append(cv_lin4)
cvlin5.shape

  cvlin5 = cvlin5.append(cv_lin2)
  cvlin5 = cvlin5.append(cv_lin3)
  cvlin5 = cvlin5.append(cv_lin4)


(21516, 22)

In [49]:
X_cvlin = cvlin5[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID']]
y_cvlin = cvlin5.duration_diff

In [50]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
784822,431,293.656034
88164,564,520.007565
1048603,763,380.794082
661116,300,444.478992
1188626,-35,268.828162
593788,742,583.448595
1540532,877,196.912943
879306,706,465.498817
1204158,243,812.819712
556919,-583,424.537536


In [51]:
X_cvlin = cv_lin5[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID']]
y_cvlin = cv_lin5.duration_diff

In [52]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
1393513,1491,334.782749
1978089,337,204.13788
894150,624,418.084355
864154,285,438.272956
1546798,605,180.701343
1062215,-280,254.865433
1087371,471,528.045865
1284807,137,325.656467
447402,750,441.986487
1254609,522,503.438689


In [53]:
fold5_MAE = metrics.mean_absolute_error(y_cvlin, full_linreg_predictions)
fold5_MSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)
fold5_RMSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)**0.5
fold5_R2 = metrics.r2_score(y_cvlin, full_linreg_predictions)

print('Fold 5 MAE: ', fold5_MAE)
print('Fold 5 MSE: ', fold5_MSE)
print('Fold 5 RMSE: ', fold5_RMSE)
print('Fold 5 R2: ', fold5_R2)

Fold 5 MAE:  413.31100622728115
Fold 5 MSE:  280479.5551596055
Fold 5 RMSE:  529.6032053902294
Fold 5 R2:  0.05734510150475358


<h3>5-Fold Cross-Validation Metrics:</h3>

In [54]:
cv_MAE = (fold1_MAE + fold2_MAE + fold3_MAE + fold4_MAE + fold5_MAE) / 5
cv_MSE = (fold1_MSE + fold2_MSE + fold3_MSE + fold4_MSE + fold5_MSE) / 5
cv_RMSE = (fold1_RMSE + fold2_RMSE + fold3_RMSE + fold4_RMSE + fold5_RMSE) / 5
cv_R2 = (fold1_R2 + fold2_R2 + fold3_R2 + fold4_R2 + fold5_R2) / 5

print("MAE: ", cv_MAE)
print("MSE: ", cv_MSE)
print("RMSE: ", cv_RMSE)
print("R2: ", cv_R2)

MAE:  415.1310301645027
MSE:  283342.78108127584
RMSE:  532.2935207508142
R2:  0.05420200790328826
