In [1]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns

# Allows plots to appear directly in the notebook.
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

In [2]:
# Read CSV file into Data Frame:
trips = pd.read_csv('cleaned_trips.csv', keep_default_na=True, delimiter=',', skipinitialspace=True)

In [3]:
trips.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,...,month,day,dayOfWeek,arrival_diff,departure_diff,planned_duration,actual_duration,duration_diff,weekend,rushHour
0,2018-02-07,6253783,68,68_80,1,87245,84600,87524,84600,28-FEB-18 12:05:11,...,2,7,2,279,0,2645,2924,279,0,0
1,2018-02-07,6262138,25B,25B_271,2,30517,26460,32752,26460,28-FEB-18 12:05:11,...,2,7,2,2235,0,4057,6292,2235,0,0
2,2018-02-07,6254942,45A,45A_70,2,35512,32100,36329,32082,28-FEB-18 12:05:11,...,2,7,2,817,-18,3412,4247,835,0,0
3,2018-02-07,6259460,25A,25A_273,1,57261,54420,58463,54443,28-FEB-18 12:05:11,...,2,7,2,1202,23,2841,4020,1179,0,0
4,2018-02-07,6253175,14,14_15,1,85383,81600,84682,81608,28-FEB-18 12:05:11,...,2,7,2,-701,8,3783,3074,-709,0,0


In [4]:
trips.dtypes

DAYOFSERVICE        object
TRIPID               int64
LINEID              object
ROUTEID             object
DIRECTION            int64
PLANNEDTIME_ARR      int64
PLANNEDTIME_DEP      int64
ACTUALTIME_ARR       int64
ACTUALTIME_DEP       int64
LASTUPDATE          object
NOTE                object
planDep_time         int64
month                int64
day                  int64
dayOfWeek            int64
arrival_diff         int64
departure_diff       int64
planned_duration     int64
actual_duration      int64
duration_diff        int64
weekend              int64
rushHour             int64
dtype: object

## Just 39A Trips

In [5]:
trips_39a = trips [trips['LINEID'] == '39A']

In [6]:
trips_39a.shape

(49506, 22)

In [7]:
trips_39a.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,...,month,day,dayOfWeek,arrival_diff,departure_diff,planned_duration,actual_duration,duration_diff,weekend,rushHour
18,2018-02-07,6246028,39A,39A_40,1,49276,44400,50038,44382,28-FEB-18 12:05:11,...,2,7,2,762,-18,4876,5656,780,0,0
19,2018-02-07,6261203,39A,39A_43,2,39535,34800,40249,34984,28-FEB-18 12:05:11,...,2,7,2,714,184,4735,5265,530,0,0
31,2018-02-07,6251013,39A,39A_40,1,71287,66600,72534,66602,28-FEB-18 12:05:11,...,2,7,2,1247,2,4687,5932,1245,0,0
491,2018-06-26,7104333,39A,39A_43,2,40074,35400,40759,35333,04-JUL-18 12:21:00,...,6,26,1,685,-67,4674,5426,752,0,0
492,2018-06-26,7104336,39A,39A_40,1,58316,53400,58367,53393,04-JUL-18 12:21:00,...,6,26,1,51,-7,4916,4974,58,0,0


## Just 39A Trips going in Direction '1'

In [8]:
trips_39a1 = trips_39a [trips_39a['DIRECTION'] == 1]

In [9]:
trips_39a1.shape

(22612, 22)

## Shuffling 39A Direction 1 Dataset:

In [10]:
# Row shuffle inspired from Geeks for Geeks: https://www.geeksforgeeks.org/pandas-how-to-shuffle-a-dataframe-rows/
shuf_39a1 = trips_39a1.sample(frac = 1)
shuf_39a1.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,...,month,day,dayOfWeek,arrival_diff,departure_diff,planned_duration,actual_duration,duration_diff,weekend,rushHour
1313252,2018-02-22,6263914,39A,39A_40,1,51777,46800,52503,46837,05-MAR-18 22:10:29,...,2,22,3,726,37,4977,5666,689,0,0
984081,2018-09-23,7770885,39A,39A_40,1,41771,37800,42582,37856,22-OCT-18 14:56:01,...,9,23,6,811,56,3971,4726,755,1,0
201879,2018-06-17,7019631,39A,39A_40,1,55025,50400,55488,50369,27-JUN-18 10:37:53,...,6,17,6,463,-31,4625,5119,494,1,0
1594232,2018-06-07,6854065,39A,39A_40,1,46556,42000,48027,42266,18-JUN-18 08:50:15,...,6,7,3,1471,266,4556,5761,1205,0,0
272640,2018-04-18,6635423,39A,39A_40,1,37404,33000,36875,33437,25-JUN-18 08:34:03,...,4,18,2,-529,437,4404,3438,-966,0,0


## Splitting Shuffled Data: Train (70%) & Test (30%)

In [11]:
# train_test_split already includes a shuffle method, but no harm to shuffle again
train, test = train_test_split(shuf_39a1, test_size=0.3, random_state=42, shuffle=True)

In [12]:
train

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,...,month,day,dayOfWeek,arrival_diff,departure_diff,planned_duration,actual_duration,duration_diff,weekend,rushHour
510380,2018-08-10,7316007,39A,39A_40,1,45092,40800,45937,40796,28-AUG-18 09:21:32,...,8,10,4,845,-4,4292,5141,849,0,0
802102,2018-12-15,8166977,39A,39A_40,1,65842,60300,66183,60524,24-DEC-18 09:43:35,...,12,15,5,341,224,5542,5659,117,1,0
292741,2018-08-29,7513554,39A,39A_40,1,49894,45000,50600,45000,07-SEP-18 09:22:43,...,8,29,2,706,0,4894,5600,706,0,0
1107135,2018-09-08,7648778,39A,39A_40,1,66790,62100,68437,62144,18-SEP-18 11:50:01,...,9,8,5,1647,44,4690,6293,1603,1,0
1973895,2018-07-29,7319713,39A,39A_40,1,77008,73200,77750,73206,18-AUG-18 17:20:12,...,7,29,6,742,6,3808,4544,736,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202936,2018-08-27,7500302,39A,39A_40,1,72236,67200,71785,67139,05-SEP-18 11:54:42,...,8,27,0,-451,-61,5036,4646,-390,0,0
1621082,2018-02-20,6263918,39A,39A_40,1,77610,73200,77608,73154,28-FEB-18 14:15:30,...,2,20,1,-2,-46,4410,4454,44,0,0
678814,2018-03-20,6400557,39A,39A_40,1,73966,69600,75259,69605,28-MAR-18 15:03:15,...,3,20,1,1293,5,4366,5654,1288,0,0
336690,2018-09-21,8082618,39A,39A_40,1,70265,65400,72004,65444,30-OCT-18 17:10:36,...,9,21,4,1739,44,4865,6560,1695,0,0


In [13]:
test

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,...,month,day,dayOfWeek,arrival_diff,departure_diff,planned_duration,actual_duration,duration_diff,weekend,rushHour
655089,2018-12-22,8457198,39A,39A_40,1,46096,41400,46623,41457,07-JAN-19 18:10:29,...,12,22,5,527,57,4696,5166,470,1,0
776822,2018-05-29,6837994,39A,39A_40,1,57280,52200,57885,52648,14-JUN-18 13:42:26,...,5,29,1,605,448,5080,5237,157,0,0
185439,2018-03-13,6399470,39A,39A_40,1,30405,25800,30743,25772,21-MAR-18 10:00:25,...,3,13,1,338,-28,4605,4971,366,0,0
172437,2018-06-21,7019572,39A,39A_40,1,40566,36000,41320,36000,29-JUN-18 15:43:00,...,6,21,3,754,0,4566,5320,754,0,0
303444,2018-03-06,6383109,39A,39A_40,1,58307,53400,59973,53291,16-MAR-18 08:32:10,...,3,6,1,1666,-109,4907,6682,1775,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1046379,2018-03-23,6399196,39A,39A_44,1,64666,61200,65078,61203,04-APR-18 09:27:55,...,3,23,4,412,3,3466,3875,409,0,1
564298,2018-03-24,6403309,39A,39A_40,1,42100,37800,43298,38244,10-APR-18 15:08:12,...,3,24,5,1198,444,4300,5054,754,1,0
1821531,2018-11-12,8131876,39A,39A_40,1,72529,67200,72006,67192,21-NOV-18 08:18:17,...,11,12,0,-523,-8,5329,4814,-515,0,0
381947,2018-12-26,8575810,39A,39A_40,1,39680,36000,40353,36000,09-JAN-19 17:30:44,...,12,26,2,673,0,3680,4353,673,0,0


In [14]:
train.dtypes

DAYOFSERVICE        object
TRIPID               int64
LINEID              object
ROUTEID             object
DIRECTION            int64
PLANNEDTIME_ARR      int64
PLANNEDTIME_DEP      int64
ACTUALTIME_ARR       int64
ACTUALTIME_DEP       int64
LASTUPDATE          object
NOTE                object
planDep_time         int64
month                int64
day                  int64
dayOfWeek            int64
arrival_diff         int64
departure_diff       int64
planned_duration     int64
actual_duration      int64
duration_diff        int64
weekend              int64
rushHour             int64
dtype: object

## Multiple Linear Regression

<h3>Training</h3>

In [15]:
X = train[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID']]
y = train.duration_diff

In [16]:
linreg = LinearRegression().fit(X, y)

# Weights for each Feature
print("Features: \n", X)
print("Coeficients: \n", linreg.coef_)
print("\nIntercept: \n", linreg.intercept_)

feature_importance = pd.DataFrame({'feature': ['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID'], 'importance':linreg.coef_})
feature_importance.sort_values('importance', ascending=False)

Features: 
          dayOfWeek  weekend  rushHour  planDep_time  planned_duration   TRIPID
510380           4        0         0            11              4292  7316007
802102           5        1         0            17              5542  8166977
292741           2        0         0            12              4894  7513554
1107135          5        1         0            17              4690  7648778
1973895          6        1         0            20              3808  7319713
...            ...      ...       ...           ...               ...      ...
202936           0        0         0            19              5036  7500302
1621082          1        0         0            20              4410  6263918
678814           1        0         0            19              4366  6400557
336690           4        0         0            18              4865  8082618
2005696          3        0         0             9              4573  6658513

[15828 rows x 6 columns]
Coeficients: 


Unnamed: 0,feature,importance
2,rushHour,326.030757
0,dayOfWeek,74.503276
3,planDep_time,15.375475
5,TRIPID,7.7e-05
4,planned_duration,-0.007355
1,weekend,-282.892641


<h2>Prediction & Evaluation on Training Data</h2>

In [17]:
train_linreg_predictions = linreg.predict(X)

train_actual_vs_pred_linreg = pd.concat([y, pd.DataFrame(train_linreg_predictions, columns=['pred_duration_diff'], index=y.index)], axis=1)
train_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
510380,849,766.325933
802102,117,706.234634
292741,706,643.411927
1107135,1603,672.773765
1973895,736,774.663069
1681134,1803,470.541109
375294,1037,780.155622
254066,1232,682.2705
681084,241,681.361723
453547,1152,1055.846779


In [18]:
# Function to output evaluation metrics
def printMetrics(testActualVal, predictions):
    #classification evaluation measures
    print("MAE: ", metrics.mean_absolute_error(testActualVal, predictions))
    print("MSE: ", metrics.mean_squared_error(testActualVal, predictions))
    print("RMSE: ", metrics.mean_squared_error(testActualVal, predictions)**0.5)
    print("R2: ", metrics.r2_score(testActualVal, predictions))

In [19]:
printMetrics(y, train_linreg_predictions)

MAE:  448.2173279889506
MSE:  326696.92429471895
RMSE:  571.5740759470455
R2:  0.0860112607210487


<h2>Prediction & Evaluation on Testing Data</h2>

In [20]:
X_test = test[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID']]
y_test = test.duration_diff

In [21]:
test_linreg_predictions = linreg.predict(X_test)

test_actual_vs_pred_linreg = pd.concat([y_test, pd.DataFrame(test_linreg_predictions, columns=['pred_duration_diff'], index=y_test.index)], axis=1)
test_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
655089,470,657.829197
776822,157,546.500245
185439,366,408.746434
172437,754,651.705929
303444,1775,528.27471
201791,472,508.755541
1139191,1017,643.933862
845011,1177,967.413189
218713,438,487.789007
235050,1164,710.918281


In [22]:
printMetrics(y_test, test_linreg_predictions)

MAE:  449.6740810637428
MSE:  333082.25920915714
RMSE:  577.1327916599066
R2:  0.08546782425042343


<h2>Prediction & Evaluation on Full Data (5-Fold Cross-Validation):</h2>

<h3>1st Fold</h3>

In [23]:
cv_lin1, cv_lin2, cv_lin3, cv_lin4, cv_lin5 = np.array_split(shuf_39a1, 5)

In [24]:
cvlin = cv_lin2
cvlin = cvlin.append(cv_lin3)
cvlin = cvlin.append(cv_lin4)
cvlin = cvlin.append(cv_lin5)
cvlin.shape

  cvlin = cvlin.append(cv_lin3)
  cvlin = cvlin.append(cv_lin4)
  cvlin = cvlin.append(cv_lin5)


(18089, 22)

In [25]:
X_cvlin = cvlin[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID']]
y_cvlin = cvlin.duration_diff

In [26]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
273679,1544,734.07018
1215288,318,471.333725
1451025,1370,892.131403
1639069,531,620.126482
1605681,1294,864.515958
149178,734,838.369122
698628,722,816.678142
1039746,1225,635.393154
1200842,1095,707.217758
1262410,597,817.437039


In [27]:
X_cvlin = cv_lin1[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID']]
y_cvlin = cv_lin1.duration_diff

In [28]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
1313252,689,636.877463
984081,755,654.298227
201879,494,653.395594
1594232,1205,669.841954
272640,-966,533.568187
202940,-10,463.361432
663847,1135,723.649139
1697943,555,718.646025
852496,827,602.56766
570895,905,855.793282


In [29]:
fold1_MAE = metrics.mean_absolute_error(y_cvlin, full_linreg_predictions)
fold1_MSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)
fold1_RMSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)**0.5
fold1_R2 = metrics.r2_score(y_cvlin, full_linreg_predictions)

print('Fold 1 MAE: ', fold1_MAE)
print('Fold 1 MSE: ', fold1_MSE)
print('Fold 1 RMSE: ', fold1_RMSE)
print('Fold 1 R2: ', fold1_R2)

Fold 1 MAE:  448.2881186946038
Fold 1 MSE:  325814.75550469104
Fold 1 RMSE:  570.8018531020122
Fold 1 R2:  0.09293458732597726


<h3>2nd Fold</h3>

In [30]:
cvlin2 = cv_lin1
cvlin2 = cvlin2.append(cv_lin3)
cvlin2 = cvlin2.append(cv_lin4)
cvlin2 = cvlin2.append(cv_lin5)
cvlin2.shape

  cvlin2 = cvlin2.append(cv_lin3)
  cvlin2 = cvlin2.append(cv_lin4)
  cvlin2 = cvlin2.append(cv_lin5)


(18089, 22)

In [31]:
X_cvlin = cvlin2[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID']]
y_cvlin = cvlin2.duration_diff

In [32]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
1313252,689,636.877463
984081,755,654.298227
201879,494,653.395594
1594232,1205,669.841954
272640,-966,533.568187
202940,-10,463.361432
663847,1135,723.649139
1697943,555,718.646025
852496,827,602.56766
570895,905,855.793282


In [33]:
X_cvlin = cv_lin2[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID']]
y_cvlin = cv_lin2.duration_diff

In [34]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
273679,1544,734.07018
1215288,318,471.333725
1451025,1370,892.131403
1639069,531,620.126482
1605681,1294,864.515958
149178,734,838.369122
698628,722,816.678142
1039746,1225,635.393154
1200842,1095,707.217758
1262410,597,817.437039


In [35]:
fold2_MAE = metrics.mean_absolute_error(y_cvlin, full_linreg_predictions)
fold2_MSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)
fold2_RMSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)**0.5
fold2_R2 = metrics.r2_score(y_cvlin, full_linreg_predictions)

print('Fold 2 MAE: ', fold2_MAE)
print('Fold 2 MSE: ', fold2_MSE)
print('Fold 2 RMSE: ', fold2_RMSE)
print('Fold 2 R2: ', fold2_R2)

Fold 2 MAE:  446.8806078938833
Fold 2 MSE:  328055.08039567317
Fold 2 RMSE:  572.7609277837248
Fold 2 R2:  0.09357233643938312


<h3>3rd Fold</h3>

In [36]:
cvlin3 = cv_lin1
cvlin3 = cvlin3.append(cv_lin2)
cvlin3 = cvlin3.append(cv_lin4)
cvlin3 = cvlin3.append(cv_lin5)
cvlin3.shape

  cvlin3 = cvlin3.append(cv_lin2)
  cvlin3 = cvlin3.append(cv_lin4)
  cvlin3 = cvlin3.append(cv_lin5)


(18090, 22)

In [37]:
X_cvlin = cvlin3[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID']]
y_cvlin = cvlin3.duration_diff

In [38]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
1313252,689,636.877463
984081,755,654.298227
201879,494,653.395594
1594232,1205,669.841954
272640,-966,533.568187
202940,-10,463.361432
663847,1135,723.649139
1697943,555,718.646025
852496,827,602.56766
570895,905,855.793282


In [39]:
X_cvlin = cv_lin3[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID']]
y_cvlin = cv_lin3.duration_diff

In [40]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
129601,71,471.869274
1222175,450,622.907071
1132567,2026,1137.068195
158319,1329,647.934649
1410278,1088,520.384415
60566,821,1018.866835
1270998,405,563.399418
657695,428,653.808144
1148577,153,1149.839024
710873,-226,647.887867


In [41]:
fold3_MAE = metrics.mean_absolute_error(y_cvlin, full_linreg_predictions)
fold3_MSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)
fold3_RMSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)**0.5
fold3_R2 = metrics.r2_score(y_cvlin, full_linreg_predictions)

print('Fold 3 MAE: ', fold3_MAE)
print('Fold 3 MSE: ', fold3_MSE)
print('Fold 3 RMSE: ', fold3_RMSE)
print('Fold 3 R2: ', fold3_R2)

Fold 3 MAE:  457.54476630160474
Fold 3 MSE:  341862.85540230334
Fold 3 RMSE:  584.690392774076
Fold 3 R2:  0.07353666925126101


<h3>4th Fold</h3>

In [42]:
cvlin4 = cv_lin1
cvlin4 = cvlin4.append(cv_lin2)
cvlin4 = cvlin4.append(cv_lin3)
cvlin4 = cvlin4.append(cv_lin5)
cvlin4.shape

  cvlin4 = cvlin4.append(cv_lin2)
  cvlin4 = cvlin4.append(cv_lin3)
  cvlin4 = cvlin4.append(cv_lin5)


(18090, 22)

In [43]:
X_cvlin = cvlin4[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID']]
y_cvlin = cvlin4.duration_diff

In [44]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
1313252,689,636.877463
984081,755,654.298227
201879,494,653.395594
1594232,1205,669.841954
272640,-966,533.568187
202940,-10,463.361432
663847,1135,723.649139
1697943,555,718.646025
852496,827,602.56766
570895,905,855.793282


In [45]:
X_cvlin = cv_lin4[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID']]
y_cvlin = cv_lin4.duration_diff

In [46]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
856319,52,630.361915
80612,1334,1063.215894
481152,447,700.6659
1373148,-279,626.831819
1381280,1136,478.491414
1593505,968,749.133706
1635744,624,969.470527
387153,1346,725.284514
248001,243,454.245001
978885,1069,814.002952


In [47]:
fold4_MAE = metrics.mean_absolute_error(y_cvlin, full_linreg_predictions)
fold4_MSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)
fold4_RMSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)**0.5
fold4_R2 = metrics.r2_score(y_cvlin, full_linreg_predictions)

print('Fold 4 MAE: ', fold4_MAE)
print('Fold 4 MSE: ', fold4_MSE)
print('Fold 4 RMSE: ', fold4_RMSE)
print('Fold 4 R2: ', fold4_R2)

Fold 4 MAE:  443.3458212924464
Fold 4 MSE:  318253.69176909816
Fold 4 RMSE:  564.1397803462349
Fold 4 R2:  0.08957464457952036


<h3>5th Fold</h3>

In [48]:
cvlin5 = cv_lin1
cvlin5 = cvlin5.append(cv_lin2)
cvlin5 = cvlin5.append(cv_lin3)
cvlin5 = cvlin5.append(cv_lin4)
cvlin5.shape

  cvlin5 = cvlin5.append(cv_lin2)
  cvlin5 = cvlin5.append(cv_lin3)
  cvlin5 = cvlin5.append(cv_lin4)


(18090, 22)

In [49]:
X_cvlin = cvlin5[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID']]
y_cvlin = cvlin5.duration_diff

In [50]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
1313252,689,636.877463
984081,755,654.298227
201879,494,653.395594
1594232,1205,669.841954
272640,-966,533.568187
202940,-10,463.361432
663847,1135,723.649139
1697943,555,718.646025
852496,827,602.56766
570895,905,855.793282


In [51]:
X_cvlin = cv_lin5[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'TRIPID']]
y_cvlin = cv_lin5.duration_diff

In [52]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
852939,209,512.100696
695794,658,603.472956
1452531,112,557.752987
1928049,-389,603.526164
1301242,688,711.476697
471648,1295,876.821462
699822,1528,956.468555
1295666,-219,777.292475
1980924,681,652.371156
226806,99,598.551239


In [53]:
fold5_MAE = metrics.mean_absolute_error(y_cvlin, full_linreg_predictions)
fold5_MSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)
fold5_RMSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)**0.5
fold5_R2 = metrics.r2_score(y_cvlin, full_linreg_predictions)

print('Fold 5 MAE: ', fold5_MAE)
print('Fold 5 MSE: ', fold5_MSE)
print('Fold 5 RMSE: ', fold5_RMSE)
print('Fold 5 R2: ', fold5_R2)

Fold 5 MAE:  447.2130574713655
Fold 5 MSE:  329077.5475730448
Fold 5 RMSE:  573.6528110042212
Fold 5 R2:  0.07903614616052446


<h3>5-Fold Cross-Validation Metrics:</h3>

In [54]:
cv_MAE = (fold1_MAE + fold2_MAE + fold3_MAE + fold4_MAE + fold5_MAE) / 5
cv_MSE = (fold1_MSE + fold2_MSE + fold3_MSE + fold4_MSE + fold5_MSE) / 5
cv_RMSE = (fold1_RMSE + fold2_RMSE + fold3_RMSE + fold4_RMSE + fold5_RMSE) / 5
cv_R2 = (fold1_R2 + fold2_R2 + fold3_R2 + fold4_R2 + fold5_R2) / 5

print("MAE: ", cv_MAE)
print("MSE: ", cv_MSE)
print("RMSE: ", cv_RMSE)
print("R2: ", cv_R2)

MAE:  448.6544743307807
MSE:  328612.7861289621
RMSE:  573.2091530020539
R2:  0.08573087675133324
