In [1]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns

# Allows plots to appear directly in the notebook.
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

In [2]:
# Read CSV file into Data Frame:
trips = pd.read_csv('cleaned_trips.csv', keep_default_na=True, delimiter=',', skipinitialspace=True)

In [3]:
trips.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,...,month,day,dayOfWeek,arrival_diff,departure_diff,planned_duration,actual_duration,duration_diff,weekend,rushHour
0,2018-02-07,6253783,68,68_80,1,87245,84600,87524,84600,28-FEB-18 12:05:11,...,2,7,2,279,0,2645,2924,279,0,0
1,2018-02-07,6262138,25B,25B_271,2,30517,26460,32752,26460,28-FEB-18 12:05:11,...,2,7,2,2235,0,4057,6292,2235,0,0
2,2018-02-07,6254942,45A,45A_70,2,35512,32100,36329,32082,28-FEB-18 12:05:11,...,2,7,2,817,-18,3412,4247,835,0,0
3,2018-02-07,6259460,25A,25A_273,1,57261,54420,58463,54443,28-FEB-18 12:05:11,...,2,7,2,1202,23,2841,4020,1179,0,0
4,2018-02-07,6253175,14,14_15,1,85383,81600,84682,81608,28-FEB-18 12:05:11,...,2,7,2,-701,8,3783,3074,-709,0,0


In [4]:
trips.dtypes

DAYOFSERVICE        object
TRIPID               int64
LINEID              object
ROUTEID             object
DIRECTION            int64
PLANNEDTIME_ARR      int64
PLANNEDTIME_DEP      int64
ACTUALTIME_ARR       int64
ACTUALTIME_DEP       int64
LASTUPDATE          object
NOTE                object
planDep_time         int64
month                int64
day                  int64
dayOfWeek            int64
arrival_diff         int64
departure_diff       int64
planned_duration     int64
actual_duration      int64
duration_diff        int64
weekend              int64
rushHour             int64
dtype: object

## Just 39A Trips

In [5]:
trips_39a = trips [trips['LINEID'] == '39A']

In [8]:
trips_39a.shape

(49506, 22)

In [9]:
trips_39a.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,...,month,day,dayOfWeek,arrival_diff,departure_diff,planned_duration,actual_duration,duration_diff,weekend,rushHour
18,2018-02-07,6246028,39A,39A_40,1,49276,44400,50038,44382,28-FEB-18 12:05:11,...,2,7,2,762,-18,4876,5656,780,0,0
19,2018-02-07,6261203,39A,39A_43,2,39535,34800,40249,34984,28-FEB-18 12:05:11,...,2,7,2,714,184,4735,5265,530,0,0
31,2018-02-07,6251013,39A,39A_40,1,71287,66600,72534,66602,28-FEB-18 12:05:11,...,2,7,2,1247,2,4687,5932,1245,0,0
491,2018-06-26,7104333,39A,39A_43,2,40074,35400,40759,35333,04-JUL-18 12:21:00,...,6,26,1,685,-67,4674,5426,752,0,0
492,2018-06-26,7104336,39A,39A_40,1,58316,53400,58367,53393,04-JUL-18 12:21:00,...,6,26,1,51,-7,4916,4974,58,0,0


## Shuffling 39A Dataset:

In [10]:
# Row shuffle inspired from Geeks for Geeks: https://www.geeksforgeeks.org/pandas-how-to-shuffle-a-dataframe-rows/
shuf_39a = trips_39a.sample(frac = 1)
shuf_39a.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,...,month,day,dayOfWeek,arrival_diff,departure_diff,planned_duration,actual_duration,duration_diff,weekend,rushHour
1496429,2018-11-02,8078892,39A,39A_40,1,45925,41400,47992,41453,13-NOV-18 16:08:27,...,11,2,4,2067,53,4525,6539,2014,0,0
1097882,2018-05-11,6734975,39A,39A_40,1,72291,67200,72798,67208,26-JUN-18 08:40:43,...,5,11,4,507,8,5091,5590,499,0,0
1219060,2018-02-09,6258365,39A,39A_40,1,47093,42600,48665,42546,28-FEB-18 12:32:51,...,2,9,4,1572,-54,4493,6119,1626,0,0
1773838,2018-10-30,8068376,39A,39A_43,2,62832,57600,64130,57583,09-NOV-18 23:36:22,...,10,30,1,1298,-17,5232,6547,1315,0,1
1207309,2018-08-02,7329713,39A,39A_43,2,32609,28200,33204,28208,18-AUG-18 18:10:33,...,8,2,3,595,8,4409,4996,587,0,1


## Splitting Shuffled Data: Train (70%) & Test (30%)

In [11]:
# train_test_split already includes a shuffle method, but no harm to shuffle again
train, test = train_test_split(shuf_39a, test_size=0.3, random_state=42, shuffle=True)

In [12]:
train

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,...,month,day,dayOfWeek,arrival_diff,departure_diff,planned_duration,actual_duration,duration_diff,weekend,rushHour
387676,2018-08-04,7324278,39A,39A_43,2,39336,35400,40143,35400,18-AUG-18 18:37:12,...,8,4,5,807,0,3936,4743,807,1,0
1440361,2018-09-18,8089756,39A,39A_43,2,44889,40200,45280,40172,02-NOV-18 12:08:16,...,9,18,1,391,-28,4689,5108,419,0,0
1509720,2018-03-07,6398126,39A,39A_43,2,32747,27000,33201,27001,15-MAR-18 13:13:59,...,3,7,2,454,1,5747,6200,453,0,1
917798,2018-05-01,6649480,39A,39A_40,1,35616,30600,35455,30587,25-JUN-18 18:30:03,...,5,1,1,-161,-13,5016,4868,-148,0,1
1192829,2018-11-10,8113230,39A,39A_40,1,62584,56700,63190,57577,13-DEC-18 15:41:02,...,11,10,5,606,877,5884,5613,-271,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1655584,2018-07-16,7168401,39A,39A_40,1,56240,51600,57079,51600,24-JUL-18 14:51:47,...,7,16,0,839,0,4640,5479,839,0,0
176152,2018-09-18,8086347,39A,39A_42,2,84831,82800,85096,82942,02-NOV-18 12:08:16,...,9,18,1,265,142,2031,2154,123,0,0
1364890,2018-06-11,6877822,39A,39A_40,1,53679,48600,54574,48647,21-JUN-18 08:14:53,...,6,11,0,895,47,5079,5927,848,0,0
162049,2018-03-08,6402893,39A,39A_43,2,28460,24000,29390,24008,16-MAR-18 08:41:59,...,3,8,3,930,8,4460,5382,922,0,0


In [13]:
test

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,...,month,day,dayOfWeek,arrival_diff,departure_diff,planned_duration,actual_duration,duration_diff,weekend,rushHour
1301370,2018-02-19,6279974,39A,39A_43,2,23750,19200,23610,19206,28-FEB-18 14:04:41,...,2,19,0,-140,6,4550,4404,-146,0,0
973689,2018-10-05,7780728,39A,39A_40,1,38488,34200,39352,34111,22-OCT-18 22:38:11,...,10,5,4,864,-89,4288,5241,953,0,0
183388,2018-09-18,8089253,39A,39A_40,1,75230,70800,76410,70772,02-NOV-18 12:08:16,...,9,18,1,1180,-28,4430,5638,1208,0,0
849883,2018-06-29,7101558,39A,39A_43,2,47233,42600,48372,42629,13-JUL-18 21:53:59,...,6,29,4,1139,29,4633,5743,1110,0,0
1778443,2018-10-23,8064015,39A,39A_40,1,34573,29400,35075,29404,02-NOV-18 12:17:06,...,10,23,1,502,4,5173,5671,498,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1527438,2018-08-22,7316266,39A,39A_43,2,65774,60600,65814,60589,30-AUG-18 09:47:32,...,8,22,2,40,-11,5174,5225,51,0,1
2022121,2018-05-28,6837668,39A,39A_40,1,81858,78000,82034,78005,14-JUN-18 13:31:37,...,5,28,0,176,5,3858,4029,171,0,0
839582,2018-05-30,6847665,39A,39A_40,1,69212,63600,70345,63615,14-JUN-18 13:52:15,...,5,30,2,1133,15,5612,6730,1118,0,0
1535575,2018-11-03,8068675,39A,39A_40,1,65284,59400,66135,59363,13-NOV-18 16:32:17,...,11,3,5,851,-37,5884,6772,888,1,0


In [14]:
train.dtypes

DAYOFSERVICE        object
TRIPID               int64
LINEID              object
ROUTEID             object
DIRECTION            int64
PLANNEDTIME_ARR      int64
PLANNEDTIME_DEP      int64
ACTUALTIME_ARR       int64
ACTUALTIME_DEP       int64
LASTUPDATE          object
NOTE                object
planDep_time         int64
month                int64
day                  int64
dayOfWeek            int64
arrival_diff         int64
departure_diff       int64
planned_duration     int64
actual_duration      int64
duration_diff        int64
weekend              int64
rushHour             int64
dtype: object

## Multiple Linear Regression

<h3>Training</h3>

In [16]:
X = train[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'TRIPID']]
y = train.duration_diff

In [17]:
linreg = LinearRegression().fit(X, y)

# Weights for each Feature
print("Features: \n", X)
print("Coeficients: \n", linreg.coef_)
print("\nIntercept: \n", linreg.intercept_)

feature_importance = pd.DataFrame({'feature': ['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'TRIPID'], 'importance':linreg.coef_})
feature_importance.sort_values('importance', ascending=False)

Features: 
          dayOfWeek  weekend  rushHour  planDep_time  planned_duration  \
387676           5        1         0            10              3936   
1440361          1        0         0            11              4689   
1509720          2        0         1             8              5747   
917798           1        0         1             8              5016   
1192829          5        1         0            16              5884   
...            ...      ...       ...           ...               ...   
1655584          0        0         0            14              4640   
176152           1        0         0            23              2031   
1364890          0        0         0            14              5079   
162049           3        0         0             7              4460   
902082           5        1         0            18              4806   

         DIRECTION   TRIPID  
387676           2  7324278  
1440361          2  8089756  
1509720          2  6

Unnamed: 0,feature,importance
2,rushHour,237.136296
0,dayOfWeek,64.598283
3,planDep_time,8.510935
6,TRIPID,8.7e-05
4,planned_duration,-0.032178
1,weekend,-246.712748
5,DIRECTION,-300.074271


<h2>Prediction & Evaluation on Training Data</h2>

In [18]:
train_linreg_predictions = linreg.predict(X)

train_actual_vs_pred_linreg = pd.concat([y, pd.DataFrame(train_linreg_predictions, columns=['pred_duration_diff'], index=y.index)], axis=1)
train_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
387676,807,311.92157
1440361,419,350.87549
1509720,453,446.399142
917798,-148,727.184978
1192829,-271,668.767147
1593098,84,610.66945
637934,-182,650.02559
1873640,794,561.341002
1130672,439,553.554007
1366305,808,851.299341


In [19]:
# Function to output evaluation metrics
def printMetrics(testActualVal, predictions):
    #classification evaluation measures
    print("MAE: ", metrics.mean_absolute_error(testActualVal, predictions))
    print("MSE: ", metrics.mean_squared_error(testActualVal, predictions))
    print("RMSE: ", metrics.mean_squared_error(testActualVal, predictions)**0.5)
    print("R2: ", metrics.r2_score(testActualVal, predictions))

In [20]:
printMetrics(y, train_linreg_predictions)

MAE:  432.82236971951886
MSE:  307807.40346015827
RMSE:  554.8039324483544
R2:  0.11806307353792889


<h2>Prediction & Evaluation on Testing Data</h2>

In [21]:
X_test = test[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'TRIPID']]
y_test = test.duration_diff

In [22]:
test_linreg_predictions = linreg.predict(X_test)

test_actual_vs_pred_linreg = pd.concat([y_test, pd.DataFrame(test_linreg_predictions, columns=['pred_duration_diff'], index=y_test.index)], axis=1)
test_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
1301370,-146,82.808615
973689,953,822.349771
183388,1208,735.838616
849883,1110,469.324156
1778443,498,844.747924
233078,713,743.522922
1526458,587,433.313696
1670901,565,724.167629
598677,569,163.674605
1626305,1293,212.18128


In [23]:
printMetrics(y_test, test_linreg_predictions)

MAE:  432.6285667548178
MSE:  307891.05981686985
RMSE:  554.879320047945
R2:  0.11963685757035625


<h2>Prediction & Evaluation on Full Data (5-Fold Cross-Validation):</h2>

<h3>1st Fold</h3>

In [25]:
cv_lin1, cv_lin2, cv_lin3, cv_lin4, cv_lin5 = np.array_split(shuf_39a, 5)

In [27]:
cvlin = cv_lin2
cvlin = cvlin.append(cv_lin3)
cvlin = cvlin.append(cv_lin4)
cvlin = cvlin.append(cv_lin5)
cvlin.shape

  cvlin = cvlin.append(cv_lin3)
  cvlin = cvlin.append(cv_lin4)
  cvlin = cvlin.append(cv_lin5)


(39604, 22)

In [28]:
X_cvlin = cvlin[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'TRIPID']]
y_cvlin = cvlin.duration_diff

In [29]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
1869525,311,590.492412
1742275,-1250,41.222699
1612292,1350,829.043445
217133,-350,890.444442
1206697,1292,1038.47171
856966,1425,807.976497
976157,668,507.167111
1838287,357,574.30146
1120136,845,869.952251
29295,980,794.531764


In [30]:
X_cvlin = cv_lin1[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'TRIPID']]
y_cvlin = cv_lin1.duration_diff

In [31]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
1496429,2014,857.590987
1097882,499,782.461363
1219060,1626,700.81358
1773838,1315,611.240661
1207309,587,634.803206
307085,-242,180.052266
966292,1485,904.126255
449185,1161,615.405372
1271776,1328,427.989898
2023971,368,447.353075


In [32]:
fold1_MAE = metrics.mean_absolute_error(y_cvlin, full_linreg_predictions)
fold1_MSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)
fold1_RMSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)**0.5
fold1_R2 = metrics.r2_score(y_cvlin, full_linreg_predictions)

print('Fold 1 MAE: ', fold1_MAE)
print('Fold 1 MSE: ', fold1_MSE)
print('Fold 1 RMSE: ', fold1_RMSE)
print('Fold 1 R2: ', fold1_R2)

Fold 1 MAE:  428.5476328136132
Fold 1 MSE:  303700.6696421634
Fold 1 RMSE:  551.090436899574
Fold 1 R2:  0.12037489613449748


<h3>2nd Fold</h3>

In [33]:
cvlin2 = cv_lin1
cvlin2 = cvlin2.append(cv_lin3)
cvlin2 = cvlin2.append(cv_lin4)
cvlin2 = cvlin2.append(cv_lin5)
cvlin2.shape

  cvlin2 = cvlin2.append(cv_lin3)
  cvlin2 = cvlin2.append(cv_lin4)
  cvlin2 = cvlin2.append(cv_lin5)


(39605, 22)

In [34]:
X_cvlin = cvlin2[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'TRIPID']]
y_cvlin = cvlin2.duration_diff

In [35]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
1496429,2014,857.590987
1097882,499,782.461363
1219060,1626,700.81358
1773838,1315,611.240661
1207309,587,634.803206
307085,-242,180.052266
966292,1485,904.126255
449185,1161,615.405372
1271776,1328,427.989898
2023971,368,447.353075


In [36]:
X_cvlin = cv_lin2[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'TRIPID']]
y_cvlin = cv_lin2.duration_diff

In [37]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
1869525,311,590.492412
1742275,-1250,41.222699
1612292,1350,829.043445
217133,-350,890.444442
1206697,1292,1038.47171
856966,1425,807.976497
976157,668,507.167111
1838287,357,574.30146
1120136,845,869.952251
29295,980,794.531764


In [38]:
fold2_MAE = metrics.mean_absolute_error(y_cvlin, full_linreg_predictions)
fold2_MSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)
fold2_RMSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)**0.5
fold2_R2 = metrics.r2_score(y_cvlin, full_linreg_predictions)

print('Fold 2 MAE: ', fold2_MAE)
print('Fold 2 MSE: ', fold2_MSE)
print('Fold 2 RMSE: ', fold2_RMSE)
print('Fold 2 R2: ', fold2_R2)

Fold 2 MAE:  435.86664171279415
Fold 2 MSE:  306415.0934109268
Fold 2 RMSE:  553.5477336336288
Fold 2 R2:  0.12764106192904923


<h3>3rd Fold</h3>

In [39]:
cvlin3 = cv_lin1
cvlin3 = cvlin3.append(cv_lin2)
cvlin3 = cvlin3.append(cv_lin4)
cvlin3 = cvlin3.append(cv_lin5)
cvlin3.shape

  cvlin3 = cvlin3.append(cv_lin2)
  cvlin3 = cvlin3.append(cv_lin4)
  cvlin3 = cvlin3.append(cv_lin5)


(39605, 22)

In [40]:
X_cvlin = cvlin3[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'TRIPID']]
y_cvlin = cvlin3.duration_diff

In [41]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
1496429,2014,857.590987
1097882,499,782.461363
1219060,1626,700.81358
1773838,1315,611.240661
1207309,587,634.803206
307085,-242,180.052266
966292,1485,904.126255
449185,1161,615.405372
1271776,1328,427.989898
2023971,368,447.353075


In [42]:
X_cvlin = cv_lin3[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'TRIPID']]
y_cvlin = cv_lin3.duration_diff

In [43]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
1813544,902,947.809114
1339693,355,282.926948
1510686,389,449.858031
412655,1143,764.046589
655755,543,428.674564
1711181,325,403.455647
420280,623,486.480662
44961,102,815.061378
1394133,146,299.675666
168791,-290,600.540693


In [44]:
fold3_MAE = metrics.mean_absolute_error(y_cvlin, full_linreg_predictions)
fold3_MSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)
fold3_RMSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)**0.5
fold3_R2 = metrics.r2_score(y_cvlin, full_linreg_predictions)

print('Fold 3 MAE: ', fold3_MAE)
print('Fold 3 MSE: ', fold3_MSE)
print('Fold 3 RMSE: ', fold3_RMSE)
print('Fold 3 R2: ', fold3_R2)

Fold 3 MAE:  432.6202789390716
Fold 3 MSE:  306825.3009316891
Fold 3 RMSE:  553.9181355865586
Fold 3 R2:  0.11554410974271101


<h3>4th Fold</h3>

In [45]:
cvlin4 = cv_lin1
cvlin4 = cvlin4.append(cv_lin2)
cvlin4 = cvlin4.append(cv_lin3)
cvlin4 = cvlin4.append(cv_lin5)
cvlin4.shape

  cvlin4 = cvlin4.append(cv_lin2)
  cvlin4 = cvlin4.append(cv_lin3)
  cvlin4 = cvlin4.append(cv_lin5)


(39605, 22)

In [46]:
X_cvlin = cvlin4[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'TRIPID']]
y_cvlin = cvlin4.duration_diff

In [47]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
1496429,2014,857.590987
1097882,499,782.461363
1219060,1626,700.81358
1773838,1315,611.240661
1207309,587,634.803206
307085,-242,180.052266
966292,1485,904.126255
449185,1161,615.405372
1271776,1328,427.989898
2023971,368,447.353075


In [48]:
X_cvlin = cv_lin4[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'TRIPID']]
y_cvlin = cv_lin4.duration_diff

In [49]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
1993771,908,858.91151
2024320,1127,372.321603
255512,852,744.672128
1698614,-234,379.727433
558594,307,428.945544
174410,748,441.442153
816840,668,437.033426
754565,816,626.345762
1449138,-4,249.636572
1905203,-394,303.248346


In [50]:
fold4_MAE = metrics.mean_absolute_error(y_cvlin, full_linreg_predictions)
fold4_MSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)
fold4_RMSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)**0.5
fold4_R2 = metrics.r2_score(y_cvlin, full_linreg_predictions)

print('Fold 4 MAE: ', fold4_MAE)
print('Fold 4 MSE: ', fold4_MSE)
print('Fold 4 RMSE: ', fold4_RMSE)
print('Fold 4 R2: ', fold4_R2)

Fold 4 MAE:  430.87914191278645
Fold 4 MSE:  307416.6352005294
Fold 4 RMSE:  554.4516527169249
Fold 4 R2:  0.11927082054207372


<h3>5th Fold</h3>

In [51]:
cvlin5 = cv_lin1
cvlin5 = cvlin5.append(cv_lin2)
cvlin5 = cvlin5.append(cv_lin3)
cvlin5 = cvlin5.append(cv_lin4)
cvlin5.shape

  cvlin5 = cvlin5.append(cv_lin2)
  cvlin5 = cvlin5.append(cv_lin3)
  cvlin5 = cvlin5.append(cv_lin4)


(39605, 22)

In [52]:
X_cvlin = cvlin5[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'TRIPID']]
y_cvlin = cvlin5.duration_diff

In [53]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
1496429,2014,857.590987
1097882,499,782.461363
1219060,1626,700.81358
1773838,1315,611.240661
1207309,587,634.803206
307085,-242,180.052266
966292,1485,904.126255
449185,1161,615.405372
1271776,1328,427.989898
2023971,368,447.353075


In [54]:
X_cvlin = cv_lin5[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'TRIPID']]
y_cvlin = cv_lin5.duration_diff

In [55]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

Unnamed: 0,duration_diff,pred_duration_diff
1318647,1473,850.059101
482014,759,625.273964
558750,17,285.221906
1905226,1925,910.656209
1240298,-93,718.016441
987254,2180,424.381074
247696,-149,364.7454
122527,316,249.053964
1078797,798,688.550558
719894,780,824.468951


In [56]:
fold5_MAE = metrics.mean_absolute_error(y_cvlin, full_linreg_predictions)
fold5_MSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)
fold5_RMSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)**0.5
fold5_R2 = metrics.r2_score(y_cvlin, full_linreg_predictions)

print('Fold 5 MAE: ', fold5_MAE)
print('Fold 5 MSE: ', fold5_MSE)
print('Fold 5 RMSE: ', fold5_RMSE)
print('Fold 5 R2: ', fold5_R2)

Fold 5 MAE:  435.90787073323315
Fold 5 MSE:  314805.2216548924
Fold 5 RMSE:  561.0750588423017
Fold 5 R2:  0.1094344216117421


<h3>5-Fold Cross-Validation Metrics:</h3>

In [57]:
cv_MAE = (fold1_MAE + fold2_MAE + fold3_MAE + fold4_MAE + fold5_MAE) / 5
cv_MSE = (fold1_MSE + fold2_MSE + fold3_MSE + fold4_MSE + fold5_MSE) / 5
cv_RMSE = (fold1_RMSE + fold2_RMSE + fold3_RMSE + fold4_RMSE + fold5_RMSE) / 5
cv_R2 = (fold1_R2 + fold2_R2 + fold3_R2 + fold4_R2 + fold5_R2) / 5

print("MAE: ", cv_MAE)
print("MSE: ", cv_MSE)
print("RMSE: ", cv_RMSE)
print("R2: ", cv_R2)

MAE:  432.76431322229973
MSE:  307832.58416804025
RMSE:  554.8166035357975
R2:  0.1184530619920147
