In [15]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns

# Allows plots to appear directly in the notebook.
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

import pickle

In [2]:
# Read CSV file into Data Frame:
trips = pd.read_csv('cleaned_trips.csv', keep_default_na=True, delimiter=',', skipinitialspace=True)

In [3]:
trips.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,...,month,day,dayOfWeek,arrival_diff,departure_diff,planned_duration,actual_duration,duration_diff,weekend,rushHour
0,2018-02-07,6253783,68,68_80,1,87245,84600,87524,84600,28-FEB-18 12:05:11,...,2,7,2,279,0,2645,2924,279,0,0
1,2018-02-07,6262138,25B,25B_271,2,30517,26460,32752,26460,28-FEB-18 12:05:11,...,2,7,2,2235,0,4057,6292,2235,0,0
2,2018-02-07,6254942,45A,45A_70,2,35512,32100,36329,32082,28-FEB-18 12:05:11,...,2,7,2,817,-18,3412,4247,835,0,0
3,2018-02-07,6259460,25A,25A_273,1,57261,54420,58463,54443,28-FEB-18 12:05:11,...,2,7,2,1202,23,2841,4020,1179,0,0
4,2018-02-07,6253175,14,14_15,1,85383,81600,84682,81608,28-FEB-18 12:05:11,...,2,7,2,-701,8,3783,3074,-709,0,0


In [4]:
trips.dtypes

DAYOFSERVICE        object
TRIPID               int64
LINEID              object
ROUTEID             object
DIRECTION            int64
PLANNEDTIME_ARR      int64
PLANNEDTIME_DEP      int64
ACTUALTIME_ARR       int64
ACTUALTIME_DEP       int64
LASTUPDATE          object
NOTE                object
planDep_time         int64
month                int64
day                  int64
dayOfWeek            int64
arrival_diff         int64
departure_diff       int64
planned_duration     int64
actual_duration      int64
duration_diff        int64
weekend              int64
rushHour             int64
dtype: object

In [5]:
trips['DIRECTION'] = trips['DIRECTION'].astype('int32')
trips['planDep_time'] = trips['planDep_time'].astype('int32')
trips['month'] = trips['month'].astype('int32')
trips['day'] = trips['day'].astype('int32')
trips['dayOfWeek'] = trips['dayOfWeek'].astype('int32')
trips['weekend'] = trips['weekend'].astype('int32')
trips['rushHour'] = trips['rushHour'].astype('int32')
trips['LINEID'] = trips['LINEID'].astype('category')

## Encoding Route Numbers

In [6]:
# Categorical Encoding
trips['num_routeID'] = trips['LINEID'].cat.codes
trips.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,...,day,dayOfWeek,arrival_diff,departure_diff,planned_duration,actual_duration,duration_diff,weekend,rushHour,num_routeID
0,2018-02-07,6253783,68,68_80,1,87245,84600,87524,84600,28-FEB-18 12:05:11,...,7,2,279,0,2645,2924,279,0,0,106
1,2018-02-07,6262138,25B,25B_271,2,30517,26460,32752,26460,28-FEB-18 12:05:11,...,7,2,2235,0,4057,6292,2235,0,0,39
2,2018-02-07,6254942,45A,45A_70,2,35512,32100,36329,32082,28-FEB-18 12:05:11,...,7,2,817,-18,3412,4247,835,0,0,85
3,2018-02-07,6259460,25A,25A_273,1,57261,54420,58463,54443,28-FEB-18 12:05:11,...,7,2,1202,23,2841,4020,1179,0,0,38
4,2018-02-07,6253175,14,14_15,1,85383,81600,84682,81608,28-FEB-18 12:05:11,...,7,2,-701,8,3783,3074,-709,0,0,13


In [7]:
trips.dtypes

DAYOFSERVICE          object
TRIPID                 int64
LINEID              category
ROUTEID               object
DIRECTION              int32
PLANNEDTIME_ARR        int64
PLANNEDTIME_DEP        int64
ACTUALTIME_ARR         int64
ACTUALTIME_DEP         int64
LASTUPDATE            object
NOTE                  object
planDep_time           int32
month                  int32
day                    int32
dayOfWeek              int32
arrival_diff           int64
departure_diff         int64
planned_duration       int64
actual_duration        int64
duration_diff          int64
weekend                int32
rushHour               int32
num_routeID            int16
dtype: object

## Shuffling Dataset:

In [8]:
# Row shuffle inspired from Geeks for Geeks: https://www.geeksforgeeks.org/pandas-how-to-shuffle-a-dataframe-rows/
shuf_trips = trips.sample(frac = 1)
shuf_trips.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,...,day,dayOfWeek,arrival_diff,departure_diff,planned_duration,actual_duration,duration_diff,weekend,rushHour,num_routeID
1951361,2018-01-01,5966975,122,122_14,1,72449,69000,72011,68997,08-JAN-18 17:21:10,...,1,0,-438,-3,3449,3014,-435,0,0,9
1104086,2018-11-27,8148876,130,130_10,1,69969,68100,70058,68149,10-DEC-18 13:43:19,...,27,1,89,49,1869,1909,40,0,0,12
1234549,2018-11-24,8121835,150,150_9,2,65877,63600,67190,64536,04-DEC-18 08:21:59,...,24,5,1313,936,2277,2654,377,1,0,19
532195,2018-03-31,6588534,27B,27B_23,1,28838,27000,28806,27000,09-APR-18 09:21:14,...,31,5,-32,0,1838,1806,-32,1,0,46
1981227,2018-11-05,8112935,145,145_88,2,32280,27000,32389,27000,13-NOV-18 16:52:57,...,5,0,109,0,5280,5389,109,0,1,16


## Splitting Shuffled Data: Train (70%) & Test (30%)

In [9]:
# train_test_split already includes a shuffle method, but no harm to shuffle again
train, test = train_test_split(shuf_trips, test_size=0.3, random_state=42, shuffle=True)

In [10]:
train

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,...,day,dayOfWeek,arrival_diff,departure_diff,planned_duration,actual_duration,duration_diff,weekend,rushHour,num_routeID
1736079,2018-02-26,6297384,17A,17A_17,2,63504,58800,63890,58736,21-MAR-18 09:37:23,...,26,0,386,-64,4704,5154,450,0,1,29
256045,2018-05-26,6784288,40,40_27,1,71088,66000,71271,66000,14-JUN-18 13:12:56,...,26,5,183,0,5088,5271,183,1,0,70
759245,2018-07-17,7174923,140,140_21,2,40040,36900,39291,36872,25-JUL-18 14:14:46,...,17,1,-749,-28,3140,2419,-721,0,0,14
1387878,2018-04-22,6632074,9,9_7,2,66464,62100,66670,62152,14-MAY-18 13:41:56,...,22,6,206,52,4364,4518,154,1,0,129
1914138,2018-11-28,8146740,33,33_44,1,27825,22800,28278,23325,10-DEC-18 14:14:08,...,28,2,453,525,5025,4953,-72,0,0,55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
699624,2018-08-07,7331939,37,37_14,1,65353,60900,65461,60890,18-AUG-18 19:07:42,...,7,1,108,-10,4453,4571,118,0,1,61
214385,2018-06-17,7030518,27A,27A_4,1,82542,81000,82667,80989,27-JUN-18 10:37:53,...,17,6,125,-11,1542,1678,136,1,0,45
1987855,2018-01-08,6093505,4,4_10,1,33836,28800,33716,28869,16-JAN-18 08:33:19,...,8,0,-120,69,5036,4847,-189,0,1,69
1241791,2018-08-02,7331098,75,75_19,2,43464,39600,44376,39609,18-AUG-18 18:10:33,...,2,3,912,9,3864,4767,903,0,0,114


In [11]:
test

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,...,day,dayOfWeek,arrival_diff,departure_diff,planned_duration,actual_duration,duration_diff,weekend,rushHour,num_routeID
763762,2018-01-29,6240620,39A,39A_43,2,44743,40200,45730,40200,28-FEB-18 10:24:30,...,29,0,987,0,4543,5530,987,0,0,67
1705519,2018-07-02,7142463,46A,46A_67,2,72783,69000,73920,69555,14-JUL-18 06:47:31,...,2,0,1137,555,3783,4365,582,0,0,86
1371340,2018-12-07,8161934,15,15_17,2,49567,45000,50533,45026,17-DEC-18 07:53:36,...,7,4,966,26,4567,5507,940,0,0,18
215218,2018-08-25,7322365,130,130_10,1,72378,70800,72801,70836,03-SEP-18 10:39:32,...,25,5,423,36,1578,1965,387,1,0,12
740001,2018-07-23,7173718,46A,46A_74,1,43127,39240,43601,39237,16-AUG-18 14:11:40,...,23,0,474,-3,3887,4364,477,0,0,86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
874635,2018-07-31,7332416,15,15_16,1,48840,43200,47549,43185,18-AUG-18 17:47:52,...,31,1,-1291,-15,5640,4364,-1276,0,0,18
559228,2018-09-24,7771467,39A,39A_40,1,68722,63000,69881,63007,22-OCT-18 17:24:51,...,24,0,1159,7,5722,6874,1152,0,0,67
840378,2018-07-23,7168407,13,13_60,1,44009,38400,44740,38521,16-AUG-18 14:11:40,...,23,0,731,121,5609,6219,610,0,0,11
1268745,2018-12-29,8579079,83A,83A_23,2,34319,31200,34581,31225,16-JAN-19 18:00:42,...,29,5,262,25,3119,3356,237,1,0,125


## Multiple Linear Regression

<h2>Training</h2>

In [12]:
X = train[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'num_routeID', 'TRIPID']]
y = train.duration_diff

In [13]:
linreg = LinearRegression().fit(X, y)

# Weights for each Feature
print("Features: \n", X)
print("Coeficients: \n", linreg.coef_)
print("\nIntercept: \n", linreg.intercept_)

feature_importance = pd.DataFrame({'feature': ['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'num_routeID', 'TRIPID'], 'importance':linreg.coef_})
feature_importance.sort_values('importance', ascending=False)

Features: 
          dayOfWeek  weekend  rushHour  planDep_time  planned_duration  \
1736079          0        0         1            16              4704   
256045           5        1         0            18              5088   
759245           1        0         0            10              3140   
1387878          6        1         0            17              4364   
1914138          2        0         0             6              5025   
...            ...      ...       ...           ...               ...   
699624           1        0         1            17              4453   
214385           6        1         0            22              1542   
1987855          0        0         1             8              5036   
1241791          3        0         0            11              3864   
1093             2        0         0             6              3778   

         DIRECTION  num_routeID   TRIPID  
1736079          2           29  6297384  
256045           1       

Unnamed: 0,feature,importance
2,rushHour,124.040977
0,dayOfWeek,42.422275
5,DIRECTION,3.955719
3,planDep_time,0.69583
6,num_routeID,0.165915
4,planned_duration,0.037673
7,TRIPID,6.7e-05
1,weekend,-149.185377


In [16]:
# Serialize model object into a file called model.pkl on disk using pickle
with open('trips_allRoutes_linreg_model.pkl', 'wb') as handle:
    pickle.dump(linreg, handle, pickle.HIGHEST_PROTOCOL)

<h2>Prediction & Evaluation on Training Data</h2>

In [None]:
train_linreg_predictions = linreg.predict(X)

train_actual_vs_pred_linreg = pd.concat([y, pd.DataFrame(train_linreg_predictions, columns=['pred_duration_diff'], index=y.index)], axis=1)
train_actual_vs_pred_linreg.head(10)

In [None]:
# Function to output evaluation metrics
def printMetrics(testActualVal, predictions):
    #classification evaluation measures
    print("MAE: ", metrics.mean_absolute_error(testActualVal, predictions))
    print("MSE: ", metrics.mean_squared_error(testActualVal, predictions))
    print("RMSE: ", metrics.mean_squared_error(testActualVal, predictions)**0.5)
    print("R2: ", metrics.r2_score(testActualVal, predictions))

In [None]:
printMetrics(y, train_linreg_predictions)

<h2>Prediction & Evaluation on Testing Data</h2>

In [None]:
X_test = test[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'num_routeID', 'TRIPID']]
y_test = test.duration_diff

In [None]:
test_linreg_predictions = linreg.predict(X_test)

test_actual_vs_pred_linreg = pd.concat([y_test, pd.DataFrame(test_linreg_predictions, columns=['pred_duration_diff'], index=y_test.index)], axis=1)
test_actual_vs_pred_linreg.head(10)

In [None]:
printMetrics(y_test, test_linreg_predictions)

<h2>Prediction & Evaluation on Full Data (5-Fold Cross-Validation):</h2>

<h3>1st Fold</h3>

In [None]:
cv_lin1, cv_lin2, cv_lin3, cv_lin4, cv_lin5 = np.array_split(shuf_trips, 5)

In [None]:
cvlin = cv_lin2
cvlin = cvlin.append(cv_lin3)
cvlin = cvlin.append(cv_lin4)
cvlin = cvlin.append(cv_lin5)
cvlin.shape

In [None]:
X_cvlin = cvlin[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'num_routeID', 'TRIPID']]
y_cvlin = cvlin.duration_diff

In [None]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

In [None]:
X_cvlin = cv_lin1[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'num_routeID', 'TRIPID']]
y_cvlin = cv_lin1.duration_diff

In [None]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

In [None]:
fold1_MAE = metrics.mean_absolute_error(y_cvlin, full_linreg_predictions)
fold1_MSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)
fold1_RMSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)**0.5
fold1_R2 = metrics.r2_score(y_cvlin, full_linreg_predictions)

print('Fold 1 MAE: ', fold1_MAE)
print('Fold 1 MSE: ', fold1_MSE)
print('Fold 1 RMSE: ', fold1_RMSE)
print('Fold 1 R2: ', fold1_R2)

<h3>2nd Fold</h3>

In [None]:
cvlin2 = cv_lin1
cvlin2 = cvlin2.append(cv_lin3)
cvlin2 = cvlin2.append(cv_lin4)
cvlin2 = cvlin2.append(cv_lin5)
cvlin2.shape

In [None]:
X_cvlin = cvlin2[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'num_routeID', 'TRIPID']]
y_cvlin = cvlin2.duration_diff

In [None]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

In [None]:
X_cvlin = cv_lin2[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'num_routeID', 'TRIPID']]
y_cvlin = cv_lin2.duration_diff

In [None]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

In [None]:
fold2_MAE = metrics.mean_absolute_error(y_cvlin, full_linreg_predictions)
fold2_MSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)
fold2_RMSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)**0.5
fold2_R2 = metrics.r2_score(y_cvlin, full_linreg_predictions)

print('Fold 2 MAE: ', fold2_MAE)
print('Fold 2 MSE: ', fold2_MSE)
print('Fold 2 RMSE: ', fold2_RMSE)
print('Fold 2 R2: ', fold2_R2)

<h3>3rd Fold</h3>

In [None]:
cvlin3 = cv_lin1
cvlin3 = cvlin3.append(cv_lin2)
cvlin3 = cvlin3.append(cv_lin4)
cvlin3 = cvlin3.append(cv_lin5)
cvlin3.shape

In [None]:
X_cvlin = cvlin3[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'num_routeID', 'TRIPID']]
y_cvlin = cvlin3.duration_diff

In [None]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

In [None]:
X_cvlin = cv_lin3[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'num_routeID', 'TRIPID']]
y_cvlin = cv_lin3.duration_diff

In [None]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

In [None]:
fold3_MAE = metrics.mean_absolute_error(y_cvlin, full_linreg_predictions)
fold3_MSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)
fold3_RMSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)**0.5
fold3_R2 = metrics.r2_score(y_cvlin, full_linreg_predictions)

print('Fold 3 MAE: ', fold3_MAE)
print('Fold 3 MSE: ', fold3_MSE)
print('Fold 3 RMSE: ', fold3_RMSE)
print('Fold 3 R2: ', fold3_R2)

<h3>4th Fold</h3>

In [None]:
cvlin4 = cv_lin1
cvlin4 = cvlin4.append(cv_lin2)
cvlin4 = cvlin4.append(cv_lin3)
cvlin4 = cvlin4.append(cv_lin5)
cvlin4.shape

In [None]:
X_cvlin = cvlin4[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'num_routeID', 'TRIPID']]
y_cvlin = cvlin4.duration_diff

In [None]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

In [None]:
X_cvlin = cv_lin4[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'num_routeID', 'TRIPID']]
y_cvlin = cv_lin4.duration_diff

In [None]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

In [None]:
fold4_MAE = metrics.mean_absolute_error(y_cvlin, full_linreg_predictions)
fold4_MSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)
fold4_RMSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)**0.5
fold4_R2 = metrics.r2_score(y_cvlin, full_linreg_predictions)

print('Fold 4 MAE: ', fold4_MAE)
print('Fold 4 MSE: ', fold4_MSE)
print('Fold 4 RMSE: ', fold4_RMSE)
print('Fold 4 R2: ', fold4_R2)

<h3>5th Fold</h3>

In [None]:
cvlin5 = cv_lin1
cvlin5 = cvlin5.append(cv_lin2)
cvlin5 = cvlin5.append(cv_lin3)
cvlin5 = cvlin5.append(cv_lin4)
cvlin5.shape

In [None]:
X_cvlin = cvlin5[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'num_routeID', 'TRIPID']]
y_cvlin = cvlin5.duration_diff

In [None]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

In [None]:
X_cvlin = cv_lin5[['dayOfWeek', 'weekend', 'rushHour', 'planDep_time', 'planned_duration', 'DIRECTION', 'num_routeID', 'TRIPID']]
y_cvlin = cv_lin5.duration_diff

In [None]:
full_linreg_predictions = linreg.predict(X_cvlin)

full_actual_vs_pred_linreg = pd.concat([y_cvlin, pd.DataFrame(full_linreg_predictions, columns=['pred_duration_diff'], index=y_cvlin.index)], axis=1)
full_actual_vs_pred_linreg.head(10)

In [None]:
fold5_MAE = metrics.mean_absolute_error(y_cvlin, full_linreg_predictions)
fold5_MSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)
fold5_RMSE = metrics.mean_squared_error(y_cvlin, full_linreg_predictions)**0.5
fold5_R2 = metrics.r2_score(y_cvlin, full_linreg_predictions)

print('Fold 5 MAE: ', fold5_MAE)
print('Fold 5 MSE: ', fold5_MSE)
print('Fold 5 RMSE: ', fold5_RMSE)
print('Fold 5 R2: ', fold5_R2)

<h3>5-Fold Cross-Validation Metrics:</h3>

In [None]:
cv_MAE = (fold1_MAE + fold2_MAE + fold3_MAE + fold4_MAE + fold5_MAE) / 5
cv_MSE = (fold1_MSE + fold2_MSE + fold3_MSE + fold4_MSE + fold5_MSE) / 5
cv_RMSE = (fold1_RMSE + fold2_RMSE + fold3_RMSE + fold4_RMSE + fold5_RMSE) / 5
cv_R2 = (fold1_R2 + fold2_R2 + fold3_R2 + fold4_R2 + fold5_R2) / 5

print("MAE: ", cv_MAE)
print("MSE: ", cv_MSE)
print("RMSE: ", cv_RMSE)
print("R2: ", cv_R2)