In [1]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns

# Allows plots to appear directly in the notebook.
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

import pickle

In [2]:
# Read the tripLeave1 route-split-2 CSV file into a DataFrame.
# LINEID holds route labels that mix purely numeric names ('84', '120') with
# alphanumeric ones ('56A', '27A'). Left to type inference on a file this
# large, pandas may parse the column chunk by chunk and store a mix of ints
# and strings, in which case the string comparisons used by the route filters
# (e.g. wk['LINEID'] == '84') would silently miss rows. Forcing str keeps
# every label a string.
wk = pd.read_csv(
    'v5_tripLeave1_route_split_2.csv',
    keep_default_na=True,
    delimiter=',',
    skipinitialspace=True,
    dtype={'LINEID': str},
)

In [3]:
# Dimensions (rows, columns) of the frame as loaded from the CSV.
wk.shape

(12947289, 22)

In [4]:
# Preview the first rows of the loaded frame.
wk.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,...,id,trip_leave_id,LINEID,num_lineID,DIRECTION,actual_duration,dayOfWeek,weekend,rushHour,actualARR_hour
0,2018-01-01,5955784,3,7271,31848,31848,31848,31848,2534807,2018,...,20180101595578403,201801015955784,84,126,2,4255,0,0,0,9
1,2018-01-01,5955784,4,5136,31858,31858,31858,31858,2534807,2018,...,20180101595578404,201801015955784,84,126,2,4255,0,0,0,9
2,2018-01-01,5955784,5,4262,31905,31905,31905,31905,2534807,2018,...,20180101595578405,201801015955784,84,126,2,4255,0,0,0,9
3,2018-01-01,5955784,6,4263,31957,31957,31957,31957,2534807,2018,...,20180101595578406,201801015955784,84,126,2,4255,0,0,0,9
4,2018-01-01,5955784,7,4264,31995,31995,31995,31995,2534807,2018,...,20180101595578407,201801015955784,84,126,2,4255,0,0,0,9


In [5]:
# Column dtypes as inferred by read_csv, before any casts are applied.
wk.dtypes

DAYOFSERVICE       object
TRIPID              int64
PROGRNUMBER         int64
STOPPOINTID         int64
PLANNEDTIME_ARR     int64
PLANNEDTIME_DEP     int64
ACTUALTIME_ARR      int64
ACTUALTIME_DEP      int64
VEHICLEID           int64
year                int64
month               int64
day                 int64
id                  int64
trip_leave_id       int64
LINEID             object
num_lineID          int64
DIRECTION           int64
actual_duration     int64
dayOfWeek           int64
weekend             int64
rushHour            int64
actualARR_hour      int64
dtype: object

## Datatype Changes

In [6]:
# Store the numeric line id and the direction flag as 16-bit integers.
for column in ('num_lineID', 'DIRECTION'):
    wk[column] = wk[column].astype('int16')

In [7]:
# Set explicit integer widths: actual_duration as int64, dayOfWeek as int16.
for column, width in (('actual_duration', 'int64'), ('dayOfWeek', 'int16')):
    wk[column] = wk[column].astype(width)

In [8]:
# Store the weekend and rush-hour columns as 16-bit integers.
for column in ('weekend', 'rushHour'):
    wk[column] = wk[column].astype('int16')

In [9]:
# Preview the first rows again after the dtype casts.
wk.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,...,id,trip_leave_id,LINEID,num_lineID,DIRECTION,actual_duration,dayOfWeek,weekend,rushHour,actualARR_hour
0,2018-01-01,5955784,3,7271,31848,31848,31848,31848,2534807,2018,...,20180101595578403,201801015955784,84,126,2,4255,0,0,0,9
1,2018-01-01,5955784,4,5136,31858,31858,31858,31858,2534807,2018,...,20180101595578404,201801015955784,84,126,2,4255,0,0,0,9
2,2018-01-01,5955784,5,4262,31905,31905,31905,31905,2534807,2018,...,20180101595578405,201801015955784,84,126,2,4255,0,0,0,9
3,2018-01-01,5955784,6,4263,31957,31957,31957,31957,2534807,2018,...,20180101595578406,201801015955784,84,126,2,4255,0,0,0,9
4,2018-01-01,5955784,7,4264,31995,31995,31995,31995,2534807,2018,...,20180101595578407,201801015955784,84,126,2,4255,0,0,0,9


In [10]:
# Column dtypes after the casts above.
wk.dtypes

DAYOFSERVICE       object
TRIPID              int64
PROGRNUMBER         int64
STOPPOINTID         int64
PLANNEDTIME_ARR     int64
PLANNEDTIME_DEP     int64
ACTUALTIME_ARR      int64
ACTUALTIME_DEP      int64
VEHICLEID           int64
year                int64
month               int64
day                 int64
id                  int64
trip_leave_id       int64
LINEID             object
num_lineID          int16
DIRECTION           int16
actual_duration     int64
dayOfWeek           int16
weekend             int16
rushHour            int16
actualARR_hour      int64
dtype: object

## Route 84

In [11]:
# Collect every Route 84 row from the working frame into lwk.
lwk = wk.loc[wk['LINEID'].eq('84')]

In [12]:
# Dimensions (rows, columns) of the Route 84 subset.
lwk.shape

(153825, 22)

In [13]:
# Remove every Route 84 row from the working frame (wk) in place.
route_mask = wk['LINEID'] == '84'
wk.drop(wk.index[route_mask.values], inplace=True)

In [14]:
# Preview the first rows left in wk after the Route 84 rows were dropped.
wk.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,...,id,trip_leave_id,LINEID,num_lineID,DIRECTION,actual_duration,dayOfWeek,weekend,rushHour,actualARR_hour
1026,2018-01-01,5955815,1,6004,36000,36000,36000,36000,1001212,2018,...,20180101595581501,201801015955815,120,8,1,1016,0,0,0,10
1027,2018-01-01,5955815,2,3,36073,36073,36073,36073,1001212,2018,...,20180101595581502,201801015955815,120,8,1,1016,0,0,0,10
1028,2018-01-01,5955815,3,192,36251,36251,36251,36251,1001212,2018,...,20180101595581503,201801015955815,120,8,1,1016,0,0,0,10
1029,2018-01-01,5955815,4,795,36278,36278,36278,36278,1001212,2018,...,20180101595581504,201801015955815,120,8,1,1016,0,0,0,10
1030,2018-01-01,5955815,5,796,36305,36305,36305,36305,1001212,2018,...,20180101595581505,201801015955815,120,8,1,1016,0,0,0,10


In [15]:
# Dimensions of wk after the Route 84 rows were dropped.
wk.shape

(12793464, 22)

<h3>Send to CSV</h3>

In [16]:
# Write the Route 84 subset to its own CSV file, without the index column.
route_csv = 'v5_84_tripLeave1_sp2.csv'
lwk.to_csv(route_csv, index=False)

## Route 120

In [17]:
# Collect every Route 120 row from the working frame into lwk.
lwk = wk.loc[wk['LINEID'].eq('120')]

In [18]:
# Dimensions (rows, columns) of the Route 120 subset.
lwk.shape

(136499, 22)

In [19]:
# Remove every Route 120 row from the working frame (wk) in place.
route_mask = wk['LINEID'] == '120'
wk.drop(wk.index[route_mask.values], inplace=True)

In [20]:
# Preview the first rows left in wk after the Route 120 rows were dropped.
wk.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,...,id,trip_leave_id,LINEID,num_lineID,DIRECTION,actual_duration,dayOfWeek,weekend,rushHour,actualARR_hour
1223,2018-01-01,5955843,1,449,39000,39000,39000,39000,2534811,2018,...,20180101595584301,201801015955843,11,3,2,2884,0,0,0,11
1224,2018-01-01,5955843,2,450,39033,39033,39033,39033,2534811,2018,...,20180101595584302,201801015955843,11,3,2,2884,0,0,0,11
1225,2018-01-01,5955843,3,3181,39100,39100,39100,39100,2534811,2018,...,20180101595584303,201801015955843,11,3,2,2884,0,0,0,11
1226,2018-01-01,5955843,4,451,39131,39131,39131,39131,2534811,2018,...,20180101595584304,201801015955843,11,3,2,2884,0,0,0,11
1227,2018-01-01,5955843,5,447,39178,39178,39178,39178,2534811,2018,...,20180101595584305,201801015955843,11,3,2,2884,0,0,0,11


In [21]:
# Dimensions of wk after the Route 120 rows were dropped.
wk.shape

(12656965, 22)

<h3>Send to CSV</h3>

In [22]:
# Write the Route 120 subset to its own CSV file, without the index column.
route_csv = 'v5_120_tripLeave1_sp2.csv'
lwk.to_csv(route_csv, index=False)

## Route 11

In [23]:
# Collect every Route 11 row from the working frame into lwk.
lwk = wk.loc[wk['LINEID'].eq('11')]

In [24]:
# Dimensions (rows, columns) of the Route 11 subset.
lwk.shape

(258160, 22)

In [25]:
# Remove every Route 11 row from the working frame (wk) in place.
route_mask = wk['LINEID'] == '11'
wk.drop(wk.index[route_mask.values], inplace=True)

In [26]:
# Preview the first rows left in wk after the Route 11 rows were dropped.
wk.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,...,id,trip_leave_id,LINEID,num_lineID,DIRECTION,actual_duration,dayOfWeek,weekend,rushHour,actualARR_hour
1475,2018-01-01,5955967,1,395,35100,35100,35100,35100,1932330,2018,...,20180101595596701,201801015955967,56A,94,1,1935,0,0,0,10
1476,2018-01-01,5955967,2,396,35155,35155,35155,35155,1932330,2018,...,20180101595596702,201801015955967,56A,94,1,1935,0,0,0,10
1477,2018-01-01,5955967,3,397,35185,35185,35185,35185,1932330,2018,...,20180101595596703,201801015955967,56A,94,1,1935,0,0,0,10
1478,2018-01-01,5955967,4,398,35214,35214,35214,35214,1932330,2018,...,20180101595596704,201801015955967,56A,94,1,1935,0,0,0,10
1479,2018-01-01,5955967,5,399,35238,35238,35238,35238,1932330,2018,...,20180101595596705,201801015955967,56A,94,1,1935,0,0,0,10


In [27]:
# Dimensions of wk after the Route 11 rows were dropped.
wk.shape

(12398805, 22)

<h3>Send to CSV</h3>

In [28]:
# Write the Route 11 subset to its own CSV file, without the index column.
route_csv = 'v5_11_tripLeave1_sp2.csv'
lwk.to_csv(route_csv, index=False)

## Route 56A

In [29]:
# Collect every Route 56A row from the working frame into lwk.
lwk = wk.loc[wk['LINEID'].eq('56A')]

In [30]:
# Dimensions (rows, columns) of the Route 56A subset.
lwk.shape

(86426, 22)

In [31]:
# Remove every Route 56A row from the working frame (wk) in place.
route_mask = wk['LINEID'] == '56A'
wk.drop(wk.index[route_mask.values], inplace=True)

In [32]:
# Preview the first rows left in wk after the Route 56A rows were dropped.
wk.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,...,id,trip_leave_id,LINEID,num_lineID,DIRECTION,actual_duration,dayOfWeek,weekend,rushHour,actualARR_hour
2014,2018-01-01,5956060,3,2807,46625,46625,46625,46625,1000156,2018,...,20180101595606003,201801015956060,18,30,2,3274,0,0,0,13
2015,2018-01-01,5956060,4,2808,46675,46675,46675,46675,1000156,2018,...,20180101595606004,201801015956060,18,30,2,3274,0,0,0,13
2016,2018-01-01,5956060,5,486,46810,46810,46810,46810,1000156,2018,...,20180101595606005,201801015956060,18,30,2,3274,0,0,0,13
2017,2018-01-01,5956060,6,487,46883,46883,46883,46883,1000156,2018,...,20180101595606006,201801015956060,18,30,2,3274,0,0,0,13
2018,2018-01-01,5956060,7,2798,46917,46917,46917,46917,1000156,2018,...,20180101595606007,201801015956060,18,30,2,3274,0,0,0,13


In [33]:
# Dimensions of wk after the Route 56A rows were dropped.
wk.shape

(12312379, 22)

<h3>Send to CSV</h3>

In [34]:
# Write the Route 56A subset to its own CSV file, without the index column.
route_csv = 'v5_56A_tripLeave1_sp2.csv'
lwk.to_csv(route_csv, index=False)

## Route 18

In [35]:
# Collect every Route 18 row from the working frame into lwk.
lwk = wk.loc[wk['LINEID'].eq('18')]

In [36]:
# Dimensions (rows, columns) of the Route 18 subset.
lwk.shape

(211465, 22)

In [37]:
# Remove every Route 18 row from the working frame (wk) in place.
route_mask = wk['LINEID'] == '18'
wk.drop(wk.index[route_mask.values], inplace=True)

In [38]:
# Preview the first rows left in wk after the Route 18 rows were dropped.
wk.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,...,id,trip_leave_id,LINEID,num_lineID,DIRECTION,actual_duration,dayOfWeek,weekend,rushHour,actualARR_hour
2319,2018-01-01,5956222,4,962,28920,28920,28920,28920,2406900,2018,...,20180101595622204,201801015956222,27A,45,2,1557,0,0,1,8
2320,2018-01-01,5956222,5,635,28956,28956,28956,28956,2406900,2018,...,20180101595622205,201801015956222,27A,45,2,1557,0,0,1,8
2321,2018-01-01,5956222,6,636,28979,28979,28979,28979,2406900,2018,...,20180101595622206,201801015956222,27A,45,2,1557,0,0,1,8
2322,2018-01-01,5956222,7,637,29017,29017,29017,29017,2406900,2018,...,20180101595622207,201801015956222,27A,45,2,1557,0,0,1,8
2323,2018-01-01,5956222,8,638,29036,29036,29036,29036,2406900,2018,...,20180101595622208,201801015956222,27A,45,2,1557,0,0,1,8


In [39]:
# Dimensions of wk after the Route 18 rows were dropped.
wk.shape

(12100914, 22)

<h3>Send to CSV</h3>

In [40]:
# Write the Route 18 subset to its own CSV file, without the index column.
route_csv = 'v5_18_tripLeave1_sp2.csv'
lwk.to_csv(route_csv, index=False)

## Route 27A

In [41]:
# Collect every Route 27A row from the working frame into lwk.
lwk = wk.loc[wk['LINEID'].eq('27A')]

In [42]:
# Dimensions (rows, columns) of the Route 27A subset.
lwk.shape

(124391, 22)

In [43]:
# Remove every Route 27A row from the working frame (wk) in place.
route_mask = wk['LINEID'] == '27A'
wk.drop(wk.index[route_mask.values], inplace=True)

In [44]:
# Preview the first rows left in wk after the Route 27A rows were dropped.
wk.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,...,id,trip_leave_id,LINEID,num_lineID,DIRECTION,actual_duration,dayOfWeek,weekend,rushHour,actualARR_hour
2592,2018-01-01,5956256,1,4747,48600,48600,48600,48600,1000186,2018,...,20180101595625601,201801015956256,17A,29,1,3166,0,0,0,14
2593,2018-01-01,5956256,2,7298,48890,48890,48890,48890,1000186,2018,...,20180101595625602,201801015956256,17A,29,1,3166,0,0,0,14
2594,2018-01-01,5956256,3,2294,48960,48960,48960,48960,1000186,2018,...,20180101595625603,201801015956256,17A,29,1,3166,0,0,0,14
2595,2018-01-01,5956256,4,6039,49046,49046,49046,49046,1000186,2018,...,20180101595625604,201801015956256,17A,29,1,3166,0,0,0,14
2596,2018-01-01,5956256,6,6171,49144,49144,49144,49144,1000186,2018,...,20180101595625606,201801015956256,17A,29,1,3166,0,0,0,14


In [45]:
# Dimensions of wk after the Route 27A rows were dropped.
wk.shape

(11976523, 22)

<h3>Send to CSV</h3>

In [46]:
# Write the Route 27A subset to its own CSV file, without the index column.
route_csv = 'v5_27A_tripLeave1_sp2.csv'
lwk.to_csv(route_csv, index=False)

## Route 17A

In [47]:
# Collect every Route 17A row from the working frame into lwk.
lwk = wk.loc[wk['LINEID'].eq('17A')]

In [48]:
# Dimensions (rows, columns) of the Route 17A subset.
lwk.shape

(304922, 22)

In [49]:
# Remove every Route 17A row from the working frame (wk) in place.
route_mask = wk['LINEID'] == '17A'
wk.drop(wk.index[route_mask.values], inplace=True)

In [50]:
# Preview the first rows left in wk after the Route 17A rows were dropped.
wk.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,...,id,trip_leave_id,LINEID,num_lineID,DIRECTION,actual_duration,dayOfWeek,weekend,rushHour,actualARR_hour
2887,2018-01-01,5956265,1,226,36000,36000,36000,36000,2693264,2018,...,20180101595626501,201801015956265,1,0,1,1876,0,0,0,10
2888,2018-01-01,5956265,2,228,36052,36052,36052,36052,2693264,2018,...,20180101595626502,201801015956265,1,0,1,1876,0,0,0,10
2889,2018-01-01,5956265,3,229,36095,36095,36095,36095,2693264,2018,...,20180101595626503,201801015956265,1,0,1,1876,0,0,0,10
2890,2018-01-01,5956265,4,227,36166,36166,36166,36166,2693264,2018,...,20180101595626504,201801015956265,1,0,1,1876,0,0,0,10
2891,2018-01-01,5956265,5,230,36209,36209,36209,36209,2693264,2018,...,20180101595626505,201801015956265,1,0,1,1876,0,0,0,10


In [51]:
# Dimensions of wk after the Route 17A rows were dropped.
wk.shape

(11671601, 22)

<h3>Send to CSV</h3>

In [52]:
# Write the Route 17A subset to its own CSV file, without the index column.
route_csv = 'v5_17A_tripLeave1_sp2.csv'
lwk.to_csv(route_csv, index=False)

## Route 1

In [53]:
# Collect every Route 1 row from the working frame into lwk.
lwk = wk.loc[wk['LINEID'].eq('1')]

In [54]:
# Dimensions (rows, columns) of the Route 1 subset.
lwk.shape

(202277, 22)

In [55]:
# Remove every Route 1 row from the working frame (wk) in place.
route_mask = wk['LINEID'] == '1'
wk.drop(wk.index[route_mask.values], inplace=True)

In [56]:
# Preview the first rows left in wk after the Route 1 rows were dropped.
wk.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,...,id,trip_leave_id,LINEID,num_lineID,DIRECTION,actual_duration,dayOfWeek,weekend,rushHour,actualARR_hour
3094,2018-01-01,5956287,1,7564,34200,34200,34200,34200,2534840,2018,...,20180101595628701,201801015956287,65,98,1,3682,0,0,0,10
3095,2018-01-01,5956287,2,4521,34346,34346,34346,34346,2534840,2018,...,20180101595628702,201801015956287,65,98,1,3682,0,0,0,10
3096,2018-01-01,5956287,3,1283,34449,34449,34449,34449,2534840,2018,...,20180101595628703,201801015956287,65,98,1,3682,0,0,0,10
3097,2018-01-01,5956287,4,4456,34486,34486,34486,34486,2534840,2018,...,20180101595628704,201801015956287,65,98,1,3682,0,0,0,10
3098,2018-01-01,5956287,5,1284,34520,34520,34520,34520,2534840,2018,...,20180101595628705,201801015956287,65,98,1,3682,0,0,0,10


In [57]:
# Dimensions of wk after the Route 1 rows were dropped.
wk.shape

(11469324, 22)

<h3>Send to CSV</h3>

In [58]:
# Write the Route 1 subset to its own CSV file, without the index column.
route_csv = 'v5_1_tripLeave1_sp2.csv'
lwk.to_csv(route_csv, index=False)

## Route 65

In [59]:
# Collect every Route 65 row from the working frame into lwk.
lwk = wk.loc[wk['LINEID'].eq('65')]

In [60]:
# Dimensions (rows, columns) of the Route 65 subset.
lwk.shape

(112289, 22)

In [61]:
# Remove every Route 65 row from the working frame (wk) in place.
route_mask = wk['LINEID'] == '65'
wk.drop(wk.index[route_mask.values], inplace=True)

In [62]:
# Preview the first rows left in wk after the Route 65 rows were dropped.
wk.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,...,id,trip_leave_id,LINEID,num_lineID,DIRECTION,actual_duration,dayOfWeek,weekend,rushHour,actualARR_hour
3749,2018-01-01,5956325,3,3926,40597,40597,40597,40597,2534817,2018,...,20180101595632503,201801015956325,67,104,2,3352,0,0,0,11
3750,2018-01-01,5956325,4,7074,40649,40649,40649,40649,2534817,2018,...,20180101595632504,201801015956325,67,104,2,3352,0,0,0,11
3751,2018-01-01,5956325,5,3919,40728,40728,40728,40728,2534817,2018,...,20180101595632505,201801015956325,67,104,2,3352,0,0,0,11
3752,2018-01-01,5956325,6,3920,40802,40802,40802,40802,2534817,2018,...,20180101595632506,201801015956325,67,104,2,3352,0,0,0,11
3753,2018-01-01,5956325,7,3921,40840,40840,40840,40840,2534817,2018,...,20180101595632507,201801015956325,67,104,2,3352,0,0,0,11


In [63]:
# Dimensions of wk after the Route 65 rows were dropped.
wk.shape

(11357035, 22)

<h3>Send to CSV</h3>

In [64]:
# Write the Route 65 subset to its own CSV file, without the index column.
route_csv = 'v5_65_tripLeave1_sp2.csv'
lwk.to_csv(route_csv, index=False)

## Continue in v5_tripLeave1_route_split_3: save the remaining rows

In [65]:
# Write what is left in wk (every row not dropped above) to the split-3 CSV,
# without the index column.
remaining_csv = 'v5_tripLeave1_route_split_3.csv'
wk.to_csv(remaining_csv, index=False)