In [1]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns

# Allows plots to appear directly in the notebook.
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

In [2]:
# Load v3_leave3_LT_2018.csv into the leave3 DataFrame.
leave3 = pd.read_csv(
    'v3_leave3_LT_2018.csv',
    sep=',',
    skipinitialspace=True,
    keep_default_na=True,
)

In [3]:
# Preview the first rows of the freshly loaded frame.
leave3.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID
0,30-APR-18 00:00:00,6651088,12,2121,27558,27558,27558,27558,2172259
1,02-MAY-18 00:00:00,6651088,27,2716,28392,28392,28392,28392,2868347
2,01-MAY-18 00:00:00,6651088,67,1599,31698,31698,31698,31698,2172300
3,03-MAY-18 00:00:00,6651088,1,4795,27000,27000,27000,27000,1932309
4,30-APR-18 00:00:00,6651088,34,1994,28980,28980,28980,28980,2172259


In [4]:
# Row and column counts of the loaded frame.
leave3.shape

(19393499, 9)

In [5]:
# Column dtypes as inferred by read_csv.
leave3.dtypes

DAYOFSERVICE       object
TRIPID              int64
PROGRNUMBER         int64
STOPPOINTID         int64
PLANNEDTIME_ARR     int64
PLANNEDTIME_DEP     int64
ACTUALTIME_ARR      int64
ACTUALTIME_DEP      int64
VEHICLEID           int64
dtype: object

<h3>Downcasting column dtypes to reduce the memory footprint of the leave3 DataFrame</h3>

In [6]:
# Narrow TRIPID to 32-bit integers to reduce memory.
leave3['TRIPID'] = leave3['TRIPID'].astype(np.int32)

In [7]:
# Narrow PROGRNUMBER to 16-bit integers to reduce memory.
leave3['PROGRNUMBER'] = leave3['PROGRNUMBER'].astype(np.int16)

In [8]:
# Narrow STOPPOINTID to 16-bit integers to reduce memory.
# An integer-to-integer astype() is not range-checked, so a stop id outside the
# int16 range would silently wrap to a different id. Verify the values fit first.
assert leave3['STOPPOINTID'].between(np.iinfo(np.int16).min, np.iinfo(np.int16).max).all(), (
    'STOPPOINTID has values outside the int16 range'
)
leave3['STOPPOINTID'] = leave3['STOPPOINTID'].astype('int16')

In [9]:
# Narrow PLANNEDTIME_ARR to 32-bit integers to reduce memory.
leave3['PLANNEDTIME_ARR'] = leave3['PLANNEDTIME_ARR'].astype(np.int32)

In [10]:
# Narrow PLANNEDTIME_DEP to 32-bit integers to reduce memory.
leave3['PLANNEDTIME_DEP'] = leave3['PLANNEDTIME_DEP'].astype(np.int32)

In [11]:
# Narrow ACTUALTIME_ARR to 32-bit integers to reduce memory.
# The source must be ACTUALTIME_ARR itself: casting from PLANNEDTIME_DEP would
# overwrite the recorded arrival times with the planned departure times.
leave3['ACTUALTIME_ARR'] = leave3['ACTUALTIME_ARR'].astype('int32')

In [12]:
# Narrow ACTUALTIME_DEP to 32-bit integers to reduce memory.
# The source must be ACTUALTIME_DEP itself: casting from PLANNEDTIME_DEP would
# overwrite the recorded departure times with the planned departure times.
leave3['ACTUALTIME_DEP'] = leave3['ACTUALTIME_DEP'].astype('int32')

In [13]:
# Narrow VEHICLEID to 32-bit integers to reduce memory.
leave3['VEHICLEID'] = leave3['VEHICLEID'].astype(np.int32)

In [14]:
# Confirm the narrower integer dtypes were applied.
leave3.dtypes

DAYOFSERVICE       object
TRIPID              int32
PROGRNUMBER         int16
STOPPOINTID         int16
PLANNEDTIME_ARR     int32
PLANNEDTIME_DEP     int32
ACTUALTIME_ARR      int32
ACTUALTIME_DEP      int32
VEHICLEID           int32
dtype: object

In [15]:
# Preview the rows after the dtype downcast.
leave3.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID
0,30-APR-18 00:00:00,6651088,12,2121,27558,27558,27558,27558,2172259
1,02-MAY-18 00:00:00,6651088,27,2716,28392,28392,28392,28392,2868347
2,01-MAY-18 00:00:00,6651088,67,1599,31698,31698,31698,31698,2172300
3,03-MAY-18 00:00:00,6651088,1,4795,27000,27000,27000,27000,1932309
4,30-APR-18 00:00:00,6651088,34,1994,28980,28980,28980,28980,2172259


<h2>Add Day, Month, Year Columns</h2>

<h3>leave3:</h3>

In [16]:
# Re-check dtypes before working on the DAYOFSERVICE column.
leave3.dtypes

DAYOFSERVICE       object
TRIPID              int32
PROGRNUMBER         int16
STOPPOINTID         int16
PLANNEDTIME_ARR     int32
PLANNEDTIME_DEP     int32
ACTUALTIME_ARR      int32
ACTUALTIME_DEP      int32
VEHICLEID           int32
dtype: object

In [17]:
# Parse DAYOFSERVICE into datetime64[ns].
# The raw values look like '30-APR-18 00:00:00'. pd.to_datetime with an explicit
# format avoids format inference and replaces the unit-less astype('datetime64')
# cast, which pandas 2.x rejects.
# NOTE(review): %b matches English month abbreviations; confirm the kernel does
# not run under a non-English locale.
leave3['DAYOFSERVICE'] = pd.to_datetime(leave3['DAYOFSERVICE'], format='%d-%b-%y %H:%M:%S')

In [18]:
# Preview the rows after DAYOFSERVICE was parsed as a datetime.
leave3.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID
0,2018-04-30,6651088,12,2121,27558,27558,27558,27558,2172259
1,2018-05-02,6651088,27,2716,28392,28392,28392,28392,2868347
2,2018-05-01,6651088,67,1599,31698,31698,31698,31698,2172300
3,2018-05-03,6651088,1,4795,27000,27000,27000,27000,1932309
4,2018-04-30,6651088,34,1994,28980,28980,28980,28980,2172259


In [19]:
# Render DAYOFSERVICE as text (pandas string dtype) so it can be split on '-'.
leave3['DAYOFSERVICE'] = leave3['DAYOFSERVICE'].astype(pd.StringDtype())

In [20]:
# Preview the rows after DAYOFSERVICE was converted to text.
leave3.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID
0,2018-04-30,6651088,12,2121,27558,27558,27558,27558,2172259
1,2018-05-02,6651088,27,2716,28392,28392,28392,28392,2868347
2,2018-05-01,6651088,67,1599,31698,31698,31698,31698,2172300
3,2018-05-03,6651088,1,4795,27000,27000,27000,27000,1932309
4,2018-04-30,6651088,34,1994,28980,28980,28980,28980,2172259


In [21]:
# Break the 'YYYY-MM-DD' text in DAYOFSERVICE into year, month and day columns.
leave3[['year', 'month', 'day']] = leave3['DAYOFSERVICE'].str.split(pat='-', expand=True)

In [22]:
# Preview the new year, month and day columns.
leave3.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day
0,2018-04-30,6651088,12,2121,27558,27558,27558,27558,2172259,2018,4,30
1,2018-05-02,6651088,27,2716,28392,28392,28392,28392,2868347,2018,5,2
2,2018-05-01,6651088,67,1599,31698,31698,31698,31698,2172300,2018,5,1
3,2018-05-03,6651088,1,4795,27000,27000,27000,27000,1932309,2018,5,3
4,2018-04-30,6651088,34,1994,28980,28980,28980,28980,2172259,2018,4,30


In [23]:
# Cast the split-out date parts from text to 16-bit integers.
for date_part in ('year', 'month', 'day'):
    leave3[date_part] = leave3[date_part].astype('int16')

In [24]:
# Confirm year, month and day are now int16.
leave3.dtypes

DAYOFSERVICE       string
TRIPID              int32
PROGRNUMBER         int16
STOPPOINTID         int16
PLANNEDTIME_ARR     int32
PLANNEDTIME_DEP     int32
ACTUALTIME_ARR      int32
ACTUALTIME_DEP      int32
VEHICLEID           int32
year                int16
month               int16
day                 int16
dtype: object

In [25]:
# Preview the rows with the integer date parts.
leave3.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day
0,2018-04-30,6651088,12,2121,27558,27558,27558,27558,2172259,2018,4,30
1,2018-05-02,6651088,27,2716,28392,28392,28392,28392,2868347,2018,5,2
2,2018-05-01,6651088,67,1599,31698,31698,31698,31698,2172300,2018,5,1
3,2018-05-03,6651088,1,4795,27000,27000,27000,27000,1932309,2018,5,3
4,2018-04-30,6651088,34,1994,28980,28980,28980,28980,2172259,2018,4,30


## Create a Unique ID for Dataframe

In [26]:
# Keep text copies of the date parts (str_Year, str_Month, str_Day); they are
# concatenated into the id columns below.
leave3[['str_Year', 'str_Month', 'str_Day']] = leave3['DAYOFSERVICE'].str.split(pat='-', expand=True)

In [27]:
# Preview the new str_Year, str_Month and str_Day columns.
leave3.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,str_Year,str_Month,str_Day
0,2018-04-30,6651088,12,2121,27558,27558,27558,27558,2172259,2018,4,30,2018,4,30
1,2018-05-02,6651088,27,2716,28392,28392,28392,28392,2868347,2018,5,2,2018,5,2
2,2018-05-01,6651088,67,1599,31698,31698,31698,31698,2172300,2018,5,1,2018,5,1
3,2018-05-03,6651088,1,4795,27000,27000,27000,27000,1932309,2018,5,3,2018,5,3
4,2018-04-30,6651088,34,1994,28980,28980,28980,28980,2172259,2018,4,30,2018,4,30


In [28]:
# Convert DAYOFSERVICE back to datetime64[ns] to save kernel memory.
# The column holds 'YYYY-MM-DD' text at this point. pd.to_datetime with an
# explicit format replaces the unit-less astype('datetime64') cast, which
# pandas 2.x rejects.
leave3['DAYOFSERVICE'] = pd.to_datetime(leave3['DAYOFSERVICE'], format='%Y-%m-%d')

In [29]:
# Text copy of PROGRNUMBER, used as the last component of the id column.
leave3['str_progrnumber'] = leave3['PROGRNUMBER'].astype(pd.StringDtype())

In [30]:
# Preview the new str_progrnumber column.
leave3.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,str_Year,str_Month,str_Day,str_progrnumber
0,2018-04-30,6651088,12,2121,27558,27558,27558,27558,2172259,2018,4,30,2018,4,30,12
1,2018-05-02,6651088,27,2716,28392,28392,28392,28392,2868347,2018,5,2,2018,5,2,27
2,2018-05-01,6651088,67,1599,31698,31698,31698,31698,2172300,2018,5,1,2018,5,1,67
3,2018-05-03,6651088,1,4795,27000,27000,27000,27000,1932309,2018,5,3,2018,5,3,1
4,2018-04-30,6651088,34,1994,28980,28980,28980,28980,2172259,2018,4,30,2018,4,30,34


In [31]:
# Left-pad single-digit PROGRNUMBER strings with a zero ('1' -> '01').
# str.zfill(2) does this in one vectorised pass instead of nine separate
# np.where scans over the column. Values that are already two or more
# characters long are left unchanged. (A '0' value, if one ever occurred,
# would become '00'.)
leave3['str_progrnumber'] = leave3['str_progrnumber'].str.zfill(2)

In [32]:
# Build the row id by concatenating year, month, day, TRIPID and the
# zero-padded PROGRNUMBER text, in that order.
leave3['id'] = (
    leave3['str_Year']
    + leave3['str_Month']
    + leave3['str_Day']
    + leave3['TRIPID'].astype(str)
    + leave3['str_progrnumber']
)

In [33]:
# Preview the new id column.
leave3.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,str_Year,str_Month,str_Day,str_progrnumber,id
0,2018-04-30,6651088,12,2121,27558,27558,27558,27558,2172259,2018,4,30,2018,4,30,12,20180430665108812
1,2018-05-02,6651088,27,2716,28392,28392,28392,28392,2868347,2018,5,2,2018,5,2,27,20180502665108827
2,2018-05-01,6651088,67,1599,31698,31698,31698,31698,2172300,2018,5,1,2018,5,1,67,20180501665108867
3,2018-05-03,6651088,1,4795,27000,27000,27000,27000,1932309,2018,5,3,2018,5,3,1,20180503665108801
4,2018-04-30,6651088,34,1994,28980,28980,28980,28980,2172259,2018,4,30,2018,4,30,34,20180430665108834


In [34]:
# str_progrnumber was only needed to build id; remove it to free kernel memory.
leave3.drop(columns=['str_progrnumber'], inplace=True)

In [35]:
# Confirm str_progrnumber is gone.
leave3.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,str_Year,str_Month,str_Day,id
0,2018-04-30,6651088,12,2121,27558,27558,27558,27558,2172259,2018,4,30,2018,4,30,20180430665108812
1,2018-05-02,6651088,27,2716,28392,28392,28392,28392,2868347,2018,5,2,2018,5,2,20180502665108827
2,2018-05-01,6651088,67,1599,31698,31698,31698,31698,2172300,2018,5,1,2018,5,1,20180501665108867
3,2018-05-03,6651088,1,4795,27000,27000,27000,27000,1932309,2018,5,3,2018,5,3,20180503665108801
4,2018-04-30,6651088,34,1994,28980,28980,28980,28980,2172259,2018,4,30,2018,4,30,20180430665108834


In [36]:
# Order the rows by id. The id column is text, so the comparison is lexicographic.
leave3 = leave3.sort_values('id')

In [37]:
# Preview the rows in their new id order.
leave3.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,str_Year,str_Month,str_Day,id
5234,2018-04-29,6651125,1,6004,44100,44100,44100,44100,2406908,2018,4,29,2018,4,29,20180429665112501
5221,2018-04-29,6651125,2,7,44208,44208,44208,44208,2406908,2018,4,29,2018,4,29,20180429665112502
5203,2018-04-29,6651125,3,11,44402,44402,44402,44402,2406908,2018,4,29,2018,4,29,20180429665112503
5198,2018-04-29,6651125,4,14,44479,44479,44479,44479,2406908,2018,4,29,2018,4,29,20180429665112504
5227,2018-04-29,6651125,5,15,44567,44567,44567,44567,2406908,2018,4,29,2018,4,29,20180429665112505


## Create a Trip/Leave ID for Combining Trip & Leavetime Datasets

In [38]:
# Build the trip-level key by concatenating year, month, day and TRIPID
# (the same as id, but without the PROGRNUMBER suffix).
leave3['trip_leave_id'] = (
    leave3['str_Year']
    + leave3['str_Month']
    + leave3['str_Day']
    + leave3['TRIPID'].astype(str)
)

In [39]:
# Preview the new trip_leave_id column.
leave3.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,str_Year,str_Month,str_Day,id,trip_leave_id
5234,2018-04-29,6651125,1,6004,44100,44100,44100,44100,2406908,2018,4,29,2018,4,29,20180429665112501,201804296651125
5221,2018-04-29,6651125,2,7,44208,44208,44208,44208,2406908,2018,4,29,2018,4,29,20180429665112502,201804296651125
5203,2018-04-29,6651125,3,11,44402,44402,44402,44402,2406908,2018,4,29,2018,4,29,20180429665112503,201804296651125
5198,2018-04-29,6651125,4,14,44479,44479,44479,44479,2406908,2018,4,29,2018,4,29,20180429665112504,201804296651125
5227,2018-04-29,6651125,5,15,44567,44567,44567,44567,2406908,2018,4,29,2018,4,29,20180429665112505,201804296651125


In [40]:
# str_Year is no longer needed once both id columns exist; remove it to free kernel memory.
leave3.drop(columns=['str_Year'], inplace=True)

In [41]:
# Confirm str_Year is gone.
leave3.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,str_Month,str_Day,id,trip_leave_id
5234,2018-04-29,6651125,1,6004,44100,44100,44100,44100,2406908,2018,4,29,4,29,20180429665112501,201804296651125
5221,2018-04-29,6651125,2,7,44208,44208,44208,44208,2406908,2018,4,29,4,29,20180429665112502,201804296651125
5203,2018-04-29,6651125,3,11,44402,44402,44402,44402,2406908,2018,4,29,4,29,20180429665112503,201804296651125
5198,2018-04-29,6651125,4,14,44479,44479,44479,44479,2406908,2018,4,29,4,29,20180429665112504,201804296651125
5227,2018-04-29,6651125,5,15,44567,44567,44567,44567,2406908,2018,4,29,4,29,20180429665112505,201804296651125


In [42]:
# str_Month is no longer needed once both id columns exist; remove it to free kernel memory.
leave3.drop(columns=['str_Month'], inplace=True)

In [43]:
# Confirm str_Month is gone.
leave3.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,str_Day,id,trip_leave_id
5234,2018-04-29,6651125,1,6004,44100,44100,44100,44100,2406908,2018,4,29,29,20180429665112501,201804296651125
5221,2018-04-29,6651125,2,7,44208,44208,44208,44208,2406908,2018,4,29,29,20180429665112502,201804296651125
5203,2018-04-29,6651125,3,11,44402,44402,44402,44402,2406908,2018,4,29,29,20180429665112503,201804296651125
5198,2018-04-29,6651125,4,14,44479,44479,44479,44479,2406908,2018,4,29,29,20180429665112504,201804296651125
5227,2018-04-29,6651125,5,15,44567,44567,44567,44567,2406908,2018,4,29,29,20180429665112505,201804296651125


In [44]:
# str_Day is no longer needed once both id columns exist; remove it to free kernel memory.
leave3.drop(columns=['str_Day'], inplace=True)

In [45]:
# Final preview of the columns that will be written to CSV.
leave3.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,id,trip_leave_id
5234,2018-04-29,6651125,1,6004,44100,44100,44100,44100,2406908,2018,4,29,20180429665112501,201804296651125
5221,2018-04-29,6651125,2,7,44208,44208,44208,44208,2406908,2018,4,29,20180429665112502,201804296651125
5203,2018-04-29,6651125,3,11,44402,44402,44402,44402,2406908,2018,4,29,20180429665112503,201804296651125
5198,2018-04-29,6651125,4,14,44479,44479,44479,44479,2406908,2018,4,29,20180429665112504,201804296651125
5227,2018-04-29,6651125,5,15,44567,44567,44567,44567,2406908,2018,4,29,20180429665112505,201804296651125


## Send to CSV for Combination of Trip and Leave Data

In [46]:
# Write leave3 to v3a_leave3Combine_LT_2018.csv, leaving out the DataFrame index.
leave3.to_csv(path_or_buf='v3a_leave3Combine_LT_2018.csv', index=False)