In [1]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns

# Allows plots to appear directly in the notebook.
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

In [2]:
# Load the v3 leave-times extract for 2018 into the working DataFrame.
leave5 = pd.read_csv(
    'v3_leave5_LT_2018.csv',
    sep=',',
    skipinitialspace=True,
    keep_default_na=True,
)

In [3]:
# Sanity-check the load by previewing the first rows.
leave5.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID
0,28-AUG-18 00:00:00,7502493,14,7603,54710,54710,54710,54710,1932370
1,27-AUG-18 00:00:00,7502493,17,47,54891,54891,54891,54891,1932370
2,28-AUG-18 00:00:00,7502493,25,350,56029,56029,56029,56029,1932370
3,28-AUG-18 00:00:00,7502493,5,230,54257,54257,54257,54257,1932370
4,27-AUG-18 00:00:00,7502493,33,390,56703,56703,56703,56703,1932370


In [4]:
# (rows, columns) of the loaded frame.
leave5.shape

(19393626, 9)

In [5]:
# Inspect the dtypes pandas inferred on load, before any downcasting.
leave5.dtypes

DAYOFSERVICE       object
TRIPID              int64
PROGRNUMBER         int64
STOPPOINTID         int64
PLANNEDTIME_ARR     int64
PLANNEDTIME_DEP     int64
ACTUALTIME_ARR      int64
ACTUALTIME_DEP      int64
VEHICLEID           int64
dtype: object

<h3>Downcasting column data types to reduce the memory footprint of the leave5 DataFrame</h3>

In [6]:
# Shrink TRIPID from 64-bit to 32-bit integers to save memory.
leave5['TRIPID'] = leave5['TRIPID'].astype(np.int32)

In [7]:
# Shrink PROGRNUMBER from 64-bit to 16-bit integers to save memory.
leave5['PROGRNUMBER'] = leave5['PROGRNUMBER'].astype(np.int16)

In [8]:
# Shrink STOPPOINTID from 64-bit to 16-bit integers to save memory.
# NOTE(review): int16 tops out at 32767 - assumes no stop id exceeds that; confirm.
leave5['STOPPOINTID'] = leave5['STOPPOINTID'].astype(np.int16)

In [9]:
# Shrink PLANNEDTIME_ARR from 64-bit to 32-bit integers to save memory.
leave5['PLANNEDTIME_ARR'] = leave5['PLANNEDTIME_ARR'].astype(np.int32)

In [10]:
# Shrink PLANNEDTIME_DEP from 64-bit to 32-bit integers to save memory.
leave5['PLANNEDTIME_DEP'] = leave5['PLANNEDTIME_DEP'].astype(np.int32)

In [11]:
# Downcast the observed arrival time to int32.
# The source must be ACTUALTIME_ARR itself: casting from PLANNEDTIME_DEP would
# overwrite every observed arrival with the timetabled departure.
leave5['ACTUALTIME_ARR'] = leave5['ACTUALTIME_ARR'].astype('int32')

In [12]:
# Downcast the observed departure time to int32.
# The source must be ACTUALTIME_DEP itself: casting from PLANNEDTIME_DEP would
# overwrite every observed departure with the timetabled departure.
leave5['ACTUALTIME_DEP'] = leave5['ACTUALTIME_DEP'].astype('int32')

In [13]:
# Shrink VEHICLEID from 64-bit to 32-bit integers to save memory.
leave5['VEHICLEID'] = leave5['VEHICLEID'].astype(np.int32)

In [14]:
# Verify that the integer columns now carry the smaller dtypes.
leave5.dtypes

DAYOFSERVICE       object
TRIPID              int32
PROGRNUMBER         int16
STOPPOINTID         int16
PLANNEDTIME_ARR     int32
PLANNEDTIME_DEP     int32
ACTUALTIME_ARR      int32
ACTUALTIME_DEP      int32
VEHICLEID           int32
dtype: object

In [15]:
# Preview the frame after the dtype changes.
leave5.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID
0,28-AUG-18 00:00:00,7502493,14,7603,54710,54710,54710,54710,1932370
1,27-AUG-18 00:00:00,7502493,17,47,54891,54891,54891,54891,1932370
2,28-AUG-18 00:00:00,7502493,25,350,56029,56029,56029,56029,1932370
3,28-AUG-18 00:00:00,7502493,5,230,54257,54257,54257,54257,1932370
4,27-AUG-18 00:00:00,7502493,33,390,56703,56703,56703,56703,1932370


<h2>Add Day, Month, Year Columns</h2>

<h3>leave5:</h3>

In [16]:
# Re-check dtypes before touching DAYOFSERVICE (still an object column here).
leave5.dtypes

DAYOFSERVICE       object
TRIPID              int32
PROGRNUMBER         int16
STOPPOINTID         int16
PLANNEDTIME_ARR     int32
PLANNEDTIME_DEP     int32
ACTUALTIME_ARR      int32
ACTUALTIME_DEP      int32
VEHICLEID           int32
dtype: object

In [17]:
# Parse the raw 'DD-MON-YY HH:MM:SS' strings (e.g. '28-AUG-18 00:00:00') into
# datetimes. pd.to_datetime with an explicit format is used rather than
# astype('datetime64'): the unit-less 'datetime64' cast is rejected by
# pandas 2.x, and a fixed format avoids ambiguous per-row date guessing.
# NOTE(review): the format is taken from the rows shown by head(); confirm
# every row follows it, otherwise this raises instead of guessing.
leave5['DAYOFSERVICE'] = pd.to_datetime(leave5['DAYOFSERVICE'], format='%d-%b-%y %H:%M:%S')

In [18]:
# Preview: DAYOFSERVICE should now display as a date rather than raw text.
leave5.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID
0,2018-08-28,7502493,14,7603,54710,54710,54710,54710,1932370
1,2018-08-27,7502493,17,47,54891,54891,54891,54891,1932370
2,2018-08-28,7502493,25,350,56029,56029,56029,56029,1932370
3,2018-08-28,7502493,5,230,54257,54257,54257,54257,1932370
4,2018-08-27,7502493,33,390,56703,56703,56703,56703,1932370


In [19]:
# Render the dates explicitly as 'YYYY-MM-DD' text before the '-' split below.
# A plain astype('string') leaves the text layout up to pandas (it can include
# a time part), which would break splitting into exactly year, month and day.
leave5['DAYOFSERVICE'] = leave5['DAYOFSERVICE'].dt.strftime('%Y-%m-%d').astype('string')

In [20]:
# Preview DAYOFSERVICE after the cast to string.
leave5.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID
0,2018-08-28,7502493,14,7603,54710,54710,54710,54710,1932370
1,2018-08-27,7502493,17,47,54891,54891,54891,54891,1932370
2,2018-08-28,7502493,25,350,56029,56029,56029,56029,1932370
3,2018-08-28,7502493,5,230,54257,54257,54257,54257,1932370
4,2018-08-27,7502493,33,390,56703,56703,56703,56703,1932370


In [21]:
# Break the DAYOFSERVICE strings on '-' and store the three pieces as the
# year, month and day columns (still strings at this point).
leave5[['year', 'month', 'day']] = leave5['DAYOFSERVICE'].str.split(pat='-', expand=True)

In [22]:
# Preview the new year / month / day columns.
leave5.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day
0,2018-08-28,7502493,14,7603,54710,54710,54710,54710,1932370,2018,8,28
1,2018-08-27,7502493,17,47,54891,54891,54891,54891,1932370,2018,8,27
2,2018-08-28,7502493,25,350,56029,56029,56029,56029,1932370,2018,8,28
3,2018-08-28,7502493,5,230,54257,54257,54257,54257,1932370,2018,8,28
4,2018-08-27,7502493,33,390,56703,56703,56703,56703,1932370,2018,8,27


In [23]:
# Turn the split date parts from strings into compact 16-bit integers.
for date_part in ('year', 'month', 'day'):
    leave5[date_part] = leave5[date_part].astype('int16')

In [24]:
# Confirm year, month and day are now int16.
leave5.dtypes

DAYOFSERVICE       string
TRIPID              int32
PROGRNUMBER         int16
STOPPOINTID         int16
PLANNEDTIME_ARR     int32
PLANNEDTIME_DEP     int32
ACTUALTIME_ARR      int32
ACTUALTIME_DEP      int32
VEHICLEID           int32
year                int16
month               int16
day                 int16
dtype: object

In [25]:
# Preview the frame with the integer date-part columns in place.
leave5.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day
0,2018-08-28,7502493,14,7603,54710,54710,54710,54710,1932370,2018,8,28
1,2018-08-27,7502493,17,47,54891,54891,54891,54891,1932370,2018,8,27
2,2018-08-28,7502493,25,350,56029,56029,56029,56029,1932370,2018,8,28
3,2018-08-28,7502493,5,230,54257,54257,54257,54257,1932370,2018,8,28
4,2018-08-27,7502493,33,390,56703,56703,56703,56703,1932370,2018,8,27


## Create a Unique ID for Dataframe

In [26]:
# Keep string copies of the date parts; they are concatenated into the ids below.
leave5[['str_Year', 'str_Month', 'str_Day']] = leave5['DAYOFSERVICE'].str.split(pat='-', expand=True)

In [27]:
# Preview the str_Year / str_Month / str_Day helper columns.
leave5.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,str_Year,str_Month,str_Day
0,2018-08-28,7502493,14,7603,54710,54710,54710,54710,1932370,2018,8,28,2018,8,28
1,2018-08-27,7502493,17,47,54891,54891,54891,54891,1932370,2018,8,27,2018,8,27
2,2018-08-28,7502493,25,350,56029,56029,56029,56029,1932370,2018,8,28,2018,8,28
3,2018-08-28,7502493,5,230,54257,54257,54257,54257,1932370,2018,8,28,2018,8,28
4,2018-08-27,7502493,33,390,56703,56703,56703,56703,1932370,2018,8,27,2018,8,27


In [28]:
# Convert DAYOFSERVICE back to datetime to save kernel memory.
# The column holds 'YYYY-MM-DD' strings at this point (it was split on '-' into
# year/month/day above), so parse with that exact format. pd.to_datetime is
# used because the unit-less astype('datetime64') cast is rejected by pandas 2.x.
leave5['DAYOFSERVICE'] = pd.to_datetime(leave5['DAYOFSERVICE'], format='%Y-%m-%d')

In [29]:
# String copy of PROGRNUMBER, used as the trailing component of the row id.
leave5['str_progrnumber'] = leave5['PROGRNUMBER'].astype(pd.StringDtype())

In [30]:
# Preview the new str_progrnumber column.
leave5.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,str_Year,str_Month,str_Day,str_progrnumber
0,2018-08-28,7502493,14,7603,54710,54710,54710,54710,1932370,2018,8,28,2018,8,28,14
1,2018-08-27,7502493,17,47,54891,54891,54891,54891,1932370,2018,8,27,2018,8,27,17
2,2018-08-28,7502493,25,350,56029,56029,56029,56029,1932370,2018,8,28,2018,8,28,25
3,2018-08-28,7502493,5,230,54257,54257,54257,54257,1932370,2018,8,28,2018,8,28,5
4,2018-08-27,7502493,33,390,56703,56703,56703,56703,1932370,2018,8,27,2018,8,27,33


In [31]:
# Left-pad single-digit stop sequence numbers with a zero ('5' -> '05') so the
# progrnumber suffix of the id is at least two characters wide. Values that are
# already two or more characters long are left unchanged.
# One vectorised zfill replaces nine separate full-column np.where passes.
# NOTE(review): zfill also turns '0' into '00', which the np.where chain did
# not touch - assumes PROGRNUMBER starts at 1; confirm.
leave5['str_progrnumber'] = leave5['str_progrnumber'].str.zfill(2)

In [32]:
# Row id = year + month + day + TRIPID + zero-padded PROGRNUMBER, all as text.
# NOTE(review): the suffix is only padded to two characters, so ids get longer
# when PROGRNUMBER >= 100 - confirm downstream code tolerates that.
leave5['id'] = (
    leave5['str_Year']
    + leave5['str_Month']
    + leave5['str_Day']
    + leave5['TRIPID'].astype(str)
    + leave5['str_progrnumber']
)

In [33]:
# Preview the generated id column.
leave5.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,str_Year,str_Month,str_Day,str_progrnumber,id
0,2018-08-28,7502493,14,7603,54710,54710,54710,54710,1932370,2018,8,28,2018,8,28,14,20180828750249314
1,2018-08-27,7502493,17,47,54891,54891,54891,54891,1932370,2018,8,27,2018,8,27,17,20180827750249317
2,2018-08-28,7502493,25,350,56029,56029,56029,56029,1932370,2018,8,28,2018,8,28,25,20180828750249325
3,2018-08-28,7502493,5,230,54257,54257,54257,54257,1932370,2018,8,28,2018,8,28,5,20180828750249305
4,2018-08-27,7502493,33,390,56703,56703,56703,56703,1932370,2018,8,27,2018,8,27,33,20180827750249333


In [34]:
# str_progrnumber is no longer needed once id exists; remove it to free kernel memory.
leave5 = leave5.drop(columns='str_progrnumber')

In [35]:
# Confirm str_progrnumber is gone.
leave5.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,str_Year,str_Month,str_Day,id
0,2018-08-28,7502493,14,7603,54710,54710,54710,54710,1932370,2018,8,28,2018,8,28,20180828750249314
1,2018-08-27,7502493,17,47,54891,54891,54891,54891,1932370,2018,8,27,2018,8,27,20180827750249317
2,2018-08-28,7502493,25,350,56029,56029,56029,56029,1932370,2018,8,28,2018,8,28,20180828750249325
3,2018-08-28,7502493,5,230,54257,54257,54257,54257,1932370,2018,8,28,2018,8,28,20180828750249305
4,2018-08-27,7502493,33,390,56703,56703,56703,56703,1932370,2018,8,27,2018,8,27,20180827750249333


In [36]:
# Order the rows by id.
# NOTE(review): id is text, so this is a lexicographic sort, not a numeric one.
leave5 = leave5.sort_values('id')

In [37]:
# Preview the frame after sorting by id.
leave5.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,str_Year,str_Month,str_Day,id
1196,2018-08-26,7502507,1,7347,33600,33600,33600,33600,1000953,2018,8,26,2018,8,26,20180826750250701
1189,2018-08-26,7502507,2,3669,33682,33682,33682,33682,1000953,2018,8,26,2018,8,26,20180826750250702
1166,2018-08-26,7502507,3,7349,33742,33742,33742,33742,1000953,2018,8,26,2018,8,26,20180826750250703
1168,2018-08-26,7502507,4,1631,33814,33814,33814,33814,1000953,2018,8,26,2018,8,26,20180826750250704
1154,2018-08-26,7502507,5,1632,33830,33830,33830,33830,1000953,2018,8,26,2018,8,26,20180826750250705


## Create a Trip/Leave ID for Combining Trip & Leavetime Datasets

In [38]:
# Trip-level key = year + month + day + TRIPID (text), i.e. the row id without
# the PROGRNUMBER suffix.
leave5['trip_leave_id'] = (
    leave5['str_Year']
    + leave5['str_Month']
    + leave5['str_Day']
    + leave5['TRIPID'].astype(str)
)

In [39]:
# Preview the new trip_leave_id column.
leave5.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,str_Year,str_Month,str_Day,id,trip_leave_id
1196,2018-08-26,7502507,1,7347,33600,33600,33600,33600,1000953,2018,8,26,2018,8,26,20180826750250701,201808267502507
1189,2018-08-26,7502507,2,3669,33682,33682,33682,33682,1000953,2018,8,26,2018,8,26,20180826750250702,201808267502507
1166,2018-08-26,7502507,3,7349,33742,33742,33742,33742,1000953,2018,8,26,2018,8,26,20180826750250703,201808267502507
1168,2018-08-26,7502507,4,1631,33814,33814,33814,33814,1000953,2018,8,26,2018,8,26,20180826750250704,201808267502507
1154,2018-08-26,7502507,5,1632,33830,33830,33830,33830,1000953,2018,8,26,2018,8,26,20180826750250705,201808267502507


In [40]:
# Both ids are built, so the str_Year helper can go; remove it to free kernel memory.
leave5 = leave5.drop(columns='str_Year')

In [41]:
# Confirm str_Year is gone.
leave5.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,str_Month,str_Day,id,trip_leave_id
1196,2018-08-26,7502507,1,7347,33600,33600,33600,33600,1000953,2018,8,26,8,26,20180826750250701,201808267502507
1189,2018-08-26,7502507,2,3669,33682,33682,33682,33682,1000953,2018,8,26,8,26,20180826750250702,201808267502507
1166,2018-08-26,7502507,3,7349,33742,33742,33742,33742,1000953,2018,8,26,8,26,20180826750250703,201808267502507
1168,2018-08-26,7502507,4,1631,33814,33814,33814,33814,1000953,2018,8,26,8,26,20180826750250704,201808267502507
1154,2018-08-26,7502507,5,1632,33830,33830,33830,33830,1000953,2018,8,26,8,26,20180826750250705,201808267502507


In [42]:
# Remove the str_Month helper column to free kernel memory.
leave5 = leave5.drop(columns='str_Month')

In [43]:
# Confirm str_Month is gone.
leave5.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,str_Day,id,trip_leave_id
1196,2018-08-26,7502507,1,7347,33600,33600,33600,33600,1000953,2018,8,26,26,20180826750250701,201808267502507
1189,2018-08-26,7502507,2,3669,33682,33682,33682,33682,1000953,2018,8,26,26,20180826750250702,201808267502507
1166,2018-08-26,7502507,3,7349,33742,33742,33742,33742,1000953,2018,8,26,26,20180826750250703,201808267502507
1168,2018-08-26,7502507,4,1631,33814,33814,33814,33814,1000953,2018,8,26,26,20180826750250704,201808267502507
1154,2018-08-26,7502507,5,1632,33830,33830,33830,33830,1000953,2018,8,26,26,20180826750250705,201808267502507


In [44]:
# Remove the str_Day helper column to free kernel memory.
leave5 = leave5.drop(columns='str_Day')

In [45]:
# Confirm str_Day is gone; only id and trip_leave_id remain from the helper columns.
leave5.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,id,trip_leave_id
1196,2018-08-26,7502507,1,7347,33600,33600,33600,33600,1000953,2018,8,26,20180826750250701,201808267502507
1189,2018-08-26,7502507,2,3669,33682,33682,33682,33682,1000953,2018,8,26,20180826750250702,201808267502507
1166,2018-08-26,7502507,3,7349,33742,33742,33742,33742,1000953,2018,8,26,20180826750250703,201808267502507
1168,2018-08-26,7502507,4,1631,33814,33814,33814,33814,1000953,2018,8,26,20180826750250704,201808267502507
1154,2018-08-26,7502507,5,1632,33830,33830,33830,33830,1000953,2018,8,26,20180826750250705,201808267502507


## Send to CSV for Combination of Trip and Leave Data

In [46]:
# Write the prepared frame to CSV (index omitted) for the trip/leave combination step.
leave5.to_csv(path_or_buf='v3a_leave5Combine_LT_2018.csv', index=False)