In [1]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns

# Allows plots to appear directly in the notebook.
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

In [2]:
# Load the leave-times extract from disk into the leave6 DataFrame.
# skipinitialspace strips blanks that follow each comma delimiter.
leave6 = pd.read_csv(
    'v3_leave6_LT_2018.csv',
    sep=',',
    skipinitialspace=True,
    keep_default_na=True,
)

In [3]:
leave6.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID
0,14-SEP-18 00:00:00,8089322,8,107,38101,38101,38101,38101,2172264
1,14-SEP-18 00:00:00,8089322,10,7092,38229,38229,38229,38229,2172264
2,10-SEP-18 00:00:00,8089322,79,2139,43124,43124,43124,43124,2693186
3,11-SEP-18 00:00:00,8089322,44,1944,41269,41269,41269,41269,1932348
4,21-SEP-18 00:00:00,8089322,70,4665,42677,42677,42677,42677,3088370


In [4]:
leave6.shape

(19393293, 9)

In [5]:
leave6.dtypes

DAYOFSERVICE       object
TRIPID              int64
PROGRNUMBER         int64
STOPPOINTID         int64
PLANNEDTIME_ARR     int64
PLANNEDTIME_DEP     int64
ACTUALTIME_ARR      int64
ACTUALTIME_DEP      int64
VEHICLEID           int64
dtype: object

<h3>Changing data types to reduce the memory footprint of the leave6 DataFrame</h3>

In [6]:
# Downcast the integer columns to narrower widths so the ~19M-row frame takes
# less kernel memory.
#
# Every column is cast from ITSELF. The previous per-column cells built
# ACTUALTIME_ARR and ACTUALTIME_DEP from PLANNEDTIME_DEP, which silently
# replaced the observed arrival/departure times with the planned departure.
#
# NOTE(review): astype wraps silently on overflow; int16 tops out at 32767 —
# confirm no PROGRNUMBER / STOPPOINTID value exceeds that.
DOWNCAST_DTYPES = {
    'TRIPID': 'int32',
    'PROGRNUMBER': 'int16',
    'STOPPOINTID': 'int16',
    'PLANNEDTIME_ARR': 'int32',
    'PLANNEDTIME_DEP': 'int32',
    'ACTUALTIME_ARR': 'int32',
    'ACTUALTIME_DEP': 'int32',
    'VEHICLEID': 'int32',
}

# Cast one column at a time rather than the whole frame at once, so only a
# single column is duplicated in memory during the conversion.
for column, dtype in DOWNCAST_DTYPES.items():
    leave6[column] = leave6[column].astype(dtype)

In [14]:
leave6.dtypes

DAYOFSERVICE       object
TRIPID              int32
PROGRNUMBER         int16
STOPPOINTID         int16
PLANNEDTIME_ARR     int32
PLANNEDTIME_DEP     int32
ACTUALTIME_ARR      int32
ACTUALTIME_DEP      int32
VEHICLEID           int32
dtype: object

In [15]:
leave6.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID
0,14-SEP-18 00:00:00,8089322,8,107,38101,38101,38101,38101,2172264
1,14-SEP-18 00:00:00,8089322,10,7092,38229,38229,38229,38229,2172264
2,10-SEP-18 00:00:00,8089322,79,2139,43124,43124,43124,43124,2693186
3,11-SEP-18 00:00:00,8089322,44,1944,41269,41269,41269,41269,1932348
4,21-SEP-18 00:00:00,8089322,70,4665,42677,42677,42677,42677,3088370


<h2>Add Day, Month, Year Columns</h2>

<h3>leave6:</h3>

In [16]:
leave6.dtypes

DAYOFSERVICE       object
TRIPID              int32
PROGRNUMBER         int16
STOPPOINTID         int16
PLANNEDTIME_ARR     int32
PLANNEDTIME_DEP     int32
ACTUALTIME_ARR      int32
ACTUALTIME_DEP      int32
VEHICLEID           int32
dtype: object

In [17]:
# Parse the raw service-date text (e.g. '14-SEP-18 00:00:00') into datetimes.
# pd.to_datetime with an explicit format replaces the unit-less
# astype('datetime64') cast, which pandas 2.x rejects, and avoids format
# inference on a very large column.
# NOTE(review): %b matches month abbreviations for the active locale — this
# assumes an English locale; confirm on the target machine.
leave6['DAYOFSERVICE'] = pd.to_datetime(
    leave6['DAYOFSERVICE'], format='%d-%b-%y %H:%M:%S'
)

In [18]:
leave6.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID
0,2018-09-14,8089322,8,107,38101,38101,38101,38101,2172264
1,2018-09-14,8089322,10,7092,38229,38229,38229,38229,2172264
2,2018-09-10,8089322,79,2139,43124,43124,43124,43124,2693186
3,2018-09-11,8089322,44,1944,41269,41269,41269,41269,1932348
4,2018-09-21,8089322,70,4665,42677,42677,42677,42677,3088370


In [19]:
# Render the service date as text so it can be split into its parts below.
leave6['DAYOFSERVICE'] = leave6['DAYOFSERVICE'].astype(pd.StringDtype())

In [20]:
leave6.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID
0,2018-09-14,8089322,8,107,38101,38101,38101,38101,2172264
1,2018-09-14,8089322,10,7092,38229,38229,38229,38229,2172264
2,2018-09-10,8089322,79,2139,43124,43124,43124,43124,2693186
3,2018-09-11,8089322,44,1944,41269,41269,41269,41269,1932348
4,2018-09-21,8089322,70,4665,42677,42677,42677,42677,3088370


In [21]:
# Break the 'YYYY-MM-DD' service-date text on '-' into separate
# year, month and day columns (in that order).
leave6[['year', 'month', 'day']] = leave6['DAYOFSERVICE'].str.split(pat='-', expand=True)

In [22]:
leave6.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day
0,2018-09-14,8089322,8,107,38101,38101,38101,38101,2172264,2018,9,14
1,2018-09-14,8089322,10,7092,38229,38229,38229,38229,2172264,2018,9,14
2,2018-09-10,8089322,79,2139,43124,43124,43124,43124,2693186,2018,9,10
3,2018-09-11,8089322,44,1944,41269,41269,41269,41269,1932348,2018,9,11
4,2018-09-21,8089322,70,4665,42677,42677,42677,42677,3088370,2018,9,21


In [23]:
# Store the numeric date parts as compact 16-bit integers.
for date_part in ('year', 'month', 'day'):
    leave6[date_part] = leave6[date_part].astype('int16')

In [24]:
leave6.dtypes

DAYOFSERVICE       string
TRIPID              int32
PROGRNUMBER         int16
STOPPOINTID         int16
PLANNEDTIME_ARR     int32
PLANNEDTIME_DEP     int32
ACTUALTIME_ARR      int32
ACTUALTIME_DEP      int32
VEHICLEID           int32
year                int16
month               int16
day                 int16
dtype: object

In [25]:
leave6.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day
0,2018-09-14,8089322,8,107,38101,38101,38101,38101,2172264,2018,9,14
1,2018-09-14,8089322,10,7092,38229,38229,38229,38229,2172264,2018,9,14
2,2018-09-10,8089322,79,2139,43124,43124,43124,43124,2693186,2018,9,10
3,2018-09-11,8089322,44,1944,41269,41269,41269,41269,1932348,2018,9,11
4,2018-09-21,8089322,70,4665,42677,42677,42677,42677,3088370,2018,9,21


## Create a Unique ID for Dataframe

In [26]:
# Keep text copies of the date parts (leading zeros intact) for building the ids.
leave6[['str_Year', 'str_Month', 'str_Day']] = leave6['DAYOFSERVICE'].str.split(pat='-', expand=True)

In [27]:
leave6.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,str_Year,str_Month,str_Day
0,2018-09-14,8089322,8,107,38101,38101,38101,38101,2172264,2018,9,14,2018,9,14
1,2018-09-14,8089322,10,7092,38229,38229,38229,38229,2172264,2018,9,14,2018,9,14
2,2018-09-10,8089322,79,2139,43124,43124,43124,43124,2693186,2018,9,10,2018,9,10
3,2018-09-11,8089322,44,1944,41269,41269,41269,41269,1932348,2018,9,11,2018,9,11
4,2018-09-21,8089322,70,4665,42677,42677,42677,42677,3088370,2018,9,21,2018,9,21


In [28]:
# Convert DAYOFSERVICE from 'YYYY-MM-DD' text back to datetime to save kernel
# memory. pd.to_datetime with an explicit format replaces the unit-less
# astype('datetime64') cast, which pandas 2.x rejects.
leave6['DAYOFSERVICE'] = pd.to_datetime(leave6['DAYOFSERVICE'], format='%Y-%m-%d')

In [29]:
# Text copy of PROGRNUMBER, used as the trailing component of the row id.
leave6['str_progrnumber'] = leave6['PROGRNUMBER'].astype(pd.StringDtype())

In [30]:
leave6.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,str_Year,str_Month,str_Day,str_progrnumber
0,2018-09-14,8089322,8,107,38101,38101,38101,38101,2172264,2018,9,14,2018,9,14,8
1,2018-09-14,8089322,10,7092,38229,38229,38229,38229,2172264,2018,9,14,2018,9,14,10
2,2018-09-10,8089322,79,2139,43124,43124,43124,43124,2693186,2018,9,10,2018,9,10,79
3,2018-09-11,8089322,44,1944,41269,41269,41269,41269,1932348,2018,9,11,2018,9,11,44
4,2018-09-21,8089322,70,4665,42677,42677,42677,42677,3088370,2018,9,21,2018,9,21,70


In [31]:
# Left-pad single-digit stop sequence numbers with a zero ('8' -> '08') so the
# suffix appended to the id is at least two characters wide.
# A single vectorised zfill replaces nine separate full-column np.where passes.
# Note: unlike the old chain, a '0' value would also be padded (to '00').
leave6['str_progrnumber'] = leave6['str_progrnumber'].str.zfill(2)

In [32]:
# Assemble the row id by concatenating, in order:
#   year + month + day + TRIPID + zero-padded PROGRNUMBER
# NOTE(review): the components are joined without a separator and TRIPID /
# PROGRNUMBER are not fixed-width — confirm the resulting ids are unique.
leave6['id'] = (
    leave6['str_Year']
    + leave6['str_Month']
    + leave6['str_Day']
    + leave6['TRIPID'].astype(str)
    + leave6['str_progrnumber']
)

In [33]:
leave6.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,str_Year,str_Month,str_Day,str_progrnumber,id
0,2018-09-14,8089322,8,107,38101,38101,38101,38101,2172264,2018,9,14,2018,9,14,8,20180914808932208
1,2018-09-14,8089322,10,7092,38229,38229,38229,38229,2172264,2018,9,14,2018,9,14,10,20180914808932210
2,2018-09-10,8089322,79,2139,43124,43124,43124,43124,2693186,2018,9,10,2018,9,10,79,20180910808932279
3,2018-09-11,8089322,44,1944,41269,41269,41269,41269,1932348,2018,9,11,2018,9,11,44,20180911808932244
4,2018-09-21,8089322,70,4665,42677,42677,42677,42677,3088370,2018,9,21,2018,9,21,70,20180921808932270


In [34]:
# The padded PROGRNUMBER text is no longer needed once the id is built;
# remove it to free kernel memory.
leave6 = leave6.drop(columns='str_progrnumber')

In [35]:
leave6.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,str_Year,str_Month,str_Day,id
0,2018-09-14,8089322,8,107,38101,38101,38101,38101,2172264,2018,9,14,2018,9,14,20180914808932208
1,2018-09-14,8089322,10,7092,38229,38229,38229,38229,2172264,2018,9,14,2018,9,14,20180914808932210
2,2018-09-10,8089322,79,2139,43124,43124,43124,43124,2693186,2018,9,10,2018,9,10,20180910808932279
3,2018-09-11,8089322,44,1944,41269,41269,41269,41269,1932348,2018,9,11,2018,9,11,20180911808932244
4,2018-09-21,8089322,70,4665,42677,42677,42677,42677,3088370,2018,9,21,2018,9,21,20180921808932270


In [36]:
# Order the rows by id. The ids are text, so the ordering is lexicographic.
leave6 = leave6.sort_values('id')

In [37]:
leave6.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,str_Year,str_Month,str_Day,id
16031,2018-09-09,8089358,1,4795,45900,45900,45900,45900,3088378,2018,9,9,2018,9,9,20180909808935801
16018,2018-09-09,8089358,2,4688,46058,46058,46058,46058,3088378,2018,9,9,2018,9,9,20180909808935802
16017,2018-09-09,8089358,3,4689,46096,46096,46096,46096,3088378,2018,9,9,2018,9,9,20180909808935803
16016,2018-09-09,8089358,4,4410,46129,46129,46129,46129,3088378,2018,9,9,2018,9,9,20180909808935804
16014,2018-09-09,8089358,5,2678,46153,46153,46153,46153,3088378,2018,9,9,2018,9,9,20180909808935805


## Create a Trip/Leave ID for Combining Trip & Leavetime Datasets

In [38]:
# Key used to combine with the trips dataset:
#   year + month + day + TRIPID (no stop sequence number).
leave6['trip_leave_id'] = (
    leave6['str_Year']
    + leave6['str_Month']
    + leave6['str_Day']
    + leave6['TRIPID'].astype(str)
)

In [39]:
leave6.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,str_Year,str_Month,str_Day,id,trip_leave_id
16031,2018-09-09,8089358,1,4795,45900,45900,45900,45900,3088378,2018,9,9,2018,9,9,20180909808935801,201809098089358
16018,2018-09-09,8089358,2,4688,46058,46058,46058,46058,3088378,2018,9,9,2018,9,9,20180909808935802,201809098089358
16017,2018-09-09,8089358,3,4689,46096,46096,46096,46096,3088378,2018,9,9,2018,9,9,20180909808935803,201809098089358
16016,2018-09-09,8089358,4,4410,46129,46129,46129,46129,3088378,2018,9,9,2018,9,9,20180909808935804,201809098089358
16014,2018-09-09,8089358,5,2678,46153,46153,46153,46153,3088378,2018,9,9,2018,9,9,20180909808935805,201809098089358


In [40]:
# Remove the temporary str_Year helper column to free kernel memory.
leave6 = leave6.drop(columns='str_Year')

In [41]:
leave6.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,str_Month,str_Day,id,trip_leave_id
16031,2018-09-09,8089358,1,4795,45900,45900,45900,45900,3088378,2018,9,9,9,9,20180909808935801,201809098089358
16018,2018-09-09,8089358,2,4688,46058,46058,46058,46058,3088378,2018,9,9,9,9,20180909808935802,201809098089358
16017,2018-09-09,8089358,3,4689,46096,46096,46096,46096,3088378,2018,9,9,9,9,20180909808935803,201809098089358
16016,2018-09-09,8089358,4,4410,46129,46129,46129,46129,3088378,2018,9,9,9,9,20180909808935804,201809098089358
16014,2018-09-09,8089358,5,2678,46153,46153,46153,46153,3088378,2018,9,9,9,9,20180909808935805,201809098089358


In [42]:
# Remove the temporary str_Month helper column to free kernel memory.
leave6 = leave6.drop(columns='str_Month')

In [43]:
leave6.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,str_Day,id,trip_leave_id
16031,2018-09-09,8089358,1,4795,45900,45900,45900,45900,3088378,2018,9,9,9,20180909808935801,201809098089358
16018,2018-09-09,8089358,2,4688,46058,46058,46058,46058,3088378,2018,9,9,9,20180909808935802,201809098089358
16017,2018-09-09,8089358,3,4689,46096,46096,46096,46096,3088378,2018,9,9,9,20180909808935803,201809098089358
16016,2018-09-09,8089358,4,4410,46129,46129,46129,46129,3088378,2018,9,9,9,20180909808935804,201809098089358
16014,2018-09-09,8089358,5,2678,46153,46153,46153,46153,3088378,2018,9,9,9,20180909808935805,201809098089358


In [44]:
# Remove the temporary str_Day helper column to free kernel memory.
leave6 = leave6.drop(columns='str_Day')

In [45]:
leave6.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,id,trip_leave_id
16031,2018-09-09,8089358,1,4795,45900,45900,45900,45900,3088378,2018,9,9,20180909808935801,201809098089358
16018,2018-09-09,8089358,2,4688,46058,46058,46058,46058,3088378,2018,9,9,20180909808935802,201809098089358
16017,2018-09-09,8089358,3,4689,46096,46096,46096,46096,3088378,2018,9,9,20180909808935803,201809098089358
16016,2018-09-09,8089358,4,4410,46129,46129,46129,46129,3088378,2018,9,9,20180909808935804,201809098089358
16014,2018-09-09,8089358,5,2678,46153,46153,46153,46153,3088378,2018,9,9,20180909808935805,201809098089358


## Send to CSV for Combination of Trip and Leave Data

In [46]:
# Persist the prepared leave6 frame for the trip/leave combination step.
# index=False keeps the row index out of the file.
leave6.to_csv(
    path_or_buf='v3a_leave6Combine_LT_2018.csv',
    index=False,
)