In [1]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns

# Allows plots to appear directly in the notebook.
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

import pickle

In [2]:
# Read the tripLeave1 CSV file into a DataFrame.
# LINEID holds route labels such as '16' and '16C'. Pinning it to str stops the
# chunked parser from inferring integers for some chunks of this large file,
# which would leave a mixed int/str column that the string comparisons used
# later to pick out routes would silently fail to match.
wk = pd.read_csv(
    'v4_tripLeave1_LT_2018.csv',
    keep_default_na=True,
    delimiter=',',
    skipinitialspace=True,
    dtype={'LINEID': str},
)

In [3]:
# Report the (rows, columns) dimensions of the loaded frame.
wk.shape

(18362590, 21)

In [4]:
# Preview the first rows of the loaded frame.
wk.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,...,day,id,trip_leave_id,LINEID,num_lineID,DIRECTION,actual_duration,dayOfWeek,weekend,rushHour
0,2018-01-01,5955277,1,7347,30000,30000,30000,30000,1001127,2018,...,1,20180101595527701,201801015955277,16,24.0,1.0,3155.0,0.0,0.0,1.0
1,2018-01-01,5955277,2,3669,30080,30080,30080,30080,1001127,2018,...,1,20180101595527702,201801015955277,16,24.0,1.0,3155.0,0.0,0.0,1.0
2,2018-01-01,5955277,3,7349,30138,30138,30138,30138,1001127,2018,...,1,20180101595527703,201801015955277,16,24.0,1.0,3155.0,0.0,0.0,1.0
3,2018-01-01,5955277,4,1631,30206,30206,30206,30206,1001127,2018,...,1,20180101595527704,201801015955277,16,24.0,1.0,3155.0,0.0,0.0,1.0
4,2018-01-01,5955277,5,1632,30221,30221,30221,30221,1001127,2018,...,1,20180101595527705,201801015955277,16,24.0,1.0,3155.0,0.0,0.0,1.0


In [5]:
# List the dtype of every column in the loaded frame.
wk.dtypes

DAYOFSERVICE        object
TRIPID               int64
PROGRNUMBER          int64
STOPPOINTID          int64
PLANNEDTIME_ARR      int64
PLANNEDTIME_DEP      int64
ACTUALTIME_ARR       int64
ACTUALTIME_DEP       int64
VEHICLEID            int64
year                 int64
month                int64
day                  int64
id                   int64
trip_leave_id        int64
LINEID              object
num_lineID         float64
DIRECTION          float64
actual_duration    float64
dayOfWeek          float64
weekend            float64
rushHour           float64
dtype: object

## Add Actual Arrival Time Hour

In [6]:
# Hour of the day (0-23) in which the actual arrival happened.
# ACTUALTIME_ARR is divided by 3600, i.e. it is treated as a count of seconds
# (presumably seconds since midnight of the service day -- TODO confirm).
# Floor division keeps an arrival inside the hour it occurred in; rounding
# pushed anything past the half hour into the following hour. The modulo wraps
# times of 24 h or more back onto the clock, and the int32 cast stores the
# result without decimals.
wk['actualARR_hour'] = ((wk['ACTUALTIME_ARR'] // 3600) % 24).astype('int32')

In [None]:
# Preview the frame after adding the actualARR_hour column.
wk.head()

In [7]:
# An hour value of 24 is replaced with 0; every other value is kept as it is.
wk['actualARR_hour'] = wk['actualARR_hour'].mask(wk['actualARR_hour'].eq(24), 0)

In [8]:
# Store actualARR_hour as a 32-bit integer so the hours carry no decimal part.
wk['actualARR_hour'] = wk['actualARR_hour'].astype(dtype='int32')

In [9]:
# Preview the frame after the actualARR_hour clean-up.
wk.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,...,id,trip_leave_id,LINEID,num_lineID,DIRECTION,actual_duration,dayOfWeek,weekend,rushHour,actualARR_hour
0,2018-01-01,5955277,1,7347,30000,30000,30000,30000,1001127,2018,...,20180101595527701,201801015955277,16,24.0,1.0,3155.0,0.0,0.0,1.0,8
1,2018-01-01,5955277,2,3669,30080,30080,30080,30080,1001127,2018,...,20180101595527702,201801015955277,16,24.0,1.0,3155.0,0.0,0.0,1.0,8
2,2018-01-01,5955277,3,7349,30138,30138,30138,30138,1001127,2018,...,20180101595527703,201801015955277,16,24.0,1.0,3155.0,0.0,0.0,1.0,8
3,2018-01-01,5955277,4,1631,30206,30206,30206,30206,1001127,2018,...,20180101595527704,201801015955277,16,24.0,1.0,3155.0,0.0,0.0,1.0,8
4,2018-01-01,5955277,5,1632,30221,30221,30221,30221,1001127,2018,...,20180101595527705,201801015955277,16,24.0,1.0,3155.0,0.0,0.0,1.0,8


## Datatype Changes

In [10]:
# Cast the float-typed columns to integers: int16 for the identifier and flag
# columns, int64 for the duration column. Columns are converted one at a time,
# in the same order as before.
for column_name, target_dtype in {
    'num_lineID': 'int16',
    'DIRECTION': 'int16',
    'actual_duration': 'int64',
    'dayOfWeek': 'int16',
    'weekend': 'int16',
    'rushHour': 'int16',
}.items():
    wk[column_name] = wk[column_name].astype(target_dtype)

In [11]:
# Preview the frame after the integer casts.
wk.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,...,id,trip_leave_id,LINEID,num_lineID,DIRECTION,actual_duration,dayOfWeek,weekend,rushHour,actualARR_hour
0,2018-01-01,5955277,1,7347,30000,30000,30000,30000,1001127,2018,...,20180101595527701,201801015955277,16,24,1,3155,0,0,1,8
1,2018-01-01,5955277,2,3669,30080,30080,30080,30080,1001127,2018,...,20180101595527702,201801015955277,16,24,1,3155,0,0,1,8
2,2018-01-01,5955277,3,7349,30138,30138,30138,30138,1001127,2018,...,20180101595527703,201801015955277,16,24,1,3155,0,0,1,8
3,2018-01-01,5955277,4,1631,30206,30206,30206,30206,1001127,2018,...,20180101595527704,201801015955277,16,24,1,3155,0,0,1,8
4,2018-01-01,5955277,5,1632,30221,30221,30221,30221,1001127,2018,...,20180101595527705,201801015955277,16,24,1,3155,0,0,1,8


In [None]:
# Re-check the column dtypes after the integer casts.
wk.dtypes

## Route 16

In [12]:
# Select every row of wk whose LINEID is '16'.
lwk = wk.loc[wk['LINEID'].eq('16')]

In [13]:
# Dimensions of the route 16 selection.
lwk.shape

(547911, 22)

In [14]:
# Remove every route 16 row from wk, in place.
wk.drop(index=wk.index[wk['LINEID'].eq('16')], inplace=True)

In [None]:
# Preview what is left in wk after removing route 16.
wk.head()

In [None]:
# Dimensions of wk after removing route 16.
wk.shape

<h3>Send to CSV</h3>

In [None]:
# Send df to csv:
# lwk is re-bound for every route in this notebook, so confirm it still holds
# route 16 rows (and is not empty) before writing; a cell run out of order
# would otherwise export the wrong data under this file name.
assert not lwk.empty and lwk['LINEID'].eq('16').all(), "lwk does not hold route 16 rows"
lwk.to_csv('v5_16_tripLeave1_sp0.csv', index=False)

## Route 16C

In [15]:
# Select every row of wk whose LINEID is '16C'.
lwk = wk.loc[wk['LINEID'].eq('16C')]

In [16]:
# Dimensions of the route 16C selection.
lwk.shape

(9131, 22)

In [17]:
# Remove every route 16C row from wk, in place.
wk.drop(index=wk.index[wk['LINEID'].eq('16C')], inplace=True)

In [None]:
# Preview what is left in wk after removing route 16C.
wk.head()

In [None]:
# Dimensions of wk after removing route 16C.
wk.shape

<h3>Send to CSV</h3>

In [None]:
# Send df to csv:
# lwk is re-bound for every route in this notebook, so confirm it still holds
# route 16C rows (and is not empty) before writing; a cell run out of order
# would otherwise export the wrong data under this file name.
assert not lwk.empty and lwk['LINEID'].eq('16C').all(), "lwk does not hold route 16C rows"
lwk.to_csv('v5_16C_tripLeave1_sp0.csv', index=False)

## Route 40

In [18]:
# Select every row of wk whose LINEID is '40'.
lwk = wk.loc[wk['LINEID'].eq('40')]

In [19]:
# Dimensions of the route 40 selection.
lwk.shape

(688020, 22)

In [20]:
# Remove every route 40 row from wk, in place.
wk.drop(index=wk.index[wk['LINEID'].eq('40')], inplace=True)

In [None]:
# Preview what is left in wk after removing route 40.
wk.head()

In [None]:
# Dimensions of wk after removing route 40.
wk.shape

<h3>Send to CSV</h3>

In [None]:
# Send df to csv:
# lwk is re-bound for every route in this notebook, so confirm it still holds
# route 40 rows (and is not empty) before writing; a cell run out of order
# would otherwise export the wrong data under this file name.
assert not lwk.empty and lwk['LINEID'].eq('40').all(), "lwk does not hold route 40 rows"
lwk.to_csv('v5_40_tripLeave1_sp0.csv', index=False)

## Route 25B

In [21]:
# Select every row of wk whose LINEID is '25B'.
lwk = wk.loc[wk['LINEID'].eq('25B')]

In [22]:
# Dimensions of the route 25B selection.
lwk.shape

(165856, 22)

In [23]:
# Remove every route 25B row from wk, in place.
wk.drop(index=wk.index[wk['LINEID'].eq('25B')], inplace=True)

In [None]:
# Preview what is left in wk after removing route 25B.
wk.head()

In [None]:
# Dimensions of wk after removing route 25B.
wk.shape

<h3>Send to CSV</h3>

In [None]:
# Send df to csv:
# lwk is re-bound for every route in this notebook, so confirm it still holds
# route 25B rows (and is not empty) before writing; a cell run out of order
# would otherwise export the wrong data under this file name.
assert not lwk.empty and lwk['LINEID'].eq('25B').all(), "lwk does not hold route 25B rows"
lwk.to_csv('v5_25B_tripLeave1_sp0.csv', index=False)

## Route 25A

In [24]:
# Select every row of wk whose LINEID is '25A'.
lwk = wk.loc[wk['LINEID'].eq('25A')]

In [25]:
# Dimensions of the route 25A selection.
lwk.shape

(171037, 22)

In [26]:
# Remove every route 25A row from wk, in place.
wk.drop(index=wk.index[wk['LINEID'].eq('25A')], inplace=True)

In [None]:
# Preview what is left in wk after removing route 25A.
wk.head()

In [None]:
# Dimensions of wk after removing route 25A.
wk.shape

<h3>Send to CSV</h3>

In [None]:
# Send df to csv:
# lwk is re-bound for every route in this notebook, so confirm it still holds
# route 25A rows (and is not empty) before writing; a cell run out of order
# would otherwise export the wrong data under this file name.
assert not lwk.empty and lwk['LINEID'].eq('25A').all(), "lwk does not hold route 25A rows"
lwk.to_csv('v5_25A_tripLeave1_sp0.csv', index=False)

## Route 15

In [27]:
# Select every row of wk whose LINEID is '15'.
lwk = wk.loc[wk['LINEID'].eq('15')]

In [28]:
# Dimensions of the route 15 selection.
lwk.shape

(612829, 22)

In [29]:
# Remove every route 15 row from wk, in place.
wk.drop(index=wk.index[wk['LINEID'].eq('15')], inplace=True)

In [None]:
# Preview what is left in wk after removing route 15.
wk.head()

In [None]:
# Dimensions of wk after removing route 15.
wk.shape

<h3>Send to CSV</h3>

In [None]:
# Send df to csv:
# lwk is re-bound for every route in this notebook, so confirm it still holds
# route 15 rows (and is not empty) before writing; a cell run out of order
# would otherwise export the wrong data under this file name.
assert not lwk.empty and lwk['LINEID'].eq('15').all(), "lwk does not hold route 15 rows"
lwk.to_csv('v5_15_tripLeave1_sp0.csv', index=False)

## Route 47

In [30]:
# Select every row of wk whose LINEID is '47'.
lwk = wk.loc[wk['LINEID'].eq('47')]

In [31]:
# Dimensions of the route 47 selection.
lwk.shape

(78186, 22)

In [32]:
# Remove every route 47 row from wk, in place.
wk.drop(index=wk.index[wk['LINEID'].eq('47')], inplace=True)

In [None]:
# Preview what is left in wk after removing route 47.
wk.head()

In [None]:
# Dimensions of wk after removing route 47.
wk.shape

<h3>Send to CSV</h3>

In [None]:
# Send df to csv:
# lwk is re-bound for every route in this notebook, so confirm it still holds
# route 47 rows (and is not empty) before writing; a cell run out of order
# would otherwise export the wrong data under this file name.
assert not lwk.empty and lwk['LINEID'].eq('47').all(), "lwk does not hold route 47 rows"
lwk.to_csv('v5_47_tripLeave1_sp0.csv', index=False)

## Route 33

In [33]:
# Select every row of wk whose LINEID is '33'.
lwk = wk.loc[wk['LINEID'].eq('33')]

In [34]:
# Dimensions of the route 33 selection.
lwk.shape

(219692, 22)

In [35]:
# Remove every route 33 row from wk, in place.
wk.drop(index=wk.index[wk['LINEID'].eq('33')], inplace=True)

In [36]:
# Preview what is left in wk after removing route 33.
wk.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,...,id,trip_leave_id,LINEID,num_lineID,DIRECTION,actual_duration,dayOfWeek,weekend,rushHour,actualARR_hour
3177,2018-01-01,5955456,1,6048,72900,72900,72900,72900,2534846,2018,...,20180101595545601,201801015955456,33A,56,2,2579,0,0,0,20
3178,2018-01-01,5955456,2,7173,72942,72942,72942,72942,2534846,2018,...,20180101595545602,201801015955456,33A,56,2,2579,0,0,0,20
3179,2018-01-01,5955456,3,3811,72968,72968,72968,72968,2534846,2018,...,20180101595545603,201801015955456,33A,56,2,2579,0,0,0,20
3180,2018-01-01,5955456,4,3812,72993,72993,72993,72993,2534846,2018,...,20180101595545604,201801015955456,33A,56,2,2579,0,0,0,20
3181,2018-01-01,5955456,5,3664,73014,73014,73014,73014,2534846,2018,...,20180101595545605,201801015955456,33A,56,2,2579,0,0,0,20


In [37]:
# Dimensions of wk after removing route 33.
wk.shape

(15869928, 22)

<h3>Send to CSV</h3>

In [38]:
# Send df to csv:
# lwk is re-bound for every route in this notebook, so confirm it still holds
# route 33 rows (and is not empty) before writing; a cell run out of order
# would otherwise export the wrong data under this file name.
assert not lwk.empty and lwk['LINEID'].eq('33').all(), "lwk does not hold route 33 rows"
lwk.to_csv('v5_33_tripLeave1_sp0.csv', index=False)

## Save the remaining routes to continue from v5_tripLeave1_route_split_1

In [39]:
# Send df to csv:
# Every route split out earlier in this notebook must already be gone from wk;
# otherwise its rows would be exported both here and in the per-route file.
assert not wk['LINEID'].isin(
    ['16', '16C', '40', '25B', '25A', '15', '47', '33']
).any(), "wk still holds rows of an already extracted route"
wk.to_csv('v5_tripLeave1_route_split_1.csv', index=False)