In [1]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns

# Allows plots to appear directly in the notebook.
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

import pickle

In [2]:
# Load the cleaned trips data into a DataFrame.
# LINEID mixes purely numeric ids (e.g. '68') with alphanumeric ones (e.g. '25B').
# Pinning it to str keeps pandas' type inference from parsing some values as
# integers, which would leave the column with mixed int/str values and make the
# category codes later derived from LINEID inconsistent.
trips = pd.read_csv(
    'cleaned_trips.csv',
    dtype={'LINEID': str},
    skipinitialspace=True,
)

In [3]:
# Preview the first five rows to confirm the file loaded as expected.
trips.head(n=5)

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,...,month,day,dayOfWeek,arrival_diff,departure_diff,planned_duration,actual_duration,duration_diff,weekend,rushHour
0,2018-02-07,6253783,68,68_80,1,87245,84600,87524,84600,28-FEB-18 12:05:11,...,2,7,2,279,0,2645,2924,279,0,0
1,2018-02-07,6262138,25B,25B_271,2,30517,26460,32752,26460,28-FEB-18 12:05:11,...,2,7,2,2235,0,4057,6292,2235,0,0
2,2018-02-07,6254942,45A,45A_70,2,35512,32100,36329,32082,28-FEB-18 12:05:11,...,2,7,2,817,-18,3412,4247,835,0,0
3,2018-02-07,6259460,25A,25A_273,1,57261,54420,58463,54443,28-FEB-18 12:05:11,...,2,7,2,1202,23,2841,4020,1179,0,0
4,2018-02-07,6253175,14,14_15,1,85383,81600,84682,81608,28-FEB-18 12:05:11,...,2,7,2,-701,8,3783,3074,-709,0,0


In [4]:
# Inspect the dtype of every column before any conversions are applied.
trips.dtypes

DAYOFSERVICE        object
TRIPID               int64
LINEID              object
ROUTEID             object
DIRECTION            int64
PLANNEDTIME_ARR      int64
PLANNEDTIME_DEP      int64
ACTUALTIME_ARR       int64
ACTUALTIME_DEP       int64
LASTUPDATE          object
NOTE                object
planDep_time         int64
month                int64
day                  int64
dayOfWeek            int64
arrival_diff         int64
departure_diff       int64
planned_duration     int64
actual_duration      int64
duration_diff        int64
weekend              int64
rushHour             int64
dtype: object

In [5]:
# Shrink the small-range integer columns and make LINEID categorical.
# A single astype call with a column -> dtype mapping applies all conversions
# at once; columns not listed keep their current dtype.
trips = trips.astype({
    'DIRECTION': 'int16',
    'planDep_time': 'int32',
    'month': 'int16',
    'day': 'int16',
    'dayOfWeek': 'int16',
    'weekend': 'int16',
    'rushHour': 'int16',
    'LINEID': 'category',
})

## Encoding Line IDs (Route Numbers) as Integer Codes

In [6]:
# Categorical encoding: store the integer category code of each LINEID
# in a new numeric column.
line_codes = trips['LINEID'].cat.codes
trips['num_lineID'] = line_codes

# Preview the frame with the new column appended.
trips.head(n=5)

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,...,day,dayOfWeek,arrival_diff,departure_diff,planned_duration,actual_duration,duration_diff,weekend,rushHour,num_lineID
0,2018-02-07,6253783,68,68_80,1,87245,84600,87524,84600,28-FEB-18 12:05:11,...,7,2,279,0,2645,2924,279,0,0,106
1,2018-02-07,6262138,25B,25B_271,2,30517,26460,32752,26460,28-FEB-18 12:05:11,...,7,2,2235,0,4057,6292,2235,0,0,39
2,2018-02-07,6254942,45A,45A_70,2,35512,32100,36329,32082,28-FEB-18 12:05:11,...,7,2,817,-18,3412,4247,835,0,0,85
3,2018-02-07,6259460,25A,25A_273,1,57261,54420,58463,54443,28-FEB-18 12:05:11,...,7,2,1202,23,2841,4020,1179,0,0,38
4,2018-02-07,6253175,14,14_15,1,85383,81600,84682,81608,28-FEB-18 12:05:11,...,7,2,-701,8,3783,3074,-709,0,0,13


In [7]:
# Re-check the column dtypes now that num_lineID has been added.
trips.dtypes

DAYOFSERVICE          object
TRIPID                 int64
LINEID              category
ROUTEID               object
DIRECTION              int16
PLANNEDTIME_ARR        int64
PLANNEDTIME_DEP        int64
ACTUALTIME_ARR         int64
ACTUALTIME_DEP         int64
LASTUPDATE            object
NOTE                  object
planDep_time           int32
month                  int16
day                    int16
dayOfWeek              int16
arrival_diff           int64
departure_diff         int64
planned_duration       int64
actual_duration        int64
duration_diff          int64
weekend                int16
rushHour               int16
num_lineID             int16
dtype: object

In [8]:
# Convert the service date to pandas' dedicated string dtype.
service_date = trips['DAYOFSERVICE']
trips['DAYOFSERVICE'] = service_date.astype(pd.StringDtype())

In [9]:
# Split the service date on '-' into three string columns (year, month, day).
date_parts = trips['DAYOFSERVICE'].str.split('-', expand=True)
trips[['str_Year', 'str_Month', 'str_Day']] = date_parts

In [10]:
# Preview the frame with the new date-part columns.
trips.head(n=5)

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,...,departure_diff,planned_duration,actual_duration,duration_diff,weekend,rushHour,num_lineID,str_Year,str_Month,str_Day
0,2018-02-07,6253783,68,68_80,1,87245,84600,87524,84600,28-FEB-18 12:05:11,...,0,2645,2924,279,0,0,106,2018,2,7
1,2018-02-07,6262138,25B,25B_271,2,30517,26460,32752,26460,28-FEB-18 12:05:11,...,0,4057,6292,2235,0,0,39,2018,2,7
2,2018-02-07,6254942,45A,45A_70,2,35512,32100,36329,32082,28-FEB-18 12:05:11,...,-18,3412,4247,835,0,0,85,2018,2,7
3,2018-02-07,6259460,25A,25A_273,1,57261,54420,58463,54443,28-FEB-18 12:05:11,...,23,2841,4020,1179,0,0,38,2018,2,7
4,2018-02-07,6253175,14,14_15,1,85383,81600,84682,81608,28-FEB-18 12:05:11,...,8,3783,3074,-709,0,0,13,2018,2,7


## Create a Trip/Leave ID for Combining Trip & Leavetime Datasets

In [11]:
# Build the join key: year + month + day text followed by the trip id as text.
date_key = trips['str_Year'] + trips['str_Month'] + trips['str_Day']
trips['trip_leave_id'] = date_key + trips['TRIPID'].map(str)

In [12]:
# Preview the frame to check the new trip_leave_id column.
trips.head(n=5)

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,...,planned_duration,actual_duration,duration_diff,weekend,rushHour,num_lineID,str_Year,str_Month,str_Day,trip_leave_id
0,2018-02-07,6253783,68,68_80,1,87245,84600,87524,84600,28-FEB-18 12:05:11,...,2645,2924,279,0,0,106,2018,2,7,201802076253783
1,2018-02-07,6262138,25B,25B_271,2,30517,26460,32752,26460,28-FEB-18 12:05:11,...,4057,6292,2235,0,0,39,2018,2,7,201802076262138
2,2018-02-07,6254942,45A,45A_70,2,35512,32100,36329,32082,28-FEB-18 12:05:11,...,3412,4247,835,0,0,85,2018,2,7,201802076254942
3,2018-02-07,6259460,25A,25A_273,1,57261,54420,58463,54443,28-FEB-18 12:05:11,...,2841,4020,1179,0,0,38,2018,2,7,201802076259460
4,2018-02-07,6253175,14,14_15,1,85383,81600,84682,81608,28-FEB-18 12:05:11,...,3783,3074,-709,0,0,13,2018,2,7,201802076253175


## Create a Trips DataFrame to Combine with the Leavetime Dataset

In [15]:
# Keep only the columns to be exported, in this order, as an independent copy
# so later changes to tripsCombine do not touch trips.
combine_columns = [
    'trip_leave_id',
    'LINEID',
    'num_lineID',
    'DIRECTION',
    'actual_duration',
    'dayOfWeek',
    'weekend',
    'rushHour',
]
tripsCombine = trips[combine_columns].copy()

In [16]:
# Preview the reduced frame before exporting it.
tripsCombine.head(n=5)

Unnamed: 0,trip_leave_id,LINEID,num_lineID,DIRECTION,actual_duration,dayOfWeek,weekend,rushHour
0,201802076253783,68,106,1,2924,2,0,0
1,201802076262138,25B,39,2,6292,2,0,0
2,201802076254942,45A,85,2,4247,2,0,0
3,201802076259460,25A,38,1,4020,2,0,0
4,201802076253175,14,13,1,3074,2,0,0


## Send to CSV for Combination of Trip and Leave Data

In [17]:
# Write tripsCombine to CSV, leaving out the row index.
output_path = 'v3b_tripsCombine_LT_2018.csv'
tripsCombine.to_csv(output_path, index=False)