In [1]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns

# Allows plots to appear directly in the notebook.
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

import pickle

In [2]:
# Read the trips CSV file into a DataFrame.
# LINEID holds route codes that mix purely numeric values (e.g. 68) with
# alphanumeric ones (e.g. 25B). Pinning the column to str stops the chunked
# parser from inferring ints for some chunks and strings for others, which
# would leave a mixed-type object column (and raise a DtypeWarning).
trips = pd.read_csv(
    'v3b_tripsCombine_LT_2018.csv',
    keep_default_na=True,
    delimiter=',',
    skipinitialspace=True,
    dtype={'LINEID': str},
)

In [3]:
# Load the leave6Combine CSV file into a DataFrame.
leave6 = pd.read_csv(
    'v3a_leave6Combine_LT_2018.csv',
    sep=',',
    skipinitialspace=True,
    keep_default_na=True,
)

In [4]:
# Preview the first five rows of trips.
trips.head(n=5)

Unnamed: 0,trip_leave_id,LINEID,num_lineID,DIRECTION,actual_duration,dayOfWeek,weekend,rushHour
0,201802076253783,68,106,1,2924,2,0,0
1,201802076262138,25B,39,2,6292,2,0,0
2,201802076254942,45A,85,2,4247,2,0,0
3,201802076259460,25A,38,1,4020,2,0,0
4,201802076253175,14,13,1,3074,2,0,0


In [5]:
# Preview the first five rows of leave6.
leave6.head(n=5)

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,month,day,id,trip_leave_id
0,2018-09-09,8089358,1,4795,45900,45900,45900,45900,3088378,2018,9,9,20180909808935801,201809098089358
1,2018-09-09,8089358,2,4688,46058,46058,46058,46058,3088378,2018,9,9,20180909808935802,201809098089358
2,2018-09-09,8089358,3,4689,46096,46096,46096,46096,3088378,2018,9,9,20180909808935803,201809098089358
3,2018-09-09,8089358,4,4410,46129,46129,46129,46129,3088378,2018,9,9,20180909808935804,201809098089358
4,2018-09-09,8089358,5,2678,46153,46153,46153,46153,3088378,2018,9,9,20180909808935805,201809098089358


In [6]:
# Inspect the dtype pandas assigned to each trips column.
trips.dtypes

trip_leave_id       int64
LINEID             object
num_lineID          int64
DIRECTION           int64
actual_duration     int64
dayOfWeek           int64
weekend             int64
rushHour            int64
dtype: object

In [7]:
# Inspect the dtype pandas assigned to each leave6 column.
leave6.dtypes

DAYOFSERVICE       object
TRIPID              int64
PROGRNUMBER         int64
STOPPOINTID         int64
PLANNEDTIME_ARR     int64
PLANNEDTIME_DEP     int64
ACTUALTIME_ARR      int64
ACTUALTIME_DEP      int64
VEHICLEID           int64
year                int64
month               int64
day                 int64
id                  int64
trip_leave_id       int64
dtype: object

## Combine Trip & Leave Data

In [8]:
# Left-join the trips columns onto every leave6 row via trip_leave_id;
# leave6 rows with no matching trip get NaN in the trips columns.
tripLeave6 = leave6.merge(trips, on="trip_leave_id", how="left")

In [9]:
# (rows, columns) of the merged frame.
tripLeave6.shape

(19393293, 21)

In [10]:
# Preview the first five rows of the merged frame.
tripLeave6.head(n=5)

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,...,day,id,trip_leave_id,LINEID,num_lineID,DIRECTION,actual_duration,dayOfWeek,weekend,rushHour
0,2018-09-09,8089358,1,4795,45900,45900,45900,45900,3088378,2018,...,9,20180909808935801,201809098089358,40,70.0,2.0,5853.0,6.0,1.0,0.0
1,2018-09-09,8089358,2,4688,46058,46058,46058,46058,3088378,2018,...,9,20180909808935802,201809098089358,40,70.0,2.0,5853.0,6.0,1.0,0.0
2,2018-09-09,8089358,3,4689,46096,46096,46096,46096,3088378,2018,...,9,20180909808935803,201809098089358,40,70.0,2.0,5853.0,6.0,1.0,0.0
3,2018-09-09,8089358,4,4410,46129,46129,46129,46129,3088378,2018,...,9,20180909808935804,201809098089358,40,70.0,2.0,5853.0,6.0,1.0,0.0
4,2018-09-09,8089358,5,2678,46153,46153,46153,46153,3088378,2018,...,9,20180909808935805,201809098089358,40,70.0,2.0,5853.0,6.0,1.0,0.0


In [11]:
# Inspect the column dtypes of the merged frame.
tripLeave6.dtypes

DAYOFSERVICE        object
TRIPID               int64
PROGRNUMBER          int64
STOPPOINTID          int64
PLANNEDTIME_ARR      int64
PLANNEDTIME_DEP      int64
ACTUALTIME_ARR       int64
ACTUALTIME_DEP       int64
VEHICLEID            int64
year                 int64
month                int64
day                  int64
id                   int64
trip_leave_id        int64
LINEID              object
num_lineID         float64
DIRECTION          float64
actual_duration    float64
dayOfWeek          float64
weekend            float64
rushHour           float64
dtype: object

In [12]:
# Count the missing values in each column of the merged frame.
tripLeave6.isna().sum()

DAYOFSERVICE             0
TRIPID                   0
PROGRNUMBER              0
STOPPOINTID              0
PLANNEDTIME_ARR          0
PLANNEDTIME_DEP          0
ACTUALTIME_ARR           0
ACTUALTIME_DEP           0
VEHICLEID                0
year                     0
month                    0
day                      0
id                       0
trip_leave_id            0
LINEID             1097783
num_lineID         1097783
DIRECTION          1097783
actual_duration    1097783
dayOfWeek          1097783
weekend            1097783
rushHour           1097783
dtype: int64

## Create a New DataFrame for the Unmatched (NaN) Rows

In [13]:
# Collect the rows whose LINEID is missing into their own frame.
tripLeave6_NaNs = tripLeave6.loc[tripLeave6['LINEID'].isna()]

In [14]:
# Preview the first five rows that have no LINEID.
tripLeave6_NaNs.head(n=5)

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,year,...,day,id,trip_leave_id,LINEID,num_lineID,DIRECTION,actual_duration,dayOfWeek,weekend,rushHour
3202,2018-09-09,8089777,1,324,84600,84600,84600,84600,1932311,2018,...,9,20180909808977701,201809098089777,,,,,,,
3203,2018-09-09,8089777,2,327,84703,84703,84703,84703,1932311,2018,...,9,20180909808977702,201809098089777,,,,,,,
3204,2018-09-09,8089777,3,7113,84788,84788,84788,84788,1932311,2018,...,9,20180909808977703,201809098089777,,,,,,,
3205,2018-09-09,8089777,4,127,84849,84849,84849,84849,1932311,2018,...,9,20180909808977704,201809098089777,,,,,,,
3206,2018-09-09,8089777,5,112,84902,84902,84902,84902,1932311,2018,...,9,20180909808977705,201809098089777,,,,,,,


In [15]:
# (rows, columns) of the frame holding the rows with missing LINEID.
tripLeave6_NaNs.shape

(1097783, 21)

In [16]:
# Drop the rows with a missing LINEID from tripLeave6.
# dropna filters with a boolean mask directly, rather than building a filtered
# copy just to read its index labels and then dropping by label in place; the
# mask-based form also stays correct if index labels are ever non-unique.
tripLeave6 = tripLeave6.dropna(subset=['LINEID'])

In [17]:
# Re-count the missing values in each column of tripLeave6.
tripLeave6.isna().sum()

DAYOFSERVICE       0
TRIPID             0
PROGRNUMBER        0
STOPPOINTID        0
PLANNEDTIME_ARR    0
PLANNEDTIME_DEP    0
ACTUALTIME_ARR     0
ACTUALTIME_DEP     0
VEHICLEID          0
year               0
month              0
day                0
id                 0
trip_leave_id      0
LINEID             0
num_lineID         0
DIRECTION          0
actual_duration    0
dayOfWeek          0
weekend            0
rushHour           0
dtype: int64

In [18]:
# (rows, columns) of tripLeave6 at this point in the notebook.
tripLeave6.shape

(18295510, 21)

## Write the tripLeave6_NaNs DataFrame to CSV

In [19]:
# Write tripLeave6_NaNs out to CSV, leaving the row index out of the file.
tripLeave6_NaNs.to_csv(
    path_or_buf='v4_tripLeave6_NaNs_LT_2018.csv',
    index=False,
)

## Send tripLeave6 to CSV

In [20]:
# Write tripLeave6 out to CSV, leaving the row index out of the file.
tripLeave6.to_csv(
    path_or_buf='v4_tripLeave6_LT_2018.csv',
    index=False,
)