In [31]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import sys
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

import pickle

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Pre Modeling Prepare (Stop to Stop)
## Read cleaned data¶

In [32]:
month_num = sys.argv[1]
# month_num = 1

In [3]:
leavetimes = pd.read_csv(f'../Final_DB/leavetimes_trips_month_{month_num}.csv', keep_default_na = True, delimiter = ',', skipinitialspace = True)

leavetimes.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,ACTUALTIME_ARR,ACTUALTIME_DEP,PLANNEDTIME,SKIP,LINEID,ROUTEID
0,2018-01-01,5955277,1,7347,2018-01-01 08:20:29,2018-01-01 08:20:29,2018-01-01 08:20:00,1.0,16_1,16_20
1,2018-01-01,5955277,2,3669,2018-01-01 08:22:08,2018-01-01 08:22:08,2018-01-01 08:21:20,1.0,16_1,16_20
2,2018-01-01,5955277,3,7349,2018-01-01 08:23:02,2018-01-01 08:23:02,2018-01-01 08:22:18,1.0,16_1,16_20
3,2018-01-01,5955277,4,1631,2018-01-01 08:24:15,2018-01-01 08:24:15,2018-01-01 08:23:26,1.0,16_1,16_20
4,2018-01-01,5955277,5,1632,2018-01-01 08:24:21,2018-01-01 08:24:21,2018-01-01 08:23:41,1.0,16_1,16_20


In [4]:
leavetimes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9323130 entries, 0 to 9323129
Data columns (total 10 columns):
 #   Column          Dtype  
---  ------          -----  
 0   DAYOFSERVICE    object 
 1   TRIPID          int64  
 2   PROGRNUMBER     int64  
 3   STOPPOINTID     int64  
 4   ACTUALTIME_ARR  object 
 5   ACTUALTIME_DEP  object 
 6   PLANNEDTIME     object 
 7   SKIP            float64
 8   LINEID          object 
 9   ROUTEID         object 
dtypes: float64(1), int64(3), object(6)
memory usage: 711.3+ MB


In [5]:
# convert 'DAYOFSERVICE', 'PLANNEDTIME', 'ACTUALTIME_ARR' and 'ACTUALTIME_DEP' to datetime
leavetimes['DAYOFSERVICE'] = pd.to_datetime(leavetimes.DAYOFSERVICE)
leavetimes['PLANNEDTIME'] = pd.to_datetime(leavetimes.PLANNEDTIME)
leavetimes['ACTUALTIME_ARR'] = pd.to_datetime(leavetimes.ACTUALTIME_ARR)
leavetimes['ACTUALTIME_DEP'] = pd.to_datetime(leavetimes.ACTUALTIME_DEP)

# convert feature 'TRIPID', 'STOPPOINTID' and 'VEHICLEID' to object
leavetimes['TRIPID'] = leavetimes['TRIPID'].astype('object')
leavetimes['STOPPOINTID'] = leavetimes['STOPPOINTID'].astype('object')

leavetimes.dtypes

DAYOFSERVICE      datetime64[ns]
TRIPID                    object
PROGRNUMBER                int64
STOPPOINTID               object
ACTUALTIME_ARR    datetime64[ns]
ACTUALTIME_DEP    datetime64[ns]
PLANNEDTIME       datetime64[ns]
SKIP                     float64
LINEID                    object
ROUTEID                   object
dtype: object

In [6]:
leavetimes.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,ACTUALTIME_ARR,ACTUALTIME_DEP,PLANNEDTIME,SKIP,LINEID,ROUTEID
0,2018-01-01,5955277,1,7347,2018-01-01 08:20:29,2018-01-01 08:20:29,2018-01-01 08:20:00,1.0,16_1,16_20
1,2018-01-01,5955277,2,3669,2018-01-01 08:22:08,2018-01-01 08:22:08,2018-01-01 08:21:20,1.0,16_1,16_20
2,2018-01-01,5955277,3,7349,2018-01-01 08:23:02,2018-01-01 08:23:02,2018-01-01 08:22:18,1.0,16_1,16_20
3,2018-01-01,5955277,4,1631,2018-01-01 08:24:15,2018-01-01 08:24:15,2018-01-01 08:23:26,1.0,16_1,16_20
4,2018-01-01,5955277,5,1632,2018-01-01 08:24:21,2018-01-01 08:24:21,2018-01-01 08:23:41,1.0,16_1,16_20


In [7]:
leavetimes_raw = leavetimes.copy()

In [8]:
leavetimes = leavetimes_raw

## New Features
### 1. 'MONTH', 'DAY', 'HOUR'

In [9]:
leavetimes['MONTH'] = leavetimes['DAYOFSERVICE'].dt.month_name()
leavetimes['DAY'] = leavetimes['DAYOFSERVICE'].dt.day_name()
leavetimes['HOUR'] = (leavetimes['PLANNEDTIME'].dt.round(freq='H')).dt.hour

leavetimes.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,ACTUALTIME_ARR,ACTUALTIME_DEP,PLANNEDTIME,SKIP,LINEID,ROUTEID,MONTH,DAY,HOUR
0,2018-01-01,5955277,1,7347,2018-01-01 08:20:29,2018-01-01 08:20:29,2018-01-01 08:20:00,1.0,16_1,16_20,January,Monday,8
1,2018-01-01,5955277,2,3669,2018-01-01 08:22:08,2018-01-01 08:22:08,2018-01-01 08:21:20,1.0,16_1,16_20,January,Monday,8
2,2018-01-01,5955277,3,7349,2018-01-01 08:23:02,2018-01-01 08:23:02,2018-01-01 08:22:18,1.0,16_1,16_20,January,Monday,8
3,2018-01-01,5955277,4,1631,2018-01-01 08:24:15,2018-01-01 08:24:15,2018-01-01 08:23:26,1.0,16_1,16_20,January,Monday,8
4,2018-01-01,5955277,5,1632,2018-01-01 08:24:21,2018-01-01 08:24:21,2018-01-01 08:23:41,1.0,16_1,16_20,January,Monday,8


### 2.'JOURNEYTIMES', 'PLANNED_JOURNEYTIME', 'STOP_TIME'

In [10]:
leavetimes = leavetimes.sort_values(by = ['DAYOFSERVICE', 'TRIPID', 'PROGRNUMBER'])

In [11]:
leavetimes.reset_index(drop = True, inplace = True)
leavetimes.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,ACTUALTIME_ARR,ACTUALTIME_DEP,PLANNEDTIME,SKIP,LINEID,ROUTEID,MONTH,DAY,HOUR
0,2018-01-01,5955277,1,7347,2018-01-01 08:20:29,2018-01-01 08:20:29,2018-01-01 08:20:00,1.0,16_1,16_20,January,Monday,8
1,2018-01-01,5955277,2,3669,2018-01-01 08:22:08,2018-01-01 08:22:08,2018-01-01 08:21:20,1.0,16_1,16_20,January,Monday,8
2,2018-01-01,5955277,3,7349,2018-01-01 08:23:02,2018-01-01 08:23:02,2018-01-01 08:22:18,1.0,16_1,16_20,January,Monday,8
3,2018-01-01,5955277,4,1631,2018-01-01 08:24:15,2018-01-01 08:24:15,2018-01-01 08:23:26,1.0,16_1,16_20,January,Monday,8
4,2018-01-01,5955277,5,1632,2018-01-01 08:24:21,2018-01-01 08:24:21,2018-01-01 08:23:41,1.0,16_1,16_20,January,Monday,8


In [12]:
leavetimes.rename(columns = {'STOPPOINTID':'STARTSTOP'}, inplace = True)
leavetimes['ENDSTOP'] = np.nan
leavetimes['JOURNEYTIME'] = np.nan
leavetimes['PLANNED_JOURNEYTIME'] = np.nan
leavetimes['STOP_TIME'] = np.nan

leavetimes.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STARTSTOP,ACTUALTIME_ARR,ACTUALTIME_DEP,PLANNEDTIME,SKIP,LINEID,ROUTEID,MONTH,DAY,HOUR,ENDSTOP,JOURNEYTIME,PLANNED_JOURNEYTIME,STOP_TIME
0,2018-01-01,5955277,1,7347,2018-01-01 08:20:29,2018-01-01 08:20:29,2018-01-01 08:20:00,1.0,16_1,16_20,January,Monday,8,,,,
1,2018-01-01,5955277,2,3669,2018-01-01 08:22:08,2018-01-01 08:22:08,2018-01-01 08:21:20,1.0,16_1,16_20,January,Monday,8,,,,
2,2018-01-01,5955277,3,7349,2018-01-01 08:23:02,2018-01-01 08:23:02,2018-01-01 08:22:18,1.0,16_1,16_20,January,Monday,8,,,,
3,2018-01-01,5955277,4,1631,2018-01-01 08:24:15,2018-01-01 08:24:15,2018-01-01 08:23:26,1.0,16_1,16_20,January,Monday,8,,,,
4,2018-01-01,5955277,5,1632,2018-01-01 08:24:21,2018-01-01 08:24:21,2018-01-01 08:23:41,1.0,16_1,16_20,January,Monday,8,,,,


In [13]:
leavetimes.loc[(leavetimes['TRIPID'] == leavetimes['TRIPID'].shift(-1)), 'ENDSTOP'] = leavetimes['STARTSTOP'].shift(-1)
leavetimes.loc[(leavetimes['TRIPID'] == leavetimes['TRIPID'].shift(-1)), 'JOURNEYTIME'] = leavetimes['ACTUALTIME_ARR'].shift(-1)
leavetimes.loc[(leavetimes['TRIPID'] == leavetimes['TRIPID'].shift(-1)), 'PLANNED_JOURNEYTIME'] = leavetimes['PLANNEDTIME'].shift(-1)
leavetimes['STOP_TIME'] = leavetimes['ACTUALTIME_DEP'] - leavetimes['ACTUALTIME_ARR']

leavetimes.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STARTSTOP,ACTUALTIME_ARR,ACTUALTIME_DEP,PLANNEDTIME,SKIP,LINEID,ROUTEID,MONTH,DAY,HOUR,ENDSTOP,JOURNEYTIME,PLANNED_JOURNEYTIME,STOP_TIME
0,2018-01-01,5955277,1,7347,2018-01-01 08:20:29,2018-01-01 08:20:29,2018-01-01 08:20:00,1.0,16_1,16_20,January,Monday,8,3669,2018-01-01 08:22:08,2018-01-01 08:21:20,0 days
1,2018-01-01,5955277,2,3669,2018-01-01 08:22:08,2018-01-01 08:22:08,2018-01-01 08:21:20,1.0,16_1,16_20,January,Monday,8,7349,2018-01-01 08:23:02,2018-01-01 08:22:18,0 days
2,2018-01-01,5955277,3,7349,2018-01-01 08:23:02,2018-01-01 08:23:02,2018-01-01 08:22:18,1.0,16_1,16_20,January,Monday,8,1631,2018-01-01 08:24:15,2018-01-01 08:23:26,0 days
3,2018-01-01,5955277,4,1631,2018-01-01 08:24:15,2018-01-01 08:24:15,2018-01-01 08:23:26,1.0,16_1,16_20,January,Monday,8,1632,2018-01-01 08:24:21,2018-01-01 08:23:41,0 days
4,2018-01-01,5955277,5,1632,2018-01-01 08:24:21,2018-01-01 08:24:21,2018-01-01 08:23:41,1.0,16_1,16_20,January,Monday,8,5053,2018-01-01 08:24:34,2018-01-01 08:24:06,0 days


In [14]:
leavetimes.drop(leavetimes[leavetimes['TRIPID'] != leavetimes['TRIPID'].shift(-1)].index, inplace = True)

In [15]:
leavetimes.dtypes

DAYOFSERVICE            datetime64[ns]
TRIPID                          object
PROGRNUMBER                      int64
STARTSTOP                       object
ACTUALTIME_ARR          datetime64[ns]
ACTUALTIME_DEP          datetime64[ns]
PLANNEDTIME             datetime64[ns]
SKIP                           float64
LINEID                          object
ROUTEID                         object
MONTH                           object
DAY                             object
HOUR                             int64
ENDSTOP                         object
JOURNEYTIME                     object
PLANNED_JOURNEYTIME             object
STOP_TIME              timedelta64[ns]
dtype: object

In [17]:
leavetimes['JOURNEYTIME'] = (pd.to_datetime(leavetimes.JOURNEYTIME) - leavetimes['ACTUALTIME_ARR']).dt.total_seconds()
leavetimes['PLANNED_JOURNEYTIME'] = (pd.to_datetime(leavetimes.PLANNED_JOURNEYTIME) - leavetimes['PLANNEDTIME']).dt.total_seconds()
leavetimes['STOP_TIME'] = leavetimes['STOP_TIME'].dt.total_seconds()

leavetimes

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STARTSTOP,ACTUALTIME_ARR,ACTUALTIME_DEP,PLANNEDTIME,SKIP,LINEID,ROUTEID,MONTH,DAY,HOUR,ENDSTOP,JOURNEYTIME,PLANNED_JOURNEYTIME,STOP_TIME
0,2018-01-01,5955277,1,7347,2018-01-01 08:20:29,2018-01-01 08:20:29,2018-01-01 08:20:00,1.0,16_1,16_20,January,Monday,8,3669,99.0,80.0,0.0
1,2018-01-01,5955277,2,3669,2018-01-01 08:22:08,2018-01-01 08:22:08,2018-01-01 08:21:20,1.0,16_1,16_20,January,Monday,8,7349,54.0,58.0,0.0
2,2018-01-01,5955277,3,7349,2018-01-01 08:23:02,2018-01-01 08:23:02,2018-01-01 08:22:18,1.0,16_1,16_20,January,Monday,8,1631,73.0,68.0,0.0
3,2018-01-01,5955277,4,1631,2018-01-01 08:24:15,2018-01-01 08:24:15,2018-01-01 08:23:26,1.0,16_1,16_20,January,Monday,8,1632,6.0,15.0,0.0
4,2018-01-01,5955277,5,1632,2018-01-01 08:24:21,2018-01-01 08:24:21,2018-01-01 08:23:41,1.0,16_1,16_20,January,Monday,8,5053,13.0,25.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9323124,2018-01-31,6245816,28,786,2018-01-31 07:36:36,2018-01-31 07:37:41,2018-01-31 07:40:00,1.0,46A_2,46A_62,January,Wednesday,8,792,212.0,240.0,65.0
9323125,2018-01-31,6245816,29,792,2018-01-31 07:40:08,2018-01-31 07:40:40,2018-01-31 07:44:00,2.0,46A_2,46A_62,January,Wednesday,8,320,306.0,245.0,32.0
9323126,2018-01-31,6245816,31,320,2018-01-31 07:45:14,2018-01-31 07:46:03,2018-01-31 07:48:05,1.0,46A_2,46A_62,January,Wednesday,8,7491,160.0,115.0,49.0
9323127,2018-01-31,6245816,32,7491,2018-01-31 07:47:54,2018-01-31 07:47:54,2018-01-31 07:50:00,1.0,46A_2,46A_62,January,Wednesday,8,278,69.0,170.0,0.0


In [18]:
leavetimes.reset_index(drop=True, inplace=True)

## Logic Check
### 1. Check non-consecutive PROGRNUMBER

In [19]:
leavetimes[leavetimes['SKIP'] != 1]

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STARTSTOP,ACTUALTIME_ARR,ACTUALTIME_DEP,PLANNEDTIME,SKIP,LINEID,ROUTEID,MONTH,DAY,HOUR,ENDSTOP,JOURNEYTIME,PLANNED_JOURNEYTIME,STOP_TIME
75,2018-01-01,5955278,1,5171,2018-01-01 09:57:45,2018-01-01 09:57:45,2018-01-01 10:00:00,3.0,16_2,16_24,January,Monday,10,2976,161.0,60.0,0.0
188,2018-01-01,5955279,42,1285,2018-01-01 12:27:29,2018-01-01 12:27:59,2018-01-01 12:34:03,2.0,16_1,16_20,January,Monday,13,1288,180.0,216.0,30.0
221,2018-01-01,5955280,1,5171,2018-01-01 13:20:08,2018-01-01 13:20:08,2018-01-01 13:20:00,3.0,16_2,16_24,January,Monday,13,2976,70.0,68.0,0.0
368,2018-01-01,5955282,1,5171,2018-01-01 16:40:40,2018-01-01 16:40:40,2018-01-01 16:40:00,3.0,16_2,16_24,January,Monday,17,2976,64.0,69.0,0.0
515,2018-01-01,5955284,1,5171,2018-01-01 20:01:08,2018-01-01 20:01:08,2018-01-01 20:00:00,3.0,16_2,16_24,January,Monday,20,2976,51.0,55.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9152914,2018-01-31,6245760,42,315,2018-01-31 22:14:20,2018-01-31 22:14:54,2018-01-31 22:07:54,2.0,66A_2,66A_36,January,Wednesday,22,406,358.0,217.0,34.0
9153030,2018-01-31,6245762,56,6179,2018-01-31 09:05:29,2018-01-31 09:05:29,2018-01-31 08:51:40,2.0,17A_2,17A_17,January,Wednesday,9,7297,30.0,27.0,0.0
9153148,2018-01-31,6245764,56,6179,2018-01-31 11:33:37,2018-01-31 11:33:37,2018-01-31 11:37:56,2.0,17A_2,17A_17,January,Wednesday,12,7297,31.0,25.0,0.0
9153190,2018-01-31,6245769,40,7576,2018-01-31 07:15:47,2018-01-31 07:15:47,2018-01-31 07:27:04,2.0,11_2,11_42,January,Wednesday,7,320,154.0,208.0,0.0


In [20]:
leavetimes.drop(leavetimes[leavetimes['SKIP'] != 1].index, inplace = True)

leavetimes[leavetimes['SKIP'] != 1]

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STARTSTOP,ACTUALTIME_ARR,ACTUALTIME_DEP,PLANNEDTIME,SKIP,LINEID,ROUTEID,MONTH,DAY,HOUR,ENDSTOP,JOURNEYTIME,PLANNED_JOURNEYTIME,STOP_TIME


In [21]:
leavetimes.drop('SKIP', axis = 1, inplace = True)

In [22]:
leavetimes.reset_index(drop=True, inplace=True)

leavetimes.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STARTSTOP,ACTUALTIME_ARR,ACTUALTIME_DEP,PLANNEDTIME,LINEID,ROUTEID,MONTH,DAY,HOUR,ENDSTOP,JOURNEYTIME,PLANNED_JOURNEYTIME,STOP_TIME
0,2018-01-01,5955277,1,7347,2018-01-01 08:20:29,2018-01-01 08:20:29,2018-01-01 08:20:00,16_1,16_20,January,Monday,8,3669,99.0,80.0,0.0
1,2018-01-01,5955277,2,3669,2018-01-01 08:22:08,2018-01-01 08:22:08,2018-01-01 08:21:20,16_1,16_20,January,Monday,8,7349,54.0,58.0,0.0
2,2018-01-01,5955277,3,7349,2018-01-01 08:23:02,2018-01-01 08:23:02,2018-01-01 08:22:18,16_1,16_20,January,Monday,8,1631,73.0,68.0,0.0
3,2018-01-01,5955277,4,1631,2018-01-01 08:24:15,2018-01-01 08:24:15,2018-01-01 08:23:26,16_1,16_20,January,Monday,8,1632,6.0,15.0,0.0
4,2018-01-01,5955277,5,1632,2018-01-01 08:24:21,2018-01-01 08:24:21,2018-01-01 08:23:41,16_1,16_20,January,Monday,8,5053,13.0,25.0,0.0


### 2. Check 'JOURNEYTIME' <= 0 

In [23]:
leavetimes[leavetimes['JOURNEYTIME'] <= 0]

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STARTSTOP,ACTUALTIME_ARR,ACTUALTIME_DEP,PLANNEDTIME,LINEID,ROUTEID,MONTH,DAY,HOUR,ENDSTOP,JOURNEYTIME,PLANNED_JOURNEYTIME,STOP_TIME
2517,2018-01-01,5955447,1,7662,2018-01-01 19:38:20,2018-01-01 19:38:20,2018-01-01 19:30:00,47_1,47_139,January,Monday,20,7564,0.0,98.0,0.0
2566,2018-01-01,5955449,1,7662,2018-01-01 21:21:36,2018-01-01 21:21:36,2018-01-01 21:30:00,47_1,47_139,January,Monday,22,7564,0.0,74.0,0.0
9296,2018-01-01,5955984,1,7662,2018-01-01 14:29:38,2018-01-01 14:29:38,2018-01-01 14:30:00,47_1,47_139,January,Monday,14,7564,0.0,93.0,0.0
9444,2018-01-01,5955988,1,7662,2018-01-01 18:30:05,2018-01-01 18:30:05,2018-01-01 18:30:00,47_1,47_139,January,Monday,18,7564,0.0,156.0,0.0
9493,2018-01-01,5955990,1,7662,2018-01-01 20:29:16,2018-01-01 20:29:16,2018-01-01 20:30:00,47_1,47_139,January,Monday,20,7564,0.0,98.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8990172,2018-01-31,6240243,66,2020,2018-01-31 12:20:21,2018-01-31 12:20:21,2018-01-31 11:57:02,75_1,75_17,January,Wednesday,12,2021,-62.0,27.0,0.0
9057907,2018-01-31,6243511,25,1403,2018-01-31 13:53:11,2018-01-31 13:53:25,2018-01-31 13:40:53,150_2,150_9,January,Wednesday,14,1404,-1173.0,47.0,14.0
9058715,2018-01-31,6243601,34,4495,2018-01-31 11:10:49,2018-01-31 11:10:49,2018-01-31 11:07:02,15_1,15_16,January,Wednesday,11,5190,0.0,7.0,0.0
9074419,2018-01-31,6244566,1,7073,2018-01-31 11:47:53,2018-01-31 11:47:53,2018-01-31 11:45:00,238_1,238_11,January,Wednesday,12,7097,0.0,61.0,0.0


In [24]:
df1 = leavetimes[leavetimes['JOURNEYTIME'] <= 0][['DAYOFSERVICE', 'TRIPID']]

df1.head()

Unnamed: 0,DAYOFSERVICE,TRIPID
2517,2018-01-01,5955447
2566,2018-01-01,5955449
9296,2018-01-01,5955984
9444,2018-01-01,5955988
9493,2018-01-01,5955990


In [25]:
df2 = leavetimes.merge(df1, on = ['DAYOFSERVICE', 'TRIPID'])

df2.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STARTSTOP,ACTUALTIME_ARR,ACTUALTIME_DEP,PLANNEDTIME,LINEID,ROUTEID,MONTH,DAY,HOUR,ENDSTOP,JOURNEYTIME,PLANNED_JOURNEYTIME,STOP_TIME
0,2018-01-01,5955447,1,7662,2018-01-01 19:38:20,2018-01-01 19:38:20,2018-01-01 19:30:00,47_1,47_139,January,Monday,20,7564,0.0,98.0,0.0
1,2018-01-01,5955447,2,7564,2018-01-01 19:38:20,2018-01-01 19:38:46,2018-01-01 19:31:38,47_1,47_139,January,Monday,20,340,124.0,98.0,26.0
2,2018-01-01,5955447,3,340,2018-01-01 19:40:24,2018-01-01 19:40:24,2018-01-01 19:33:16,47_1,47_139,January,Monday,20,350,47.0,97.0,0.0
3,2018-01-01,5955447,4,350,2018-01-01 19:41:11,2018-01-01 19:41:11,2018-01-01 19:34:53,47_1,47_139,January,Monday,20,351,39.0,96.0,0.0
4,2018-01-01,5955447,5,351,2018-01-01 19:41:50,2018-01-01 19:42:03,2018-01-01 19:36:29,47_1,47_139,January,Monday,20,352,34.0,45.0,13.0


In [26]:
leavetimes = pd.concat([leavetimes, df2]).drop_duplicates(keep=False)

leavetimes[leavetimes['JOURNEYTIME'] <= 0]

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STARTSTOP,ACTUALTIME_ARR,ACTUALTIME_DEP,PLANNEDTIME,LINEID,ROUTEID,MONTH,DAY,HOUR,ENDSTOP,JOURNEYTIME,PLANNED_JOURNEYTIME,STOP_TIME


In [27]:
leavetimes.reset_index(drop=True, inplace=True)

In [28]:
leavetimes.drop(['PROGRNUMBER', 'ACTUALTIME_ARR', 'ACTUALTIME_DEP', 'PLANNEDTIME', 'ROUTEID'], axis = 1, inplace = True)
leavetimes.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,STARTSTOP,LINEID,MONTH,DAY,HOUR,ENDSTOP,JOURNEYTIME,PLANNED_JOURNEYTIME,STOP_TIME
0,2018-01-01,5955277,7347,16_1,January,Monday,8,3669,99.0,80.0,0.0
1,2018-01-01,5955277,3669,16_1,January,Monday,8,7349,54.0,58.0,0.0
2,2018-01-01,5955277,7349,16_1,January,Monday,8,1631,73.0,68.0,0.0
3,2018-01-01,5955277,1631,16_1,January,Monday,8,1632,6.0,15.0,0.0
4,2018-01-01,5955277,1632,16_1,January,Monday,8,5053,13.0,25.0,0.0


In [29]:
leavetimes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9049608 entries, 0 to 9049607
Data columns (total 11 columns):
 #   Column               Dtype         
---  ------               -----         
 0   DAYOFSERVICE         datetime64[ns]
 1   TRIPID               object        
 2   STARTSTOP            object        
 3   LINEID               object        
 4   MONTH                object        
 5   DAY                  object        
 6   HOUR                 int64         
 7   ENDSTOP              object        
 8   JOURNEYTIME          float64       
 9   PLANNED_JOURNEYTIME  float64       
 10  STOP_TIME            float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(6)
memory usage: 759.5+ MB


In [30]:
leavetimes.to_csv(f'leavetimes_premodeling_month_{month_num}.csv', index=False)