In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
from sklearn.pipeline import make_pipeline
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler 
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer

# Using sklearn to split data into training and testing sets,train classifier and regressor models 
from sklearn.model_selection import train_test_split

#pickle package saves and loads sklearn models
import pickle

from warnings import filterwarnings
# Suppress every warning category for the remainder of the kernel session.
# NOTE(review): a blanket 'ignore' also hides deprecation notices from the
# imported libraries — consider narrowing this to specific categories/modules.
filterwarnings('ignore')

## Source of Truth gathered in prior EDA
* **TRIPID**
    * **TRIPIDs are shared across days for a particular LINEID on a particular ROUTEID**
    * For a single TRIPID, only one corresponding LINEID and ROUTEID are present
    * TRIPID for inbound and outbound directions are mutually exclusive
    * For certain TRIPIDs, PROGRNUMBER does not start at 1 as it should, which means some entries are missing.
* **LINEID**
    * **For a LINEID; ROUTEIDs and TRIPIDs used for DIFFERENT DIRECTION are mutually exclusive**
* **ROUTEID**
    * A LINEID has multiple ROUTEID 
    * A LINEID may / may not have trips with both INBOUND-going(1)/OUTBOUND-returning(2) direction 
    * **EVEN WHEN 2 ROUTES FOR A LINEID HAVE THE SAME DIRECTION, THE STOPPOINTIDs VISITED ARE DIFFERENT**
    * **FOR A SINGLE ROUTE, THE STOPS VISITED DURING DIFFERENT TRIPS ARE DIFFERENT**
    * **AS ROUTEIDs ARE NOT KNOWN TO USER _AND_ ROUTEIDs ARE NOT TIMEBOUND; IT IS NOT POSSIBLE TO TRAIN ML MODEL USING ALL ROUTEIDs**

## Read the merge of df_Jan with df_trips on 'TRIPID'

In [2]:
# Path of the merged January trips CSV, relative to the notebook's working directory.
jan_trips_csv = "../DB/ML/Jan_trips_MERGED.csv"
df_Jan_trips = pd.read_csv(jan_trips_csv)

In [3]:
# Normalise column dtypes after the CSV load.

# Service date becomes a proper datetime column.
df_Jan_trips['DAYOFSERVICE'] = pd.to_datetime(df_Jan_trips['DAYOFSERVICE'])

# Integer-like columns: parse as numbers and downcast to the smallest integer
# dtype that fits. errors='coerce' turns unparseable values into NaN instead of
# raising; a column that ends up containing NaN cannot be downcast to integer.
for int_col in (
    'TRIPID',
    'PROGRNUMBER',
    'STOPPOINTID',
    'PLANNEDTIME_ARR',
    'PLANNEDTIME_DEP',
    'ACTUALTIME_ARR',
    'ACTUALTIME_DEP',
    'DIRECTION',
):
    df_Jan_trips[int_col] = pd.to_numeric(df_Jan_trips[int_col], downcast='integer', errors='coerce')

# Identifier columns are kept as plain strings.
for id_col in ('LINEID', 'ROUTEID'):
    df_Jan_trips[id_col] = df_Jan_trips[id_col].astype('str')

### Object types and valid entries

In [4]:
# Summarise dtypes and non-null counts for every column.
# DataFrame.info() writes its report directly to stdout and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
# `show_counts` is the replacement for the `null_counts` keyword, which was
# deprecated in pandas 1.2 and is no longer accepted from pandas 2.0.
df_Jan_trips.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10525004 entries, 0 to 10525003
Data columns (total 11 columns):
 #   Column           Non-Null Count     Dtype         
---  ------           --------------     -----         
 0   DAYOFSERVICE     10525004 non-null  datetime64[ns]
 1   TRIPID           10525004 non-null  int32         
 2   PROGRNUMBER      10525004 non-null  int8          
 3   STOPPOINTID      10525004 non-null  int16         
 4   PLANNEDTIME_ARR  10525004 non-null  int32         
 5   PLANNEDTIME_DEP  10525004 non-null  int32         
 6   ACTUALTIME_ARR   10525004 non-null  int32         
 7   ACTUALTIME_DEP   10525004 non-null  int32         
 8   LINEID           10525004 non-null  object        
 9   DIRECTION        10525004 non-null  int8          
 10  ROUTEID          10525004 non-null  object        
dtypes: datetime64[ns](1), int16(1), int32(5), int8(2), object(2)
memory usage: 481.8+ MB
None


### Nature of data

In [5]:
# Preview the first 100 rows; the notebook display elides the middle of long frames.
df_Jan_trips.head(100)

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID
0,2018-01-01,5958355,3,1172,23799,23799,23863,23863,41,1,41_3
1,2018-01-01,5958355,4,1173,23860,23860,23934,24072,41,1,41_3
2,2018-01-01,5958355,5,1174,23937,23937,24114,24140,41,1,41_3
3,2018-01-01,5958355,6,1175,24048,24048,24180,24192,41,1,41_3
4,2018-01-01,5958355,7,15,24130,24130,24227,24257,41,1,41_3
...,...,...,...,...,...,...,...,...,...,...,...
95,2018-01-01,5958088,47,3958,27943,27943,28473,28473,66,1,66_11
96,2018-01-01,5958088,48,3959,27990,27990,28508,28508,66,1,66_11
97,2018-01-01,5958088,49,3960,28024,28024,28534,28534,66,1,66_11
98,2018-01-01,5958088,50,3961,28058,28058,28551,28562,66,1,66_11


### Unique entries

In [6]:
# Count the distinct values held in each column.
df_Jan_trips.nunique()

DAYOFSERVICE          31
TRIPID             76453
PROGRNUMBER          103
STOPPOINTID         4714
PLANNEDTIME_ARR    68859
PLANNEDTIME_DEP    68859
ACTUALTIME_ARR     71982
ACTUALTIME_DEP     71965
LINEID               126
DIRECTION              2
ROUTEID              498
dtype: int64

### Resolve datetime parameter

In [10]:
# Derive calendar features from the service date.
# Each key is the new column name and each value is the `.dt` accessor
# attribute that fills it; the dict order is the order the columns are added.
service_date_parts = {
    'year': 'year',
    'Day': 'day',
    'month': 'month',
    'dayofweek_num': 'dayofweek',
    'quarter': 'quarter',
}
for new_column, dt_attr in service_date_parts.items():
    df_Jan_trips[new_column] = getattr(df_Jan_trips.DAYOFSERVICE.dt, dt_attr)

In [11]:
# Preview the first rows, now including the derived calendar columns.
df_Jan_trips.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID,year,Day,month,dayofweek_num,quarter
0,2018-01-01,5958355,3,1172,23799,23799,23863,23863,41,1,41_3,2018,1,1,0,1
1,2018-01-01,5958355,4,1173,23860,23860,23934,24072,41,1,41_3,2018,1,1,0,1
2,2018-01-01,5958355,5,1174,23937,23937,24114,24140,41,1,41_3,2018,1,1,0,1
3,2018-01-01,5958355,6,1175,24048,24048,24180,24192,41,1,41_3,2018,1,1,0,1
4,2018-01-01,5958355,7,15,24130,24130,24227,24257,41,1,41_3,2018,1,1,0,1


### Check whether buses arrive earlier or later than the planned time

In [14]:
# Select the rows whose actual arrival time is strictly smaller than the
# planned arrival time, i.e. the arrival was ahead of the plan.
arrived_early = df_Jan_trips['ACTUALTIME_ARR'].lt(df_Jan_trips['PLANNEDTIME_ARR'])
df_Jan_trips[arrived_early]

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID,year,Day,month,dayofweek_num,quarter
24,2018-01-01,5958355,27,1630,25279,25279,25246,25257,41,1,41_3,2018,1,1,0,1
25,2018-01-01,5958355,28,7348,25508,25508,25487,25537,41,1,41_3,2018,1,1,0,1
26,2018-01-01,5958355,29,3669,25615,25615,25611,25622,41,1,41_3,2018,1,1,0,1
27,2018-01-01,5958355,30,3671,25677,25677,25664,25679,41,1,41_3,2018,1,1,0,1
28,2018-01-01,5958355,31,3672,25708,25708,25707,25707,41,1,41_3,2018,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10524870,2018-01-31,6235356,1,7289,87300,87300,87250,87250,65,2,65_75,2018,31,1,2,1
10524871,2018-01-31,6235356,2,7280,87328,87328,87296,87296,65,2,65_75,2018,31,1,2,1
10524872,2018-01-31,6235356,3,7281,87390,87390,87371,87371,65,2,65_75,2018,31,1,2,1
10524873,2018-01-31,6235356,4,7284,87443,87443,87438,87438,65,2,65_75,2018,31,1,2,1
