In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [112]:
from sklearn.pipeline import make_pipeline
from lightgbm import LGBMClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn import metrics
import math

# Using sklearn to split data into training and testing sets,train classifier and regressor models 
from sklearn.model_selection import train_test_split

#pickle package saves and loads sklearn models
import pickle

from warnings import filterwarnings
filterwarnings('ignore')

## Source of Truth gathered in prior EDA
* **TRIPID**
    * **TRIPID are shared across days for perticular LINEID on particular ROUTEID**
    * For a single TRIPID, only one corresponding LINEID and ROUTEID are present
    * TRIPID for inbound and outbound directions are mutually exclusive
    * For certein TRIPIDs; PROGNUMBER do not start at 1; which should. It means, some entries are missing.
* **LINEID**
    * **For a LINEID; ROUTEIDs and TRIPIDs used for DIFFERENT DIRECTION are mutually exclusive**
* **ROUTEID**
    * A LINEID has multiple ROUTEID 
    * A LINEID may / may not have trips with both INBOUND-going(1)/OUTBOUND-returning(2) direction 
    * **THOUGH 2 ROUTES FOR A LINEID ARE HAVING SAME DIRECTION; STOPPOINTIDs VISITED ARE DIFFERENT**
    * **FOR A SINGLE ROUTE; STOPS VISITED DURING DIFFERENT TRIPS ARE DIFEENRENT**
    * **AS ROUTEIDs ARE NOT KNOWN TO USER _AND_ ROUTEIDs ARE NOT TIMEBOUND; IT IS NOT POSSIBLE TO TRAIN ML MODEL USING ALL ROUTEIDs**

## Read Merge df_Jan with df_trips on 'TRIPID'

In [3]:
df_Jan_trips = pd.read_csv("../DB/ML/imp_ROUTEID_Jan_trips_MERGED.csv")

In [4]:
df_Jan_trips['DAYOFSERVICE'] = pd.to_datetime(df_Jan_trips['DAYOFSERVICE'])
df_Jan_trips['TRIPID'] = pd.to_numeric(df_Jan_trips['TRIPID'], downcast='integer', errors='coerce')
df_Jan_trips['PROGRNUMBER'] = pd.to_numeric(df_Jan_trips['PROGRNUMBER'], downcast='integer', errors='coerce')
df_Jan_trips['STOPPOINTID'] = pd.to_numeric(df_Jan_trips['STOPPOINTID'], downcast='integer', errors='coerce')
df_Jan_trips['PLANNEDTIME_ARR'] = pd.to_numeric(df_Jan_trips['PLANNEDTIME_ARR'], downcast='integer', errors='coerce')
df_Jan_trips['ACTUALTIME_ARR'] = pd.to_numeric(df_Jan_trips['ACTUALTIME_ARR'], downcast='integer', errors='coerce')
df_Jan_trips['ACTUALTIME_DEP'] = pd.to_numeric(df_Jan_trips['ACTUALTIME_DEP'], downcast='integer', errors='coerce')
df_Jan_trips['LINEID'] = df_Jan_trips['LINEID'].astype('str')
df_Jan_trips['ROUTEID'] = df_Jan_trips['ROUTEID'].astype('str')
df_Jan_trips['DIRECTION'] = pd.to_numeric(df_Jan_trips['DIRECTION'], downcast='integer', errors='coerce')

### Object types and valid entries

In [5]:
print(df_Jan_trips.info(null_counts = True))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9600545 entries, 0 to 9600544
Data columns (total 13 columns):
 #   Column                    Non-Null Count    Dtype         
---  ------                    --------------    -----         
 0   DAYOFSERVICE              9600545 non-null  datetime64[ns]
 1   TRIPID                    9600545 non-null  int32         
 2   PROGRNUMBER               9600545 non-null  int8          
 3   STOPPOINTID               9600545 non-null  int16         
 4   PLANNEDTIME_ARR           9600545 non-null  int32         
 5   ACTUALTIME_ARR            9600545 non-null  int32         
 6   ACTUALTIME_DEP            9600545 non-null  int32         
 7   LINEID                    9600545 non-null  object        
 8   DIRECTION                 9600545 non-null  int8          
 9   ROUTEID                   9600545 non-null  object        
 10  Arrival_lateEarly         9600545 non-null  bool          
 11  departure_lateEarly       9600545 non-null  bool  

### Nature of data

In [6]:
df_Jan_trips.head(100)

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID,Arrival_lateEarly,departure_lateEarly,planned_arr_dep_equality
0,2018-01-01,5958355,3,1172,23799,23863,23863,41,1,41_3,False,False,True
1,2018-01-01,5958355,4,1173,23860,23934,24072,41,1,41_3,False,False,True
2,2018-01-01,5958355,5,1174,23937,24114,24140,41,1,41_3,False,False,True
3,2018-01-01,5958355,6,1175,24048,24180,24192,41,1,41_3,False,False,True
4,2018-01-01,5958355,7,15,24130,24227,24257,41,1,41_3,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2018-01-01,5958088,47,3958,27943,28473,28473,66,1,66_11,False,False,True
96,2018-01-01,5958088,48,3959,27990,28508,28508,66,1,66_11,False,False,True
97,2018-01-01,5958088,49,3960,28024,28534,28534,66,1,66_11,False,False,True
98,2018-01-01,5958088,50,3961,28058,28551,28562,66,1,66_11,False,False,True


### Unique entries

In [7]:
df_Jan_trips.nunique()

DAYOFSERVICE                   31
TRIPID                      69359
PROGRNUMBER                   102
STOPPOINTID                  4602
PLANNEDTIME_ARR             67749
ACTUALTIME_ARR              70065
ACTUALTIME_DEP              70063
LINEID                        126
DIRECTION                       2
ROUTEID                       246
Arrival_lateEarly               2
departure_lateEarly             2
planned_arr_dep_equality        1
dtype: int64

### Resolve datetime parameter

In [8]:
df_Jan_trips['year'] = df_Jan_trips.DAYOFSERVICE.dt.year 
df_Jan_trips['day'] = df_Jan_trips.DAYOFSERVICE.dt.day
df_Jan_trips['month'] = df_Jan_trips.DAYOFSERVICE.dt.month
df_Jan_trips['dayofweek_num']=df_Jan_trips.DAYOFSERVICE.dt.dayofweek 
df_Jan_trips['quarter']=df_Jan_trips.DAYOFSERVICE.dt.quarter 

In [9]:
df_Jan_trips.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID,Arrival_lateEarly,departure_lateEarly,planned_arr_dep_equality,year,day,month,dayofweek_num,quarter
0,2018-01-01,5958355,3,1172,23799,23863,23863,41,1,41_3,False,False,True,2018,1,1,0,1
1,2018-01-01,5958355,4,1173,23860,23934,24072,41,1,41_3,False,False,True,2018,1,1,0,1
2,2018-01-01,5958355,5,1174,23937,24114,24140,41,1,41_3,False,False,True,2018,1,1,0,1
3,2018-01-01,5958355,6,1175,24048,24180,24192,41,1,41_3,False,False,True,2018,1,1,0,1
4,2018-01-01,5958355,7,15,24130,24227,24257,41,1,41_3,False,False,True,2018,1,1,0,1


## Dataframe for model training
**dataframe necessary To train a LightGBM Model for Target variable "ACTUALTIME_ARR" is ontained bellow**

In [78]:
df_Jan_trips_arr = df_Jan_trips[["year","month","dayofweek_num","quarter","LINEID","DIRECTION","STOPPOINTID","PROGRNUMBER","PLANNEDTIME_ARR","ACTUALTIME_ARR"]]

### Nature of data

In [79]:
df_Jan_trips_arr.head()

Unnamed: 0,year,month,dayofweek_num,quarter,LINEID,DIRECTION,STOPPOINTID,PROGRNUMBER,PLANNEDTIME_ARR,ACTUALTIME_ARR
0,2018,1,0,1,41,1,1172,3,23799,23863
1,2018,1,0,1,41,1,1173,4,23860,23934
2,2018,1,0,1,41,1,1174,5,23937,24114
3,2018,1,0,1,41,1,1175,6,24048,24180
4,2018,1,0,1,41,1,15,7,24130,24227


In [80]:
df_Jan_trips_arr.dtypes

year                int64
month               int64
dayofweek_num       int64
quarter             int64
LINEID             object
DIRECTION            int8
STOPPOINTID         int16
PROGRNUMBER          int8
PLANNEDTIME_ARR     int32
ACTUALTIME_ARR      int32
dtype: object

In [81]:
df_Jan_trips_arr.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9600545 entries, 0 to 9600544
Data columns (total 10 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   year             9600545 non-null  int64 
 1   month            9600545 non-null  int64 
 2   dayofweek_num    9600545 non-null  int64 
 3   quarter          9600545 non-null  int64 
 4   LINEID           9600545 non-null  object
 5   DIRECTION        9600545 non-null  int8  
 6   STOPPOINTID      9600545 non-null  int16 
 7   PROGRNUMBER      9600545 non-null  int8  
 8   PLANNEDTIME_ARR  9600545 non-null  int32 
 9   ACTUALTIME_ARR   9600545 non-null  int32 
dtypes: int16(1), int32(2), int64(4), int8(2), object(1)
memory usage: 476.1+ MB


In [82]:
df_Jan_trips_arr.nunique()

year                   1
month                  1
dayofweek_num          7
quarter                1
LINEID               126
DIRECTION              2
STOPPOINTID         4602
PROGRNUMBER          102
PLANNEDTIME_ARR    67749
ACTUALTIME_ARR     70065
dtype: int64

## Train model for each LINEID for each direction

In [83]:
LINEID_list = df_Jan_trips_arr["LINEID"].unique()
# LINEID_list = array(['41', '66', '77A', '130', '151',...'161', '68X', '33D'],dtype=object)

## Model

###  Dataframe with data for LINEID 41

In [84]:
df_Jan_trips_arr_41 = df_Jan_trips_arr.loc[(df_Jan_trips_arr["LINEID"] == "41")].copy()

In [142]:
df_Jan_trips_arr_41.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147405 entries, 0 to 9599673
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   year             147405 non-null  int64 
 1   month            147405 non-null  int64 
 2   dayofweek_num    147405 non-null  int64 
 3   quarter          147405 non-null  int64 
 4   LINEID           147405 non-null  object
 5   DIRECTION        147405 non-null  int8  
 6   STOPPOINTID      147405 non-null  int16 
 7   PROGRNUMBER      147405 non-null  int8  
 8   PLANNEDTIME_ARR  147405 non-null  int32 
 9   ACTUALTIME_ARR   147405 non-null  int32 
dtypes: int16(1), int32(2), int64(4), int8(2), object(1)
memory usage: 8.4+ MB


In [85]:
df_Jan_trips_arr_41.nunique()

year                   1
month                  1
dayofweek_num          7
quarter                1
LINEID                 1
DIRECTION              2
STOPPOINTID          105
PROGRNUMBER           56
PLANNEDTIME_ARR    26412
ACTUALTIME_ARR     58511
dtype: int64

### Input and Output features

In [98]:
X = df_Jan_trips_arr_41[["year","month","dayofweek_num","quarter","LINEID","PROGRNUMBER",\
                         "STOPPOINTID","DIRECTION","PLANNEDTIME_ARR"]]
Y = df_Jan_trips_arr_41[["ACTUALTIME_ARR"]]

#### Typecast input features

In [99]:
X['year'] = X['year'].astype('category')
X['month'] = X['month'].astype('category')
X['dayofweek_num'] = X['dayofweek_num'].astype('category')
X['quarter'] = X['quarter'].astype('category')
X['PROGRNUMBER'] = X['PROGRNUMBER'].astype('category')
X['STOPPOINTID'] = X['STOPPOINTID'].astype('category')
X['PLANNEDTIME_ARR'] = pd.to_numeric(X['PLANNEDTIME_ARR'], downcast='integer', errors='coerce')
X['LINEID'] = X['LINEID'].astype('category')
X['DIRECTION'] = X['DIRECTION'].astype('category')

#### Typecast target features

In [100]:
Y['ACTUALTIME_ARR'] = pd.to_numeric(Y['ACTUALTIME_ARR'], downcast='integer', errors='coerce')

#### list categorical and numeric columns of input features for transformation

In [101]:
cat_col = list(X.select_dtypes(include=['category']).columns)
num_col = list(X.select_dtypes(exclude=['category','object','datetime']).columns)

### Linear Regression pipeline

In [124]:
'''
Pipeline with transformed target regressor is defined
'''
# X = pd.get_dummies(X)
# X.head()

std_scalar = StandardScaler()
oh_encoder = OneHotEncoder()

pipe_lin_reg = make_pipeline(
    ColumnTransformer([
        ('num', std_scalar, num_col),
        ('cat', oh_encoder, cat_col),    
    ]),
    LinearRegression()
)

lin_regressor = TransformedTargetRegressor(regressor=pipe_lin_reg, transformer=std_scalar)

In [125]:
def cross_val_LinRegCalc(X, y, scoring='accuracy', cv=3):
    """Functions to carry out cross validation on the linear regression model
    Default number of validations is 3. The randon state will be updated 
    at each iteration to allow our results to be repeated"""
    
    # store results
    results = []
    # evaluate cv times and append to results
    for i in range(cv):
        # set up train test split
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=i , test_size=0.3)
        # generate model
        multi_LinReg_model = lin_regressor.fit(X_train, y_train)
        # threshold
        multi_LinReg_predict = multi_LinReg_model.predict(X_test)
        # calc score
        if scoring=='R2':
            score = metrics.r2_score(y_test, multi_LinReg_predict)
        elif scoring=='RMSE':
            score = math.sqrt(metrics.mean_squared_error(y_test, multi_LinReg_predict))
        # append to results
        results.append(score)
    return results, multi_LinReg_model

In [126]:
def cross_val_linReg(X,y,cvVal=3):
    """Function to perform cross validation and store results 
    in dataframe. Cross validation looks at R2 score, RMSE
    and returns a dataframe with results"""

    linRegResults = {}
    # metrics to test against
    test_metrics = ['R2','RMSE']
    
    for metric in test_metrics:
        # generate test results
        result,multi_LinReg_model = cross_val_LinRegCalc(X, y, cv=cvVal, scoring=metric)
        length = len(result)
        # store result in dict
        linRegResults[metric] = sum(result)/length

    # create dataframe with results
    LinRegDF = pd.DataFrame.from_dict(linRegResults, orient='index', columns=['Linear_Regression'])
    
    return LinRegDF

### Model evaluation

#### Model metrics

In [127]:
print(cross_val_linReg(X,Y))

      Linear_Regression
R2             0.999679
RMSE         313.304048


#### Model test IO pair

In [136]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=1 , test_size=0.3)

In [138]:
X_test[:2]

Unnamed: 0,year,month,dayofweek_num,quarter,LINEID,PROGRNUMBER,STOPPOINTID,DIRECTION,PLANNEDTIME_ARR
2009826,2018,1,0,1,41,11,5078,2,23475
3587777,2018,1,4,1,41,33,3675,1,57264


In [140]:
y_test[:2]

Unnamed: 0,ACTUALTIME_ARR
2009826,23454
3587777,57066


#### Model test run

In [139]:
multi_LinReg_model = lin_regressor.fit(X,Y)
multi_LinReg_model.predict(X_test[:2])

array([[23439.44989556],
       [57388.43974369]])