# Regressors: Model for each route in each direction
* **This notebook generates linear regression models for each bus route in each direction** 
* **TIME parameters are considered at one-minute resolution**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.pipeline import make_pipeline
from lightgbm import LGBMRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn import metrics
import math

# Using sklearn to split data into training and testing sets,train classifier and regressor models 
from sklearn.model_selection import train_test_split

#pickle package saves and loads sklearn models
import pickle

from warnings import filterwarnings
filterwarnings('ignore')

## Source of Truth gathered in prior EDA
* **TRIPID**
    * **A TRIPID is shared across days for a particular LINEID on a particular ROUTEID**
    * For a single TRIPID, only one corresponding LINEID and ROUTEID are present
    * TRIPID for inbound and outbound directions are mutually exclusive
    * For certain TRIPIDs, PROGRNUMBER does not start at 1, as it should; this means some entries are missing.
* **LINEID**
    * **For a LINEID; ROUTEIDs and TRIPIDs used for DIFFERENT DIRECTION are mutually exclusive**
* **ROUTEID**
    * A LINEID has multiple ROUTEID 
    * A LINEID may / may not have trips with both INBOUND-going(1)/OUTBOUND-returning(2) direction 
    * **EVEN WHEN 2 ROUTES FOR A LINEID HAVE THE SAME DIRECTION, THE STOPPOINTIDs VISITED ARE DIFFERENT**
    * **FOR A SINGLE ROUTE, THE STOPS VISITED DURING DIFFERENT TRIPS ARE DIFFERENT**
    * **ROUTEIDs ARE NOT KNOWN TO USER & ROUTEIDs ARE NOT TIMEBOUND i.e. timetable DOES NOT exists to follow a ROUTEID**
    * **majority of data for a LINEID is tied with a particular ROUTEID**
* **PLANNED ARRIVAL AND DEPARTURE TIMES ARE SAME FOR ALL ENTRIES; HENCE "PLANNEDTIME_DEP" CAN BE DROPPED**    

## Read data: df_Jan merged with df_trips on 'TRIPID'

In [3]:
# Load the January trips/arrivals table from the ML data folder (path is relative to the notebook).
df_Jan_trips_arr = pd.read_csv("../DB/ML/Jan_trips_arr_MINUTES.csv",)

In [4]:
# Peek at the first five rows to confirm the expected columns were loaded.
df_Jan_trips_arr.head()

Unnamed: 0,year,month,dayofweek_num,quarter,LINEID,DIRECTION,STOPPOINTID,PROGRNUMBER,PLANNEDTIME_ARR,ACTUALTIME_ARR
0,2018,1,0,1,41,1,1172,3,23760,23820
1,2018,1,0,1,41,1,1173,4,23820,23880
2,2018,1,0,1,41,1,1174,5,23880,24060
3,2018,1,0,1,41,1,1175,6,24000,24180
4,2018,1,0,1,41,1,15,7,24120,24180


In [5]:
# Calendar fields and identifiers are labels rather than quantities, so they
# are stored as pandas categoricals.
for label_column in ('year', 'month', 'dayofweek_num', 'quarter',
                     'PROGRNUMBER', 'STOPPOINTID', 'LINEID', 'DIRECTION'):
    df_Jan_trips_arr[label_column] = df_Jan_trips_arr[label_column].astype('category')

# Planned and actual arrival times stay numeric; unparsable values become NaN
# and the integer type is downcast to the smallest one that fits.
for time_column in ('PLANNEDTIME_ARR', 'ACTUALTIME_ARR'):
    df_Jan_trips_arr[time_column] = pd.to_numeric(
        df_Jan_trips_arr[time_column], downcast='integer', errors='coerce')

### Object types and valid entries

In [6]:
# `info()` writes its report directly to stdout and returns None, so it is not
# wrapped in print(). `show_counts` replaces the deprecated `null_counts`
# keyword and forces the non-null counts to be shown for this large frame.
df_Jan_trips_arr.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9600545 entries, 0 to 9600544
Data columns (total 10 columns):
 #   Column           Non-Null Count    Dtype   
---  ------           --------------    -----   
 0   year             9600545 non-null  category
 1   month            9600545 non-null  category
 2   dayofweek_num    9600545 non-null  category
 3   quarter          9600545 non-null  category
 4   LINEID           9600545 non-null  category
 5   DIRECTION        9600545 non-null  category
 6   STOPPOINTID      9600545 non-null  category
 7   PROGRNUMBER      9600545 non-null  category
 8   PLANNEDTIME_ARR  9600545 non-null  int32   
 9   ACTUALTIME_ARR   9600545 non-null  int32   
dtypes: category(8), int32(2)
memory usage: 155.9 MB
None


### Nature of data

In [7]:
# Longer preview (first 100 rows) to inspect the row layout.
df_Jan_trips_arr.head(100)

Unnamed: 0,year,month,dayofweek_num,quarter,LINEID,DIRECTION,STOPPOINTID,PROGRNUMBER,PLANNEDTIME_ARR,ACTUALTIME_ARR
0,2018,1,0,1,41,1,1172,3,23760,23820
1,2018,1,0,1,41,1,1173,4,23820,23880
2,2018,1,0,1,41,1,1174,5,23880,24060
3,2018,1,0,1,41,1,1175,6,24000,24180
4,2018,1,0,1,41,1,15,7,24120,24180
...,...,...,...,...,...,...,...,...,...,...
95,2018,1,0,1,66,1,3958,47,27900,28440
96,2018,1,0,1,66,1,3959,48,27960,28500
97,2018,1,0,1,66,1,3960,49,28020,28500
98,2018,1,0,1,66,1,3961,50,28020,28500


### Unique entries

In [8]:
# Number of distinct values in each column.
df_Jan_trips_arr.nunique()

year                  1
month                 1
dayofweek_num         7
quarter               1
LINEID              126
DIRECTION             2
STOPPOINTID        4602
PROGRNUMBER         102
PLANNEDTIME_ARR    1215
ACTUALTIME_ARR     1222
dtype: int64

### Entries for LINEID 41 DIRECTION 1 which faces exception during handling

In [9]:
# Independent copy of the rows for LINEID "41" travelling in DIRECTION 1.
df_Jan_trips_arr_41_1 = df_Jan_trips_arr[
    (df_Jan_trips_arr["DIRECTION"] == 1) & (df_Jan_trips_arr["LINEID"] == "41")
].copy()

In [10]:
# Display the LINEID 41 / DIRECTION 1 slice (pandas truncates the middle rows).
df_Jan_trips_arr_41_1

Unnamed: 0,year,month,dayofweek_num,quarter,LINEID,DIRECTION,STOPPOINTID,PROGRNUMBER,PLANNEDTIME_ARR,ACTUALTIME_ARR
0,2018,1,0,1,41,1,1172,3,23760,23820
1,2018,1,0,1,41,1,1173,4,23820,23880
2,2018,1,0,1,41,1,1174,5,23880,24060
3,2018,1,0,1,41,1,1175,6,24000,24180
4,2018,1,0,1,41,1,15,7,24120,24180
...,...,...,...,...,...,...,...,...,...,...
9592293,2018,1,2,1,41,1,4912,47,84180,83940
9592294,2018,1,2,1,41,1,4913,48,84180,84000
9592295,2018,1,2,1,41,1,4914,49,84240,84000
9592296,2018,1,2,1,41,1,4915,50,84240,84060


In [11]:
# `show_counts` replaces the deprecated `null_counts` keyword of DataFrame.info().
df_Jan_trips_arr_41_1.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71010 entries, 0 to 9592297
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   year             71010 non-null  category
 1   month            71010 non-null  category
 2   dayofweek_num    71010 non-null  category
 3   quarter          71010 non-null  category
 4   LINEID           71010 non-null  category
 5   DIRECTION        71010 non-null  category
 6   STOPPOINTID      71010 non-null  category
 7   PROGRNUMBER      71010 non-null  category
 8   PLANNEDTIME_ARR  71010 non-null  int32   
 9   ACTUALTIME_ARR   71010 non-null  int32   
dtypes: category(8), int32(2)
memory usage: 1.9 MB


In [12]:
# Number of distinct values in each column of the LINEID 41 / DIRECTION 1 slice.
df_Jan_trips_arr_41_1.nunique()

year                  1
month                 1
dayofweek_num         7
quarter               1
LINEID                1
DIRECTION             1
STOPPOINTID          51
PROGRNUMBER          51
PLANNEDTIME_ARR    1117
ACTUALTIME_ARR     1131
dtype: int64

## Train model for each LINEID for each direction

### Obtain pairs of LINEID for either direction
* Some LINEIDs have valid entries for a single direction only. 
Particularly for the January data, the following combinations produce errors during data handling:
* line 41A direction 1
* line 77X direction 1
* line 51X direction 1
* line 46E direction 1
* line 118 direction 1
* line 68X direction 1

Hence, the following step avoids exceptions during model creation.

In [13]:
# Read the dominant-route table and keep only the LINEID / DIRECTION columns;
# each row is one pair that gets its own model.
df_line_dir_pair = pd.read_csv(
    '../DB/dominant_route.csv', index_col=False
).loc[:, ['LINEID', 'DIRECTION']]

In [14]:
# Display the LINEID / DIRECTION pairs that will be modelled.
df_line_dir_pair

Unnamed: 0,LINEID,DIRECTION
0,41,1
1,41,2
2,66,1
3,66,2
4,77A,1
...,...,...
241,15D,1
242,15D,2
243,68X,2
244,33D,1


## Model

### Regression pipeline

In [15]:
def regressorModel(num_col,cat_col,regression_algo = "Linear"):
    """Build an unfitted regression pipeline wrapped in a target scaler.

    The returned estimator standard-scales the columns in ``num_col``,
    one-hot encodes the columns in ``cat_col``, feeds the result to the
    selected regression algorithm, and standard-scales the target through
    ``TransformedTargetRegressor``.

    Parameters
    ----------
    num_col : list of str
        Names of the numerical input columns.
    cat_col : list of str
        Names of the categorical input columns.
    regression_algo : {"Linear", "LGBM"}, default "Linear"
        Which regressor to place at the end of the pipeline.

    Returns
    -------
    TransformedTargetRegressor
        Unfitted estimator; call ``.fit(X, y)`` on it.

    Raises
    ------
    KeyError
        If ``regression_algo`` is not one of the supported names.
    """
    # Map names to classes so that only the requested estimator is built.
    algorithms = {"Linear": LinearRegression, "LGBM": LGBMRegressor}
    if regression_algo not in algorithms:
        raise KeyError(
            "Unknown regression_algo %r; expected one of %s"
            % (regression_algo, sorted(algorithms)))

    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), num_col),
        # Categories that were absent during training (for example a month
        # other than the one the model was fitted on) are encoded as all
        # zeros instead of raising at prediction time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_col)
    ])

    pipe_lin_reg = make_pipeline(preprocessor, algorithms[regression_algo]())

    # The target gets its own scaler instance rather than sharing the one
    # used for the input features.
    regressor = TransformedTargetRegressor(regressor=pipe_lin_reg, transformer=StandardScaler())
    return regressor

### list of feature based on datatypes

In [16]:
def get_column_lists(df):
    """Split the column names of ``df`` by dtype.

    Returns a ``(num_col, cat_col)`` tuple: ``cat_col`` lists the columns of
    ``category`` dtype, and ``num_col`` lists every column whose dtype is not
    ``category``, ``object`` or ``datetime``.
    """
    categorical_names = df.select_dtypes(include=['category']).columns.tolist()
    excluded_kinds = ['category', 'object', 'datetime']
    numerical_names = df.select_dtypes(exclude=excluded_kinds).columns.tolist()

    return numerical_names, categorical_names

###  Train model with data for LINEID x  and  DIRECTION y

In [17]:
# Train and persist one regression model per (LINEID, DIRECTION) pair.
for line, direction in zip(df_line_dir_pair['LINEID'], df_line_dir_pair['DIRECTION']):

    # Rows of the January data that belong to this LINEID / DIRECTION pair.
    df_Jan_trips_arr_LINE_DIR = df_Jan_trips_arr.loc[(df_Jan_trips_arr["LINEID"] == line) &
                                                     (df_Jan_trips_arr["DIRECTION"] == direction)].reset_index(drop=True)

    # A pair without any rows cannot be fitted; skip it explicitly instead of
    # relying on the estimator to raise.
    if df_Jan_trips_arr_LINE_DIR.empty:
        print(line, direction, "skipped: no rows for this LINEID / DIRECTION pair")
        continue

    # INPUT and TARGET features
    X = df_Jan_trips_arr_LINE_DIR[["year","month","dayofweek_num","quarter","PROGRNUMBER",
                                   "STOPPOINTID","PLANNEDTIME_ARR"]]
    Y = df_Jan_trips_arr_LINE_DIR[["ACTUALTIME_ARR"]]

    # Train model
    num_col,cat_col = get_column_lists(X)
    regressor = regressorModel(num_col,cat_col,regression_algo="Linear")
    try:
        model = regressor.fit(X, Y)
    except Exception as e:
        # Best effort: report the failing pair concisely and carry on with the rest.
        print(line, direction, "model fit failed:", repr(e), "| X shape:", X.shape, "| Y shape:", Y.shape)
        continue

    # Save model; the context manager closes the file even if pickling fails.
    filename = str(line)+"_"+str(direction)+'.pkl'
    with open("models/06/"+filename, 'wb') as model_file:
        pickle.dump(model, model_file)

## Validation and Evaluation

### Model validation

In [18]:
def calc_Regression(X, y, model, scoring='R2', cv=3):
    """Score an already-fitted model on ``cv`` random subsets of ``X`` / ``y``.

    In round ``i`` the data is split with ``random_state=i`` and the model is
    scored on the 80% test portion. The model is never re-fitted here, so if
    it was trained on the same ``X`` / ``y`` the scores are in-sample.

    Parameters
    ----------
    X, y : inputs and targets accepted by ``train_test_split`` and ``model.predict``.
    model : fitted estimator exposing ``predict``.
    scoring : {'R2', 'RMSE'}, default 'R2'
        Metric computed in every round.
    cv : int, default 3
        Number of rounds.

    Returns
    -------
    list of float
        One score per round.

    Raises
    ------
    ValueError
        If ``scoring`` is not a supported metric name.
    """
    # The metric functions are looked up lazily inside the lambdas.
    scorers = {
        'R2': lambda truth, predicted: metrics.r2_score(truth, predicted),
        'RMSE': lambda truth, predicted: math.sqrt(metrics.mean_squared_error(truth, predicted)),
    }
    # Reject unknown metric names up front; previously this surfaced as an
    # unbound local variable inside the loop.
    if scoring not in scorers:
        raise ValueError(
            "Unsupported scoring %r; expected one of %s" % (scoring, sorted(scorers)))
    score_fn = scorers[scoring]

    # store results
    results = []
    # evaluate cv times and append to results
    for i in range(cv):
        # Only the test portion is used; the train portion is discarded.
        _, X_test, _, y_test = train_test_split(X, y, random_state=i , test_size=0.8)
        # Prediction
        y_predict = model.predict(X_test)
        results.append(score_fn(y_test, y_predict))
    return results

In [19]:
def validation(X,y,model, cvVal=3):
    """Average the R2 and RMSE scores of ``model`` over ``cvVal`` rounds.

    Each metric is computed by ``calc_Regression`` and the per-round scores
    are averaged. The result is a DataFrame indexed by metric name with a
    single column named 'Regression'.
    """
    averaged_scores = {}
    for metric_name in ('R2', 'RMSE'):
        round_scores = calc_Regression(X, y, model, scoring=metric_name, cv=cvVal)
        averaged_scores[metric_name] = sum(round_scores) / len(round_scores)

    return pd.DataFrame.from_dict(averaged_scores, orient='index', columns=['Regression'])

### Model evaluation

#### Load data

In [20]:
# load dataframe
# Rows for LINEID "1" travelling in DIRECTION 2, copied so the slice is
# independent of the full table.
df_Jan_trips_arr_LINE_DIR = df_Jan_trips_arr[
    (df_Jan_trips_arr["LINEID"] == "1") & (df_Jan_trips_arr["DIRECTION"] == 2)
].copy()

# Model inputs (X) and the target column (Y, kept as a one-column frame).
X = df_Jan_trips_arr_LINE_DIR.loc[:, ["year", "month", "dayofweek_num", "quarter",
                                      "PROGRNUMBER", "STOPPOINTID", "PLANNEDTIME_ARR"]]
Y = df_Jan_trips_arr_LINE_DIR.loc[:, ["ACTUALTIME_ARR"]]

In [21]:
# First rows of the LINEID 1 / DIRECTION 2 slice.
df_Jan_trips_arr_LINE_DIR.head()

Unnamed: 0,year,month,dayofweek_num,quarter,LINEID,DIRECTION,STOPPOINTID,PROGRNUMBER,PLANNEDTIME_ARR,ACTUALTIME_ARR
13291,2018,1,0,1,1,2,381,1,36000,36120
13292,2018,1,0,1,1,2,382,2,36000,36120
13293,2018,1,0,1,1,2,4451,3,36060,36180
13294,2018,1,0,1,1,2,383,4,36120,36180
13295,2018,1,0,1,1,2,384,5,36180,36180


In [22]:
# Display the input features that will be passed to the model.
X

Unnamed: 0,year,month,dayofweek_num,quarter,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR
13291,2018,1,0,1,1,381,36000
13292,2018,1,0,1,2,382,36000
13293,2018,1,0,1,3,4451,36060
13294,2018,1,0,1,4,383,36120
13295,2018,1,0,1,5,384,36180
...,...,...,...,...,...,...,...
9594487,2018,1,2,1,39,222,85020
9594488,2018,1,2,1,40,223,85080
9594489,2018,1,2,1,41,224,85140
9594490,2018,1,2,1,42,225,85140


#### Load model

In [23]:
# load the model 
# NOTE: unpickling can execute arbitrary code; only load model files that were
# produced by this notebook.
# The context manager closes the file handle once the model is loaded.
with open("models/06/1_2.pkl", 'rb') as model_file:
    model = pickle.load(model_file)

#### Model validity

In [24]:
# Display model validity
# Averaged R2 and RMSE of the loaded model on the LINEID 1 / DIRECTION 2 data.
print(validation(X,Y,model))

      Regression
R2      0.999660
RMSE  281.981575


#### Test run

In [25]:
# Compare the first ten actual arrival times with the model's predictions for the same rows.
print("Y test\n",Y[:10].reset_index(drop=True),"\n")
print("Y predict\n",model.predict(X[:10]))

Y test
    ACTUALTIME_ARR
0           36120
1           36120
2           36180
3           36180
4           36180
5           36240
6           36300
7           36360
8           36360
9           36420 

Y predict
 [[35991.52133728]
 [36008.21470644]
 [36079.38863729]
 [36122.16883348]
 [36168.09179889]
 [36166.83881834]
 [36237.26163252]
 [36338.16149561]
 [36344.75909849]
 [36403.6269209 ]]


## Generate PLANNEDTIME_ARR array and PREDICT
* **Trained models are used for prediction of bus arrivals at time passed from webapp frontend**
* **Planned time arrival timetable for each BUSSTOP is not available**
* **Hence, we are using all unique values available for PLANNEDTIME_ARR for a particular STOPPOINTID on a LINEID for training and prediction**
<br>

* **Algorithm**
    * **INPUT : YEAR, MONTH, DAY_OF_WEEK, QUARTER, DIRECTION, list[STOPOINTID], LINEID, PROGRNUMBER, TIME_INPUT**
    * **generate dataframe with schema [YEAR, MONTH, DAY_OF_WEEK, QUARTER, DIRECTION, LINEID, PROGRNUMBER, PLANNEDTIME_ARR,LIST_STOPOINTID]**
    * **set t = TIME_INPUT**    
    * **For each LINEID-DIRECTION-STOPPOINTID, DO :**
        * **timetable = fetch entries from Jan_trips_arr_TIMETABLE.csv for the tuple**
        * **assign (first entry in timetable > t) to PLANNEDTIME_ARR_present_tuple**
        * **assign t = PLANNEDTIME_ARR_present_tuple**
        * **append to dataframe**   
    * **predict results**
       

### Generate STOPPOINTID data for LINE 41 DIRECTION 1 : list ordered by PROGRNUMBER

In [26]:
# load dataframe; and select STOPPOINTID data ordered by program number
# Distinct (PROGRNUMBER, STOPPOINTID) pairs for LINEID "41" / DIRECTION 1,
# ordered by PROGRNUMBER. The variable name says 1_1 but the filter selects line 41.
df_Jan_trips_arr_1_1 = (
    df_Jan_trips_arr
    .loc[(df_Jan_trips_arr["LINEID"] == "41") & (df_Jan_trips_arr["DIRECTION"] == 1),
         ['PROGRNUMBER', 'STOPPOINTID']]
    .sort_values(by=['PROGRNUMBER'])
    .drop_duplicates()
    .copy()
)

# Parallel lists: the i-th stop is visited at the i-th program number.
list_PROGRNUMBER = df_Jan_trips_arr_1_1['PROGRNUMBER'].tolist()
list_STOPPOINTID = df_Jan_trips_arr_1_1['STOPPOINTID'].tolist()
print("Stops\n",list_STOPPOINTID,"\n\nProgram Numbers\n",list_PROGRNUMBER)

Stops
 [288, 1171, 1172, 1173, 1174, 1175, 15, 17, 18, 19, 21, 7602, 85, 203, 204, 205, 1620, 220, 1622, 1623, 1624, 1625, 1626, 1627, 1628, 1629, 1630, 7348, 3669, 3671, 3672, 3674, 3675, 3676, 5073, 5074, 6054, 4330, 3679, 5075, 5076, 3682, 3864, 3865, 4910, 4911, 4912, 4913, 4914, 4915, 4957] 

Program Numbers
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51]


### prediction model

In [27]:
def predict(YEAR, MONTH, DAY_OF_WEEK, QUARTER, LINEID, DIRECTION, PROGRNUMBER, TIME_INPUT,LIST_STOPOINTID):
    """Print predicted arrival times along a line for the next departure after ``TIME_INPUT``.

    For every stop in ``LIST_STOPOINTID`` (paired position-wise with
    ``PROGRNUMBER``):

    1. look up the planned arrival times of the LINEID / DIRECTION at the stop
       in the timetable file;
    2. take the smallest planned time that is strictly greater than the
       current reference time and use it as PLANNEDTIME_ARR for the stop;
    3. make that planned time the reference time for the next stop.

    The resulting feature rows are printed and passed to the pickled model of
    the LINEID / DIRECTION pair, whose predictions are printed as well.

    Parameters
    ----------
    YEAR, MONTH, DAY_OF_WEEK, QUARTER : calendar features, copied into every row.
    LINEID, DIRECTION : select the timetable rows and the model file
        ``models/06/<LINEID>_<DIRECTION>.pkl``.
    PROGRNUMBER : sequence of program numbers, aligned with ``LIST_STOPOINTID``.
    TIME_INPUT : reference time; arrivals strictly after it are searched.
    LIST_STOPOINTID : sequence of stop ids in visiting order.

    Raises
    ------
    ValueError
        If a stop has no planned arrival later than the reference time.
    """
    feature_columns = ['year','month','dayofweek_num','quarter','PROGRNUMBER','STOPPOINTID','PLANNEDTIME_ARR']

    # fetch TIMETABLE rows of this LINEID / DIRECTION
    df_TIMETABLE = pd.read_csv("../DB/ML/Jan_trips_arr_TIMETABLE.csv").query('LINEID == @LINEID and DIRECTION == @DIRECTION')

    # reference time: the bus reaching each stop after this time is searched
    t = TIME_INPUT
    feature_rows = []
    for stop, progrNum in zip(LIST_STOPOINTID,PROGRNUMBER):
        list_timeArr = df_TIMETABLE.query('STOPPOINTID == @stop')['PLANNEDTIME_ARR'].to_list()
        # Smallest planned arrival strictly after t; None when there is none.
        TIME_ARR = min((planned for planned in list_timeArr if planned > t), default=None)
        if TIME_ARR is None:
            raise ValueError(
                "No planned arrival after %r for LINEID %r DIRECTION %r at STOPPOINTID %r"
                % (t, LINEID, DIRECTION, stop))
        feature_rows.append([YEAR, MONTH, DAY_OF_WEEK, QUARTER, progrNum, stop,  TIME_ARR])
        # The arrival at this stop becomes the reference time for the next stop
        t = TIME_ARR

    # Build the frame once instead of growing it row by row.
    df_X = pd.DataFrame(feature_rows, columns=feature_columns)
    print("input dataframe \n",df_X)

    # Fetch prediction model (format() also accepts non-string LINEID / DIRECTION).
    # NOTE: unpickling can execute arbitrary code; only load trusted model files.
    filename = "models/06/{}_{}.pkl".format(LINEID, DIRECTION)
    with open(filename, 'rb') as model_file:
        model = pickle.load(model_file)
    print("Model prediction \n",model.predict(df_X))

### Predict arrival times for the next bus of LINEID 41 DIRECTION 1 after 28800 (8 o'clock)

In [28]:
# Arguments: year, month, day of week, quarter, LINEID, DIRECTION, program numbers,
# reference time (28800), stop ids.
predict(2018, 1, 1, 1, "41", "1",  list_PROGRNUMBER,28800 ,list_STOPPOINTID)

input dataframe 
     year month dayofweek_num quarter PROGRNUMBER STOPPOINTID PLANNEDTIME_ARR
0   2018     1             1       1           1         288           29400
1   2018     1             1       1           2        1171           29460
2   2018     1             1       1           3        1172           29520
3   2018     1             1       1           4        1173           29580
4   2018     1             1       1           5        1174           29640
5   2018     1             1       1           6        1175           29760
6   2018     1             1       1           7          15           29820
7   2018     1             1       1           8          17           29940
8   2018     1             1       1           9          18           30000
9   2018     1             1       1          10          19           30060
10  2018     1             1       1          11          21           30120
11  2018     1             1       1          12        76

**OUTCOMES AND OBSERVATIONS**
* **THE ERROR BETWEEN THE PREDICTION AND THE PLANNED ARRIVAL TIME SEEMS SATISFACTORY AT A GLANCE**
* **HOWEVER, IT SHOULD BE NOTED THAT THE TIMETABLE FETCHED PLAYS A MAJOR ROLE IN SELECTING THE PLANNEDTIME_ARR PARAMETER**
    * **IN THE ABOVE EXAMPLE, THE TOTAL JOURNEY TIME IMPLIED BY THE PLANNEDTIME_ARR INPUT IS $(33060-29400)\ /60 = 61$ MINUTES, WHEREAS THE [IDEAL JOURNEY TIME FOR LINE 41](https://www.dublinbus.ie/Your-Journey1/Timetables/All-Timetables/412/) DEFINED BY THE DUBLIN BUS OPERATORS IS ITSELF NEARLY $68$ MINUTES**
    * THIS HAPPENED DUE TO THE ROUND-OFF-TO-MINUTES OPERATION, WHICH FLOORED THE PLANNEDTIME_ARR COLUMN TO THE NEAREST MINUTE VALUE