# Regressors: Model for each route in each direction
* **Notebook generates linear regression models for each bus route in either direction** 
* **TIME parameters are considered in minutes resolution**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.pipeline import make_pipeline
from lightgbm import LGBMRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder 
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn import metrics
import math
from sklearn.preprocessing import KBinsDiscretizer

# Using sklearn to split data into training and testing sets,train classifier and regressor models 
from sklearn.model_selection import train_test_split

#pickle package saves and loads sklearn models
import pickle

from warnings import filterwarnings
filterwarnings('ignore')

## Source of Truth gathered in prior EDA
* **TRIPID**
    * **TRIPIDs are shared across days for a particular LINEID on a particular ROUTEID**
    * For a single TRIPID, only one corresponding LINEID and ROUTEID are present
    * TRIPID for inbound and outbound directions are mutually exclusive
    * For certain TRIPIDs, PROGRNUMBER does not start at 1 although it should, which means some entries are missing.
* **LINEID**
    * **For a LINEID; ROUTEIDs and TRIPIDs used for DIFFERENT DIRECTION are mutually exclusive**
* **ROUTEID**
    * A LINEID has multiple ROUTEID 
    * A LINEID may / may not have trips with both INBOUND-going(1)/OUTBOUND-returning(2) direction 
    * **THOUGH 2 ROUTES FOR A LINEID ARE HAVING SAME DIRECTION; STOPPOINTIDs VISITED ARE DIFFERENT**
    * **FOR A SINGLE ROUTE; STOPS VISITED DURING DIFFERENT TRIPS ARE DIFFERENT**
    * **ROUTEIDs ARE NOT KNOWN TO USER & ROUTEIDs ARE NOT TIMEBOUND i.e. timetable DOES NOT exists to follow a ROUTEID**
    * **majority of data for a LINEID is tied with a particular ROUTEID**
* **PLANNED ARRIVAL AND DEPARTURE TIMES ARE SAME FOR ALL ENTRIES; HENCE "PLANNEDTIME_DEP" CAN BE DROPPED**    

## Read merged file : df_weather with df_leavetimes with df_trips on 'DAYOFSERVICE','TRIPID'

In [3]:
# Load the merged weather / leave-times / trips table that the rest of the notebook slices per LINEID and DIRECTION.
# index_col=False tells pandas not to use the first column as the index.
df_weather_leavetimes_trips_arr_MINUTES = pd.read_csv("../DB/ML/weather_leavetimes_trips_arr_MINUTES.csv",skip_blank_lines=True,index_col=False)

In [4]:
print("Shape of dataframe",df_weather_leavetimes_trips_arr_MINUTES.shape)
df_weather_leavetimes_trips_arr_MINUTES.head()

Shape of dataframe (101536033, 14)


Unnamed: 0,DAYOFSERVICE,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,ACTUALTIME_ARR,LINEID,DIRECTION,feels_like,wind_speed,weather_id,year,month,dayofweek_num,quarter
0,2018-04-01,1,288,23700,23998,41,1,-1.6,3.6,803,2018,4,6,2
1,2018-04-01,2,1171,23700,24029,41,1,-1.6,3.6,803,2018,4,6,2
2,2018-04-01,3,1172,23760,24081,41,1,-1.6,3.6,803,2018,4,6,2
3,2018-04-01,4,1173,23820,24160,41,1,-1.6,3.6,803,2018,4,6,2
4,2018-04-01,5,1174,23880,24242,41,1,-1.6,3.6,803,2018,4,6,2


In [5]:
# Discrete label columns are stored as pandas categoricals
for label_column in ('year', 'month', 'dayofweek_num', 'quarter', 'PROGRNUMBER',
                     'STOPPOINTID', 'LINEID', 'DIRECTION', 'weather_id'):
    df_weather_leavetimes_trips_arr_MINUTES[label_column] = (
        df_weather_leavetimes_trips_arr_MINUTES[label_column].astype('category')
    )

# Planned / actual arrival times: numeric, downcast to the smallest integer type that fits;
# values that cannot be parsed become NaN
df_weather_leavetimes_trips_arr_MINUTES['PLANNEDTIME_ARR'] = pd.to_numeric(
    df_weather_leavetimes_trips_arr_MINUTES['PLANNEDTIME_ARR'], downcast='integer', errors='coerce')
df_weather_leavetimes_trips_arr_MINUTES['ACTUALTIME_ARR'] = pd.to_numeric(
    df_weather_leavetimes_trips_arr_MINUTES['ACTUALTIME_ARR'], downcast='integer', errors='coerce')

# Continuous weather measurements: numeric, values that cannot be parsed become NaN
df_weather_leavetimes_trips_arr_MINUTES['feels_like'] = pd.to_numeric(
    df_weather_leavetimes_trips_arr_MINUTES['feels_like'], errors='coerce')
df_weather_leavetimes_trips_arr_MINUTES['wind_speed'] = pd.to_numeric(
    df_weather_leavetimes_trips_arr_MINUTES['wind_speed'], errors='coerce')

### Object types and valid entries

In [6]:
# DataFrame.info() writes its report itself and returns None, so it is not wrapped in print().
# `show_counts` is the replacement for the `null_counts` argument, which newer pandas no longer accepts.
df_weather_leavetimes_trips_arr_MINUTES.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101536033 entries, 0 to 101536032
Data columns (total 14 columns):
 #   Column           Non-Null Count      Dtype   
---  ------           --------------      -----   
 0   DAYOFSERVICE     101536033 non-null  object  
 1   PROGRNUMBER      101536033 non-null  category
 2   STOPPOINTID      101536033 non-null  category
 3   PLANNEDTIME_ARR  101536033 non-null  int32   
 4   ACTUALTIME_ARR   101536033 non-null  int32   
 5   LINEID           101536033 non-null  category
 6   DIRECTION        101536033 non-null  category
 7   feels_like       101536033 non-null  float64 
 8   wind_speed       101536033 non-null  float64 
 9   weather_id       101536033 non-null  category
 10  year             101536033 non-null  category
 11  month            101536033 non-null  category
 12  dayofweek_num    101536033 non-null  category
 13  quarter          101536033 non-null  category
dtypes: category(9), float64(2), int32(2), object(1)
memory usage: 

### Nature of data

In [7]:
df_weather_leavetimes_trips_arr_MINUTES.head()

Unnamed: 0,DAYOFSERVICE,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,ACTUALTIME_ARR,LINEID,DIRECTION,feels_like,wind_speed,weather_id,year,month,dayofweek_num,quarter
0,2018-04-01,1,288,23700,23998,41,1,-1.6,3.6,803,2018,4,6,2
1,2018-04-01,2,1171,23700,24029,41,1,-1.6,3.6,803,2018,4,6,2
2,2018-04-01,3,1172,23760,24081,41,1,-1.6,3.6,803,2018,4,6,2
3,2018-04-01,4,1173,23820,24160,41,1,-1.6,3.6,803,2018,4,6,2
4,2018-04-01,5,1174,23880,24242,41,1,-1.6,3.6,803,2018,4,6,2


### Unique entries

In [8]:
# df_weather_leavetimes_trips_arr_MINUTES.nunique()

In [34]:
df_weather_leavetimes_trips_arr_MINUTES.nunique()

DAYOFSERVICE         360
PROGRNUMBER          109
STOPPOINTID         4523
PLANNEDTIME_ARR     1226
ACTUALTIME_ARR     72804
LINEID               130
DIRECTION              2
feels_like           355
wind_speed            57
weather_id            20
year                   1
month                 12
dayofweek_num          7
quarter                4
dtype: int64

### Entries for LINEID 41 DIRECTION 1 which faces exception during handling

In [9]:
# df_weather_leavetimes_trips_arr_MINUTES_41_1 = df_weather_leavetimes_trips_arr_MINUTES.loc[(df_Jan_trips_arr["LINEID"] == "41") & 
#                                                  (df_Jan_trips_arr["DIRECTION"] == 1)].copy()

In [10]:
# df_weather_leavetimes_trips_arr_MINUTES_41_1

In [11]:
# df_weather_leavetimes_trips_arr_MINUTES_41_1.info(null_counts=True)

In [12]:
# df_weather_leavetimes_trips_arr_MINUTES_41_1.nunique()

## Train model for each LINEID for each direction

### Obtain pairs of LINEID for either direction
* Some LINEIDs have valid Entries for a single direction only. 
Particularly for January data, following combinations produce error while data handling
* line 41A direction 1
* line 77X direction 1
* line 51X direction 1
* line 46E direction 1
* line 118 direction 1
* line 68X direction 

Hence, the following step avoids exceptions during model creation.

In [13]:
# Keep only the LINEID and DIRECTION columns of the dominant-route table;
# the training loop below fits one model per row (pair) of this frame.
df_line_dir_pair = pd.read_csv('../DB/dominant_route_2018.csv',index_col=False)[['LINEID','DIRECTION']]

In [14]:
df_line_dir_pair

Unnamed: 0,LINEID,DIRECTION
0,41,1
1,41,2
2,66,1
3,66,2
4,77A,1
...,...,...
247,40E,2
248,33E,1
249,16D,1
250,41D,1


## Model

### Regression pipeline

In [15]:
def regressorModel(num_col,cat_col,regression_algo = "Linear"):
    '''
    Build an unfitted regression pipeline wrapped in a TransformedTargetRegressor.

    Numerical input columns are standardised, categorical input columns are
    ordinal-encoded, and the target is standardised by the wrapping
    TransformedTargetRegressor.

    Parameters
    ----------
    num_col : list
        Names of the numerical input columns (passed to StandardScaler).
    cat_col : list
        Names of the categorical input columns (passed to OrdinalEncoder).
    regression_algo : str, default "Linear"
        Which estimator closes the pipeline: "Linear" (LinearRegression)
        or "LGBM" (LGBMRegressor).

    Returns
    -------
    TransformedTargetRegressor
        Unfitted regressor; call ``fit(X, Y)`` on it.

    Raises
    ------
    KeyError
        If ``regression_algo`` is not one of the supported names.
    '''
    # Classes, not instances: only the requested estimator is constructed
    algorithms = {"Linear": LinearRegression, "LGBM": LGBMRegressor}
    if regression_algo not in algorithms:
        raise KeyError(
            "Unsupported regression_algo {!r}; expected one of {}".format(
                regression_algo, sorted(algorithms)))

    # NOTE(review): ordinal codes are fed to the estimator as plain numbers, and
    # OrdinalEncoder with default settings rejects categories it did not see
    # during fit - confirm both are acceptable for the prediction inputs.
    feature_transformer = ColumnTransformer([
        ('num', StandardScaler(), num_col),
        ('cat', OrdinalEncoder(), cat_col)
    ])

    pipe_lin_reg = make_pipeline(feature_transformer, algorithms[regression_algo]())

    # The target gets its own scaler instance, independent of the feature scaler
    regressor = TransformedTargetRegressor(regressor=pipe_lin_reg, transformer=StandardScaler())
    return regressor

### list of feature based on datatypes

In [16]:
def get_column_lists(df):
    '''
    Split the columns of a dataframe by dtype.

    Returns a tuple ``(num_col, cat_col)``: ``num_col`` lists every column
    whose dtype is not category, object or datetime; ``cat_col`` lists every
    column with category dtype. Both lists follow the dataframe's column order.
    '''
    categorical_names = df.select_dtypes(include=['category']).columns.tolist()

    non_numerical_dtypes = ['category', 'object', 'datetime']
    numerical_names = df.select_dtypes(exclude=non_numerical_dtypes).columns.tolist()

    return numerical_names, categorical_names

###  Train model with data for LINEID x  and  DIRECTION y

In [17]:
for _, row in df_line_dir_pair.iterrows():

    # One model is trained per (LINEID, DIRECTION) pair
    line = row['LINEID']
    direction = row['DIRECTION']

    # Rows of the merged table that belong to this pair, re-indexed from 0
    df_weather_leavetimes_trips_arr_MINUTES_LINE_DIR = df_weather_leavetimes_trips_arr_MINUTES.loc[(df_weather_leavetimes_trips_arr_MINUTES["LINEID"] == line) &
                                                     (df_weather_leavetimes_trips_arr_MINUTES["DIRECTION"] == direction)].reset_index(drop=True)

    # INPUT and TARGET features (the slice above is already re-indexed)
    X = df_weather_leavetimes_trips_arr_MINUTES_LINE_DIR[["year","month","dayofweek_num","quarter","PROGRNUMBER",
                         "STOPPOINTID","PLANNEDTIME_ARR","feels_like","wind_speed","weather_id"]]
    Y = df_weather_leavetimes_trips_arr_MINUTES_LINE_DIR[["ACTUALTIME_ARR"]]

    # Train model
    num_col,cat_col = get_column_lists(X)

    regressor = regressorModel(num_col,cat_col,regression_algo="Linear")
    try:
        model = regressor.fit(X, Y)
    except Exception as e:
        # Report the failing pair and carry on with the remaining pairs
        print(line, direction,"\nInput features\n" ,X.head(),"\nTarget features\n" ,Y.head(), e)
        continue

    # Save model; the context manager closes the file even if pickling fails
    filename = str(line)+"_"+str(direction)+'.pkl'
    with open("models/09/"+filename, 'wb') as model_file:
        pickle.dump(model, model_file)

    # Release the per-pair frames before the next iteration
    del X
    del Y
    del df_weather_leavetimes_trips_arr_MINUTES_LINE_DIR

## Validation and Evaluation

### Model validation

In [18]:
def calc_Regression(X, y, model, scoring='R2', cv=3):
    """Score an already-fitted regression model on repeated random subsets of (X, y).

    In each of ``cv`` rounds, ``train_test_split`` holds out a random 80% of the
    rows (``random_state`` is the round index) and the model's predictions on
    those rows are scored. The model is not refitted inside this function.

    NOTE(review): if ``model`` was fitted on these same rows, the scores are
    in-sample scores rather than a cross-validation estimate - confirm this
    is what is intended.

    Parameters
    ----------
    X, y : features and target, split together by ``train_test_split``.
    model : fitted estimator exposing ``predict``.
    scoring : {'R2', 'RMSE', 'MAE'}, default 'R2'
    cv : int, default 3
        Number of scoring rounds.

    Returns
    -------
    list
        One score per round.

    Raises
    ------
    ValueError
        If ``scoring`` is not one of the supported metric names.
    """
    # Metric functions are wrapped in lambdas so they are only resolved when called
    scorers = {
        'R2': lambda y_true, y_pred: metrics.r2_score(y_true, y_pred),
        'RMSE': lambda y_true, y_pred: math.sqrt(metrics.mean_squared_error(y_true, y_pred)),
        'MAE': lambda y_true, y_pred: metrics.mean_absolute_error(y_true, y_pred),
    }
    # Fail fast: previously an unknown name left `score` unbound inside the loop
    if scoring not in scorers:
        raise ValueError(
            "Unsupported scoring {!r}; expected one of {}".format(scoring, sorted(scorers)))
    score_fn = scorers[scoring]

    results = []
    for i in range(cv):
        # Only the held-out part is needed; the train part is discarded
        _, X_test, _, y_test = train_test_split(X, y, random_state=i , test_size=0.8)
        y_predict = model.predict(X_test)
        results.append(score_fn(y_test, y_predict))
    return results

In [19]:
def validation(X,y,model, cvVal=3):
    """Average the R2, RMSE and MAE scores of ``model`` over ``cvVal`` rounds.

    Each metric is evaluated with ``calc_Regression`` and the mean of the
    returned scores is kept. The result is a one-column dataframe
    ('Regression') indexed by metric name.
    """
    mean_scores = {}

    for metric_name in ('R2', 'RMSE', 'MAE'):
        round_scores = calc_Regression(X, y, model, cv=cvVal, scoring=metric_name)
        mean_scores[metric_name] = sum(round_scores) / len(round_scores)

    return pd.DataFrame.from_dict(mean_scores, orient='index', columns=['Regression'])

### Model evaluation

#### Load data

In [20]:
# Evaluation slice: every row of LINEID "1" travelling in DIRECTION 2
df_weather_trips_arr_LINE_DIR = df_weather_leavetimes_trips_arr_MINUTES[
    (df_weather_leavetimes_trips_arr_MINUTES["LINEID"] == "1")
    & (df_weather_leavetimes_trips_arr_MINUTES["DIRECTION"] == 2)
]

# TARGET feature first, then the INPUT features, both taken from the same slice
Y = df_weather_trips_arr_LINE_DIR[["ACTUALTIME_ARR"]]
X = df_weather_trips_arr_LINE_DIR[[
    "year", "month", "dayofweek_num", "quarter", "PROGRNUMBER",
    "STOPPOINTID", "PLANNEDTIME_ARR", "feels_like", "wind_speed", "weather_id",
]]

In [21]:
df_weather_trips_arr_LINE_DIR.head()

Unnamed: 0,DAYOFSERVICE,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,ACTUALTIME_ARR,LINEID,DIRECTION,feels_like,wind_speed,weather_id,year,month,dayofweek_num,quarter
15072,2018-04-01,4,383,36120,36213,1,2,-1.2,6.2,803,2018,4,6,2
15073,2018-04-01,5,384,36180,36250,1,2,-1.2,6.2,803,2018,4,6,2
15074,2018-04-01,6,385,36180,36272,1,2,-1.2,6.2,803,2018,4,6,2
15075,2018-04-01,7,387,36240,36366,1,2,-1.2,6.2,803,2018,4,6,2
15076,2018-04-01,8,388,36300,36436,1,2,-1.2,6.2,803,2018,4,6,2


In [22]:
X

Unnamed: 0,year,month,dayofweek_num,quarter,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,feels_like,wind_speed,weather_id
15072,2018,4,6,2,4,383,36120,-1.2,6.2,803
15073,2018,4,6,2,5,384,36180,-1.2,6.2,803
15074,2018,4,6,2,6,385,36180,-1.2,6.2,803
15075,2018,4,6,2,7,387,36240,-1.2,6.2,803
15076,2018,4,6,2,8,388,36300,-1.2,6.2,803
...,...,...,...,...,...,...,...,...,...,...
101530904,2018,9,6,3,39,222,84960,1.6,3.6,801
101530905,2018,9,6,3,40,223,85020,1.6,3.6,801
101530906,2018,9,6,3,41,224,85020,1.6,3.6,801
101530907,2018,9,6,3,42,225,85080,1.6,3.6,801


In [23]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 624638 entries, 15072 to 101530908
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype   
---  ------           --------------   -----   
 0   year             624638 non-null  category
 1   month            624638 non-null  category
 2   dayofweek_num    624638 non-null  category
 3   quarter          624638 non-null  category
 4   PROGRNUMBER      624638 non-null  category
 5   STOPPOINTID      624638 non-null  category
 6   PLANNEDTIME_ARR  624638 non-null  int32   
 7   feels_like       624638 non-null  float64 
 8   wind_speed       624638 non-null  float64 
 9   weather_id       624638 non-null  category
dtypes: category(7), float64(2), int32(1)
memory usage: 21.6 MB


#### Load model

In [24]:
# load the model; the context manager closes the file handle once unpickling is done
# NOTE: pickle.load executes code embedded in the file, so only load model files produced by this notebook
with open("models/09/1_2.pkl", 'rb') as model_file:
    model = pickle.load(model_file)

#### Model validity

In [25]:
# Display model validity
print(validation(X,Y,model))

      Regression
R2      0.999481
RMSE  363.401052
MAE   211.292178


#### Test run

In [26]:
print("Y test\n",Y[:10].reset_index(drop=True),"\n")
print("Y predict\n",model.predict(X[:10]))

Y test
    ACTUALTIME_ARR
0           36213
1           36250
2           36272
3           36366
4           36436
5           36452
6           36505
7           36559
8           36612
9           36647 

Y predict
 [[36175.83327294]
 [36241.74578869]
 [36247.63954925]
 [36313.552065  ]
 [36379.46458075]
 [36385.35834131]
 [36450.35817573]
 [36521.74677943]
 [36584.00856988]
 [36649.92108563]]


## Generate PLANNEDTIME_ARR array and PREDICT
* **Trained models are used for prediction of bus arrivals at time passed from webapp frontend**
* **Planned time arrival timetable for each BUSSTOP is not available**
* **Hence, we are using all unique values available for PLANNEDTIME_ARR for a particular STOPPOINTID on a LINEID for training and prediction**
<br>

* **Algorithm**
    * **INPUT : YEAR, MONTH, DAY_OF_WEEK, QUARTER, DIRECTION, list[STOPOINTID], LINEID, PROGRNUMBER, TIME_INPUT**
    * **generate dataframe with schema [YEAR, MONTH, DAY_OF_WEEK, QUARTER, DIRECTION, LINEID, PROGRNUMBER, PLANNEDTIME_ARR,LIST_STOPOINTID]**
    * **set t = TIME_INPUT**    
    * **For each LINEID-DIRECTION-STOPPOINTID, DO :**
        * **timetable = fetch entries from Jan_trips_arr_TIMETABLE.csv for the tuple**
        * **assign (first entry in timetable > t) to PLANNEDTIME_ARR_present_tuple**
        * **assign t = PLANNEDTIME_ARR_present_tuple**
        * **append to dataframe**   
    * **predict results**
       

### Generate STOPPOINTID data for LINE 41 DIRECTION 1 : ordered list of PROGRNUMBER

In [31]:
# Distinct (PROGRNUMBER, STOPPOINTID) pairs of LINEID "41" DIRECTION 1, ordered by program number
df_weather_trips_arr_1_1 = (
    df_weather_leavetimes_trips_arr_MINUTES.loc[
        (df_weather_leavetimes_trips_arr_MINUTES["LINEID"] == "41")
        & (df_weather_leavetimes_trips_arr_MINUTES["DIRECTION"] == 1),
        ['PROGRNUMBER', 'STOPPOINTID'],
    ]
    .sort_values(by=['PROGRNUMBER'])
    .drop_duplicates()
    .copy()
)

# Parallel lists: element i of one list belongs to element i of the other
list_PROGRNUMBER = df_weather_trips_arr_1_1['PROGRNUMBER'].to_list()
list_STOPPOINTID = df_weather_trips_arr_1_1['STOPPOINTID'].to_list()
print("Stops\n",list_STOPPOINTID,"\n\nProgram Numbers\n",list_PROGRNUMBER)

Stops
 [288, 1171, 1172, 1173, 1174, 1175, 15, 17, 18, 19, 21, 7602, 85, 203, 204, 205, 1620, 220, 1622, 1623, 1624, 1625, 1626, 1627, 1628, 1629, 1630, 7348, 3669, 3671, 3672, 3674, 3675, 3676, 5073, 5074, 6054, 4330, 3679, 5075, 5076, 3682, 3864, 3865, 4910, 4911, 4912, 4913, 4914, 4915, 4957] 

Program Numbers
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51]


### prediction model

In [32]:
def predict(YEAR, MONTH, DAY_OF_WEEK, QUARTER, LINEID, DIRECTION, PROGRNUMBER, TIME_INPUT,LIST_STOPOINTID,dt_in):
    """Build the model input for one trip along a line and print the predicted arrival times.

    The stops are walked in the given order. For each stop the planned arrival
    is the earliest timetable entry that is more than 10 time units later than
    the previous stop's planned arrival (``TIME_INPUT`` for the first stop).
    The weather features of a row are those recorded for the hour
    ``int(planned_arrival / 3600)`` on the date of ``dt_in``.

    Parameters
    ----------
    YEAR, MONTH, DAY_OF_WEEK, QUARTER : int
        Calendar features copied into every row.
    LINEID, DIRECTION
        Select the timetable rows and the model file
        ``models/09/<LINEID>_<DIRECTION>.pkl``.
    PROGRNUMBER : sequence
        Program numbers, parallel to ``LIST_STOPOINTID``.
    TIME_INPUT : int
        Time after which the first planned arrival is searched; same unit as
        the timetable's PLANNEDTIME_ARR column.
    LIST_STOPOINTID : sequence
        Stop ids in travel order.
    dt_in
        Anything ``pd.to_datetime`` accepts; only its date is used, to select
        the weather rows.

    Returns
    -------
    None
        The input dataframe and the predictions are printed.

    Raises
    ------
    ValueError
        If a stop has no timetable entry later than the previous planned arrival.
    KeyError
        If no weather row exists for the hour of a planned arrival.
    """
    feature_columns = ['year','month','dayofweek_num','quarter','PROGRNUMBER','STOPPOINTID','PLANNEDTIME_ARR',
                       'feels_like','wind_speed','weather_id']
    convert_dict = {'year':'int','month':'int','dayofweek_num':'int','quarter':'int',
                    'PROGRNUMBER':'int','STOPPOINTID':'int',
                    'PLANNEDTIME_ARR': 'int','feels_like': 'float','wind_speed': 'float',
                    'weather_id':'int'
                   }

    # start time: the bus reaching each stop after this time is searched for
    t = TIME_INPUT

    # fetch TIMETABLE rows of this line and direction
    # NOTE(review): the filter relies on the CSV column dtypes matching the Python
    # types of LINEID / DIRECTION passed in - confirm against the callers.
    df_TIMETABLE = pd.read_csv("../DB/ML/Jan_trips_arr_TIMETABLE.csv").query('LINEID == @LINEID and DIRECTION == @DIRECTION')

    # weather rows of the requested date, one row per hour (first occurrence kept)
    dt = str(pd.to_datetime(dt_in).date())
    df_weather = pd.read_csv("../DB/ML/weather_extendedTime_2018.csv",skip_blank_lines=True,index_col=False)
    df_weather = df_weather[df_weather['DAYOFSERVICE'].astype('str') == dt ].drop_duplicates('hour')

    # hour -> [feels_like, wind_speed, weather_id] for every hour available on that date,
    # so the lookup is not limited to a fixed window after the input hour
    dict_weather = {
        int(hour): [feels_like, wind_speed, weather_id]
        for hour, feels_like, wind_speed, weather_id in
        df_weather.dropna(subset=['hour'])[['hour', 'feels_like', 'wind_speed', 'weather_id']]
        .itertuples(index=False, name=None)
    }

    # Rows are collected in a list and turned into a dataframe once at the end
    rows = []
    for stop, progrNum in zip(LIST_STOPOINTID,PROGRNUMBER):
        list_timeArr = df_TIMETABLE.query('STOPPOINTID == @stop')['PLANNEDTIME_ARR'].to_list()

        # earliest planned arrival later than the previous stop's (plus the 10-unit margin)
        TIME_ARR = min((arrival for arrival in list_timeArr if arrival > t+10), default=None)
        if TIME_ARR is None:
            raise ValueError(
                "No timetable entry later than {} for LINEID {} DIRECTION {} at STOPPOINTID {}".format(
                    t+10, LINEID, DIRECTION, stop))

        hour = int(TIME_ARR/3600)
        if hour not in dict_weather:
            raise KeyError(
                "No weather entry for {} hour {} (planned arrival {})".format(dt, hour, TIME_ARR))

        rows.append([YEAR, MONTH, DAY_OF_WEEK, QUARTER, progrNum, stop,
                     TIME_ARR, *dict_weather[hour]])
        # the next stop must be reached after this one
        t = TIME_ARR

    # Typecast dataframe
    df_X = pd.DataFrame(rows, columns=feature_columns).astype(convert_dict)

    # info() prints its own report and returns None, so it is called on its own
    df_X.info()
    print("input dataframe \n",df_X)

    # Fetch prediction model; str() mirrors how the training loop builds the file name
    # NOTE: pickle.load executes code embedded in the file - only load trusted model files
    filename = "models/09/"+str(LINEID)+"_"+str(DIRECTION)+".pkl"
    with open(filename, 'rb') as model_file:
        model = pickle.load(model_file)
    print("Model prediction \n",model.predict(df_X))

### Predict arrival times for next ferry of LINEID 41 DIRECTION 1 after 28800 (8 O' clock)

In [33]:
# Positional arguments follow predict()'s signature: YEAR, MONTH, DAY_OF_WEEK, QUARTER, LINEID, DIRECTION,
# then the PROGRNUMBER list, TIME_INPUT (28800), the stop list and finally the date-time used for the weather lookup.
predict(2018, 1, 1, 1, "41", "1",  list_PROGRNUMBER,28800, list_STOPPOINTID, "2018-01-02T08:00:00")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51 entries, 0 to 50
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   year             51 non-null     int32  
 1   month            51 non-null     int32  
 2   dayofweek_num    51 non-null     int32  
 3   quarter          51 non-null     int32  
 4   PROGRNUMBER      51 non-null     int32  
 5   STOPPOINTID      51 non-null     int32  
 6   PLANNEDTIME_ARR  51 non-null     int32  
 7   feels_like       51 non-null     float64
 8   wind_speed       51 non-null     float64
 9   weather_id       51 non-null     int32  
dtypes: float64(2), int32(8)
memory usage: 2.8 KB
input dataframe 
     year  month  dayofweek_num  quarter  PROGRNUMBER  STOPPOINTID  \
0   2018      1              1        1            1          288   
1   2018      1              1        1            2         1171   
2   2018      1              1        1            3         1172   
3

**OUTCOMES AND OBSERVATIONS**
* **THE ERROR BETWEEN PREDICTION AND PLANNED ARRIVAL TIME SEEMS SATISFACTORY AT A GLANCE**
* **HOWEVER, IT SHOULD BE NOTED THAT THE TIMETABLE FETCHED PLAYS A MAJOR ROLE IN SELECTING THE PLANNEDTIME_ARR PARAMETER**
    * **IN THE ABOVE EXAMPLE, THE TOTAL JOURNEY TIME FOR THE PLANNEDTIME_ARR INPUT IS $(33060-29400)\ /60 = 61$ MINUTES, WHEREAS THE [IDEAL JOURNEY TIME FOR LINE 41](https://www.dublinbus.ie/Your-Journey1/Timetables/All-Timetables/412/) DEFINED BY DUBLIN BUS OPERATORS IS ITSELF NEARLY $68$ MINUTES**
    * THIS HAPPENED DUE TO THE ROUND-OFF TO MINUTES OPERATION, WHICH FLOORED THE PLANNEDTIME_ARR COLUMN TO THE NEAREST MINUTE VALUE