# Regressors: Model for each route in each direction
**Notebook generates linear regression models for each bus route in either direction**

In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [31]:
from sklearn.pipeline import make_pipeline
from lightgbm import LGBMRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn import metrics
import math

# Using sklearn to split data into training and testing sets,train classifier and regressor models 
from sklearn.model_selection import train_test_split

#pickle package saves and loads sklearn models
import pickle

from warnings import filterwarnings
filterwarnings('ignore')

## Source of Truth gathered in prior EDA
* **TRIPID**
    * **TRIPIDs are shared across days for a particular LINEID on a particular ROUTEID**
    * For a single TRIPID, only one corresponding LINEID and ROUTEID are present
    * TRIPID for inbound and outbound directions are mutually exclusive
    * For certain TRIPIDs, PROGRNUMBER does not start at 1, although it should. This means some entries are missing.
* **LINEID**
    * **For a LINEID; ROUTEIDs and TRIPIDs used for DIFFERENT DIRECTION are mutually exclusive**
* **ROUTEID**
    * A LINEID has multiple ROUTEID 
    * A LINEID may / may not have trips with both INBOUND-going(1)/OUTBOUND-returning(2) direction 
    * **EVEN WHEN 2 ROUTES FOR A LINEID HAVE THE SAME DIRECTION, THE STOPPOINTIDs VISITED ARE DIFFERENT**
    * **FOR A SINGLE ROUTE, THE STOPS VISITED DURING DIFFERENT TRIPS ARE DIFFERENT**
    * **ROUTEIDs ARE NOT KNOWN TO THE USER & ROUTEIDs ARE NOT TIMEBOUND, i.e. NO timetable exists for following a ROUTEID**
    * **majority of data for a LINEID is tied with a particular ROUTEID**
* **PLANNED ARRIVAL AND DEPARTURE TIMES ARE SAME FOR ALL ENTRIES; HENCE "PLANNEDTIME_DEP" CAN BE DROPPED**    

## Read df_Jan merged with df_trips on 'TRIPID'

In [32]:
# Read the Jan_Trips_arr.csv dataset into the DataFrame used by every later cell
df_Jan_trips_arr = pd.read_csv("../DB/ML/Jan_Trips_arr.csv")

In [33]:
df_Jan_trips_arr.head()

Unnamed: 0,year,month,dayofweek_num,quarter,LINEID,DIRECTION,STOPPOINTID,PROGRNUMBER,PLANNEDTIME_ARR,ACTUALTIME_ARR
0,2018,1,0,1,41,1,1172,3,23799,23863
1,2018,1,0,1,41,1,1173,4,23860,23934
2,2018,1,0,1,41,1,1174,5,23937,24114
3,2018,1,0,1,41,1,1175,6,24048,24180
4,2018,1,0,1,41,1,15,7,24130,24227


In [34]:
# Columns that hold identifiers / calendar labels are treated as categories
categorical_columns = [
    'year', 'month', 'dayofweek_num', 'quarter',
    'PROGRNUMBER', 'STOPPOINTID', 'LINEID', 'DIRECTION',
]
for column_name in categorical_columns:
    df_Jan_trips_arr[column_name] = df_Jan_trips_arr[column_name].astype('category')

# Planned and actual arrival times are numeric; values that cannot be parsed
# become NaN (errors='coerce') and the result is downcast to the smallest integer type
for column_name in ('PLANNEDTIME_ARR', 'ACTUALTIME_ARR'):
    df_Jan_trips_arr[column_name] = pd.to_numeric(
        df_Jan_trips_arr[column_name], downcast='integer', errors='coerce'
    )

### Object types and valid entries

In [35]:
# `null_counts` was deprecated in pandas 1.2 and removed in 2.0; `show_counts` replaces it.
# DataFrame.info() prints the report itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
df_Jan_trips_arr.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9600545 entries, 0 to 9600544
Data columns (total 10 columns):
 #   Column           Non-Null Count    Dtype   
---  ------           --------------    -----   
 0   year             9600545 non-null  category
 1   month            9600545 non-null  category
 2   dayofweek_num    9600545 non-null  category
 3   quarter          9600545 non-null  category
 4   LINEID           9600545 non-null  category
 5   DIRECTION        9600545 non-null  category
 6   STOPPOINTID      9600545 non-null  category
 7   PROGRNUMBER      9600545 non-null  category
 8   PLANNEDTIME_ARR  9600545 non-null  int32   
 9   ACTUALTIME_ARR   9600545 non-null  int32   
dtypes: category(8), int32(2)
memory usage: 155.9 MB
None


### Nature of data

In [36]:
df_Jan_trips_arr.head(100)

Unnamed: 0,year,month,dayofweek_num,quarter,LINEID,DIRECTION,STOPPOINTID,PROGRNUMBER,PLANNEDTIME_ARR,ACTUALTIME_ARR
0,2018,1,0,1,41,1,1172,3,23799,23863
1,2018,1,0,1,41,1,1173,4,23860,23934
2,2018,1,0,1,41,1,1174,5,23937,24114
3,2018,1,0,1,41,1,1175,6,24048,24180
4,2018,1,0,1,41,1,15,7,24130,24227
...,...,...,...,...,...,...,...,...,...,...
95,2018,1,0,1,66,1,3958,47,27943,28473
96,2018,1,0,1,66,1,3959,48,27990,28508
97,2018,1,0,1,66,1,3960,49,28024,28534
98,2018,1,0,1,66,1,3961,50,28058,28551


### Unique entries

In [37]:
df_Jan_trips_arr.nunique()

year                   1
month                  1
dayofweek_num          7
quarter                1
LINEID               126
DIRECTION              2
STOPPOINTID         4602
PROGRNUMBER          102
PLANNEDTIME_ARR    67749
ACTUALTIME_ARR     70065
dtype: int64

## Train model for each LINEID for each direction

### Obtain pairs of LINEID for either direction
* Some LINEIDs have entries for a single direction only; hence, this step avoids exceptions during model creation.

In [38]:
# LINEID and DIRECTION are categorical columns. Without observed=True, groupby
# returns every combination of their categories (including pairs with zero rows),
# so pairs that never occur in the data would still be listed here.
df_line_dir_pair = (
    df_Jan_trips_arr.groupby(['LINEID', 'DIRECTION'], observed=True)
    .size()
    .reset_index()[['LINEID', 'DIRECTION']]
)

In [39]:
df_line_dir_pair

Unnamed: 0,LINEID,DIRECTION
0,1,1
1,1,2
2,102,1
3,102,2
4,104,1
...,...,...
247,84A,2
248,84X,1
249,84X,2
250,9,1


## Model

### Regression pipeline

In [40]:
def regressorModel(num_col, cat_col, regression_algo="Linear"):
    """Build an unfitted regression pipeline wrapped in a target transformer.

    Numerical columns are standardised and categorical columns are one-hot
    encoded before being passed to the chosen estimator. The target is
    standardised for fitting and transformed back for predictions by
    ``TransformedTargetRegressor``.

    Parameters
    ----------
    num_col : list
        Names of the numerical input columns.
    cat_col : list
        Names of the categorical input columns.
    regression_algo : str, default "Linear"
        Estimator to use: "Linear" (LinearRegression) or "LGBM" (LGBMRegressor).

    Returns
    -------
    TransformedTargetRegressor
        The unfitted model; call ``.fit(X, y)`` on it.

    Raises
    ------
    KeyError
        If ``regression_algo`` is not one of the supported names.
    """
    # Only the requested estimator is instantiated
    if regression_algo == "Linear":
        estimator = LinearRegression()
    elif regression_algo == "LGBM":
        estimator = LGBMRegressor()
    else:
        raise KeyError(
            f"Unknown regression_algo {regression_algo!r}; expected 'Linear' or 'LGBM'"
        )

    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), num_col),
        # handle_unknown='ignore': a category not seen during fit (e.g. a new
        # STOPPOINTID) is encoded as all zeros instead of raising at predict time
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_col),
    ])
    pipe_lin_reg = make_pipeline(preprocessor, estimator)

    # The target gets its own scaler instance rather than sharing the feature scaler
    regressor = TransformedTargetRegressor(regressor=pipe_lin_reg, transformer=StandardScaler())
    return regressor

### Lists of features based on datatypes

In [41]:
def get_column_lists(df):
    """Split the columns of ``df`` into numerical and categorical name lists.

    Categorical columns are those with the ``category`` dtype. Numerical
    columns are all columns that are not ``category``, ``object`` or
    ``datetime``.

    Returns
    -------
    tuple of (list, list)
        ``(numerical_column_names, categorical_column_names)``, in that order.
    """
    categorical_frame = df.select_dtypes(include=['category'])
    numerical_frame = df.select_dtypes(exclude=['category', 'object', 'datetime'])

    return [*numerical_frame.columns], [*categorical_frame.columns]

###  Train model with data for LINEID x  and  DIRECTION y

In [51]:
# Train and persist one model per (LINEID, DIRECTION) pair
for _, row in df_line_dir_pair.iterrows():

    line = row['LINEID']
    direction = row['DIRECTION']

    # Slice of the dataframe belonging to this LINEID / DIRECTION pair
    df_Jan_trips_arr_LINE_DIR = df_Jan_trips_arr.loc[(df_Jan_trips_arr["LINEID"] == line) &
                                                     (df_Jan_trips_arr["DIRECTION"] == direction)].copy()

    # A pair with no rows cannot be fitted; skip it instead of letting fit() raise
    if df_Jan_trips_arr_LINE_DIR.empty:
        continue

    # INPUT and TARGET features
    X = df_Jan_trips_arr_LINE_DIR[["year", "month", "dayofweek_num", "quarter", "LINEID", "PROGRNUMBER",
                                   "STOPPOINTID", "DIRECTION", "PLANNEDTIME_ARR"]]
    Y = df_Jan_trips_arr_LINE_DIR[["ACTUALTIME_ARR"]]

    # Generate and fit the model
    num_col, cat_col = get_column_lists(X)
    regressor = regressorModel(num_col, cat_col, regression_algo="Linear")
    model = regressor.fit(X, Y)

    # Save the model; the context manager closes the file handle even if
    # pickling fails, so handles do not pile up across iterations
    filename = str(line)+"_"+str(direction)+'.pkl'
    with open("models/03/"+filename, 'wb') as model_file:
        pickle.dump(model, model_file)

## Validation and Evaluation

### Model validation

In [58]:
def calc_Regression(X, y, model, scoring='R2', cv=3, test_size=0.8):
    """Score an already-fitted regression model on repeated random subsets.

    The model is NOT refitted here. On each of the ``cv`` iterations a random
    subset of ``(X, y)`` is drawn with ``train_test_split`` (``random_state``
    is the iteration index, so runs are reproducible) and ``model`` is scored
    on that subset.

    Parameters
    ----------
    X, y : input features and target to draw the evaluation subsets from.
    model : fitted estimator exposing ``predict``.
    scoring : {'R2', 'RMSE'}, default 'R2'
        Metric to compute.
    cv : int, default 3
        Number of random subsets to evaluate.
    test_size : float or int, default 0.8
        Passed to ``train_test_split``; size of each evaluation subset.

    Returns
    -------
    list of float
        One score per iteration (empty when ``cv`` is 0).

    Raises
    ------
    ValueError
        If ``scoring`` is not 'R2' or 'RMSE'.
    """
    # Reject unknown metrics up front; otherwise `score` would never be assigned
    if scoring not in ('R2', 'RMSE'):
        raise ValueError(f"Unsupported scoring {scoring!r}; expected 'R2' or 'RMSE'")

    results = []
    for i in range(cv):
        # Only the held-out part is needed, the "train" part is discarded
        _, X_test, _, y_test = train_test_split(X, y, random_state=i, test_size=test_size)
        y_predict = model.predict(X_test)
        if scoring == 'R2':
            score = metrics.r2_score(y_test, y_predict)
        else:
            score = math.sqrt(metrics.mean_squared_error(y_test, y_predict))
        results.append(score)
    return results

In [68]:
def validation(X,y,model, cvVal=3):
    """Summarise the mean R2 and RMSE of ``model`` in a one-column dataframe.

    For each metric, ``calc_Regression`` is run with ``cvVal`` iterations and
    the returned scores are averaged. The result is a dataframe indexed by
    metric name ('R2', 'RMSE') with a single column named 'Regression'.
    """
    averaged_scores = {}
    for metric_name in ('R2', 'RMSE'):
        scores = calc_Regression(X, y, model, cv=cvVal, scoring=metric_name)
        # mean of the per-iteration scores
        averaged_scores[metric_name] = sum(scores) / len(scores)

    return pd.DataFrame.from_dict(averaged_scores, orient='index', columns=['Regression'])

### Model evaluation

#### Load data

In [69]:
# Select the rows for LINEID "1" travelling in DIRECTION 1
is_line_1 = df_Jan_trips_arr["LINEID"] == "1"
is_direction_1 = df_Jan_trips_arr["DIRECTION"] == 1
df_Jan_trips_arr_LINE_DIR = df_Jan_trips_arr.loc[is_line_1 & is_direction_1].copy()

# INPUT features (same columns the models were trained on) and TARGET feature
feature_columns = ["year", "month", "dayofweek_num", "quarter", "LINEID", "PROGRNUMBER",
                   "STOPPOINTID", "DIRECTION", "PLANNEDTIME_ARR"]
X = df_Jan_trips_arr_LINE_DIR[feature_columns]
Y = df_Jan_trips_arr_LINE_DIR[["ACTUALTIME_ARR"]]

#### Load model

In [70]:
# Load the model saved for LINEID 1 / DIRECTION 1.
# NOTE: pickle can execute arbitrary code on load; only load model files
# produced by this notebook or another trusted source.
with open("models/03/1_1.pkl", 'rb') as model_file:
    model_1_1 = pickle.load(model_file)

#### Model validity

In [71]:
# Display model validity
print(validation(X,Y,model_1_1))

      Regression
R2      0.999623
RMSE  275.964557


#### Test run

In [78]:
# Compare the first ten recorded arrival times with the predictions of model_1_1,
# the model loaded and validated for this LINEID/DIRECTION slice. The bare name
# `model` is only the leftover variable from the training loop, i.e. whichever
# (LINEID, DIRECTION) pair happened to be fitted last.
print("Y test\n",Y[:10].reset_index(drop=True),"\n")
print("Y predict\n",model_1_1.predict(X[:10]))

Y test
    ACTUALTIME_ARR
0           36048
1           36075
2           36091
3           36125
4           36183
5           36254
6           36315
7           36361
8           36438
9           36480 

Y predict
 [[36014.82063939]
 [36053.64892332]
 [36083.07828805]
 [36138.06680613]
 [36180.45048497]
 [36274.83938967]
 [36363.89323105]
 [36404.42869289]
 [36477.93487258]
 [36524.15871362]]
