# Model Training 

## Feature Engineering

In [None]:
# Import all required libraries
import pandas as pd
import numpy as np
import seaborn as sns
from math import radians, cos, sin, asin, acos, sqrt, pi
from geopy import distance
from geopy.geocoders import Nominatim
import osmnx as ox
import networkx as nx
from statistics import  mode
import matplotlib.pyplot as plt

In [None]:
# Load the training table that the rest of the notebook works on.
df = pd.read_csv(
    filepath_or_buffer='/config/workspace/Delivery_price_prediction/notebook/data/finalTrain.csv'
)

In [None]:
# Load the second table (Final_Raw.csv) alongside the training table.
df_clened = pd.read_csv(
    filepath_or_buffer='/config/workspace/Delivery_price_prediction/notebook/data/Final_Raw.csv'
)

In [None]:
# Show the frame loaded from finalTrain.csv as this cell's output.
df

In [None]:
# Show the frame loaded from Final_Raw.csv as this cell's output.
df_clened

In [None]:
# Split the frame into independent features (X) and the dependent target (y).
# The two identifier columns are removed from `df` itself (in place), so they
# are absent from X as well.
df.drop(columns=['ID', 'Delivery_person_ID'], inplace=True)
y = df['Time_taken (min)']
X = df.drop(columns='Time_taken (min)')

In [None]:
# Category orderings handed to the ordinal encoder (position in the list = rank).
Weather_conditions_ODE = [
    'Sunny',
    'Cloudy',
    'Windy',
    'Fog',
    'Stormy',
    'Sandstorms',
]
Road_traffic_density_ODE = ['Low', 'Medium', 'High', 'Jam']
Type_of_vehicle_ODE = [
    'bicycle',
    'electric_scooter',
    'scooter',
    'motorcycle',
]
Festival_ODE = ['No', 'Yes']

# Category lists handed to the one-hot encoder.
OHE_Cat_City = ['Metropolitian', 'Urban', 'Semi-Urban']
OHE_Cat_type_orders = ['Snack', 'Meal', 'Drinks', 'Buffet']

# Column groups, one per transformation branch.
num_CT = [
    'Delivery_person_Age',
    'Delivery_person_Ratings',
    'Vehicle_condition',
]
ordinal_CT = [
    'Weather_conditions',
    'Road_traffic_density',
    'Type_of_vehicle',
    'Festival',
]
OHE_CT = ['City', 'Type_of_order']
time_CT = ['Time_Orderd', 'Time_Order_picked']

# Raw columns removed by the drop step: the four coordinates, the two clock
# columns (same names as in time_CT) and the order date.
drop_list_pipe = [
    'Restaurant_latitude',
    'Restaurant_longitude',
    'Delivery_location_latitude',
    'Delivery_location_longitude',
    *time_CT,
    'Order_Date',
]


In [None]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
from sklearn.preprocessing import OneHotEncoder # One Hot Encoding
## pipelines
from sklearn.pipeline import Pipeline,FunctionTransformer
from sklearn.compose import ColumnTransformer

In [None]:
# Distance calculator

def calculate_spherical_distance(lat1, lon1, lat2, lon2, r=6371):
    """Return the great-circle (haversine) distance between two points.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point, in degrees.
    r : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        The default 6371 corresponds to the Earth's mean radius in kilometres.

    Returns
    -------
    float or numpy.ndarray
        A plain ``float`` when every coordinate is a scalar, otherwise an
        array with the broadcast shape of the inputs.
    """
    # Convert degrees to radians; np.radians(c) is the same as c*pi/180 but,
    # unlike math.radians, also accepts whole arrays/Series.
    phi1, lambda1, phi2, lambda2 = (
        np.radians(np.asarray(c, dtype=float))
        for c in (lat1, lon1, lat2, lon2)
    )

    # Apply the haversine formula
    a = (np.sin((phi2 - phi1) / 2) ** 2
         + np.cos(phi1) * np.cos(phi2) * np.sin((lambda2 - lambda1) / 2) ** 2)
    # Rounding can push `a` a hair outside [0, 1] for (near-)antipodal points,
    # which would take arcsin out of its domain.
    a = np.clip(a, 0.0, 1.0)
    d = 2 * r * np.arcsin(np.sqrt(a))

    # Scalar callers keep receiving a built-in float, as before.
    return float(d) if np.ndim(d) == 0 else d

In [None]:
def time_to_minutes(x):
    """Convert a clock string such as ``"11:30"`` to minutes since midnight.

    Only the first two ``:``-separated fields (hours and minutes) are used,
    so a trailing seconds field is ignored.

    Parameters
    ----------
    x : object
        Value to convert.

    Returns
    -------
    float
        ``hours * 60 + minutes``, or NaN when ``x`` is not a string, contains
        no ``:``, or its hour/minute fields are not numeric.
    """
    if not isinstance(x, str) or ":" not in x:
        return np.nan

    # A string containing ":" always splits into at least two parts.
    hours, minutes = x.split(":")[:2]
    try:
        return float(hours) * 60 + float(minutes)
    except ValueError:
        # Malformed fields ("ab:cd", "12:") are treated like every other
        # unparseable value instead of aborting a whole column conversion.
        return np.nan

def get_pickup_time(df):
    """Replace the two clock columns with a single ``pickup_time`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Time_Orderd`` and ``Time_Order_picked``.

    Returns
    -------
    pandas.DataFrame
        A new frame without the two clock columns and with ``pickup_time``
        (minutes between ordering and pickup) appended as the last column.
        The input frame is left untouched, so the function is safe to use
        inside a transformer that is applied to the same data repeatedly.
    """
    minutes_per_day = 24 * 60

    # Series.apply returns a new Series; the converted values have to be kept
    # (subtracting the raw string columns is not possible).
    picked = df['Time_Order_picked'].apply(time_to_minutes)
    ordered = df['Time_Orderd'].apply(time_to_minutes)

    result = df.drop(['Time_Order_picked', 'Time_Orderd'], axis=1)
    # The modulo wraps orders placed shortly before midnight and picked up
    # after it (e.g. 23:50 -> 00:05 gives 15 instead of -1425).
    # NOTE(review): assumes pickup happens within 24 hours of ordering.
    result['pickup_time'] = (picked - ordered) % minutes_per_day
    return result

# Pipeline for the time columns: a single step that hands the incoming frame
# to get_pickup_time.
time_pipeline = Pipeline(
    steps=[
        ('time_conversion', FunctionTransformer(func=get_pickup_time)),
    ]
)


In [None]:
# Distance transformation
def _add_distance_column(frame):
    """Return a copy of `frame` with a `Distance` column appended.

    Each value is the spherical distance between the restaurant and the
    delivery location of that row, rounded to two decimals.
    """
    coordinate_rows = frame[
        [
            'Restaurant_latitude',
            'Restaurant_longitude',
            'Delivery_location_latitude',
            'Delivery_location_longitude',
        ]
    ].values
    distances = [
        round(calculate_spherical_distance(*coordinates), 2)
        for coordinates in coordinate_rows
    ]
    return frame.assign(Distance=distances)


distance_pipeline = Pipeline(
    steps=[
        ('distance', FunctionTransformer(_add_distance_column)),
    ]
)




# Pipeline for the date column
def _order_date_field(frame, position):
    """Return, per row, the integer found at `position` of the '-'-separated Order_Date."""
    return frame['Order_Date'].apply(lambda text: int(text.split("-")[position]))


# The month is read from the second field and the day from the first one
# (presumably the dates are written day-month-year -- confirm against the data).
date_pipeline = Pipeline(
    steps=[
        (
            'extract_month',
            FunctionTransformer(
                lambda frame: frame.assign(Order_Month=_order_date_field(frame, 1))
            ),
        ),
        (
            'extract_day',
            FunctionTransformer(
                lambda frame: frame.assign(Order_Day=_order_date_field(frame, 0))
            ),
        ),
    ]
)

# Pipeline that fills missing values with the most frequent value and then
# standardises the result.
frequncy_of_delivery = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('scaler', StandardScaler()),
    ]
)
# Numerical pipeline: fill gaps with the column mean, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
# Ordinal categorical pipeline: impute with the most frequent value, map each
# category to its rank, then standardise. The category lists are passed in the
# same order as the columns listed in ordinal_CT.
cat_pipeline_ODE = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'ordinalencoder',
        OrdinalEncoder(
            categories=[
                Weather_conditions_ODE,
                Road_traffic_density_ODE,
                Type_of_vehicle_ODE,
                Festival_ODE,
            ]
        ),
    ),
    ('scaler', StandardScaler()),
])
# One-hot categorical pipeline: impute with the most frequent value, expand
# each column into indicator columns, then scale.
cat_pipeline_OHE = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehotencoder', OneHotEncoder(categories=[OHE_Cat_City, OHE_Cat_type_orders])),
        # OneHotEncoder returns a sparse matrix by default and StandardScaler
        # refuses to centre sparse input ("Cannot center sparse matrices"),
        # so only scale to unit variance here.
        ('scaler', StandardScaler(with_mean=False)),
    ]
)


# Route each column group to its own pipeline; every column that is not listed
# is passed through unchanged.
# TODO: time_pipeline and date_pipeline are not wired in yet (no column
# selection has been chosen for them).
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, num_CT),
        ('cat_pipeline_ODE', cat_pipeline_ODE, ordinal_CT),
        ('cat_pipeline_OHE', cat_pipeline_OHE, OHE_CT),
        ('Frequency_match', frequncy_of_delivery, ['multiple_deliveries']),
    ],
    remainder='passthrough',
)

# Single-step pipeline that removes the columns named in drop_list_pipe from
# the incoming DataFrame.
drop_non_essential = Pipeline(
    steps=[
        (
            'drop_cols',
            FunctionTransformer(lambda frame: frame.drop(columns=drop_list_pipe)),
        ),
    ]
)
# Combine pipelines.
# Order matters: the distance and drop steps both call DataFrame methods on the
# raw column names, while ColumnTransformer returns a plain array by default.
# The column drop therefore has to run before `preprocessor`, not after it.
full_pipeline = Pipeline([
    ('distance_preprocessing', distance_pipeline),
    ('Drop_non_essential', drop_non_essential),
    ('preprocessor', preprocessor),
])

In [None]:
# Preview of the time pipeline. X_train is only created by the train/test split
# cell further down, so preview on the full feature frame instead; a copy is
# passed so the preview can never alter X.
time_pipeline.fit_transform(X.copy())


In [None]:
# Preview of the combined pipeline. X_train is only created by the train/test
# split cell further down, so preview on the full feature frame instead.
# This is for inspection only: `preprocessor` is fitted again on the training
# split in the feature-engineering cell below.
full_pipeline.fit_transform(X)

In [None]:
## Train test split

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=30,
)



In [None]:
# List the feature columns present in the training split.
X_train.columns

In [None]:
#Feature Engineering
def _engineer_raw_features(frame):
    """Add the Distance column, then drop the raw columns named in drop_list_pipe."""
    return drop_non_essential.fit_transform(distance_pipeline.fit_transform(frame))


# `preprocessor` passes every unlisted column through untouched, so the raw
# date/time strings and coordinates must be removed before it is fitted;
# otherwise they end up in the model input. Both steps used here are stateless
# function transformers, so applying them to the test split as well learns
# nothing from it and keeps both splits on the same column layout for the
# `preprocessor.transform(X_test)` call that follows.
# NOTE(review): assumes every remaining non-listed column is numeric -- confirm.
X_train = _engineer_raw_features(X_train)
X_test = _engineer_raw_features(X_test)

X_train = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
)


In [None]:
# Transform the test split with the already fitted preprocessor (no refit) and
# label the resulting columns with the preprocessor's output feature names.
test_matrix = preprocessor.transform(X_test)
X_test = pd.DataFrame(test_matrix, columns=preprocessor.get_feature_names_out())

In [None]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [None]:
# Fit a plain linear regression on the training features and target.
regression = LinearRegression()
regression.fit(X=X_train, y=y_train)

In [None]:
import numpy as np
def evaluate_model(true, predicted):
    """Score a set of predictions against the true targets.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Predicted target values, aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error and R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and derive the RMSE from it.
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [None]:
## Train multiple models

# Candidate regressors, keyed by the name printed in the report.
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'Elasticnet': ElasticNet(),
}
trained_model_list = []  # fitted estimators, in the same order as model_list
model_list = []          # model names
r2_list = []             # R^2 per model, computed on X_test / y_test

for model_name, model in models.items():
    model.fit(X_train, y_train)
    # Keep the fitted estimator; the list was declared but never filled before.
    trained_model_list.append(model)

    # Make predictions on the held-out split
    y_pred = model.predict(X_test)

    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    # The figures below are computed from the X_test predictions.
    print('Model Training Performance')
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R2 score", r2_square*100)

    r2_list.append(r2_square)

    print('='*35)
    print('\n')