# Model Training 

## Feature Enginneering

In [1]:
# Importing all required library
import pandas as pd
import numpy as np
import seaborn as sns
from math import radians, cos, sin, asin, acos, sqrt, pi
from geopy import distance
from geopy.geocoders import Nominatim
import osmnx as ox
import networkx as nx
from statistics import  mode
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
from sklearn.preprocessing import OneHotEncoder # One Hot Encoding
## pipelines
from sklearn.pipeline import Pipeline,FunctionTransformer
from sklearn.compose import ColumnTransformer

In [2]:
df=pd.read_csv('/config/workspace/Delivery_Time_prediction/notebook/data/finalTrain.csv')

In [3]:
df_clened=pd.read_csv('/config/workspace/Delivery_Time_prediction/notebook/data/Final_Raw.csv')

In [4]:
# Splitting independant and dependant features 
X=df.drop('Time_taken (min)',axis=1)
y=df['Time_taken (min)']

In [5]:
#Categories for ordinal encoding
Weather_conditions_ODE=['Sunny','Cloudy','Windy','Fog', 'Stormy', 'Sandstorms' ]
Road_traffic_density_ODE=['Low', 'Medium', 'High', 'Jam']
Type_of_vehicle_ODE=['bicycle', 'electric_scooter', 'scooter', 'motorcycle']
Festival_ODE=['No','Yes']

#Categories for One Hot encoding
OHE_Cat_City=['Metropolitian', 'Urban', 'Semi-Urban']
OHE_Cat_type_orders=['Snack', 'Meal', 'Drinks', 'Buffet']

#Column Transformation
num_CT=['Delivery_person_Age', 'Delivery_person_Ratings','Vehicle_condition','pickup_time','Distance']
ordinal_CT=['Weather_conditions', 'Road_traffic_density','Type_of_vehicle', 'Festival']
OHE_CT=['City','Type_of_order']


drop_list_pipe=['Restaurant_latitude',
                'Restaurant_longitude', 
                'Delivery_location_latitude',
                'Delivery_location_longitude',
                'Time_Orderd', 
                'Time_Order_picked', 
                'Order_Date',
                'ID',
                'Delivery_person_ID'

               ]


In [6]:
# Distance calculator

def calculate_spherical_distance(lat1, lon1, lat2, lon2, r=6371):
    
    # Convert degrees to radians
    coordinates = abs(lat1), abs(lon1), abs(lat2), abs(lon2)
    # radians(c) is same as c*pi/180
    phi1, lambda1, phi2, lambda2 = [
        radians(c) for c in coordinates
    ]  
    
    # Apply the haversine formula
    a = (np.square(sin((phi2-phi1)/2)) + cos(phi1) * cos(phi2) * 
         np.square(sin((lambda2-lambda1)/2)))
    d = 2*r*asin(np.sqrt(a))
    return d


def distance_con_pipe(df):
    
        df['Distance']=[
            round(calculate_spherical_distance(*row), 2) 
            for row in df[['Restaurant_latitude', 'Restaurant_longitude', 
                        'Delivery_location_latitude', 
                        'Delivery_location_longitude']].values
                    ]
        return df



In [7]:
df.select_dtypes(exclude="O").columns

Index(['Delivery_person_Age', 'Delivery_person_Ratings', 'Restaurant_latitude',
       'Restaurant_longitude', 'Delivery_location_latitude',
       'Delivery_location_longitude', 'Vehicle_condition',
       'multiple_deliveries', 'Time_taken (min)'],
      dtype='object')

In [8]:
#time_conversion pipeline


def get_pickup_time(df):
    def time_to_minutes(x):
        if ((isinstance(x,str))and(":" in x)):
            return float(x.split(":")[0]) * 60 + (float(x.split(":")[1]))
        else:
            return np.nan


    df['Time_Order_picked']=df['Time_Order_picked'].apply(time_to_minutes)
    df['Time_Orderd']=df['Time_Orderd'].apply(time_to_minutes)
    df['pickup_time']=df['Time_Order_picked']-df['Time_Orderd']
    return df




In [9]:
# #Extring month column 
# def month_spliter(df):
#     df['Order_Month']=df['Order_Date'].apply(lambda x: int(x.split("-")[1]))
#     return df
# # Pipeline for date column
# date_pipeline = Pipeline([('extract_month', FunctionTransformer(month_spliter))])

In [10]:
# Column dropper
def dropper(df):
    df.drop(drop_list_pipe,inplace=True, axis=1)
    return df

In [11]:
#Distance Transformation pipeline
distance_pipeline = Pipeline([
    ('Distance_converter',FunctionTransformer(distance_con_pipe))
    ])

# # Pipeline for date column
# date_pipeline = Pipeline([
#     ('extract_month', FunctionTransformer(month_spliter))
#                         ])

# Pipeline for time columns
time_pipeline = Pipeline([
    ('time_conversion', FunctionTransformer(get_pickup_time))
                        ])

#Pipeline for frequent values handling 
frequncy_of_delivery=Pipeline(
    steps = [
    ('imputer',SimpleImputer(strategy='most_frequent'))
    
            ])

# Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
   
    ('imputer',SimpleImputer(strategy='mean')),
    ('scaler',StandardScaler(with_mean=False))
        
        ] 
                    )
# Categorical Pipeline
cat_pipeline_ODE =Pipeline(
    steps=[
    
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[Weather_conditions_ODE,Road_traffic_density_ODE,Type_of_vehicle_ODE,Festival_ODE])),
    ('scaler',StandardScaler(with_mean=False))
    
    ]
                    )
cat_pipeline_OHE=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('onehotencoder',OneHotEncoder(categories=[OHE_Cat_City,OHE_Cat_type_orders])),
        ('scaler',StandardScaler(with_mean=False))
    ]
)


preprocessor=ColumnTransformer([

('num_pipeline',num_pipeline,num_CT),
('cat_pipeline_ODE',cat_pipeline_ODE,ordinal_CT),
('cat_pipeline_OHE',cat_pipeline_OHE,OHE_CT),
('Frequency_match',frequncy_of_delivery,[ 'multiple_deliveries']),
# ("scaler",scale)
])

drop_non_essential=Pipeline([
    ('drop_cols', FunctionTransformer(dropper)),
   
])
scale=Pipeline([ ('scaler',StandardScaler())])
# # Combine pipelines
full_pipeline = Pipeline([
    ('distance_preprocessing', distance_pipeline),
    ('time_pipeline', time_pipeline),
    # ('date_pipeline', date_pipeline),    
    ('Drop_non_essential', drop_non_essential)   

])


In [12]:
full_pipeline

In [13]:
preprocessor

In [14]:
## Train test split

from sklearn.model_selection import train_test_split
X = full_pipeline.fit_transform(X)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=30)



In [15]:
#Feature Engineering
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())


In [16]:
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [17]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor

In [18]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [19]:
## Train multiple models

models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet(),
    'Xgboost': XGBRegressor(),
    'SVR': SVR(),
    'RandomForestRegressor':RandomForestRegressor(),
    'LGBMRegressor':LGBMRegressor()
    
}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 6.45422159863973
MAE: 5.112705593610536
R2 score 51.83881504847756


Lasso
Model Training Performance
RMSE: 6.970692336703406
MAE: 5.569105623625355
R2 score 43.82265001441314


Ridge
Model Training Performance
RMSE: 6.454219774067393
MAE: 5.112705661099724
R2 score 51.83884227826881


Elasticnet
Model Training Performance
RMSE: 7.023802822797504
MAE: 5.637814398144025
R2 score 42.96334582017337


Xgboost
Model Training Performance
RMSE: 3.985148118347519
MAE: 3.1801452294327635
R2 score 81.6389144370662


SVR
Model Training Performance
RMSE: 6.278025296557284
MAE: 4.940028439286502
R2 score 54.43246439248179


RandomForestRegressor
Model Training Performance
RMSE: 4.064813356810226
MAE: 3.208691550369782
R2 score 80.8974811476175


LGBMRegressor
Model Training Performance
RMSE: 3.948321315049131
MAE: 3.158691986548729
R2 score 81.97669650794114




In [20]:

svm_parameters={
    'C':[1,2,3,4],
    'espilon':[0.01,0.05,0.075,0.1],
    'degree':[1,2,3,4,5]
}
xgb_parameter={
    'max_depth':[10,20,30,40],
    'learning_rate':[0.001,0.01,0.1],
    'n_estimators':[10,20,30,40],
    'reg_lambda':[0.1,1,5,10,100],
    
}
