# Model Training 

## Feature Engineering

In [300]:
# Importing all required library
import pandas as pd
import numpy as np
import seaborn as sns
from math import radians, cos, sin, asin, acos, sqrt, pi
from geopy import distance
from geopy.geocoders import Nominatim
import osmnx as ox
import networkx as nx
from statistics import  mode
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
from sklearn.preprocessing import OneHotEncoder # One Hot Encoding
## pipelines
from sklearn.pipeline import Pipeline,FunctionTransformer
from sklearn.compose import ColumnTransformer

In [346]:
# Load finalTrain.csv into `df`, the frame used by the cells below.
# NOTE(review): absolute, machine-specific path — consider a path relative to a configurable data directory.
df=pd.read_csv('/config/workspace/Delivery_price_prediction/notebook/data/finalTrain.csv')

In [302]:
# Load Final_Raw.csv into `df_clened`.
# NOTE(review): `df_clened` is not referenced again in the visible cells — confirm it is still needed.
df_clened=pd.read_csv('/config/workspace/Delivery_price_prediction/notebook/data/Final_Raw.csv')

In [303]:
# Separate the target column ('Time_taken (min)') from the predictor columns
y = df['Time_taken (min)']
X = df.drop(columns=['Time_taken (min)'])

In [304]:
#Categories for ordinal encoding
Weather_conditions_ODE=['Sunny','Cloudy','Windy','Fog', 'Stormy', 'Sandstorms' ]
Road_traffic_density_ODE=['Low', 'Medium', 'High', 'Jam']
Type_of_vehicle_ODE=['bicycle', 'electric_scooter', 'scooter', 'motorcycle']
Festival_ODE=['No','Yes']

#Categories for One Hot encoding
OHE_Cat_City=['Metropolitian', 'Urban', 'Semi-Urban']
OHE_Cat_type_orders=['Snack', 'Meal', 'Drinks', 'Buffet']

#Column Transformation
num_CT=['Delivery_person_Age', 'Delivery_person_Ratings','Vehicle_condition','pickup_time','Distance','Order_Month']
ordinal_CT=['Weather_conditions', 'Road_traffic_density','Type_of_vehicle', 'Festival']
OHE_CT=['City','Type_of_order']


drop_list_pipe=['Restaurant_latitude',
                'Restaurant_longitude', 
                'Delivery_location_latitude',
                'Delivery_location_longitude',
                'Time_Orderd', 
                'Time_Order_picked', 
                'Order_Date',
                'ID',
                'Delivery_person_ID'

               ]


In [305]:
# Distance calculator

def calculate_spherical_distance(lat1, lon1, lat2, lon2, r=6371):
    """Great-circle distance between two points using the haversine formula.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point, in decimal degrees.
    r : float, default 6371
        Radius of the sphere. The result is expressed in the same unit as
        ``r`` (6371 is approximately the mean Earth radius in kilometres).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        The distance for scalar inputs, or an element-wise array of distances
        when any argument is array-like (inputs broadcast against each other).
    """
    # Convert degrees to radians; NumPy ufuncs accept scalars and arrays alike
    phi1, lambda1, phi2, lambda2 = (
        np.radians(np.asarray(c, dtype=float))
        for c in (lat1, lon1, lat2, lon2)
    )

    # Haversine term
    a = (np.sin((phi2 - phi1) / 2) ** 2
         + np.cos(phi1) * np.cos(phi2) * np.sin((lambda2 - lambda1) / 2) ** 2)

    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # near-antipodal points, which would make the arcsine undefined.
    a = np.clip(a, 0.0, 1.0)

    return 2 * r * np.arcsin(np.sqrt(a))


def distance_con_pipe(df):
    """Add a 'Distance' column holding the restaurant-to-delivery distance.

    For every row, the four coordinate columns are passed to
    calculate_spherical_distance and the result is rounded to 2 decimals.
    The column is assigned on the frame that was passed in, and that same
    frame is returned.
    """
    coordinate_columns = [
        'Restaurant_latitude', 'Restaurant_longitude',
        'Delivery_location_latitude',
        'Delivery_location_longitude',
    ]

    distances = []
    for lat1, lon1, lat2, lon2 in df[coordinate_columns].values:
        distances.append(
            round(calculate_spherical_distance(lat1, lon1, lat2, lon2), 2)
        )

    df['Distance'] = distances
    return df



In [306]:
# List the columns of df whose dtype is not object
df.select_dtypes(exclude="O").columns

Index(['Delivery_person_Age', 'Delivery_person_Ratings', 'Restaurant_latitude',
       'Restaurant_longitude', 'Delivery_location_latitude',
       'Delivery_location_longitude', 'Vehicle_condition',
       'multiple_deliveries', 'Time_taken (min)'],
      dtype='object')

In [345]:
#time_conversion pipeline


def get_pickup_time(df):
    """Derive 'pickup_time': minutes between order placement and pickup.

    'Time_Orderd' and 'Time_Order_picked' are parsed from "HH:MM[:SS]" strings
    into minutes since midnight (seconds are ignored). Any value that is not
    a string containing ":" becomes NaN. Both columns are overwritten with
    the parsed minutes.

    The difference is wrapped modulo 24 hours, so an order placed shortly
    before midnight and picked up shortly after yields a small positive
    duration instead of a large negative one.

    The frame is modified in place and returned.
    """
    minutes_per_day = 24 * 60

    def time_to_minutes(x):
        # Only "HH:MM..." strings are convertible; everything else is missing
        if isinstance(x, str) and ":" in x:
            hours, minutes = x.split(":")[:2]
            return float(hours) * 60 + float(minutes)
        return np.nan

    df['Time_Order_picked'] = df['Time_Order_picked'].apply(time_to_minutes)
    df['Time_Orderd'] = df['Time_Orderd'].apply(time_to_minutes)

    # `%` takes the sign of the divisor, so negative gaps wrap to [0, 1440);
    # NaN stays NaN.
    df['pickup_time'] = (
        (df['Time_Order_picked'] - df['Time_Orderd']) % minutes_per_day
    )
    return df




In [308]:
#Extracting month column
def month_spliter(df):
    """Add an integer 'Order_Month' column parsed from 'Order_Date'.

    Each 'Order_Date' value is split on "-" and its second field is converted
    to int, so the values are expected to be strings with the month in the
    second position. The column is assigned on the frame that was passed in,
    and that same frame is returned.
    """
    df['Order_Month']=df['Order_Date'].apply(lambda x: int(x.split("-")[1]))
    return df
# `date_pipeline` is built once, alongside the distance and time pipelines,
# in the pipeline-assembly cell further down; it is not duplicated here.

In [309]:
# Column dropper
def dropper(df):
    """Remove every column named in drop_list_pipe from df, in place, and return df."""
    df.drop(columns=drop_list_pipe, inplace=True)
    return df

In [331]:
# Step that adds the 'Distance' column from the coordinate columns
distance_pipeline = Pipeline(steps=[
    ('Distance_converter', FunctionTransformer(distance_con_pipe)),
])

# Step that adds the 'Order_Month' column from 'Order_Date'
date_pipeline = Pipeline(steps=[
    ('extract_month', FunctionTransformer(month_spliter)),
])

# Step that adds the 'pickup_time' column from the two time columns
time_pipeline = Pipeline(steps=[
    ('time_conversion', FunctionTransformer(get_pickup_time)),
])

# Imputation only: fill gaps with the most frequent value, no scaling
frequncy_of_delivery = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

# Numeric features: mean imputation, then scaling to unit variance without centering
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler(with_mean=False)),
])

# Ordered categoricals: mode imputation, ordinal codes in the configured
# category order, then scaling without centering
cat_pipeline_ODE = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder', OrdinalEncoder(categories=[
        Weather_conditions_ODE,
        Road_traffic_density_ODE,
        Type_of_vehicle_ODE,
        Festival_ODE,
    ])),
    ('scaler', StandardScaler(with_mean=False)),
])

# Unordered categoricals: mode imputation, one-hot columns for the configured
# categories, then scaling without centering
cat_pipeline_OHE = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehotencoder', OneHotEncoder(categories=[
        OHE_Cat_City,
        OHE_Cat_type_orders,
    ])),
    ('scaler', StandardScaler(with_mean=False)),
])


# Route each column group through its own sub-pipeline.
# Columns not listed in any branch are not passed through (ColumnTransformer default).
preprocessor = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, num_CT),
    ('cat_pipeline_ODE', cat_pipeline_ODE, ordinal_CT),
    ('cat_pipeline_OHE', cat_pipeline_OHE, OHE_CT),
    ('Frequency_match', frequncy_of_delivery, ['multiple_deliveries']),
])

# Removes the raw columns listed in drop_list_pipe
drop_non_essential = Pipeline(steps=[
    ('drop_cols', FunctionTransformer(dropper)),
])

# Stand-alone scaler; defined here but not wired into preprocessor or full_pipeline
scale = Pipeline(steps=[
    ('scaler', StandardScaler()),
])

# Feature-building chain: distance -> pickup time -> order month -> drop raw columns.
# The drop step must stay last because the earlier steps read the raw columns.
full_pipeline = Pipeline(steps=[
    ('distance_preprocessing', distance_pipeline),
    ('time_pipeline', time_pipeline),
    ('date_pipeline', date_pipeline),
    ('Drop_non_essential', drop_non_essential),
])


In [325]:
# Display df.
# NOTE(review): the recorded output already shows the engineered columns
# (Distance, pickup_time, Order_Month) and the execution counts are out of
# order — re-run from a fresh kernel to confirm what df holds at this point.
df


Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Distance,pickup_time,Order_Month
0,36.0,4.2,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,10.28,15.0,2
1,21.0,4.7,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,6.24,10.0,2
2,23.0,4.7,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,13.79,10.0,3
3,34.0,4.3,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,2.93,10.0,2
4,24.0,4.7,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,19.40,15.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
45579,30.0,4.8,Windy,High,1,Meal,motorcycle,0.0,No,Metropolitian,1.49,10.0,3
45580,21.0,4.6,Windy,Jam,0,Buffet,motorcycle,1.0,No,Metropolitian,11.01,15.0,2
45581,30.0,4.9,Cloudy,Low,1,Drinks,scooter,0.0,No,Metropolitian,4.66,15.0,3
45582,20.0,4.7,Cloudy,High,0,Snack,motorcycle,1.0,No,Metropolitian,6.23,5.0,3


In [328]:
# Display the feature-building pipeline
full_pipeline

In [313]:
# Run the feature-building steps on df.
# The step functions assign new columns on the frame they receive and drop
# columns with inplace=True, so df itself is modified by this call.
full_pipeline.fit_transform(df)

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min),Distance,pickup_time,Order_Month
0,36.0,4.2,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46,10.28,15.0,2
1,21.0,4.7,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23,6.24,10.0,2
2,23.0,4.7,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21,13.79,10.0,3
3,34.0,4.3,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,20,2.93,10.0,2
4,24.0,4.7,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,41,19.40,15.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45579,30.0,4.8,Windy,High,1,Meal,motorcycle,0.0,No,Metropolitian,32,1.49,10.0,3
45580,21.0,4.6,Windy,Jam,0,Buffet,motorcycle,1.0,No,Metropolitian,36,11.01,15.0,2
45581,30.0,4.9,Cloudy,Low,1,Drinks,scooter,0.0,No,Metropolitian,16,4.66,15.0,3
45582,20.0,4.7,Cloudy,High,0,Snack,motorcycle,1.0,No,Metropolitian,26,6.23,5.0,3


In [332]:
# Display the ColumnTransformer
preprocessor

In [333]:
# Fit and apply the preprocessor to df as a check of its output.
# It selects 'Distance', 'pickup_time' and 'Order_Month', so df must already
# carry the engineered columns. The preprocessor is fitted again on X_train
# further down.
preprocessor.fit_transform(df)

array([[ 6.32076226, 12.81818724,  2.38365888, ...,  0.        ,
         0.        ,  3.        ],
       [ 3.68711132, 14.34416191,  1.19182944, ...,  0.        ,
         0.        ,  1.        ],
       [ 4.03826478, 14.34416191,  1.19182944, ...,  2.31450084,
         0.        ,  1.        ],
       ...,
       [ 5.26730188, 14.95455178,  1.19182944, ...,  2.31450084,
         0.        ,  0.        ],
       [ 3.51153459, 14.34416191,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 4.03826478, 14.95455178,  2.38365888, ...,  0.        ,
         0.        ,  1.        ]])

In [337]:
## Build the engineered features, then hold out 30% of the rows for testing

from sklearn.model_selection import train_test_split

# NOTE: X is rebound to the transformed frame, and the last pipeline step
# drops the raw columns the earlier steps read — so this cell cannot be
# re-run without first re-creating X.
X = full_pipeline.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=30,
)



In [338]:
# Inspect the columns of the training split before encoding and scaling
X_train.columns

Index(['Delivery_person_Age', 'Delivery_person_Ratings', 'Weather_conditions',
       'Road_traffic_density', 'Vehicle_condition', 'Type_of_order',
       'Type_of_vehicle', 'multiple_deliveries', 'Festival', 'City',
       'Distance', 'pickup_time', 'Order_Month'],
      dtype='object')

In [339]:
# Fit the preprocessor on the training split and wrap the result in a
# DataFrame labelled with the transformer's output feature names.
# NOTE: X_train is rebound to the transformed frame, so re-running this cell
# would feed already-transformed columns back into the preprocessor.
X_train = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
)


In [340]:
# Apply the already-fitted preprocessor to the test split (transform only, no refit)
X_test = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
)

In [341]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [342]:
# Fit a plain linear regression on the preprocessed training split.
# NOTE(review): `regression` is not used again in the visible cells; the
# model-comparison loop further down fits its own LinearRegression instance.
regression=LinearRegression()
regression.fit(X_train,y_train)

In [343]:
import numpy as np
def evaluate_model(true, predicted):
    """Score predictions against ground truth.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Model predictions, aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error and the R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # MSE is computed once and only used to derive the RMSE
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [344]:
## Train multiple models

# Candidate linear models, all with default hyper-parameters
models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}
trained_model_list=[]   # fitted estimators, in the same order as model_list
model_list=[]           # model names
r2_list=[]              # R^2 on the test split, one entry per model

for model_name, model in models.items():
    model.fit(X_train,y_train)
    trained_model_list.append(model)

    # Make predictions on the held-out split
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(model_name)
    model_list.append(model_name)

    # The metrics printed under this heading are computed on X_test / y_test
    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)

    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 6.69272061453577
MAE: 5.282226022767106
R2 score 48.21370859517261


Lasso
Model Training Performance
RMSE: 7.167369332525899
MAE: 5.7083616626804305
R2 score 40.60785817917479


Ridge
Model Training Performance
RMSE: 6.692718556467999
MAE: 5.282224869642293
R2 score 48.213740444606366


Elasticnet
Model Training Performance
RMSE: 7.243424516645106
MAE: 5.797640024989153
R2 score 39.34071366846609


