In [1]:
import pandas as pd
import numpy as np

## Model Training

In [2]:
df = pd.read_csv('eda_cleaned.csv')
df.head()

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,...,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Distance,Time_taken_min
0,0xcdcd,DEHRES17DEL01,36.0,4.2,30.327968,78.046106,30.397968,78.116106,12-02-2022,2022-12-02 21:55:00,...,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,10.29,46
1,0xd987,KOCRES16DEL01,21.0,4.7,10.003064,76.307589,10.043064,76.347589,13-02-2022,2022-02-13 14:55:00,...,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,6.25,23
2,0x2784,PUNERES13DEL03,23.0,4.7,18.56245,73.916619,18.65245,74.006619,04-03-2022,2022-04-03 17:30:00,...,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,13.8,21
3,0xc8b6,LUDHRES15DEL02,34.0,4.3,30.899584,75.809346,30.919584,75.829346,13-02-2022,2022-02-13 09:20:00,...,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,2.93,20
4,0xdb64,KNPRES14DEL02,24.0,4.7,26.463504,80.372929,26.593504,80.502929,14-02-2022,2022-02-14 19:50:00,...,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,19.42,41


In [3]:
df=df.drop(labels=['ID','Delivery_person_ID','Restaurant_latitude','Restaurant_longitude','Delivery_location_latitude','Delivery_location_longitude','Order_Date','Time_Orderd','Time_Order_picked'],axis=1)

In [4]:
## Independent and dependent features
X = df.drop(labels=['Time_taken_min'],axis=1)
Y = df[['Time_taken_min']]

In [5]:
Y

Unnamed: 0,Time_taken_min
0,46
1,23
2,21
3,20
4,41
...,...
33778,32
33779,36
33780,16
33781,26


In [6]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33783 entries, 0 to 33782
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Delivery_person_Age      33783 non-null  float64
 1   Delivery_person_Ratings  33783 non-null  float64
 2   Prep_time                33783 non-null  float64
 3   Weather_conditions       33783 non-null  object 
 4   Road_traffic_density     33783 non-null  object 
 5   Vehicle_condition        33783 non-null  int64  
 6   Type_of_order            33783 non-null  object 
 7   Type_of_vehicle          33783 non-null  object 
 8   multiple_deliveries      33783 non-null  float64
 9   Festival                 33783 non-null  object 
 10  City                     33783 non-null  object 
 11  Distance                 33783 non-null  float64
dtypes: float64(5), int64(1), object(6)
memory usage: 3.1+ MB


In [7]:
# Define which columns should be ordinal-encoded and which should be scaled
Ordinal_cols = ['Road_traffic_density']
Nominal_cols = ['Weather_conditions','Type_of_order','Type_of_vehicle','Festival','City']
numerical_cols = X.select_dtypes(exclude='object').columns

In [8]:
df.multiple_deliveries.unique()

array([3., 1., 0., 2.])

In [9]:
# Define the custom ranking for each ordinal variable
Traffic_categories = ['Low','Medium','High','Jam']
#Delivery_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
#clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [10]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
from sklearn.preprocessing import OneHotEncoder #Onehot encoder
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [11]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)

# Categorigal Pipeline
ord_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[Traffic_categories])),
    ('scaler',StandardScaler())
    ]

)

# Categorigal Pipeline
nom_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('onehotencoder',OneHotEncoder(sparse=False)),
    ('scaler',StandardScaler())
    ]

)

preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
#'num_pipeline',num_pipeline,['Delivery_person_Age','Delivery_person_Ratings']),

('ord_pipeline',ord_pipeline,Ordinal_cols),
('nom_pipeline',nom_pipeline,Nominal_cols)
])


In [22]:
## Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=30)

<h1>###################################################################</h1>

In [24]:
def get_column_names(transformers):
    
    col_names = []
    
    for t in transformers.named_transformers_:
        print(t)
        if(t == 'num_pipeline'):
            #print(type(list(transformers.named_transformers_[t].feature_names_in_)))
            col_names = col_names + list(transformers.named_transformers_[t].feature_names_in_)
            #print(col_names)
        elif (t == 'ord_pipeline'):
            col_names = col_names + list(preprocessor.named_transformers_['ord_pipeline'][0].feature_names_in_)
        else:
            #print('imputer' in transformers.named_transformers_[t])
            #print(transformers.named_transformers_[t][1].get_feature_names_out())
            cols = list(transformers.named_transformers_[t][1].get_feature_names_out())
            #print(cols)
            col_names = col_names + cols

    return col_names

In [25]:
# for t in preprocessor.named_transformers_:
#     print(t)
#     if(t == 'num_pipeline'):
#         print(preprocessor.named_transformers_[t].feature_names_in_)
#     else:
#         print('imputer' in preprocessor.named_transformers_[t])
#         print(preprocessor.named_transformers_[t][1].get_feature_names_out())
#         cols = preprocessor.named_transformers_[t][1].get_feature_names_out()

In [26]:
#get_column_names_from_ColumnTransformer(preprocessor)

<h1>###################################################################</h1>

In [27]:
X_train.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Prep_time,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Distance
11324,33.0,4.7,15.0,Sunny,High,2,Drinks,scooter,0.0,No,Urban,3.04
3106,27.0,4.9,15.0,Cloudy,Low,2,Meal,scooter,1.0,No,Metropolitian,19.91
14753,30.0,4.9,5.0,Sandstorms,Jam,0,Drinks,motorcycle,1.0,Yes,Metropolitian,13.99
3594,39.0,4.2,15.0,Cloudy,High,2,Snack,scooter,1.0,No,Metropolitian,6.3
1057,31.0,5.0,15.0,Sunny,Jam,2,Snack,scooter,1.0,No,Metropolitian,20.47


In [28]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=get_column_names(preprocessor))
X_test=pd.DataFrame(preprocessor.transform(X_test), columns=get_column_names(preprocessor))

#X_train=pd.DataFrame(preprocessor.fit_transform(X_train))
#X_test=pd.DataFrame(preprocessor.transform(X_test))


num_pipeline
ord_pipeline
nom_pipeline
num_pipeline
ord_pipeline
nom_pipeline


In [29]:
X_train

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Prep_time,Vehicle_condition,multiple_deliveries,Distance,Road_traffic_density,x0_Cloudy,x0_Fog,x0_Sandstorms,...,x1_Meal,x1_Snack,x2_electric_scooter,x2_motorcycle,x2_scooter,x3_No,x3_Yes,x4_Metropolitian,x4_Semi-Urban,x4_Urban
0,0.588912,0.215251,1.230278,1.224845,-1.296163,-0.080171,0.479559,-0.445420,-0.456216,-0.444397,...,-0.580540,-0.58002,-0.297431,-1.194020,1.421737,0.145461,-0.145461,-1.827751,-0.061809,1.847590
1,-0.451913,0.842401,1.230278,1.224845,0.444222,-0.024586,-1.124375,2.245074,-0.456216,-0.444397,...,1.722533,-0.58002,-0.297431,-1.194020,1.421737,0.145461,-0.145461,0.547120,-0.061809,-0.541246
2,0.068499,0.842401,-1.220124,-1.215352,0.444222,-0.044092,1.281526,-0.445420,-0.456216,2.250242,...,-0.580540,-0.58002,-0.297431,0.837507,-0.703365,-6.874680,6.874680,0.547120,-0.061809,-0.541246
3,1.629737,-1.352624,1.230278,1.224845,0.444222,-0.069430,0.479559,2.245074,-0.456216,-0.444397,...,-0.580540,1.72408,-0.297431,-1.194020,1.421737,0.145461,-0.145461,0.547120,-0.061809,-0.541246
4,0.241970,1.155976,1.230278,1.224845,0.444222,-0.022741,1.281526,-0.445420,-0.456216,-0.444397,...,-0.580540,1.72408,-0.297431,-1.194020,1.421737,0.145461,-0.145461,0.547120,-0.061809,-0.541246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23643,1.629737,0.842401,1.230278,0.004747,0.444222,-0.085278,-1.124375,2.245074,-0.456216,-0.444397,...,-0.580540,-0.58002,-0.297431,-1.194020,1.421737,0.145461,-0.145461,-1.827751,-0.061809,1.847590
23644,1.109325,1.155976,-1.220124,-1.215352,2.184607,-0.054339,-0.322408,-0.445420,-0.456216,2.250242,...,-0.580540,1.72408,-0.297431,0.837507,-0.703365,0.145461,-0.145461,0.547120,-0.061809,-0.541246
23645,-1.666210,0.528826,-1.220124,1.224845,0.444222,-0.069693,-0.322408,-0.445420,-0.456216,2.250242,...,-0.580540,-0.58002,3.362120,-1.194020,-0.703365,0.145461,-0.145461,0.547120,-0.061809,-0.541246
23646,-0.278443,1.155976,-1.220124,0.004747,0.444222,-0.054339,-1.124375,-0.445420,-0.456216,2.250242,...,1.722533,-0.58002,-0.297431,0.837507,-0.703365,0.145461,-0.145461,0.547120,-0.061809,-0.541246


In [30]:
X_train.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Prep_time,Vehicle_condition,multiple_deliveries,Distance,Road_traffic_density,x0_Cloudy,x0_Fog,x0_Sandstorms,...,x1_Meal,x1_Snack,x2_electric_scooter,x2_motorcycle,x2_scooter,x3_No,x3_Yes,x4_Metropolitian,x4_Semi-Urban,x4_Urban
0,0.588912,0.215251,1.230278,1.224845,-1.296163,-0.080171,0.479559,-0.44542,-0.456216,-0.444397,...,-0.58054,-0.58002,-0.297431,-1.19402,1.421737,0.145461,-0.145461,-1.827751,-0.061809,1.84759
1,-0.451913,0.842401,1.230278,1.224845,0.444222,-0.024586,-1.124375,2.245074,-0.456216,-0.444397,...,1.722533,-0.58002,-0.297431,-1.19402,1.421737,0.145461,-0.145461,0.54712,-0.061809,-0.541246
2,0.068499,0.842401,-1.220124,-1.215352,0.444222,-0.044092,1.281526,-0.44542,-0.456216,2.250242,...,-0.58054,-0.58002,-0.297431,0.837507,-0.703365,-6.87468,6.87468,0.54712,-0.061809,-0.541246
3,1.629737,-1.352624,1.230278,1.224845,0.444222,-0.06943,0.479559,2.245074,-0.456216,-0.444397,...,-0.58054,1.72408,-0.297431,-1.19402,1.421737,0.145461,-0.145461,0.54712,-0.061809,-0.541246
4,0.24197,1.155976,1.230278,1.224845,0.444222,-0.022741,1.281526,-0.44542,-0.456216,-0.444397,...,-0.58054,1.72408,-0.297431,-1.19402,1.421737,0.145461,-0.145461,0.54712,-0.061809,-0.541246


In [32]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [33]:
regression=LinearRegression()
regression.fit(X_train,y_train)

LinearRegression()

In [34]:
regression.coef_

array([[ 2.13543982e+00, -2.46303276e+00, -7.23818181e-03,
        -1.76294751e+00,  1.86846973e+00, -2.65520231e-03,
         3.17564601e+00,  5.16632636e+12,  5.24891695e+12,
         5.15837914e+12,  5.21551393e+12,  5.10740613e+12,
         5.18213925e+12,  9.49967040e+12,  9.57264279e+12,
         9.58708131e+12,  9.58281241e+12,  1.27777002e+13,
         2.30174882e+13,  2.20039550e+13,  3.74694430e+13,
         3.74694430e+13,  8.94094539e+12,  1.30743366e+12,
         8.88868275e+12]])

In [35]:
regression.intercept_

array([26.52767717])

In [36]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [37]:
## Train multiple models

models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')


LinearRegression
Model Training Performance
RMSE: 6.233890089558152
MAE: 4.978271327833397
R2 score 55.61012483978541


Lasso
Model Training Performance
RMSE: 6.844391042770046
MAE: 5.450762827428761
R2 score 46.48996065031435


Ridge
Model Training Performance
RMSE: 6.233910321795842
MAE: 4.978604214868384
R2 score 55.609836702538914


Elasticnet
Model Training Performance
RMSE: 6.819402459519195
MAE: 5.455413350596076
R2 score 46.87997318574022




In [38]:
model_list

['LinearRegression', 'Lasso', 'Ridge', 'Elasticnet']