### **Model Training**

In [1]:
# importing libraries
import pandas as pd
import numpy as np

# for model training
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")


pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv("data/delivery_data(preprocessed).csv")

In [3]:
df.head()

Unnamed: 0,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Time_Orderd,Time_Order_picked,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min),Year,Month,Day,TimeOrder_hours,TimeOrder_min,Time_Order_picked_Hour,Time_Order_picked_min,city_code,City_name,distance
0,DEHRES17DEL01,36.0,4.2,30.327968,78.046106,30.397968,78.116106,21:55:00,22:10,Fog,Jam,2,Snack,motorcycle,3.0,No,Semi-Urban,46,2022,2,12,21,55,22,10,DEH,Dehradun,10.280582
1,KOCRES16DEL01,21.0,4.7,10.003064,76.307589,10.043064,76.347589,14:55:00,15:05,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitan,23,2022,2,13,14,55,15,5,KOC,Kochi,6.242319
2,PUNERES13DEL03,23.0,4.7,18.56245,73.916619,18.65245,74.006619,17:30:00,17:40,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitan,21,2022,3,4,17,30,17,40,PUNE,Pune,13.78786
3,LUDHRES15DEL02,34.0,4.3,30.899584,75.809346,30.919584,75.829346,09:20:00,09:30,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Semi-Urban,20,2022,2,13,9,20,9,30,LUDH,Ludhiana,2.930258
4,KNPRES14DEL02,24.0,4.7,26.463504,80.372929,26.593504,80.502929,19:50:00,20:05,Fog,Jam,1,Snack,scooter,1.0,No,Semi-Urban,41,2022,2,14,19,50,20,5,KNP,Kanpur,19.396618


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40344 entries, 0 to 40343
Data columns (total 28 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Delivery_person_ID           40344 non-null  object 
 1   Delivery_person_Age          40144 non-null  float64
 2   Delivery_person_Ratings      40344 non-null  float64
 3   Restaurant_latitude          40344 non-null  float64
 4   Restaurant_longitude         40344 non-null  float64
 5   Delivery_location_latitude   40344 non-null  float64
 6   Delivery_location_longitude  40344 non-null  float64
 7   Time_Orderd                  40344 non-null  object 
 8   Time_Order_picked            40344 non-null  object 
 9   Weather_conditions           40344 non-null  object 
 10  Road_traffic_density         40344 non-null  object 
 11  Vehicle_condition            40344 non-null  int64  
 12  Type_of_order                40344 non-null  object 
 13  Type_of_vehicle 

In [5]:
df.isnull().sum()

Delivery_person_ID               0
Delivery_person_Age            200
Delivery_person_Ratings          0
Restaurant_latitude              0
Restaurant_longitude             0
Delivery_location_latitude       0
Delivery_location_longitude      0
Time_Orderd                      0
Time_Order_picked                0
Weather_conditions               0
Road_traffic_density             0
Vehicle_condition                0
Type_of_order                    0
Type_of_vehicle                  0
multiple_deliveries            855
Festival                       206
City                             0
Time_taken (min)                 0
Year                             0
Month                            0
Day                              0
TimeOrder_hours                  0
TimeOrder_min                    0
Time_Order_picked_Hour           0
Time_Order_picked_min            0
city_code                        0
City_name                        0
distance                         0
dtype: int64

**Differentiating between the numerical and categotical columns**

In [6]:
categorical_columns = df.select_dtypes(include='object').columns

numerical_columns = df.select_dtypes(exclude='object').columns

print("Categorical columns: - ", categorical_columns)
print("\nNumerical Columns: - ", numerical_columns)

Categorical columns: -  Index(['Delivery_person_ID', 'Time_Orderd', 'Time_Order_picked',
       'Weather_conditions', 'Road_traffic_density', 'Type_of_order',
       'Type_of_vehicle', 'Festival', 'City', 'city_code', 'City_name'],
      dtype='object')

Numerical Columns: -  Index(['Delivery_person_Age', 'Delivery_person_Ratings', 'Restaurant_latitude',
       'Restaurant_longitude', 'Delivery_location_latitude',
       'Delivery_location_longitude', 'Vehicle_condition',
       'multiple_deliveries', 'Time_taken (min)', 'Year', 'Month', 'Day',
       'TimeOrder_hours', 'TimeOrder_min', 'Time_Order_picked_Hour',
       'Time_Order_picked_min', 'distance'],
      dtype='object')


**Removing the unrequired columns**

In [7]:
# droping the Delivery_person_ID, Time_Orderd, Time_Order_picked

df.drop(columns=['Delivery_person_ID', 'Time_Orderd', 'Time_Order_picked'], axis=1, inplace=True)

In [8]:
df.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min),Year,Month,Day,TimeOrder_hours,TimeOrder_min,Time_Order_picked_Hour,Time_Order_picked_min,city_code,City_name,distance
0,36.0,4.2,30.327968,78.046106,30.397968,78.116106,Fog,Jam,2,Snack,motorcycle,3.0,No,Semi-Urban,46,2022,2,12,21,55,22,10,DEH,Dehradun,10.280582
1,21.0,4.7,10.003064,76.307589,10.043064,76.347589,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitan,23,2022,2,13,14,55,15,5,KOC,Kochi,6.242319
2,23.0,4.7,18.56245,73.916619,18.65245,74.006619,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitan,21,2022,3,4,17,30,17,40,PUNE,Pune,13.78786
3,34.0,4.3,30.899584,75.809346,30.919584,75.829346,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Semi-Urban,20,2022,2,13,9,20,9,30,LUDH,Ludhiana,2.930258
4,24.0,4.7,26.463504,80.372929,26.593504,80.502929,Fog,Jam,1,Snack,scooter,1.0,No,Semi-Urban,41,2022,2,14,19,50,20,5,KNP,Kanpur,19.396618


Droping extra columns that are not required for the model training.

In [9]:
df.drop(columns=['Restaurant_latitude', 'Restaurant_longitude', 'Delivery_location_latitude', 'Delivery_location_longitude', 'Year', 
                 'Month', 'Day', 'TimeOrder_min', 'Time_Order_picked_Hour', 'Time_Order_picked_min', 'City_name'], axis=1, inplace=True)

In [10]:
df.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min),TimeOrder_hours,city_code,distance
0,36.0,4.2,Fog,Jam,2,Snack,motorcycle,3.0,No,Semi-Urban,46,21,DEH,10.280582
1,21.0,4.7,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitan,23,14,KOC,6.242319
2,23.0,4.7,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitan,21,17,PUNE,13.78786
3,34.0,4.3,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Semi-Urban,20,9,LUDH,2.930258
4,24.0,4.7,Fog,Jam,1,Snack,scooter,1.0,No,Semi-Urban,41,19,KNP,19.396618


In [11]:
categorical_columns = df.select_dtypes(include='object')
categorical_columns

Unnamed: 0,Weather_conditions,Road_traffic_density,Type_of_order,Type_of_vehicle,Festival,City,city_code
0,Fog,Jam,Snack,motorcycle,No,Semi-Urban,DEH
1,Stormy,High,Meal,motorcycle,No,Metropolitan,KOC
2,Sandstorms,Medium,Drinks,scooter,No,Metropolitan,PUNE
3,Sandstorms,Low,Buffet,motorcycle,No,Semi-Urban,LUDH
4,Fog,Jam,Snack,scooter,No,Semi-Urban,KNP
...,...,...,...,...,...,...,...
40339,Windy,Jam,Drinks,motorcycle,No,Semi-Urban,RANCHI
40340,Windy,High,Meal,motorcycle,No,Metropolitan,JAP
40341,Cloudy,Low,Drinks,scooter,No,Metropolitan,CHEN
40342,Cloudy,High,Snack,motorcycle,No,Metropolitan,COIMB


In [12]:
numerical_columns = df.select_dtypes(exclude='object')
numerical_columns


Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Vehicle_condition,multiple_deliveries,Time_taken (min),TimeOrder_hours,distance
0,36.0,4.2,2,3.0,46,21,10.280582
1,21.0,4.7,1,1.0,23,14,6.242319
2,23.0,4.7,1,1.0,21,17,13.787860
3,34.0,4.3,0,0.0,20,9,2.930258
4,24.0,4.7,1,1.0,41,19,19.396618
...,...,...,...,...,...,...,...
40339,35.0,4.2,2,1.0,33,21,16.600272
40340,30.0,4.8,1,0.0,32,11,1.489846
40341,30.0,4.9,1,0.0,16,23,4.657195
40342,20.0,4.7,0,1.0,26,13,6.232393


**Splitting the dataset into Dependent (y) and Independent Features (X)**

In [13]:
# splitting the data
X = df.drop(labels=['Time_taken (min)'], axis=1)
y = df['Time_taken (min)']

In [14]:
X

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,TimeOrder_hours,city_code,distance
0,36.0,4.2,Fog,Jam,2,Snack,motorcycle,3.0,No,Semi-Urban,21,DEH,10.280582
1,21.0,4.7,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitan,14,KOC,6.242319
2,23.0,4.7,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitan,17,PUNE,13.787860
3,34.0,4.3,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Semi-Urban,9,LUDH,2.930258
4,24.0,4.7,Fog,Jam,1,Snack,scooter,1.0,No,Semi-Urban,19,KNP,19.396618
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40339,35.0,4.2,Windy,Jam,2,Drinks,motorcycle,1.0,No,Semi-Urban,21,RANCHI,16.600272
40340,30.0,4.8,Windy,High,1,Meal,motorcycle,0.0,No,Metropolitan,11,JAP,1.489846
40341,30.0,4.9,Cloudy,Low,1,Drinks,scooter,0.0,No,Metropolitan,23,CHEN,4.657195
40342,20.0,4.7,Cloudy,High,0,Snack,motorcycle,1.0,No,Metropolitan,13,COIMB,6.232393


In [15]:
y

0        46
1        23
2        21
3        20
4        41
         ..
40339    33
40340    32
40341    16
40342    26
40343    36
Name: Time_taken (min), Length: 40344, dtype: int64

**Preparaing the pipeline for Ordinal, Onehot Encoding and imputation**

In [16]:
df.columns

Index(['Delivery_person_Age', 'Delivery_person_Ratings', 'Weather_conditions',
       'Road_traffic_density', 'Vehicle_condition', 'Type_of_order',
       'Type_of_vehicle', 'multiple_deliveries', 'Festival', 'City',
       'Time_taken (min)', 'TimeOrder_hours', 'city_code', 'distance'],
      dtype='object')

In [17]:
Road_traffic_density=['Low','Medium','High','Jam']
Weather_conditions=['Sunny','Cloudy','Windy','Fog','Sandstorms','Stormy']

In [18]:
# differentiating columns into categorical, ordinal, numerical

categorical_column = ['Type_of_order','Type_of_vehicle','Festival','City','city_code']
ordinal_column = ['Road_traffic_density','Weather_conditions']
numerical_column = ['Delivery_person_Age','Delivery_person_Ratings','Vehicle_condition','multiple_deliveries',
                  'TimeOrder_hours','distance']

In [19]:
# Pipeline

# numerical pipeline
numerical_pipeline=Pipeline(steps=[
    ('impute',SimpleImputer(strategy='constant',fill_value=0)),
    ('scaler',StandardScaler(with_mean=False))
])

# categorical pipeline
categorical_pipeline=Pipeline(steps=[
    ('impute',SimpleImputer(strategy='most_frequent')),
    ('onehot',OneHotEncoder(handle_unknown='ignore')),
    ('scaler',StandardScaler(with_mean=False))
])


# ordinal pipeline
ordianl_pipeline=Pipeline(steps=[
    ('impute',SimpleImputer(strategy='most_frequent')),
    ('ordinal',OrdinalEncoder(categories=[Road_traffic_density,Weather_conditions])),
    ('scaler',StandardScaler(with_mean=False))   
])


In [20]:
preprocessor =ColumnTransformer([
    ('numerical_pipeline',numerical_pipeline,numerical_column),
    ('categorical_pipeline',categorical_pipeline,categorical_column),
    ('ordianl_pipeline',ordianl_pipeline,ordinal_column)
])

**Splitting the data into train and test set**

In [22]:
# splitting into train and test set
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [24]:
x_train.shape

(32275, 13)

In [25]:
x_test.shape

(8069, 13)

**Transforming the independent Features**

In [26]:
x_train = preprocessor.fit_transform(x_train)

In [27]:
x_test = preprocessor.transform(x_test)

**Model Training**

In [28]:
# importing required libraries
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [37]:
# Creating a function for calculating r2_score, mae and mse
def evaluate_model(true_val, predicted_val):
    ''' 
    This function will return the calculated r2 score, mean absolute error and mean squared error
    input: actual value (y) and the predicted value (y_pred)
    output: 1. r2 score value
            2. mean absolute error value
            3. mean squared error value
            4. root mean square error value
    '''
    r2 = r2_score(true_val, predicted_val)
    mae = mean_absolute_error(true_val, predicted_val)
    mse = mean_squared_error(true_val, predicted_val)
    rmse = np.sqrt(mean_squared_error(true_val, predicted_val))

    return r2, mae, mse, rmse

In [38]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
}

r2_list = []
mse_list = []
models_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train, y_train)

    # Making predictions
    train_y_pred = model.predict(x_train)
    test_y_pred = model.predict(x_test)

    # Evaluatoing the predictions
    train_r2, train_mae, train_mse, train_rmse = evaluate_model(y_train, train_y_pred)
    test_r2, test_mae, test_mse, test_rmse = evaluate_model(y_test, test_y_pred)

    models_list.append(list(models.keys())[i])

    # Printing the results
    print(list(models.keys())[i])

    print("Model performance for Training set")
    print("- Root Mean Squared Error: {:.4f}".format(train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(train_mae))
    print("- R2 Score: {:.4f}".format(train_r2))

    print("-"*35)
    
    print("Model performance for Test set")
    print("- Root Mean Squared Error: {:.4f}".format(test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(test_mae))
    print("- R2 Score: {:.4f}".format(test_r2))
    r2_list.append(test_r2)
    mse_list.append(test_mse)

    print("="*35)
    print("\n")

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 6.4339
- Mean Absolute Error: 5.1071
- R2 Score: 0.5268
-----------------------------------
Model performance for Test set
- Root Mean Squared Error: 6.5386
- Mean Absolute Error: 5.2098
- R2 Score: 0.5210


Lasso
Model performance for Training set
- Root Mean Squared Error: 6.8327
- Mean Absolute Error: 5.4624
- R2 Score: 0.4663
-----------------------------------
Model performance for Test set
- Root Mean Squared Error: 6.9326
- Mean Absolute Error: 5.5619
- R2 Score: 0.4615


Ridge
Model performance for Training set
- Root Mean Squared Error: 6.4339
- Mean Absolute Error: 5.1071
- R2 Score: 0.5268
-----------------------------------
Model performance for Test set
- Root Mean Squared Error: 6.5386
- Mean Absolute Error: 5.2098
- R2 Score: 0.5210


Decision Tree
Model performance for Training set
- Root Mean Squared Error: 0.0118
- Mean Absolute Error: 0.0001
- R2 Score: 1.0000
----------------------------

In [39]:
result = pd.DataFrame(list(zip(models_list, r2_list)), columns=['Model','R2 Score']).sort_values(by='R2 Score', ascending=False)
result

Unnamed: 0,Model,R2 Score
5,XGBRegressor,0.825142
4,Random Forest Regressor,0.824416
3,Decision Tree,0.674706
2,Ridge,0.521005
0,Linear Regression,0.521005
1,Lasso,0.461545


In [40]:

Results = pd.DataFrame(list(zip(models_list, mse_list)), columns=['Model Name', 'Mse']).sort_values(by=["Mse"],ascending=True)
Results

Unnamed: 0,Model Name,Mse
5,XGBRegressor,15.60725
4,Random Forest Regressor,15.672049
3,Decision Tree,29.034608
2,Ridge,42.753453
0,Linear Regression,42.753455
1,Lasso,48.060636


**According to the results obtained after training various models, `XGBRegressor` issleected as a baseline model which shows the highest accuracy of `0.825142`. i.e., 82% accurate on the test set.**