In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [7]:
df =pd.read_csv("../Data/data_after_feature_engineering.csv",index_col = 0)
df.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,Multiple_deliveries,Festival,City_type,Time_taken(min),city_code,is_weekend,month_interval,order_prepare_time,distance
0,37.0,4.9,22.745049,75.892471,22.765049,75.912471,Sunny,High,2,Snack,motorcycle,0.0,No,Urban,24,INDO,True,middle,15.0,2
1,34.0,4.5,12.913041,77.683237,13.043041,77.813237,Stormy,Jam,2,Snack,scooter,1.0,No,Metropolitian,33,BANG,False,end,5.0,20
2,23.0,4.4,12.914264,77.6784,12.924264,77.6884,Sandstorms,Low,0,Drinks,motorcycle,1.0,No,Urban,26,BANG,True,middle,15.0,1
3,38.0,4.7,11.003669,76.976494,11.053669,77.026494,Sunny,Medium,0,Buffet,motorcycle,1.0,No,Metropolitian,21,COIMB,False,start,10.0,7
4,32.0,4.6,12.972793,80.249982,13.012793,80.289982,Cloudy,High,1,Snack,scooter,1.0,No,Metropolitian,30,CHEN,True,end,15.0,6


In [10]:
from sklearn.preprocessing import LabelEncoder
def label_encoding(df):
    
    categoric_columns = df.select_dtypes(include='object').columns
    label = LabelEncoder()
    for col in categoric_columns:
        df[col] = label.fit_transform(df[col])

    

label_encoding(df)    

In [11]:
df.columns

Index(['Delivery_person_Age', 'Delivery_person_Ratings', 'Restaurant_latitude',
       'Restaurant_longitude', 'Delivery_location_latitude',
       'Delivery_location_longitude', 'Weather_conditions',
       'Road_traffic_density', 'Vehicle_condition', 'Type_of_order',
       'Type_of_vehicle', 'Multiple_deliveries', 'Festival', 'City_type',
       'Time_taken(min)', 'city_code', 'is_weekend', 'month_interval',
       'order_prepare_time', 'distance'],
      dtype='object')

In [12]:
column_mapping = {column_name: idx for idx, column_name in enumerate(df.columns)}
column_mapping

{'Delivery_person_Age': 0,
 'Delivery_person_Ratings': 1,
 'Restaurant_latitude': 2,
 'Restaurant_longitude': 3,
 'Delivery_location_latitude': 4,
 'Delivery_location_longitude': 5,
 'Weather_conditions': 6,
 'Road_traffic_density': 7,
 'Vehicle_condition': 8,
 'Type_of_order': 9,
 'Type_of_vehicle': 10,
 'Multiple_deliveries': 11,
 'Festival': 12,
 'City_type': 13,
 'Time_taken(min)': 14,
 'city_code': 15,
 'is_weekend': 16,
 'month_interval': 17,
 'order_prepare_time': 18,
 'distance': 19}

In [14]:
from sklearn.model_selection import train_test_split
X = df.drop('Time_taken(min)', axis=1)  # Features
y = df['Time_taken(min)']               # Target variable

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape,y_train.shape , X_test.shape ,y_test.shape)


(36474, 19) (36474,) (9119, 19) (9119,)


In [16]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler on the training data
scaler.fit(X_train)

# Perform standardization on the training data
X_train = scaler.transform(X_train)

# Perform standardization on the testing data
X_test = scaler.transform(X_test)

In [32]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Find the best model
models = [
    LinearRegression(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    XGBRegressor(),        

]

param_grid = [
    {},
    {'max_depth': [3, 5, 7]},
    {'n_estimators': [3, 5, 45], 'max_features': ['auto', 'sqrt', 'log2']},                                                                                                                     
    {'n_estimators': [20, 25, 30], 'max_depth': [5, 7, 9]},       

]

for i, model in enumerate(models):
    grid_search = GridSearchCV(model, param_grid[i], cv=5, scoring='r2')
    grid_search.fit(X_train, y_train)

    print(f"{model.__class__.__name__}:")
    print("Best parameters:", grid_search.best_params_)
    print("Best R2 score:", grid_search.best_score_)
    print()

LinearRegression:
Best parameters: {}
Best R2 score: 0.4185875073875159

DecisionTreeRegressor:
Best parameters: {'max_depth': 7}
Best R2 score: 0.733882737105098

RandomForestRegressor:
Best parameters: {'max_features': 'log2', 'n_estimators': 45}
Best R2 score: 0.7994235869468068

XGBRegressor:
Best parameters: {'max_depth': 7, 'n_estimators': 20}
Best R2 score: 0.8213681912317743



In [30]:

# Create a XGB regressor model
xgb_model = XGBRegressor(n_estimators=20,max_depth=9)

# Fit the model on the training data
xgb_model.fit(X_train, y_train)

In [31]:
from sklearn.metrics import mean_squared_error ,mean_absolute_error,r2_score
y_pred = xgb_model.predict(X_test)

def adjusted_r_squared(r2, n, k):
    return 1 - ((1 - r2) * (n - 1) / (n - k - 1))

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
adjusted_r2 = adjusted_r_squared(r2, len(y_test), X_test.shape[1])

print("Mean Absolute Error (MAE):", round(mae,2))
print("Mean Squared Error (MSE):", round(mse,2))
print("Root Mean Squared Error (RMSE):", round(rmse,2))
print("R-squared (R2) Score:", round(r2,4))
print("Adjusted R-squared Score:", round(adjusted_r2, 4))

Mean Absolute Error (MAE): 3.16
Mean Squared Error (MSE): 15.78
Root Mean Squared Error (RMSE): 3.97
R-squared (R2) Score: 0.82
Adjusted R-squared Score: 0.8196
