## Amazon Delivery Time Prediction - Regression Model Development

In [31]:
# Import required libraries
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [33]:
# Load the dataset.
# The path is relative, so the CSV must sit in the notebook's working directory.
# NOTE(review): presumably this file is the output of an earlier feature-engineering
# step (it already carries *_Encoded columns) — confirm its provenance.
data = pd.read_csv("featured_data.csv")

In [35]:
# Show the dtype pandas inferred for each column, to see which ones are already
# numeric and which are still stored as object.
data.dtypes

Order_ID                    object
Agent_Age                  float64
Agent_Rating               float64
Store_Latitude             float64
Store_Longitude            float64
Drop_Latitude              float64
Drop_Longitude             float64
Order_Date                  object
Order_Time                  object
Pickup_Time                 object
Weather                     object
Traffic                     object
Vehicle                     object
Area                        object
Delivery_Time                int64
Category                    object
Distance                   float64
Order_Hour                   int64
Order_Minute                 int64
Pickup_Hour                  int64
Pickup_Minute                int64
Order_Day                   object
Order_Month                  int64
Weather_Encoded              int64
Traffic_Encoded              int64
Vehicle_Encoded              int64
Area_Encoded                 int64
Category_Encoded             int64
Order_Day_Encoded   

In [37]:
# Preview the first five rows to sanity-check the loaded values
data.head()

Unnamed: 0,Order_ID,Agent_Age,Agent_Rating,Store_Latitude,Store_Longitude,Drop_Latitude,Drop_Longitude,Order_Date,Order_Time,Pickup_Time,...,Weather_Encoded,Traffic_Encoded,Vehicle_Encoded,Area_Encoded,Category_Encoded,Order_Day_Encoded,Order_to_Pickup_Minutes,Is_Weekend,Is_Peak_Hour,Distance_Category
0,ialx566343618,1.278203,0.795124,22.745049,75.892471,22.765049,75.912471,2022-03-19,11:30:00,11:45:00,...,4,0,1,3,2,2,0.166617,1,0,Short
1,akqg208421122,0.762304,-0.400487,12.913041,77.683237,13.043041,77.813237,2022-03-25,19:45:00,19:50:00,...,3,1,2,0,4,0,0.11582,0,1,Very Long
2,njpu434582536,-1.129327,-0.69939,12.914264,77.6784,12.924264,77.6884,2022-03-19,08:30:00,08:45:00,...,2,2,1,3,14,2,0.166617,1,1,Short
3,rjto796129700,1.450169,0.197319,11.003669,76.976494,11.053669,77.026494,2022-04-05,18:00:00,18:10:00,...,4,3,1,0,3,5,0.141219,0,1,Medium
4,zguw716275638,0.418371,-0.101584,12.972793,80.249982,13.012793,80.289982,2022-03-26,13:30:00,13:45:00,...,0,0,2,0,15,2,0.166617,1,1,Medium


In [39]:
# Convert integer columns to float64 without modifying the caller's frame
def ensure_float(data):
    """Return a copy of ``data`` with every integer column cast to float64.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose integer-typed columns should become floats. It is left
        unmodified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index. Integer columns of any
        width (int64, int32, uint8, ...) are float64; every other column keeps
        its dtype.
    """
    # is_integer_dtype deliberately excludes bool and timedelta columns, which
    # must not be coerced to float here.
    int_columns = [
        col for col, dtype in data.dtypes.items()
        if pd.api.types.is_integer_dtype(dtype)
    ]
    # astype with a per-column mapping builds a new frame, so the input is never
    # written to (no in-place side effect, no SettingWithCopyWarning on slices).
    return data.astype({col: np.float64 for col in int_columns})

In [19]:
# Target column, plus every numeric column except the target as candidate features.
# Note: both X and y are reassigned in the following cell from an explicit feature
# list, so this selection does not reach the train/test split as written.
y = data['Delivery_Time']
X = (
    data
    .select_dtypes(include=[np.number])
    .drop(columns=['Delivery_Time'], errors='ignore')
    .pipe(ensure_float)
)

In [41]:
# Explicit feature list used for modelling: agent attributes, trip distance,
# the encoded context columns, and the order-to-pickup gap.
features = [
    'Agent_Age',
    'Agent_Rating',
    'Distance',
    'Weather_Encoded',
    'Traffic_Encoded',
    'Vehicle_Encoded',
    'Area_Encoded',
    'Category_Encoded',
    'Order_to_Pickup_Minutes',
]

# Regression target and design matrix (columns in the order listed above)
y = data['Delivery_Time']
X = data[features]

In [43]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [45]:
# Candidate regressors keyed by display name; the key is reused as the pickle
# filename and the MLflow run name when the models are trained.
# NOTE(review): the hyperparameters are described as optimized, but the tuning
# procedure is not shown in this notebook — confirm where they came from.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(
        n_estimators=300,
        max_depth=15,
        min_samples_split=5,
        random_state=42,
    ),
    "Gradient Boosting": GradientBoostingRegressor(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=7,
        random_state=42,
    ),
    "SVR": SVR(C=10.0, epsilon=0.05, kernel='rbf'),
    "XGBoost": XGBRegressor(
        objective='reg:squarederror',
        n_estimators=250,
        learning_rate=0.05,
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
    ),
}

In [47]:
# Group every run logged below under a single MLflow experiment
mlflow.set_experiment("Amazon Delivery Time Prediction")

# Fitted estimators, keyed by the same display names as `models`
trained_models = dict()

# Running winner: name of the model with the lowest RMSE seen so far, and that RMSE
best_model, best_rmse = None, float("inf")

In [49]:
# Fit each candidate, pickle it, score it on the hold-out split, and log the run to MLflow.
# NOTE(review): the winner is picked on the same hold-out split whose RMSE is reported,
# so the printed best RMSE is an optimistic estimate.

# Hyperparameters worth recording; anything else returned by get_params() is skipped
important_params = ['n_estimators', 'max_depth', 'learning_rate', 'C', 'epsilon', 'subsample', 'min_samples_split', 'colsample_bytree']

for name, model in models.items():
    print(f"Training model: {name}")
    model.fit(X_train, y_train)
    trained_models[name] = model

    # Persist the fitted estimator as "<display name>.pkl" in the working directory
    with open(f"{name}.pkl", "wb") as f:
        pickle.dump(model, f)

    # Hold-out metrics
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    print(f"{name} - RMSE: {rmse:.2f}, MAE: {mae:.2f}, R-squared: {r2:.2f}")

    with mlflow.start_run(run_name=name):
        # Log only important hyperparameters
        if hasattr(model, 'get_params'):
            params = model.get_params()
            for param_name in params:
                if param_name not in important_params:
                    continue
                mlflow.log_param(param_name, params[param_name])

        mlflow.log_param("model", name)
        for metric_name, metric_value in (("RMSE", rmse), ("MAE", mae), ("R-squared", r2)):
            mlflow.log_metric(metric_name, metric_value)

        mlflow.sklearn.log_model(model, name)

        # Keep track of the lowest-RMSE model seen so far
        if rmse < best_rmse:
            best_rmse, best_model = rmse, name

print(f"Best Model: {best_model} with RMSE: {best_rmse:.2f}")

Training model: Linear Regression
Linear Regression - RMSE: 45.58, MAE: 35.26, R-squared: 0.22




Training model: Random Forest
Random Forest - RMSE: 22.52, MAE: 17.50, R-squared: 0.81




Training model: Gradient Boosting
Gradient Boosting - RMSE: 21.84, MAE: 17.05, R-squared: 0.82




Training model: SVR
SVR - RMSE: 43.76, MAE: 33.67, R-squared: 0.28




Training model: XGBoost
XGBoost - RMSE: 21.86, MAE: 17.03, R-squared: 0.82




Best Model: Gradient Boosting with RMSE: 21.84


In [51]:
# Reload the winning model from the pickle written during training.
# The filename is derived from `best_model` (set by the training loop) so this
# cell keeps following the model selection if a re-run crowns a different winner,
# instead of silently loading a hard-coded file.
# Security note: pickle.load can execute arbitrary code from the file; only load
# pickles that this notebook produced itself.
with open(f"{best_model}.pkl", "rb") as file:
    model = pickle.load(file)

In [53]:
# Define the input features for one order, keyed by the training column names
# (same names and order as the `features` list used to build X).
# NOTE(review): the rows shown by data.head() hold Agent_Age / Agent_Rating values
# such as 1.28 and 0.80, which look standardized, while the values here (37, 4.9)
# look raw — confirm whether the same scaling must be applied before predicting.
sample_order = {
    'Agent_Age': 37,
    'Agent_Rating': 4.9,
    'Distance': 3,
    'Weather_Encoded': 4,
    'Traffic_Encoded': 0,
    'Vehicle_Encoded': 1,
    'Area_Encoded': 3,
    'Category_Encoded': 2,
    'Order_to_Pickup_Minutes': 0.16,
}

# A one-row DataFrame (rather than a bare array) carries the column names, so the
# estimator can check them against the names it was fitted with instead of
# depending on positional order. Cast to float to match the all-float array
# this cell produced before.
input_data = pd.DataFrame([sample_order]).astype(float)

# Ensure it has the correct shape
print("Input shape:", input_data.shape)

Input shape: (1, 9)


In [55]:
# Run the reloaded model on the prepared input; input_data holds a single row
# (shape (1, 9) above), so element 0 of the result is that row's prediction.
prediction = model.predict(input_data)

# Print the predicted delivery time
print(f"Predicted Delivery Time: {prediction[0]:.2f} minutes")

Predicted Delivery Time: 119.39 minutes


