In [1]:
import sklearn
print(sklearn.__version__)

1.6.1


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from geopy.distance import geodesic
import joblib

## 1. Load Data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Define the file path
file_path = '/content/drive/My Drive/Colab Notebooks/cleaned_pickup_data.csv'

# Read the CSV file
df = pd.read_csv(file_path)
df = df.dropna()
df = df.head(100000)

## 2. Convert Data Types

In [None]:
date_cols = ["accept_time", "accept_gps_time", "pi_gps_time"]
df[date_cols] = df[date_cols].apply(pd.to_datetime)

## 3. Sort Data Chronologically

In [None]:
df = df.sort_values(by="accept_time")

## 4. Feature Engineering

In [None]:
#ETA Calculation
df['pickup_time'] = pd.to_datetime(df['pickup_time']) # Convert 'pickup_time' to datetime
df['ETA'] = (df['pickup_time'] - df['accept_time']).dt.total_seconds()/60

In [None]:
## Distance Calculation (Geodesic Distance)
df["distance_km"] = df.apply(lambda row: geodesic((row["accept_gps_lat"], row["accept_gps_lng"]),
                                                    (row["pickup_gps_lat"], row["pickup_gps_lng"]) ).km, axis=1)

In [None]:
## Time-Based Features
df["hour"] = df["accept_time"].dt.hour
df["day_of_week"] = df["accept_time"].dt.dayofweek
df["is_weekend"] = (df["day_of_week"] >= 5).astype(int)

## 5. Handle Missing Values

In [None]:
# Select only numeric columns for median calculation
numeric_df = df.select_dtypes(include=np.number)

# Calculate the median for numeric columns
median_values = numeric_df.median()

# Fill missing values in the original DataFrame using the calculated medians
df.fillna(median_values, inplace=True)

In [None]:
# Encoding Categorical Variables
from sklearn.preprocessing import LabelEncoder # Import LabelEncoder

categorical_cols = ['city', 'aoi_type']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

## 6. Feature Selection

In [None]:
features = ["distance_km", "hour", "day_of_week", "is_weekend", "time_window_duration", "task_duration"]
target = "ETA"

## 7.Scaling Numerical Features

In [None]:
scaler = StandardScaler()
numeric_cols = ['distance', 'hour', 'day_of_week', 'ETA']
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

## 8.Chronological Train-Test Split

In [None]:
# Define columns to drop
drop_cols = [
    'order_id', 'courier_id', 'accept_time', 'pickup_time',
    'accept_gps_time', 'pi_gps_time', 'pickup_gps_time',
    'time_window_start', 'time_window_end', 'ds'
]

# Drop only existing columns
df.drop(columns=[col for col in drop_cols if col in df.columns], inplace=True)


In [None]:
train_size = int(0.6 * len(df))
val_size = int(0.2 * len(df))
test_size = len(df) - (train_size + val_size)
train_data = df.iloc[:train_size]
val_data = df.iloc[train_size:train_size + val_size]
test_data = df.iloc[train_size + val_size:]


X_train, y_train = train_data.drop(columns=['ETA']), train_data['ETA']
X_val, y_val = val_data.drop(columns=['ETA']), val_data['ETA']
X_test, y_test = test_data.drop(columns=['ETA']), test_data['ETA']

# 9.Model Training
# Gradient Boosting Regressor

In [None]:
gbr = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)
gbr.fit(X_train, y_train)

## Support Vector Regression (SVR)

In [None]:
svr = SVR(kernel='rbf', C=10, gamma='scale')
svr.fit(X_train, y_train)

## 10.Model Evaluation

In [None]:
def evaluate_model(model, X, y):
    predictions = model.predict(X)
    mae = mean_absolute_error(y, predictions)
    mse = mean_squared_error(y, predictions)
    r2 = r2_score(y, predictions)
    return mae, mse, r2

In [None]:
print("Gradient Boosting Regressor:")
print(evaluate_model(gbr, X_test, y_test))

Gradient Boosting Regressor:
(3.351124942222854e-05, 5.314828141272911e-09, 0.9999999949594169)


In [None]:
print("Support Vector Regression:")
print(evaluate_model(svr, X_test, y_test))

Support Vector Regression:
(0.03839136303159514, 0.002775033025874692, 0.9973681586503085)


Model Comparison
✅ GBR outperforms SVR with a near-perfect R² score (~1), minimal MAE (0.0000335), and extremely low MSE (5.31e-09).
✅ SVR performs well but has slightly higher errors (MAE: 0.0384, MSE: 0.00278) and a lower R² (0.9974).

## 11. Check for Overfitting: Evaluate GBR on Test Data

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predict on training and test sets
y_train_pred = gbr.predict(X_train)
y_test_pred = gbr.predict(X_test)

# Evaluate performance
def evaluate_model(y_true, y_pred, dataset_name):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"{dataset_name} Performance:")
    print(f"MAE: {mae:.6f}, MSE: {mse:.6f}, R²: {r2:.6f}")
    print("-" * 50)

# Print results
evaluate_model(y_train, y_train_pred, "Training Data")
evaluate_model(y_test, y_test_pred, "Test Data")


Training Data Performance:
MAE: 0.000030, MSE: 0.000000, R²: 1.000000
--------------------------------------------------
Test Data Performance:
MAE: 0.000034, MSE: 0.000000, R²: 1.000000
--------------------------------------------------


## 12.K-Fold Cross-Validation (Using 5-Folds)

In [None]:
from sklearn.model_selection import cross_val_score

# Perform 5-fold cross-validation
cv_scores = cross_val_score(gbr, X_train, y_train, cv=5, scoring='r2')

# Print results
print("Cross-Validation R² Scores:", cv_scores)
print(f"Mean R² Score: {cv_scores.mean():.6f}")
print(f"Standard Deviation: {cv_scores.std():.6f}")


Cross-Validation R² Scores: [0.99999999 1.         0.99999999 1.         0.99999999]
Mean R² Score: 1.000000
Standard Deviation: 0.000000


## 13.Hyperparameter Tuning using Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# Perform Grid Search
grid_search = GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print(f"Best Cross-Validation R² Score: {grid_search.best_score_:.6f}")

# Train GBR with best parameters
best_gbr = GradientBoostingRegressor(**grid_search.best_params_, random_state=42)
best_gbr.fit(X_train, y_train)

# Evaluate optimized model
evaluate_model(y_test, best_gbr.predict(X_test), "Test Data (Optimized GBR)")


Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
Best Cross-Validation R² Score: 1.000000
Test Data (Optimized GBR) Performance:
MAE: 0.000000, MSE: 0.000000, R²: 1.000000
--------------------------------------------------


## After hyperparameter tuning, the model's performance is optimized with the best parameters: learning_rate = 0.1, max_depth = 7, and n_estimators = 200. The cross-validation R² score of 1.0 indicates that the model explains all the variance in the training data, suggesting excellent generalization. On the test data, the model achieves MAE = 0.0, MSE = 0.0, and R² = 1.0, confirming that it makes perfect predictions without any error. This indicates a high-performing, well-tuned model.

## The best performing model is the Gradient Boosting Regressor (GBR).

In [None]:
import joblib  # For model persistence
# Save the trained model
model_filename = 'best_gbr_model.pkl'  # Choose a filename
joblib.dump(best_gbr, model_filename)

['best_gbr_model.pkl']

In [None]:
# Download the model (for Google Colab)
from google.colab import files
files.download(model_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>