# AI-Driven Delivery Delay Prediction

This notebook covers:
1. **Data Preparation** - Load LaDe dataset, calculate & simulate features
2. **Exploratory Data Analysis** - Visualize patterns and correlations
3. **Model Training & Evaluation** - Random Forest, XGBoost, Gradient Boosting, compared on a held-out test set
4. **Save Best Model** - Persist the best model (chosen by ROC-AUC), its label encoders, and the comparison results

In [None]:
# Install required packages
!pip install pandas numpy scikit-learn xgboost matplotlib seaborn -q

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# Suppress every warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Plot theme, and no cap on the number of DataFrame columns displayed.
plt.style.use('seaborn-v0_8-whitegrid')
pd.options.display.max_columns = None

print("Libraries imported successfully!")

## Part 1: Data Preparation

### 1.1 Load Datasets

In [None]:
# Read the local LaDe sample file
# (downloaded from Hugging Face: Cainiao-AI/LaDe-D - Shanghai delivery data)
print("Loading LaDe Shanghai dataset from local file...")
df_lade = pd.read_csv('Datasets/lade_shanghai_sample.csv')

# Quick sanity check: row count and column names
print(f"Loaded {df_lade.shape[0]} records from LaDe dataset")
print(f"Columns: {df_lade.columns.tolist()}")

In [None]:
# Display sample data: the trailing expression is the cell's output, so the
# first rows of the raw LaDe frame are shown below the printed caption.
print("Sample data from LaDe dataset:")
df_lade.head()

In [None]:
# Reference dataset: only used as a source of feature distributions
# (vehicle type, weather, package weight) for the simulated columns.
df_logistics = pd.read_csv('Datasets/Delivery_Logistics.csv')
print(f"Loaded Delivery_Logistics.csv: {df_logistics.shape[0]} records")
print(f"Columns: {df_logistics.columns.tolist()}")

### 1.2 Calculate Distance from GPS Coordinates

In [None]:
from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2):
    """Calculate the great circle distance in km between two points on earth.

    Args:
        lon1, lat1: Longitude and latitude of the first point, in decimal degrees.
        lon2, lat2: Longitude and latitude of the second point, in decimal degrees.

    Returns:
        Great-circle distance in kilometres on a sphere of radius 6371 km.
        NaN inputs propagate to a NaN result.
    """
    # Convert to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally above 1 for antipodal
    # points, which would make asin() raise "math domain error". The explicit
    # comparison (instead of min()) leaves NaN untouched.
    if a > 1.0:
        a = 1.0
    c = 2 * asin(sqrt(a))
    r = 6371  # Radius of earth in km
    return c * r

def _row_distance(row):
    """Distance in km between a row's accept GPS position and its (lng, lat) point."""
    return haversine(
        row['accept_gps_lng'], row['accept_gps_lat'],
        row['lng'], row['lat']
    )

# One haversine evaluation per delivery row
df_lade['distance_km'] = df_lade.apply(_row_distance, axis=1)

print("Distance statistics (km):")
print(df_lade['distance_km'].describe())

### 1.3 Parse and Process Timestamps

In [None]:
# Parse timestamps (format: MM-DD HH:MM:SS)
# We'll assume year 2024 for all dates

def parse_lade_time(time_str, year=2024):
    """Parse LaDe timestamp format (MM-DD HH:MM:SS) to datetime.

    Args:
        time_str: Timestamp without a year component, e.g. '06-04 11:05:00'.
        year: Four-digit year prepended before parsing (default 2024).

    Returns:
        The parsed `datetime`, or None when the value cannot be parsed
        (malformed text, or a missing value such as None/NaN).
    """
    try:
        # Format after prefixing the year: '2024-06-04 11:05:00'
        return datetime.strptime(f"{year}-{time_str}", "%Y-%m-%d %H:%M:%S")
    except (ValueError, TypeError):
        # Only parsing failures count as "invalid timestamp"; anything else
        # (e.g. KeyboardInterrupt) propagates instead of being swallowed.
        return None

# Parse order time (accept_time) and actual delivery time
df_lade['order_time'] = df_lade['accept_time'].apply(parse_lade_time)
df_lade['actual_delivery_time'] = df_lade['delivery_time'].apply(parse_lade_time)

# Remove rows with invalid timestamps.
# .copy() makes the result an independent frame, so the column assignments
# below (and in later cells) do not write into a slice of the original.
df_lade = df_lade.dropna(subset=['order_time', 'actual_delivery_time']).copy()

# Calculate actual delivery duration in hours
df_lade['delivery_duration_hours'] = (
    df_lade['actual_delivery_time'] - df_lade['order_time']
).dt.total_seconds() / 3600

# Filter out negative or unrealistic durations (> 24 hours).
# NOTE: both timestamps are parsed with the same fixed year, so a delivery
# accepted in late December and completed in January gets a negative
# duration and is dropped here.
valid_duration = (df_lade['delivery_duration_hours'] > 0) & (df_lade['delivery_duration_hours'] <= 24)
df_lade = df_lade[valid_duration].copy()

print(f"Records after timestamp filtering: {len(df_lade)}")
print("\nDelivery duration statistics (hours):")
print(df_lade['delivery_duration_hours'].describe())

In [None]:
# Calculate scheduled delivery time based on distance
# Assumption: Average speed of 15 km/h in urban areas + 30 min buffer

def estimate_delivery_time(distance_km, avg_speed=15, buffer_hours=0.5):
    """Estimate delivery time in hours based on distance.

    Args:
        distance_km: Trip distance in kilometres.
        avg_speed: Assumed average speed in km/h (default 15, considering
            traffic, stops, etc.).
        buffer_hours: Fixed allowance added for pickup/dropoff
            (default 0.5, i.e. 30 minutes).

    Returns:
        Expected duration in hours: distance_km / avg_speed + buffer_hours.

    Raises:
        ValueError: If avg_speed is not positive.
    """
    if avg_speed <= 0:
        raise ValueError("avg_speed must be positive")
    return (distance_km / avg_speed) + buffer_hours

# Scheduled time = order time + distance-based expected duration
df_lade['expected_duration_hours'] = df_lade['distance_km'].map(estimate_delivery_time)
expected_delta = pd.to_timedelta(df_lade['expected_duration_hours'], unit='h')
df_lade['scheduled_delivery_time'] = df_lade['order_time'] + expected_delta

# Target variable: 1 when the actual delivery is later than the scheduled one
is_late = df_lade['actual_delivery_time'].gt(df_lade['scheduled_delivery_time'])
df_lade['delayed'] = is_late.astype(int)

print("\nDelay distribution:")
print(df_lade['delayed'].value_counts(normalize=True))

### 1.4 Simulate Missing Features

In [None]:
# Get distributions from Delivery_Logistics.csv
def _category_shares(column):
    """Relative frequency of each value of `column` in Delivery_Logistics, as a dict."""
    return df_logistics[column].value_counts(normalize=True).to_dict()

vehicle_types = _category_shares('vehicle_type')
weather_conditions = _category_shares('weather_condition')

print("Vehicle type distribution:")
print(vehicle_types)
print("\nWeather condition distribution:")
print(weather_conditions)

In [None]:
# Fixed seed so the simulated columns are reproducible. The three draws below
# must stay in this order (vehicle -> weight -> weather) to yield the same values.
np.random.seed(42)
n = df_lade.shape[0]

# 1. Vehicle Type - sampled with the category shares from Delivery_Logistics
vehicle_list = [*vehicle_types]
vehicle_probs = [*vehicle_types.values()]
df_lade['vehicle_type'] = np.random.choice(vehicle_list, n, p=vehicle_probs)

# 2. Package Weight - normal draw with the reference mean/std, folded to be non-negative
weight_mean = df_logistics['package_weight_kg'].mean()
weight_std = df_logistics['package_weight_kg'].std()
simulated_weight = np.random.normal(loc=weight_mean, scale=weight_std, size=n)
df_lade['package_weight_kg'] = np.abs(simulated_weight)

# 3. Weather Conditions - sampled with the category shares from Delivery_Logistics
weather_list = [*weather_conditions]
weather_probs = [*weather_conditions.values()]
df_lade['weather_condition'] = np.random.choice(weather_list, n, p=weather_probs)

# 4. Traffic Level - based on hour of order
def get_traffic_level(hour):
    """Map an hour of the day to a simulated traffic level.

    Returns 'Very High', 'High' or 'Medium' for the listed hours and
    'Low' for every other value.
    """
    bands = (
        ((7, 8, 17, 18), 'Very High'),
        ((9, 10, 15, 16), 'High'),
        ((11, 12, 13, 14), 'Medium'),
    )
    for hours, level in bands:
        if hour in hours:
            return level
    return 'Low'

# Traffic level is derived from the hour at which the order was accepted
df_lade['order_hour'] = df_lade['order_time'].dt.hour
df_lade['traffic_level'] = df_lade['order_hour'].map(get_traffic_level)

# 5. Road Type - based on distance
def get_road_type(distance):
    """Map a trip distance to a simulated road type.

    Below 3 -> 'City', below 15 -> 'Highway', anything else -> 'Rural'.
    """
    return 'City' if distance < 3 else ('Highway' if distance < 15 else 'Rural')

df_lade['road_type'] = df_lade['distance_km'].map(get_road_type)

print("Simulated features added!")
# Category counts of the simulated columns, one dict per line
for label, column in (
    ('Vehicle types', 'vehicle_type'),
    ('Traffic levels', 'traffic_level'),
    ('Road types', 'road_type'),
):
    print(f"\n{label}: {df_lade[column].value_counts().to_dict()}")

### 1.5 Create Final Dataset with All Required Columns

In [None]:
# Create the final prepared dataset with all required columns
# Source column -> name in the prepared dataset.
# Insertion order of this mapping is the column order of the output.
column_map = {
    'order_id': 'delivery_id',                            # Delivery ID
    'accept_gps_lat': 'source_lat',                       # Source latitude
    'accept_gps_lng': 'source_lng',                       # Source longitude
    'lat': 'dest_lat',                                    # Destination latitude
    'lng': 'dest_lng',                                    # Destination longitude
    'distance_km': 'distance_km',                         # Distance
    'vehicle_type': 'vehicle_type',                       # Vehicle type
    'courier_id': 'driver_id',                            # Driver ID
    'package_weight_kg': 'package_weight_kg',             # Package weight
    'order_time': 'order_time',                           # Order time
    'scheduled_delivery_time': 'scheduled_delivery_time', # Scheduled delivery time
    'actual_delivery_time': 'actual_delivery_time',       # Actual delivery time
    'traffic_level': 'traffic_level',                     # Traffic level
    'weather_condition': 'weather_condition',             # Weather conditions
    'road_type': 'road_type',                             # Road type
    'delayed': 'delayed',                                 # Target variable
}

# Select the required columns and rename them in one step
df_final = df_lade[list(column_map)].rename(columns=column_map)

print(f"Final dataset shape: {df_final.shape}")
print(f"\nColumns: {df_final.columns.tolist()}")
df_final.head()

In [None]:
# Per-column count of missing values
print("Missing values:")
print(df_final.isna().sum())

# Per-column dtype
print("\nData types:")
print(df_final.dtypes)

In [None]:
# Write the prepared dataset to disk (without the index column)
output_path = 'prepared_logistics_dataset.csv'
df_final.to_csv(output_path, index=False)
print(f"Dataset saved to '{output_path}'")

---
## Part 2: Exploratory Data Analysis

In [None]:
# Dataset overview: size, target counts and overall delay rate
rule = "=" * 50
print(rule)
print("DATASET OVERVIEW")
print(rule)

n_rows, n_cols = df_final.shape
print(f"\nTotal records: {n_rows:,}")
print(f"Total features: {n_cols}")
print("\nTarget variable distribution:")
print(df_final['delayed'].value_counts())
print(f"\nDelay rate: {df_final['delayed'].mean()*100:.2f}%")

In [None]:
# Statistical summary of the prepared dataset; the trailing expression is
# the cell's output, so the describe() table is displayed below.
df_final.describe()

In [None]:
# Visualizations: 2x3 grid (target balance, distance, and delay rate per category)
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
(ax1, ax2, ax3), (ax4, ax5, ax6) = axes


def _plot_delay_rate(ax, column, title, xlabel, color, order=None):
    """Bar-plot the mean of `delayed` per category of `column` on `ax`.

    `order`, when given, fixes the left-to-right order of the bars.
    Returns the per-category delay rates (not reordered).
    """
    rates = df_final.groupby(column)['delayed'].mean()
    plotted = rates if order is None else rates.reindex(order)
    plotted.plot(kind='bar', ax=ax, color=color, rot=45)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Delay Rate')
    return rates


# 1. Delay distribution
# value_counts() orders by frequency, so pin the order to [0, 1]; otherwise the
# 'On Time'/'Delayed' labels and the green/red colours end up swapped whenever
# delayed deliveries are the majority class.
delay_counts = df_final['delayed'].value_counts().reindex([0, 1], fill_value=0)
delay_counts.plot(kind='bar', ax=ax1, color=['green', 'red'])
ax1.set_title('Delay Distribution')
ax1.set_xlabel('Delayed')
ax1.set_ylabel('Count')
ax1.set_xticklabels(['On Time', 'Delayed'], rotation=0)

# 2. Distance distribution
df_final['distance_km'].hist(bins=50, ax=ax2, color='steelblue', edgecolor='black')
ax2.set_title('Distance Distribution')
ax2.set_xlabel('Distance (km)')
ax2.set_ylabel('Frequency')

# 3. Vehicle type vs Delay
delay_by_vehicle = _plot_delay_rate(
    ax3, 'vehicle_type', 'Delay Rate by Vehicle Type', 'Vehicle Type', 'coral'
)

# 4. Traffic level vs Delay (bars ordered from lightest to heaviest traffic)
delay_by_traffic = _plot_delay_rate(
    ax4, 'traffic_level', 'Delay Rate by Traffic Level', 'Traffic Level', 'purple',
    order=['Low', 'Medium', 'High', 'Very High']
)

# 5. Weather vs Delay
delay_by_weather = _plot_delay_rate(
    ax5, 'weather_condition', 'Delay Rate by Weather', 'Weather Condition', 'teal'
)

# 6. Road type vs Delay
delay_by_road = _plot_delay_rate(
    ax6, 'road_type', 'Delay Rate by Road Type', 'Road Type', 'orange',
    order=['City', 'Highway', 'Rural']
)

plt.tight_layout()
plt.savefig('eda_visualizations.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Correlation heatmap for numerical features
# Pairwise correlations between the numeric columns and the target
numerical_cols = [
    'source_lat', 'source_lng', 'dest_lat', 'dest_lng',
    'distance_km', 'package_weight_kg', 'delayed',
]
correlation_matrix = df_final[numerical_cols].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True, fmt='.2f',
    cmap='coolwarm', center=0,
    square=True, linewidths=0.5,
).set_title('Correlation Heatmap')
plt.tight_layout()
plt.savefig('correlation_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

---
## Part 3: Model Training

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_auc_score, classification_report,
    roc_curve
)
import joblib
import os

print("Model training libraries imported!")

In [None]:
# Prepare features for modeling
# Model inputs: two numeric columns plus four categorical ones
categorical_cols = ['vehicle_type', 'traffic_level', 'weather_condition', 'road_type']
feature_cols = ['distance_km', 'package_weight_kg', 'vehicle_type',
                'traffic_level', 'weather_condition', 'road_type']

# Work on an independent copy so df_final keeps the original string labels
df_model = df_final.loc[:, feature_cols + ['delayed']].copy()

# Fit one LabelEncoder per categorical column (kept for inference),
# then replace each column by its integer codes.
label_encoders = {col: LabelEncoder().fit(df_model[col]) for col in categorical_cols}
for col, le in label_encoders.items():
    df_model[col] = le.transform(df_model[col])

print("Features encoded!")
df_model.head()

In [None]:
# Split data
# Separate target from features
y = df_model['delayed']
X = df_model.drop(columns=['delayed'])

# 80/20 split, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print("\nClass distribution in training set:")
print(y_train.value_counts(normalize=True))

In [None]:
# Define models
# Candidate classifiers, keyed by display name (insertion order = training order)
models = dict([
    ('Random Forest', RandomForestClassifier(
        n_estimators=100, max_depth=10, random_state=42, n_jobs=-1,
    )),
    ('XGBoost', XGBClassifier(
        n_estimators=100, max_depth=6, learning_rate=0.1,
        random_state=42, eval_metric='logloss',
    )),
    ('Gradient Boosting', GradientBoostingClassifier(
        n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42,
    )),
])

print("Models defined:", *(f"  - {name}" for name in models), sep="\n")

In [None]:
# Train and evaluate all models
# Fit every candidate on the training split and score it on the test split
results, trained_models = [], {}
metric_names = ('Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC-AUC')
rule = "=" * 70
banner = "=" * 20

print("Training models...\n")
print(rule)

for name, model in models.items():
    print(f"\n{banner} {name} {banner}")

    # Train
    model.fit(X_train, y_train)
    trained_models[name] = model

    # Hard labels for the threshold metrics, positive-class probability for ROC-AUC
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    accuracy, precision, recall, f1 = (
        scorer(y_test, y_pred)
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    )
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    scores = (accuracy, precision, recall, f1, roc_auc)

    # One row per model for the comparison table
    results.append({'Model': name, **dict(zip(metric_names, scores))})

    # Metrics, aligned in a fixed-width label column
    print()
    for label, score in zip(metric_names, scores):
        print((label + ':').ljust(11) + f"{score:.4f}")

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix:")
    print(cm)

print("\n" + rule)

In [None]:
# Results comparison
# Comparison table: one row per model, indexed by model name
results_df = pd.DataFrame(results).set_index('Model')

print("\nMODEL COMPARISON:")
print("=" * 70)
print(results_df.round(4))

In [None]:
# Visualize model comparison
# Left panel: grouped metric bars per model; right panel: ROC curves
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Bar chart of metrics
metric_cols = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC-AUC']
results_df[metric_cols].plot(kind='bar', ax=ax1, width=0.8)
ax1.set(title='Model Performance Comparison', ylabel='Score', ylim=(0, 1))
ax1.legend(loc='lower right')
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45)

# ROC Curves: one per trained model, scored on the test split
for name, model in trained_models.items():
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    ax2.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.3f})')

# Diagonal = chance level
ax2.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
ax2.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curves',
)
ax2.legend(loc='lower right')

plt.tight_layout()
plt.savefig('model_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Feature importance for best model
# The best model is the one with the highest test ROC-AUC
best_model_name = results_df['ROC-AUC'].idxmax()
best_model = trained_models[best_model_name]
best_auc = results_df.loc[best_model_name, 'ROC-AUC']

print(f"\nBest Model: {best_model_name}")
print(f"ROC-AUC: {best_auc:.4f}")

# Plot feature importances when the estimator exposes them
if hasattr(best_model, 'feature_importances_'):
    importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': best_model.feature_importances_,
    })
    # Ascending sort puts the most important feature at the top of the barh plot
    importance = importance.sort_values('Importance', ascending=True)

    ax = plt.figure(figsize=(10, 6)).gca()
    ax.barh(importance['Feature'], importance['Importance'], color='steelblue')
    ax.set_xlabel('Importance')
    ax.set_title(f'Feature Importance - {best_model_name}')
    plt.tight_layout()
    plt.savefig('feature_importance.png', dpi=150, bbox_inches='tight')
    plt.show()

---
## Part 4: Save Best Model

In [None]:
# Create models directory
# Output locations (all under ./models)
os.makedirs('models', exist_ok=True)
model_path = 'models/best_delay_prediction_model.pkl'
encoders_path = 'models/label_encoders.pkl'
results_path = 'models/model_comparison_results.csv'

# Persist the best model first, then the label encoders needed for inference
for artifact, path, label in (
    (best_model, model_path, f"Best model ({best_model_name})"),
    (label_encoders, encoders_path, "Label encoders"),
):
    joblib.dump(artifact, path)
    print(f"{label} saved to: {path}")

# Save model comparison results
results_df.to_csv(results_path)
print(f"Model comparison results saved to: {results_path}")

In [None]:
# Test loading and prediction
rule = "=" * 50
print("\n" + rule)
print("TESTING SAVED MODEL")
print(rule)

# Round-trip check: reload the artifacts written to disk.
# joblib.load unpickles the file, so only load artifacts you produced yourself.
loaded_model = joblib.load(model_path)
loaded_encoders = joblib.load(encoders_path)

# Score the first test row with the reloaded model
sample = X_test.head(1)
prediction = loaded_model.predict(sample)
probability = loaded_model.predict_proba(sample)[0]


def _outcome(flag):
    """Human-readable label for a 0/1 delay flag."""
    return 'Delayed' if flag == 1 else 'On Time'


print(f"\nSample features: {sample.values[0]}")
print(f"Prediction: {_outcome(prediction[0])}")
print(f"Probability: On Time={probability[0]:.2%}, Delayed={probability[1]:.2%}")
print(f"Actual: {_outcome(y_test.iloc[0])}")

---
## Summary

### Dataset
- **Source**: LaDe-D (Hugging Face) + Delivery_Logistics.csv
- **Records**: 5,000 deliveries from Shanghai
- **Features**: 16 columns including GPS, timestamps, vehicle, weather, traffic

### Models Trained
1. Random Forest Classifier
2. XGBoost Classifier
3. Gradient Boosting Classifier

### Outputs
- `prepared_logistics_dataset.csv` - Final dataset
- `models/best_delay_prediction_model.pkl` - Best trained model
- `models/label_encoders.pkl` - Encoders for inference
- `models/model_comparison_results.csv` - Performance metrics