In [1]:
# --- Cell 1: Setup and Requirements ---
!pip install pandas numpy scikit-learn xgboost requests joblib

import pandas as pd
import numpy as np
import requests
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score
import joblib
import time

# --- Configuration (UPDATE THIS!) ---
# You need a key from OpenWeatherMap or a similar service to fetch real-time data.
WEATHER_API_KEY = "5197fc88f5f846ee7566eb28d403c91f"
THRESHOLD_MIN = 10 # Delivery is "Late" if actual time > estimated time + THRESHOLD_MIN

def setup_environment():
    """Confirms environment setup."""
    print("Environment setup check completed. Required libraries are installed.")

# Run setup
setup_environment()

Environment setup check completed. Required libraries are installed.


In [2]:
# --- Cell 2: Helper Functions ---

def haversine(lat1, lon1, lat2, lon2):
    """Calculate the great circle distance in km between two points on the earth."""
    R = 6371  # Earth radius in kilometers
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

def fetch_realtime_weather(latitude, longitude, api_key):
    """Fetches real-time weather data for a given location using OpenWeatherMap API."""
    if api_key == "YOUR_OPENWEATHERMAP_API_KEY":
        print("Warning: API Key is a placeholder. Skipping real-time fetch.")
        return 25.0, 'Clear', 5.0 # Return fallback values for simulation

    try:
        url = f"http://api.openweathermap.org/data/2.5/weather?lat={latitude}&lon={longitude}&appid={api_key}&units=metric"
        response = requests.get(url, timeout=5)
        response.raise_for_status()
        data = response.json()

        temp = data['main']['temp']
        weather_main = data['weather'][0]['main']
        wind_speed = data['wind']['speed']

        print(f"Weather fetched successfully: {weather_main}, {temp}째C")
        return temp, weather_main, wind_speed
    except requests.exceptions.RequestException as e:
        print(f"Error fetching weather data: {e}")
        return np.nan, 'Unknown', np.nan

In [3]:
# --- Cell 3: Dataset Creation and Feature Engineering ---

def get_or_create_dataset():
    """Simulates loading and initial feature engineering on a historical dataset."""
    print("... Loading or creating simulated historical dataset ...")

    # 1. Simulate Historical Data
    np.random.seed(42) # For reproducibility
    data = {
        'Order_ID': range(500), # Increased sample size
        'Restaurant_lat': np.random.uniform(28.5, 28.7, 500),
        'Restaurant_lon': np.random.uniform(77.1, 77.3, 500),
        'Delivery_lat': np.random.uniform(28.5, 28.7, 500),
        'Delivery_lon': np.random.uniform(77.1, 77.3, 500),
        'Order_Placed_Time': pd.to_datetime(pd.date_range('2025-01-01', periods=500, freq='4H')),
        'Initial_Estimate_Min': np.random.randint(25, 45, 500),
        'Actual_Delivery_Time_Min': np.random.randint(20, 70, 500),
        'preparation_time_min': np.random.randint(10, 30, 500),
        'restaurant_rating': np.random.uniform(3.0, 5.0, 500).round(1),
        'delivery_person_rating': np.random.uniform(4.0, 5.0, 500).round(1),
        'Road_Traffic_Density': np.random.choice(['Low', 'Medium', 'High', 'Jam'], 500, p=[0.4, 0.3, 0.2, 0.1]),
        'Weather_Condition': np.random.choice(['Clear', 'Rainy', 'Foggy', 'Stormy'], 500, p=[0.7, 0.2, 0.05, 0.05])
    }
    df = pd.DataFrame(data)

    # 2. Feature Engineering
    df['delivery_distance_km'] = haversine(
        df['Restaurant_lat'], df['Restaurant_lon'],
        df['Delivery_lat'], df['Delivery_lon']
    )

    # Time-based features
    df['order_hour'] = df['Order_Placed_Time'].dt.hour
    df['day_of_week'] = df['Order_Placed_Time'].dt.day_name()

    # Cyclic features for hour
    df['sin_hour'] = np.sin(2 * np.pi * df['order_hour'] / 24)
    df['cos_hour'] = np.cos(2 * np.pi * df['order_hour'] / 24)

    # Target Variable
    df['is_late'] = np.where(
        df['Actual_Delivery_Time_Min'] > (df['Initial_Estimate_Min'] + THRESHOLD_MIN),
        1,
        0
    )

    # Add simulated weather for training data (as historical data won't use the live API)
    df['current_temp_c'] = np.random.uniform(15, 35, 500)

    final_features = [
        'delivery_distance_km', 'preparation_time_min', 'restaurant_rating',
        'delivery_person_rating', 'Road_Traffic_Density', 'Weather_Condition',
        'sin_hour', 'cos_hour'
    ]

    df_train = df[final_features + ['current_temp_c', 'is_late']].copy()

    print(f"Dataset created with {len(df_train)} samples. Target variable 'is_late' distribution:")
    print(df_train['is_late'].value_counts(normalize=True))
    return df_train, final_features

In [4]:
# --- Cell 4: Preprocessing Functions ---

def create_preprocessing_pipeline(feature_list):
    """Creates a scikit-learn ColumnTransformer for preprocessing."""

    numeric_features = [
        'delivery_distance_km', 'preparation_time_min', 'restaurant_rating',
        'delivery_person_rating', 'current_temp_c', 'sin_hour', 'cos_hour'
    ]
    categorical_features = [
        'Road_Traffic_Density', 'Weather_Condition'
    ]

    # Create preprocessing steps
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine transformers
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, [f for f in numeric_features if f in feature_list or f == 'current_temp_c']),
            ('cat', categorical_transformer, [f for f in categorical_features if f in feature_list])
        ],
        remainder='passthrough'
    )

    print("Preprocessing pipeline (Scaling + One-Hot Encoding) created.")
    return preprocessor

def perform_preprocessing(df_train, feature_list, preprocessor):
    """Splits data and prepares for the full pipeline."""

    # Include 'current_temp_c' in X for training as it's a numeric feature
    X = df_train[feature_list + ['current_temp_c']].copy()
    y = df_train['is_late']

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print(f"Data split: Train size: {len(X_train)}, Test size: {len(X_test)}")
    return X_train, X_test, y_train, y_test

In [5]:
# --- Cell 5: Model Training, Evaluation, and Saving ---

def train_and_evaluate_model(X_train, X_test, y_train, y_test, preprocessor):
    """Defines, trains, and evaluates the final ML pipeline."""

    # Model Choice: XGBoost Hyperparameters
    XGB_PARAMS = {
        'objective': 'binary:logistic',
        'n_estimators': 300,
        'learning_rate': 0.05,
        'max_depth': 5,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'random_state': 42,
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }

    model = XGBClassifier(**XGB_PARAMS)

    # Create the full ML pipeline
    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    print("... Training XGBoost model ...")
    start_time = time.time()
    full_pipeline.fit(X_train, y_train)
    end_time = time.time()
    print(f"Training completed in {end_time - start_time:.2f} seconds.")

    # Evaluation
    y_pred = full_pipeline.predict(X_test)
    y_proba = full_pipeline.predict_proba(X_test)[:, 1]

    print("\n--- Model Evaluation (Test Set) ---")
    print(classification_report(y_test, y_pred))
    print(f"ROC AUC Score: {roc_auc_score(y_test, y_proba):.4f}")

    # Save the model to Google Colab instance storage
    joblib.dump(full_pipeline, 'late_delivery_predictor_model.pkl')
    print("Model saved as 'late_delivery_predictor_model.pkl'.")

    return full_pipeline

In [6]:
# --- Cell 6: Real-Time Dashboard (Conceptual & Demonstration) ---

def create_real_time_dashboard(model_pipeline, features):
    """
    Provides the conceptual code for the Streamlit dashboard
    and demonstrates a real-time prediction using the saved model.
    """

    print("\n--- Conceptual Streamlit Dashboard Code ---")
    print("To run the full dashboard, save the following code as 'app.py' and execute 'streamlit run app.py' locally.")

    # (The actual Streamlit code structure is omitted here but provided in the original output documentation)
    # The output is focused on demonstrating the prediction function itself.

    # --- Real-Time Prediction Demonstration ---
    print("\n--- Real-Time Prediction Demonstration ---")

    # 1. Define a New Order (Input Data)
    NEW_ORDER_DATA = {
        'Restaurant_lat': 28.60,
        'Restaurant_lon': 77.15,
        'Delivery_lat': 28.70,
        'Delivery_lon': 77.25,
        'preparation_time_min': 25,
        'restaurant_rating': 4.2,
        'delivery_person_rating': 4.9,
    }

    # 2. Feature Engineering
    current_time = pd.Timestamp.now(tz='Asia/Kolkata')
    order_hour = current_time.hour

    delivery_distance_km = haversine(
        NEW_ORDER_DATA['Restaurant_lat'], NEW_ORDER_DATA['Restaurant_lon'],
        NEW_ORDER_DATA['Delivery_lat'], NEW_ORDER_DATA['Delivery_lon']
    )

    # Simulate Real-time Dynamic Features
    temp, weather_main, wind_speed = fetch_realtime_weather(
        NEW_ORDER_DATA['Delivery_lat'],
        NEW_ORDER_DATA['Delivery_lon'],
        WEATHER_API_KEY
    )

    # Traffic Density Simulation based on time (for demonstration)
    traffic = 'Jam' if 17 <= order_hour <= 21 else 'High' if 12 <= order_hour <= 14 else 'Medium'

    sin_hour = np.sin(2 * np.pi * order_hour / 24)
    cos_hour = np.cos(2 * np.pi * order_hour / 24)

    # 3. Create DataFrame (must match the features and order used in the pipeline)
    input_data = pd.DataFrame({
        'delivery_distance_km': [delivery_distance_km],
        'preparation_time_min': [NEW_ORDER_DATA['preparation_time_min']],
        'restaurant_rating': [NEW_ORDER_DATA['restaurant_rating']],
        'delivery_person_rating': [NEW_ORDER_DATA['delivery_person_rating']],
        'Road_Traffic_Density': [traffic],
        'Weather_Condition': [weather_main],
        'sin_hour': [sin_hour],
        'cos_hour': [cos_hour],
        'current_temp_c': [temp if not np.isnan(temp) else 25.0]
    })

    # 4. Predict
    prediction_proba = model_pipeline.predict_proba(input_data)[:, 1][0] * 100

    # 5. Output
    print(f"\n--- Prediction Result ---")
    print(f"Distance: {delivery_distance_km:.2f} km")
    print(f"Current Traffic: {traffic}")
    print(f"Weather: {weather_main}, {temp}째C")
    print(f"**Probability of Being Late:** {prediction_proba:.2f}%")
    if prediction_proba > 50:
        print("Conclusion: High risk of late delivery (Predicted Late).")
    else:
        print("Conclusion: Low risk of late delivery (Predicted On-Time).")

In [7]:
# --- Cell 7: Execute the Full Pipeline ---

# Step 1 & 2: Get Data and Features
df_train, feature_list = get_or_create_dataset()

print("\n" + "="*50 + "\n")

# Step 3: Preprocessing Setup
preprocessor = create_preprocessing_pipeline(feature_list)
X_train, X_test, y_train, y_test = perform_preprocessing(df_train, feature_list, preprocessor)

print("\n" + "="*50 + "\n")

# Step 4: Train, Evaluate, and Save Model
trained_pipeline = train_and_evaluate_model(X_train, X_test, y_train, y_test, preprocessor)

print("\n" + "="*50 + "\n")

# Step 5: Real-time Prediction Demonstration (using the trained model)
create_real_time_dashboard(trained_pipeline, feature_list)

... Loading or creating simulated historical dataset ...
Dataset created with 500 samples. Target variable 'is_late' distribution:
is_late
1    0.5
0    0.5
Name: proportion, dtype: float64


Preprocessing pipeline (Scaling + One-Hot Encoding) created.
Data split: Train size: 400, Test size: 100


... Training XGBoost model ...


  'Order_Placed_Time': pd.to_datetime(pd.date_range('2025-01-01', periods=500, freq='4H')),
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training completed in 0.65 seconds.

--- Model Evaluation (Test Set) ---
              precision    recall  f1-score   support

           0       0.57      0.58      0.57        50
           1       0.57      0.56      0.57        50

    accuracy                           0.57       100
   macro avg       0.57      0.57      0.57       100
weighted avg       0.57      0.57      0.57       100

ROC AUC Score: 0.5356
Model saved as 'late_delivery_predictor_model.pkl'.



--- Conceptual Streamlit Dashboard Code ---
To run the full dashboard, save the following code as 'app.py' and execute 'streamlit run app.py' locally.

--- Real-Time Prediction Demonstration ---
Weather fetched successfully: Clouds, 26.15째C

--- Prediction Result ---
Distance: 14.79 km
Current Traffic: Medium
Weather: Clouds, 26.15째C
**Probability of Being Late:** 81.42%
Conclusion: High risk of late delivery (Predicted Late).


In [8]:
from google.colab import files

# This command initiates the download of the specified file to your local computer.
files.download('late_delivery_predictor_model.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>