In [3]:
!pip install xgboost

Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/29/22/e3ff2dfafe862a91733dfa0aecdb4794aa1d9a18e09a14e118bde0cbc2db/xgboost-3.0.2-py3-none-win_amd64.whl.metadata
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/150.0 MB 6.5 MB/s eta 0:00:23
   ---------------------------------------- 0.8/150.0 MB 9.8 MB/s eta 0:00:16
   ---------------------------------------- 1.3/150.0 MB 10.4 MB/s eta 0:00:15
   ---------------------------------------- 1.8/150.0 MB 10.7 MB/s eta 0:00:14
    --------------------------------------- 2.2/150.0 MB 10.0 MB/s eta 0:00:15
    --------------------------------------- 2.7/150.0 MB 10.0 MB/s eta 0:00:15
    --------------------------------------- 3.1/150.0 MB 10.0 MB/s eta 0:00:15
    ---------------------

In [6]:
!pip install lightgbm

Collecting lightgbm
  Obtaining dependency information for lightgbm from https://files.pythonhosted.org/packages/5e/23/f8b28ca248bb629b9e08f877dd2965d1994e1674a03d67cd10c5246da248/lightgbm-4.6.0-py3-none-win_amd64.whl.metadata
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------------ --------------------------- 0.4/1.5 MB 9.2 MB/s eta 0:00:01
   ------------------------ --------------- 0.9/1.5 MB 9.4 MB/s eta 0:00:01
   -------------------------------------- - 1.4/1.5 MB 9.9 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 8.4 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [41]:
# Travel Time Prediction Model Training for Udupi Delivery Routes
# This notebook trains an ML model to predict travel times for delivery optimization

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import json
import requests
import datetime
from typing import List, Dict, Any, Optional, Tuple
import warnings
warnings.filterwarnings('ignore')

In [42]:
# ML libraries
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb

In [43]:
# Set up plotting
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [44]:
print("All libraries imported successfully!")
print(f"XGBoost version: {xgb.__version__}")

All libraries imported successfully!
XGBoost version: 3.0.2


In [47]:
# =============================================================================
# SECTION 1: CONFIGURATION AND SETUP
# =============================================================================

# Configuration
CONFIG = {
    'GOOGLE_API_KEY':'AIzaSyCAWRHOBP5MGK1kXDc3vEGPJi-SC1zNkuc',  # Add your Google Maps API key here
    'WEATHER_API_KEY': '1128702f089368cf9384837ca336f183', # Add your OpenWeatherMap API key here
    'USE_SIMULATED_DATA': False,  # Set to False when you have real API keys
    'TRAINING_SAMPLES_PER_ROUTE': 50,
    'RANDOM_SEED': 42
}



In [48]:
# Set random seeds for reproducibility
np.random.seed(CONFIG['RANDOM_SEED'])

print("Configuration loaded:")
for key, value in CONFIG.items():
    if 'API_KEY' in key:
        print(f"   {key}: {'Set' if value else 'Not Set'}")
    else:
        print(f"   {key}: {value}")



Configuration loaded:
   GOOGLE_API_KEY: Set
   WEATHER_API_KEY: Set
   USE_SIMULATED_DATA: False
   TRAINING_SAMPLES_PER_ROUTE: 50
   RANDOM_SEED: 42


In [49]:
# =============================================================================
# SECTION 2: DATA COLLECTION FUNCTIONS
# =============================================================================

def get_google_maps_travel_time(origin_lat: float, origin_lng: float,
                               dest_lat: float, dest_lng: float,
                               departure_time: datetime.datetime,
                               api_key: Optional[str] = None) -> Optional[float]:
    """Get travel time from Google Maps Distance Matrix API."""
    
    if not api_key or CONFIG['USE_SIMULATED_DATA']:
        # Simulate realistic travel times for Udupi
        distance_deg = np.sqrt((origin_lat - dest_lat)**2 + (origin_lng - dest_lng)**2)
        distance_km = distance_deg * 111  # Rough conversion to km
        
        # Base travel time (assuming 25 km/h average speed in city)
        base_time = (distance_km / 25) * 60  # minutes
        
        # Add time-based factors
        hour = departure_time.hour
        day_of_week = departure_time.weekday()
        
        # Rush hour multiplier
        if (7 <= hour <= 9) or (17 <= hour <= 19):
            time_multiplier = 1.5
        elif (22 <= hour <= 6):  # Night time
            time_multiplier = 0.8
        else:
            time_multiplier = 1.0
        
        # Weekend factor
        if day_of_week >= 5:  # Weekend
            time_multiplier *= 0.9
        
        # Weather factor (random)
        weather_factor = np.random.uniform(0.9, 1.3)
        
        # Calculate final time with some randomness
        travel_time = base_time * time_multiplier * weather_factor
        travel_time += np.random.normal(0, travel_time * 0.1)  # 10% noise
        
        return max(1.0, travel_time)
    
    try:
        url = "https://maps.googleapis.com/maps/api/distancematrix/json"
        params = {
            'origins': f"{origin_lat},{origin_lng}",
            'destinations': f"{dest_lat},{dest_lng}",
            'departure_time': int(departure_time.timestamp()),
            'traffic_model': 'best_guess',
            'key': api_key
        }
        
        response = requests.get(url, params=params)
        data = response.json()
        
        if data['status'] == 'OK' and data['rows'][0]['elements'][0]['status'] == 'OK':
            duration_in_traffic = data['rows'][0]['elements'][0].get('duration_in_traffic', 
                                data['rows'][0]['elements'][0]['duration'])
            return duration_in_traffic['value'] / 60  # Convert to minutes
            
    except Exception as e:
        print(f"Google Maps API error: {e}")
    
    return None



In [50]:
def get_weather_data(lat: float, lng: float, timestamp: datetime.datetime,
                    api_key: Optional[str] = None) -> Dict[str, Any]:
    """Get weather data for the given location and time."""
    
    if not api_key or CONFIG['USE_SIMULATED_DATA']:
        # Generate realistic simulated weather for Udupi
        # Udupi climate: tropical, warm, humid
        
        month = timestamp.month
        hour = timestamp.hour
        
        # Temperature varies by month and time
        if month in [12, 1, 2]:  # Winter
            base_temp = np.random.normal(24, 3)
        elif month in [3, 4, 5]:  # Summer
            base_temp = np.random.normal(30, 4)
        elif month in [6, 7, 8, 9]:  # Monsoon
            base_temp = np.random.normal(26, 2)
        else:  # Post-monsoon
            base_temp = np.random.normal(28, 3)
        
        # Daily temperature variation
        if 6 <= hour <= 8:  # Morning
            temp_adjustment = -2
        elif 12 <= hour <= 15:  # Afternoon
            temp_adjustment = 3
        elif 18 <= hour <= 20:  # Evening
            temp_adjustment = 0
        else:  # Night
            temp_adjustment = -1
        
        temperature = base_temp + temp_adjustment
        
        # Humidity (high in coastal areas)
        humidity = np.random.randint(65, 95)
        
        # Weather conditions
        if month in [6, 7, 8, 9]:  # Monsoon season
            weather_condition = np.random.choice(['rain', 'clouds', 'clear'], p=[0.4, 0.4, 0.2])
        else:
            weather_condition = np.random.choice(['clear', 'clouds', 'rain'], p=[0.6, 0.3, 0.1])
        
        return {
            'temperature': round(temperature, 1),
            'humidity': humidity,
            'weather_condition': weather_condition,
            'wind_speed': np.random.exponential(3),
            'visibility': np.random.normal(8, 2) if weather_condition == 'rain' else np.random.normal(12, 2)
        }
    
    try:
        url = f"http://api.openweathermap.org/data/2.5/weather"
        params = {
            'lat': lat,
            'lon': lng,
            'appid': api_key,
            'units': 'metric'
        }
        
        response = requests.get(url, params=params)
        data = response.json()
        
        if response.status_code == 200:
            return {
                'temperature': data['main']['temp'],
                'humidity': data['main']['humidity'],
                'weather_condition': data['weather'][0]['main'].lower(),
                'wind_speed': data['wind']['speed'],
                'visibility': data.get('visibility', 10000) / 1000
            }
    
    except Exception as e:
        print(f" Weather API error: {e}")
    
    # Fallback
    return {
        'temperature': 25,
        'humidity': 70,
        'weather_condition': 'clear',
        'wind_speed': 3,
        'visibility': 10
    }



In [51]:
def create_features(origin_lat: float, origin_lng: float,
                   dest_lat: float, dest_lng: float,
                   timestamp: datetime.datetime,
                   weather_data: Dict[str, Any]) -> Dict[str, Any]:
    """Create feature vector from raw data."""
    
    # Distance features
    distance_km = np.sqrt((origin_lat - dest_lat)**2 + (origin_lng - dest_lng)**2) * 111
    
    # Time features
    hour = timestamp.hour
    day_of_week = timestamp.weekday()
    month = timestamp.month
    is_weekend = 1 if day_of_week >= 5 else 0
    is_rush_hour = 1 if (7 <= hour <= 9) or (17 <= hour <= 19) else 0
    is_night = 1 if (22 <= hour <= 6) else 0
    
    # Location features
    center_lat = (origin_lat + dest_lat) / 2
    center_lng = (origin_lng + dest_lng) / 2
    
    # Direction features
    lat_diff = dest_lat - origin_lat
    lng_diff = dest_lng - origin_lng
    
    return {
        'origin_lat': origin_lat,
        'origin_lng': origin_lng,
        'dest_lat': dest_lat,
        'dest_lng': dest_lng,
        'distance_km': distance_km,
        'center_lat': center_lat,
        'center_lng': center_lng,
        'lat_diff': lat_diff,
        'lng_diff': lng_diff,
        'hour': hour,
        'day_of_week': day_of_week,
        'month': month,
        'is_weekend': is_weekend,
        'is_rush_hour': is_rush_hour,
        'is_night': is_night,
        'temperature': weather_data['temperature'],
        'humidity': weather_data['humidity'],
        'weather_condition': weather_data['weather_condition'],
        'wind_speed': weather_data['wind_speed'],
        'visibility': weather_data['visibility']
    }



In [52]:
# =============================================================================
# SECTION 3: DEFINE TRAINING ROUTES
# =============================================================================

# Define sample delivery routes in Udupi
# You should replace these with your actual delivery routes
UDUPI_ROUTES = [
    # Format: (origin_lat, origin_lng, dest_lat, dest_lng)
    
    # Main Udupi area routes
    (13.3409, 74.7421, 13.3500, 74.7500),  # Central to North
    (13.3409, 74.7421, 13.3300, 74.7400),  # Central to South
    (13.3409, 74.7421, 13.3450, 74.7350),  # Central to East
    (13.3409, 74.7421, 13.3350, 74.7480),  # Central to West
    
    # Cross-town routes
    (13.3500, 74.7500, 13.3300, 74.7400),  # North to South
    (13.3450, 74.7350, 13.3350, 74.7480),  # East to West
    (13.3600, 74.7550, 13.3250, 74.7350),  # Northeast to Southwest
    (13.3250, 74.7450, 13.3550, 74.7380),  # Northwest to Southeast
    
    # Longer routes (to suburbs/nearby areas)
    (13.3409, 74.7421, 13.3700, 74.7600),  # To Manipal area
    (13.3409, 74.7421, 13.3100, 74.7300),  # To Malpe area
    (13.3409, 74.7421, 13.3200, 74.7600),  # To Karkala road
    (13.3409, 74.7421, 13.3600, 74.7200),  # To Kundapur road
    
    # Market and commercial area routes
    (13.3380, 74.7430, 13.3420, 74.7400),  # Market area
    (13.3400, 74.7450, 13.3440, 74.7420),  # Commercial district
    (13.3360, 74.7410, 13.3480, 74.7440),  # Shopping areas
    
    # Residential area routes
    (13.3320, 74.7380, 13.3520, 74.7520),  # Residential zones
    (13.3280, 74.7360, 13.3480, 74.7480),  # Housing societies
    (13.3540, 74.7540, 13.3340, 74.7340),  # Apartment complexes
]

print(f" Defined {len(UDUPI_ROUTES)} training routes in Udupi")



 Defined 18 training routes in Udupi


In [53]:
# =============================================================================
# SECTION 4: DATA COLLECTION
# =============================================================================

def collect_training_data(routes: List[Tuple[float, float, float, float]], 
                         samples_per_route: int = 50) -> pd.DataFrame:
    """Collect training data for all routes."""
    
    print(f"Starting data collection for {len(routes)} routes...")
    print(f"Collecting {samples_per_route} samples per route = {len(routes) * samples_per_route} total samples")
    
    training_data = []
    
    # Progress tracking
    total_routes = len(routes)
    
    for route_idx, (origin_lat, origin_lng, dest_lat, dest_lng) in enumerate(routes):
        print(f"   Route {route_idx + 1}/{total_routes}: ({origin_lat:.4f}, {origin_lng:.4f}) → ({dest_lat:.4f}, {dest_lng:.4f})")
        
        # Generate samples for this route
        base_date = datetime.datetime.now() - datetime.timedelta(days=60)
        
        for sample_idx in range(samples_per_route):
            # Sample random time within last 60 days
            sample_time = base_date + datetime.timedelta(
                days=np.random.randint(0, 60),
                hours=np.random.randint(6, 22),  # Business hours
                minutes=np.random.randint(0, 60)
            )
            
            # Get travel time
            travel_time = get_google_maps_travel_time(
                origin_lat, origin_lng, dest_lat, dest_lng, 
                sample_time, CONFIG['GOOGLE_API_KEY']
            )
            
            if travel_time is None:
                continue
            
            # Get weather data
            weather_data = get_weather_data(
                origin_lat, origin_lng, sample_time, CONFIG['WEATHER_API_KEY']
            )
            
            # Create features
            features = create_features(
                origin_lat, origin_lng, dest_lat, dest_lng, sample_time, weather_data
            )
            features['travel_time_minutes'] = travel_time
            features['route_id'] = route_idx
            
            training_data.append(features)
        
        # Show progress every 5 routes
        if (route_idx + 1) % 5 == 0:
            print(f"    Completed {route_idx + 1} routes ({len(training_data)} samples collected)")
    
    df = pd.DataFrame(training_data)
    print(f"Data collection completed! Collected {len(df)} samples")
    
    return df

# Collect the training data
print("=" * 60)
print("STARTING DATA COLLECTION")
print("=" * 60)

training_df = collect_training_data(UDUPI_ROUTES, CONFIG['TRAINING_SAMPLES_PER_ROUTE'])

# Save raw data
training_df.to_csv('udupi_travel_time_data.csv', index=False)
print(f"Training data saved to 'udupi_travel_time_data.csv'")



STARTING DATA COLLECTION
Starting data collection for 18 routes...
Collecting 50 samples per route = 900 total samples
   Route 1/18: (13.3409, 74.7421) → (13.3500, 74.7500)
   Route 2/18: (13.3409, 74.7421) → (13.3300, 74.7400)
   Route 3/18: (13.3409, 74.7421) → (13.3450, 74.7350)
   Route 4/18: (13.3409, 74.7421) → (13.3350, 74.7480)
   Route 5/18: (13.3500, 74.7500) → (13.3300, 74.7400)
    Completed 5 routes (0 samples collected)
   Route 6/18: (13.3450, 74.7350) → (13.3350, 74.7480)
   Route 7/18: (13.3600, 74.7550) → (13.3250, 74.7350)
   Route 8/18: (13.3250, 74.7450) → (13.3550, 74.7380)
   Route 9/18: (13.3409, 74.7421) → (13.3700, 74.7600)
   Route 10/18: (13.3409, 74.7421) → (13.3100, 74.7300)
    Completed 10 routes (0 samples collected)
   Route 11/18: (13.3409, 74.7421) → (13.3200, 74.7600)
   Route 12/18: (13.3409, 74.7421) → (13.3600, 74.7200)
   Route 13/18: (13.3380, 74.7430) → (13.3420, 74.7400)
   Route 14/18: (13.3400, 74.7450) → (13.3440, 74.7420)
   Route 15/18:

In [56]:
training_df.head()

In [55]:
# =============================================================================
# SECTION 5: EXPLORATORY DATA ANALYSIS
# =============================================================================

print("\n" + "=" * 60)
print("EXPLORATORY DATA ANALYSIS")
print("=" * 60)

# Basic statistics
print("Dataset Overview:")
print(f"   • Total samples: {len(training_df)}")
print(f"   • Features: {len(training_df.columns) - 1}")
print(f"   • Date range: {training_df.shape[0]} samples")
print(f"   • Memory usage: {training_df.memory_usage(deep=True).sum() / 1024 / 1024:.1f} MB")

print("\n Travel Time Statistics:")
print(training_df['travel_time_minutes'].describe())

# Create visualizations
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle('Udupi Delivery Route Travel Time Analysis', fontsize=16)

# 1. Travel time distribution
axes[0, 0].hist(training_df['travel_time_minutes'], bins=50, alpha=0.7, color='skyblue')
axes[0, 0].axvline(training_df['travel_time_minutes'].mean(), color='red', linestyle='--', 
                   label=f'Mean: {training_df["travel_time_minutes"].mean():.1f} min')
axes[0, 0].set_xlabel('Travel Time (minutes)')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title('Travel Time Distribution')
axes[0, 0].legend()

# 2. Travel time by hour
hourly_avg = training_df.groupby('hour')['travel_time_minutes'].mean()
axes[0, 1].plot(hourly_avg.index, hourly_avg.values, marker='o', linewidth=2)
axes[0, 1].set_xlabel('Hour of Day')
axes[0, 1].set_ylabel('Average Travel Time (min)')
axes[0, 1].set_title('Travel Time by Hour')
axes[0, 1].grid(True, alpha=0.3)

# 3. Travel time by day of week
dow_avg = training_df.groupby('day_of_week')['travel_time_minutes'].mean()
dow_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
axes[0, 2].bar(range(7), dow_avg.values, color='lightcoral')
axes[0, 2].set_xlabel('Day of Week')
axes[0, 2].set_ylabel('Average Travel Time (min)')
axes[0, 2].set_title('Travel Time by Day of Week')
axes[0, 2].set_xticks(range(7))
axes[0, 2].set_xticklabels(dow_labels)

# 4. Distance vs travel time
axes[1, 0].scatter(training_df['distance_km'], training_df['travel_time_minutes'], alpha=0.5)
axes[1, 0].set_xlabel('Distance (km)')
axes[1, 0].set_ylabel('Travel Time (minutes)')
axes[1, 0].set_title('Distance vs Travel Time')

# 5. Weather impact
weather_avg = training_df.groupby('weather_condition')['travel_time_minutes'].mean()
axes[1, 1].bar(weather_avg.index, weather_avg.values, color='lightgreen')
axes[1, 1].set_xlabel('Weather Condition')
axes[1, 1].set_ylabel('Average Travel Time (min)')
axes[1, 1].set_title('Travel Time by Weather')
axes[1, 1].tick_params(axis='x', rotation=45)

# 6. Temperature vs travel time
axes[1, 2].scatter(training_df['temperature'], training_df['travel_time_minutes'], alpha=0.5, color='orange')
axes[1, 2].set_xlabel('Temperature (°C)')
axes[1, 2].set_ylabel('Travel Time (minutes)')
axes[1, 2].set_title('Temperature vs Travel Time')

plt.tight_layout()
plt.show()

# Correlation analysis
print("\n🔗 Feature Correlations with Travel Time:")
correlations = training_df.corr()['travel_time_minutes'].sort_values(ascending=False)
for feature, corr in correlations.items():
    if feature != 'travel_time_minutes':
        print(f"   {feature:20s}: {corr:6.3f}")




EXPLORATORY DATA ANALYSIS
Dataset Overview:
   • Total samples: 0
   • Features: -1
   • Date range: 0 samples
   • Memory usage: 0.0 MB

 Travel Time Statistics:


KeyError: 'travel_time_minutes'

In [None]:
# =============================================================================
# SECTION 6: DATA PREPROCESSING
# =============================================================================

print("\n" + "=" * 60)
print("DATA PREPROCESSING")
print("=" * 60)

def preprocess_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, StandardScaler, Dict]:
    """Preprocess the training data."""
    
    # Separate features and target
    X = df.drop(['travel_time_minutes', 'route_id'], axis=1)
    y = df['travel_time_minutes']
    
    print(f" Original features: {X.shape[1]}")
    
    # Handle categorical variables
    label_encoders = {}
    categorical_columns = ['weather_condition']
    
    for col in categorical_columns:
        if col in X.columns:
            le = LabelEncoder()
            X[col] = le.fit_transform(X[col].astype(str))
            label_encoders[col] = le
            print(f"    Encoded {col}: {len(le.classes_)} categories")
    
    # Scale numerical features
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(
        scaler.fit_transform(X), 
        columns=X.columns, 
        index=X.index
    )
    
    print(f"  Scaled {X.shape[1]} numerical features")
    print(f"Target variable: {y.name} (shape: {y.shape})")
    
    return X_scaled, y, scaler, label_encoders

# Preprocess the data
X_processed, y_processed, feature_scaler, encoders = preprocess_data(training_df)

print(f"\n Processed Dataset Shape: {X_processed.shape}")
print(f" Feature Names: {list(X_processed.columns)}")



In [None]:
# =============================================================================
# SECTION 7: MODEL TRAINING AND COMPARISON
# =============================================================================

print("\n" + "=" * 60)
print("MODEL TRAINING AND COMPARISON")
print("=" * 60)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y_processed, test_size=0.2, random_state=CONFIG['RANDOM_SEED']
)

print(f" Training set: {X_train.shape}")
print(f" Test set: {X_test.shape}")

# Define models to compare
models = {
    'XGBoost': xgb.XGBRegressor(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        random_state=CONFIG['RANDOM_SEED'],
        n_jobs=-1
    ),
    'Random Forest': RandomForestRegressor(
        n_estimators=100,
        max_depth=8,
        random_state=CONFIG['RANDOM_SEED'],
        n_jobs=-1
    ),
    'LightGBM': lgb.LGBMRegressor(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        random_state=CONFIG['RANDOM_SEED'],
        n_jobs=-1,
        verbose=-1
    )
}



In [None]:
# Train and evaluate models
results = {}

for name, model in models.items():
    print(f"\n Training {name}...")
    
    # Train model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    # Calculate metrics
    train_mae = mean_absolute_error(y_train, y_pred_train)
    test_mae = mean_absolute_error(y_test, y_pred_test)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    test_r2 = r2_score(y_test, y_pred_test)
    
    # Cross-validation
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, 
                               scoring='neg_mean_absolute_error', n_jobs=-1)
    cv_mae = -cv_scores.mean()
    
    results[name] = {
        'model': model,
        'train_mae': train_mae,
        'test_mae': test_mae,
        'train_rmse': train_rmse,
        'test_rmse': test_rmse,
        'test_r2': test_r2,
        'cv_mae': cv_mae,
        'predictions': y_pred_test
    }
    
    print(f"    {name} Results:")
    print(f"      • Train MAE: {train_mae:.2f} min")
    print(f"      • Test MAE:  {test_mae:.2f} min")
    print(f"      • Test RMSE: {test_rmse:.2f} min")
    print(f"      • Test R²:   {test_r2:.3f}")
    print(f"      • CV MAE:    {cv_mae:.2f} min")



In [None]:
# =============================================================================
# SECTION 8: MODEL COMPARISON AND SELECTION
# =============================================================================

print("\n" + "=" * 60)
print("MODEL COMPARISON AND SELECTION")
print("=" * 60)

# Create comparison table
comparison_df = pd.DataFrame({
    'Model': list(results.keys()),
    'Test MAE': [results[model]['test_mae'] for model in results.keys()],
    'Test RMSE': [results[model]['test_rmse'] for model in results.keys()],
    'Test R²': [results[model]['test_r2'] for model in results.keys()],
    'CV MAE': [results[model]['cv_mae'] for model in results.keys()]
})

print("Model Comparison:")
print(comparison_df.round(3))

# Select best model based on test MAE
best_model_name = comparison_df.loc[comparison_df['Test MAE'].idxmin(), 'Model']
best_model = results[best_model_name]['model']

print(f"\n Best Model: {best_model_name}")
print(f"   • Test MAE: {results[best_model_name]['test_mae']:.2f} minutes")
print(f"   • Test R²: {results[best_model_name]['test_r2']:.3f}")



In [None]:
# =============================================================================
# SECTION 9: MODEL ANALYSIS AND VISUALIZATION
# =============================================================================

print("\n" + "=" * 60)
print("MODEL ANALYSIS AND VISUALIZATION")
print("=" * 60)

# Feature importance (for tree-based models)
if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': X_processed.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print(" Top 10 Most Important Features:")
    for idx, row in feature_importance.head(10).iterrows():
        print(f"   {row['feature']:20s}: {row['importance']:.4f}")
    
    # Plot feature importance
    plt.figure(figsize=(12, 8))
    plt.subplot(2, 2, 1)
    top_features = feature_importance.head(10)
    plt.barh(range(len(top_features)), top_features['importance'])
    plt.yticks(range(len(top_features)), top_features['feature'])
    plt.xlabel('Feature Importance')
    plt.title('Top 10 Feature Importances')
    plt.gca().invert_yaxis()




In [None]:
# Prediction vs Actual plot
plt.subplot(2, 2, 2)
y_pred_best = results[best_model_name]['predictions']
plt.scatter(y_test, y_pred_best, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Travel Time (min)')
plt.ylabel('Predicted Travel Time (min)')
plt.title(f'{best_model_name}: Predictions vs Actual')

# Residuals plot
plt.subplot(2, 2, 3)
residuals = y_test - y_pred_best
plt.scatter(y_pred_best, residuals, alpha=0.5)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Travel Time (min)')
plt.ylabel('Residuals (min)')
plt.title('Residuals Plot')

# Model comparison
plt.subplot(2, 2, 4)
model_names = list(results.keys())
test_maes = [results[model]['test_mae'] for model in model_names]
plt.bar(model_names, test_maes, color=['gold' if model == best_model_name else 'lightblue' for model in model_names])
plt.ylabel('Test MAE (minutes)')
plt.title('Model Comparison (Test MAE)')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()


In [31]:
# =============================================================================
# SECTION 10: SAVE THE BEST MODEL
# =============================================================================

print("\n" + "=" * 60)
print("SAVING THE BEST MODEL")
print("=" * 60)

# Create model package
model_package = {
    'model': best_model,
    'scaler': feature_scaler,
    'label_encoders': encoders,
    'feature_columns': list(X_processed.columns),
    'model_name': best_model_name,
    'training_metrics': {
        'test_mae': results[best_model_name]['test_mae'],
        'test_rmse': results[best_model_name]['test_rmse'],
        'test_r2': results[best_model_name]['test_r2'],
        'cv_mae': results[best_model_name]['cv_mae']
    },
    'training_date': datetime.datetime.now().isoformat(),
    'training_samples': len(training_df),
    'feature_importance': feature_importance.to_dict('records') if hasattr(best_model, 'feature_importances_') else None
}

# Save model
model_filename = f'udupi_travel_time_'


SAVING THE BEST MODEL
