In [1]:
import pandas as pd
import numpy as np
import json
import joblib
from datetime import datetime
from sklearn.model_selection import train_test_split, cross_val_score, ParameterSampler
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, precision_recall_fscore_support, 
                             confusion_matrix, classification_report)
from lightgbm import LGBMClassifier
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the weather/suspension records; the 'date' column is parsed to datetimes.
print("📊 Loading dataset...")
df = pd.read_csv('metro_manila_weather_sus_data.csv', parse_dates=['date'])
first_ts, last_ts = df['date'].min(), df['date'].max()
print(f"✅ Loaded {len(df):,} records from {first_ts} to {last_ts}")

# Impute gaps: a missing precipitation reading is filled with 0, every other
# weather column with its own median.
print("🔧 Handling missing values...")
median_filled_cols = [
    'relativehumidity_2m', 'temperature_2m',
    'apparent_temperature', 'windspeed_10m',
]
fill_values = {col: df[col].median() for col in median_filled_cols}
fill_values['precipitation'] = 0
df = df.fillna(fill_values)

print(f"✅ Dataset shape: {df.shape}")
suspension_counts = df['suspension'].value_counts().sort_index()
print(f"📈 Suspension distribution:\n{suspension_counts}")

📊 Loading dataset...
✅ Loaded 756,840 records from 2020-09-20 00:00:00 to 2025-10-18 23:00:00
🔧 Handling missing values...
✅ Dataset shape: (756840, 8)
📈 Suspension distribution:
suspension
0    740730
1       168
2       216
3      1722
4     11964
5      2040
Name: count, dtype: int64


In [3]:
print("\n🛠️ Engineering features...")

# Calendar / time-of-day features derived from the timestamp.
df['hour'] = df['date'].dt.hour
df['day_of_week'] = df['date'].dt.dayofweek
df['month'] = df['date'].dt.month
df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)  # dayofweek: Monday=0 ... Sunday=6
df['is_rush_hour'] = df['hour'].isin([7, 8, 9, 17, 18, 19]).astype(int)

# Rolling and lag features look at preceding rows, so rows must be in
# chronological order within each city before they are computed.
df = df.sort_values(['city', 'date'])

# Trailing means over the last 3/6/12 rows of each city. The "h" in the column
# names assumes one row per hour with no gaps — TODO confirm against the data.
# min_periods=1 keeps the first rows of each city from becoming NaN.
print("  📊 Computing rolling averages...")
for window in [3, 6, 12]:
    df[f'precip_roll_{window}h'] = df.groupby('city')['precipitation'].transform(
        lambda x: x.rolling(window, min_periods=1).mean()
    )
    df[f'wind_roll_{window}h'] = df.groupby('city')['windspeed_10m'].transform(
        lambda x: x.rolling(window, min_periods=1).mean()
    )

# Values from 1-3 rows earlier within the same city. The first rows of each city
# have no predecessor: precipitation falls back to 0, temperature to the
# overall median (computed once, it does not change between iterations).
print("  🔄 Creating lag features...")
temp_median = df['temperature_2m'].median()
for lag in [1, 2, 3]:
    df[f'precip_lag_{lag}h'] = df.groupby('city')['precipitation'].shift(lag).fillna(0)
    df[f'temp_lag_{lag}h'] = df.groupby('city')['temperature_2m'].shift(lag).fillna(temp_median)

# Peak flags: the current reading is well above its own 6-row trailing mean.
df['is_precip_peak'] = (df['precipitation'] > df['precip_roll_6h'] * 1.5).astype(int)
df['is_wind_peak'] = (df['windspeed_10m'] > df['wind_roll_6h'] * 1.3).astype(int)

# Difference between the apparent and the measured temperature.
df['apparent_temp_delta'] = df['apparent_temperature'] - df['temperature_2m']

# Ordinal intensity buckets. The top bucket is open-ended (np.inf): with a
# finite upper edge, a reading above it falls outside every bin, becomes NaN
# and makes .astype(int) raise. Readings inside the previous edges map to
# exactly the same codes as before.
df['precip_intensity'] = pd.cut(df['precipitation'],
                                bins=[-1, 0, 5, 15, 50, np.inf],
                                labels=[0, 1, 2, 3, 4]).astype(int)

df['wind_category'] = pd.cut(df['windspeed_10m'],
                             bins=[-1, 20, 40, 60, np.inf],
                             labels=[0, 1, 2, 3]).astype(int)

# Integer-encode the city name; the fitted encoder is kept for later reuse.
print("  🏙️ Encoding cities...")
city_encoder = LabelEncoder()
df['city_encoded'] = city_encoder.fit_transform(df['city'])

# Note: this counts every column of df (including date, city and the target),
# not only the model inputs.
print(f"✅ Feature engineering complete. Total features: {len(df.columns)}")


🛠️ Engineering features...
  📊 Computing rolling averages...
  🔄 Creating lag features...
  🏙️ Encoding cities...
✅ Feature engineering complete. Total features: 31


In [4]:
print("\n✂️ Splitting data (80-20 time-based)...")

# Order rows chronologically so the first 80% of rows is the "past" (train)
# and the last 20% the "future" (test).
df = df.sort_values('date')

# Columns fed to the model
feature_cols = [
    'relativehumidity_2m', 'temperature_2m', 'precipitation', 
    'apparent_temperature', 'windspeed_10m',
    'hour', 'day_of_week', 'month', 'is_weekend', 'is_rush_hour',
    'precip_roll_3h', 'precip_roll_6h', 'precip_roll_12h',
    'wind_roll_3h', 'wind_roll_6h', 'wind_roll_12h',
    'precip_lag_1h', 'precip_lag_2h', 'precip_lag_3h',
    'temp_lag_1h', 'temp_lag_2h', 'temp_lag_3h',
    'is_precip_peak', 'is_wind_peak', 'apparent_temp_delta',
    'precip_intensity', 'wind_category', 'city_encoded'
]

X = df[feature_cols].copy()
y = df['suspension'].copy()

# Ensure all columns are numeric (guards against a categorical column slipping through)
for col in X.columns:
    if X[col].dtype == 'category':
        X[col] = X[col].astype(int)

# Encode the target on the FULL label set so train and test share one mapping
print("  🎯 Encoding target variable...")
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(f"  Original classes: {sorted(y.unique())}")
print(f"  Encoded classes: {sorted(np.unique(y_encoded))}")

# Time-based split: positional cut at 80% of the chronologically sorted rows
split_idx = int(len(X) * 0.8)
X_train = X.iloc[:split_idx].copy()
X_test = X.iloc[split_idx:].copy()
y_train = y_encoded[:split_idx]
y_test = y_encoded[split_idx:]

# Standardise the continuous weather readings. The scaler is fitted on the
# training rows only and then applied to the test rows.
print("  📏 Scaling features...")
preprocessor = StandardScaler()
numerical_cols = ['relativehumidity_2m', 'temperature_2m', 'precipitation', 
                 'apparent_temperature', 'windspeed_10m', 'apparent_temp_delta']

X_train[numerical_cols] = preprocessor.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = preprocessor.transform(X_test[numerical_cols])

print(f"✅ Train set: {len(X_train):,} | Test set: {len(X_test):,}")
print(f"📊 Train suspension distribution (encoded):\n{pd.Series(y_train).value_counts().sort_index()}")

# A chronological cut can leave whole suspension levels out of the training
# rows. The model can never predict a level it has not seen, so surface this
# explicitly instead of letting it show up only as zero recall later on.
classes_missing_from_train = np.setdiff1d(np.unique(y_encoded), np.unique(y_train))
if classes_missing_from_train.size > 0:
    missing_levels = label_encoder.inverse_transform(classes_missing_from_train).tolist()
    print(f"⚠️ Suspension levels absent from the training split: {missing_levels} "
          f"(the model cannot learn to predict them)")


✂️ Splitting data (80-20 time-based)...
  🎯 Encoding target variable...
  Original classes: [0, 1, 2, 3, 4, 5]
  Encoded classes: [0, 1, 2, 3, 4, 5]
  📏 Scaling features...
✅ Train set: 605,472 | Test set: 151,368
📊 Train suspension distribution (encoded):
0    595860
3      1224
4      7572
5       816
Name: count, dtype: int64


In [5]:
print("\n🤖 Training model with class imbalance handling...\n")

# Convert to float for LightGBM
X_train_lgb = X_train.astype(float)
X_test_lgb = X_test.astype(float)

# Report how many training rows each (encoded) suspension level has
print("📊 Class Distribution Analysis:")
class_counts = pd.Series(y_train).value_counts().sort_index()
total_samples = len(y_train)
for class_idx, count in class_counts.items():
    percentage = (count / total_samples) * 100
    original_class = label_encoder.inverse_transform([class_idx])[0]
    print(f"   Class {original_class} (encoded {class_idx}): {count:,} samples ({percentage:.2f}%)")

# Inverse-frequency class weights: n_samples / (n_classes * n_samples_in_class).
# They are computed and printed for reference only; see the model construction
# below for why they are not passed to the classifier.
print("\n⚖️ Calculating class weights...")
train_classes = np.unique(y_train)
class_weights = {
    class_idx: total_samples / (len(train_classes) * class_counts[class_idx])
    for class_idx in train_classes
}

print("Class weights (higher = more weight for minority classes):")
for class_idx, weight in sorted(class_weights.items()):
    original_class = label_encoder.inverse_transform([class_idx])[0]
    print(f"   Class {original_class}: {weight:.2f}")

# Oversample the minority classes with SMOTE (training rows only; the test set
# is left untouched).
# NOTE(review): a recorded run of this step failed inside SMOTE with
# "AttributeError: 'NoneType' object has no attribute 'split'". That looks like
# the known threadpoolctl/BLAS version-detection incompatibility rather than a
# data problem — try upgrading threadpoolctl and confirm.
print("\n🔄 Applying SMOTE to balance training data...")
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42, k_neighbors=3)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_lgb, y_train)

print(f"   Original training size: {len(X_train_lgb):,}")
print(f"   Balanced training size: {len(X_train_balanced):,}")
print("\nBalanced class distribution:")
balanced_counts = pd.Series(y_train_balanced).value_counts().sort_index()
for class_idx, count in balanced_counts.items():
    original_class = label_encoder.inverse_transform([class_idx])[0]
    print(f"   Class {original_class}: {count:,} samples")

# Define best hyperparameters
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0;
# with the default of 0 this setting is presumably inactive — confirm intent.
best_params = {
    'colsample_bytree': 0.8,
    'learning_rate': 0.05,
    'max_depth': 8,
    'min_child_samples': 20,
    'n_estimators': 600,
    'num_leaves': 50,
    'reg_alpha': 0.5,
    'reg_lambda': 3.0,
    'subsample': 0.9
}

print("\n🔧 Training with optimized parameters...")
print("="*60)
for param, value in sorted(best_params.items()):
    print(f"   {param}: {value}")
print("="*60)

# Train on the SMOTE-balanced data WITHOUT class weights. After SMOTE every
# class has the same number of rows, so additionally applying the
# inverse-frequency weights of the original distribution would correct for the
# imbalance twice and tilt the model heavily towards the rare levels.
best_model = LGBMClassifier(
    **best_params,
    num_class=len(train_classes),
    random_state=42,
    verbose=-1,
    n_jobs=-1,
    force_row_wise=True
)

print("\n🎯 Training on balanced dataset...")
best_model.fit(X_train_balanced, y_train_balanced)
print("✅ Training complete!\n")


🤖 Training model with class imbalance handling...

📊 Class Distribution Analysis:
   Class 0 (encoded 0): 595,860 samples (98.41%)
   Class 3 (encoded 3): 1,224 samples (0.20%)
   Class 4 (encoded 4): 7,572 samples (1.25%)
   Class 5 (encoded 5): 816 samples (0.13%)

⚖️ Calculating class weights...
Class weights (higher = more weight for minority classes):
   Class 0: 0.25
   Class 3: 123.67
   Class 4: 19.99
   Class 5: 185.50

🔄 Applying SMOTE to balance training data...


AttributeError: 'NoneType' object has no attribute 'split'

In [None]:
from sklearn.metrics import balanced_accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

print("📊 Evaluating model on imbalanced test set...\n")

y_pred = best_model.predict(X_test_lgb)
y_pred_proba = best_model.predict_proba(X_test_lgb)

# Every encoded class the label encoder knows about. Passing this explicitly
# to the per-class metrics and the confusion matrix guarantees that position i
# in their output is encoded class i. Without it scikit-learn only reports the
# labels that occur in y_test/y_pred, and the positional loops below would
# attach results to the wrong suspension level whenever a class is missing.
all_labels = np.arange(len(label_encoder.classes_))

# Overall metrics (weighted by class frequency)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average='weighted', zero_division=0
)

# Per-class metrics (IMPORTANT for imbalanced data)
precision_per_class, recall_per_class, f1_per_class, support_per_class = precision_recall_fscore_support(
    y_test, y_pred, labels=all_labels, average=None, zero_division=0
)

print("="*60)
print("📈 PER-CLASS PERFORMANCE (Key for Imbalanced Data)")
print("="*60)
for class_idx in all_labels:
    original_class = label_encoder.inverse_transform([class_idx])[0]
    print(f"\nClass {original_class} (Level {original_class}):")
    print(f"   Samples in test: {support_per_class[class_idx]}")
    print(f"   Precision: {precision_per_class[class_idx]:.4f}")
    print(f"   Recall:    {recall_per_class[class_idx]:.4f}")
    print(f"   F1 Score:  {f1_per_class[class_idx]:.4f}")

# Confusion matrix with detailed breakdown (rows = true class, columns = predicted)
cm = confusion_matrix(y_test, y_pred, labels=all_labels)
print("\n" + "="*60)
print("🔍 CONFUSION MATRIX ANALYSIS")
print("="*60)
print("\nConfusion Matrix:")
print(cm)

print("\nPer-class accuracy breakdown:")
for class_idx in all_labels:
    original_class = label_encoder.inverse_transform([class_idx])[0]
    correct = cm[class_idx, class_idx]
    total = cm[class_idx, :].sum()
    if total > 0:
        class_accuracy = correct / total
        print(f"   Class {original_class}: {correct}/{total} correct ({class_accuracy:.2%})")
    else:
        print(f"   Class {original_class}: No samples in test set")

# Macro vs Weighted metrics (important distinction for imbalanced data)
print("\n" + "="*60)
print("📊 AGGREGATE METRICS")
print("="*60)
precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
    y_test, y_pred, average='macro', zero_division=0
)

print(f"\nWeighted Metrics (accounts for class frequency):")
print(f"   Accuracy:  {accuracy:.4f}")
print(f"   Precision: {precision:.4f}")
print(f"   Recall:    {recall:.4f}")
print(f"   F1 Score:  {f1:.4f}")

print(f"\nMacro Metrics (treats all classes equally):")
print(f"   Precision: {precision_macro:.4f}")
print(f"   Recall:    {recall_macro:.4f}")
print(f"   F1 Score:  {f1_macro:.4f}")

# Cross-validate on the ORIGINAL training rows, with SMOTE applied inside each
# training fold. Scoring data that was oversampled beforehand puts synthetic
# points interpolated from training rows into the validation folds and inflates
# the score. The imblearn pipeline resamples only while fitting, and
# cross_val_score clones it, so the already fitted best_model is not modified.
print("\n🔄 Running 5-fold cross-validation (SMOTE applied inside each training fold)...")
cv_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42, k_neighbors=3)),
    ('model', best_model),
])
cv_scores = cross_val_score(cv_pipeline, X_train_lgb, y_train, cv=5, scoring='accuracy')
print(f"   CV Score: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")

# Calculate balanced accuracy (better metric for imbalanced data)
balanced_acc = balanced_accuracy_score(y_test, y_pred)
print(f"\n⚖️ Balanced Accuracy: {balanced_acc:.4f} (accounts for class imbalance)")
print("="*60 + "\n")

In [None]:
print("📈 Extracting feature importance...")

# Map each model input to its importance, then re-order from most to least
# important (ties keep their feature_cols order).
importances = best_model.feature_importances_
importance_by_name = dict(zip(feature_cols, importances.tolist()))
ranked_names = sorted(importance_by_name, key=importance_by_name.get, reverse=True)
feature_importance = {name: importance_by_name[name] for name in ranked_names}

print("Top 10 features:")
for rank, name in enumerate(ranked_names[:10], start=1):
    print(f"  {rank}. {name}: {feature_importance[name]:.4f}")

In [None]:
print("\n🎯 Calculating suspension thresholds...")

# Summarise the weather readings observed at each suspension level (0-5).
# Levels with no rows are left out of the result.
thresholds = {}
for level in range(6):
    level_data = df[df['suspension'] == level]
    if len(level_data) == 0:
        continue
    precip = level_data['precipitation']
    wind = level_data['windspeed_10m']
    humidity = level_data['relativehumidity_2m']
    thresholds[f'level_{level}'] = dict(
        precipitation_mean=float(precip.mean()),
        precipitation_max=float(precip.max()),
        windspeed_mean=float(wind.mean()),
        windspeed_max=float(wind.max()),
        humidity_mean=float(humidity.mean()),
        count=int(len(level_data)),
    )

In [None]:
print("\n💾 Saving model artifacts...\n")

# Pickle the fitted model together with the preprocessing objects it was
# trained with (scaler, city encoder, target encoder).
pickled_artifacts = (
    (best_model, 'model.pkl'),
    (preprocessor, 'preprocessor.pkl'),
    (city_encoder, 'city_encoder.pkl'),
    (label_encoder, 'label_encoder.pkl'),
)
for artifact, target_path in pickled_artifacts:
    joblib.dump(artifact, target_path)
    print(f"✅ Saved: {target_path}")

# Evaluation results of the tuned model, converted to plain Python types so
# they are JSON-serialisable.
tuned_model_scores = {
    'accuracy': float(accuracy),
    'precision': float(precision),
    'recall': float(recall),
    'f1': float(f1),
    'precision_per_class': precision_per_class.tolist(),
    'recall_per_class': recall_per_class.tolist(),
    'f1_per_class': f1_per_class.tolist(),
    'confusion_matrix': cm.tolist(),
    'cv_mean': float(cv_scores.mean()),
    'cv_std': float(cv_scores.std()),
    'best_params': best_params
}

metrics = {
    'best_model': 'lightgbm_tuned',
    'models': {'lightgbm_tuned': tuned_model_scores},
    'hyperparameters': best_params,
    'feature_names': feature_cols,
    'trained_date': datetime.now().isoformat(),
    # encoded class index -> original suspension level
    'label_mapping': dict(
        (int(encoded), int(level)) for encoded, level in enumerate(label_encoder.classes_)
    ),
    'dataset_info': {
        'total_records': len(df),
        'date_range': {
            'start': df['date'].min().isoformat(),
            'end': df['date'].max().isoformat()
        },
        'cities': list(city_encoder.classes_)
    }
}

with open('metrics.json', 'w') as out_file:
    json.dump(metrics, out_file, indent=2)
print("✅ Saved: metrics.json")

with open('feature_importance.json', 'w') as out_file:
    json.dump(feature_importance, out_file, indent=2)
print("✅ Saved: feature_importance.json")

np.save('confusion_matrix.npy', cm)
print("✅ Saved: confusion_matrix.npy")

with open('thresholds.json', 'w') as out_file:
    json.dump(thresholds, out_file, indent=2)
print("✅ Saved: thresholds.json")

# Compact run summary, written separately from the full metrics file.
training_stats = dict(
    model_version='2.0',
    training_date=datetime.now().isoformat(),
    best_model='lightgbm_tuned',
    accuracy=float(accuracy),
    f1_score=float(f1),
    train_size=len(X_train),
    test_size=len(X_test),
    total_features=len(feature_cols),
    tuned=True,
    best_hyperparameters=best_params,
)

with open('training_stats.json', 'w') as out_file:
    json.dump(training_stats, out_file, indent=2)
print("✅ Saved: training_stats.json")

# Marker file written last, after every other artifact has been saved.
with open('model_ready.flag', 'w') as out_file:
    out_file.write(f"Model trained successfully at {datetime.now()}")
print("✅ Saved: model_ready.flag")

In [None]:
# Human-readable summary of the run: dataset, hyperparameters and headline scores.
rule = "=" * 60

print("\n" + rule)
print("📋 TRAINING REPORT")
print(rule)
print("\n🎯 Model: LightGBM Classifier (Hyperparameter Tuned)")
print(f"📊 Dataset: {len(df):,} records")
report_start, report_end = df['date'].min(), df['date'].max()
print(f"📅 Date Range: {report_start} to {report_end}")
print(f"🏙️ Cities: {len(city_encoder.classes_)}")

print("\n🔧 Best Hyperparameters:")
for param in sorted(best_params):
    print(f"   {param}: {best_params[param]}")

print("\n📈 Performance Metrics:")
headline_scores = (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
)
for score_label, score in headline_scores:
    # labels are left-aligned to a common width so the numbers line up
    print(f"   {score_label:<10} {score:.4f}")

print("\n🔄 Cross-Validation:")
print(f"   Mean: {cv_scores.mean():.4f}")
print(f"   Std:  {cv_scores.std():.4f}")
print("\n" + rule)
print("✅ Model training complete and artifacts saved!")
print(rule)
print("\n🚀 Ready to run: streamlit run main.py\n")

In [None]:
"""
Metro Manila Suspension Prediction - Model Training (Notebook Version)
Simplified workflow for Jupyter notebooks
"""

import pandas as pd
import numpy as np
import json
import joblib
from datetime import datetime
from sklearn.model_selection import train_test_split, cross_val_score, ParameterSampler
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, precision_recall_fscore_support, 
                             confusion_matrix, classification_report)
from lightgbm import LGBMClassifier
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# =============================================================================
# 1. LOAD AND PREPROCESS DATA
# =============================================================================

# Load the weather/suspension records; the 'date' column is parsed to datetimes.
print("📊 Loading dataset...")
df = pd.read_csv('metro_manila_weather_sus_data.csv', parse_dates=['date'])
first_ts, last_ts = df['date'].min(), df['date'].max()
print(f"✅ Loaded {len(df):,} records from {first_ts} to {last_ts}")

# Impute gaps: a missing precipitation reading is filled with 0, every other
# weather column with its own median.
print("🔧 Handling missing values...")
median_filled_cols = [
    'relativehumidity_2m', 'temperature_2m',
    'apparent_temperature', 'windspeed_10m',
]
fill_values = {col: df[col].median() for col in median_filled_cols}
fill_values['precipitation'] = 0
df = df.fillna(fill_values)

print(f"✅ Dataset shape: {df.shape}")
suspension_counts = df['suspension'].value_counts().sort_index()
print(f"📈 Suspension distribution:\n{suspension_counts}")

# =============================================================================
# 2. FEATURE ENGINEERING
# =============================================================================

print("\n🛠️ Engineering features...")

# Calendar / time-of-day features derived from the timestamp.
df['hour'] = df['date'].dt.hour
df['day_of_week'] = df['date'].dt.dayofweek
df['month'] = df['date'].dt.month
df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)  # dayofweek: Monday=0 ... Sunday=6
df['is_rush_hour'] = df['hour'].isin([7, 8, 9, 17, 18, 19]).astype(int)

# Rolling and lag features look at preceding rows, so rows must be in
# chronological order within each city before they are computed.
df = df.sort_values(['city', 'date'])

# Trailing means over the last 3/6/12 rows of each city. The "h" in the column
# names assumes one row per hour with no gaps — TODO confirm against the data.
# min_periods=1 keeps the first rows of each city from becoming NaN.
print("  📊 Computing rolling averages...")
for window in [3, 6, 12]:
    df[f'precip_roll_{window}h'] = df.groupby('city')['precipitation'].transform(
        lambda x: x.rolling(window, min_periods=1).mean()
    )
    df[f'wind_roll_{window}h'] = df.groupby('city')['windspeed_10m'].transform(
        lambda x: x.rolling(window, min_periods=1).mean()
    )

# Values from 1-3 rows earlier within the same city. The first rows of each city
# have no predecessor: precipitation falls back to 0, temperature to the
# overall median (computed once, it does not change between iterations).
print("  🔄 Creating lag features...")
temp_median = df['temperature_2m'].median()
for lag in [1, 2, 3]:
    df[f'precip_lag_{lag}h'] = df.groupby('city')['precipitation'].shift(lag).fillna(0)
    df[f'temp_lag_{lag}h'] = df.groupby('city')['temperature_2m'].shift(lag).fillna(temp_median)

# Peak flags: the current reading is well above its own 6-row trailing mean.
df['is_precip_peak'] = (df['precipitation'] > df['precip_roll_6h'] * 1.5).astype(int)
df['is_wind_peak'] = (df['windspeed_10m'] > df['wind_roll_6h'] * 1.3).astype(int)

# Difference between the apparent and the measured temperature.
df['apparent_temp_delta'] = df['apparent_temperature'] - df['temperature_2m']

# Ordinal intensity buckets. The top bucket is open-ended (np.inf): with a
# finite upper edge, a reading above it falls outside every bin, becomes NaN
# and makes .astype(int) raise. Readings inside the previous edges map to
# exactly the same codes as before.
df['precip_intensity'] = pd.cut(df['precipitation'],
                                bins=[-1, 0, 5, 15, 50, np.inf],
                                labels=[0, 1, 2, 3, 4]).astype(int)

df['wind_category'] = pd.cut(df['windspeed_10m'],
                             bins=[-1, 20, 40, 60, np.inf],
                             labels=[0, 1, 2, 3]).astype(int)

# Integer-encode the city name; the fitted encoder is kept for later reuse.
print("  🏙️ Encoding cities...")
city_encoder = LabelEncoder()
df['city_encoded'] = city_encoder.fit_transform(df['city'])

# Note: this counts every column of df (including date, city and the target),
# not only the model inputs.
print(f"✅ Feature engineering complete. Total features: {len(df.columns)}")

# =============================================================================
# 3. PREPARE TRAIN-TEST SPLIT
# =============================================================================

print("\n✂️ Splitting data (80-20 time-based)...")

# Order rows chronologically so the first 80% of rows is the "past" (train)
# and the last 20% the "future" (test).
df = df.sort_values('date')

# Columns fed to the model
feature_cols = [
    'relativehumidity_2m', 'temperature_2m', 'precipitation', 
    'apparent_temperature', 'windspeed_10m',
    'hour', 'day_of_week', 'month', 'is_weekend', 'is_rush_hour',
    'precip_roll_3h', 'precip_roll_6h', 'precip_roll_12h',
    'wind_roll_3h', 'wind_roll_6h', 'wind_roll_12h',
    'precip_lag_1h', 'precip_lag_2h', 'precip_lag_3h',
    'temp_lag_1h', 'temp_lag_2h', 'temp_lag_3h',
    'is_precip_peak', 'is_wind_peak', 'apparent_temp_delta',
    'precip_intensity', 'wind_category', 'city_encoded'
]

X = df[feature_cols].copy()
y = df['suspension'].copy()

# Ensure all columns are numeric (guards against a categorical column slipping through)
for col in X.columns:
    if X[col].dtype == 'category':
        X[col] = X[col].astype(int)

# Encode the target on the FULL label set so train and test share one mapping
print("  🎯 Encoding target variable...")
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(f"  Original classes: {sorted(y.unique())}")
print(f"  Encoded classes: {sorted(np.unique(y_encoded))}")

# Time-based split: positional cut at 80% of the chronologically sorted rows
split_idx = int(len(X) * 0.8)
X_train = X.iloc[:split_idx].copy()
X_test = X.iloc[split_idx:].copy()
y_train = y_encoded[:split_idx]
y_test = y_encoded[split_idx:]

# Standardise the continuous weather readings. The scaler is fitted on the
# training rows only and then applied to the test rows.
print("  📏 Scaling features...")
preprocessor = StandardScaler()
numerical_cols = ['relativehumidity_2m', 'temperature_2m', 'precipitation', 
                 'apparent_temperature', 'windspeed_10m', 'apparent_temp_delta']

X_train[numerical_cols] = preprocessor.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = preprocessor.transform(X_test[numerical_cols])

print(f"✅ Train set: {len(X_train):,} | Test set: {len(X_test):,}")
print(f"📊 Train suspension distribution (encoded):\n{pd.Series(y_train).value_counts().sort_index()}")

# A chronological cut can leave whole suspension levels out of the training
# rows. The model can never predict a level it has not seen, so surface this
# explicitly instead of letting it show up only as zero recall later on.
classes_missing_from_train = np.setdiff1d(np.unique(y_encoded), np.unique(y_train))
if classes_missing_from_train.size > 0:
    missing_levels = label_encoder.inverse_transform(classes_missing_from_train).tolist()
    print(f"⚠️ Suspension levels absent from the training split: {missing_levels} "
          f"(the model cannot learn to predict them)")

# =============================================================================
# 4. HANDLE CLASS IMBALANCE & TRAIN MODEL
# =============================================================================

print("\n🤖 Training model with class imbalance handling...\n")

# Convert to float for LightGBM
X_train_lgb = X_train.astype(float)
X_test_lgb = X_test.astype(float)

# Report how many training rows each (encoded) suspension level has
print("📊 Class Distribution Analysis:")
class_counts = pd.Series(y_train).value_counts().sort_index()
total_samples = len(y_train)
for class_idx, count in class_counts.items():
    percentage = (count / total_samples) * 100
    original_class = label_encoder.inverse_transform([class_idx])[0]
    print(f"   Class {original_class} (encoded {class_idx}): {count:,} samples ({percentage:.2f}%)")

# Inverse-frequency class weights: n_samples / (n_classes * n_samples_in_class).
# They are computed and printed for reference only; see the model construction
# below for why they are not passed to the classifier.
print("\n⚖️ Calculating class weights...")
train_classes = np.unique(y_train)
class_weights = {
    class_idx: total_samples / (len(train_classes) * class_counts[class_idx])
    for class_idx in train_classes
}

print("Class weights (higher = more weight for minority classes):")
for class_idx, weight in sorted(class_weights.items()):
    original_class = label_encoder.inverse_transform([class_idx])[0]
    print(f"   Class {original_class}: {weight:.2f}")

# Oversample the minority classes with SMOTE (training rows only; the test set
# is left untouched).
# NOTE(review): a recorded run of this step failed inside SMOTE with
# "AttributeError: 'NoneType' object has no attribute 'split'". That looks like
# the known threadpoolctl/BLAS version-detection incompatibility rather than a
# data problem — try upgrading threadpoolctl and confirm.
print("\n🔄 Applying SMOTE to balance training data...")
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42, k_neighbors=3)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_lgb, y_train)

print(f"   Original training size: {len(X_train_lgb):,}")
print(f"   Balanced training size: {len(X_train_balanced):,}")
print("\nBalanced class distribution:")
balanced_counts = pd.Series(y_train_balanced).value_counts().sort_index()
for class_idx, count in balanced_counts.items():
    original_class = label_encoder.inverse_transform([class_idx])[0]
    print(f"   Class {original_class}: {count:,} samples")

# Define best hyperparameters
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0;
# with the default of 0 this setting is presumably inactive — confirm intent.
best_params = {
    'colsample_bytree': 0.8,
    'learning_rate': 0.05,
    'max_depth': 8,
    'min_child_samples': 20,
    'n_estimators': 600,
    'num_leaves': 50,
    'reg_alpha': 0.5,
    'reg_lambda': 3.0,
    'subsample': 0.9
}

print("\n🔧 Training with optimized parameters...")
print("="*60)
for param, value in sorted(best_params.items()):
    print(f"   {param}: {value}")
print("="*60)

# Train on the SMOTE-balanced data WITHOUT class weights. After SMOTE every
# class has the same number of rows, so additionally applying the
# inverse-frequency weights of the original distribution would correct for the
# imbalance twice and tilt the model heavily towards the rare levels.
best_model = LGBMClassifier(
    **best_params,
    num_class=len(train_classes),
    random_state=42,
    verbose=-1,
    n_jobs=-1,
    force_row_wise=True
)

print("\n🎯 Training on balanced dataset...")
best_model.fit(X_train_balanced, y_train_balanced)
print("✅ Training complete!\n")

# =============================================================================
# 5. EVALUATE MODEL (WITH CLASS-SPECIFIC METRICS)
# =============================================================================

from sklearn.metrics import balanced_accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

print("📊 Evaluating model on imbalanced test set...\n")

y_pred = best_model.predict(X_test_lgb)
y_pred_proba = best_model.predict_proba(X_test_lgb)

# Every encoded class the label encoder knows about. Passing this explicitly
# to the per-class metrics and the confusion matrix guarantees that position i
# in their output is encoded class i. Without it scikit-learn only reports the
# labels that occur in y_test/y_pred, and the positional loops below would
# attach results to the wrong suspension level whenever a class is missing.
all_labels = np.arange(len(label_encoder.classes_))

# Overall metrics (weighted by class frequency)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average='weighted', zero_division=0
)

# Per-class metrics (IMPORTANT for imbalanced data)
precision_per_class, recall_per_class, f1_per_class, support_per_class = precision_recall_fscore_support(
    y_test, y_pred, labels=all_labels, average=None, zero_division=0
)

print("="*60)
print("📈 PER-CLASS PERFORMANCE (Key for Imbalanced Data)")
print("="*60)
for class_idx in all_labels:
    original_class = label_encoder.inverse_transform([class_idx])[0]
    print(f"\nClass {original_class} (Level {original_class}):")
    print(f"   Samples in test: {support_per_class[class_idx]}")
    print(f"   Precision: {precision_per_class[class_idx]:.4f}")
    print(f"   Recall:    {recall_per_class[class_idx]:.4f}")
    print(f"   F1 Score:  {f1_per_class[class_idx]:.4f}")

# Confusion matrix with detailed breakdown (rows = true class, columns = predicted)
cm = confusion_matrix(y_test, y_pred, labels=all_labels)
print("\n" + "="*60)
print("🔍 CONFUSION MATRIX ANALYSIS")
print("="*60)
print("\nConfusion Matrix:")
print(cm)

print("\nPer-class accuracy breakdown:")
for class_idx in all_labels:
    original_class = label_encoder.inverse_transform([class_idx])[0]
    correct = cm[class_idx, class_idx]
    total = cm[class_idx, :].sum()
    if total > 0:
        class_accuracy = correct / total
        print(f"   Class {original_class}: {correct}/{total} correct ({class_accuracy:.2%})")
    else:
        print(f"   Class {original_class}: No samples in test set")

# Macro vs Weighted metrics (important distinction for imbalanced data)
print("\n" + "="*60)
print("📊 AGGREGATE METRICS")
print("="*60)
precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
    y_test, y_pred, average='macro', zero_division=0
)

print(f"\nWeighted Metrics (accounts for class frequency):")
print(f"   Accuracy:  {accuracy:.4f}")
print(f"   Precision: {precision:.4f}")
print(f"   Recall:    {recall:.4f}")
print(f"   F1 Score:  {f1:.4f}")

print(f"\nMacro Metrics (treats all classes equally):")
print(f"   Precision: {precision_macro:.4f}")
print(f"   Recall:    {recall_macro:.4f}")
print(f"   F1 Score:  {f1_macro:.4f}")

# Cross-validate on the ORIGINAL training rows, with SMOTE applied inside each
# training fold. Scoring data that was oversampled beforehand puts synthetic
# points interpolated from training rows into the validation folds and inflates
# the score. The imblearn pipeline resamples only while fitting, and
# cross_val_score clones it, so the already fitted best_model is not modified.
print("\n🔄 Running 5-fold cross-validation (SMOTE applied inside each training fold)...")
cv_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42, k_neighbors=3)),
    ('model', best_model),
])
cv_scores = cross_val_score(cv_pipeline, X_train_lgb, y_train, cv=5, scoring='accuracy')
print(f"   CV Score: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")

# Calculate balanced accuracy (better metric for imbalanced data)
balanced_acc = balanced_accuracy_score(y_test, y_pred)
print(f"\n⚖️ Balanced Accuracy: {balanced_acc:.4f} (accounts for class imbalance)")
print("="*60 + "\n")

# =============================================================================
# 6. FEATURE IMPORTANCE
# =============================================================================

print("📈 Extracting feature importance...")

# Map each model input to its importance, then re-order from most to least
# important (ties keep their feature_cols order).
importances = best_model.feature_importances_
importance_by_name = dict(zip(feature_cols, importances.tolist()))
ranked_names = sorted(importance_by_name, key=importance_by_name.get, reverse=True)
feature_importance = {name: importance_by_name[name] for name in ranked_names}

print("Top 10 features:")
for rank, name in enumerate(ranked_names[:10], start=1):
    print(f"  {rank}. {name}: {feature_importance[name]:.4f}")

# =============================================================================
# 7. CALCULATE THRESHOLDS
# =============================================================================

print("\n🎯 Calculating suspension thresholds...")

# Summarise the weather readings observed at each suspension level (0-5).
# Levels with no rows are left out of the result.
thresholds = {}
for level in range(6):
    level_data = df[df['suspension'] == level]
    if len(level_data) == 0:
        continue
    precip = level_data['precipitation']
    wind = level_data['windspeed_10m']
    humidity = level_data['relativehumidity_2m']
    thresholds[f'level_{level}'] = dict(
        precipitation_mean=float(precip.mean()),
        precipitation_max=float(precip.max()),
        windspeed_mean=float(wind.mean()),
        windspeed_max=float(wind.max()),
        humidity_mean=float(humidity.mean()),
        count=int(len(level_data)),
    )

# =============================================================================
# 8. SAVE ARTIFACTS
# =============================================================================

print("\n💾 Saving model artifacts...\n")

# Pickle the fitted model together with the preprocessing objects it was
# trained with (scaler, city encoder, target encoder).
pickled_artifacts = (
    (best_model, 'model.pkl'),
    (preprocessor, 'preprocessor.pkl'),
    (city_encoder, 'city_encoder.pkl'),
    (label_encoder, 'label_encoder.pkl'),
)
for artifact, target_path in pickled_artifacts:
    joblib.dump(artifact, target_path)
    print(f"✅ Saved: {target_path}")

# Evaluation results of the tuned model, converted to plain Python types so
# they are JSON-serialisable.
tuned_model_scores = {
    'accuracy': float(accuracy),
    'precision': float(precision),
    'recall': float(recall),
    'f1': float(f1),
    'precision_per_class': precision_per_class.tolist(),
    'recall_per_class': recall_per_class.tolist(),
    'f1_per_class': f1_per_class.tolist(),
    'confusion_matrix': cm.tolist(),
    'cv_mean': float(cv_scores.mean()),
    'cv_std': float(cv_scores.std()),
    'best_params': best_params
}

metrics = {
    'best_model': 'lightgbm_tuned',
    'models': {'lightgbm_tuned': tuned_model_scores},
    'hyperparameters': best_params,
    'feature_names': feature_cols,
    'trained_date': datetime.now().isoformat(),
    # encoded class index -> original suspension level
    'label_mapping': dict(
        (int(encoded), int(level)) for encoded, level in enumerate(label_encoder.classes_)
    ),
    'dataset_info': {
        'total_records': len(df),
        'date_range': {
            'start': df['date'].min().isoformat(),
            'end': df['date'].max().isoformat()
        },
        'cities': list(city_encoder.classes_)
    }
}

with open('metrics.json', 'w') as out_file:
    json.dump(metrics, out_file, indent=2)
print("✅ Saved: metrics.json")

with open('feature_importance.json', 'w') as out_file:
    json.dump(feature_importance, out_file, indent=2)
print("✅ Saved: feature_importance.json")

np.save('confusion_matrix.npy', cm)
print("✅ Saved: confusion_matrix.npy")

with open('thresholds.json', 'w') as out_file:
    json.dump(thresholds, out_file, indent=2)
print("✅ Saved: thresholds.json")

# Compact run summary, written separately from the full metrics file.
training_stats = dict(
    model_version='2.0',
    training_date=datetime.now().isoformat(),
    best_model='lightgbm_tuned',
    accuracy=float(accuracy),
    f1_score=float(f1),
    train_size=len(X_train),
    test_size=len(X_test),
    total_features=len(feature_cols),
    tuned=True,
    best_hyperparameters=best_params,
)

with open('training_stats.json', 'w') as out_file:
    json.dump(training_stats, out_file, indent=2)
print("✅ Saved: training_stats.json")

# Marker file written last, after every other artifact has been saved.
with open('model_ready.flag', 'w') as out_file:
    out_file.write(f"Model trained successfully at {datetime.now()}")
print("✅ Saved: model_ready.flag")

# =============================================================================
# 9. TRAINING REPORT
# =============================================================================

# Human-readable summary of the run: dataset, hyperparameters and headline scores.
rule = "=" * 60

print("\n" + rule)
print("📋 TRAINING REPORT")
print(rule)
print("\n🎯 Model: LightGBM Classifier (Hyperparameter Tuned)")
print(f"📊 Dataset: {len(df):,} records")
report_start, report_end = df['date'].min(), df['date'].max()
print(f"📅 Date Range: {report_start} to {report_end}")
print(f"🏙️ Cities: {len(city_encoder.classes_)}")

print("\n🔧 Best Hyperparameters:")
for param in sorted(best_params):
    print(f"   {param}: {best_params[param]}")

print("\n📈 Performance Metrics:")
headline_scores = (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
)
for score_label, score in headline_scores:
    # labels are left-aligned to a common width so the numbers line up
    print(f"   {score_label:<10} {score:.4f}")

print("\n🔄 Cross-Validation:")
print(f"   Mean: {cv_scores.mean():.4f}")
print(f"   Std:  {cv_scores.std():.4f}")
print("\n" + rule)
print("✅ Model training complete and artifacts saved!")
print(rule)
print("\n🚀 Ready to run: streamlit run main.py\n")