In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import pickle
import json
import os
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
import xgboost as xgb
import lightgbm as lgb

print("="*70)
print("ADVANCED AQI PREDICTION - WITH HYPERPARAMETER TUNING (OPTIMIZED)")
print("="*70)

os.makedirs('models', exist_ok=True)

# ============================================================================
# 1. Load Data
# ============================================================================

def load_from_mongodb(uri, max_attempts=2):
    """Try MongoDB"""
    from pymongo import MongoClient
    from pymongo.server_api import ServerApi
    
    for attempt in range(max_attempts):
        try:
            print(f"\nAttempt {attempt + 1}/{max_attempts}: Connecting to MongoDB...")
            client = MongoClient(uri, server_api=ServerApi('1'),
                               serverSelectionTimeoutMS=5000, connectTimeoutMS=5000)
            client.admin.command('ping')
            print("‚úì Connected!")
            
            db = client['aqi_feature_store']
            collection = db['aqi_features']
            data = pd.DataFrame(list(collection.find({}, {"_id": 0})))
            client.close()
            
            print(f"‚úì Loaded {len(data)} records from MongoDB")
            return data, 'mongodb'
        except Exception as e:
            print(f"‚úó Failed: {str(e)[:80]}")
    return None, None

def load_from_csv(csv_path):
    """Load from CSV"""
    try:
        print(f"\nLoading from CSV: {csv_path}")
        data = pd.read_csv(csv_path)
        print(f"‚úì Loaded {len(data)} records")
        return data, 'csv'
    except:
        return None, None

print("\n1. Loading data...")

MONGO_URI = "mongodb+srv://nawababbas08_db_user:2Ja4OGlDdKfG6EvZ@cluster0.jnxn95g.mongodb.net/?retryWrites=true&w=majority&tlsAllowInvalidCertificates=true"
CSV_PATH = "data/cleaned_aqi_data_v2.csv"

data, source = load_from_mongodb(MONGO_URI, 2)
if data is None:
    print("\n‚ö†Ô∏è MongoDB failed, using CSV...")
    data, source = load_from_csv(CSV_PATH)

if data is None:
    print("\n‚úó ERROR: No data source available")
    exit(1)

print(f"\n‚úì Source: {source.upper()}")
print(f"‚úì Records: {len(data)}")

# ============================================================================
# 2. Enhanced Feature Engineering
# ============================================================================

print("\n2. Engineering features...")

if 'time' in data.columns:
    data['time'] = pd.to_datetime(data['time'])
    data = data.sort_values('time').reset_index(drop=True)
elif 'timestamp' in data.columns:
    data['timestamp'] = pd.to_datetime(data['timestamp'])
    data = data.sort_values('timestamp').reset_index(drop=True)

# More comprehensive lag features
print("   Creating lag features...")
for lag in [1, 2, 3, 6, 12, 24, 48]:
    if 'aqi' in data.columns:
        data[f'aqi_lag_{lag}h'] = data['aqi'].shift(lag)
    if 'pm2_5' in data.columns:
        data[f'pm25_lag_{lag}h'] = data['pm2_5'].shift(lag)

# Rolling statistics (mean, std, min, max)
print("   Creating rolling features...")
for window in [3, 6, 12, 24]:
    if 'aqi' in data.columns:
        data[f'aqi_ma_{window}h'] = data['aqi'].rolling(window=window, min_periods=1).mean()
        data[f'aqi_std_{window}h'] = data['aqi'].rolling(window=window, min_periods=1).std()
        data[f'aqi_min_{window}h'] = data['aqi'].rolling(window=window, min_periods=1).min()
        data[f'aqi_max_{window}h'] = data['aqi'].rolling(window=window, min_periods=1).max()

# Difference features (trend detection)
print("   Creating difference features...")
if 'aqi' in data.columns:
    data['aqi_diff_1h'] = data['aqi'].diff(1)
    data['aqi_diff_3h'] = data['aqi'].diff(3)
    data['aqi_diff_24h'] = data['aqi'].diff(24)

# Cyclical features (better encoding)
if 'hour' in data.columns:
    data['hour_sin'] = np.sin(2 * np.pi * data['hour'] / 24)
    data['hour_cos'] = np.cos(2 * np.pi * data['hour'] / 24)

if 'day_of_week' in data.columns:
    data['dow_sin'] = np.sin(2 * np.pi * data['day_of_week'] / 7)
    data['dow_cos'] = np.cos(2 * np.pi * data['day_of_week'] / 7)

if 'month' in data.columns:
    data['month_sin'] = np.sin(2 * np.pi * data['month'] / 12)
    data['month_cos'] = np.cos(2 * np.pi * data['month'] / 12)

# Targets
data['aqi_24h'] = data['aqi'].shift(-24)
data['aqi_48h'] = data['aqi'].shift(-48)
data['aqi_72h'] = data['aqi'].shift(-72)

# Remove rows with all NaN
data = data.dropna(axis=1, how='all')

print(f"‚úì After engineering: {data.shape[0]} records, {data.shape[1]} columns")

# ============================================================================
# 3. Prepare Data with Better Filtering
# ============================================================================

print("\n3. Preparing features...")

# Exclude target columns and categorical/string columns
exclude_cols = ['time', 'timestamp', 'aqi_24h', 'aqi_48h', 'aqi_72h',
                'dominant_pollutant', 'aqi_category', 'aqi_color', 'time_of_day',
                'season', 'weather_condition', 'day_of_week', 'day_of_month',
                'is_weekend']

# Get only numeric columns for features
feature_cols = [col for col in data.columns if col not in exclude_cols]
numeric_cols = data[feature_cols].select_dtypes(include=[np.number]).columns.tolist()
feature_cols = numeric_cols

print(f"‚úì Initial features: {len(feature_cols)}")

# Remove features with too many missing values
missing_threshold = 0.3
for col in feature_cols[:]:
    missing_pct = data[col].isnull().sum() / len(data)
    if missing_pct > missing_threshold:
        feature_cols.remove(col)
        print(f"   Removed {col} (missing: {missing_pct*100:.1f}%)")

print(f"‚úì After removing high-missing features: {len(feature_cols)}")

# Handle remaining missing values
data[feature_cols] = data[feature_cols].fillna(data[feature_cols].mean())

# Remove rows where target is missing
data = data.dropna(subset=['aqi_24h', 'aqi_48h', 'aqi_72h'])

print(f"‚úì Final dataset: {len(data)} records")

X = data[feature_cols]
y_24h = data['aqi_24h']
y_48h = data['aqi_48h']
y_72h = data['aqi_72h']

# Remove any remaining NaN
X = X.fillna(X.mean())

print(f"‚úì Features: {len(feature_cols)}")
print(f"‚úì Samples: {len(X)}")

# ============================================================================
# 4. Feature Selection
# ============================================================================

print("\n4. Feature selection...")

def select_best_features(X, y, k=30):
    """Select top K most important features"""
    if len(X.columns) <= k:
        return X.columns.tolist()
    
    # Use mutual information for feature selection
    selector = SelectKBest(score_func=mutual_info_regression, k=k)
    selector.fit(X, y)
    
    # Get selected feature names
    selected_features = X.columns[selector.get_support()].tolist()
    
    # Get scores
    scores = selector.scores_
    feature_scores = list(zip(X.columns, scores))
    feature_scores.sort(key=lambda x: x[1], reverse=True)
    
    print(f"   Top 10 features:")
    for feat, score in feature_scores[:10]:
        print(f"      {feat:30s}: {score:.3f}")
    
    return selected_features

# Select features for 24h prediction
selected_features = select_best_features(X, y_24h, k=min(30, len(X.columns)))
X = X[selected_features]

print(f"\n‚úì Selected {len(selected_features)} best features")

# ============================================================================
# 5. Time Series Split (Better for Time Series!)
# ============================================================================

print("\n5. Splitting data (time-series aware)...")

# Use 80-20 split but maintain time order
split_idx = int(len(X) * 0.8)
X_train, X_test = X[:split_idx], X[split_idx:]
y_24h_train, y_24h_test = y_24h[:split_idx], y_24h[split_idx:]
y_48h_train, y_48h_test = y_48h[:split_idx], y_48h[split_idx:]
y_72h_train, y_72h_test = y_72h[:split_idx], y_72h[split_idx:]

# Use RobustScaler (better for outliers)
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

with open('models/scaler_ml.pkl', 'wb') as f:
    pickle.dump(scaler, f)

print(f"‚úì Train: {len(X_train)}, Test: {len(X_test)}")

# ============================================================================
# 6. Define Model Hyperparameter Grids (OPTIMIZED!)
# ============================================================================

print("\n6. Setting up hyperparameter grids (optimized)...")

# MUCH SMALLER GRIDS - will run in 5-10 minutes instead of hours!
param_grids = {
    'Ridge': {
        'model': Ridge(),
        'params': {
            'alpha': [0.1, 1.0, 10.0]  # Reduced from 4 to 3 values
        },
        'n_iter': 3  # GridSearch will try all 3
    },
    'Lasso': {
        'model': Lasso(max_iter=5000),
        'params': {
            'alpha': [0.01, 0.1, 1.0]  # Reduced from 4 to 3 values
        },
        'n_iter': 3
    },
    'Random Forest': {
        'model': RandomForestRegressor(random_state=42, n_jobs=-1),
        'params': {
            'n_estimators': [50, 100],  # Reduced from 3 to 2
            'max_depth': [10, None],  # Reduced from 4 to 2
            'min_samples_split': [5, 10]  # Reduced from 3 to 2
        },
        'n_iter': 8  # Will sample 8 random combinations instead of all 48
    },
    'Gradient Boosting': {
        'model': GradientBoostingRegressor(random_state=42),
        'params': {
            'n_estimators': [50, 100],  # Reduced
            'learning_rate': [0.05, 0.1],  # Reduced
            'max_depth': [3, 5]  # Reduced
        },
        'n_iter': 8
    },
    'XGBoost': {
        'model': xgb.XGBRegressor(random_state=42, n_jobs=-1),
        'params': {
            'n_estimators': [50, 100],
            'learning_rate': [0.05, 0.1],
            'max_depth': [3, 5],
            'subsample': [0.8, 1.0]
        },
        'n_iter': 10
    },
    'LightGBM': {
        'model': lgb.LGBMRegressor(random_state=42, verbose=-1, n_jobs=-1),
        'params': {
            'n_estimators': [50, 100],
            'learning_rate': [0.05, 0.1],
            'max_depth': [3, 5],
            'num_leaves': [31, 63]
        },
        'n_iter': 10
    }
}

print(f"‚úì {len(param_grids)} models configured for tuning")
print(f"‚úì Using RandomizedSearchCV for faster training")

# ============================================================================
# 7. Evaluation Function
# ============================================================================

def evaluate(y_true, y_pred):
    """Comprehensive evaluation metrics"""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    
    # Accuracy within thresholds
    acc_20 = np.sum(np.abs(y_true - y_pred) <= 20) / len(y_true) * 100
    acc_10 = np.sum(np.abs(y_true - y_pred) <= 10) / len(y_true) * 100
    acc_5 = np.sum(np.abs(y_true - y_pred) <= 5) / len(y_true) * 100
    
    # MAPE (Mean Absolute Percentage Error)
    mape = np.mean(np.abs((y_true - y_pred) / (y_true + 1e-10))) * 100
    
    return {
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2,
        'MAPE': mape,
        'Acc20': acc_20,
        'Acc10': acc_10,
        'Acc5': acc_5
    }

# ============================================================================
# 8. Training with RandomizedSearchCV (FASTER!)
# ============================================================================

print("\n" + "="*70)
print("TRAINING MODELS WITH HYPERPARAMETER TUNING (OPTIMIZED)")
print("="*70)
print("\n‚è±Ô∏è  This should take 5-10 minutes...")

results = {}

# Time series cross-validation - REDUCED to 2 splits for speed
tscv = TimeSeriesSplit(n_splits=2)  # Changed from 3 to 2

for horizon, y_train, y_test in [
    ('24h', y_24h_train, y_24h_test),
    ('48h', y_48h_train, y_48h_test),
    ('72h', y_72h_train, y_72h_test)
]:
    print(f"\n{'='*70}")
    print(f"TRAINING FOR {horizon} AHEAD PREDICTION")
    print('='*70)
    
    results[horizon] = {}
    
    for name, config in param_grids.items():
        print(f"\n{name}...")
        print(f"   Tuning hyperparameters...")
        
        # Use RandomizedSearchCV for most models (faster than GridSearchCV)
        if name in ['Ridge', 'Lasso']:
            # GridSearch for simple models (fast anyway)
            from sklearn.model_selection import GridSearchCV
            search = GridSearchCV(
                estimator=config['model'],
                param_grid=config['params'],
                cv=tscv,
                scoring='r2',
                n_jobs=-1,
                verbose=0
            )
        else:
            # RandomizedSearch for complex models (much faster)
            search = RandomizedSearchCV(
                estimator=config['model'],
                param_distributions=config['params'],
                n_iter=config['n_iter'],  # Only try N random combinations
                cv=tscv,
                scoring='r2',
                n_jobs=-1,
                verbose=0,
                random_state=42
            )
        
        # Fit search
        search.fit(X_train_scaled, y_train)
        
        # Best model
        best_model = search.best_estimator_
        
        print(f"   Best params: {search.best_params_}")
        print(f"   Best CV R¬≤: {search.best_score_:.3f}")
        
        # Predictions
        y_pred_train = best_model.predict(X_train_scaled)
        y_pred_test = best_model.predict(X_test_scaled)
        
        # Evaluate
        train_metrics = evaluate(y_train, y_pred_train)
        test_metrics = evaluate(y_test, y_pred_test)
        
        # Store results
        results[horizon][name] = {
            'test_R2': test_metrics['R2'],
            'test_RMSE': test_metrics['RMSE'],
            'test_MAE': test_metrics['MAE'],
            'test_MAPE': test_metrics['MAPE'],
            'test_Acc20': test_metrics['Acc20'],
            'test_Acc10': test_metrics['Acc10'],
            'test_Acc5': test_metrics['Acc5'],
            'train_R2': train_metrics['R2'],
            'cv_R2': search.best_score_,
            'best_params': search.best_params_
        }
        
        # Display metrics
        print(f"\n   üìä Results:")
        print(f"      Test R¬≤:    {test_metrics['R2']:6.3f}")
        print(f"      Train R¬≤:   {train_metrics['R2']:6.3f}")
        print(f"      CV R¬≤:      {search.best_score_:6.3f}")
        print(f"      RMSE:       {test_metrics['RMSE']:6.2f}")
        print(f"      MAE:        {test_metrics['MAE']:6.2f}")
        print(f"      MAPE:       {test_metrics['MAPE']:6.2f}%")
        print(f"      Acc ¬±20:    {test_metrics['Acc20']:6.1f}%")
        print(f"      Acc ¬±10:    {test_metrics['Acc10']:6.1f}%")
        
        # Check for overfitting
        overfit_gap = train_metrics['R2'] - test_metrics['R2']
        if overfit_gap > 0.2:
            print(f"      ‚ö†Ô∏è  OVERFITTING (gap: {overfit_gap:.3f})")
        elif test_metrics['R2'] < 0:
            print(f"      ‚ö†Ô∏è  NEGATIVE R¬≤ - Model performs worse than baseline!")
        elif test_metrics['R2'] < 0.1:
            print(f"      ‚ö†Ô∏è  VERY LOW R¬≤ - Check data quality")
        else:
            print(f"      ‚úì  Good performance!")
        
        # Save model
        model_path = f'models/{name.lower().replace(" ", "_")}_{horizon}.pkl'
        with open(model_path, 'wb') as f:
            pickle.dump(best_model, f)
        print(f"      ‚úì  Saved: {model_path}")

# ============================================================================
# 9. Feature Importance Analysis
# ============================================================================

print("\n" + "="*70)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*70)

# Analyze feature importance for best model
best_24h = max(results['24h'].items(), key=lambda x: x[1]['test_R2'])
print(f"\nBest 24h model: {best_24h[0]} (R¬≤ = {best_24h[1]['test_R2']:.3f})")

# Save feature importance if available
feature_importance_path = 'models/feature_importance.csv'
try:
    # Load the best model
    model_name = best_24h[0].lower().replace(" ", "_")
    with open(f'models/{model_name}_24h.pkl', 'rb') as f:
        best_model = pickle.load(f)
    
    # Get feature importance
    if hasattr(best_model, 'feature_importances_'):
        importance_df = pd.DataFrame({
            'feature': selected_features,
            'importance': best_model.feature_importances_
        }).sort_values('importance', ascending=False)
        
        importance_df.to_csv(feature_importance_path, index=False)
        
        print(f"\nTop 15 Most Important Features:")
        for idx, row in importance_df.head(15).iterrows():
            print(f"   {row['feature']:30s}: {row['importance']:.4f}")
        
        print(f"\n‚úì Feature importance saved to: {feature_importance_path}")
except Exception as e:
    print(f"\n‚ö†Ô∏è Could not extract feature importance: {str(e)}")

# ============================================================================
# 10. Summary
# ============================================================================

print("\n" + "="*70)
print("RESULTS SUMMARY")
print("="*70)

for horizon in results.keys():
    print(f"\n{horizon} Ahead:")
    print("-" * 70)
    
    best = max(results[horizon].items(), key=lambda x: x[1]['test_R2'])
    
    for name in results[horizon]:
        m = results[horizon][name]
        marker = " ‚òÖ BEST" if name == best[0] else ""
        print(f"{name:18s}: R¬≤={m['test_R2']:7.3f}  RMSE={m['test_RMSE']:6.2f}  "
              f"MAE={m['test_MAE']:6.2f}  Acc¬±20={m['test_Acc20']:5.1f}%{marker}")

# Save results
with open('models/ml_tuned_results.json', 'w') as f:
    # Convert to serializable format
    results_serializable = {}
    for horizon in results:
        results_serializable[horizon] = {}
        for model_name in results[horizon]:
            results_serializable[horizon][model_name] = {
                k: v for k, v in results[horizon][model_name].items() 
                if k != 'best_params'
            }
            results_serializable[horizon][model_name]['best_params_str'] = str(
                results[horizon][model_name]['best_params']
            )
    
    json.dump(results_serializable, f, indent=2)

print("\n" + "="*70)
print("‚úÖ TRAINING COMPLETE!")
print("="*70)
print(f"\nüìä Source: {source.upper()}")
print(f"üìà Models: {len(param_grids)} ML models √ó 3 horizons = {len(param_grids)*3} total")
print(f"üéØ All models hyperparameter-tuned with RandomizedSearchCV")
print("\nüìÅ Saved:")
print("  ‚úì models/*.pkl (tuned models)")
print("  ‚úì models/scaler_ml.pkl")
print("  ‚úì models/ml_tuned_results.json")
print("  ‚úì models/feature_importance.csv")

# ============================================================================
# 11. Diagnostic Information
# ============================================================================

print("\n" + "="*70)
print("DIAGNOSTIC INFORMATION")
print("="*70)

print(f"\nData Quality:")
print(f"   Total samples: {len(data)}")
print(f"   Training samples: {len(X_train)}")
print(f"   Test samples: {len(X_test)}")
print(f"   Features used: {len(selected_features)}")

print(f"\nTarget Statistics (24h ahead):")
print(f"   Mean: {y_24h.mean():.2f}")
print(f"   Std: {y_24h.std():.2f}")
print(f"   Min: {y_24h.min():.2f}")
print(f"   Max: {y_24h.max():.2f}")

print(f"\nBest Model Performance:")
best_overall = max(
    [(h, n, m['test_R2']) for h in results for n, m in results[h].items()],
    key=lambda x: x[2]
)
print(f"   {best_overall[1]} ({best_overall[0]}): R¬≤ = {best_overall[2]:.3f}")



ADVANCED AQI PREDICTION - WITH HYPERPARAMETER TUNING (OPTIMIZED)

1. Loading data...

Attempt 1/2: Connecting to MongoDB...
‚úì Connected!
‚úì Loaded 4340 records from MongoDB

‚úì Source: MONGODB
‚úì Records: 4340

2. Engineering features...
   Creating lag features...
   Creating rolling features...
   Creating difference features...
‚úì After engineering: 4340 records, 73 columns

3. Preparing features...
‚úì Initial features: 63
‚úì After removing high-missing features: 63
‚úì Final dataset: 4268 records
‚úì Features: 63
‚úì Samples: 4268

4. Feature selection...
   Top 10 features:
      day_of_year                   : 0.537
      aqi_min_24h                   : 0.392
      aqi_min_12h                   : 0.338
      aqi_pm25                      : 0.329
      pm2_5                         : 0.326
      aqi                           : 0.322
      aqi_min_6h                    : 0.319
      aqi_min_3h                    : 0.309
      aqi_ma_3h                     : 0.305
      aqi_

In [None]:
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "5dd64233-85cd-44fc-b98e-e64e6d0dacff",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "======================================================================\n",
      "ADVANCED AQI PREDICTION - WITH HYPERPARAMETER TUNING\n",
      "======================================================================\n",
      "\n",
      "1. Loading data...\n",
      "\n",
      "Attempt 1/2: Connecting to MongoDB...\n",
      "‚úì Connected!\n",
      "‚úì Loaded 4340 records from MongoDB\n",
      "\n",
      "‚úì Source: MONGODB\n",
      "‚úì Records: 4340\n",
      "\n",
      "2. Engineering features...\n",
      "   Creating lag features...\n",
      "   Creating rolling features...\n",
      "   Creating difference features...\n",
      "‚úì After engineering: 4340 records, 73 columns\n",
      "\n",
      "3. Preparing features...\n",
      "‚úì Initial features: 63\n",
      "‚úì After removing high-missing features: 63\n",
      "‚úì Final dataset: 4268 records\n",
      "‚úì Features: 63\n",
      "‚úì Samples: 4268\n",
      "\n",
      "4. Feature selection...\n",
      "   Top 10 features:\n",
      "      day_of_year                   : 0.526\n",
      "      aqi_min_24h                   : 0.390\n",
      "      aqi_min_12h                   : 0.346\n",
      "      aqi_pm25                      : 0.325\n",
      "      pm2_5                         : 0.323\n",
      "      aqi_min_6h                    : 0.322\n",
      "      aqi                           : 0.314\n",
      "      aqi_min_3h                    : 0.314\n",
      "      aqi_ma_3h                     : 0.307\n",
      "      aqi_max_3h                    : 0.301\n",
      "\n",
      "‚úì Selected 30 best features\n",
      "\n",
      "5. Splitting data (time-series aware)...\n",
      "‚úì Train: 3414, Test: 854\n",
      "\n",
      "6. Setting up hyperparameter grids...\n",
      "‚úì 6 models configured for tuning\n",
      "\n",
      "======================================================================\n",
      "TRAINING MODELS WITH HYPERPARAMETER TUNING\n",
      "======================================================================\n",
      "\n",
      "‚è±Ô∏è  This may take 5-15 minutes depending on your hardware...\n",
      "\n",
      "======================================================================\n",
      "TRAINING FOR 24h AHEAD PREDICTION\n",
      "======================================================================\n",
      "\n",
      "Ridge...\n",
      "   Tuning hyperparameters...\n",
      "   Best params: {'alpha': 100.0, 'solver': 'saga'}\n",
      "   Best CV R¬≤: 0.085\n",
      "\n",
      "   üìä Results:\n",
      "      Test R¬≤:     0.013\n",
      "      Train R¬≤:    0.168\n",
      "      CV R¬≤:       0.085\n",
      "      RMSE:        55.83\n",
      "      MAE:         28.41\n",
      "      MAPE:        24.99%\n",
      "      Acc ¬±20:      51.5%\n",
      "      Acc ¬±10:      23.4%\n",
      "      ‚ö†Ô∏è  VERY LOW R¬≤ - Check data quality\n",
      "      ‚úì  Saved: models/ridge_24h.pkl\n",
      "\n",
      "Lasso...\n",
      "   Tuning hyperparameters...\n",
      "   Best params: {'alpha': 0.1}\n",
      "   Best CV R¬≤: 0.076\n",
      "\n",
      "   üìä Results:\n",
      "      Test R¬≤:     0.009\n",
      "      Train R¬≤:    0.169\n",
      "      CV R¬≤:       0.076\n",
      "      RMSE:        55.94\n",
      "      MAE:         28.71\n",
      "      MAPE:        25.45%\n",
      "      Acc ¬±20:      52.3%\n",
      "      Acc ¬±10:      22.8%\n",
      "      ‚ö†Ô∏è  VERY LOW R¬≤ - Check data quality\n",
      "      ‚úì  Saved: models/lasso_24h.pkl\n",
      "\n",
      "Random Forest...\n",
      "   Tuning hyperparameters...\n",
      "   Best params: {'max_depth': 5, 'min_samples_leaf': 8, 'min_samples_split': 20, 'n_estimators': 100}\n",
      "   Best CV R¬≤: 0.042\n",
      "\n",
      "   üìä Results:\n",
      "      Test R¬≤:    -0.009\n",
      "      Train R¬≤:    0.280\n",
      "      CV R¬≤:       0.042\n",
      "      RMSE:        56.44\n",
      "      MAE:         29.41\n",
      "      MAPE:        26.11%\n",
      "      Acc ¬±20:      49.2%\n",
      "      Acc ¬±10:      27.3%\n",
      "      ‚ö†Ô∏è  OVERFITTING (gap: 0.289)\n",
      "      ‚úì  Saved: models/random_forest_24h.pkl\n",
      "\n",
      "Gradient Boosting...\n",
      "   Tuning hyperparameters...\n",
      "   Best params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}\n",
      "   Best CV R¬≤: 0.040\n",
      "\n",
      "   üìä Results:\n",
      "      Test R¬≤:     0.035\n",
      "      Train R¬≤:    0.128\n",
      "      CV R¬≤:       0.040\n",
      "      RMSE:        55.20\n",
      "      MAE:         27.74\n",
      "      MAPE:        24.99%\n",
      "      Acc ¬±20:      50.5%\n",
      "      Acc ¬±10:      22.7%\n",
      "      ‚ö†Ô∏è  VERY LOW R¬≤ - Check data quality\n",
      "      ‚úì  Saved: models/gradient_boosting_24h.pkl\n",
      "\n",
      "XGBoost...\n",
      "   Tuning hyperparameters...\n",
      "   Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.8}\n",
      "   Best CV R¬≤: 0.066\n",
      "\n",
      "   üìä Results:\n",
      "      Test R¬≤:     0.036\n",
      "      Train R¬≤:    0.182\n",
      "      CV R¬≤:       0.066\n",
      "      RMSE:        55.16\n",
      "      MAE:         28.07\n",
      "      MAPE:        25.14%\n",
      "      Acc ¬±20:      49.9%\n",
      "      Acc ¬±10:      24.8%\n",
      "      ‚ö†Ô∏è  VERY LOW R¬≤ - Check data quality\n",
      "      ‚úì  Saved: models/xgboost_24h.pkl\n",
      "\n",
      "LightGBM...\n",
      "   Tuning hyperparameters...\n",
      "   Best params: {'learning_rate': 0.01, 'max_depth': 7, 'min_child_samples': 30, 'n_estimators': 200, 'num_leaves': 31}\n",
      "   Best CV R¬≤: 0.052\n",
      "\n",
      "   üìä Results:\n",
      "      Test R¬≤:    -0.013\n",
      "      Train R¬≤:    0.384\n",
      "      CV R¬≤:       0.052\n",
      "      RMSE:        56.54\n",
      "      MAE:         29.62\n",
      "      MAPE:        26.16%\n",
      "      Acc ¬±20:      49.1%\n",
      "      Acc ¬±10:      25.3%\n",
      "      ‚ö†Ô∏è  OVERFITTING (gap: 0.396)\n",
      "      ‚úì  Saved: models/lightgbm_24h.pkl\n",
      "\n",
      "======================================================================\n",
      "TRAINING FOR 48h AHEAD PREDICTION\n",
      "======================================================================\n",
      "\n",
      "Ridge...\n",
      "   Tuning hyperparameters...\n",
      "   Best params: {'alpha': 100.0, 'solver': 'saga'}\n",
      "   Best CV R¬≤: -0.009\n",
      "\n",
      "   üìä Results:\n",
      "      Test R¬≤:     0.022\n",
      "      Train R¬≤:    0.115\n",
      "      CV R¬≤:      -0.009\n",
      "      RMSE:        58.69\n",
      "      MAE:         27.92\n",
      "      MAPE:        24.16%\n",
      "      Acc ¬±20:      51.3%\n",
      "      Acc ¬±10:      27.5%\n",
      "      ‚ö†Ô∏è  VERY LOW R¬≤ - Check data quality\n",
      "      ‚úì  Saved: models/ridge_48h.pkl\n",
      "\n",
      "Lasso...\n",
      "   Tuning hyperparameters...\n",
      "   Best params: {'alpha': 1.0}\n",
      "   Best CV R¬≤: 0.008\n",
      "\n",
      "   üìä Results:\n",
      "      Test R¬≤:     0.032\n",
      "      Train R¬≤:    0.104\n",
      "      CV R¬≤:       0.008\n",
      "      RMSE:        58.40\n",
      "      MAE:         27.86\n",
      "      MAPE:        24.30%\n",
      "      Acc ¬±20:      49.4%\n",
      "      Acc ¬±10:      27.4%\n",
      "      ‚ö†Ô∏è  VERY LOW R¬≤ - Check data quality\n",
      "      ‚úì  Saved: models/lasso_48h.pkl\n",
      "\n",
      "Random Forest...\n",
      "   Tuning hyperparameters...\n",
      "   Best params: {'max_depth': 5, 'min_samples_leaf': 8, 'min_samples_split': 10, 'n_estimators': 200}\n",
      "   Best CV R¬≤: -0.144\n",
      "\n",
      "   üìä Results:\n",
      "      Test R¬≤:     0.033\n",
      "      Train R¬≤:    0.286\n",
      "      CV R¬≤:      -0.144\n",
      "      RMSE:        58.37\n",
      "      MAE:         27.82\n",
      "      MAPE:        23.99%\n",
      "      Acc ¬±20:      50.9%\n",
      "      Acc ¬±10:      26.5%\n",
      "      ‚ö†Ô∏è  OVERFITTING (gap: 0.254)\n",
      "      ‚úì  Saved: models/random_forest_48h.pkl\n",
      "\n",
      "Gradient Boosting...\n",
      "   Tuning hyperparameters...\n",
      "   Best params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}\n",
      "   Best CV R¬≤: -0.014\n",
      "\n",
      "   üìä Results:\n",
      "      Test R¬≤:     0.018\n",
      "      Train R¬≤:    0.104\n",
      "      CV R¬≤:      -0.014\n",
      "      RMSE:        58.79\n",
      "      MAE:         28.06\n",
      "      MAPE:        24.40%\n",
      "      Acc ¬±20:      50.4%\n",
      "      Acc ¬±10:      27.9%\n",
      "      ‚ö†Ô∏è  VERY LOW R¬≤ - Check data quality\n",
      "      ‚úì  Saved: models/gradient_boosting_48h.pkl\n",
      "\n",
      "XGBoost...\n",
      "   Tuning hyperparameters...\n",
      "   Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 50, 'subsample': 0.8}\n",
      "   Best CV R¬≤: -0.005\n",
      "\n",
      "   üìä Results:\n",
      "      Test R¬≤:     0.025\n",
      "      Train R¬≤:    0.178\n",
      "      CV R¬≤:      -0.005\n",
      "      RMSE:        58.60\n",
      "      MAE:         28.04\n",
      "      MAPE:        24.55%\n",
      "      Acc ¬±20:      50.9%\n",
      "      Acc ¬±10:      28.1%\n",
      "      ‚ö†Ô∏è  VERY LOW R¬≤ - Check data quality\n",
      "      ‚úì  Saved: models/xgboost_48h.pkl\n",
      "\n",
      "LightGBM...\n",
      "   Tuning hyperparameters...\n",
      "   Best params: {'learning_rate': 0.01, 'max_depth': 3, 'min_child_samples': 10, 'n_estimators': 100, 'num_leaves': 15}\n",
      "   Best CV R¬≤: -0.025\n",
      "\n",
      "   üìä Results:\n",
      "      Test R¬≤:     0.025\n",
      "      Train R¬≤:    0.149\n",
      "      CV R¬≤:      -0.025\n",
      "      RMSE:        58.59\n",
      "      MAE:         27.60\n",
      "      MAPE:        23.48%\n",
      "      Acc ¬±20:      51.3%\n",
      "      Acc ¬±10:      28.6%\n",
      "      ‚ö†Ô∏è  VERY LOW R¬≤ - Check data quality\n",
      "      ‚úì  Saved: models/lightgbm_48h.pkl\n",
      "\n",
      "======================================================================\n",
      "TRAINING FOR 72h AHEAD PREDICTION\n",
      "======================================================================\n",
      "\n",
      "Ridge...\n",
      "   Tuning hyperparameters...\n",
      "   Best params: {'alpha': 100.0, 'solver': 'saga'}\n",
      "   Best CV R¬≤: -0.120\n",
      "\n",
      "   üìä Results:\n",
      "      Test R¬≤:     0.025\n",
      "      Train R¬≤:    0.098\n",
      "      CV R¬≤:      -0.120\n",
      "      RMSE:        58.61\n",
      "      MAE:         27.49\n",
      "      MAPE:        23.61%\n",
      "      Acc ¬±20:      54.1%\n",
      "      Acc ¬±10:      30.0%\n",
      "      ‚ö†Ô∏è  VERY LOW R¬≤ - Check data quality\n",
      "      ‚úì  Saved: models/ridge_72h.pkl\n",
      "\n",
      "Lasso...\n",
      "   Tuning hyperparameters...\n",
      "   Best params: {'alpha': 1.0}\n",
      "   Best CV R¬≤: -0.046\n",
      "\n",
      "   üìä Results:\n",
      "      Test R¬≤:     0.033\n",
      "      Train R¬≤:    0.083\n",
      "      CV R¬≤:      -0.046\n",
      "      RMSE:        58.36\n",
      "      MAE:         27.41\n",
      "      MAPE:        23.87%\n",
      "      Acc ¬±20:      51.3%\n",
      "      Acc ¬±10:      27.5%\n",
      "      ‚ö†Ô∏è  VERY LOW R¬≤ - Check data quality\n",
      "      ‚úì  Saved: models/lasso_72h.pkl\n",
      "\n",
      "Random Forest...\n",
      "   Tuning hyperparameters...\n",
      "   Best params: {'max_depth': 5, 'min_samples_leaf': 8, 'min_samples_split': 10, 'n_estimators': 200}\n",
      "   Best CV R¬≤: -0.291\n",
      "\n",
      "   üìä Results:\n",
      "      Test R¬≤:     0.015\n",
      "      Train R¬≤:    0.291\n",
      "      CV R¬≤:      -0.291\n",
      "      RMSE:        58.92\n",
      "      MAE:         27.45\n",
      "      MAPE:        23.53%\n",
      "      Acc ¬±20:      55.4%\n",
      "      Acc ¬±10:      28.5%\n",
      "      ‚ö†Ô∏è  OVERFITTING (gap: 0.277)\n",
      "      ‚úì  Saved: models/random_forest_72h.pkl\n",
      "\n",
      "Gradient Boosting...\n",
      "   Tuning hyperparameters...\n",
      "   Best params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}\n",
      "   Best CV R¬≤: -0.258\n",
      "\n",
      "   üìä Results:\n",
      "      Test R¬≤:    -0.002\n",
      "      Train R¬≤:    0.106\n",
      "      CV R¬≤:      -0.258\n",
      "      RMSE:        59.40\n",
      "      MAE:         28.39\n",
      "      MAPE:        24.68%\n",
      "      Acc ¬±20:      52.7%\n",
      "      Acc ¬±10:      27.8%\n",
      "      ‚ö†Ô∏è  NEGATIVE R¬≤ - Model performs worse than baseline!\n",
      "      ‚úì  Saved: models/gradient_boosting_72h.pkl\n",
      "\n",
      "XGBoost...\n",
      "   Tuning hyperparameters...\n",
      "   Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 7, 'min_child_weight': 5, 'n_estimators': 50, 'subsample': 0.8}\n",
      "   Best CV R¬≤: -0.017\n",
      "\n",
      "   üìä Results:\n",
      "      Test R¬≤:     0.012\n",
      "      Train R¬≤:    0.239\n",
      "      CV R¬≤:      -0.017\n",
      "      RMSE:        59.01\n",
      "      MAE:         28.17\n",
      "      MAPE:        24.68%\n",
      "      Acc ¬±20:      52.1%\n",
      "      Acc ¬±10:      28.0%\n",
      "      ‚ö†Ô∏è  OVERFITTING (gap: 0.227)\n",
      "      ‚úì  Saved: models/xgboost_72h.pkl\n",
      "\n",
      "LightGBM...\n",
      "   Tuning hyperparameters...\n",
      "   Best params: {'learning_rate': 0.01, 'max_depth': 3, 'min_child_samples': 10, 'n_estimators': 50, 'num_leaves': 15}\n",
      "   Best CV R¬≤: -0.038\n",
      "\n",
      "   üìä Results:\n",
      "      Test R¬≤:    -0.009\n",
      "      Train R¬≤:    0.100\n",
      "      CV R¬≤:      -0.038\n",
      "      RMSE:        59.63\n",
      "      MAE:         28.37\n",
      "      MAPE:        24.40%\n",
      "      Acc ¬±20:      53.9%\n",
      "      Acc ¬±10:      28.7%\n",
      "      ‚ö†Ô∏è  NEGATIVE R¬≤ - Model performs worse than baseline!\n",
      "      ‚úì  Saved: models/lightgbm_72h.pkl\n",
      "\n",
      "======================================================================\n",
      "FEATURE IMPORTANCE ANALYSIS\n",
      "======================================================================\n",
      "\n",
      "Best 24h model: XGBoost (R¬≤ = 0.036)\n",
      "\n",
      "Top 15 Most Important Features:\n",
      "   aqi                           : 0.1379\n",
      "   pm2_5                         : 0.1253\n",
      "   aqi_pm25                      : 0.0728\n",
      "   aqi_lag_3h                    : 0.0537\n",
      "   aqi_min_6h                    : 0.0520\n",
      "   aqi_ma_3h                     : 0.0468\n",
      "   aqi_std_24h                   : 0.0446\n",
      "   pm25_lag_1h                   : 0.0367\n",
      "   aqi_lag_1h                    : 0.0362\n",
      "   aqi_min_3h                    : 0.0343\n",
      "   aqi_max_12h                   : 0.0310\n",
      "   pres                          : 0.0305\n",
      "   aqi_ma_12h                    : 0.0296\n",
      "   aqi_min_24h                   : 0.0296\n",
      "   day_of_year                   : 0.0290\n",
      "\n",
      "‚úì Feature importance saved to: models/feature_importance.csv\n",
      "\n",
      "======================================================================\n",
      "RESULTS SUMMARY\n",
      "======================================================================\n",
      "\n",
      "24h Ahead:\n",
      "----------------------------------------------------------------------\n",
      "Ridge             : R¬≤=  0.013  RMSE= 55.83  MAE= 28.41  Acc¬±20= 51.5%\n",
      "Lasso             : R¬≤=  0.009  RMSE= 55.94  MAE= 28.71  Acc¬±20= 52.3%\n",
      "Random Forest     : R¬≤= -0.009  RMSE= 56.44  MAE= 29.41  Acc¬±20= 49.2%\n",
      "Gradient Boosting : R¬≤=  0.035  RMSE= 55.20  MAE= 27.74  Acc¬±20= 50.5%\n",
      "XGBoost           : R¬≤=  0.036  RMSE= 55.16  MAE= 28.07  Acc¬±20= 49.9% ‚òÖ BEST\n",
      "LightGBM          : R¬≤= -0.013  RMSE= 56.54  MAE= 29.62  Acc¬±20= 49.1%\n",
      "\n",
      "48h Ahead:\n",
      "----------------------------------------------------------------------\n",
      "Ridge             : R¬≤=  0.022  RMSE= 58.69  MAE= 27.92  Acc¬±20= 51.3%\n",
      "Lasso             : R¬≤=  0.032  RMSE= 58.40  MAE= 27.86  Acc¬±20= 49.4%\n",
      "Random Forest     : R¬≤=  0.033  RMSE= 58.37  MAE= 27.82  Acc¬±20= 50.9% ‚òÖ BEST\n",
      "Gradient Boosting : R¬≤=  0.018  RMSE= 58.79  MAE= 28.06  Acc¬±20= 50.4%\n",
      "XGBoost           : R¬≤=  0.025  RMSE= 58.60  MAE= 28.04  Acc¬±20= 50.9%\n",
      "LightGBM          : R¬≤=  0.025  RMSE= 58.59  MAE= 27.60  Acc¬±20= 51.3%\n",
      "\n",
      "72h Ahead:\n",
      "----------------------------------------------------------------------\n",
      "Ridge             : R¬≤=  0.025  RMSE= 58.61  MAE= 27.49  Acc¬±20= 54.1%\n",
      "Lasso             : R¬≤=  0.033  RMSE= 58.36  MAE= 27.41  Acc¬±20= 51.3% ‚òÖ BEST\n",
      "Random Forest     : R¬≤=  0.015  RMSE= 58.92  MAE= 27.45  Acc¬±20= 55.4%\n",
      "Gradient Boosting : R¬≤= -0.002  RMSE= 59.40  MAE= 28.39  Acc¬±20= 52.7%\n",
      "XGBoost           : R¬≤=  0.012  RMSE= 59.01  MAE= 28.17  Acc¬±20= 52.1%\n",
      "LightGBM          : R¬≤= -0.009  RMSE= 59.63  MAE= 28.37  Acc¬±20= 53.9%\n",
      "\n",
      "======================================================================\n",
      "‚úÖ TRAINING COMPLETE!\n",
      "======================================================================\n",
      "\n",
      "üìä Source: MONGODB\n",
      "üìà Models: 6 ML models √ó 3 horizons = 18 total\n",
      "üéØ All models hyperparameter-tuned with GridSearchCV\n",
      "\n",
      "üìÅ Saved:\n",
      "  ‚úì models/*.pkl (tuned models)\n",
      "  ‚úì models/scaler_ml.pkl\n",
      "  ‚úì models/ml_tuned_results.json\n",
      "  ‚úì models/feature_importance.csv\n",
      "\n",
      "======================================================================\n",
      "DIAGNOSTIC INFORMATION\n",
      "======================================================================\n",
      "\n",
      "Data Quality:\n",
      "   Total samples: 4268\n",
      "   Training samples: 3414\n",
      "   Test samples: 854\n",
      "   Features used: 30\n",
      "\n",
      "Target Statistics (24h ahead):\n",
      "   Mean: 101.86\n",
      "   Std: 55.66\n",
      "   Min: 29.00\n",
      "   Max: 500.00\n",
      "\n",
      "Best Model Performance:\n",
      "   XGBoost (24h): R¬≤ = 0.036\n"
     ]
    }
   ],
   "source": [
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from datetime import datetime\n",
    "import pickle\n",
    "import json\n",
    "import os\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, cross_val_score\n",
    "from sklearn.preprocessing import StandardScaler, RobustScaler\n",
    "from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n",
    "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor\n",
    "from sklearn.linear_model import Ridge, Lasso, ElasticNet\n",
    "from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression\n",
    "import xgboost as xgb\n",
    "import lightgbm as lgb\n",
    "\n",
    "print(\"=\"*70)\n",
    "print(\"ADVANCED AQI PREDICTION - WITH HYPERPARAMETER TUNING\")\n",
    "print(\"=\"*70)\n",
    "\n",
    "os.makedirs('models', exist_ok=True)\n",
    "\n",
    "# ============================================================================\n",
    "# 1. Load Data\n",
    "# ============================================================================\n",
    "\n",
    "def load_from_mongodb(uri, max_attempts=2):\n",
    "    \"\"\"Try MongoDB\"\"\"\n",
    "    from pymongo import MongoClient\n",
    "    from pymongo.server_api import ServerApi\n",
    "    \n",
    "    for attempt in range(max_attempts):\n",
    "        try:\n",
    "            print(f\"\\nAttempt {attempt + 1}/{max_attempts}: Connecting to MongoDB...\")\n",
    "            client = MongoClient(uri, server_api=ServerApi('1'),\n",
    "                               serverSelectionTimeoutMS=5000, connectTimeoutMS=5000)\n",
    "            client.admin.command('ping')\n",
    "            print(\"‚úì Connected!\")\n",
    "            \n",
    "            db = client['aqi_feature_store']\n",
    "            collection = db['aqi_features']\n",
    "            data = pd.DataFrame(list(collection.find({}, {\"_id\": 0})))\n",
    "            client.close()\n",
    "            \n",
    "            print(f\"‚úì Loaded {len(data)} records from MongoDB\")\n",
    "            return data, 'mongodb'\n",
    "        except Exception as e:\n",
    "            print(f\"‚úó Failed: {str(e)[:80]}\")\n",
    "    return None, None\n",
    "\n",
    "def load_from_csv(csv_path):\n",
    "    \"\"\"Load from CSV\"\"\"\n",
    "    try:\n",
    "        print(f\"\\nLoading from CSV: {csv_path}\")\n",
    "        data = pd.read_csv(csv_path)\n",
    "        print(f\"‚úì Loaded {len(data)} records\")\n",
    "        return data, 'csv'\n",
    "    except:\n",
    "        return None, None\n",
    "\n",
    "print(\"\\n1. Loading data...\")\n",
    "\n",
    "MONGO_URI = \"mongodb+srv://nawababbas08_db_user:2Ja4OGlDdKfG6EvZ@cluster0.jnxn95g.mongodb.net/?retryWrites=true&w=majority&tlsAllowInvalidCertificates=true\"\n",
    "CSV_PATH = \"data/cleaned_aqi_data_v2.csv\"\n",
    "\n",
    "data, source = load_from_mongodb(MONGO_URI, 2)\n",
    "if data is None:\n",
    "    print(\"\\n‚ö†Ô∏è MongoDB failed, using CSV...\")\n",
    "    data, source = load_from_csv(CSV_PATH)\n",
    "\n",
    "if data is None:\n",
    "    print(\"\\n‚úó ERROR: No data source available\")\n",
    "    exit(1)\n",
    "\n",
    "print(f\"\\n‚úì Source: {source.upper()}\")\n",
    "print(f\"‚úì Records: {len(data)}\")\n",
    "\n",
    "# ============================================================================\n",
    "# 2. Enhanced Feature Engineering\n",
    "# ============================================================================\n",
    "\n",
    "print(\"\\n2. Engineering features...\")\n",
    "\n",
    "if 'time' in data.columns:\n",
    "    data['time'] = pd.to_datetime(data['time'])\n",
    "    data = data.sort_values('time').reset_index(drop=True)\n",
    "elif 'timestamp' in data.columns:\n",
    "    data['timestamp'] = pd.to_datetime(data['timestamp'])\n",
    "    data = data.sort_values('timestamp').reset_index(drop=True)\n",
    "\n",
    "# More comprehensive lag features\n",
    "print(\"   Creating lag features...\")\n",
    "for lag in [1, 2, 3, 6, 12, 24, 48]:\n",
    "    if 'aqi' in data.columns:\n",
    "        data[f'aqi_lag_{lag}h'] = data['aqi'].shift(lag)\n",
    "    if 'pm2_5' in data.columns:\n",
    "        data[f'pm25_lag_{lag}h'] = data['pm2_5'].shift(lag)\n",
    "\n",
    "# Rolling statistics (mean, std, min, max)\n",
    "print(\"   Creating rolling features...\")\n",
    "for window in [3, 6, 12, 24]:\n",
    "    if 'aqi' in data.columns:\n",
    "        data[f'aqi_ma_{window}h'] = data['aqi'].rolling(window=window, min_periods=1).mean()\n",
    "        data[f'aqi_std_{window}h'] = data['aqi'].rolling(window=window, min_periods=1).std()\n",
    "        data[f'aqi_min_{window}h'] = data['aqi'].rolling(window=window, min_periods=1).min()\n",
    "        data[f'aqi_max_{window}h'] = data['aqi'].rolling(window=window, min_periods=1).max()\n",
    "\n",
    "# Difference features (trend detection)\n",
    "print(\"   Creating difference features...\")\n",
    "if 'aqi' in data.columns:\n",
    "    data['aqi_diff_1h'] = data['aqi'].diff(1)\n",
    "    data['aqi_diff_3h'] = data['aqi'].diff(3)\n",
    "    data['aqi_diff_24h'] = data['aqi'].diff(24)\n",
    "\n",
    "# Cyclical features (better encoding)\n",
    "if 'hour' in data.columns:\n",
    "    data['hour_sin'] = np.sin(2 * np.pi * data['hour'] / 24)\n",
    "    data['hour_cos'] = np.cos(2 * np.pi * data['hour'] / 24)\n",
    "\n",
    "if 'day_of_week' in data.columns:\n",
    "    data['dow_sin'] = np.sin(2 * np.pi * data['day_of_week'] / 7)\n",
    "    data['dow_cos'] = np.cos(2 * np.pi * data['day_of_week'] / 7)\n",
    "\n",
    "if 'month' in data.columns:\n",
    "    data['month_sin'] = np.sin(2 * np.pi * data['month'] / 12)\n",
    "    data['month_cos'] = np.cos(2 * np.pi * data['month'] / 12)\n",
    "\n",
    "# Targets\n",
    "data['aqi_24h'] = data['aqi'].shift(-24)\n",
    "data['aqi_48h'] = data['aqi'].shift(-48)\n",
    "data['aqi_72h'] = data['aqi'].shift(-72)\n",
    "\n",
    "# Remove rows with all NaN\n",
    "data = data.dropna(axis=1, how='all')\n",
    "\n",
    "print(f\"‚úì After engineering: {data.shape[0]} records, {data.shape[1]} columns\")\n",
    "\n",
    "# ============================================================================\n",
    "# 3. Prepare Data with Better Filtering\n",
    "# ============================================================================\n",
    "\n",
    "print(\"\\n3. Preparing features...\")\n",
    "\n",
    "# Exclude target columns and categorical/string columns\n",
    "exclude_cols = ['time', 'timestamp', 'aqi_24h', 'aqi_48h', 'aqi_72h',\n",
    "                'dominant_pollutant', 'aqi_category', 'aqi_color', 'time_of_day',\n",
    "                'season', 'weather_condition', 'day_of_week', 'day_of_month',\n",
    "                'is_weekend']\n",
    "\n",
    "# Get only numeric columns for features\n",
    "feature_cols = [col for col in data.columns if col not in exclude_cols]\n",
    "numeric_cols = data[feature_cols].select_dtypes(include=[np.number]).columns.tolist()\n",
    "feature_cols = numeric_cols\n",
    "\n",
    "print(f\"‚úì Initial features: {len(feature_cols)}\")\n",
    "\n",
    "# Remove features with too many missing values\n",
    "missing_threshold = 0.3\n",
    "for col in feature_cols[:]:\n",
    "    missing_pct = data[col].isnull().sum() / len(data)\n",
    "    if missing_pct > missing_threshold:\n",
    "        feature_cols.remove(col)\n",
    "        print(f\"   Removed {col} (missing: {missing_pct*100:.1f}%)\")\n",
    "\n",
    "print(f\"‚úì After removing high-missing features: {len(feature_cols)}\")\n",
    "\n",
    "# Handle remaining missing values\n",
    "data[feature_cols] = data[feature_cols].fillna(data[feature_cols].mean())\n",
    "\n",
    "# Remove rows where target is missing\n",
    "data = data.dropna(subset=['aqi_24h', 'aqi_48h', 'aqi_72h'])\n",
    "\n",
    "print(f\"‚úì Final dataset: {len(data)} records\")\n",
    "\n",
    "X = data[feature_cols]\n",
    "y_24h = data['aqi_24h']\n",
    "y_48h = data['aqi_48h']\n",
    "y_72h = data['aqi_72h']\n",
    "\n",
    "# Remove any remaining NaN\n",
    "X = X.fillna(X.mean())\n",
    "\n",
    "print(f\"‚úì Features: {len(feature_cols)}\")\n",
    "print(f\"‚úì Samples: {len(X)}\")\n",
    "\n",
    "# ============================================================================\n",
    "# 4. Feature Selection\n",
    "# ============================================================================\n",
    "\n",
    "print(\"\\n4. Feature selection...\")\n",
    "\n",
    "def select_best_features(X, y, k=30):\n",
    "    \"\"\"Select top K most important features\"\"\"\n",
    "    if len(X.columns) <= k:\n",
    "        return X.columns.tolist()\n",
    "    \n",
    "    # Use mutual information for feature selection\n",
    "    selector = SelectKBest(score_func=mutual_info_regression, k=k)\n",
    "    selector.fit(X, y)\n",
    "    \n",
    "    # Get selected feature names\n",
    "    selected_features = X.columns[selector.get_support()].tolist()\n",
    "    \n",
    "    # Get scores\n",
    "    scores = selector.scores_\n",
    "    feature_scores = list(zip(X.columns, scores))\n",
    "    feature_scores.sort(key=lambda x: x[1], reverse=True)\n",
    "    \n",
    "    print(f\"   Top 10 features:\")\n",
    "    for feat, score in feature_scores[:10]:\n",
    "        print(f\"      {feat:30s}: {score:.3f}\")\n",
    "    \n",
    "    return selected_features\n",
    "\n",
    "# Select features for 24h prediction\n",
    "selected_features = select_best_features(X, y_24h, k=min(30, len(X.columns)))\n",
    "X = X[selected_features]\n",
    "\n",
    "print(f\"\\n‚úì Selected {len(selected_features)} best features\")\n",
    "\n",
    "# ============================================================================\n",
    "# 5. Time Series Split (Better for Time Series!)\n",
    "# ============================================================================\n",
    "\n",
    "print(\"\\n5. Splitting data (time-series aware)...\")\n",
    "\n",
    "# Use 80-20 split but maintain time order\n",
    "split_idx = int(len(X) * 0.8)\n",
    "X_train, X_test = X[:split_idx], X[split_idx:]\n",
    "y_24h_train, y_24h_test = y_24h[:split_idx], y_24h[split_idx:]\n",
    "y_48h_train, y_48h_test = y_48h[:split_idx], y_48h[split_idx:]\n",
    "y_72h_train, y_72h_test = y_72h[:split_idx], y_72h[split_idx:]\n",
    "\n",
    "# Use RobustScaler (better for outliers)\n",
    "scaler = RobustScaler()\n",
    "X_train_scaled = scaler.fit_transform(X_train)\n",
    "X_test_scaled = scaler.transform(X_test)\n",
    "\n",
    "with open('models/scaler_ml.pkl', 'wb') as f:\n",
    "    pickle.dump(scaler, f)\n",
    "\n",
    "print(f\"‚úì Train: {len(X_train)}, Test: {len(X_test)}\")\n",
    "\n",
    "# ============================================================================\n",
    "# 6. Define Model Hyperparameter Grids\n",
    "# ============================================================================\n",
    "\n",
    "print(\"\\n6. Setting up hyperparameter grids...\")\n",
    "\n",
    "# Simplified grids for faster training\n",
    "param_grids = {\n",
    "    'Ridge': {\n",
    "        'model': Ridge(),\n",
    "        'params': {\n",
    "            'alpha': [0.1, 1.0, 10.0, 100.0],\n",
    "            'solver': ['auto', 'svd', 'saga']\n",
    "        }\n",
    "    },\n",
    "    'Lasso': {\n",
    "        'model': Lasso(max_iter=5000),\n",
    "        'params': {\n",
    "            'alpha': [0.01, 0.1, 1.0, 10.0]\n",
    "        }\n",
    "    },\n",
    "    'Random Forest': {\n",
    "        'model': RandomForestRegressor(random_state=42, n_jobs=-1),\n",
    "        'params': {\n",
    "            'n_estimators': [50, 100, 200],\n",
    "            'max_depth': [5, 10, 15, None],\n",
    "            'min_samples_split': [5, 10, 20],\n",
    "            'min_samples_leaf': [2, 4, 8]\n",
    "        }\n",
    "    },\n",
    "    'Gradient Boosting': {\n",
    "        'model': GradientBoostingRegressor(random_state=42),\n",
    "        'params': {\n",
    "            'n_estimators': [50, 100, 200],\n",
    "            'learning_rate': [0.01, 0.05, 0.1],\n",
    "            'max_depth': [3, 5, 7],\n",
    "            'subsample': [0.8, 1.0]\n",
    "        }\n",
    "    },\n",
    "    'XGBoost': {\n",
    "        'model': xgb.XGBRegressor(random_state=42, n_jobs=-1),\n",
    "        'params': {\n",
    "            'n_estimators': [50, 100, 200],\n",
    "            'learning_rate': [0.01, 0.05, 0.1],\n",
    "            'max_depth': [3, 5, 7],\n",
    "            'min_child_weight': [1, 3, 5],\n",
    "            'subsample': [0.8, 1.0],\n",
    "            'colsample_bytree': [0.8, 1.0]\n",
    "        }\n",
    "    },\n",
    "    'LightGBM': {\n",
    "        'model': lgb.LGBMRegressor(random_state=42, verbose=-1, n_jobs=-1),\n",
    "        'params': {\n",
    "            'n_estimators': [50, 100, 200],\n",
    "            'learning_rate': [0.01, 0.05, 0.1],\n",
    "            'max_depth': [3, 5, 7],\n",
    "            'num_leaves': [15, 31, 63],\n",
    "            'min_child_samples': [10, 20, 30]\n",
    "        }\n",
    "    }\n",
    "}\n",
    "\n",
    "print(f\"‚úì {len(param_grids)} models configured for tuning\")\n",
    "\n",
    "# ============================================================================\n",
    "# 7. Evaluation Function\n",
    "# ============================================================================\n",
    "\n",
    "def evaluate(y_true, y_pred):\n",
    "    \"\"\"Comprehensive evaluation metrics\"\"\"\n",
    "    rmse = np.sqrt(mean_squared_error(y_true, y_pred))\n",
    "    mae = mean_absolute_error(y_true, y_pred)\n",
    "    r2 = r2_score(y_true, y_pred)\n",
    "    \n",
    "    # Accuracy within thresholds\n",
    "    acc_20 = np.sum(np.abs(y_true - y_pred) <= 20) / len(y_true) * 100\n",
    "    acc_10 = np.sum(np.abs(y_true - y_pred) <= 10) / len(y_true) * 100\n",
    "    acc_5 = np.sum(np.abs(y_true - y_pred) <= 5) / len(y_true) * 100\n",
    "    \n",
    "    # MAPE (Mean Absolute Percentage Error)\n",
    "    mape = np.mean(np.abs((y_true - y_pred) / (y_true + 1e-10))) * 100\n",
    "    \n",
    "    return {\n",
    "        'RMSE': rmse,\n",
    "        'MAE': mae,\n",
    "        'R2': r2,\n",
    "        'MAPE': mape,\n",
    "        'Acc20': acc_20,\n",
    "        'Acc10': acc_10,\n",
    "        'Acc5': acc_5\n",
    "    }\n",
    "\n",
    "# ============================================================================\n",
    "# 8. Training with GridSearch\n",
    "# ============================================================================\n",
    "\n",
    "print(\"\\n\" + \"=\"*70)\n",
    "print(\"TRAINING MODELS WITH HYPERPARAMETER TUNING\")\n",
    "print(\"=\"*70)\n",
    "print(\"\\n‚è±Ô∏è  This may take 5-15 minutes depending on your hardware...\")\n",
    "\n",
    "results = {}\n",
    "\n",
    "# Time series cross-validation\n",
    "tscv = TimeSeriesSplit(n_splits=3)\n",
    "\n",
    "for horizon, y_train, y_test in [\n",
    "    ('24h', y_24h_train, y_24h_test),\n",
    "    ('48h', y_48h_train, y_48h_test),\n",
    "    ('72h', y_72h_train, y_72h_test)\n",
    "]:\n",
    "    print(f\"\\n{'='*70}\")\n",
    "    print(f\"TRAINING FOR {horizon} AHEAD PREDICTION\")\n",
    "    print('='*70)\n",
    "    \n",
    "    results[horizon] = {}\n",
    "    \n",
    "    for name, config in param_grids.items():\n",
    "        print(f\"\\n{name}...\")\n",
    "        print(f\"   Tuning hyperparameters...\")\n",
    "        \n",
    "        # GridSearch with time series CV\n",
    "        grid_search = GridSearchCV(\n",
    "            estimator=config['model'],\n",
    "            param_grid=config['params'],\n",
    "            cv=tscv,\n",
    "            scoring='r2',\n",
    "            n_jobs=-1,\n",
    "            verbose=0\n",
    "        )\n",
    "        \n",
    "        # Fit grid search\n",
    "        grid_search.fit(X_train_scaled, y_train)\n",
    "        \n",
    "        # Best model\n",
    "        best_model = grid_search.best_estimator_\n",
    "        \n",
    "        print(f\"   Best params: {grid_search.best_params_}\")\n",
    "        print(f\"   Best CV R¬≤: {grid_search.best_score_:.3f}\")\n",
    "        \n",
    "        # Predictions\n",
    "        y_pred_train = best_model.predict(X_train_scaled)\n",
    "        y_pred_test = best_model.predict(X_test_scaled)\n",
    "        \n",
    "        # Evaluate\n",
    "        train_metrics = evaluate(y_train, y_pred_train)\n",
    "        test_metrics = evaluate(y_test, y_pred_test)\n",
    "        \n",
    "        # Store results\n",
    "        results[horizon][name] = {\n",
    "            'test_R2': test_metrics['R2'],\n",
    "            'test_RMSE': test_metrics['RMSE'],\n",
    "            'test_MAE': test_metrics['MAE'],\n",
    "            'test_MAPE': test_metrics['MAPE'],\n",
    "            'test_Acc20': test_metrics['Acc20'],\n",
    "            'test_Acc10': test_metrics['Acc10'],\n",
    "            'test_Acc5': test_metrics['Acc5'],\n",
    "            'train_R2': train_metrics['R2'],\n",
    "            'cv_R2': grid_search.best_score_,\n",
    "            'best_params': grid_search.best_params_\n",
    "        }\n",
    "        \n",
    "        # Display metrics\n",
    "        print(f\"\\n   üìä Results:\")\n",
    "        print(f\"      Test R¬≤:    {test_metrics['R2']:6.3f}\")\n",
    "        print(f\"      Train R¬≤:   {train_metrics['R2']:6.3f}\")\n",
    "        print(f\"      CV R¬≤:      {grid_search.best_score_:6.3f}\")\n",
    "        print(f\"      RMSE:       {test_metrics['RMSE']:6.2f}\")\n",
    "        print(f\"      MAE:        {test_metrics['MAE']:6.2f}\")\n",
    "        print(f\"      MAPE:       {test_metrics['MAPE']:6.2f}%\")\n",
    "        print(f\"      Acc ¬±20:    {test_metrics['Acc20']:6.1f}%\")\n",
    "        print(f\"      Acc ¬±10:    {test_metrics['Acc10']:6.1f}%\")\n",
    "        \n",
    "        # Check for overfitting\n",
    "        overfit_gap = train_metrics['R2'] - test_metrics['R2']\n",
    "        if overfit_gap > 0.2:\n",
    "            print(f\"      ‚ö†Ô∏è  OVERFITTING (gap: {overfit_gap:.3f})\")\n",
    "        elif test_metrics['R2'] < 0:\n",
    "            print(f\"      ‚ö†Ô∏è  NEGATIVE R¬≤ - Model performs worse than baseline!\")\n",
    "        elif test_metrics['R2'] < 0.1:\n",
    "            print(f\"      ‚ö†Ô∏è  VERY LOW R¬≤ - Check data quality\")\n",
    "        else:\n",
    "            print(f\"      ‚úì  Good performance!\")\n",
    "        \n",
    "        # Save model\n",
    "        model_path = f'models/{name.lower().replace(\" \", \"_\")}_{horizon}.pkl'\n",
    "        with open(model_path, 'wb') as f:\n",
    "            pickle.dump(best_model, f)\n",
    "        print(f\"      ‚úì  Saved: {model_path}\")\n",
    "\n",
    "# ============================================================================\n",
    "# 9. Feature Importance Analysis\n",
    "# ============================================================================\n",
    "\n",
    "print(\"\\n\" + \"=\"*70)\n",
    "print(\"FEATURE IMPORTANCE ANALYSIS\")\n",
    "print(\"=\"*70)\n",
    "\n",
    "# Analyze feature importance for best model\n",
    "best_24h = max(results['24h'].items(), key=lambda x: x[1]['test_R2'])\n",
    "print(f\"\\nBest 24h model: {best_24h[0]} (R¬≤ = {best_24h[1]['test_R2']:.3f})\")\n",
    "\n",
    "# Save feature importance if available\n",
    "feature_importance_path = 'models/feature_importance.csv'\n",
    "try:\n",
    "    # Load the best model\n",
    "    model_name = best_24h[0].lower().replace(\" \", \"_\")\n",
    "    with open(f'models/{model_name}_24h.pkl', 'rb') as f:\n",
    "        best_model = pickle.load(f)\n",
    "    \n",
    "    # Get feature importance\n",
    "    if hasattr(best_model, 'feature_importances_'):\n",
    "        importance_df = pd.DataFrame({\n",
    "            'feature': selected_features,\n",
    "            'importance': best_model.feature_importances_\n",
    "        }).sort_values('importance', ascending=False)\n",
    "        \n",
    "        importance_df.to_csv(feature_importance_path, index=False)\n",
    "        \n",
    "        print(f\"\\nTop 15 Most Important Features:\")\n",
    "        for idx, row in importance_df.head(15).iterrows():\n",
    "            print(f\"   {row['feature']:30s}: {row['importance']:.4f}\")\n",
    "        \n",
    "        print(f\"\\n‚úì Feature importance saved to: {feature_importance_path}\")\n",
    "except Exception as e:\n",
    "    print(f\"\\n‚ö†Ô∏è Could not extract feature importance: {str(e)}\")\n",
    "\n",
    "# ============================================================================\n",
    "# 10. Summary\n",
    "# ============================================================================\n",
    "\n",
    "print(\"\\n\" + \"=\"*70)\n",
    "print(\"RESULTS SUMMARY\")\n",
    "print(\"=\"*70)\n",
    "\n",
    "for horizon in ['24h', '48h', '72h']:\n",
    "    print(f\"\\n{horizon} Ahead:\")\n",
    "    print(\"-\" * 70)\n",
    "    \n",
    "    best = max(results[horizon].items(), key=lambda x: x[1]['test_R2'])\n",
    "    \n",
    "    for name in results[horizon]:\n",
    "        m = results[horizon][name]\n",
    "        marker = \" ‚òÖ BEST\" if name == best[0] else \"\"\n",
    "        print(f\"{name:18s}: R¬≤={m['test_R2']:7.3f}  RMSE={m['test_RMSE']:6.2f}  \"\n",
    "              f\"MAE={m['test_MAE']:6.2f}  Acc¬±20={m['test_Acc20']:5.1f}%{marker}\")\n",
    "\n",
    "# Save results\n",
    "with open('models/ml_tuned_results.json', 'w') as f:\n",
    "    # Convert to serializable format\n",
    "    results_serializable = {}\n",
    "    for horizon in results:\n",
    "        results_serializable[horizon] = {}\n",
    "        for model_name in results[horizon]:\n",
    "            results_serializable[horizon][model_name] = {\n",
    "                k: v for k, v in results[horizon][model_name].items() \n",
    "                if k != 'best_params'\n",
    "            }\n",
    "            results_serializable[horizon][model_name]['best_params_str'] = str(\n",
    "                results[horizon][model_name]['best_params']\n",
    "            )\n",
    "    \n",
    "    json.dump(results_serializable, f, indent=2)\n",
    "\n",
    "print(\"\\n\" + \"=\"*70)\n",
    "print(\"‚úÖ TRAINING COMPLETE!\")\n",
    "print(\"=\"*70)\n",
    "print(f\"\\nüìä Source: {source.upper()}\")\n",
    "print(f\"üìà Models: {len(param_grids)} ML models √ó 3 horizons = {len(param_grids)*3} total\")\n",
    "print(f\"üéØ All models hyperparameter-tuned with GridSearchCV\")\n",
    "print(\"\\nüìÅ Saved:\")\n",
    "print(\"  ‚úì models/*.pkl (tuned models)\")\n",
    "print(\"  ‚úì models/scaler_ml.pkl\")\n",
    "print(\"  ‚úì models/ml_tuned_results.json\")\n",
    "print(\"  ‚úì models/feature_importance.csv\")\n",
    "\n",
    "# ============================================================================\n",
    "# 11. Diagnostic Information\n",
    "# ============================================================================\n",
    "\n",
    "print(\"\\n\" + \"=\"*70)\n",
    "print(\"DIAGNOSTIC INFORMATION\")\n",
    "print(\"=\"*70)\n",
    "\n",
    "print(f\"\\nData Quality:\")\n",
    "print(f\"   Total samples: {len(data)}\")\n",
    "print(f\"   Training samples: {len(X_train)}\")\n",
    "print(f\"   Test samples: {len(X_test)}\")\n",
    "print(f\"   Features used: {len(selected_features)}\")\n",
    "\n",
    "print(f\"\\nTarget Statistics (24h ahead):\")\n",
    "print(f\"   Mean: {y_24h.mean():.2f}\")\n",
    "print(f\"   Std: {y_24h.std():.2f}\")\n",
    "print(f\"   Min: {y_24h.min():.2f}\")\n",
    "print(f\"   Max: {y_24h.max():.2f}\")\n",
    "\n",
    "print(f\"\\nBest Model Performance:\")\n",
    "best_overall = max(\n",
    "    [(h, n, m['test_R2']) for h in results for n, m in results[h].items()],\n",
    "    key=lambda x: x[2]\n",
    ")\n",
    "print(f\"   {best_overall[1]} ({best_overall[0]}): R¬≤ = {best_overall[2]:.3f}\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc732e3e-2aa4-4713-a012-1ca6fa0722b0",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.14.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
