In [3]:
"""
ML Models ONLY - No TensorFlow Needed
======================================
Trains 5 ML models (Ridge, Gradient Boosting, Random Forest, XGBoost, LightGBM) without deep learning
Perfect when TensorFlow won't install

Author: AQI Prediction Team
"""

import pandas as pd
import numpy as np
from datetime import datetime
import pickle
import json
import os
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import cross_val_score, KFold, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
import xgboost as xgb
import lightgbm as lgb

# Startup banner (rule / title / rule), emitted as three lines by one call.
print("=" * 70, "AQI PREDICTION - ML MODELS (NO TENSORFLOW)", "=" * 70, sep="\n")

# Every artifact written later in this script goes under ./models.
os.makedirs('models', exist_ok=True)

# ============================================================================
# 1. Load Data
# ============================================================================

def load_from_mongodb(uri, max_attempts=2):
    """Load the AQI feature collection from MongoDB into a DataFrame.

    Each attempt connects to ``uri``, pings the server and reads every
    document of ``aqi_feature_store.aqi_features`` (without ``_id``).

    Args:
        uri: MongoDB connection string.
        max_attempts: How many connection attempts to make before giving up.

    Returns:
        ``(DataFrame, 'mongodb')`` on success, or ``(None, None)`` when
        pymongo is not installed or every attempt failed.
    """
    # A missing driver is just another reason to fall back to another data
    # source, so report it instead of letting ImportError escape.
    try:
        from pymongo import MongoClient
        from pymongo.server_api import ServerApi
    except ImportError as e:
        print(f"‚úó Failed: {str(e)[:80]}")
        return None, None

    for attempt in range(max_attempts):
        client = None
        try:
            print(f"\nAttempt {attempt + 1}/{max_attempts}: Connecting to MongoDB...")
            client = MongoClient(uri, server_api=ServerApi('1'),
                                 serverSelectionTimeoutMS=5000, connectTimeoutMS=5000)
            client.admin.command('ping')
            print("‚úì Connected!")

            collection = client['aqi_feature_store']['aqi_features']
            data = pd.DataFrame(list(collection.find({}, {"_id": 0})))
        except Exception as e:
            # Deliberately broad: any driver/network error means "try again".
            print(f"‚úó Failed: {str(e)[:80]}")
        else:
            print(f"‚úì Loaded {len(data)} records from MongoDB")
            return data, 'mongodb'
        finally:
            # Close on success AND on failure so failed attempts do not leak
            # connections.
            if client is not None:
                client.close()
    return None, None

def load_from_csv(csv_path):
    """Load the dataset from a CSV file.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        ``(DataFrame, 'csv')`` on success, or ``(None, None)`` when the file
        is missing, unreadable, empty or cannot be parsed. The reason is
        printed so the failure is not silent.
    """
    print(f"\nLoading from CSV: {csv_path}")
    try:
        data = pd.read_csv(csv_path)
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError: pandas' EmptyDataError
        # and ParserError (both subclasses), plus bad encodings.
        print(f"‚úó Failed: {str(e)[:80]}")
        return None, None
    print(f"‚úì Loaded {len(data)} records")
    return data, 'csv'

print("\n1. Loading data...")

# SECURITY: the connection string (user + password) must come from the
# environment, never from source control. Any password that was ever
# committed in this file should be rotated.
MONGO_URI = os.environ.get("MONGO_URI")
CSV_PATH = "data/cleaned_aqi_data_v2.csv"

# Preferred source is MongoDB; the CSV file is the fallback.
if MONGO_URI:
    data, source = load_from_mongodb(MONGO_URI, 2)
    if data is None:
        print("\n‚ö†Ô∏è MongoDB failed, using CSV...")
else:
    data, source = None, None
    print("\nMONGO_URI is not set, using CSV...")

if data is None:
    data, source = load_from_csv(CSV_PATH)

# An empty frame is as useless as no frame: nothing downstream can train on it.
if data is None or data.empty:
    print("\n‚úó ERROR: No data source available")
    # SystemExit works in plain scripts too; exit() is only the site helper.
    raise SystemExit(1)

print(f"\n‚úì Source: {source.upper()}")
print(f"‚úì Records: {len(data)}")

# ============================================================================
# 2. Feature Engineering
# ============================================================================

print("\n2. Engineering features...")

# Every target (and most engineered features) is derived from 'aqi'. Fail
# early with a clear message instead of a KeyError halfway through.
if 'aqi' not in data.columns:
    raise SystemExit("ERROR: required column 'aqi' is missing from the loaded data")

# Order rows chronologically using whichever time column exists ('time' wins),
# because the shift-based features below are purely positional.
time_col = next((col for col in ('time', 'timestamp') if col in data.columns), None)
if time_col is not None:
    data[time_col] = pd.to_datetime(data[time_col])
    data = data.sort_values(time_col).reset_index(drop=True)

# Lag features: value `lag` rows earlier.
# NOTE(review): the "h" suffix assumes one row per hour with no gaps -- confirm.
for lag in [1, 6, 24]:
    data[f'aqi_lag_{lag}h'] = data['aqi'].shift(lag)
    if 'pm2_5' in data.columns:
        data[f'pm25_lag_{lag}h'] = data['pm2_5'].shift(lag)

# Rolling averages over the trailing window (current row included).
for window in [6, 24]:
    data[f'aqi_ma_{window}h'] = data['aqi'].rolling(window=window, min_periods=1).mean()

# Cyclical encoding of the hour of day.
if 'hour' in data.columns:
    data['hour_sin'] = np.sin(2 * np.pi * data['hour'] / 24)
    data['hour_cos'] = np.cos(2 * np.pi * data['hour'] / 24)

# Targets: the 'aqi' value 24 / 48 / 72 rows ahead of the current row.
for steps_ahead in (24, 48, 72):
    data[f'aqi_{steps_ahead}h'] = data['aqi'].shift(-steps_ahead)

# Drop all-NaN columns first so they do not wipe out every row, then drop
# the rows left incomplete by the shifts above.
data = data.dropna(axis=1, how='all')
data = data.dropna()

print(f"‚úì After engineering: {len(data)} records")

# Without this guard the scaler/model fitting fails later with an obscure error.
if data.empty:
    raise SystemExit("ERROR: no complete rows remain after feature engineering")

# ============================================================================
# 3. Prepare Data
# ============================================================================

print("\n3. Preparing features...")

# Columns never used as model inputs: the time columns, the three targets and
# a few descriptive columns (presumably categorical -- confirm against the
# feature store schema).
non_feature_cols = {
    'time', 'timestamp',
    'aqi_24h', 'aqi_48h', 'aqi_72h',
    'dominant_pollutant', 'aqi_category', 'aqi_color', 'time_of_day',
}
feature_cols = [col for col in data.columns if col not in non_feature_cols]

# Keep only the numeric candidates; anything else is dropped silently.
X = data[feature_cols].select_dtypes(include=[np.number])

# One target series per forecast horizon.
y_24h, y_48h, y_72h = (data[target] for target in ('aqi_24h', 'aqi_48h', 'aqi_72h'))

print(f"‚úì Features: {len(X.columns)}")
print(f"‚úì Samples: {len(X)}")

# ============================================================================
# 4. Split & Scale
# ============================================================================

print("\n4. Splitting and scaling...")

# Positional 80/20 hold-out: the first 80% of rows train, the rest test.
split_idx = int(len(X) * 0.8)
train_rows = slice(None, split_idx)
test_rows = slice(split_idx, None)

X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_24h_train, y_24h_test = y_24h.iloc[train_rows], y_24h.iloc[test_rows]
y_48h_train, y_48h_test = y_48h.iloc[train_rows], y_48h.iloc[test_rows]
y_72h_train, y_72h_test = y_72h.iloc[train_rows], y_72h.iloc[test_rows]

# The scaler statistics come from the training rows only; the test rows are
# transformed with those same statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler next to the models.
with open('models/scaler_ml.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

print(f"‚úì Train: {len(X_train)}, Test: {len(X_test)}")

# ============================================================================
# 5. Define Models
# ============================================================================

# Display name -> unfitted estimator. Insertion order is the training and
# reporting order; the display name also determines the saved file name.
models = {
    'Ridge': Ridge(alpha=10.0),
    'Gradient Boosting': GradientBoostingRegressor(
        n_estimators=100,
        max_depth=5,
        random_state=42,
    ),
    'Random Forest': RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        min_samples_split=10,
        random_state=42,
        n_jobs=-1,
    ),
    'XGBoost': xgb.XGBRegressor(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.05,
        min_child_weight=5,
        random_state=42,
    ),
    'LightGBM': lgb.LGBMRegressor(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.05,
        min_child_samples=20,
        random_state=42,
        verbose=-1,
    ),
}

# ============================================================================
# 6. Evaluation
# ============================================================================

def evaluate(y_true, y_pred):
    """Compute regression metrics for one set of predictions.

    Both inputs are first converted to flat float arrays, so the comparison
    is always element-by-element in positional order. Without that, two
    pandas objects would be aligned on their index labels, and an ``(n, 1)``
    prediction array would broadcast against ``(n,)`` targets.

    Args:
        y_true: Observed target values (array-like).
        y_pred: Predicted values, same length as ``y_true`` (array-like).

    Returns:
        dict with keys ``RMSE``, ``MAE``, ``R2``, ``Acc20`` and ``Acc10``.
        The two ``Acc`` entries are the percentage of predictions whose
        absolute error is at most 20 and 10 respectively.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    abs_err = np.abs(y_true - y_pred)

    return {
        'RMSE': float(np.sqrt(mean_squared_error(y_true, y_pred))),
        'MAE': float(mean_absolute_error(y_true, y_pred)),
        'R2': float(r2_score(y_true, y_pred)),
        # Mean of a boolean mask is the fraction of hits.
        'Acc20': float(np.mean(abs_err <= 20) * 100),
        'Acc10': float(np.mean(abs_err <= 10) * 100),
    }

# ============================================================================
# 7. Train
# ============================================================================

print("\n" + "="*70)
print("TRAINING ML MODELS")
print("="*70)

# results[horizon][model name] -> dict of metrics; dumped to JSON later.
results = {}

# Forward-chaining CV: each fold trains on earlier rows and validates on the
# rows that follow. Plain KFold would train on future rows to score past
# ones, which inflates the CV score for a forecasting task.
cv_splitter = TimeSeriesSplit(n_splits=3)

for horizon, y_train, y_test in [
    ('24h', y_24h_train, y_24h_test),
    ('48h', y_48h_train, y_48h_test),
    ('72h', y_72h_train, y_72h_test)
]:
    print(f"\n{horizon} Ahead:")
    print("-" * 70)

    results[horizon] = {}

    for name, model in models.items():
        print(f"\n{name}...")

        # The same estimator object is refitted for every horizon; it is
        # pickled below before the next fit overwrites it.
        model.fit(X_train_scaled, y_train)

        y_pred_train = model.predict(X_train_scaled)
        y_pred_test = model.predict(X_test_scaled)

        train_metrics = evaluate(y_train, y_pred_train)
        test_metrics = evaluate(y_test, y_pred_test)

        # cross_val_score fits clones, so the model fitted above is untouched.
        cv_scores = cross_val_score(model, X_train_scaled, y_train,
                                    cv=cv_splitter, scoring='r2', n_jobs=-1)

        results[horizon][name] = {
            'test_R2': test_metrics['R2'],
            'test_RMSE': test_metrics['RMSE'],
            'test_MAE': test_metrics['MAE'],
            'test_Acc20': test_metrics['Acc20'],
            'test_Acc10': test_metrics['Acc10'],
            'train_R2': train_metrics['R2'],
            'cv_R2': cv_scores.mean()
        }

        print(f"  Test R¬≤:   {test_metrics['R2']:.3f}")
        print(f"  RMSE:      {test_metrics['RMSE']:.2f}")
        print(f"  Acc ¬±20:   {test_metrics['Acc20']:.1f}%")
        print(f"  Train R¬≤:  {train_metrics['R2']:.3f}")

        # A train/test R2 gap above 0.3 is flagged as overfitting.
        if train_metrics['R2'] - test_metrics['R2'] > 0.3:
            print(f"  ‚ö†Ô∏è OVERFITTING")

        # e.g. 'Gradient Boosting' + '24h' -> models/gradient_boosting_24h.pkl
        model_path = f'models/{name.lower().replace(" ", "_")}_{horizon}.pkl'
        with open(model_path, 'wb') as f:
            pickle.dump(model, f)
        print(f"  ‚úì Saved: {model_path}")

# ============================================================================
# 8. Summary
# ============================================================================

print("\n" + "="*70)
print("RESULTS SUMMARY")
print("="*70)

for horizon in ['24h', '48h', '72h']:
    print(f"\n{horizon} Ahead:")
    print("-" * 70)

    horizon_results = results[horizon]
    # The model with the highest test R2 gets the marker (first one on ties).
    best_name = max(horizon_results, key=lambda candidate: horizon_results[candidate]['test_R2'])

    for name, metrics in horizon_results.items():
        if name == best_name:
            marker = " ‚òÖ"
        else:
            marker = ""
        print(f"{name:18s}: R¬≤={metrics['test_R2']:6.3f}  RMSE={metrics['test_RMSE']:6.2f}  Acc¬±20={metrics['test_Acc20']:5.1f}%{marker}")

# Persist every metric collected during training.
with open('models/ml_only_results.json', 'w') as results_file:
    json.dump(results, results_file, indent=2)

print("\n" + "="*70)
print("‚úÖ ML TRAINING COMPLETE!")
print("="*70)
print(f"\nüìä Source: {source.upper()}")

# Closing notes, one printed line per entry.
for closing_line in (
    "üìà Models: 5 ML models √ó 3 horizons = 15 total",
    "\nüìÅ Saved:",
    "  ‚úì models/*.pkl (15 ML models)",
    "  ‚úì models/scaler_ml.pkl",
    "  ‚úì models/ml_only_results.json",
    "\nüí° TensorFlow not needed - ML models work great!",
    "   XGBoost and LightGBM often outperform LSTM anyway!",
):
    print(closing_line)

AQI PREDICTION - ML MODELS (NO TENSORFLOW)

1. Loading data...

Attempt 1/2: Connecting to MongoDB...
✓ Connected!
✓ Loaded 4340 records from MongoDB

✓ Source: MONGODB
✓ Records: 4340

2. Engineering features...
✓ After engineering: 4244 records

3. Preparing features...
✓ Features: 43
✓ Samples: 4244

4. Splitting and scaling...
✓ Train: 3395, Test: 849

TRAINING ML MODELS

24h Ahead:
----------------------------------------------------------------------

Ridge...
  Test R²:   -0.032
  RMSE:      57.21
  Acc ±20:   44.8%
  Train R²:  0.186
  ✓ Saved: models/ridge_24h.pkl

Gradient Boosting...
  Test R²:   -0.203
  RMSE:      61.78
  Acc ±20:   44.3%
  Train R²:  0.883
  ⚠️ OVERFITTING
  ✓ Saved: models/gradient_boosting_24h.pkl

Random Forest...
  Test R²:   -0.090
  RMSE:      58.79
  Acc ±20:   46.4%
  Train R²:  0.629
  ⚠️ OVERFITTING
  ✓ Saved: models/random_forest_24h.pkl

XGBoost...
  Test R²:   -0.156
  RMSE:      60.57
  Acc ¬±20:   46.