In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# =============================================================================
# CONFIGURATION
# =============================================================================
# One regression target per pollutant column.
TARGETS = [
    f'valeur_{pollutant}'
    for pollutant in ('NO2', 'CO', 'O3', 'PM10', 'PM25')
]

# Calendar / clock columns that are offered to every model.
TEMPORAL_FEATURES = [
    'hour', 'is_day', 'hour_sin', 'hour_cos',
    'dow', 'dow_sin', 'dow_cos',
    'is_holiday', 'is_weekend', 'lockdown_code',
]

# Timestamp column used to put rows in chronological order.
DATE_COL = 'date'

# Cross-validation settings
N_SPLITS = 2            # number of validation folds requested
TEST_SIZE_RATIO = 0.15  # fraction of rows in each validation window

# CatBoost settings, unpacked as keyword arguments into CatBoostRegressor.
CATBOOST_PARAMS = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=3,
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=42,
    verbose=False,
    early_stopping_rounds=50,
    task_type='CPU',
    thread_count=-1,
)

# =============================================================================
# LOAD AND PREPARE DATA
# =============================================================================
print(
    "=" * 70,
    "CATBOOST PIPELINE - TEMPORAL + CROSS-POLLUTANT + LAG FEATURES",
    "=" * 70,
    sep="\n",
)

train_df = pd.read_csv("../data/train_features.csv")
test_df = pd.read_csv("../data/test_features_to_predict.csv")

print(f"\n✓ Loaded train: {train_df.shape}")
print(f"✓ Loaded test: {test_df.shape}")


def _sorted_by_date(frame):
    """Parse DATE_COL to datetimes and return the frame sorted by it.

    A frame without DATE_COL is returned untouched.
    """
    if DATE_COL not in frame.columns:
        return frame
    frame[DATE_COL] = pd.to_datetime(frame[DATE_COL])
    return frame.sort_values(DATE_COL).reset_index(drop=True)


# Convert and sort by date
train_df = _sorted_by_date(train_df)
test_df = _sorted_by_date(test_df)

# =============================================================================
# FEATURE SELECTION
# =============================================================================
def get_cross_pollutant_features(target_col):
    """Return the 24-step rolling-mean column names of all non-target pollutants.

    ``target_col`` is a target column name such as ``'valeur_NO2'``; the
    ``'valeur_'`` prefix is stripped to find which pollutant to leave out.
    """
    own_name = target_col.replace('valeur_', '')
    return [
        f'valeur_{name}_roll_mean_24'
        for name in ('NO2', 'CO', 'O3', 'PM10', 'PM25')
        if name != own_name
    ]

def get_lag_features(target_col):
    """Return the lag column names of the target for lags 1, 2, 3, 6, 12 and 24."""
    names = []
    for step in (1, 2, 3, 6, 12, 24):
        names.append(f'{target_col}_lag_{step}')
    return names

def get_rolling_features(target_col):
    """Return the rolling-statistic column names of the target.

    Names are ordered window-major (6, 12, 24) and, within each window,
    mean, std, min, max.
    """
    return [
        f'{target_col}_roll_{stat}_{window}'
        for window in (6, 12, 24)
        for stat in ('mean', 'std', 'min', 'max')
    ]

def prepare_features_for_target(df, target_col):
    """Return the feature columns for ``target_col`` that exist in ``df``.

    Candidates are gathered in a fixed order: temporal columns, rolling means
    of the other pollutants, lags of the target, rolling statistics of the
    target. Candidates that are not columns of ``df`` are dropped.
    """
    candidates = (
        list(TEMPORAL_FEATURES)
        + get_cross_pollutant_features(target_col)
        + get_lag_features(target_col)
        + get_rolling_features(target_col)
    )
    # A single pass keeps only the names the frame actually provides.
    return [name for name in candidates if name in df.columns]

# Display features
print(f"\n{'='*70}")
print(f"FEATURE CONFIGURATION")
print(f"{'='*70}")

sample_features = prepare_features_for_target(train_df, TARGETS[0])

# Classify each selected column by the helper that generated its name.
# Substring matching cannot tell the groups apart: every cross-pollutant, lag
# and rolling column starts with 'valeur_', which previously made the rolling
# count always 0 and inflated the cross-pollutant count with the target's own
# rolling means.
sample_cross = set(get_cross_pollutant_features(TARGETS[0]))
sample_lag = set(get_lag_features(TARGETS[0]))
sample_rolling = set(get_rolling_features(TARGETS[0]))

print(f"\nFeatures for {TARGETS[0]}: {len(sample_features)} total")
print(f"  - Temporal: {sum(f in TEMPORAL_FEATURES for f in sample_features)}")
print(f"  - Cross-pollutant: {sum(f in sample_cross for f in sample_features)}")
print(f"  - Lag: {sum(f in sample_lag for f in sample_features)}")
print(f"  - Rolling: {sum(f in sample_rolling for f in sample_features)}")

# Clean training data: keep only the columns some model needs (features of
# every target plus the targets themselves) and drop rows with any NaN in them.
all_needed_cols = set()
for target in TARGETS:
    all_needed_cols.update(prepare_features_for_target(train_df, target))
all_needed_cols.update(TARGETS)

train_clean = train_df[list(all_needed_cols)].dropna().reset_index(drop=True)
print(f"\n✓ Clean train samples: {len(train_clean)}")

# =============================================================================
# TIME SERIES CROSS-VALIDATION
# =============================================================================
def time_series_cv_split(df, n_splits=2, test_ratio=0.15):
    """Build expanding-window train/validation splits over time-ordered rows.

    The rows of ``df`` are taken in their current order, so the caller must
    have sorted them chronologically. Each fold validates on a window of
    ``int(len(df) * test_ratio)`` consecutive rows and trains on every row
    before that window. The last fold's window ends at the final row; each
    earlier fold's window ends half a window sooner, so consecutive
    validation windows overlap by roughly half.

    Args:
        df: Frame whose index labels are returned in the splits.
        n_splits: Number of folds to attempt.
        test_ratio: Fraction of the rows placed in each validation window.

    Returns:
        A list of ``(train_idx, val_idx)`` pairs of index labels, oldest fold
        first. Folds whose training part would contain fewer than 60% of the
        rows are dropped, so the list can be shorter than ``n_splits``. It is
        empty when the frame is too small for a non-empty validation window.
    """
    n = len(df)
    test_size = int(n * test_ratio)
    if test_size < 1:
        # An empty validation window cannot be scored; report "no folds"
        # instead of handing the caller folds that fail during fitting.
        return []

    min_train_size = int(n * 0.6)
    stride = test_size // 2  # distance between consecutive window ends

    splits = []
    for i in range(n_splits):
        val_end = n - (n_splits - i - 1) * stride
        val_start = val_end - test_size

        # Training data is everything before the validation window.
        if val_start < min_train_size:
            continue

        splits.append((df.index[:val_start], df.index[val_start:val_end]))

    return splits

# =============================================================================
# TRAIN CATBOOST WITH CV
# =============================================================================
def train_catboost_with_cv(df, target_col, n_splits=2):
    """Cross-validate a CatBoost regressor for one target, then fit on all rows.

    Progress and metrics are printed as a side effect. Model settings come
    from the module-level ``CATBOOST_PARAMS`` and the validation window size
    from ``TEST_SIZE_RATIO``.

    Args:
        df: Training frame in chronological order containing the feature
            columns and ``target_col``.
        target_col: Name of the column to predict.
        n_splits: Number of time-ordered validation folds to request.

    Returns:
        ``(final_model, cv_scores, feature_cols)``. ``cv_scores`` holds one
        dict per successfully fitted fold with keys ``fold``, ``rmse``,
        ``mae``, ``r2`` and ``best_iteration``. ``feature_cols`` is the
        ordered list of columns the model was trained on.
    """
    print(f"\n{'='*70}")
    print(f"TRAINING: {target_col}")
    print(f"{'='*70}")

    # Get features for this target
    feature_cols = prepare_features_for_target(df, target_col)
    print(f"✓ Using {len(feature_cols)} features")

    splits = time_series_cv_split(df, n_splits=n_splits, test_ratio=TEST_SIZE_RATIO)
    print(f"✓ Created {len(splits)} CV splits")

    cv_scores = []

    for fold, (train_idx, val_idx) in enumerate(splits, 1):
        print(f"\nFold {fold}/{len(splits)} - Train: {len(train_idx)}, Val: {len(val_idx)}", end=" ")

        # Handle any remaining NaNs by zero-filling the feature matrices.
        X_train = df.loc[train_idx, feature_cols].fillna(0)
        y_train = df.loc[train_idx, target_col]
        X_val = df.loc[val_idx, feature_cols].fillna(0)
        y_val = df.loc[val_idx, target_col]

        try:
            # The validation fold doubles as the early-stopping eval set.
            model = CatBoostRegressor(**CATBOOST_PARAMS)
            model.fit(
                Pool(X_train, y_train),
                eval_set=Pool(X_val, y_val),
                use_best_model=True,
                plot=False
            )

            # Predict
            predictions = np.maximum(model.predict(X_val), 0)  # Non-negative constraint

            # Evaluate
            rmse = np.sqrt(mean_squared_error(y_val, predictions))
            mae = mean_absolute_error(y_val, predictions)
            r2 = r2_score(y_val, predictions)
            best_iteration = model.get_best_iteration()

            cv_scores.append({
                'fold': fold,
                'rmse': rmse,
                'mae': mae,
                'r2': r2,
                'best_iteration': best_iteration
            })
            print(f"→ RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.3f}, Best iter: {best_iteration}")

        except Exception as e:
            # Deliberate best effort: a failing fold is reported and skipped
            # so the remaining folds and the final fit still run.
            print(f"→ ✗ Error: {str(e)[:50]}")
            continue

    # Summary
    if cv_scores:
        cv_df = pd.DataFrame(cv_scores)
        print(f"\n📊 CV Summary: RMSE={cv_df['rmse'].mean():.2f}±{cv_df['rmse'].std():.2f}, "
              f"MAE={cv_df['mae'].mean():.2f}±{cv_df['mae'].std():.2f}, "
              f"R²={cv_df['r2'].mean():.3f}±{cv_df['r2'].std():.3f}")
        print(f"   Avg best iteration: {cv_df['best_iteration'].mean():.0f}")

    # Train final model on full data
    print(f"🔧 Training final model on full data...", end=" ")
    X_full = df[feature_cols].fillna(0)
    y_full = df[target_col]

    # The full-data fit has no eval_set, so early stopping cannot trigger and
    # the configured iteration count would always be exhausted, even when the
    # folds peaked far earlier. Size the final model from the folds instead.
    final_params = dict(CATBOOST_PARAMS)
    final_params.pop('early_stopping_rounds', None)
    best_iterations = [
        score['best_iteration'] for score in cv_scores
        if score['best_iteration'] is not None
    ]
    if best_iterations:
        # get_best_iteration() is zero-based, hence the +1 to get a tree count.
        final_params['iterations'] = max(1, int(round(np.mean(best_iterations))) + 1)

    final_model = CatBoostRegressor(**final_params)
    final_model.fit(Pool(X_full, y_full), plot=False)

    print(f"✓ Done")

    return final_model, cv_scores, feature_cols

# =============================================================================
# TRAIN ALL MODELS
# =============================================================================
print("\n" + "=" * 70)
print("TRAINING ALL POLLUTANT MODELS")
print("=" * 70)

# Per-target results, keyed by target column name.
models, all_cv_scores, all_feature_cols = {}, {}, {}

import time
start_time = time.time()

for i, target in enumerate(TARGETS, 1):
    print(f"\n[{i}/{len(TARGETS)}] {target}")

    try:
        model, cv_scores, feature_cols = train_catboost_with_cv(
            train_clean, target, n_splits=N_SPLITS
        )
    except Exception as e:
        # Best effort: note the failure and carry on with the next target.
        print(f"✗ Failed: {str(e)[:100]}")
        models[target] = None
        all_cv_scores[target] = []
        all_feature_cols[target] = []
    else:
        models[target] = model
        all_cv_scores[target] = cv_scores
        all_feature_cols[target] = feature_cols

elapsed = time.time() - start_time
successful = len([m for m in models.values() if m is not None])
print("\n" + "=" * 70)
print(f"✓ Training Complete: {successful}/{len(TARGETS)} models successful")
print(f"⏱ Total time: {elapsed/60:.1f} minutes")
print("=" * 70)

# =============================================================================
# FEATURE IMPORTANCE
# =============================================================================
print("\n" + "=" * 70)
print("FEATURE IMPORTANCE (Top 10 per model)")
print("=" * 70)

for target in TARGETS:
    # Targets whose training failed have no model to inspect.
    if models[target] is None:
        continue

    feature_importance = models[target].get_feature_importance()
    feature_names = all_feature_cols[target]

    # Pair every feature with its importance, most important first.
    importance_df = pd.DataFrame(
        {'feature': feature_names, 'importance': feature_importance}
    )
    importance_df = importance_df.sort_values('importance', ascending=False)

    print(f"\n{target}:")
    print(importance_df.head(10).to_string(index=False))

# =============================================================================
# PREPARE TEST FEATURES
# =============================================================================
print(f"\n{'='*70}")
print("PREPARING TEST FEATURES")
print(f"{'='*70}")

# For lag and rolling features, we need to use last known values from train
last_24_rows = train_df.iloc[-24:].copy()

test_features_dict = {}

for target in TARGETS:
    feature_list = all_feature_cols[target]

    # reindex() instead of test_df[feature_list]: a feature column that the
    # test file lacks becomes an all-NaN column here, whereas plain selection
    # raised KeyError before the fallback filling below could ever run.
    X_test = test_df.reindex(columns=feature_list)

    # Fill missing lag/rolling features with averages from the last 24 train rows
    for col in feature_list:
        if X_test[col].isna().any():
            if col in last_24_rows.columns:
                fill_value = last_24_rows[col].mean()
            else:
                fill_value = 0.0
            X_test[col] = X_test[col].fillna(fill_value)

    # Anything still missing (e.g. the train average itself was NaN) becomes 0.
    X_test = X_test.fillna(0)
    test_features_dict[target] = X_test

print(f"✓ Test features prepared for all {len(TARGETS)} targets")

# =============================================================================
# MAKE PREDICTIONS
# =============================================================================
print("\n" + "=" * 70)
print("GENERATING PREDICTIONS")
print("=" * 70)

# Start the output frame from the test dates when they are available.
predictions_df = (
    test_df[[DATE_COL]].copy()
    if DATE_COL in test_df.columns
    else pd.DataFrame(index=test_df.index)
)

for target in TARGETS:
    fitted = models[target]
    if fitted is None:
        print(f"✗ {target}: No model, using zeros")
        predictions_df[target] = 0
        continue

    try:
        X_test = test_features_dict[target]
        # Non-negative constraint: negative predictions are raised to 0.
        preds = np.maximum(fitted.predict(X_test), 0)

        predictions_df[target] = preds
        print(f"✓ {target}: [{preds.min():.1f}, {preds.max():.1f}], mean={preds.mean():.1f}")

    except Exception as e:
        # Best effort: a failing target still gets a (zero) column.
        print(f"✗ {target}: Error - {str(e)[:50]}")
        predictions_df[target] = 0

# =============================================================================
# SAVE RESULTS
# =============================================================================
print("\n" + "=" * 70)
print("SAVING RESULTS")
print("=" * 70)

predictions_df.to_csv('submission.csv', index=False)
print(f"✓ Submission saved: submission.csv ({predictions_df.shape})")

# Save CV scores: one summary row per target that has at least one scored fold.
cv_summary = []
for target, scores in all_cv_scores.items():
    if not scores:
        continue
    cv_df = pd.DataFrame(scores)
    row = {'target': target}
    for metric in ('rmse', 'mae', 'r2'):
        row[f'mean_{metric}'] = cv_df[metric].mean()
        row[f'std_{metric}'] = cv_df[metric].std()
    row['avg_best_iter'] = cv_df['best_iteration'].mean()
    cv_summary.append(row)

if cv_summary:
    cv_summary_df = pd.DataFrame(cv_summary)
    cv_summary_df.to_csv('cv_scores_summary.csv', index=False)
    print(f"✓ CV scores saved: cv_scores_summary.csv")
    print(f"\n📊 FINAL RESULTS:")
    print(cv_summary_df.to_string(index=False))

# Save models (optional - can be large files)
# import joblib
# for target, model in models.items():
#     if model is not None:
#         joblib.dump(model, f'catboost_{target}.pkl')
# print(f"✓ Models saved")

# Closing banner and a recap of the run configuration.
print(
    "\n" + "=" * 70,
    "✅ PIPELINE COMPLETE!",
    "=" * 70,
    "\n💡 Model Configuration:",
    f"   CatBoost Iterations: {CATBOOST_PARAMS['iterations']}",
    f"   Learning Rate: {CATBOOST_PARAMS['learning_rate']}",
    f"   Depth: {CATBOOST_PARAMS['depth']}",
    "   Features: Temporal + Cross-Pollutant + Lag + Rolling",
    f"   CV Folds: {N_SPLITS}",
    f"   Total runtime: {elapsed/60:.1f} minutes",
    sep="\n",
)

CATBOOST PIPELINE - TEMPORAL + CROSS-POLLUTANT + LAG FEATURES

✓ Loaded train: (40991, 213)
✓ Loaded test: (504, 208)

FEATURE CONFIGURATION

Features for valeur_NO2: 21 total
  - Temporal: 10
  - Cross-pollutant: 6
  - Lag: 3
  - Rolling: 0

✓ Clean train samples: 40991

TRAINING ALL POLLUTANT MODELS

[1/5] valeur_NO2

TRAINING: valeur_NO2
✓ Using 21 features
✓ Created 2 CV splits

Fold 1/2 - Train: 31769, Val: 6148 → RMSE: 7.23, MAE: 4.68, R²: 0.787, Best iter: 588

Fold 2/2 - Train: 34843, Val: 6148 → RMSE: 5.82, MAE: 3.77, R²: 0.753, Best iter: 956

📊 CV Summary: RMSE=6.53±1.00, MAE=4.23±0.65, R²=0.770±0.025
   Avg best iteration: 772
🔧 Training final model on full data... ✓ Done

[2/5] valeur_CO

TRAINING: valeur_CO
✓ Using 21 features
✓ Created 2 CV splits

Fold 1/2 - Train: 31769, Val: 6148 → RMSE: 0.06, MAE: 0.03, R²: 0.713, Best iter: 109

Fold 2/2 - Train: 34843, Val: 6148 → RMSE: 0.06, MAE: 0.02, R²: 0.548, Best iter: 116

📊 CV Summary: RMSE=0.06±0.01, MAE=0.03±0.00, R²=0.63

In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# =============================================================================
# CONFIGURATION
# =============================================================================
# One regression target per pollutant column.
TARGETS = [
    f'valeur_{pollutant}'
    for pollutant in ('NO2', 'CO', 'O3', 'PM10', 'PM25')
]

# Features we WILL KNOW at prediction time (NO WEATHER FORECASTS)
TEMPORAL_FEATURES = [
    'hour', 'is_day', 'hour_sin', 'hour_cos',
    'dow', 'dow_sin', 'dow_cos',
    'is_holiday', 'is_weekend', 'lockdown_code',
]

# Weather base features for lag/rolling only: these names are used solely to
# build '<name>_lag_*' and '<name>_roll_*' column names.
WEATHER_BASE = [
    'temperature_2m',
    'relative_humidity_2m',
    'dew_point_2m',
    'apparent_temperature',
    'pressure_msl',
    'wind_speed_10m',
    'wind_direction_10m',
    'precipitation',
    'cloud_cover',
    'cloud_cover_low',
    'cloud_cover_mid',
    'cloud_cover_high',
    'shortwave_radiation',
    'direct_radiation',
    'diffuse_radiation',
    'global_tilted_irradiance',
    'wind_gusts_10m',
    'vapour_pressure_deficit',
    'et0_fao_evapotranspiration',
    'snowfall',
    'rain',
    'showers',
    'weather_code',
]

# Timestamp column used to put rows in chronological order.
DATE_COL = 'date'

# Cross-validation settings
N_SPLITS = 2            # number of validation folds requested
TEST_SIZE_RATIO = 0.15  # fraction of rows in each validation window

# CatBoost settings, unpacked as keyword arguments into CatBoostRegressor.
CATBOOST_PARAMS = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=3,
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=42,
    verbose=False,
    early_stopping_rounds=50,
    task_type='CPU',
    thread_count=-1,
)

# =============================================================================
# LOAD AND PREPARE DATA
# =============================================================================
print(
    "=" * 70,
    "CATBOOST PIPELINE - NO WEATHER FORECASTS",
    "=" * 70,
    sep="\n",
)

train_df = pd.read_csv("../data/train_features.csv")
test_df = pd.read_csv("../data/test_features_to_predict.csv")

print(f"\n✓ Loaded train: {train_df.shape}")
print(f"✓ Loaded test: {test_df.shape}")


def _sorted_by_date(frame):
    """Parse DATE_COL to datetimes and return the frame sorted by it.

    A frame without DATE_COL is returned untouched.
    """
    if DATE_COL not in frame.columns:
        return frame
    frame[DATE_COL] = pd.to_datetime(frame[DATE_COL])
    return frame.sort_values(DATE_COL).reset_index(drop=True)


# Convert and sort by date
train_df = _sorted_by_date(train_df)
test_df = _sorted_by_date(test_df)

# =============================================================================
# FEATURE SELECTION - ONLY HISTORICAL DATA (NO FORECASTS)
# =============================================================================
def get_weather_lag_features():
    """Return the lag-6/12/24 column names for every entry of WEATHER_BASE.

    Names are grouped per weather variable, lags in ascending order.
    """
    return [
        f'{weather}_lag_{lag}'
        for weather in WEATHER_BASE
        for lag in (6, 12, 24)
    ]

def get_weather_rolling_features():
    """Return rolling mean/std column names (windows 6 and 24) per weather variable.

    For each entry of WEATHER_BASE the order is mean_6, std_6, mean_24, std_24.
    """
    return [
        f'{weather}_roll_{stat}_{window}'
        for weather in WEATHER_BASE
        for window in (6, 24)
        for stat in ('mean', 'std')
    ]

def get_cross_pollutant_lag_features(target_col):
    """Return lag and rolling column names of every pollutant except the target.

    For each other pollutant the names come in this order: lags 6, 12, 24,
    then rolling mean/std over window 6, then rolling mean/std over window 24.
    ``target_col`` is a name such as ``'valeur_O3'``; its ``'valeur_'`` prefix
    is stripped to find the pollutant to leave out.
    """
    own_name = target_col.replace('valeur_', '')

    names = []
    for pollutant in ('NO2', 'CO', 'O3', 'PM10', 'PM25'):
        if pollutant == own_name:
            continue
        prefix = f'valeur_{pollutant}'
        # Only lags 6, 12 and 24 are used for other pollutants.
        names += [f'{prefix}_lag_{lag}' for lag in (6, 12, 24)]
        # Rolling mean / std over windows 6 and 24.
        names += [
            f'{prefix}_roll_{stat}_{window}'
            for window in (6, 24)
            for stat in ('mean', 'std')
        ]
    return names

def get_target_historical_features(target_col):
    """Return the lag and rolling column names of the target itself.

    Order: lags 6, 12, 24, then rolling mean/std for window 6, then rolling
    mean/std for window 24.
    """
    lag_names = [f'{target_col}_lag_{lag}' for lag in (6, 12, 24)]
    rolling_names = [
        f'{target_col}_roll_{stat}_{window}'
        for window in (6, 24)
        for stat in ('mean', 'std')
    ]
    return lag_names + rolling_names

def get_special_interaction_features(target_col):
    """Return the interaction column name tied to the target, if any.

    Only ``'valeur_O3'`` and ``'valeur_PM25'`` have one; every other target
    yields an empty list.
    """
    if target_col == 'valeur_O3':
        return ['NO2_lag1_for_O3']
    if target_col == 'valeur_PM25':
        return ['PM10_lag1_for_PM25']
    return []

def prepare_features_for_target(df, target_col):
    """Return the feature columns for ``target_col`` that exist in ``df``.

    Candidate names are gathered in this order and then reduced to the ones
    that are actual columns of ``df``:

    1. temporal columns (``TEMPORAL_FEATURES``)
    2. weather lag columns, then weather rolling columns
    3. lag / rolling columns of the other pollutants
    4. lag / rolling columns of the target itself
    5. the target-specific interaction column, if one is defined

    Current weather columns and pollutant lags shorter than 6 are never
    generated by the helpers used in steps 2-4.

    NOTE(review): the interaction columns of step 5 are named ``*_lag1_*``,
    which looks at odds with leaving out short lags -- confirm they are
    really available at prediction time.
    """
    candidates = [
        *TEMPORAL_FEATURES,
        *get_weather_lag_features(),
        *get_weather_rolling_features(),
        *get_cross_pollutant_lag_features(target_col),
        *get_target_historical_features(target_col),
        *get_special_interaction_features(target_col),
    ]

    # Keep only the names the frame actually provides.
    return [name for name in candidates if name in df.columns]

# Display features
print(f"\n{'='*70}")
print(f"FEATURE CONFIGURATION - NO WEATHER FORECASTS")
print(f"{'='*70}")

sample_features = prepare_features_for_target(train_df, TARGETS[0])

# Classify each selected column by the helper that generated its name.
# Matching on the 'valeur_' substring counted the target's own history
# columns as cross-pollutant ones, so the printed groups overlapped and no
# longer added up to the total.
sample_weather = set(get_weather_lag_features()) | set(get_weather_rolling_features())
sample_cross = set(get_cross_pollutant_lag_features(TARGETS[0]))
sample_target = set(get_target_historical_features(TARGETS[0]))

print(f"\nFeatures for {TARGETS[0]}: {len(sample_features)} total")
print(f"  - Temporal: {sum(f in TEMPORAL_FEATURES for f in sample_features)}")
print(f"  - Weather (historical only): {sum(f in sample_weather for f in sample_features)}")
print(f"  - Cross-pollutant (historical): {sum(f in sample_cross for f in sample_features)}")
print(f"  - Target (historical): {sum(f in sample_target for f in sample_features)}")

# Clean training data: keep only the columns some model needs (features of
# every target plus the targets themselves) and drop rows with any NaN in them.
all_needed_cols = set()
for target in TARGETS:
    all_needed_cols.update(prepare_features_for_target(train_df, target))
all_needed_cols.update(TARGETS)

train_clean = train_df[list(all_needed_cols)].dropna().reset_index(drop=True)
print(f"\n✓ Clean train samples: {len(train_clean)}")

# =============================================================================
# TIME SERIES CROSS-VALIDATION
# =============================================================================
def time_series_cv_split(df, n_splits=2, test_ratio=0.15):
    """Build expanding-window train/validation splits over time-ordered rows.

    The rows of ``df`` are taken in their current order, so the caller must
    have sorted them chronologically. Each fold validates on a window of
    ``int(len(df) * test_ratio)`` consecutive rows and trains on every row
    before that window. The last fold's window ends at the final row; each
    earlier fold's window ends half a window sooner, so consecutive
    validation windows overlap by roughly half.

    Args:
        df: Frame whose index labels are returned in the splits.
        n_splits: Number of folds to attempt.
        test_ratio: Fraction of the rows placed in each validation window.

    Returns:
        A list of ``(train_idx, val_idx)`` pairs of index labels, oldest fold
        first. Folds whose training part would contain fewer than 60% of the
        rows are dropped, so the list can be shorter than ``n_splits``. It is
        empty when the frame is too small for a non-empty validation window.
    """
    n = len(df)
    test_size = int(n * test_ratio)
    if test_size < 1:
        # An empty validation window cannot be scored; report "no folds"
        # instead of handing the caller folds that fail during fitting.
        return []

    min_train_size = int(n * 0.6)
    stride = test_size // 2  # distance between consecutive window ends

    splits = []
    for i in range(n_splits):
        val_end = n - (n_splits - i - 1) * stride
        val_start = val_end - test_size

        # Training data is everything before the validation window.
        if val_start < min_train_size:
            continue

        splits.append((df.index[:val_start], df.index[val_start:val_end]))

    return splits

# =============================================================================
# TRAIN CATBOOST WITH CV
# =============================================================================
def train_catboost_with_cv(df, target_col, n_splits=2):
    """Cross-validate a CatBoost regressor for one target, then fit on all rows.

    Progress and metrics are printed as a side effect. Model settings come
    from the module-level ``CATBOOST_PARAMS`` and the validation window size
    from ``TEST_SIZE_RATIO``.

    Args:
        df: Training frame in chronological order containing the feature
            columns and ``target_col``.
        target_col: Name of the column to predict.
        n_splits: Number of time-ordered validation folds to request.

    Returns:
        ``(final_model, cv_scores, feature_cols)``. ``cv_scores`` holds one
        dict per successfully fitted fold with keys ``fold``, ``rmse``,
        ``mae``, ``r2`` and ``best_iteration``. ``feature_cols`` is the
        ordered list of columns the model was trained on.
    """
    print(f"\n{'='*70}")
    print(f"TRAINING: {target_col}")
    print(f"{'='*70}")

    # Get features for this target
    feature_cols = prepare_features_for_target(df, target_col)
    print(f"✓ Using {len(feature_cols)} features")

    splits = time_series_cv_split(df, n_splits=n_splits, test_ratio=TEST_SIZE_RATIO)
    print(f"✓ Created {len(splits)} CV splits")

    cv_scores = []

    for fold, (train_idx, val_idx) in enumerate(splits, 1):
        print(f"\nFold {fold}/{len(splits)} - Train: {len(train_idx)}, Val: {len(val_idx)}", end=" ")

        # Handle any remaining NaNs by zero-filling the feature matrices.
        X_train = df.loc[train_idx, feature_cols].fillna(0)
        y_train = df.loc[train_idx, target_col]
        X_val = df.loc[val_idx, feature_cols].fillna(0)
        y_val = df.loc[val_idx, target_col]

        try:
            # The validation fold doubles as the early-stopping eval set.
            model = CatBoostRegressor(**CATBOOST_PARAMS)
            model.fit(
                Pool(X_train, y_train),
                eval_set=Pool(X_val, y_val),
                use_best_model=True,
                plot=False
            )

            # Predict
            predictions = np.maximum(model.predict(X_val), 0)  # Non-negative constraint

            # Evaluate
            rmse = np.sqrt(mean_squared_error(y_val, predictions))
            mae = mean_absolute_error(y_val, predictions)
            r2 = r2_score(y_val, predictions)
            best_iteration = model.get_best_iteration()

            cv_scores.append({
                'fold': fold,
                'rmse': rmse,
                'mae': mae,
                'r2': r2,
                'best_iteration': best_iteration
            })
            print(f"→ RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.3f}, Best iter: {best_iteration}")

        except Exception as e:
            # Deliberate best effort: a failing fold is reported and skipped
            # so the remaining folds and the final fit still run.
            print(f"→ ✗ Error: {str(e)[:50]}")
            continue

    # Summary
    if cv_scores:
        cv_df = pd.DataFrame(cv_scores)
        print(f"\n📊 CV Summary: RMSE={cv_df['rmse'].mean():.2f}±{cv_df['rmse'].std():.2f}, "
              f"MAE={cv_df['mae'].mean():.2f}±{cv_df['mae'].std():.2f}, "
              f"R²={cv_df['r2'].mean():.3f}±{cv_df['r2'].std():.3f}")
        print(f"   Avg best iteration: {cv_df['best_iteration'].mean():.0f}")

    # Train final model on full data
    print(f"🔧 Training final model on full data...", end=" ")
    X_full = df[feature_cols].fillna(0)
    y_full = df[target_col]

    # The full-data fit has no eval_set, so early stopping cannot trigger and
    # the configured iteration count would always be exhausted, even when the
    # folds peaked far earlier. Size the final model from the folds instead.
    final_params = dict(CATBOOST_PARAMS)
    final_params.pop('early_stopping_rounds', None)
    best_iterations = [
        score['best_iteration'] for score in cv_scores
        if score['best_iteration'] is not None
    ]
    if best_iterations:
        # get_best_iteration() is zero-based, hence the +1 to get a tree count.
        final_params['iterations'] = max(1, int(round(np.mean(best_iterations))) + 1)

    final_model = CatBoostRegressor(**final_params)
    final_model.fit(Pool(X_full, y_full), plot=False)

    print(f"✓ Done")

    return final_model, cv_scores, feature_cols

# =============================================================================
# TRAIN ALL MODELS
# =============================================================================
print("\n" + "=" * 70)
print("TRAINING ALL POLLUTANT MODELS")
print("=" * 70)

# Per-target results, keyed by target column name.
models, all_cv_scores, all_feature_cols = {}, {}, {}

import time
start_time = time.time()

for i, target in enumerate(TARGETS, 1):
    print(f"\n[{i}/{len(TARGETS)}] {target}")

    try:
        model, cv_scores, feature_cols = train_catboost_with_cv(
            train_clean, target, n_splits=N_SPLITS
        )
    except Exception as e:
        # Best effort: note the failure and carry on with the next target.
        print(f"✗ Failed: {str(e)[:100]}")
        models[target] = None
        all_cv_scores[target] = []
        all_feature_cols[target] = []
    else:
        models[target] = model
        all_cv_scores[target] = cv_scores
        all_feature_cols[target] = feature_cols

elapsed = time.time() - start_time
successful = len([m for m in models.values() if m is not None])
print("\n" + "=" * 70)
print(f"✓ Training Complete: {successful}/{len(TARGETS)} models successful")
print(f"⏱ Total time: {elapsed/60:.1f} minutes")
print("=" * 70)

# =============================================================================
# FEATURE IMPORTANCE
# =============================================================================
print("\n" + "=" * 70)
print("FEATURE IMPORTANCE (Top 15 per model)")
print("=" * 70)

for target in TARGETS:
    # Targets whose training failed have no model to inspect.
    if models[target] is None:
        continue

    feature_importance = models[target].get_feature_importance()
    feature_names = all_feature_cols[target]

    # Pair every feature with its importance, most important first.
    importance_df = pd.DataFrame(
        {'feature': feature_names, 'importance': feature_importance}
    )
    importance_df = importance_df.sort_values('importance', ascending=False)

    print(f"\n{target}:")
    print(importance_df.head(15).to_string(index=False))

# =============================================================================
# MAKE PREDICTIONS
# =============================================================================
print("\n" + "=" * 70)
print("GENERATING PREDICTIONS")
print("=" * 70)

# Start the output frame from the test dates when they are available.
predictions_df = (
    test_df[[DATE_COL]].copy()
    if DATE_COL in test_df.columns
    else pd.DataFrame(index=test_df.index)
)

for target in TARGETS:
    fitted = models[target]
    if fitted is None:
        print(f"✗ {target}: No model, using zeros")
        predictions_df[target] = 0
        continue

    try:
        feature_cols = all_feature_cols[target]
        # Select the model's columns from the test frame; missing values become 0.
        X_test = test_df[feature_cols].fillna(0)

        # Non-negative constraint: negative predictions are raised to 0.
        preds = np.maximum(fitted.predict(X_test), 0)

        predictions_df[target] = preds
        print(f"✓ {target}: [{preds.min():.1f}, {preds.max():.1f}], mean={preds.mean():.1f}")

    except Exception as e:
        # Best effort: a failing target still gets a (zero) column.
        print(f"✗ {target}: Error - {str(e)[:50]}")
        predictions_df[target] = 0

# =============================================================================
# SAVE RESULTS
# =============================================================================
print("\n" + "=" * 70)
print("SAVING RESULTS")
print("=" * 70)

predictions_df.to_csv('submission.csv', index=False)
print(f"✓ Submission saved: submission.csv ({predictions_df.shape})")

# Save CV scores: one summary row per target that has at least one scored fold.
cv_summary = []
for target, scores in all_cv_scores.items():
    if not scores:
        continue
    cv_df = pd.DataFrame(scores)
    row = {'target': target}
    for metric in ('rmse', 'mae', 'r2'):
        row[f'mean_{metric}'] = cv_df[metric].mean()
        row[f'std_{metric}'] = cv_df[metric].std()
    row['avg_best_iter'] = cv_df['best_iteration'].mean()
    cv_summary.append(row)

if cv_summary:
    cv_summary_df = pd.DataFrame(cv_summary)
    cv_summary_df.to_csv('cv_scores_summary.csv', index=False)
    print(f"✓ CV scores saved: cv_scores_summary.csv")
    print(f"\n📊 FINAL RESULTS:")
    print(cv_summary_df.to_string(index=False))

# Closing banner, a recap of the run configuration and the feature policy.
print(
    "\n" + "=" * 70,
    "✅ PIPELINE COMPLETE!",
    "=" * 70,
    "\n💡 Model Configuration:",
    f"   CatBoost Iterations: {CATBOOST_PARAMS['iterations']}",
    f"   Learning Rate: {CATBOOST_PARAMS['learning_rate']}",
    f"   Depth: {CATBOOST_PARAMS['depth']}",
    "   Features: Temporal + Historical Weather + Historical Pollutants",
    f"   CV Folds: {N_SPLITS}",
    f"   Total runtime: {elapsed/60:.1f} minutes",
    "\n⚠️  IMPORTANT: NO WEATHER FORECASTS - Using ONLY features we know:",
    "   ✓ Temporal features (hour, day, holidays)",
    "   ✓ Historical weather (lag 6, 12, 24 hours + rolling stats)",
    "   ✓ Historical pollutant data (lag 6, 12, 24 hours + rolling stats)",
    "   ✗ NO current/future weather (no forecasts available!)",
    "   ✗ NO short-term lags (lag 1, 2, 3)",
    "   ✗ NO future pollutant values",
    sep="\n",
)

CATBOOST PIPELINE - NO WEATHER FORECASTS

✓ Loaded train: (40991, 213)
✓ Loaded test: (504, 208)

FEATURE CONFIGURATION - NO WEATHER FORECASTS

Features for valeur_NO2: 204 total
  - Temporal: 10
  - Weather (historical only): 159
  - Cross-pollutant (historical): 35
  - Target (historical): 7

✓ Clean train samples: 40991

TRAINING ALL POLLUTANT MODELS

[1/5] valeur_NO2

TRAINING: valeur_NO2
✓ Using 204 features
✓ Created 2 CV splits

Fold 1/2 - Train: 31769, Val: 6148 → RMSE: 6.78, MAE: 4.47, R²: 0.813, Best iter: 611

Fold 2/2 - Train: 34843, Val: 6148 → RMSE: 5.54, MAE: 3.67, R²: 0.776, Best iter: 705

📊 CV Summary: RMSE=6.16±0.88, MAE=4.07±0.57, R²=0.795±0.026
   Avg best iteration: 658
🔧 Training final model on full data... ✓ Done

[2/5] valeur_CO

TRAINING: valeur_CO
✓ Using 204 features
✓ Created 2 CV splits

Fold 1/2 - Train: 31769, Val: 6148 → RMSE: 0.05, MAE: 0.03, R²: 0.765, Best iter: 413

Fold 2/2 - Train: 34843, Val: 6148 → RMSE: 0.06, MAE: 0.02, R²: 0.627, Best iter: 37

In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import ParameterGrid
import warnings
warnings.filterwarnings('ignore')

# =============================================================================
# CONFIGURATION
# =============================================================================
TARGETS = ['valeur_NO2', 'valeur_CO', 'valeur_O3', 'valeur_PM10', 'valeur_PM25']

# Features known at prediction time (TEMPORAL ONLY for purely future dates)
TEMPORAL_FEATURES = ['hour', 'is_day', 'hour_sin', 'hour_cos', 'dow', 'dow_sin',
                     'dow_cos', 'is_holiday', 'is_weekend', 'lockdown_code']

DATE_COL = 'date'

# Cross-validation settings
N_SPLITS = 5

# Hyperparameter tuning grid (limited for speed)
CATBOOST_PARAM_GRID = {
    'iterations': [500, 800],
    'learning_rate': [0.03, 0.05],
    'depth': [5, 6],
    'l2_leaf_reg': [2, 3]
}

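# Default CatBoost parameters. NOTE: nothing in this cell reads or updates this
# dict; tuning and final training build their own parameter dicts instead.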
CATBOOST_BEST_PARAMS = {
    'iterations': 500,
    'learning_rate': 0.05,
    'depth': 6,
    'l2_leaf_reg': 3,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_seed': 42,
    'verbose': False,
    'early_stopping_rounds': 50,
    'task_type': 'CPU',
    'thread_count': -1
}

# =============================================================================
# LOAD AND PREPARE DATA
# =============================================================================
print("="*70)
print("CATBOOST PIPELINE - PURELY FUTURE FORECASTING")
print("="*70)
print("\n⚠️  Approach: TEMPORAL FEATURES ONLY (no lags, purely future dates)")

train_df = pd.read_csv("../data/train_features.csv")
test_df = pd.read_csv("../data/test_features_to_predict.csv")

print(f"\n✓ Loaded train: {train_df.shape}")
print(f"✓ Loaded test: {test_df.shape}")

# Convert and sort by date
if DATE_COL in train_df.columns:
    train_df[DATE_COL] = pd.to_datetime(train_df[DATE_COL])
    train_df = train_df.sort_values(DATE_COL).reset_index(drop=True)

if DATE_COL in test_df.columns:
    test_df[DATE_COL] = pd.to_datetime(test_df[DATE_COL])
    test_df = test_df.sort_values(DATE_COL).reset_index(drop=True)

# =============================================================================
# FEATURE PREPARATION - TEMPORAL ONLY
# =============================================================================
def prepare_features_for_target():
    """
    Return the feature names used for PURELY FUTURE predictions.

    Only temporal features are used. No lags are available because:
    - No historical pollution data for future dates
    - No weather data (forecasts not included)

    What we CAN use:
    - Time-based patterns (hour, day of week, holidays, etc.)
    - Model learns from historical temporal patterns in training

    Returns:
        list: A fresh copy of TEMPORAL_FEATURES, so callers can modify the
        result without altering the module-level constant that training and
        prediction index with.
    """
    return list(TEMPORAL_FEATURES)

# Display features
print(f"\n{'='*70}")
print(f"FEATURE CONFIGURATION - PURELY FUTURE FORECASTING")
print(f"{'='*70}")

features = prepare_features_for_target()
print(f"\nFeatures for prediction: {len(features)}")
for i, f in enumerate(features, 1):
    print(f"  {i}. {f}")

# Clean training data - remove rows with missing temporal features
train_clean = train_df[TEMPORAL_FEATURES + TARGETS].dropna().reset_index(drop=True)
print(f"\n✓ Clean train samples: {len(train_clean)}")

# =============================================================================
# TIME SERIES CROSS-VALIDATION (5 SPLITS)
# =============================================================================
def time_series_cv_split(df, n_splits=5):
    """Build (train_index, validation_index) pairs for time-ordered CV.

    Rows are assumed to already be in chronological order. Each fold validates
    on a window of up to ~10% of the rows; consecutive windows are shifted by
    half a window, and the last one ends at the final row. Training uses every
    row before the validation window and never fewer than 40% of the rows.
    Folds that would leave an empty validation window are dropped, so fewer
    than ``n_splits`` pairs may be returned.
    """
    total_rows = len(df)
    train_floor = int(total_rows * 0.4)   # earliest allowed validation start
    window = int(total_rows * 0.1)        # nominal validation size per fold
    stride = window // 2                  # shift between consecutive windows

    folds = []
    for fold_no in range(n_splits):
        stop = total_rows - stride * (n_splits - fold_no - 1)
        # Clamping keeps the training part at or above the 40% floor; the
        # validation window shrinks (possibly to nothing) when it would not fit.
        start = max(train_floor, stop - window)
        if start >= stop:
            continue
        folds.append((df.index[:start], df.index[start:stop]))

    return folds

# =============================================================================
# HYPERPARAMETER TUNING
# =============================================================================
def tune_hyperparameters(df, target_col, param_grid, cv_splits):
    """
    Grid-search CatBoost hyperparameters using the supplied CV folds.

    Every combination in ``param_grid`` is fitted once per fold on the
    TEMPORAL_FEATURES columns, with the validation fold used both for early
    stopping and for scoring. The combination with the lowest mean RMSE wins.

    Args:
        df: DataFrame containing TEMPORAL_FEATURES and ``target_col``.
        target_col: Name of the column to predict.
        param_grid: Dict accepted by ``ParameterGrid``; each combination must
            provide 'iterations', 'learning_rate', 'depth' and 'l2_leaf_reg'.
        cv_splits: Iterable of (train_index, validation_index) label pairs.

    Returns:
        (best_params, best_rmse, best_cv_scores). ``best_params`` is None and
        ``best_cv_scores`` is empty when no combination produced a finite mean
        RMSE (for example when a fold failed for every combination).
    """
    print(f"\n{'='*70}")
    print(f"HYPERPARAMETER TUNING: {target_col}")
    print(f"{'='*70}")

    # Settings shared by every candidate model; grid values are layered on top.
    fixed_params = {
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'random_seed': 42,
        'verbose': False,
        'early_stopping_rounds': 30,
        'task_type': 'CPU',
        'thread_count': -1
    }

    best_rmse = float('inf')
    best_params = None
    best_cv_scores = []

    param_combinations = list(ParameterGrid(param_grid))
    print(f"Testing {len(param_combinations)} parameter combinations with {len(cv_splits)} CV folds\n")

    for combo_idx, params in enumerate(param_combinations, 1):
        # Built once per combination instead of once per fold.
        model_params = {
            'iterations': params['iterations'],
            'learning_rate': params['learning_rate'],
            'depth': params['depth'],
            'l2_leaf_reg': params['l2_leaf_reg'],
            **fixed_params
        }
        fold_rmses = []

        for fold, (train_idx, val_idx) in enumerate(cv_splits, 1):
            try:
                train_data = df.loc[train_idx]
                val_data = df.loc[val_idx]

                X_train = train_data[TEMPORAL_FEATURES]
                y_train = train_data[target_col]
                X_val = val_data[TEMPORAL_FEATURES]
                y_val = val_data[target_col]

                train_pool = Pool(X_train, y_train)
                val_pool = Pool(X_val, y_val)

                model = CatBoostRegressor(**model_params)
                model.fit(train_pool, eval_set=val_pool, use_best_model=True, plot=False)

                # Predictions are clipped at zero before scoring.
                predictions = model.predict(X_val)
                predictions = np.maximum(predictions, 0)

                rmse = np.sqrt(mean_squared_error(y_val, predictions))
                fold_rmses.append(rmse)

            except Exception as e:
                # Best-effort search: a failing fold disqualifies this
                # combination (infinite score) but must not pass unnoticed.
                print(f"    ✗ Fold {fold} failed: {str(e)[:100]}")
                fold_rmses.append(float('inf'))

        mean_rmse = np.mean(fold_rmses)

        print(f"  [{combo_idx}/{len(param_combinations)}] "
              f"iter={params['iterations']:>3} lr={params['learning_rate']:.3f} "
              f"depth={params['depth']} l2={params['l2_leaf_reg']} → "
              f"CV RMSE: {mean_rmse:.2f}")

        if mean_rmse < best_rmse:
            best_rmse = mean_rmse
            best_params = params
            best_cv_scores = fold_rmses

    if best_params is None:
        print("\n✗ No parameter combination produced a finite CV RMSE")

    print(f"\n✓ Best params found: {best_params}")
    print(f"  Best CV RMSE: {best_rmse:.2f} ± {np.std(best_cv_scores):.2f}")

    return best_params, best_rmse, best_cv_scores

# =============================================================================
# TRAIN FINAL MODELS WITH BEST PARAMS
# =============================================================================
def train_final_model(df, target_col, best_params, cv_splits):
    """Report per-fold CV metrics for ``best_params``, then fit on all rows.

    Args:
        df: DataFrame containing TEMPORAL_FEATURES and ``target_col``.
        target_col: Name of the column to predict.
        best_params: Mapping with 'iterations', 'learning_rate', 'depth' and
            'l2_leaf_reg'.
        cv_splits: Iterable of (train_index, validation_index) label pairs.

    Returns:
        (final_model, cv_scores): the model fitted on every row of ``df`` and
        a list of per-fold dicts with keys 'fold', 'rmse', 'mae' and 'r2'.
    """
    def build_params(early_stopping_rounds=None):
        # One place for the CatBoost settings; only the fold models stop early.
        settings = {
            'iterations': best_params['iterations'],
            'learning_rate': best_params['learning_rate'],
            'depth': best_params['depth'],
            'l2_leaf_reg': best_params['l2_leaf_reg'],
            'loss_function': 'RMSE',
            'eval_metric': 'RMSE',
            'random_seed': 42,
            'verbose': False,
            'task_type': 'CPU',
            'thread_count': -1
        }
        if early_stopping_rounds is not None:
            settings['early_stopping_rounds'] = early_stopping_rounds
        return settings

    print(f"\n{'='*70}")
    print(f"TRAINING FINAL MODEL: {target_col}")
    print(f"{'='*70}")

    fold_metrics = []
    n_folds = len(cv_splits)

    for fold, (train_idx, val_idx) in enumerate(cv_splits, 1):
        print(f"\nFold {fold}/{n_folds}", end=" ")

        fit_rows = df.loc[train_idx]
        holdout_rows = df.loc[val_idx]
        X_val = holdout_rows[TEMPORAL_FEATURES]
        y_val = holdout_rows[target_col]

        fold_model = CatBoostRegressor(**build_params(early_stopping_rounds=30))
        fold_model.fit(
            Pool(fit_rows[TEMPORAL_FEATURES], fit_rows[target_col]),
            eval_set=Pool(X_val, y_val),
            use_best_model=True,
            plot=False,
        )

        # Predictions are clipped at zero before scoring.
        predictions = np.maximum(fold_model.predict(X_val), 0)

        rmse = np.sqrt(mean_squared_error(y_val, predictions))
        mae = mean_absolute_error(y_val, predictions)
        r2 = r2_score(y_val, predictions)

        fold_metrics.append({'fold': fold, 'rmse': rmse, 'mae': mae, 'r2': r2})
        print(f"→ RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.3f}")

    # Refit on every row; no eval set, so no early stopping here.
    print(f"\n🔧 Training final model on full dataset...", end=" ")
    all_rows_pool = Pool(df[TEMPORAL_FEATURES], df[target_col])
    final_model = CatBoostRegressor(**build_params())
    final_model.fit(all_rows_pool, plot=False)

    print(f"✓ Done")

    return final_model, fold_metrics

# =============================================================================
# MAIN PIPELINE
# =============================================================================
print(f"\n{'='*70}")
print("SETTING UP CROSS-VALIDATION")
print(f"{'='*70}")

# Build the folds once; the same splits are passed to both tuning and the
# final-model CV report for every target.
cv_splits = time_series_cv_split(train_clean, n_splits=N_SPLITS)
print(f"✓ Created {len(cv_splits)} time series CV splits")

# Per-target results, keyed by target column name.
models = {}            # fitted final model, or None if the target failed
all_best_params = {}   # winning grid combination, or None if the target failed
all_cv_scores = {}     # list of per-fold metric dicts, or [] if the target failed

import time
start_time = time.time()

for i, target in enumerate(TARGETS, 1):
    print(f"\n\n{'#'*70}")
    print(f"[{i}/{len(TARGETS)}] {target}")
    print(f"{'#'*70}")

    try:
        # Hyperparameter tuning (only the best parameter set is kept; the
        # tuning scores returned alongside it are discarded)
        best_params, _, _ = tune_hyperparameters(
            train_clean,
            target,
            CATBOOST_PARAM_GRID,
            cv_splits
        )
        all_best_params[target] = best_params

        # Train final model with best params
        model, cv_scores = train_final_model(
            train_clean,
            target,
            best_params,
            cv_splits
        )
        models[target] = model
        all_cv_scores[target] = cv_scores

        # Print mean ± std of each metric across folds
        cv_df = pd.DataFrame(cv_scores)
        print(f"\n📊 Final CV Summary:")
        print(f"   RMSE: {cv_df['rmse'].mean():.2f} ± {cv_df['rmse'].std():.2f}")
        print(f"   MAE:  {cv_df['mae'].mean():.2f} ± {cv_df['mae'].std():.2f}")
        print(f"   R²:   {cv_df['r2'].mean():.3f} ± {cv_df['r2'].std():.3f}")

    except Exception as e:
        # A failure for one target is reported (message truncated to 100
        # characters) and recorded as empty results so the remaining targets
        # still run.
        print(f"\n✗ Failed: {str(e)[:100]}")
        models[target] = None
        all_best_params[target] = None
        all_cv_scores[target] = []

elapsed = time.time() - start_time
# A target counts as successful when a final model was stored for it.
successful = sum(1 for m in models.values() if m is not None)

print(f"\n\n{'='*70}")
print(f"✓ Training Complete: {successful}/{len(TARGETS)} models successful")
print(f"⏱ Total time: {elapsed/60:.1f} minutes")
print(f"{'='*70}")

# =============================================================================
# FEATURE IMPORTANCE
# =============================================================================
print(f"\n{'='*70}")
print("FEATURE IMPORTANCE")
print(f"{'='*70}")

for target in TARGETS:
    if models[target] is not None:
        feature_importance = models[target].get_feature_importance()

        importance_df = pd.DataFrame({
            'feature': TEMPORAL_FEATURES,
            'importance': feature_importance
        }).sort_values('importance', ascending=False)

        print(f"\n{target}:")
        print(importance_df.to_string(index=False))

# =============================================================================
# PREDICTIONS ON TEST SET
# =============================================================================
print(f"\n{'='*70}")
print("GENERATING PREDICTIONS")
print(f"{'='*70}")

# Start the submission frame from the test dates when available; otherwise
# use an empty frame that shares the test index.
if DATE_COL in test_df.columns:
    predictions_df = test_df[[DATE_COL]].copy()
else:
    predictions_df = pd.DataFrame(index=test_df.index)

# Missing temporal features in the test set are replaced with 0 (the training
# data instead had such rows dropped).
X_test = test_df[TEMPORAL_FEATURES].fillna(0)

for target in TARGETS:
    # Targets without a trained model get an all-zero column so the
    # submission keeps one column per target.
    if models[target] is None:
        print(f"✗ {target}: No model")
        predictions_df[target] = 0
        continue

    try:
        preds = models[target].predict(X_test)
        # Clip negative predictions to zero, as done during validation.
        preds = np.maximum(preds, 0)
        predictions_df[target] = preds
        print(f"✓ {target}: [{preds.min():.1f}, {preds.max():.1f}], "
              f"mean={preds.mean():.1f}, median={np.median(preds):.1f}")

    except Exception as e:
        # Prediction failures also fall back to an all-zero column; the
        # message is truncated to 50 characters.
        print(f"✗ {target}: {str(e)[:50]}")
        predictions_df[target] = 0

# =============================================================================
# SAVE RESULTS
# =============================================================================
print(f"\n{'='*70}")
print("SAVING RESULTS")
print(f"{'='*70}")

predictions_df.to_csv('submission.csv', index=False)
print(f"✓ Submission saved: submission.csv ({predictions_df.shape})")

# Save CV scores summary
cv_summary = []
for target, scores in all_cv_scores.items():
    if scores:
        cv_df = pd.DataFrame(scores)
        cv_summary.append({
            'target': target,
            'mean_rmse': cv_df['rmse'].mean(),
            'std_rmse': cv_df['rmse'].std(),
            'mean_mae': cv_df['mae'].mean(),
            'std_mae': cv_df['mae'].std(),
            'mean_r2': cv_df['r2'].mean(),
            'std_r2': cv_df['r2'].std()
        })

if cv_summary:
    cv_summary_df = pd.DataFrame(cv_summary)
    cv_summary_df.to_csv('cv_scores_summary.csv', index=False)
    print(f"✓ CV scores saved: cv_scores_summary.csv\n")
    print(cv_summary_df.to_string(index=False))

# Save best hyperparameters
params_summary = []
for target, params in all_best_params.items():
    if params:
        params_summary.append({
            'target': target,
            'iterations': params['iterations'],
            'learning_rate': params['learning_rate'],
            'depth': params['depth'],
            'l2_leaf_reg': params['l2_leaf_reg']
        })

if params_summary:
    params_df = pd.DataFrame(params_summary)
    params_df.to_csv('best_hyperparameters.csv', index=False)
    print(f"✓ Best hyperparameters saved: best_hyperparameters.csv\n")
    print(params_df.to_string(index=False))

print(f"\n{'='*70}")
print("✅ PIPELINE COMPLETE!")
print(f"{'='*70}")
print(f"\n📋 Summary:")
print(f"   Models: {successful}/{len(TARGETS)} successful")
print(f"   CV Folds: {N_SPLITS}")
print(f"   Hyperparameter tuning: {len(list(ParameterGrid(CATBOOST_PARAM_GRID)))} combinations")
print(f"   Features: {len(TEMPORAL_FEATURES)} temporal features")
print(f"   Runtime: {elapsed/60:.1f} minutes")
print(f"\n⚠️  APPROACH:")
print(f"   ✓ Temporal features only (hour, dow, holidays, etc.)")
print(f"   ✓ {N_SPLITS}-fold time series CV")
print(f"   ✓ Hyperparameter tuning with limited grid")
print(f"   ✗ No lags (not available for future dates)")
print(f"   ✗ No weather data (not available for future dates)")
print(f"\n💡 Model learns temporal patterns from historical data")

CATBOOST PIPELINE - PURELY FUTURE FORECASTING

⚠️  Approach: TEMPORAL FEATURES ONLY (no lags, purely future dates)

✓ Loaded train: (40991, 213)
✓ Loaded test: (504, 208)

FEATURE CONFIGURATION - PURELY FUTURE FORECASTING

Features for prediction: 10
  1. hour
  2. is_day
  3. hour_sin
  4. hour_cos
  5. dow
  6. dow_sin
  7. dow_cos
  8. is_holiday
  9. is_weekend
  10. lockdown_code

✓ Clean train samples: 40991

SETTING UP CROSS-VALIDATION
✓ Created 5 time series CV splits


######################################################################
[1/5] valeur_NO2
######################################################################

HYPERPARAMETER TUNING: valeur_NO2
Testing 16 parameter combinations with 5 CV folds

  [1/16] iter=500 lr=0.030 depth=5 l2=2 → CV RMSE: 13.70
  [2/16] iter=500 lr=0.050 depth=5 l2=2 → CV RMSE: 13.69
  [3/16] iter=500 lr=0.030 depth=5 l2=3 → CV RMSE: 13.70
  [4/16] iter=500 lr=0.050 depth=5 l2=3 → CV RMSE: 13.69
  [5/16] iter=800 lr=0.030 depth=5 l2=2 → CV

In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# =============================================================================
# CONFIGURATION
# =============================================================================
TARGETS = ['valeur_NO2', 'valeur_CO', 'valeur_O3', 'valeur_PM10', 'valeur_PM25']

TEMPORAL_FEATURES = ['hour', 'is_day', 'hour_sin', 'hour_cos', 'dow', 'dow_sin',
                     'dow_cos', 'is_holiday', 'is_weekend', 'lockdown_code']

WEATHER_BASE = [
    'temperature_2m', 'relative_humidity_2m', 'dew_point_2m', 'apparent_temperature',
    'pressure_msl', 'wind_speed_10m', 'wind_direction_10m', 'precipitation',
    'cloud_cover', 'cloud_cover_low', 'cloud_cover_mid', 'cloud_cover_high',
    'shortwave_radiation', 'direct_radiation', 'diffuse_radiation',
    'global_tilted_irradiance', 'wind_gusts_10m', 'vapour_pressure_deficit',
    'et0_fao_evapotranspiration', 'snowfall', 'rain', 'showers', 'weather_code'
]

DATE_COL = 'date'

N_SPLITS = 2
TEST_SIZE_RATIO = 0.15

CATBOOST_PARAMS = {
    'iterations': 1000,
    'learning_rate': 0.05,
    'depth': 6,
    'l2_leaf_reg': 3,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_seed': 42,
    'verbose': False,
    'early_stopping_rounds': 50,
    'task_type': 'CPU',
    'thread_count': -1
}

# Autoregressive settings
FORECAST_HORIZON = 24  # Hours ahead to forecast
LOOKBACK_WINDOW = 24   # Hours of history to use

# =============================================================================
# LOAD AND PREPARE DATA
# =============================================================================
print("="*70)
print("CATBOOST AUTOREGRESSIVE PIPELINE - FUTURE PREDICTIONS")
print("="*70)

train_df = pd.read_csv("../data/train_features.csv")
test_df = pd.read_csv("../data/test_features_to_predict.csv")

print(f"\n✓ Loaded train: {train_df.shape}")
print(f"✓ Loaded test: {test_df.shape}")

if DATE_COL in train_df.columns:
    train_df[DATE_COL] = pd.to_datetime(train_df[DATE_COL])
    train_df = train_df.sort_values(DATE_COL).reset_index(drop=True)

if DATE_COL in test_df.columns:
    test_df[DATE_COL] = pd.to_datetime(test_df[DATE_COL])
    test_df = test_df.sort_values(DATE_COL).reset_index(drop=True)

# =============================================================================
# AUTOREGRESSIVE FEATURE PREPARATION
# =============================================================================
def get_autoregressive_features():
    """
    List the feature names requested for autoregressive prediction.

    The result is ordered as:
    - Temporal features (we know these at prediction time)
    - Current weather (assuming we have forecast or current data)
    - Lags 1, 2 and 3 of every pollutant in TARGETS, grouped per pollutant
    """
    # Short lags only: these are the ones a step-ahead forecast can roll forward.
    pollutant_lags = [
        f'{pollutant}_lag_{steps_back}'
        for pollutant in TARGETS
        for steps_back in (1, 2, 3)
    ]

    return [*TEMPORAL_FEATURES, *WEATHER_BASE, *pollutant_lags]

def prepare_autoregressive_data(df):
    """Return the requested autoregressive features that exist in ``df``.

    Requested features absent from ``df`` cannot be used and are left out.
    Because dropping them changes what the model actually is (without the lag
    columns it is no longer autoregressive), the omission is reported rather
    than applied silently. ``print`` is used because this file suppresses all
    warnings globally.

    Args:
        df: DataFrame whose columns are checked for the requested features.

    Returns:
        list: Feature names present in ``df``, in the requested order.
    """
    features = get_autoregressive_features()
    available_features = [f for f in features if f in df.columns]
    missing_features = [f for f in features if f not in df.columns]

    if missing_features:
        preview = ', '.join(missing_features[:5])
        suffix = ', ...' if len(missing_features) > 5 else ''
        print(f"⚠️  {len(missing_features)}/{len(features)} requested features "
              f"are missing from the data and will be ignored: {preview}{suffix}")

    return available_features

# Get feature list
feature_cols = prepare_autoregressive_data(train_df)
print(f"\n✓ Autoregressive features: {len(feature_cols)}")
print(f"  - Temporal: {len([f for f in feature_cols if f in TEMPORAL_FEATURES])}")
print(f"  - Weather (current): {len([f for f in feature_cols if any(w in f for w in WEATHER_BASE)])}")
print(f"  - Pollutant lags: {len([f for f in feature_cols if 'valeur_' in f and '_lag_' in f])}")

# Clean training data
train_clean = train_df[feature_cols + TARGETS].dropna().reset_index(drop=True)
print(f"\n✓ Clean train samples: {len(train_clean)}")

# =============================================================================
# TRAIN MODELS
# =============================================================================
def train_models(df, feature_cols, n_splits=2):
    """Fit one CatBoost model per target, reporting time-ordered CV metrics.

    Validation windows hold TEST_SIZE_RATIO of the rows and are shifted by
    half a window, the last one ending at the final row; a fold is skipped
    when it would leave less than 60% of the rows for training. After the
    folds, each target's model is refitted on every row of ``df``.

    Args:
        df: DataFrame containing ``feature_cols`` and every column in TARGETS.
        feature_cols: Names of the model input columns.
        n_splits: Number of validation windows to attempt.

    Returns:
        (models, cv_scores_all): dicts keyed by target with the final fitted
        model and the list of per-fold metric dicts respectively.
    """
    print(f"\n{'='*70}")
    print("TRAINING AUTOREGRESSIVE MODELS")
    print(f"{'='*70}")

    row_count = len(df)
    holdout_size = int(row_count * TEST_SIZE_RATIO)
    train_floor = int(row_count * 0.6)
    stride = holdout_size // 2

    folds = []
    for k in range(n_splits):
        stop = row_count - (n_splits - k - 1) * stride
        start = stop - holdout_size
        # Keep only folds that leave enough history to train on.
        if start >= train_floor:
            folds.append((df.index[:start], df.index[start:stop]))

    fitted_models = {}
    scores_by_target = {}

    for target in TARGETS:
        print(f"\n{'='*70}")
        print(f"Training: {target}")
        print(f"{'='*70}")

        fold_scores = []

        for fold, (train_idx, val_idx) in enumerate(folds, 1):
            print(f"Fold {fold}/{len(folds)} - Train: {len(train_idx)}, Val: {len(val_idx)}", end=" ")

            X_fit = df.loc[train_idx, feature_cols].fillna(0)
            y_fit = df.loc[train_idx, target]
            X_holdout = df.loc[val_idx, feature_cols].fillna(0)
            y_holdout = df.loc[val_idx, target]

            try:
                fold_model = CatBoostRegressor(**CATBOOST_PARAMS)
                fold_model.fit(
                    Pool(X_fit, y_fit),
                    eval_set=Pool(X_holdout, y_holdout),
                    use_best_model=True,
                    plot=False,
                )

                # Predictions are clipped at zero before scoring.
                clipped = np.maximum(fold_model.predict(X_holdout), 0)
                rmse = np.sqrt(mean_squared_error(y_holdout, clipped))
                mae = mean_absolute_error(y_holdout, clipped)
                r2 = r2_score(y_holdout, clipped)

                fold_scores.append({
                    'fold': fold,
                    'rmse': rmse,
                    'mae': mae,
                    'r2': r2,
                    'best_iteration': fold_model.get_best_iteration()
                })

                print(f"→ RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.3f}")

            except Exception as e:
                # A failed fold is reported and simply contributes no score.
                print(f"✗ Error: {str(e)[:50]}")

        # Refit on every row, without an evaluation set.
        print(f"Training final model...", end=" ")
        final_model = CatBoostRegressor(**CATBOOST_PARAMS)
        final_model.fit(Pool(df[feature_cols].fillna(0), df[target]), plot=False)

        fitted_models[target] = final_model
        scores_by_target[target] = fold_scores

        if fold_scores:
            cv_df = pd.DataFrame(fold_scores)
            print(f"✓ CV: RMSE={cv_df['rmse'].mean():.2f}±{cv_df['rmse'].std():.2f}")

    return fitted_models, scores_by_target

models, cv_scores_all = train_models(train_clean, feature_cols)

print(f"\n{'='*70}")
print(f"✓ Training Complete")
print(f"{'='*70}")

# =============================================================================
# AUTOREGRESSIVE FORECASTING
# =============================================================================
def _advance_autoregressive_state(state, latest_values):
    """Return a copy of the one-row ``state`` moved one hour forward.

    Args:
        state: Single-row DataFrame holding the model features.
        latest_values: Mapping of target name to the value that becomes that
            target's ``<target>_lag_1`` in the advanced state.

    Returns:
        A new single-row DataFrame; ``state`` itself is not modified.
    """
    advanced = state.copy()

    for target in TARGETS:
        lag_1, lag_2, lag_3 = (f'{target}_lag_{k}' for k in (1, 2, 3))
        # Shift oldest-first so each value is copied before it is overwritten.
        # A lag is only shifted when its source column exists as well.
        if lag_3 in advanced.columns and lag_2 in advanced.columns:
            advanced[lag_3] = advanced[lag_2].values[0]
        if lag_2 in advanced.columns and lag_1 in advanced.columns:
            advanced[lag_2] = advanced[lag_1].values[0]
        if lag_1 in advanced.columns:
            advanced[lag_1] = latest_values[target]

    if DATE_COL in advanced.columns:
        advanced[DATE_COL] = advanced[DATE_COL] + pd.Timedelta(hours=1)

    # Simplified temporal update - adjust to match the real feature engineering.
    if 'hour' in advanced.columns:
        next_hour = (advanced['hour'].values[0] + 1) % 24
        advanced['hour'] = next_hour
        if 'hour_sin' in advanced.columns:
            advanced['hour_sin'] = np.sin(2 * np.pi * next_hour / 24)
        if 'hour_cos' in advanced.columns:
            advanced['hour_cos'] = np.cos(2 * np.pi * next_hour / 24)

        # Crossing midnight starts a new day, so the day-of-week features
        # must move too.
        # NOTE(review): assumes dow is 0-6 with Monday=0 (pandas dayofweek),
        # a period-7 sin/cos encoding and weekend = dow >= 5 - confirm
        # against the feature engineering that produced the training data.
        if next_hour == 0 and 'dow' in advanced.columns:
            next_dow = (advanced['dow'].values[0] + 1) % 7
            advanced['dow'] = next_dow
            if 'dow_sin' in advanced.columns:
                advanced['dow_sin'] = np.sin(2 * np.pi * next_dow / 7)
            if 'dow_cos' in advanced.columns:
                advanced['dow_cos'] = np.cos(2 * np.pi * next_dow / 7)
            if 'is_weekend' in advanced.columns:
                advanced['is_weekend'] = int(next_dow >= 5)

    return advanced


def autoregressive_forecast(df_init, models, feature_cols, horizon=24):
    """
    Forecast multiple steps ahead using autoregressive approach.

    The last row of ``df_init`` is the most recent observation. The state is
    first advanced one hour past it, feeding the observed target values into
    the lag features, so that step 1 is the first hour after the data rather
    than a re-prediction of the last observed row. Then, for each step:
    1. Use current features (temporal + weather)
    2. Predict all targets for this step (clipped at zero)
    3. Feed the predictions into the lag features and advance one hour

    Any feature not handled by the state update (for example weather,
    is_day, is_holiday) keeps the value it had in the last row of ``df_init``.

    Args:
        df_init: DataFrame with at least one row containing ``feature_cols``.
        models: Dict mapping each target to a fitted model, or None.
        feature_cols: Names of the model input columns.
        horizon: Number of hourly steps to forecast.

    Returns:
        DataFrame with a 'step' column (1..horizon) and one column per target.

    Raises:
        ValueError: If ``df_init`` has no rows.
    """
    print(f"\n{'='*70}")
    print(f"AUTOREGRESSIVE FORECASTING - {horizon} steps ahead")
    print(f"{'='*70}")

    if len(df_init) == 0:
        raise ValueError("df_init must contain at least one row to seed the forecast")

    def predict_targets(state):
        # One non-negative prediction per target; targets without a model get 0.
        X_current = state[feature_cols].fillna(0)
        return {
            target: (np.maximum(models[target].predict(X_current)[0], 0)
                     if models[target] is not None else 0)
            for target in TARGETS
        }

    last_observed = df_init.iloc[-1:].copy()

    # Seed the lags with what was actually observed; fall back to the model's
    # own estimate only for targets whose column is absent from df_init.
    fallback = None
    seed_values = {}
    for target in TARGETS:
        if target in last_observed.columns:
            seed_values[target] = last_observed[target].values[0]
        else:
            if fallback is None:
                fallback = predict_targets(last_observed)
            seed_values[target] = fallback[target]

    current_state = _advance_autoregressive_state(last_observed, seed_values)
    all_predictions = []

    for step in range(horizon):
        preds = predict_targets(current_state)
        step_preds = {'step': step + 1, **preds}
        all_predictions.append(step_preds)

        # Previous outputs become the inputs of the next step.
        current_state = _advance_autoregressive_state(current_state, preds)

        if (step + 1) % 6 == 0 or step == 0:
            print(f"Step {step + 1}/{horizon} - Predictions: "
                  f"NO2={step_preds['valeur_NO2']:.1f}, CO={step_preds['valeur_CO']:.1f}, "
                  f"O3={step_preds['valeur_O3']:.1f}, PM10={step_preds['valeur_PM10']:.1f}, "
                  f"PM25={step_preds['valeur_PM25']:.1f}")

    return pd.DataFrame(all_predictions)

# Generate a forecast that continues from the last row of the cleaned training data
# (test_df is not used here)
forecast_df = autoregressive_forecast(train_clean, models, feature_cols, horizon=FORECAST_HORIZON)

print(f"\n{'='*70}")
print("FORECAST SUMMARY")
print(f"{'='*70}")
print(forecast_df.describe())

# =============================================================================
# SAVE RESULTS
# =============================================================================
print(f"\n{'='*70}")
print("SAVING RESULTS")
print(f"{'='*70}")

forecast_df.to_csv('autoregressive_forecast.csv', index=False)
print(f"✓ Forecast saved: autoregressive_forecast.csv ({forecast_df.shape})")

# Save CV summary
cv_summary = []
for target, scores in cv_scores_all.items():
    if scores:
        cv_df = pd.DataFrame(scores)
        cv_summary.append({
            'target': target,
            'mean_rmse': cv_df['rmse'].mean(),
            'std_rmse': cv_df['rmse'].std(),
            'mean_mae': cv_df['mae'].mean(),
            'std_mae': cv_df['mae'].std(),
            'mean_r2': cv_df['r2'].mean(),
            'std_r2': cv_df['r2'].std()
        })

if cv_summary:
    cv_summary_df = pd.DataFrame(cv_summary)
    cv_summary_df.to_csv('cv_scores_autoregressive.csv', index=False)
    print(f"✓ CV scores saved: cv_scores_autoregressive.csv")
    print(f"\n📊 VALIDATION RESULTS:")
    print(cv_summary_df.to_string(index=False))

print(f"\n{'='*70}")
print("✅ AUTOREGRESSIVE FORECASTING COMPLETE!")
print(f"{'='*70}")
print(f"\n💡 Configuration:")
print(f"   Forecast Horizon: {FORECAST_HORIZON} hours")
print(f"   Features: Temporal + Current Weather + Recent Pollutant Lags (1-3)")
print(f"   Approach: Step-by-step predictions using previous outputs as inputs")
print(f"   Predictions: {forecast_df.shape[0]} steps × {len(TARGETS)} pollutants")
print(f"\n⚠️  NOTES:")
print(f"   ✓ Can forecast multiple hours ahead")
print(f"   ✓ Uses temporal continuity through lag features")
print(f"   ✗ Error accumulates over longer horizons (24+ hours may degrade)")
print(f"   ✗ Depends on accurate weather data for each step ahead")

CATBOOST AUTOREGRESSIVE PIPELINE - FUTURE PREDICTIONS

✓ Loaded train: (40991, 213)
✓ Loaded test: (504, 208)

✓ Autoregressive features: 10
  - Temporal: 10
  - Weather (current): 0
  - Pollutant lags: 0

✓ Clean train samples: 40991

TRAINING AUTOREGRESSIVE MODELS

Training: valeur_NO2
Fold 1/2 - Train: 31769, Val: 6148 → RMSE: 14.73, MAE: 10.50, R²: 0.117
Fold 2/2 - Train: 34843, Val: 6148 → RMSE: 12.31, MAE: 10.27, R²: -0.105
Training final model... ✓ CV: RMSE=13.52±1.71

Training: valeur_CO
Fold 1/2 - Train: 31769, Val: 6148 → RMSE: 0.11, MAE: 0.06, R²: 0.010
Fold 2/2 - Train: 34843, Val: 6148 → RMSE: 0.10, MAE: 0.06, R²: -0.024
Training final model... ✓ CV: RMSE=0.10±0.01

Training: valeur_O3
Fold 1/2 - Train: 31769, Val: 6148 → RMSE: 21.93, MAE: 17.40, R²: 0.086
Fold 2/2 - Train: 34843, Val: 6148 → RMSE: 20.50, MAE: 16.19, R²: 0.131
Training final model... ✓ CV: RMSE=21.21±1.01

Training: valeur_PM10
Fold 1/2 - Train: 31769, Val: 6148 → RMSE: 8.92, MAE: 7.15, R²: -0.067
Fold 2/2

In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# =============================================================================
# CONFIGURATION
# =============================================================================
TARGETS = ['valeur_NO2', 'valeur_CO', 'valeur_O3', 'valeur_PM10', 'valeur_PM25']

TEMPORAL_FEATURES = ['hour', 'is_day', 'hour_sin', 'hour_cos', 'dow', 'dow_sin',
                     'dow_cos', 'is_holiday', 'is_weekend', 'lockdown_code']

# Weather features that are typically available in forecasts
WEATHER_FORECAST = [
    'temperature_2m', 'relative_humidity_2m', 'dew_point_2m', 'apparent_temperature',
    'pressure_msl', 'wind_speed_10m', 'wind_direction_10m', 'precipitation',
    'cloud_cover', 'shortwave_radiation', 'wind_gusts_10m',
    'rain', 'weather_code'
]

# Additional derived weather features for air quality
WEATHER_DERIVED = [
    'wind_speed_squared',  # Wind intensity for dispersion
    'temp_humidity_interaction',  # Temperature-humidity interaction
    'is_calm_wind',  # Low wind = poor dispersion
    'is_precipitation'  # Rain = washout effect
]

DATE_COL = 'date'
N_SPLITS = 3
TEST_SIZE_RATIO = 0.15

CATBOOST_PARAMS = {
    'iterations': 1500,
    'learning_rate': 0.03,
    'depth': 7,
    'l2_leaf_reg': 3,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_seed': 42,
    'verbose': False,
    'early_stopping_rounds': 100,
    'task_type': 'CPU',
    'thread_count': -1
}

FORECAST_HORIZON = 24

# =============================================================================
# LOAD AND PREPARE DATA
# =============================================================================
print("="*70)
print("IMPROVED AUTOREGRESSIVE PIPELINE - WITH WEATHER & LAGS")
print("="*70)

train_df = pd.read_csv("../data/train_features.csv")
test_df = pd.read_csv("../data/test_features_to_predict.csv")

print(f"\n✓ Loaded train: {train_df.shape}")
print(f"✓ Loaded test: {test_df.shape}")

if DATE_COL in train_df.columns:
    train_df[DATE_COL] = pd.to_datetime(train_df[DATE_COL])
    train_df = train_df.sort_values(DATE_COL).reset_index(drop=True)

if DATE_COL in test_df.columns:
    test_df[DATE_COL] = pd.to_datetime(test_df[DATE_COL])
    test_df = test_df.sort_values(DATE_COL).reset_index(drop=True)

# =============================================================================
# FEATURE ENGINEERING
# =============================================================================
def engineer_weather_features(df):
    """Return a copy of ``df`` with derived weather columns appended.

    Each derived column is only added when its raw source column exists:

    * ``wind_speed_squared`` / ``is_calm_wind`` from ``wind_speed_10m``
      (calm means a speed strictly below 2).
    * ``temp_humidity_interaction`` from ``temperature_2m`` and
      ``relative_humidity_2m`` (product divided by 100).
    * ``is_precipitation`` from ``precipitation`` or, failing that, ``rain``
      (1 when the value is strictly above 0.1).

    The input frame is not modified.
    """
    enriched = df.copy()
    present = set(enriched.columns)

    # Wind: stronger wind disperses pollutants, calm air lets them build up.
    if 'wind_speed_10m' in present:
        wind = enriched['wind_speed_10m']
        enriched['wind_speed_squared'] = wind.pow(2)
        enriched['is_calm_wind'] = wind.lt(2).astype(int)

    # Temperature x humidity: both columns are required.
    if {'temperature_2m', 'relative_humidity_2m'} <= present:
        enriched['temp_humidity_interaction'] = (
            enriched['temperature_2m'] * enriched['relative_humidity_2m'] / 100
        )

    # Washout flag: 'precipitation' takes priority over 'rain'.
    wet_source = next(
        (name for name in ('precipitation', 'rain') if name in present),
        None,
    )
    if wet_source is not None:
        enriched['is_precipitation'] = enriched[wet_source].gt(0.1).astype(int)

    return enriched

def create_lag_features(df, targets, lags=(1, 2, 3, 6, 12, 24)):
    """Return a copy of ``df`` with lagged copies of each target column.

    For every name in ``targets`` that exists in ``df`` and every ``lag`` in
    ``lags``, a column ``f'{target}_lag_{lag}'`` holding the target shifted
    down by ``lag`` rows is appended. The first ``lag`` rows of that column
    are NaN.

    Args:
        df: Input frame. It is not modified.
        targets: Iterable of column names to lag; missing names are skipped.
        lags: Iterable of row offsets. The default is an immutable tuple so
            that no mutable object is shared between calls.

    Returns:
        A new DataFrame with the lag columns appended after the originals.

    Note:
        Lags are expressed in rows, not in time units. They equal hours only
        if the frame holds one row per hour with no gaps, sorted by time.
        Verify this against the input data.
    """
    df = df.copy()

    for target in targets:
        if target not in df.columns:
            continue
        # Look the source column up once instead of once per lag.
        series = df[target]
        for lag in lags:
            df[f'{target}_lag_{lag}'] = series.shift(lag)

    return df

def create_rolling_features(df, targets, windows=(3, 6, 12, 24)):
    """Return a copy of ``df`` with trailing rolling mean/std of each target.

    For every name in ``targets`` that exists in ``df`` and every ``window``
    in ``windows``, two columns are appended:
    ``f'{target}_rolling_mean_{window}'`` and
    ``f'{target}_rolling_std_{window}'``.

    The statistics are computed on the target shifted down by one row, so the
    value at row ``t`` only uses rows ``t-window`` .. ``t-1`` and never the
    current row's own target (this avoids leaking the label into its feature).

    Args:
        df: Input frame. It is not modified.
        targets: Iterable of column names; missing names are skipped.
        windows: Iterable of window lengths in rows. The default is an
            immutable tuple so that no mutable object is shared between calls.

    Returns:
        A new DataFrame with the rolling columns appended after the originals.
    """
    df = df.copy()

    for target in targets:
        if target not in df.columns:
            continue
        # The one-row shift does not depend on the window, so build it once.
        history = df[target].shift(1)
        for window in windows:
            rolled = history.rolling(window)
            df[f'{target}_rolling_mean_{window}'] = rolled.mean()
            df[f'{target}_rolling_std_{window}'] = rolled.std()

    return df

def create_pollutant_interactions(df, targets):
    """Return a copy of ``df`` with cross-pollutant features from 1-step lags.

    Adds, when the required ``*_lag_1`` columns are present:

    * ``NO2_PM10_interaction`` = ``valeur_NO2_lag_1 * valeur_PM10_lag_1``
    * ``O3_NO2_ratio`` = ``valeur_O3_lag_1 / (valeur_NO2_lag_1 + 1)``

    ``targets`` is accepted for signature symmetry with the other feature
    builders but is not read. The input frame is not modified.
    """
    out = df.copy()
    cols = out.columns
    has_no2 = 'valeur_NO2_lag_1' in cols

    # Both features need the NO2 lag; each additionally needs its partner.
    if has_no2 and 'valeur_PM10_lag_1' in cols:
        out['NO2_PM10_interaction'] = out['valeur_NO2_lag_1'].mul(out['valeur_PM10_lag_1'])

    if has_no2 and 'valeur_O3_lag_1' in cols:
        # +1 in the denominator keeps the ratio finite when NO2 is zero.
        denominator = out['valeur_NO2_lag_1'] + 1
        out['O3_NO2_ratio'] = out['valeur_O3_lag_1'].div(denominator)

    return out

print("\n" + "="*70)
print("FEATURE ENGINEERING")
print("="*70)

# Apply feature engineering to training data.
# Order matters: create_pollutant_interactions() reads the *_lag_1 columns,
# so it must run after create_lag_features().
train_df = engineer_weather_features(train_df)
train_df = create_lag_features(train_df, TARGETS, lags=[1, 2, 3, 6, 12, 24])
train_df = create_rolling_features(train_df, TARGETS, windows=[3, 6, 12, 24])
train_df = create_pollutant_interactions(train_df, TARGETS)

# The test frame only receives the weather-derived columns here; no lag,
# rolling or interaction features are built for it in this cell.
test_df = engineer_weather_features(test_df)

# These messages are printed unconditionally. They do not check that any
# column was actually created (each builder skips missing source columns).
print("✓ Weather features engineered")
print("✓ Lag features created")
print("✓ Rolling statistics computed")
print("✓ Pollutant interactions added")

# =============================================================================
# FEATURE SELECTION FOR AUTOREGRESSIVE MODEL
# =============================================================================
def get_autoregressive_features(df):
    """List the model input columns that are present in ``df``.

    Candidates are considered in this fixed order, and only those found in
    ``df.columns`` are returned:

    1. ``TEMPORAL_FEATURES``
    2. ``WEATHER_FORECAST`` (raw weather columns)
    3. ``WEATHER_DERIVED``
    4. Lags 1, 2 and 3 of every target in ``TARGETS``
    5. 3-row rolling mean and std of every target
    6. ``NO2_PM10_interaction`` and ``O3_NO2_ratio``

    Longer lags and wider rolling windows may exist in ``df`` but are
    deliberately left out of the autoregressive feature set.
    """
    available = df.columns

    candidates = [
        *TEMPORAL_FEATURES,
        *WEATHER_FORECAST,
        *WEATHER_DERIVED,
        # Short-term history, grouped by target, then by increasing lag.
        *(f'{target}_lag_{lag}' for target in TARGETS for lag in (1, 2, 3)),
        # 3-row trend statistics: mean first, then std, per target.
        *(
            f'{target}_rolling_{stat}_3'
            for target in TARGETS
            for stat in ('mean', 'std')
        ),
        'NO2_PM10_interaction',
        'O3_NO2_ratio',
    ]

    return [name for name in candidates if name in available]

feature_cols = get_autoregressive_features(train_df)

print(f"\n{'='*70}")
print("FEATURE SUMMARY")
print(f"{'='*70}")
print(f"Total features: {len(feature_cols)}")
# Categories are counted by exact column name. Substring matching would count
# 'is_precipitation' as a raw weather feature (it contains 'precipitation')
# and 'temp_humidity_interaction' as a pollutant interaction.
print(f"  - Temporal: {sum(f in TEMPORAL_FEATURES for f in feature_cols)}")
print(f"  - Weather: {sum(f in WEATHER_FORECAST for f in feature_cols)}")
print(f"  - Weather derived: {sum(f in WEATHER_DERIVED for f in feature_cols)}")
# Lag and rolling columns are generated names, so a marker substring is the
# right test for them.
print(f"  - Pollutant lags: {sum('lag_' in f for f in feature_cols)}")
print(f"  - Rolling stats: {sum('rolling_' in f for f in feature_cols)}")
print(f"  - Interactions: {sum(f in ('NO2_PM10_interaction', 'O3_NO2_ratio') for f in feature_cols)}")

# Clean data: keep only model inputs and targets, and drop every row that has
# a NaN in any of them (this includes the leading rows whose lags are undefined).
train_clean = train_df[feature_cols + TARGETS].dropna().reset_index(drop=True)
print(f"\n✓ Clean train samples: {len(train_clean)} (after removing NaN from lags)")

# =============================================================================
# TRAIN MODELS
# =============================================================================
def train_models(df, feature_cols, n_splits=3):
    """Cross-validate and fit one CatBoost regressor per pollutant target.

    For every target in the module-level ``TARGETS`` list, a model is fitted
    on each time-ordered split and scored on that split's validation window;
    then a final model is fitted on all rows of ``df``.

    Args:
        df: Frame containing ``feature_cols`` and every target column. Rows
            are assumed to be in chronological order, since splits are taken
            by position.
        feature_cols: Names of the input columns.
        n_splits: Number of validation windows to attempt. Fewer are used if
            a window would leave less than half of the rows for training.

    Returns:
        A tuple ``(models, cv_scores_all)``. ``models`` maps each target to
        its final fitted ``CatBoostRegressor``. ``cv_scores_all`` maps each
        target to a list of per-fold dicts with the keys ``fold``, ``rmse``,
        ``mae``, ``r2`` and ``best_iteration``.
    """
    print(f"\n{'='*70}")
    print("TRAINING ENHANCED AUTOREGRESSIVE MODELS")
    print(f"{'='*70}")

    splits = []
    n = len(df)
    test_size = int(n * TEST_SIZE_RATIO)
    # Splits whose training part is shorter than this are discarded.
    min_train_size = int(n * 0.5)

    # Expanding-window splits: the last validation window ends at row n and
    # each earlier window is moved back by half a window length. Training
    # always uses every row before the validation window.
    # NOTE(review): because the step is test_size // 2 while the window is
    # test_size long, consecutive validation windows overlap by about half.
    # Confirm this is intended.
    for i in range(n_splits):
        val_end = n - (n_splits - i - 1) * (test_size // 2)
        val_start = val_end - test_size
        train_end = val_start

        if train_end < min_train_size:
            continue

        train_idx = df.index[:train_end]
        val_idx = df.index[val_start:val_end]
        splits.append((train_idx, val_idx))

    models = {}
    cv_scores_all = {}

    for target in TARGETS:
        print(f"\n{'='*70}")
        print(f"Training: {target}")
        print(f"{'='*70}")

        cv_scores = []

        for fold, (train_idx, val_idx) in enumerate(splits, 1):
            # end=" " keeps the metrics (or the error) on the same output line.
            print(f"Fold {fold}/{len(splits)} - Train: {len(train_idx)}, Val: {len(val_idx)}", end=" ")

            X_train = df.loc[train_idx, feature_cols].fillna(0)
            y_train = df.loc[train_idx, target]
            X_val = df.loc[val_idx, feature_cols].fillna(0)
            y_val = df.loc[val_idx, target]

            try:
                train_pool = Pool(X_train, y_train)
                val_pool = Pool(X_val, y_val)

                # NOTE(review): the validation window is used both as the
                # eval_set (with use_best_model=True) and for the reported
                # metrics, so the fold scores may be optimistic.
                model = CatBoostRegressor(**CATBOOST_PARAMS)
                model.fit(train_pool, eval_set=val_pool, use_best_model=True, plot=False)

                # Predictions are clipped at zero before scoring.
                preds = np.maximum(model.predict(X_val), 0)
                rmse = np.sqrt(mean_squared_error(y_val, preds))
                mae = mean_absolute_error(y_val, preds)
                r2 = r2_score(y_val, preds)

                cv_scores.append({
                    'fold': fold,
                    'rmse': rmse,
                    'mae': mae,
                    'r2': r2,
                    'best_iteration': model.get_best_iteration()
                })

                print(f"→ RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.3f}")

            except Exception as e:
                # Best effort: a failing fold is reported (message truncated
                # to 50 characters) and skipped; it contributes no score.
                print(f"✗ Error: {str(e)[:50]}")
                continue

        # Train final model on every row. No eval_set is passed here, and the
        # best_iteration values found during cross-validation are not reused.
        print(f"Training final model...", end=" ")
        X_full = df[feature_cols].fillna(0)
        y_full = df[target]
        full_pool = Pool(X_full, y_full)

        final_model = CatBoostRegressor(**CATBOOST_PARAMS)
        final_model.fit(full_pool, plot=False)

        models[target] = final_model
        cv_scores_all[target] = cv_scores

        # The summary is skipped when every fold failed or no split was usable.
        if cv_scores:
            cv_df = pd.DataFrame(cv_scores)
            print(f"✓ CV: RMSE={cv_df['rmse'].mean():.2f}±{cv_df['rmse'].std():.2f}, R²={cv_df['r2'].mean():.3f}")

    return models, cv_scores_all

# Fit the per-target models on the cleaned training frame.
models, cv_scores_all = train_models(train_clean, feature_cols, n_splits=N_SPLITS)

# =============================================================================
# FEATURE IMPORTANCE
# =============================================================================
print(f"\n{'='*70}")
print("TOP 10 FEATURE IMPORTANCE PER TARGET")
print(f"{'='*70}")

for target in TARGETS:
    # train_models() stores a fitted model for every target, so this guard is
    # defensive only.
    if models[target] is not None:
        # Importances are paired with feature_cols by position; this presumes
        # CatBoost returns them in training-column order — TODO confirm.
        importance = models[target].get_feature_importance()
        importance_df = pd.DataFrame({
            'feature': feature_cols,
            'importance': importance
        }).sort_values('importance', ascending=False).head(10)

        print(f"\n{target}:")
        # Feature name left-aligned in 40 characters, importance to 1 decimal.
        for idx, row in importance_df.iterrows():
            print(f"  {row['feature']:40s} {row['importance']:6.1f}")

# =============================================================================
# AUTOREGRESSIVE FORECASTING
# =============================================================================
def _advance_forecast_state(state, step_preds):
    """Return a copy of the one-row ``state`` rolled forward by one step.

    ``step_preds`` maps each target in ``TARGETS`` to the value just predicted.
    Only columns already present in ``state`` are updated; none are added.
    """
    new_state = state.copy()
    columns = new_state.columns

    for target in TARGETS:
        latest = step_preds[target]
        lag_1, lag_2, lag_3 = (f'{target}_lag_{k}' for k in (1, 2, 3))

        # Shift oldest first so no value is overwritten before it is copied.
        # A lag is only shifted when the lag feeding it exists as well.
        for older, newer in ((lag_3, lag_2), (lag_2, lag_1)):
            if older in columns and newer in columns:
                new_state[older] = new_state[newer].values[0]
        if lag_1 in columns:
            new_state[lag_1] = latest

        # The three most recent values after the shift. Training computes the
        # 3-row rolling statistics over exactly these (lag 1..3).
        recent_vals = [
            new_state[col].values[0] if col in columns else latest
            for col in (lag_3, lag_2, lag_1)
        ]

        mean_col = f'{target}_rolling_mean_3'
        if mean_col in columns:
            new_state[mean_col] = np.mean(recent_vals)

        # Previously this column was never refreshed, so the model kept seeing
        # the seed row's std at every step. ddof=1 matches the sample standard
        # deviation that pandas' rolling().std() produces at training time.
        std_col = f'{target}_rolling_std_3'
        if std_col in columns:
            new_state[std_col] = np.std(recent_vals, ddof=1)

    # Cross-pollutant features are functions of the freshly updated lag-1 values.
    if 'NO2_PM10_interaction' in columns:
        new_state['NO2_PM10_interaction'] = (
            new_state['valeur_NO2_lag_1'].values[0] *
            new_state['valeur_PM10_lag_1'].values[0]
        )

    if 'O3_NO2_ratio' in columns:
        new_state['O3_NO2_ratio'] = (
            new_state['valeur_O3_lag_1'].values[0] /
            (new_state['valeur_NO2_lag_1'].values[0] + 1)
        )

    # Advance the clock by one hour and refresh its cyclical encoding.
    # NOTE(review): day-of-week, weekend/holiday, is_day and all weather
    # columns are carried over unchanged, including across midnight. Their
    # encodings are not visible here; confirm whether they should be advanced.
    if 'hour' in columns:
        new_state['hour'] = (new_state['hour'].values[0] + 1) % 24
        if 'hour_sin' in columns:
            new_state['hour_sin'] = np.sin(2 * np.pi * new_state['hour'] / 24)
        if 'hour_cos' in columns:
            new_state['hour_cos'] = np.cos(2 * np.pi * new_state['hour'] / 24)

    return new_state


def autoregressive_forecast_enhanced(df_init, models, feature_cols, horizon=24):
    """Roll the per-target models forward recursively from the last row.

    The last row of ``df_init`` seeds the state. At each step every target in
    ``TARGETS`` is predicted from the current state (clipped at zero), and the
    predictions are then fed back into the lag, rolling and interaction
    columns for the next step.

    Args:
        df_init: Frame whose last row provides the initial feature values.
            It must contain every column in ``feature_cols``.
        models: Mapping from target name to a fitted model exposing
            ``predict``. A missing or ``None`` entry yields a prediction of 0.
        feature_cols: Column names passed to the models, in training order.
        horizon: Number of recursive steps to produce.

    Returns:
        A DataFrame with one row per step: a 1-based ``step`` column plus one
        column per target.

    Raises:
        ValueError: If ``df_init`` has no rows.

    Note:
        NOTE(review): the seed row's features are used unchanged for step 1,
        and the models are trained to map a row's features to that same row's
        target. Step 1 therefore appears to estimate the seed row's own
        timestamp rather than the following hour. Confirm the intended
        alignment.
    """
    print(f"\n{'='*70}")
    print(f"ENHANCED AUTOREGRESSIVE FORECASTING - {horizon} steps")
    print(f"{'='*70}")

    if len(df_init) == 0:
        raise ValueError("df_init must contain at least one row to seed the forecast")

    current_state = df_init.iloc[-1:].copy()
    all_predictions = []

    for step in range(horizon):
        step_preds = {'step': step + 1}

        # Same NaN handling as at training time.
        X_current = current_state[feature_cols].fillna(0)

        # Predict every target from the same state before feeding anything back.
        for target in TARGETS:
            model = models.get(target)
            if model is not None:
                step_preds[target] = np.maximum(model.predict(X_current)[0], 0)
            else:
                step_preds[target] = 0

        all_predictions.append(step_preds)
        current_state = _advance_forecast_state(current_state, step_preds)

        # Progress line at the first step and then every sixth step.
        if (step + 1) % 6 == 0 or step == 0:
            print(f"Step {step + 1:2d}/{horizon} - NO2={step_preds['valeur_NO2']:5.1f}, "
                  f"CO={step_preds['valeur_CO']:4.2f}, O3={step_preds['valeur_O3']:5.1f}, "
                  f"PM10={step_preds['valeur_PM10']:5.1f}, PM25={step_preds['valeur_PM25']:5.1f}")

    return pd.DataFrame(all_predictions)

# Seed the recursive forecast from the last row of the cleaned training frame.
forecast_df = autoregressive_forecast_enhanced(train_clean, models, feature_cols, horizon=FORECAST_HORIZON)

# =============================================================================
# SAVE RESULTS
# =============================================================================
print(f"\n{'='*70}")
print("SAVING RESULTS")
print(f"{'='*70}")

# Both CSV files are written to the current working directory.
forecast_df.to_csv('enhanced_autoregressive_forecast.csv', index=False)
print(f"✓ Forecast: enhanced_autoregressive_forecast.csv")

# One summary row per target; targets with no successful fold are omitted.
cv_summary = []
for target, scores in cv_scores_all.items():
    if scores:
        cv_df = pd.DataFrame(scores)
        cv_summary.append({
            'target': target,
            'mean_rmse': cv_df['rmse'].mean(),
            'std_rmse': cv_df['rmse'].std(),
            'mean_mae': cv_df['mae'].mean(),
            'mean_r2': cv_df['r2'].mean(),
            'std_r2': cv_df['r2'].std()
        })

if cv_summary:
    cv_summary_df = pd.DataFrame(cv_summary)
    cv_summary_df.to_csv('cv_scores_enhanced_autoregressive.csv', index=False)
    print(f"✓ CV scores: cv_scores_enhanced_autoregressive.csv\n")
    print(cv_summary_df.to_string(index=False))

print(f"\n{'='*70}")
print("✅ ENHANCED AUTOREGRESSIVE PIPELINE COMPLETE!")
print(f"{'='*70}")
print(f"\n💡 Improvements over basic version:")
# Count raw weather features by exact name; a substring test would also count
# the derived 'is_precipitation' column because it contains 'precipitation'.
print(f"   ✓ Weather features: {sum(f in WEATHER_FORECAST for f in feature_cols)} (wind, temp, humidity, pressure)")
print(f"   ✓ Pollutant lags: 1-3 hours (short-term history)")
print(f"   ✓ Rolling statistics: Recent trends (3-hour windows)")
print(f"   ✓ Pollutant interactions: NO2-PM10, O3-NO2 relationships")
print(f"   ✓ Weather-derived: Wind intensity, calm conditions, precipitation effects")
print(f"   ✓ Total features: {len(feature_cols)} (vs 10 in basic version)")

IMPROVED AUTOREGRESSIVE PIPELINE - WITH WEATHER & LAGS

✓ Loaded train: (40991, 213)
✓ Loaded test: (504, 208)

FEATURE ENGINEERING
✓ Weather features engineered
✓ Lag features created
✓ Rolling statistics computed
✓ Pollutant interactions added

FEATURE SUMMARY
Total features: 37
  - Temporal: 10
  - Weather: 0
  - Weather derived: 0
  - Pollutant lags: 15
  - Rolling stats: 10
  - Interactions: 2

✓ Clean train samples: 40988 (after removing NaN from lags)

TRAINING ENHANCED AUTOREGRESSIVE MODELS

Training: valeur_NO2
Fold 1/3 - Train: 28692, Val: 6148 → RMSE: 5.06, MAE: 3.04, R²: 0.885
Fold 2/3 - Train: 31766, Val: 6148 → RMSE: 5.29, MAE: 3.28, R²: 0.886
Fold 3/3 - Train: 34840, Val: 6148 → RMSE: 4.31, MAE: 2.66, R²: 0.865
Training final model... ✓ CV: RMSE=4.89±0.51, R²=0.879

Training: valeur_CO
Fold 1/3 - Train: 28692, Val: 6148 → RMSE: 0.03, MAE: 0.02, R²: 0.870
Fold 2/3 - Train: 31766, Val: 6148 → RMSE: 0.05, MAE: 0.02, R²: 0.795
Fold 3/3 - Train: 34840, Val: 6148 → RMSE: 0.05,