In [3]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# SETUP
# ============================================================================
# Fixed seed so NumPy's global random state is the same on every run; the
# same value is later handed to LightGBM through its 'seed' parameter.
SEED = 42
np.random.seed(SEED)

# Pollutant columns to predict (one model is trained per entry).
TARGETS = [
    'valeur_NO2',
    'valeur_CO',
    'valeur_O3',
    'valeur_PM10',
    'valeur_PM25',
]
# Calendar / clock columns used as features alongside the lag and rolling ones.
TEMPORAL_FEATURES = [
    'hour', 'is_day', 'hour_sin', 'hour_cos',
    'dow', 'dow_sin', 'dow_cos',
    'is_holiday', 'is_weekend', 'lockdown_code',
]

print("Loading data with memory optimization...")
# Each CSV is read into memory with a single read_csv call (no chunked
# reading is performed here).
train_df = pd.read_csv("../data/train_features.csv")
test_df = pd.read_csv("../data/test_features_to_predict.csv")

# Downcast the training frame's float64 columns to float32 to reduce memory.
# Only train_df is downcast; test_df keeps the dtypes inferred by read_csv.
float_cols = train_df.select_dtypes(include=['float64']).columns
train_df = train_df.astype({name: 'float32' for name in float_cols})

for label, frame in (("Train", train_df), ("Test", test_df)):
    print(f"{label} shape: {frame.shape}")

# ============================================================================
# PREPARE FEATURES
# ============================================================================
print("\nPreparing features...")

# Model inputs: every training column whose name contains 'lag_' or 'roll_',
# followed by the fixed list of temporal features.
all_cols = train_df.columns.tolist()
lag_roll_cols = list(filter(lambda name: 'lag_' in name or 'roll_' in name, all_cols))
feature_cols = [*lag_roll_cols, *TEMPORAL_FEATURES]

print(f"Total features: {len(feature_cols)}")

# Keep only the rows where every feature and every target is present.
train_clean = train_df.loc[:, feature_cols + TARGETS].dropna()
print(f"Train samples: {len(train_clean)}")

# Ordered hold-out (no shuffling): the first 85% of the remaining rows are
# used for fitting, the final 15% for validation.
split_idx = int(0.85 * len(train_clean))
fit_rows = train_clean.iloc[:split_idx]
holdout_rows = train_clean.iloc[split_idx:]

X_train, Y_train = fit_rows[feature_cols].values, fit_rows[TARGETS].values
X_val, Y_val = holdout_rows[feature_cols].values, holdout_rows[TARGETS].values

for label, matrix in (("X_train", X_train), ("X_val", X_val)):
    print(f"{label} shape: {matrix.shape}")

# ============================================================================
# TRAIN ONE MODEL PER POLLUTANT (FAST VERSION)
# ============================================================================
print("\n" + "="*70)
print("TRAINING LIGHTGBM MODELS (FAST)")
print("="*70)

# Fast parameters - optimized for speed.
# They are identical for every target, so the dict is built once instead of
# being re-created on each loop iteration.
params = {
    'objective': 'regression',
    'metric': 'mae',
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,  # Higher for speed
    'num_leaves': 31,
    'max_depth': -1,
    'min_data_in_leaf': 20,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'seed': SEED,
    'force_col_wise': True  # Faster
}

# target name -> trained Booster
models = {}

# Explicit float64 buffer for the validation predictions. np.zeros_like(Y_val)
# would inherit the targets' dtype, so integer-typed targets would silently
# truncate every prediction written into it and distort the reported MAE.
val_predictions = np.zeros(Y_val.shape, dtype=np.float64)

for i, target in enumerate(TARGETS):
    print(f"\n{'='*40}")
    print(f"Training {target}...")
    print(f"{'='*40}")

    # Column i of the target matrix is the label for this pollutant; the
    # validation Dataset references the training one.
    lgb_train = lgb.Dataset(X_train, Y_train[:, i], free_raw_data=False)
    lgb_val = lgb.Dataset(X_val, Y_val[:, i], reference=lgb_train, free_raw_data=False)

    # Train quickly: at most 300 rounds, stopping early after 30 rounds
    # without improvement of the validation MAE.
    # NOTE(review): a best_iteration close to 300 means the round cap, not
    # early stopping, ended training - consider raising num_boost_round then.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=300,  # Reduced from 1000
        valid_sets=[lgb_val],
        valid_names=['val'],
        callbacks=[lgb.early_stopping(30, verbose=False)]
    )

    models[target] = model

    # Validate using the best iteration found during training.
    val_predictions[:, i] = model.predict(X_val, num_iteration=model.best_iteration)
    mae = mean_absolute_error(Y_val[:, i], val_predictions[:, i])

    print(f"‚úÖ Validation MAE: {mae:.4f}")
    print(f"   Best iteration: {model.best_iteration}")

# Overall performance
print("\n" + "="*70)
print("VALIDATION RESULTS")
print("="*70)

# Score every target column once; the same numbers feed both the per-target
# table and the overall average.
target_maes = [
    mean_absolute_error(Y_val[:, col], val_predictions[:, col])
    for col in range(len(TARGETS))
]
for target, score in zip(TARGETS, target_maes):
    print(f"{target}: {score:.4f}")

# Unweighted mean of the per-target MAEs.
avg_mae = np.mean(target_maes)
print(f"\nüéØ Average MAE: {avg_mae:.4f}")
print("üéØ Your Tuned Model: 2.9069")

# Compare against the hard-coded score of the previously tuned model.
if not avg_mae < 2.9069:
    print(f"üìä Tuned model still better by {avg_mae - 2.9069:.4f}")
else:
    print(f"üéâ LightGBM is BETTER by {2.9069 - avg_mae:.4f}!")

# ============================================================================
# PREPARE TEST DATA - USE LAST ROW
# ============================================================================
print("\n" + "="*70)
print("PREPARING TEST DATA")
print("="*70)

# Final row of the training frame: supplies the value of every non-temporal
# feature for all test rows.
last_row = train_df.iloc[-1:].copy()

# NOTE(review): every lag/rolling feature is therefore constant across the
# whole test set, while the validation split is scored with real per-row lag
# values - validation MAE may overstate test-time accuracy; confirm intended.

n_test_rows = len(test_df)
temporal_names = set(TEMPORAL_FEATURES)
test_columns = set(test_df.columns)
train_columns = set(last_row.columns)

# Build the matrix one column at a time instead of cell by cell: whole-column
# operations replace the per-row .iloc copy and per-cell float() conversion.
column_arrays = []
for col in feature_cols:
    if col in temporal_names and col in test_columns:
        # Temporal features come from the test frame itself.
        column = test_df[col].to_numpy(dtype='float64')
    elif col in train_columns:
        # Everything else repeats the value found in the last training row.
        column = np.full(n_test_rows, float(last_row[col].values[0]), dtype='float64')
    else:
        # Feature available in neither frame: fall back to zero.
        column = np.zeros(n_test_rows, dtype='float64')
    column_arrays.append(column)

# Values pass through float64 before the final float32 cast, matching the
# precision path of a per-cell float() conversion.
if column_arrays:
    X_test = np.column_stack(column_arrays).astype('float32')
else:
    # No features at all: keep a well-formed 2-D array.
    X_test = np.empty((n_test_rows, 0), dtype='float32')
print(f"Test features shape: {X_test.shape}")

# ============================================================================
# MAKE PREDICTIONS
# ============================================================================
print("\n" + "="*70)
print("MAKING PREDICTIONS")
print("="*70)

# One float32 column per target, in TARGETS order.
predictions = np.zeros(shape=(len(test_df), len(TARGETS)), dtype='float32')

for i, target in enumerate(TARGETS):
    booster = models[target]
    # Predict with the iteration count selected during training.
    predictions[:, i] = booster.predict(X_test, num_iteration=booster.best_iteration)

    # Summarise the stored (float32) column as a quick sanity check.
    stored = predictions[:, i]
    print(f"‚úÖ {target}: mean={stored.mean():.2f}, range=[{stored.min():.2f}, {stored.max():.2f}]")

# ============================================================================
# CREATE SUBMISSIONS
# ============================================================================
print("\n" + "="*70)
print("CREATING SUBMISSIONS")
print("="*70)

# Validation MAE of the previously tuned model; decides which model gets the
# larger weight in the "smart" blend.
TUNED_MODEL_MAE = 2.9069

# (file name, description) of every file actually written, so the summary
# below never advertises a file that was not produced.
created_files = []

# Submission 1: Pure LightGBM
submission_lgbm = pd.DataFrame()
submission_lgbm['id'] = test_df['id'].values
for i, target in enumerate(TARGETS):
    submission_lgbm[target] = predictions[:, i]

submission_lgbm.to_csv('lgbm_fast.csv', index=False)
print("‚úÖ Saved: lgbm_fast.csv")
created_files.append(('lgbm_fast.csv', 'Pure LightGBM'))

# Submission 2: Ensemble with tuned model.
# Best effort: if the tuned submission is missing or unusable, a warning is
# printed and only the pure LightGBM file is kept.
try:
    tuned_sub = pd.read_csv('ffnn_tuned_submission.csv')

    missing_targets = [name for name in TARGETS if name not in tuned_sub.columns]
    if missing_targets:
        raise ValueError(f"tuned submission lacks columns {missing_targets}")

    if 'id' in tuned_sub.columns:
        # Pair rows by id rather than by position, so a differently ordered
        # tuned file cannot blend predictions of unrelated rows.
        if tuned_sub['id'].duplicated().any():
            raise ValueError("tuned submission contains duplicate ids")
        tuned_aligned = tuned_sub.set_index('id').reindex(submission_lgbm['id'].values)
        if tuned_aligned[TARGETS].isna().to_numpy().any():
            raise ValueError("tuned submission has missing ids or NaN predictions")
    else:
        # No id column: positional pairing is only safe with equal lengths.
        if len(tuned_sub) != len(submission_lgbm):
            raise ValueError(
                f"tuned submission has {len(tuned_sub)} rows, "
                f"expected {len(submission_lgbm)}"
            )
        tuned_aligned = tuned_sub

    # 50/50 ensemble
    ensemble = submission_lgbm.copy()
    for target in TARGETS:
        ensemble[target] = (
            0.5 * submission_lgbm[target].to_numpy()
            + 0.5 * tuned_aligned[target].to_numpy(dtype='float64')
        )

    ensemble.to_csv('ensemble_lgbm_nn_5050.csv', index=False)
    print("‚úÖ Saved: ensemble_lgbm_nn_5050.csv")
    created_files.append(('ensemble_lgbm_nn_5050.csv', '50/50 blend'))

    # Smart weight based on validation: the model with the lower validation
    # MAE receives the 0.6 share.
    if avg_mae < TUNED_MODEL_MAE:
        weight_lgbm, weight_nn = 0.6, 0.4
    else:
        weight_lgbm, weight_nn = 0.4, 0.6
    print(f"\nUsing weights: LightGBM={weight_lgbm}, NN={weight_nn}")

    ensemble_smart = submission_lgbm.copy()
    for target in TARGETS:
        ensemble_smart[target] = (
            weight_lgbm * submission_lgbm[target].to_numpy()
            + weight_nn * tuned_aligned[target].to_numpy(dtype='float64')
        )

    ensemble_smart.to_csv('ensemble_lgbm_nn_smart.csv', index=False)
    print("‚úÖ Saved: ensemble_lgbm_nn_smart.csv")
    created_files.append(('ensemble_lgbm_nn_smart.csv', 'Smart weighted blend'))

except (OSError, KeyError, ValueError) as e:
    # OSError covers a missing/unreadable file; ValueError covers pandas
    # parsing errors and the validation failures raised above.
    print(f"‚ö†Ô∏è Could not create ensemble: {e}")

print("\n" + "="*70)
print("SUMMARY")
print("="*70)
print("\nüìÅ Submissions created:")
for rank, (file_name, description) in enumerate(created_files, start=1):
    print(f"  {rank}. {file_name} - {description}")
print(f"\nüöÄ SUBMIT ALL {len(created_files)} TO KAGGLE!")
print("\n‚è±Ô∏è Estimated time: 15-20 minutes")

Loading data with memory optimization...
Train shape: (40991, 213)
Test shape: (504, 208)

Preparing features...
Total features: 204
Train samples: 40991
X_train shape: (34842, 204)
X_val shape: (6149, 204)

TRAINING LIGHTGBM MODELS (FAST)

Training valeur_NO2...
‚úÖ Validation MAE: 3.7313
   Best iteration: 115

Training valeur_CO...
‚úÖ Validation MAE: 0.0222
   Best iteration: 129

Training valeur_O3...
‚úÖ Validation MAE: 6.2799
   Best iteration: 299

Training valeur_PM10...
‚úÖ Validation MAE: 2.5006
   Best iteration: 170

Training valeur_PM25...
‚úÖ Validation MAE: 1.6679
   Best iteration: 187

VALIDATION RESULTS
valeur_NO2: 3.7313
valeur_CO: 0.0222
valeur_O3: 6.2799
valeur_PM10: 2.5006
valeur_PM25: 1.6679

üéØ Average MAE: 2.8404
üéØ Your Tuned Model: 2.9069
üéâ LightGBM is BETTER by 0.0665!

PREPARING TEST DATA
Test features shape: (504, 204)

MAKING PREDICTIONS
‚úÖ valeur_NO2: mean=24.09, range=[16.81, 33.55]
‚úÖ valeur_CO: mean=0.22, range=[0.20, 0.25]
‚úÖ valeur_O3: me