In [4]:
# GRU + 1D-CNN temporal model built from your existing lag/roll features.
# Assumptions:
# - LAGS = [6,12,24] correspond to columns with suffixes '_lag_6','_lag_12','_lag_24'
# - roll_mean / roll_std features are kept and broadcasted to each timestep
# - Targets: same as your original TARGETS
# - Paths: same as your original script (../data/...)
# - This trains a model from scratch (replace hyperparams if needed)

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers, callbacks
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import random
import warnings
warnings.filterwarnings("ignore")

# Repro
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)

# ====== CONFIG ======
TARGETS = ['valeur_NO2', 'valeur_CO', 'valeur_O3', 'valeur_PM10', 'valeur_PM25']
TEMPORAL_FEATURES = ['hour', 'is_day', 'hour_sin', 'hour_cos', 'dow', 'dow_sin', 
                     'dow_cos', 'is_holiday', 'is_weekend', 'lockdown_code']
LAGS = [6, 12, 24]
TRAIN_PATH = "../data/train_features.csv"
TEST_PATH = "../data/test_features_to_predict.csv"

# ====== LOAD ======
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
print("Train", train_df.shape, "Test", test_df.shape)

# ====== Identify lag bases and roll/static features ======
cols = train_df.columns.tolist()

# discover lag columns and their base names like 'valeur_NO2' (from 'valeur_NO2_lag_6')
lag_cols = {lag: [c for c in cols if f"_lag_{lag}" in c] for lag in LAGS}
# create set of bases that have at least one lag
bases = set()
for lag in LAGS:
    for c in lag_cols[lag]:
        base = c.replace(f"_lag_{lag}", "")
        bases.add(base)
bases = sorted(bases)

# collect roll_* columns (roll_mean/_std) and other non-lag static features
roll_cols = [c for c in cols if ('roll_mean' in c) or ('roll_std' in c)]
# other features we will treat as static (e.g., the TEMPORAL_FEATURES are handled separately)
static_cols = [c for c in cols if (c not in sum(lag_cols.values(), []) and c not in roll_cols + TEMPORAL_FEATURES + TARGETS + ['id','datetime'])]

print(f"Found {len(bases)} lag bases (examples): {bases[:6]}")
print(f"Found {len(roll_cols)} roll cols (examples): {roll_cols[:6]}")
print(f"Found {len(static_cols)} other static cols (samples): {static_cols[:6]}")

# ====== Build feature ordering for flat representation (same as before) ======
# We'll build a flattened feature vector for each sample like earlier (so scaling is straightforward),
# then reshape into (samples, timesteps=3, channels) for the temporal model.
# Flatten order: for each base in bases -> [lag_6, lag_12, lag_24] (fill missing with most recent available)
flat_feature_names = []
# include lag sequence for each base
for base in bases:
    for lag in LAGS:
        colname = f"{base}_lag_{lag}"
        if colname in cols:
            flat_feature_names.append(colname)
        else:
            # fallback: try to use a related column (e.g. if not present, use 0 placeholder)
            flat_feature_names.append(None)  # mark missing, will handle later

# append roll features and other static features (kept as additional flat features)
flat_feature_names += roll_cols + static_cols + TEMPORAL_FEATURES
# remove duplicates and None will be handled when building arrays
# compute number of expected features
print("Planned flat feature count:", len(flat_feature_names))

# ====== Prepare training data (drop rows with NaNs as you did) ======
# Build DataFrame for required flat features (replace missing cols with NaN so dropna removes them)
flat_df = pd.DataFrame()
for name in flat_feature_names:
    if name is None:
        flat_df[name or "MISSING"] = np.nan  # will be dropped by dropna
    else:
        flat_df[name] = train_df.get(name, np.nan)

# Combine with temporal features
for tcol in TEMPORAL_FEATURES:
    if tcol in train_df.columns:
        flat_df[tcol] = train_df[tcol]
    else:
        flat_df[tcol] = np.nan

# Add targets
for tgt in TARGETS:
    flat_df[tgt] = train_df[tgt]

# drop rows with NaNs in any required flat feature or target (same approach)
clean = flat_df.dropna()
print("Samples after dropna:", clean.shape[0])

# Extract X_flat and Y
X_flat = clean[[c for c in flat_feature_names if c is not None] + TEMPORAL_FEATURES].values
Y = clean[TARGETS].values

# ====== Train/Val split like before ======
split_idx = int(0.85 * len(X_flat))
X_train_flat = X_flat[:split_idx]
X_val_flat = X_flat[split_idx:]
Y_train = Y[:split_idx]
Y_val = Y[split_idx:]

# ====== Scale features & targets (same logic as yours) ======
feature_scaler = StandardScaler()
X_train_scaled_flat = feature_scaler.fit_transform(X_train_flat)
X_val_scaled_flat = feature_scaler.transform(X_val_flat)

# target scalers per pollutant
target_scalers = {}
Y_train_scaled = np.zeros_like(Y_train, dtype=np.float32)
Y_val_scaled = np.zeros_like(Y_val, dtype=np.float32)
for i, tgt in enumerate(TARGETS):
    s = StandardScaler()
    Y_train_scaled[:, i] = s.fit_transform(Y_train[:, i:i+1]).ravel()
    Y_val_scaled[:, i] = s.transform(Y_val[:, i:i+1]).ravel()
    target_scalers[tgt] = s

# ====== Reshape flattened scaled features into sequence (timesteps=3) ======
# To do this we need to know how many lag channels exist (len(bases)*len(LAGS) possibly)
n_bases = len(bases)
timesteps = len(LAGS)  # 3
# number of lag columns present in flat_feature_names equals n_bases * timesteps (with possible missing)
lag_flat_count = n_bases * timesteps

# We'll assume the flat ordering started with lag sequence for each base (see above)
# Build function to convert scaled flat vector -> (timesteps, channels)
def flat_to_seq(X_scaled_flat):
    n_samples = X_scaled_flat.shape[0]
    # first part corresponds to lag sequence region
    lag_region = X_scaled_flat[:, :lag_flat_count]
    # reshape into (samples, n_bases, timesteps) then transpose to (samples, timesteps, n_bases)
    lag_region = lag_region.reshape(n_samples, n_bases, timesteps)
    seq = np.transpose(lag_region, (0, 2, 1))  # (samples, timesteps, channels_per_timestep=n_bases)
    # remaining features (rolls, statics, temporal) appended after lag region in flat vector
    remaining = X_scaled_flat[:, lag_flat_count:]
    # broadcast remaining features into each timestep (so each timestep has same additional features)
    rem_per_timestep = np.repeat(remaining[:, np.newaxis, :], timesteps, axis=1)  # shape (samples, timesteps, rem_dim)
    seq_full = np.concatenate([seq, rem_per_timestep], axis=2)  # (samples, timesteps, channels)
    return seq_full

X_train_seq = flat_to_seq(X_train_scaled_flat)
X_val_seq = flat_to_seq(X_val_scaled_flat)
print("Seq shapes:", X_train_seq.shape, X_val_seq.shape)  # (samples, timesteps, channels)

# ====== Build model: Conv1D -> BiGRU -> Dense (multi-output) ======
def build_temporal_model(input_shape, n_targets=len(TARGETS)):
    inp = layers.Input(shape=input_shape)  # (timesteps, channels)
    x = layers.Conv1D(64, kernel_size=2, padding='causal', activation='relu')(inp)
    x = layers.BatchNormalization()(x)
    x = layers.SpatialDropout1D(0.2)(x)
    x = layers.Bidirectional(layers.GRU(128, return_sequences=False, dropout=0.2))(x)
    x = layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(1e-4))(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(64, activation='relu')(x)
    out = layers.Dense(n_targets)(x)
    model = models.Model(inp, out)
    model.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss='mse', metrics=['mae'])
    return model

model = build_temporal_model(X_train_seq.shape[1:])
model.summary()

# ====== Train ======
es = callbacks.EarlyStopping(monitor='val_loss', patience=12, restore_best_weights=True, verbose=1)
rlr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, verbose=1)

history = model.fit(
    X_train_seq, Y_train_scaled,
    validation_data=(X_val_seq, Y_val_scaled),
    epochs=100,
    batch_size=128,
    callbacks=[es, rlr],
    verbose=1
)

# ====== Validation metrics in original scale ======
Y_val_pred_scaled = model.predict(X_val_seq, verbose=0)
mae_per_pollutant = []
for i, tgt in enumerate(TARGETS):
    y_pred_unscaled = target_scalers[tgt].inverse_transform(Y_val_pred_scaled[:, i:i+1]).ravel()
    mae = mean_absolute_error(Y_val[:, i], y_pred_unscaled)
    mae_per_pollutant.append(mae)
    print(f"{tgt}: MAE = {mae:.4f}")
print("Avg MAE:", np.mean(mae_per_pollutant))




Train (40991, 213) Test (504, 208)
Found 28 lag bases (examples): ['apparent_temperature', 'cloud_cover', 'cloud_cover_high', 'cloud_cover_low', 'cloud_cover_mid', 'dew_point_2m']
Found 110 roll cols (examples): ['valeur_NO2_roll_mean_6', 'valeur_NO2_roll_std_6', 'valeur_NO2_roll_mean_24', 'valeur_NO2_roll_std_24', 'valeur_CO_roll_mean_6', 'valeur_CO_roll_std_6']
Found 2 other static cols (samples): ['NO2_lag1_for_O3', 'PM10_lag1_for_PM25']
Planned flat feature count: 206
Samples after dropna: 40991
Seq shapes: (34842, 3, 160) (6149, 3, 160)


Epoch 1/100
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 23ms/step - loss: 0.3580 - mae: 0.4045 - val_loss: 0.2026 - val_mae: 0.2753 - learning_rate: 0.0010
Epoch 2/100
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - loss: 0.2481 - mae: 0.3284 - val_loss: 0.1821 - val_mae: 0.2534 - learning_rate: 0.0010
Epoch 3/100
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - loss: 0.2206 - mae: 0.3059 - val_loss: 0.1710 - val_mae: 0.2397 - learning_rate: 0.0010
Epoch 4/100
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - loss: 0.2045 - mae: 0.2943 - val_loss: 0.1635 - val_mae: 0.2310 - learning_rate: 0.0010
Epoch 5/100
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - loss: 0.1965 - mae: 0.2869 - val_loss: 0.1553 - val_mae: 0.2251 - learning_rate: 0.0010
Epoch 6/100
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - loss: 0.1858 - mae:

In [6]:
# ============================================================================ 
# PREPARE TEST DATA - ROLLING AVERAGE APPROACH 
# ============================================================================ 
print("\nPreparing test data with rolling lag features...")

# Get last 24 rows of train to use for averaging (more realistic than single row)
last_24_rows = train_df.iloc[-24:].copy()

# Get the exact feature columns used during training
feature_cols = [c for c in flat_feature_names if c is not None] + TEMPORAL_FEATURES
n_features = len(feature_cols)

print(f"feature_cols length: {n_features}")
print(f"Using last 24 rows of train for feature averaging")

# Build test features: average features from last 24 train rows, update temporal from test_df
test_features_list = []

for idx in range(len(test_df)):
    test_row = test_df.iloc[idx].copy()
    
    # Average each feature across last 24 train rows
    base_features = []
    for col in feature_cols:
        if col in last_24_rows.columns:
            # For temporal features, we'll override later; for others, take mean
            if col not in TEMPORAL_FEATURES:
                feat_val = float(last_24_rows[col].mean())
            else:
                feat_val = float(last_24_rows[col].iloc[-1])  # Use last value for temporal
            base_features.append(feat_val)
        else:
            base_features.append(0.0)
    
    base_features = np.array(base_features)
    
    # Update temporal features from test_df (these change for each test row)
    for i, col in enumerate(feature_cols):
        if col in TEMPORAL_FEATURES and col in test_row.index:
            base_features[i] = float(test_row[col])
    
    test_features_list.append(base_features)

X_test = np.array(test_features_list)
print(f"Test features shape: {X_test.shape}")
print(f"Expected features: {n_features}")

# Scale test features using the scaler fitted on training data
X_test_scaled = feature_scaler.transform(X_test)

# Reshape into sequences for GRU+CNN model
X_test_seq = flat_to_seq(X_test_scaled)
print(f"X_test_seq shape (for model): {X_test_seq.shape}")

# ============================================================================ 
# PREDICT
# ============================================================================ 
print("\nPredicting on test set...")
Y_test_pred_scaled = model.predict(X_test_seq, verbose=0)

# Inverse scale predictions per pollutant
Y_test_pred = np.zeros_like(Y_test_pred_scaled)
for i, target in enumerate(TARGETS):
    Y_test_pred[:, i] = target_scalers[target].inverse_transform(Y_test_pred_scaled[:, i:i+1]).ravel()

# ============================================================================ 
# CREATE SUBMISSION
# ============================================================================ 
submission = pd.DataFrame()
submission['id'] = test_df['id'].values
for i, target in enumerate(TARGETS):
    submission[target] = Y_test_pred[:, i]

submission.to_csv('submission.csv', index=False)
print("✅ Submission saved to submission.csv")
print(f"\nSubmission preview:\n{submission.head()}")
print(f"\nSubmission shape: {submission.shape}")


Preparing test data with rolling lag features...
feature_cols length: 216
Using last 24 rows of train for feature averaging
Test features shape: (504, 216)
Expected features: 216
X_test_seq shape (for model): (504, 3, 160)

Predicting on test set...
✅ Submission saved to submission.csv

Submission preview:
              id  valeur_NO2  valeur_CO  valeur_O3  valeur_PM10  valeur_PM25
0  2024-09-03 23   18.036446   0.175036  42.576515    10.783887     6.141726
1  2024-09-04 00   19.455297   0.177276  40.633728    12.067898     6.740830
2  2024-09-04 01   19.896864   0.175878  39.447834    11.909089     6.649740
3  2024-09-04 02   20.738203   0.175373  38.268314    11.807177     6.507667
4  2024-09-04 03   21.816051   0.176506  37.283451    11.743945     6.340149

Submission shape: (504, 6)


In [5]:
# ============================================================================ 
# PREPARE TEST DATA - ROLLING AVERAGE APPROACH 
# ============================================================================ 
print("\nPreparing test data with rolling lag features...")

# Get last 24 rows of train to use for averaging (more realistic than single row)
last_24_rows = train_df.iloc[-24:].copy()

# Only take temporal features from test_df that are in our original TEMPORAL_FEATURES list
temporal_cols_in_test = [c for c in TEMPORAL_FEATURES if c in test_df.columns]

# Remaining features are all used in training except temporal
remaining_cols = [c for c in used_flat_names if c not in TEMPORAL_FEATURES]

# Combine in order for building test vectors
feature_cols = remaining_cols + temporal_cols_in_test
n_features = len(feature_cols)
print(f"Number of features for test: {n_features}")

# Build test features
test_features_list = []
for idx in range(len(test_df)):
    test_row = test_df.iloc[idx].copy()
    
    base_features = []
    for col in feature_cols:
        if col in temporal_cols_in_test:
            # placeholder from last train row; will override with test value
            base_features.append(float(last_24_rows[col].iloc[-1]) if col in last_24_rows else 0.0)
        else:
            # rolling mean from last 24 train rows
            base_features.append(float(last_24_rows[col].mean()) if col in last_24_rows else 0.0)
    
    # Override temporal features with actual test row values
    for i, col in enumerate(feature_cols):
        if col in temporal_cols_in_test:
            base_features[i] = float(test_row[col])
    
    test_features_list.append(base_features)

X_test = np.array(test_features_list)
print(f"Test features shape: {X_test.shape}")

# Scale features using the scaler fitted on training data
X_test_scaled = feature_scaler.transform(X_test)

# Reshape into sequences for GRU+CNN model
X_test_seq = flat_to_seq(X_test_scaled)
print(f"X_test_seq shape (for model): {X_test_seq.shape}")

# ============================================================================ 
# PREDICT
# ============================================================================ 
print("\nPredicting on test set...")
Y_test_pred_scaled = model.predict(X_test_seq, verbose=0)

# Inverse scale predictions per pollutant
Y_test_pred = np.zeros_like(Y_test_pred_scaled)
for i, target in enumerate(TARGETS):
    Y_test_pred[:, i] = target_scalers[target].inverse_transform(Y_test_pred_scaled[:, i:i+1]).ravel()

# ============================================================================ 
# CREATE SUBMISSION
# ============================================================================ 
submission = pd.DataFrame()
submission['id'] = test_df['id'].values
for i, target in enumerate(TARGETS):
    submission[target] = Y_test_pred[:, i]

submission.to_csv('submission.csv', index=False)
print("✅ Submission saved to submission.csv")
print(f"\nSubmission preview:\n{submission.head()}")
print(f"\nSubmission shape: {submission.shape}")



Preparing test data with rolling lag features...
Number of features for test: 206
Test features shape: (504, 206)


ValueError: X has 206 features, but StandardScaler is expecting 216 features as input.

In [None]:
# ====== Prepare test features exactly like your previous approach, then scale & reshape ======
# Recreate your test-building logic (average last 24 rows of train_df for non-temporal features)
last_24 = train_df.iloc[-24:].copy()
feature_cols = [c for c in cols if ('_lag_' in c) or ('roll_' in c)] + TEMPORAL_FEATURES

test_feats_list = []
for idx in range(len(test_df)):
    tr = test_df.iloc[idx]
    base_features = []
    for name in flat_feature_names:
        # skip None markers
        if name is None:
            base_features.append(np.nan)
            continue
        if name in last_24.columns:
            if name in TEMPORAL_FEATURES:
                base_features.append(float(last_24[name].iloc[-1]))
            else:
                base_features.append(float(last_24[name].mean()))
        else:
            base_features.append(0.0)
    # override temporal vars from test row
    for i, name in enumerate(flat_feature_names):
        if name in TEMPORAL_FEATURES and name in tr.index:
            base_features[i] = float(tr[name])
    test_feats_list.append(base_features)

X_test_flat = np.array(test_feats_list)  # shape (504, n_flat)
print("Raw X_test_flat shape:", X_test_flat.shape)

# Align columns used for scaling: we trained scalers on the subset (excluded None entries)
# Build X_test array matching columns used in scaling: keep only non-None flat_feature entries + temporal (we used this earlier)
used_flat_names = [c for c in flat_feature_names if c is not None] + TEMPORAL_FEATURES
X_test_used = X_test_flat[:, :len(used_flat_names)]

# scale then reshape
X_test_scaled_flat = feature_scaler.transform(X_test_used)
X_test_seq = flat_to_seq(X_test_scaled_flat)
print("X_test_seq shape:", X_test_seq.shape)

# ====== Predict on test and inverse-scale predictions ======
Y_test_pred_scaled = model.predict(X_test_seq, verbose=0)
Y_test_pred = np.zeros_like(Y_test_pred_scaled)
for i, tgt in enumerate(TARGETS):
    Y_test_pred[:, i] = target_scalers[tgt].inverse_transform(Y_test_pred_scaled[:, i:i+1]).ravel()

submission = pd.DataFrame({'id': test_df['id']})
for i, tgt in enumerate(TARGETS):
    submission[tgt] = Y_test_pred[:, i]
submission.to_csv("gru_cnn_submission.csv", index=False)
print("Saved gru_cnn_submission.csv — shape:", submission.shape)
print(submission.head())