In [44]:
import pandas as pd
import numpy as np



In [45]:
# -----------------------------
# INPUT FILES (edit these if the CSVs live somewhere else)
# -----------------------------
TRAIN_PATH = "traindata_with_lockdown.csv"
TEST_PATH = "testdata_with_lockdown.csv"

# -----------------------------
# LOAD
# -----------------------------
# Read the train file first, then the test file, each into its own frame.
df_tr, df_te = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

In [46]:


# =========================
# CONFIG
# =========================
# Name of the timestamp column.
TIME = "datetime"
# Candidate grouping column. build_dataset only groups by it when the column
# exists and its values repeat; otherwise a single synthetic group is used.
RAW_ID = "id"

# Pollutant columns to predict.
TARGETS = [f"valeur_{pollutant}" for pollutant in ("NO2", "CO", "O3", "PM10", "PM25")]

# Columns treated as weather covariates (lagged / rolled, and masked over the
# future test horizon). Only the ones actually present in the data are used.
WEATHER = [
    "temperature_2m",
    "relative_humidity_2m",
    "dew_point_2m",
    "apparent_temperature",
    "pressure_msl",
    "wind_speed_10m",
    "wind_direction_10m",
    "precipitation",
    "cloud_cover",
    "cloud_cover_low",
    "cloud_cover_mid",
    "cloud_cover_high",
    "shortwave_radiation",
    "direct_radiation",
    "diffuse_radiation",
    "global_tilted_irradiance",
    "wind_speed_80m",
    "wind_speed_120m",
    "wind_speed_180m",
    "wind_direction_80m",
    "wind_direction_120m",
    "wind_direction_180m",
    "wind_gusts_10m",
    "vapour_pressure_deficit",
    "cape",
    "evapotranspiration",
    "et0_fao_evapotranspiration",
    "snowfall",
    "rain",
    "showers",
    "weather_code",
    "visibility",
    "lockdown_code",  # NOTE(review): also listed in STATIC_KNOWN below
]

# Columns passed through unchanged as base features when present.
STATIC_KNOWN = ["is_holiday", "is_weekend", "lockdown_code"]

# Shift distances used for lag features.
LAGS = [1, 6, 12, 24]
# Rolling-window lengths, counted in rows (hours when the data is hourly).
ROLLS = [6, 24]

# =========================
# MAIN
# =========================
def _lag_roll_features(df, group, col, with_std=True):
    """Build past-only lag and rolling features for a single column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame already sorted by time; row order inside each group defines "past".
    group : str
        Name of the grouping column in ``df``.
    col : str
        Column to derive features from.
    with_std : bool
        When False, the rolling standard deviation is skipped.

    Returns
    -------
    dict
        ``{feature_name: Series}`` indexed like ``df``; insertion order is
        all lags first, then mean/std per rolling window.
    """
    out = {}
    by_group = df.groupby(group)[col]
    for lag in LAGS:
        out[f"{col}_lag_{lag}"] = by_group.shift(lag)

    # Shift by one row first so every rolling window ends at the previous row
    # and never includes the current value.
    past = by_group.shift(1)
    for win in ROLLS:
        rolled_mean = past.groupby(df[group]).rolling(win, min_periods=1).mean()
        out[f"{col}_roll_mean_{win}"] = rolled_mean.reset_index(level=0, drop=True)
        if with_std:
            rolled_std = past.groupby(df[group]).rolling(win, min_periods=2).std()
            out[f"{col}_roll_std_{win}"] = rolled_std.reset_index(level=0, drop=True)
    return out


def build_dataset(df_tr: pd.DataFrame, df_te: pd.DataFrame):
    """Turn the raw train/test frames into model-ready feature matrices.

    Uses the module-level config: TIME, RAW_ID, TARGETS, WEATHER, STATIC_KNOWN,
    LAGS and ROLLS. The input frames are not modified.

    Steps: stack train and test, sort by time, pick a grouping key, flag the
    test rows that must be predicted, blank out weather on those rows, build
    lag/rolling features, then impute (forward fill per group, train medians,
    and 0.0 for any rolling-std value still missing).

    Parameters
    ----------
    df_tr : pd.DataFrame
        Training rows; must contain the TIME column.
    df_te : pd.DataFrame
        Test rows; must contain the TIME column. Rows from the first timestamp
        (per group) where any target is missing onward are the prediction horizon.

    Returns
    -------
    X_train : pd.DataFrame
        Features for the train rows.
    Y_train : pd.DataFrame
        TARGETS columns for the train rows (not imputed).
    X_test_pred : pd.DataFrame
        Features for the test rows inside the prediction horizon.
    meta_test : pd.DataFrame
        Two columns, ``"id"`` and TIME, aligned with ``X_test_pred``. ``"id"``
        holds the RAW_ID values when that column exists, otherwise the
        synthetic group label.
    feat_cols : list[str]
        Feature column names, in the order used by the X frames.
    """
    # --- Tidy/union (assign returns new frames, so callers' data is untouched) ---
    df_tr = df_tr.assign(**{TIME: pd.to_datetime(df_tr[TIME]), "__split__": "train"})
    df_te = df_te.assign(**{TIME: pd.to_datetime(df_te[TIME]), "__split__": "test"})
    df = pd.concat([df_tr, df_te], ignore_index=True)
    # Stable sort: rows sharing a timestamp keep their concat order (train
    # before test), so shift/rolling results are reproducible.
    df = df.sort_values(TIME, kind="mergesort").reset_index(drop=True)

    # --- Choose grouping key ---
    # Use RAW_ID only if it exists and repeats; an id that is unique on every
    # row would make each group a single row and all lags NaN.
    use_id = RAW_ID if RAW_ID in df.columns else None
    if use_id is not None and df[RAW_ID].nunique() == len(df):
        use_id = None
    if use_id is None:
        df["__grp__"] = "all"
        GROUP = "__grp__"
    else:
        GROUP = RAW_ID
        # Normalise to stripped strings so equal ids group together.
        df[GROUP] = df[GROUP].astype(str).str.strip()

    # --- Which weather cols exist ---
    weather_cols = [c for c in WEATHER if c in df.columns]

    # --- Mark future test horizon (first test row per group where ANY target is NaN) ---
    te = df[df["__split__"] == "test"]
    if len(te):
        first_future = (
            te[te[TARGETS].isna().any(axis=1)]
            .groupby(GROUP, as_index=True)[TIME]
            .min()
        )
        df = df.join(first_future.rename("__future_start__"), on=GROUP)
        df["__is_future_test__"] = (
            (df["__split__"] == "test")
            & df["__future_start__"].notna()
            & (df[TIME] >= df["__future_start__"])
        )
        df = df.drop(columns="__future_start__")
    else:
        df["__is_future_test__"] = False

    # --- Time features ---
    df["hour"] = df[TIME].dt.hour
    df["is_day"] = ((df["hour"] >= 6) & (df["hour"] < 18)).astype(int)
    df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
    df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)
    df["dow"] = df[TIME].dt.dayofweek
    df["dow_sin"] = np.sin(2 * np.pi * df["dow"] / 7)
    df["dow_cos"] = np.cos(2 * np.pi * df["dow"] / 7)

    # --- Mask future test weather (rule) ---
    # Columns that are also declared in STATIC_KNOWN are known in advance and
    # are passed through as base features, so they must keep their real future
    # values instead of being blanked and forward-filled from the past.
    known_ahead = set(STATIC_KNOWN)
    mask_cols = [c for c in weather_cols if c not in known_ahead]
    future_rows = df["__is_future_test__"]
    for col in mask_cols:
        # Series.where upcasts integer columns to float when NaN is introduced,
        # avoiding the deprecated in-place upcast of `df.loc[...] = np.nan`.
        df[col] = df[col].where(~future_rows)

    # --- Build features into a dict (avoid fragmentation) ---
    feats = {}

    # POLLUTANT lags/rollings (past only)
    for col in TARGETS:
        feats.update(_lag_roll_features(df, GROUP, col))

    # Cross-lags (examples)
    feats["NO2_lag1_for_O3"] = df.groupby(GROUP)["valeur_NO2"].shift(1)
    feats["PM10_lag1_for_PM25"] = df.groupby(GROUP)["valeur_PM10"].shift(1)

    # WEATHER lags/rollings (past only, after masking); std only if numeric
    for col in weather_cols:
        feats.update(
            _lag_roll_features(
                df, GROUP, col, with_std=pd.api.types.is_numeric_dtype(df[col])
            )
        )

    # Concatenate all features at once (fast, no fragmentation)
    feat_df = pd.DataFrame(feats, index=df.index)

    # --- Assemble full X with calendar/static features ---
    base_cols = ["hour", "is_day", "hour_sin", "hour_cos", "dow", "dow_sin", "dow_cos"]
    base_cols += [c for c in STATIC_KNOWN if c in df.columns]
    X_all = pd.concat([feat_df, df[base_cols]], axis=1)

    # --- Impute features (ffill per group -> train medians; std -> 0) ---
    X_all = X_all.groupby(df[GROUP]).ffill()

    train_rows = df["__split__"] == "train"
    # Medians come from train rows only so no test statistics leak in.
    train_meds = X_all[train_rows].median(numeric_only=True)
    X_all = X_all.fillna(train_meds)

    # Last resort for std columns whose train median is itself missing.
    std_cols = [c for c in X_all.columns if "_roll_std_" in c]
    if std_cols:
        X_all[std_cols] = X_all[std_cols].fillna(0.0)

    # --- Outputs ---
    feat_cols = list(X_all.columns)

    # Train
    X_train = X_all[train_rows].reset_index(drop=True)
    Y_train = df.loc[train_rows, TARGETS].reset_index(drop=True)

    # Test rows to predict (future only)
    pred_rows = (df["__split__"] == "test") & df["__is_future_test__"]
    X_test_pred = X_all[pred_rows].reset_index(drop=True)
    # Report the real row identifier whenever the data has one; the synthetic
    # "__grp__" label is the same for every row and cannot identify predictions.
    id_col = RAW_ID if RAW_ID in df.columns else GROUP
    meta_test = pd.DataFrame(
        {"id": df.loc[pred_rows, id_col], TIME: df.loc[pred_rows, TIME]}
    ).reset_index(drop=True)

    print(f"Groups used: {GROUP} ({df[GROUP].nunique()} unique)")
    print("X_train:", X_train.shape, "| Y_train:", Y_train.shape)
    print("X_test_pred:", X_test_pred.shape, "| meta_test:", meta_test.shape)
    print("Num features:", len(feat_cols))

    return X_train, Y_train, X_test_pred, meta_test, feat_cols






Groups used: __grp__ (1 unique)
X_train: (40991, 316) | Y_train: (40991, 5)
X_test_pred: (504, 316) | meta_test: (504, 2)
Num features: 316


In [None]:
# Build the modelling matrices from the raw train/test frames loaded above.
(X_train, Y_train, X_test_pred, meta_test, feat_cols) = build_dataset(
    df_tr=df_tr,
    df_te=df_te,
)
