In [12]:
import pandas as pd
import numpy as np



In [13]:
# Input files (edit these if the CSVs live elsewhere).
TRAIN_PATH, TEST_PATH = "traindata_imputed.csv", "testdata_preprocessed.csv"

# -----------------------------
# LOAD
# -----------------------------
# Read both splits with the default CSV parser settings.
df_tr, df_te = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

In [14]:
# One "<target>_was_missing" indicator column per pollutant; these are removed
# from the training frame.
cols_to_drop = [
    f"valeur_{pollutant}_was_missing"
    for pollutant in ("CO", "NO2", "O3", "PM10", "PM25")
]

# errors="ignore" skips indicators that are not present, so the cell can be re-run.
df_tr = df_tr.drop(cols_to_drop, axis=1, errors="ignore")


In [15]:
# Preview the first four rows of the training frame.
df_tr.head(4)


Unnamed: 0,id,valeur_NO2,valeur_CO,valeur_O3,valeur_PM10,valeur_PM25,is_holiday,is_weekend,temperature_2m,relative_humidity_2m,...,et0_fao_evapotranspiration,snowfall,rain,showers,weather_code,lockdown_code,datetime,hour,dayofweek,month
0,2020-01-01 00,42.9,0.718,15.7,73.1,64.4,1,0,0.9,100,...,0.0,0.0,0.0,0.0,3,0,2020-01-01 00:00:00,0,2,1
1,2020-01-01 01,33.6,0.587,10.1,74.8,66.0,1,0,-0.1,99,...,0.0,0.0,0.0,0.0,3,0,2020-01-01 01:00:00,1,2,1
2,2020-01-01 02,29.3,0.4165,5.1,51.0,44.9,1,0,2.6,98,...,0.0,0.0,0.0,0.0,3,0,2020-01-01 02:00:00,2,2,1
3,2020-01-01 03,30.5,0.246,7.2,27.7,25.1,1,0,2.1,100,...,0.0,0.0,0.0,0.0,3,0,2020-01-01 03:00:00,3,2,1


In [16]:
import pandas as pd
import numpy as np

# =========================
# CONFIG
# =========================
TIME = "datetime"
RAW_ID = "id"  # used as the group key if it repeats; else fallback to single group "__grp__"

TARGETS = ["valeur_NO2","valeur_CO","valeur_O3","valeur_PM10","valeur_PM25"]

# treat these as STATIC (known ahead), so: keep raw, NO lags/roll, NO masking
STATIC_KNOWN = ["is_holiday","is_weekend","lockdown_code"]

# weather candidates (STATIC_KNOWN and columns absent from the data are filtered out later)
WEATHER_CANDIDATES = [
    'temperature_2m','relative_humidity_2m','dew_point_2m','apparent_temperature',
    'pressure_msl','wind_speed_10m','wind_direction_10m','precipitation',
    'cloud_cover','cloud_cover_low','cloud_cover_mid','cloud_cover_high',
    'shortwave_radiation','direct_radiation','diffuse_radiation','global_tilted_irradiance',
    'wind_speed_80m','wind_speed_120m','wind_speed_180m',
    'wind_direction_80m','wind_direction_120m','wind_direction_180m',
    'wind_gusts_10m','vapour_pressure_deficit','cape','evapotranspiration',
    'et0_fao_evapotranspiration','snowfall','rain','showers',
    'weather_code',   # categorical-like; no rolling std is computed for it
    'visibility'
]

# lags/rolls (add 24h lag to reduce early-NaNs on forecast)
LAGS  = [6, 12, 24]
ROLLS = [6, 24]   # hours


def _lag_roll_features(df: pd.DataFrame, group: str, col: str, with_std: bool = True) -> dict:
    """Past-only lag and rolling features for one column, keyed by feature name.

    Lags shift ``col`` by every entry of ``LAGS`` within ``group``. Rolling mean
    (and std when ``with_std`` is true) are computed for every window in ``ROLLS``
    on the series shifted by one row, so a row's own value never enters its window.
    Keys are inserted as: all lags, then mean/std per window.
    """
    feats = {}
    grouped = df.groupby(group)[col]
    for lag in LAGS:
        feats[f"{col}_lag_{lag}"] = grouped.shift(lag)

    past = grouped.shift(1)
    for win in ROLLS:
        # groupby().rolling() prepends the group key to the index; drop it again so
        # the result aligns with df.index when the feature frame is assembled.
        feats[f"{col}_roll_mean_{win}"] = (
            past.groupby(df[group]).rolling(win, min_periods=1).mean().reset_index(level=0, drop=True)
        )
        if with_std:
            feats[f"{col}_roll_std_{win}"] = (
                past.groupby(df[group]).rolling(win, min_periods=2).std().reset_index(level=0, drop=True)
            )
    return feats


def build_dataset(df_tr: pd.DataFrame, df_te: pd.DataFrame):
    """Build lag/rolling feature matrices for the train split and the test rows to predict.

    The two frames are stacked, sorted chronologically, and features are derived on
    the combined timeline so test rows can look back into the train history.

    Parameters
    ----------
    df_tr, df_te : pd.DataFrame
        Train and test frames. Both need the ``TIME`` column; the train frame needs
        the ``TARGETS`` columns. Neither input is modified.

    Returns
    -------
    X_train : pd.DataFrame
        Imputed features of the train rows, in chronological order.
    Y_train : pd.DataFrame
        ``TARGETS`` of the same rows, same order.
    X_test_pred : pd.DataFrame
        Imputed features of the test rows at or after the first test timestamp
        (per group) where any target is missing.
    meta_test : pd.DataFrame
        Columns ``"id"`` and ``TIME`` for the rows of ``X_test_pred``. ``"id"``
        carries the original ``RAW_ID`` values whenever that column exists.
    feat_cols : list of str
        Column names of the feature matrices.
    """
    # --- union ---
    df_tr = df_tr.copy(); df_te = df_te.copy()
    df_tr[TIME] = pd.to_datetime(df_tr[TIME]); df_te[TIME] = pd.to_datetime(df_te[TIME])
    df_tr["__split__"] = "train"; df_te["__split__"] = "test"
    # Stable sort: rows sharing a timestamp keep their input order, so the output
    # row order is reproducible (the default single-key sort is not stable).
    df = (
        pd.concat([df_tr, df_te], ignore_index=True)
        .sort_values([TIME], kind="mergesort")
        .reset_index(drop=True)
    )

    # --- pick group key (per-station or single group) ---
    use_id = RAW_ID if RAW_ID in df.columns else None
    if use_id is not None and df[RAW_ID].nunique() == len(df):
        # every row has its own id -> it cannot define a series to lag over
        use_id = None
    if use_id is None:
        GROUP = "__grp__"
        df[GROUP] = "all"
    else:
        GROUP = RAW_ID
        df[GROUP] = df[GROUP].astype(str).str.strip()

    # --- build time features early ---
    df["hour"] = df[TIME].dt.hour
    df["is_day"] = ((df["hour"] >= 6) & (df["hour"] < 18)).astype(int)
    df["hour_sin"] = np.sin(2*np.pi*df["hour"]/24)
    df["hour_cos"] = np.cos(2*np.pi*df["hour"]/24)
    df["dow"] = df[TIME].dt.dayofweek
    df["dow_sin"] = np.sin(2*np.pi*df["dow"]/7)
    df["dow_cos"] = np.cos(2*np.pi*df["dow"]/7)

    # --- choose real weather columns (exclude STATIC_KNOWN) ---
    weather_cols = [c for c in WEATHER_CANDIDATES if c in df.columns and c not in STATIC_KNOWN]

    # --- mark future test horizon (first row per group with ANY target NaN) ---
    is_test = df["__split__"] == "test"
    test_rows = df[is_test]
    if len(test_rows):
        any_target_nan = test_rows[TARGETS].isna().any(axis=1)
        first_future = test_rows[any_target_nan].groupby(GROUP, as_index=True)[TIME].min()
        df = df.join(first_future.rename("__future_start__"), on=GROUP)
        df["__is_future_test__"] = (
            is_test & df["__future_start__"].notna() & (df[TIME] >= df["__future_start__"])
        )
        df = df.drop(columns="__future_start__")
    else:
        df["__is_future_test__"] = False

    # --- MASK ONLY true weather on future test rows (rule).
    # DO NOT mask static known cols.
    if weather_cols:
        # Integer/bool columns cannot hold NaN; cast them explicitly instead of
        # relying on silent upcasting during the masked assignment.
        int_like = [
            c for c in weather_cols
            if pd.api.types.is_integer_dtype(df[c]) or pd.api.types.is_bool_dtype(df[c])
        ]
        if int_like:
            df = df.astype({c: "float64" for c in int_like})
        df.loc[df["__is_future_test__"], weather_cols] = np.nan

    # --- feature assembly into dict (avoid fragmentation) ---
    feats = {}

    # POLLUTANTS: lags/rolls (past-only)
    for col in TARGETS:
        feats.update(_lag_roll_features(df, GROUP, col))

    # Cross-lags (examples)
    feats["NO2_lag1_for_O3"] = df.groupby(GROUP)["valeur_NO2"].shift(1)
    feats["PM10_lag1_for_PM25"] = df.groupby(GROUP)["valeur_PM10"].shift(1)

    # WEATHER: lags/rolls (past-only), but skip std for categorical-like
    for col in weather_cols:
        wants_std = pd.api.types.is_numeric_dtype(df[col]) and col != "weather_code"
        feats.update(_lag_roll_features(df, GROUP, col, with_std=wants_std))

    # build features dataframe once
    feat_df = pd.DataFrame(feats, index=df.index)

    # base (calendar + static known, as raw flags)
    base_cols = ["hour","is_day","hour_sin","hour_cos","dow","dow_sin","dow_cos"] + [c for c in STATIC_KNOWN if c in df.columns]
    X_all = pd.concat([feat_df, df[base_cols]], axis=1)

    # --- Impute features (per-group ffill -> train medians; std->0) ---
    X_all = X_all.groupby(df[GROUP]).ffill()

    # train medians computed on train rows only
    train_meds = X_all[df["__split__"]=="train"].median(numeric_only=True)
    X_all = X_all.fillna(train_meds)

    # remaining std NaNs to 0
    std_cols = [c for c in X_all.columns if "_roll_std_" in c]
    if std_cols:
        X_all[std_cols] = X_all[std_cols].fillna(0.0)

    # --- outputs ---
    feat_cols = list(X_all.columns)
    meta = df[[GROUP, TIME, "__split__", "__is_future_test__"]].copy()
    Y_all = df[TARGETS].copy()

    train_mask = meta["__split__"]=="train"
    X_train = X_all[train_mask].reset_index(drop=True)
    Y_train = Y_all[train_mask].reset_index(drop=True)

    mask_pred = (meta["__split__"]=="test") & (meta["__is_future_test__"])
    X_test_pred = X_all[mask_pred].reset_index(drop=True)
    # Report the real row ids when the data has them; the fallback group label
    # ("all") identifies nothing and must not be handed out as an id.
    id_source = RAW_ID if RAW_ID in df.columns else GROUP
    meta_test = df.loc[mask_pred, [id_source, TIME]].rename(columns={id_source:"id"}).reset_index(drop=True)

    print(f"Groups used: {GROUP} ({df[GROUP].nunique()} unique)")
    print("X_train:", X_train.shape, "| Y_train:", Y_train.shape)
    print("X_test_pred:", X_test_pred.shape, "| meta_test:", meta_test.shape)
    print("Num features:", len(feat_cols))

    return X_train, Y_train, X_test_pred, meta_test, feat_cols


In [17]:
# Build the train features/targets, the test rows to predict with their metadata,
# and the list of feature column names.
X_train, Y_train, X_test_pred, meta_test, feat_cols = build_dataset(df_tr, df_te)


Groups used: __grp__ (1 unique)
X_train: (40991, 206) | Y_train: (40991, 5)
X_test_pred: (504, 206) | meta_test: (504, 2)
Num features: 206


In [18]:
# Attach the original test ids to the rows that will be predicted.
# build_dataset sorts by time, keeps only the future-horizon test rows and resets
# the index, so X_test_pred's 0..n-1 index is not guaranteed to line up with
# df_te's index. Look each id up by timestamp (meta_test is row-aligned with
# X_test_pred) instead of relying on index alignment.
# NOTE(review): assumes one test row per timestamp, as in the single-group run
# shown below ("Groups used: __grp__ (1 unique)"); on duplicates the first id wins.
id_by_time = pd.Series(df_te[RAW_ID].to_numpy(), index=pd.to_datetime(df_te[TIME]))
id_by_time = id_by_time[~id_by_time.index.duplicated(keep="first")]
X_test_pred["id"] = meta_test[TIME].map(id_by_time).to_numpy()

In [20]:
# 1) Rebuild meta_train in the same row order as X_train / Y_train.
#    build_dataset sorts the rows by time before splitting, so the original train
#    metadata is put in chronological order as well (stable sort: rows sharing a
#    timestamp keep their file order) rather than assuming df_tr is pre-sorted.
train_order = np.argsort(pd.to_datetime(df_tr[TIME]).to_numpy(), kind="stable")
meta_train = df_tr[[RAW_ID, TIME]].iloc[train_order].reset_index(drop=True)
if len(meta_train) != len(X_train):
    raise ValueError(
        f"meta_train has {len(meta_train)} rows but X_train has {len(X_train)}; "
        "re-run build_dataset on the current df_tr"
    )

# 2) Build full train dataset (X + Y + metadata)
train_full = X_train.copy()
for tgt in TARGETS:
    train_full[tgt] = Y_train[tgt].values

train_full["id"] = meta_train[RAW_ID].values
train_full["datetime"] = meta_train[TIME].values

# 3) Build full test dataset (X_test_pred + meta_test)
#    "id" is already on X_test_pred (attached in the cell above), so only the
#    timestamp has to be added here.
test_full = X_test_pred.copy()
test_full["datetime"] = meta_test["datetime"].values

# 4) Save to CSV
train_full.to_csv("train_features.csv", index=False)
test_full.to_csv("test_features_to_predict.csv", index=False)

print("✅ Saved train_features.csv and test_features_to_predict.csv")


✅ Saved train_features.csv and test_features_to_predict.csv
