# 4. Feature Engineering

In [1]:
import warnings
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Suppress all warnings for the remainder of the session.
warnings.filterwarnings("ignore")

# --------------------------------------------------
# Paths and basic setup
# --------------------------------------------------
PROJECT_ROOT = Path("..")  # from notebook/ back to project root
PREP_DIR = PROJECT_ROOT / "output_Preprocessing_TemporalDataSplitting"
ANOM_DIR = PROJECT_ROOT / "output_AnomalyDetection"
FE_DIR = PROJECT_ROOT / "output_FeatureEngineering"

# Output layout: <FE_DIR>/train/{orig,cleaned} and <FE_DIR>/test
FE_TRAIN_DIR = FE_DIR / "train"
FE_TRAIN_ORIG_DIR = FE_TRAIN_DIR / "orig"
FE_TRAIN_CLEAN_DIR = FE_TRAIN_DIR / "cleaned"
FE_TEST_DIR = FE_DIR / "test"

# Create every output directory up front (no error if it already exists).
for _out_dir in (FE_DIR, FE_TRAIN_ORIG_DIR, FE_TRAIN_CLEAN_DIR, FE_TEST_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

_banner = "=" * 70
print(_banner)
print("FEATURE ENGINEERING FOR 2004 (TRAIN) AND 2005 (TEST)")
print(_banner)
print("Output directory:", FE_DIR)

# --------------------------------------------------
# 1. Load preprocessed train/test and cleaned train
# --------------------------------------------------
print("\n[STEP 1] Loading preprocessed and cleaned data")

# All three CSVs share the same layout: a "DateTime" column that becomes a
# parsed index. Each frame is sorted by that index right after loading.
_read_opts = {"index_col": "DateTime", "parse_dates": True}

train_2004_orig = pd.read_csv(PREP_DIR / "train_2004.csv", **_read_opts).sort_index()
test_2005 = pd.read_csv(PREP_DIR / "test_2005.csv", **_read_opts).sort_index()
train_2004_cleaned = pd.read_csv(
    ANOM_DIR / "train_2004_cleaned.csv", **_read_opts
).sort_index()

for _label, _frame in (
    ("train_2004_orig shape:", train_2004_orig),
    ("train_2004_cleaned shape:", train_2004_cleaned),
    ("test_2005 shape:", test_2005),
):
    print(_label, _frame.shape)

for _label, _frame in (
    ("train_2004_orig range:", train_2004_orig),
    ("train_2004_cleaned range:", train_2004_cleaned),
    ("test_2005 range:", test_2005),
):
    print(_label, _frame.index.min(), "→", _frame.index.max())

# Original pipeline: full 2004 + full 2005
df_all_orig = pd.concat([train_2004_orig, test_2005]).sort_index()

# Cleaned pipeline: cleaned 2004 + full 2005
df_all_cleaned = pd.concat([train_2004_cleaned, test_2005]).sort_index()

print("\n[INFO] Combined pipelines")
for _label, _frame in (
    ("df_all_orig shape:", df_all_orig),
    ("df_all_cleaned shape:", df_all_cleaned),
):
    print(_label, _frame.shape, "range:", _frame.index.min(), "→", _frame.index.max())


# --------------------------------------------------
# Helper functions
# --------------------------------------------------
def encode_cyclical(df: pd.DataFrame, col: str, max_val: int) -> pd.DataFrame:
    """
    Add sine and cosine encodings of a periodic column.

    The columns ``{col}_sin`` and ``{col}_cos`` are written into ``df``
    itself (the frame is modified in place) and the same object is returned.

    df: frame that contains ``col``.
    col: name of the column to encode.
    max_val: divisor used as the period of the cycle.
    """
    # Map the value onto an angle once; both projections reuse it.
    angle = 2 * np.pi * df[col] / max_val
    for suffix, trig in (("sin", np.sin), ("cos", np.cos)):
        df[f"{col}_{suffix}"] = trig(angle)
    return df


def _add_hourly_features(df: pd.DataFrame, pollutants: list) -> pd.DataFrame:
    """Return a copy of ``df`` with calendar, lag, moving-average and change columns."""
    df_hourly = df.copy()

    # Calendar indicators from index
    df_hourly["hour"] = df_hourly.index.hour
    df_hourly["day_of_week"] = df_hourly.index.dayofweek
    df_hourly["month"] = df_hourly.index.month
    df_hourly["is_weekend"] = (df_hourly["day_of_week"] >= 5).astype(int)

    # Sine–cosine encoding for hour and day_of_week
    df_hourly = encode_cyclical(df_hourly, "hour", 24)
    df_hourly = encode_cyclical(df_hourly, "day_of_week", 7)

    # Short-term lags: 1, 3, 6, 12 hours
    # NOTE(review): shift(lag) moves by rows, not by clock time. If the index
    # has missing hours, "lag_Nh" spans more than N hours for the rows after
    # a gap (the same applies to rolling/diff/pct_change below) — confirm
    # this is acceptable.
    print("Adding hourly lags (1, 3, 6, 12 h)")
    for pollutant in pollutants:
        for lag in [1, 3, 6, 12]:
            df_hourly[f"{pollutant}_lag_{lag}h"] = df_hourly[pollutant].shift(lag)

    # Moving averages: 4, 8, 12 hours (window includes the current row)
    print("Adding moving averages (4, 8, 12 h)")
    for pollutant in pollutants:
        for window in [4, 8, 12]:
            df_hourly[f"{pollutant}_ma_{window}h"] = (
                df_hourly[pollutant].rolling(window=window, min_periods=1).mean()
            )

    # Rate-of-change: absolute and percentage
    print("Adding rate-of-change features")
    for pollutant in pollutants:
        df_hourly[f"{pollutant}_hourly_change"] = df_hourly[pollutant].diff()
        # A previous value of 0 makes pct_change return +/-inf, which dropna()
        # would keep. Turn those into NaN so the caller's dropna() removes
        # the row instead of leaking infinite values into the feature tables.
        pct = df_hourly[pollutant].pct_change()
        df_hourly[f"{pollutant}_hourly_pct_change"] = pct.replace(
            [np.inf, -np.inf], np.nan
        )

    return df_hourly


def _build_daily_features(
    df: pd.DataFrame, pollutants: list, sensors: list, meteo: list
) -> pd.DataFrame:
    """Return per-day mean/max/std aggregates plus calendar and lag columns."""
    # Same three statistics for every pollutant, sensor and meteo column.
    daily_agg_dict = {
        col: ["mean", "max", "std"] for col in [*pollutants, *sensors, *meteo]
    }

    df_daily = df.resample("D").agg(daily_agg_dict)
    # Flatten the (column, statistic) MultiIndex into "column_statistic".
    df_daily.columns = ["_".join(col).strip() for col in df_daily.columns.values]

    df_daily["day_of_week"] = df_daily.index.dayofweek
    df_daily["month"] = df_daily.index.month
    df_daily["is_weekend"] = (df_daily["day_of_week"] >= 5).astype(int)

    # Daily lags of pollutant means: 1, 2, 7 days
    print("Adding daily lags (1, 2, 7 d)")
    for pollutant in pollutants:
        mean_col = f"{pollutant}_mean"
        for lag in [1, 2, 7]:
            df_daily[f"{mean_col}_lag_{lag}d"] = df_daily[mean_col].shift(lag)

    return df_daily


def _merge_hourly_daily(df_hourly_clean: pd.DataFrame, df_daily_clean: pd.DataFrame):
    """Attach each row's same-date daily features; return (merged, merged column names)."""
    # Calendar columns already exist on the hourly side, so they are skipped.
    daily_features_to_merge = [
        col
        for col in df_daily_clean.columns
        if col not in ["day_of_week", "month", "is_weekend"]
    ]

    df_full = df_hourly_clean.copy()
    # Midnight of each timestamp is the key into the daily table's index.
    df_full["merge_date"] = df_full.index.normalize()

    df_full = df_full.merge(
        df_daily_clean[daily_features_to_merge],
        left_on="merge_date",
        right_index=True,
        how="left",
        suffixes=("", "_daily"),
    )

    df_full = df_full.drop(columns=["merge_date"])
    # Hours whose date is absent from the daily table get NaN and are dropped.
    return df_full.dropna(), daily_features_to_merge


def build_features_for_df(
    df: pd.DataFrame, tag: str
) -> "tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]":
    """
    Build hourly, daily and merged (hourly + daily) feature tables.

    df: hourly time series covering 2004 + 2005, indexed by timestamp. It
        must contain the pollutant, sensor and meteo columns listed below.
    tag: label for logging.

    Returns a tuple ``(hourly, daily, merged)``:
      * hourly: input columns plus calendar, cyclical, lag, moving-average
        and rate-of-change columns;
      * daily: per-day mean/max/std aggregates plus calendar columns and
        lags of the pollutant means;
      * merged: the hourly table with the same-date daily features attached.
    Rows containing any NaN are dropped from all three tables (e.g. the
    first rows, where the lags are undefined). Infinite percentage changes
    are treated as missing and dropped as well.

    Raises KeyError if a required column is missing from ``df``.

    NOTE(review): the daily aggregates cover the whole calendar day and are
    merged onto every hour of that same day, so an hourly row sees statistics
    that include later hours of the day — confirm this is acceptable for the
    downstream models.
    """
    print("\n" + "-" * 60)
    print(f"[BUILD FEATURES] version = {tag}")
    print("Input shape:", df.shape)
    print("Date range:", df.index.min(), "→", df.index.max())

    pollutants = ["CO(GT)", "C6H6(GT)", "NOx(GT)", "NO2(GT)"]
    sensors = [
        "PT08.S1(CO)",
        "PT08.S2(NMHC)",
        "PT08.S3(NOx)",
        "PT08.S4(NO2)",
        "PT08.S5(O3)",
    ]
    meteo = ["T", "RH", "AH"]

    # Fail early with one message naming every missing column, rather than
    # part-way through feature construction on the first missing one.
    missing = [
        col for col in [*pollutants, *sensors, *meteo] if col not in df.columns
    ]
    if missing:
        raise KeyError(f"Input frame is missing required columns: {missing}")

    # -----------------------------------
    # 2. Hourly features
    # -----------------------------------
    print("[Step 2] Creating hourly features")
    df_hourly_clean = _add_hourly_features(df, pollutants).dropna()
    print("Hourly feature table shape:", df_hourly_clean.shape)

    # -----------------------------------
    # 3. Daily features
    # -----------------------------------
    print("[Step 3] Creating daily features")
    df_daily_clean = _build_daily_features(df, pollutants, sensors, meteo).dropna()
    print("Daily feature table shape:", df_daily_clean.shape)

    # -----------------------------------
    # 4. Merged hourly + daily features
    # -----------------------------------
    print("[Step 4] Creating merged hourly + daily features")
    df_full_clean, daily_features_to_merge = _merge_hourly_daily(
        df_hourly_clean, df_daily_clean
    )

    print("Merged feature table shape:", df_full_clean.shape)
    print("Number of daily features merged per row:", len(daily_features_to_merge))

    return df_hourly_clean, df_daily_clean, df_full_clean


# --------------------------------------------------
# 2. Feature engineering for original pipeline
# --------------------------------------------------
# Build all three feature tables on 2004 + 2005 together, then split by year.
hourly_orig_all, daily_orig_all, merge_orig_all = build_features_for_df(
    df_all_orig,
    tag="orig_all",
)

# Boolean year masks, one pair per table (hourly / daily / merged).
_years_h = hourly_orig_all.index.year
_years_d = daily_orig_all.index.year
_years_m = merge_orig_all.index.year

mask_2004_h, mask_2005_h = _years_h == 2004, _years_h == 2005
mask_2004_d, mask_2005_d = _years_d == 2004, _years_d == 2005
mask_2004_m, mask_2005_m = _years_m == 2004, _years_m == 2005

train_hourly_orig, test_hourly = hourly_orig_all[mask_2004_h], hourly_orig_all[mask_2005_h]
train_daily_orig, test_daily = daily_orig_all[mask_2004_d], daily_orig_all[mask_2005_d]
train_merge_orig, test_merge = merge_orig_all[mask_2004_m], merge_orig_all[mask_2005_m]

# Save original train features
_train_orig_outputs = (
    (train_hourly_orig, FE_TRAIN_ORIG_DIR / "train_2004_fe_hourly_orig.csv"),
    (train_daily_orig, FE_TRAIN_ORIG_DIR / "train_2004_fe_daily_orig.csv"),
    (train_merge_orig, FE_TRAIN_ORIG_DIR / "train_2004_fe_merge_orig.csv"),
)
for _frame, _path in _train_orig_outputs:
    _frame.to_csv(_path)

print("\n[OUTPUT] Original pipeline (train 2004)")
for _frame, _path in _train_orig_outputs:
    print(_path)

# Save shared test features (the 2005 rows of the original-pipeline tables)
_test_outputs = (
    (test_hourly, FE_TEST_DIR / "test_2005_fe_hourly.csv"),
    (test_daily, FE_TEST_DIR / "test_2005_fe_daily.csv"),
    (test_merge, FE_TEST_DIR / "test_2005_fe_merge.csv"),
)
for _frame, _path in _test_outputs:
    _frame.to_csv(_path)

print("\n[OUTPUT] Shared test pipeline (2005)")
for _frame, _path in _test_outputs:
    print(_path)

# --------------------------------------------------
# 3. Feature engineering for cleaned pipeline
# --------------------------------------------------
hourly_clean_all, daily_clean_all, merge_clean_all = build_features_for_df(
    df_all_cleaned,
    tag="cleaned_all",
)

# Only the 2004 part is kept here; the test features come from the run above.
mask_2004_h_c = hourly_clean_all.index.year == 2004
mask_2004_d_c = daily_clean_all.index.year == 2004
mask_2004_m_c = merge_clean_all.index.year == 2004

train_hourly_cleaned = hourly_clean_all[mask_2004_h_c]
train_daily_cleaned = daily_clean_all[mask_2004_d_c]
train_merge_cleaned = merge_clean_all[mask_2004_m_c]

_train_clean_outputs = (
    (train_hourly_cleaned, FE_TRAIN_CLEAN_DIR / "train_2004_fe_hourly_cleaned.csv"),
    (train_daily_cleaned, FE_TRAIN_CLEAN_DIR / "train_2004_fe_daily_cleaned.csv"),
    (train_merge_cleaned, FE_TRAIN_CLEAN_DIR / "train_2004_fe_merge_cleaned.csv"),
)
for _frame, _path in _train_clean_outputs:
    _frame.to_csv(_path)

print("\n[OUTPUT] Cleaned pipeline (train 2004 after anomaly removal)")
for _frame, _path in _train_clean_outputs:
    print(_path)

# --------------------------------------------------
# 4. Summary table
# --------------------------------------------------
# One row per saved table: its name, row count and column count.
_summary_frames = {
    "train_orig_hourly": train_hourly_orig,
    "train_orig_daily": train_daily_orig,
    "train_orig_merge": train_merge_orig,
    "train_cleaned_hourly": train_hourly_cleaned,
    "train_cleaned_daily": train_daily_cleaned,
    "train_cleaned_merge": train_merge_cleaned,
    "test_2005_hourly": test_hourly,
    "test_2005_daily": test_daily,
    "test_2005_merge": test_merge,
}

summary = pd.DataFrame(
    {
        "version": list(_summary_frames),
        "n_rows": [_frame.shape[0] for _frame in _summary_frames.values()],
        "n_features": [_frame.shape[1] for _frame in _summary_frames.values()],
    }
)

summary_path = FE_DIR / "fe_train_test_summary.csv"
summary.to_csv(summary_path, index=False)

print("\n[SUMMARY]")
print(summary)
print("\nSummary saved to:", summary_path)
print("\n" + "=" * 70)
print("FEATURE ENGINEERING COMPLETE")
print("=" * 70)

FEATURE ENGINEERING FOR 2004 (TRAIN) AND 2005 (TEST)
Output directory: ../output_FeatureEngineering

[STEP 1] Loading preprocessed and cleaned data
train_2004_orig shape: (6602, 12)
train_2004_cleaned shape: (6426, 12)
test_2005 shape: (2231, 12)
train_2004_orig range: 2004-03-10 18:00:00 → 2004-12-31 23:00:00
train_2004_cleaned range: 2004-03-10 18:00:00 → 2004-12-31 23:00:00
test_2005 range: 2005-01-01 00:00:00 → 2005-04-04 14:00:00

[INFO] Combined pipelines
df_all_orig shape: (8833, 12) range: 2004-03-10 18:00:00 → 2005-04-04 14:00:00
df_all_cleaned shape: (8657, 12) range: 2004-03-10 18:00:00 → 2005-04-04 14:00:00

------------------------------------------------------------
[BUILD FEATURES] version = orig_all
Input shape: (8833, 12)
Date range: 2004-03-10 18:00:00 → 2005-04-04 14:00:00
[Step 2] Creating hourly features
Adding hourly lags (1, 3, 6, 12 h)
Adding moving averages (4, 8, 12 h)
Adding rate-of-change features
Hourly feature table shape: (8821, 56)
[Step 3] Creating dail