In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.decomposition import PCA
import joblib

# Default output directory for the feature tables and the fitted PCA written by
# the wrappers below; the path is relative to the current working directory.
RESOURCES_DIR = Path("../resources")
# exist_ok=True keeps re-running this cell from failing when the directory
# already exists.
RESOURCES_DIR.mkdir(exist_ok=True)

# -------------------------------
# FAST FEATURE EXTRACTORS
# -------------------------------
def _agg_basic_stats(df, value_col):
    """Summarise ``value_col`` over all rows of each id.

    Args:
        df: Frame whose index contains a level named ``"id"``.
        value_col: Name of the numeric column to summarise.

    Returns:
        DataFrame indexed by id with the columns ``whole_mean``, ``whole_std``,
        ``whole_min``, ``whole_max``, ``whole_median``, ``whole_q10``,
        ``whole_q90``, ``whole_skew`` and ``whole_kurt``. Statistics that are
        undefined for very short groups (e.g. the std of a single row) are
        left as NaN for the caller to fill.
    """
    stat_names = [
        "mean", "std", "min", "max", "median", "q10", "q90", "skew", "kurt",
    ]
    stats = df.groupby(level="id")[value_col].agg([
        "mean", "std", "min", "max", "median",
        lambda s: s.quantile(0.10),
        lambda s: s.quantile(0.90),
        "skew",
        # Passed as a callable rather than the "kurt" string: string aliases
        # are looked up on the groupby object, and older pandas releases do
        # not provide a groupby kurt() method.
        pd.Series.kurt,
    ])
    # The lambdas get auto-generated labels, so assign the names positionally.
    stats.columns = stat_names
    return stats.add_prefix("whole_")

def _agg_segment_stats(df, value_col, period_col, seg):
    """Summarise ``value_col`` per id over the rows of one period segment.

    Args:
        df: Frame whose index contains a level named ``"id"``.
        value_col: Name of the numeric column to summarise.
        period_col: Name of the column holding the segment label.
        seg: Segment to keep (rows where ``period_col == seg``). ``0`` yields
            ``pre_*`` columns; any other value yields ``post_*`` columns.

    Returns:
        DataFrame indexed by id with mean, std, min, max, median, q10, q90,
        skew and kurt columns carrying the segment prefix. Ids without rows in
        the segment are absent, except when no row at all matches ``seg``: then
        every id of ``df`` is returned with all statistics set to 0.0.
    """
    prefix = "pre_" if seg == 0 else "post_"
    stat_names = [
        "mean", "std", "min", "max", "median", "q10", "q90", "skew", "kurt",
    ]

    sub = df[df[period_col] == seg]
    if sub.empty:
        # Nothing to aggregate: hand back an all-zero frame so downstream
        # joins still find the expected columns.
        all_ids = df.index.get_level_values("id").unique()
        return pd.DataFrame(
            0.0, index=all_ids, columns=[prefix + name for name in stat_names]
        )

    stats = sub.groupby(level="id")[value_col].agg([
        "mean", "std", "min", "max", "median",
        lambda s: s.quantile(0.10),
        lambda s: s.quantile(0.90),
        "skew",
        # Passed as a callable rather than the "kurt" string: string aliases
        # are looked up on the groupby object, and older pandas releases do
        # not provide a groupby kurt() method.
        pd.Series.kurt,
    ])
    # The lambdas get auto-generated labels, so assign the names positionally.
    stats.columns = stat_names
    return stats.add_prefix(prefix)

def _length_feats(df, period_col):
    """Count rows per id: overall, per period label, and the post/pre ratio.

    Returns a frame indexed by id with ``n_total`` (all rows), ``n_pre`` (rows
    where ``period_col`` is 0), ``n_post`` (rows where it is 1) and
    ``post_pre_ratio`` (``n_post / n_pre``, or 0.0 when ``n_pre`` is zero).
    """
    per_id = df.groupby(level="id")[period_col]
    totals = per_id.size().rename("n_total")

    # One column per distinct period label, one row per id.
    by_period = (
        per_id.value_counts()
        .unstack(fill_value=0)
        .rename(columns={0: "n_pre", 1: "n_post"})
    )
    # A label that never occurs produces no column; add it as all-zero.
    for col in ("n_pre", "n_post"):
        if col not in by_period:
            by_period[col] = 0

    lengths = pd.concat(
        [totals, by_period[["n_pre", "n_post"]]], axis=1
    ).fillna(0)
    has_pre = lengths["n_pre"] > 0
    lengths["post_pre_ratio"] = np.where(
        has_pre, lengths["n_post"] / lengths["n_pre"], 0.0
    )
    return lengths

def _ewm_tail_last(df, value_col, alphas=(0.1, 0.01, 0.001)):
    """Final exponentially-weighted mean and std of ``value_col`` for each id.

    For every smoothing factor the recursive (``adjust=False``) EWM is run over
    each id's rows in their stored order and only the last value is kept.

    Args:
        df: Frame whose index contains a level named ``"id"``; the remaining
            index levels are not used.
        value_col: Name of the numeric column to smooth.
        alphas: Smoothing factors, one pair of output columns per factor.

    Returns:
        DataFrame indexed by id (in order of first appearance) with columns
        ``ewm{alpha:g}_mean_last`` and ``ewm{alpha:g}_std_last``. The std is
        the square root of the debiased EWM variance and is 0.0 where that
        variance is undefined (e.g. a single row).
    """
    ids = df.index.get_level_values("id").unique()
    if len(df) == 0:
        # No groups to aggregate; still expose the expected columns.
        columns = [
            f"ewm{a:g}_{kind}_last" for a in alphas for kind in ("mean", "std")
        ]
        return pd.DataFrame(index=ids, columns=columns, dtype=float)

    out = pd.DataFrame(index=ids)
    grouped = df.groupby(level="id")[value_col]
    for a in alphas:
        # Reduce each group straight to its tail value. This avoids building a
        # full-length smoothed series and looking the last rows up again by
        # label, which tied the function to a level named "time" and to a
        # unique index. ``a=a`` binds the current alpha to the lambda.
        mean_last = grouped.agg(
            lambda s, a=a: s.ewm(alpha=a, adjust=False).mean().iloc[-1]
        )
        var_last = grouped.agg(
            lambda s, a=a: s.ewm(alpha=a, adjust=False).var(bias=False).iloc[-1]
        )
        # Assignment aligns on the id index of ``out``.
        out[f"ewm{a:g}_mean_last"] = mean_last.astype(float)
        # Clip guards against tiny negative variances from rounding.
        out[f"ewm{a:g}_std_last"] = (
            var_last.astype(float).clip(lower=0).pow(0.5).fillna(0.0)
        )
    return out

def _fft_band_powers_fast(df, value_col="value", Nf=12):
    """Normalised power of the first ``Nf`` non-DC rFFT bins for each id.

    Each id's values are mean-centred, transformed with ``np.fft.rfft`` and the
    squared magnitudes of bins ``1..Nf`` are divided by the total power of the
    spectrum. Rows of one id must be contiguous, so the frame is sorted by
    index first when it is not already monotonic.

    Args:
        df: Frame whose index contains a level named ``"id"``.
        value_col: Name of the numeric column to transform.
        Nf: Number of frequency bins to report.

    Returns:
        DataFrame indexed by id with columns ``fpow_1`` .. ``fpow_{Nf}``.
        Bins beyond the length of a short spectrum are 0.0, as are all bins of
        ids with fewer than two rows or with zero (or non-finite) total power.
        An empty input yields an empty frame with the same columns.
    """
    columns = [f"fpow_{k}" for k in range(1, Nf + 1)]

    if not df.index.is_monotonic_increasing:
        df = df.sort_index()

    ids = df.index.get_level_values("id").to_numpy()
    vals = df[value_col].to_numpy(dtype=np.float64)

    if ids.size == 0:
        return pd.DataFrame(
            columns=columns, index=pd.Index([], name="id"), dtype=np.float64
        )

    # Start offset of every run of equal ids, and the matching end offsets.
    starts = np.concatenate(([0], np.flatnonzero(ids[1:] != ids[:-1]) + 1))
    stops = np.append(starts[1:], ids.size)

    # Pre-filled with zeros, so skipped groups and short spectra need no
    # explicit padding.
    bands = np.zeros((starts.size, Nf), dtype=np.float64)
    for row, (b0, b1) in enumerate(zip(starts, stops)):
        v = vals[b0:b1]
        if v.size < 2:
            continue
        power = np.abs(np.fft.rfft(v - v.mean())) ** 2
        total = power.sum()
        if total > 0:  # False for zero and for NaN totals
            band = power[1:1 + Nf]
            bands[row, :band.size] = band / total

    # Ids keep their native type (no int() cast), so string or other
    # non-integer ids work and still match the index of the other tables.
    out = pd.DataFrame(
        bands, index=pd.Index(ids[starts], name="id"), columns=columns
    )
    return out.fillna(0.0)

def f_x2z_expert_fast(X: pd.DataFrame, Nf: int = 12,
                      value_col: str = "value", period_col: str = "period") -> pd.DataFrame:
    """Build the per-id expert feature table from the long-format series ``X``.

    The table combines whole-series, pre-segment and post-segment summary
    statistics, post-minus-pre deltas, row counts, final EWM mean/std values
    and ``Nf`` normalised FFT band powers. Missing and infinite entries are
    replaced by 0.0 so the result is fully finite.
    """
    # The FFT helper splits on contiguous id runs, so work on a sorted frame.
    frame = X if X.index.is_monotonic_increasing else X.sort_index()

    # Summary statistics: whole series first, then period 0 and period 1.
    whole_stats = _agg_basic_stats(frame, value_col=value_col)
    segment_stats = [
        _agg_segment_stats(frame, value_col=value_col, period_col=period_col, seg=segment)
        for segment in (0, 1)
    ]

    # Outer joins keep ids that lack one of the segments; their gaps become 0.
    table = whole_stats
    for stats in segment_stats:
        table = table.join(stats, how="outer")
    table = table.fillna(0.0)

    # Shift of each statistic from the pre to the post segment.
    for stat in ("mean", "std", "median", "q10", "q90", "skew", "kurt"):
        post_values = table.get(f"post_{stat}", 0.0)
        pre_values = table.get(f"pre_{stat}", 0.0)
        table[f"delta_{stat}"] = post_values - pre_values

    # Row counts and EWM tail values.
    length_table = _length_feats(frame, period_col=period_col)
    ewm_table = _ewm_tail_last(frame, value_col=value_col, alphas=(0.1, 0.01, 0.001))
    table = table.join(length_table, how="left")
    table = table.join(ewm_table, how="left")
    table = table.fillna(0.0)

    # Spectral band powers.
    spectrum = _fft_band_powers_fast(frame, value_col=value_col, Nf=Nf)
    table = table.join(spectrum, how="left").fillna(0.0)

    # Final guard: no infinities or NaNs leave this function.
    return table.replace([np.inf, -np.inf], 0.0).fillna(0.0)

# -------------------------------
# SAVE & PCA WRAPPERS
# -------------------------------
def build_and_save_features(X_train: pd.DataFrame, X_test: pd.DataFrame,
                            Nf: int = 12,
                            out_dir: Path = RESOURCES_DIR) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Compute the expert features for train and test and store them as parquet.

    Args:
        X_train: Long-format training series, passed to ``f_x2z_expert_fast``.
        X_test: Long-format test series, passed to ``f_x2z_expert_fast``.
        Nf: Number of FFT band-power features per id.
        out_dir: Directory receiving ``Z_train.parquet`` and ``Z_test.parquet``.
            It is created (including parents) when it does not exist; a plain
            string path is accepted as well.

    Returns:
        ``(Z_train, Z_test)``: the two feature tables that were written.
    """
    # Only the default directory is created at import time, so make sure a
    # caller-supplied location exists before writing into it.
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    Z_train = f_x2z_expert_fast(X_train, Nf=Nf)
    Z_test = f_x2z_expert_fast(X_test, Nf=Nf)

    Z_train_path = out_dir / "Z_train.parquet"
    Z_test_path = out_dir / "Z_test.parquet"
    Z_train.to_parquet(Z_train_path)
    Z_test.to_parquet(Z_test_path)
    print(f"[features] saved {Z_train.shape} -> {Z_train_path}")
    print(f"[features] saved {Z_test.shape}  -> {Z_test_path}")
    return Z_train, Z_test

def fit_pca_and_transform(Z_train: pd.DataFrame, Z_test: pd.DataFrame,
                          n_components: int | float = 0.99,
                          out_dir: Path = RESOURCES_DIR):
    """Fit PCA on the training features, project both sets and persist the results.

    Args:
        Z_train: Training feature table; the PCA is fitted on it.
        Z_test: Test feature table. Its columns are matched to ``Z_train`` by
            name before projecting; columns not present in ``Z_train`` are
            ignored.
        n_components: Passed to ``sklearn.decomposition.PCA`` (component count
            as int, or explained-variance fraction as float).
        out_dir: Directory receiving ``pca.joblib``, ``Z_train_pca.parquet``
            and ``Z_test_pca.parquet``. It is created (including parents) when
            it does not exist; a plain string path is accepted as well.

    Returns:
        ``(Z_train_pca, Z_test_pca)``: frames with columns ``pc_0``, ``pc_1``,
        ... that keep the row index of their respective input.

    Raises:
        ValueError: If ``Z_test`` lacks a column that ``Z_train`` has.
    """
    # The PCA only sees raw arrays, so a different column order in the test
    # table would silently project the wrong features. Align by name first.
    missing = Z_train.columns.difference(Z_test.columns)
    if len(missing) > 0:
        raise ValueError(
            f"Z_test is missing feature columns present in Z_train: {list(missing)}"
        )
    Z_test_aligned = Z_test.loc[:, Z_train.columns]

    # Only the default directory is created at import time, so make sure a
    # caller-supplied location exists before writing into it.
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    pca = PCA(n_components=n_components, svd_solver="auto", random_state=42)
    Ztr_pca = pca.fit_transform(Z_train.values)
    Zte_pca = pca.transform(Z_test_aligned.values)

    # Both projections have the same width, so one set of names serves both.
    pc_names = [f"pc_{i}" for i in range(Ztr_pca.shape[1])]
    Ztr_pca_df = pd.DataFrame(Ztr_pca, index=Z_train.index, columns=pc_names)
    Zte_pca_df = pd.DataFrame(Zte_pca, index=Z_test.index, columns=pc_names)

    joblib.dump(pca, out_dir / "pca.joblib")
    Ztr_pca_df.to_parquet(out_dir / "Z_train_pca.parquet")
    Zte_pca_df.to_parquet(out_dir / "Z_test_pca.parquet")
    print(f"[pca] kept {Ztr_pca.shape[1]} components; saved PCA + transformed matrices.")
    return Ztr_pca_df, Zte_pca_df

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import joblib

# Same artifact directory as in the feature-extraction cell; redefined here so
# this cell does not rely on that assignment. Relative to the working directory.
RESOURCES_DIR = Path("../resources")
# exist_ok=True keeps re-running this cell from failing when the directory
# already exists.
RESOURCES_DIR.mkdir(exist_ok=True)

if __name__ == "__main__":

        # Load the raw inputs from ../data (relative to the working directory).
        DATA_DIR = Path("../data")
        X_train = pd.read_parquet(DATA_DIR / "X_train.parquet")
        X_test = pd.read_parquet(DATA_DIR / "X_test.reduced.parquet")

        # squeeze() collapses a single-column frame to a Series.
        # NOTE(review): presumably these are one-column label tables — confirm.
        # Neither y_train nor y_test is used in the lines visible here.
        y_train = pd.read_parquet(DATA_DIR / "y_train.parquet").squeeze()
        y_test = pd.read_parquet(DATA_DIR / "y_test.reduced.parquet").squeeze()

        # Build the per-id feature tables for both sets and write them to
        # RESOURCES_DIR (the default out_dir of build_and_save_features).
        Z_train, Z_test = build_and_save_features(X_train, X_test, Nf=12)

        # OPTIONAL: PCA to denoise / compress. n_components=0.99 asks for the
        # components explaining 99% of the training variance; the fitted PCA
        # and both projected tables are saved as well.
        Z_train_pca, Z_test_pca = fit_pca_and_transform(Z_train, Z_test, n_components=0.99)