# 04 — Feature Pipeline

Demonstrates the preprocessing pipeline: rolling/lag feature creation,
categorical encoding, scaling, and persistence of the fitted `ColumnTransformer`.

In [None]:
import sys
from pathlib import Path

# Resolve the repository root so `src.*` imports work whether the kernel was
# started inside `notebooks/` or directly at the project root.
cwd = Path.cwd()
PROJECT_ROOT = cwd.parent if cwd.name == "notebooks" else cwd

# Keep the cell idempotent: drop any earlier copy of the root before putting it
# at the front, so re-running the cell never stacks duplicate sys.path entries
# while the project root still takes import priority.
project_root_str = str(PROJECT_ROOT)
sys.path[:] = [entry for entry in sys.path if entry != project_root_str]
sys.path.insert(0, project_root_str)
print("Project root:", PROJECT_ROOT)

## 1. Run the full pipeline (saved artifacts are loaded in section 3)

In [None]:
import numpy as np
import pandas as pd
import joblib
from src.features.feature_pipeline import (
    build_pipeline, prepare_dataset, get_feature_names,
    RollingLagFeatures, build_column_transformer,
)

# Input table and the location of the persisted preprocessing pipeline.
DATA_PATH = PROJECT_ROOT / "data" / "Combined_Dataset_40k.parquet"
PIPELINE_PATH = PROJECT_ROOT / "models" / "preprocess_pipeline.joblib"

# Build everything from scratch (also saves pipeline + train_ready.parquet).
# The save directory is the folder that holds PIPELINE_PATH, i.e. `models/`.
X, y, feature_names, ct = build_pipeline(
    parquet_path=DATA_PATH,
    save_dir=PIPELINE_PATH.parent,
)

# Quick sanity report on what came back.
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
nan_total = np.isnan(X).sum()
print(f"NaN count in X: {nan_total}")
n_features = len(feature_names)
print(f"Feature count: {n_features}")

## 2. List all feature names

In [None]:
# Tabulate every feature name next to its 1-based position.
feature_positions = range(1, len(feature_names) + 1)
feat_df = pd.DataFrame({"idx": feature_positions, "feature": feature_names})
feat_df

## 3. Load the saved pipeline and transform a subset

In [None]:
# Restore the pipeline persisted on disk.
# NOTE(review): joblib.load unpickles the file — only load artifacts produced
# by a trusted run of this project.
ct_loaded = joblib.load(PIPELINE_PATH)
print(f"Loaded pipeline from {PIPELINE_PATH}")
transformer_names = [name for name, *_ in ct_loaded.transformers]
print(f"Pipeline transformers: {transformer_names}")

# Read train_ready back and push a reproducible 100-row sample through the
# loaded pipeline (fixed seed so reruns pick the same rows).
train_ready_path = PROJECT_ROOT / "data" / "train_ready.parquet"
train_ready = pd.read_parquet(train_ready_path)
sample = train_ready.sample(n=100, random_state=42)
X_sample = ct_loaded.transform(sample)

print(f"\nSample X shape: {X_sample.shape}")
sample_nan_total = np.isnan(X_sample).sum()
print(f"NaNs in sample X: {sample_nan_total}")

In [None]:
# Preview the first five transformed rows; transposed so that each feature
# is a row and each sampled record is a column.
first_rows = X_sample[:5]
sample_df = pd.DataFrame(first_rows, columns=feature_names)
sample_df.transpose()

## 4. Feature group breakdown

In [None]:
from src.features.feature_pipeline import (
    PACKET_FLOW_COLS, DEVICE_COLS, TIMING_COLS,
    QUEUE_CONTROLLER_COLS, SECURITY_COLS,
    CAT_ONEHOT_COLS, CAT_ORDINAL_COLS, CAT_FREQ_COLS,
    ROLLING_WINDOWS_SEC, LAG_STEPS,
)

# Rolling and lag entries are name *patterns* built from the configured
# windows / lag steps, not literal column names, so the count printed for
# those two groups is the number of patterns.
rolling_patterns = [
    f"latency_roll_*_{window}s, packet_rate_{window}s"
    for window in ROLLING_WINDOWS_SEC
]
lag_patterns = [f"latency_lag_{lag}" for lag in LAG_STEPS]

# Display label -> members of that feature group, in presentation order.
groups = dict([
    ("Packet / Flow", PACKET_FLOW_COLS),
    ("Device", DEVICE_COLS),
    ("Timing", TIMING_COLS),
    ("Queue / Controller", QUEUE_CONTROLLER_COLS),
    ("Security", SECURITY_COLS),
    ("Rolling features", rolling_patterns),
    ("Lag features", lag_patterns),
    ("One-hot encoded", CAT_ONEHOT_COLS),
    ("Ordinal encoded", CAT_ORDINAL_COLS),
    ("Frequency encoded", CAT_FREQ_COLS),
])

# Print each group header followed by one bullet per member.
for group_label in groups:
    members = groups[group_label]
    print(f"\n{group_label} ({len(members)} raw columns):")
    for member in members:
        print(f"  • {member}")

## 5. Quick statistics on transformed X

In [None]:
# Wrap the transformed matrix in a labelled frame, then show per-feature
# summary statistics (one feature per row, rounded to 3 decimals).
X_df = pd.DataFrame(X, columns=feature_names)
X_summary = X_df.describe()
X_summary.transpose().round(3)

In [None]:
# Class balance of the target: counts per label, then the mean of y
# reported as the positive rate.
print("Target (success_flag) distribution:")
class_counts = y.value_counts()
print(class_counts)
positive_rate = y.mean()
print(f"\nPositive rate: {positive_rate:.4f}")

---
**Artifacts saved:**
- `models/preprocess_pipeline.joblib`
- `data/train_ready.parquet`