
# Feature Audit Notebook (after `preprocess.py`)

This notebook:
1) Loads your data and config  
2) Fits **preprocessor** (from `preprocess.py`) on train-only  
3) Produces **named, pandas** feature matrix for EDA  
4) Runs essential **overfit diagnostics**: importance, drift, leakage checks  
5) Gives visual summaries to guide next-step FE decisions

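> **Tip**: If your `scikit-learn` version is older than 1.2, or `set_output(transform="pandas")` fails for any other reason, the notebook falls back to NumPy output and still works (using generic `f0, f1, …` placeholder names wherever real column names are unavailable).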


In [None]:

# --- Imports & paths ---
import os, sys, json, math
from pathlib import Path
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.inspection import permutation_importance

# Make the project's src/ directory importable from this notebook.
repo_root = Path.cwd()
# Prefer ./src; when it is missing, assume the notebook sits one level below
# the project root and use ../src instead.
src_dir = repo_root / "src" if (repo_root / "src").exists() else repo_root.parent / "src"
sys.path.append(str(src_dir))

# Project-local modules, resolved through the sys.path entry added above.
from config import TrainConfig
from preprocess import infer_column_types, build_preprocessor

# Build the config with its default arguments and echo it.
cfg = TrainConfig()
print(cfg)


In [None]:

# --- Load data ---
df = pd.read_csv(cfg.paths.train_csv)
target = cfg.cols.target

# Take the column lists from the config only when both are provided;
# otherwise infer them from the loaded frame.
if cfg.cols.numeric is not None and cfg.cols.categorical is not None:
    num_cols, cat_cols = cfg.cols.numeric, cfg.cols.categorical
else:
    num_cols, cat_cols = infer_column_types(df, target)

# Separate the target column from the features.
y = df[target]
X = df.drop(columns=[target])

# Hold-out split, stratified on the target, with a fixed seed.
X_tr, X_va, y_tr, y_va = train_test_split(
    X,
    y,
    test_size=cfg.test_size,
    stratify=y,
    random_state=42,
)
print(X_tr.shape, X_va.shape)
# Class shares in the training part, ordered by class label.
print(y_tr.value_counts(normalize=True).sort_index())


In [None]:
# --- Fit preprocessor on train only ---
# Path, pd and np are provided by the imports cell at the top of the notebook.
pre = build_preprocessor(num_cols, cat_cols)

# Output directory for the transformed-feature preview.
out_path = Path("outputs")
out_path.mkdir(parents=True, exist_ok=True)
save_file = out_path / "preview_transformed_head.csv"

# Try to obtain pandas output (scikit-learn >= 1.2).
use_pandas = False
try:
    pre.set_output(transform="pandas")
    X_tr_t = pre.fit_transform(X_tr, y_tr)
    X_va_t = pre.transform(X_va)
    if hasattr(X_tr_t, "columns"):
        feature_names = list(X_tr_t.columns)
        use_pandas = True
    else:
        feature_names = [f"f{i}" for i in range(X_tr_t.shape[1])]
except Exception as e:
    print(f"[warn] pandas 输出失败，回退为 numpy。原因: {e}")
    # If set_output() itself succeeded and the failure came later (for example
    # a sparse-output encoder that cannot emit pandas), the pandas request is
    # still active and the retry below would fail the same way. Reset it first.
    if hasattr(pre, "set_output"):
        try:
            pre.set_output(transform="default")
        except Exception as reset_err:
            # Best effort only: the retry below still gets its chance.
            print(f"[warn] could not reset output mode: {reset_err}")
    X_tr_t = pre.fit_transform(X_tr, y_tr)
    X_va_t = pre.transform(X_va)
    # scipy sparse output breaks the len()/np.asarray()/pd.DataFrame() calls
    # used by the rest of the notebook, so densify it here.
    if hasattr(X_tr_t, "toarray"):
        X_tr_t = X_tr_t.toarray()
    if hasattr(X_va_t, "toarray"):
        X_va_t = X_va_t.toarray()
    # Placeholder names by default ...
    feature_names = [f"f{i}" for i in range(X_tr_t.shape[1])]
    # ... replaced by the real names when the fitted preprocessor can report
    # them and the count matches the transformed width.
    try:
        recovered_names = [str(name) for name in pre.get_feature_names_out()]
    except Exception:
        # Not every transformer implements get_feature_names_out().
        recovered_names = []
    if len(recovered_names) == len(feature_names):
        feature_names = recovered_names

print(f"Transformed shapes: train {X_tr_t.shape}, valid {X_va_t.shape}")
print("First 5 feature names:", feature_names[:5])

# Save a snapshot of the first 20 transformed rows.
try:
    pd.DataFrame(np.asarray(X_tr_t)[:20], columns=feature_names).to_csv(save_file, index=False)
    print(f"Saved transformed head -> {save_file.resolve()}")
except Exception as e:
    print(f"[warn] 无法保存特征快照: {e}")


## 1) Original Feature Health Checks

In [None]:

# Share of missing values per original (untransformed) training column, worst first
miss = X_tr.isna().mean().sort_values(ascending=False)
display(miss.head(30))

# Distinct-value count per categorical column (NaN counts as a value), largest first;
# an empty Series stands in when there are no categorical columns.
if len(cat_cols):
    card = X_tr[cat_cols].nunique(dropna=False).sort_values(ascending=False)
else:
    card = pd.Series(dtype=int)
display(card.head(30))


## 2) Variance (Transformed Features)

In [None]:

# Transformed training matrix as a DataFrame labelled with feature_names
Xtr_df = pd.DataFrame(X_tr_t, columns=feature_names)

# Per-feature variance, largest first
var = Xtr_df.var().sort_values(ascending=False)
display(var.head(30))

# Number of features whose variance does not exceed 1e-6
low_var = var.le(1e-6).sum()
print("Low/near-constant features:", low_var)


## 3) Correlation & Redundancy (numeric transformed)

In [None]:

# Pairwise correlation of the transformed features (Xtr_df is all-numeric here)
corr = Xtr_df.corr()

fig, ax = plt.subplots(figsize=(8,6))
im = ax.imshow(corr.values, aspect='auto')
fig.colorbar(im, ax=ax)
ax.set_title("Correlation heatmap (transformed)")
fig.tight_layout()
plt.show()

# Flag columns with |corr| above the threshold against any earlier column.
# Only the strict upper triangle is kept, so each pair is inspected once.
thr = 0.95
strict_upper = np.triu(np.ones(corr.shape), 1).astype(bool)
upper = corr.where(strict_upper)
exceeds_thr = upper.abs().gt(thr).any(axis=0)
high_corr = list(upper.columns[exceeds_thr.to_numpy()])
print("Highly correlated columns (>|0.95|), count:", len(high_corr))
print(high_corr[:20])


In [None]:
# Names of the highly correlated features. These are taken from `high_corr`
# (computed in the correlation cell) rather than from positional indices
# copied out of an earlier run, which raise IndexError or point at the wrong
# columns as soon as the preprocessing output changes.
high_corr_names = list(high_corr)

## 4) Fast Importance (XGB, strong regularization)

In [None]:

from xgboost import XGBClassifier

# Number of distinct classes present in the training target.
num_class = len(pd.unique(y_tr))

# Shallow, heavily regularised multiclass model used only to rank features.
xgb_light = XGBClassifier(
    objective="multi:softprob", num_class=num_class,
    n_estimators=400, learning_rate=0.05,
    max_depth=4, min_child_weight=6,
    subsample=0.7, colsample_bytree=0.7,
    reg_lambda=12.0, reg_alpha=1.0,
    eval_metric="mlogloss", random_state=42
)
xgb_light.fit(X_tr_t, y_tr, eval_set=[(X_va_t, y_va)], verbose=False)

# Feature importances as a named Series (zeros if the attribute is absent).
imp = getattr(xgb_light, "feature_importances_", np.zeros(X_tr_t.shape[1]))
imp_s = pd.Series(imp, index=feature_names).sort_values(ascending=False)
display(imp_s.head(40))

# Train/validation accuracy gap as a quick overfit indicator.
# Use the built-in round(): accuracy_score may return a plain Python float,
# which has no .round() method.
train_acc = float(accuracy_score(y_tr, xgb_light.predict(X_tr_t)))
valid_acc = float(accuracy_score(y_va, xgb_light.predict(X_va_t)))
print("Train acc:", round(train_acc, 4),
      " Valid acc:", round(valid_acc, 4))


## 5) Permutation Importance (Validation Set)

In [None]:

# Permutation importance of the fitted model on the validation split:
# mean change in accuracy over 5 shuffles per feature, largest first.
pi = permutation_importance(
    xgb_light,
    X_va_t,
    y_va,
    scoring="accuracy",
    n_repeats=5,
    random_state=42,
)
pi_mean = pd.Series(pi.importances_mean, index=feature_names)
pi_mean = pi_mean.sort_values(ascending=False)
display(pi_mean.head(30))


## 6) Adversarial Validation (Train vs Valid Drift)

In [None]:

from sklearn.linear_model import LogisticRegression

# Stack train and validation rows; the label d is 0 for train, 1 for validation.
Z = np.vstack([np.asarray(X_tr_t), np.asarray(X_va_t)])
d = np.r_[np.zeros(len(X_tr_t)), np.ones(len(X_va_t))]

# Fit a classifier to tell the two sets apart and score it on held-out rows.
Z_tr, Z_te, d_tr, d_te = train_test_split(Z, d, test_size=0.3, stratify=d, random_state=42)
adv = LogisticRegression(max_iter=2000)
adv.fit(Z_tr, d_tr)
p = adv.predict_proba(Z_te)[:,1]
auc = roc_auc_score(d_te, p)
print(f"[Adversarial AUC] {auc:.4f}  (≈0.5 good; ≥0.7 drift)")

# Train-side validation probability (to build weights later if needed).
p_train = adv.predict_proba(np.asarray(X_tr_t))[:,1]

# Write next to the other notebook artefacts instead of a hard-coded absolute
# path that may not exist on this machine.
adv_out_dir = Path("outputs")
adv_out_dir.mkdir(parents=True, exist_ok=True)
adv_prob_file = adv_out_dir / "adv_prob_train.csv"
pd.Series(p_train).to_csv(adv_prob_file, index=False)
print(f"Saved train-side adv probs -> {adv_prob_file}")


## 7) Leakage Probes (quick checks)

In [None]:

from sklearn.feature_selection import mutual_info_classif

# Quick leakage sanity check: mutual information of every transformed feature
# with the target, all features treated as continuous (discrete_features=False).
# For a multiclass target an ANOVA-like score may be the better probe.
mi = mutual_info_classif(
    np.asarray(X_tr_t),
    y_tr,
    discrete_features=False,
    random_state=42,
)
mi_s = pd.Series(mi, index=feature_names)
mi_s = mi_s.sort_values(ascending=False)
display(mi_s.head(30))


## 8) Save Ranked Lists for Manual Review

In [None]:

# Persist every ranking next to the other notebook artefacts instead of a
# hard-coded absolute path that may not exist on this machine.
rank_dir = Path("outputs")
rank_dir.mkdir(parents=True, exist_ok=True)

# File name -> ranked Series produced by the earlier cells.
rankings = {
    "rank_xgb_importance.csv": imp_s,
    "rank_permutation_importance.csv": pi_mean,
    "rank_mutual_info.csv": mi_s,
    "rank_variance.csv": var,
}
for file_name, ranking in rankings.items():
    ranking.to_csv(rank_dir / file_name)

print("Saved:")
for file_name in rankings:
    print(f" - {rank_dir / file_name}")



## 9) What to do next (Checklist)

- **Drop**: low-variance (~0), ultra-high correlation (>0.95) duplicates, obvious leakage fields.  
- **Encoding cleanup**: high-cardinality cols → pick **one** among Target Encoding (with strong smoothing) or Frequency Encoding. Low-cardinality → One-Hot. Rare categories → group to `__RARE__`.  
- **Rebuild preprocessor**: with the reduced set; rerun notebook; verify overfit gap.  
- **Group-wise add-back**: age/area/ratio/basement/quality/encodings in small groups; keep only groups with Δacc ≥ 0.2pp on validation.  
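- **If drift is high**: use adversarial reweighting — pass the odds `p_train / (1 - p_train)` (clipped to a sane range) as `sample_weight` in training, so that training rows resembling the validation set count more. Note that `1 - p_train` would do the opposite and down-weight exactly those rows.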


In [None]:
# Combine the rankings computed earlier into one table indexed by feature name:
#   imp_s   - XGB feature importance
#   pi_mean - permutation importance on the validation set
#   mi_s    - mutual information with the target (used as the leakage probe)
#   var     - variance of the transformed feature
# A feature missing from any ranking gets 0 for that score.
df_imp = pd.DataFrame({
    "fast_gain": imp_s,
    "perm_importance": pi_mean,
    "leak_score": mi_s,
    "variance": var
}).fillna(0)

keep_mask = (
    (df_imp["perm_importance"] > 0.005) &          # generalizes stably
    (df_imp["leak_score"] < 0.3) &                 # no sign of leakage
    (df_imp["variance"].between(1e-3, 1e7))        # variance in a reasonable range
)

keep_features = df_imp[keep_mask].index.tolist()
drop_features = df_imp.index.difference(keep_features).tolist()

print(f"保留 {len(keep_features)} 个，删除 {len(drop_features)} 个")
