In [None]:
# ============================================================
# CA6000 (Kaggle PS S5E12) — Data Cleaning & Preprocessing
# Output: X_train_proc, X_val_proc, y_train, y_val, X_test_proc
# ============================================================

import os
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin, clone
import joblib
import scipy.sparse as sp

# Random seed shared by train_test_split, StratifiedKFold and XGBoost in this notebook.
SEED = 42
# Name of the label column; it must be present in train.csv (checked by basic_audit).
TARGET_COL = "diagnosed_diabetes"
# Row identifier column; dropped from the features and reused for the submission file.
ID_COL = "id"


In [None]:

# ----------------------------
# 1) Robust path resolver (Kaggle / Colab / local / /mnt/data)
# ----------------------------
from pathlib import Path

def resolve_dataset_paths(prefer_dir="/content"):
    """Locate ``train.csv``, ``test.csv`` and (optionally) ``sample_submission.csv``.

    Search roots are tried in this order: ``prefer_dir``, ``/mnt/data``, the
    current working directory and, when it exists, ``/kaggle/input``. Roots
    that are not directories are skipped.

    Inside a root, a file directly under the root wins. Otherwise the root is
    searched recursively and the lexicographically smallest match is used, so
    the result does not depend on filesystem enumeration order. ``test.csv``
    and ``sample_submission.csv`` are preferably taken from the directory that
    holds ``train.csv``; only if they are absent there is the whole root
    searched for them.

    Parameters
    ----------
    prefer_dir : str or path-like, default "/content"
        Directory searched first.

    Returns
    -------
    tuple
        ``(train_path, test_path, sub_path)`` as strings; ``sub_path`` is
        ``None`` when no sample submission was found.

    Raises
    ------
    FileNotFoundError
        If no search root contains both ``train.csv`` and ``test.csv``.
    """
    candidates = [Path(prefer_dir), Path("/mnt/data"), Path(".")]

    kaggle_input = Path("/kaggle/input")
    if kaggle_input.exists():
        candidates.append(kaggle_input)

    def find_file(root: Path, filename: str):
        """Return ``root/filename`` if present, else the first sorted recursive hit."""
        direct = root / filename
        if direct.is_file():
            return direct
        # Sorting makes the choice reproducible when several copies exist.
        hits = sorted(root.rglob(filename))
        return hits[0] if hits else None

    def find_near(anchor: Path, root: Path, filename: str):
        """Prefer ``filename`` next to ``anchor``; fall back to searching ``root``."""
        sibling = anchor.parent / filename
        return sibling if sibling.is_file() else find_file(root, filename)

    for root in candidates:
        if not root.is_dir():
            continue
        train_path = find_file(root, "train.csv")
        if train_path is None:
            continue
        test_path = find_near(train_path, root, "test.csv")
        if test_path is None:
            continue
        sub_path = find_near(train_path, root, "sample_submission.csv")
        return str(train_path), str(test_path), (str(sub_path) if sub_path else None)

    raise FileNotFoundError("Cannot find train.csv/test.csv under preferred dirs.")

# Resolve the dataset locations once; "/content" is searched first.
_resolved_paths = resolve_dataset_paths("/content")
TRAIN_PATH, TEST_PATH, SUB_PATH = _resolved_paths
print(*_resolved_paths)

In [None]:
# ----------------------------
# 2) Load data
# ----------------------------
# Read both CSV files with the same default parser settings.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# Quick look at the sizes and the first few training rows.
print("\nShapes:")
print("train:", train_df.shape)
print("test :", test_df.shape)
print("\nTrain head:")
display(train_df.head(3))

In [None]:
# ----------------------------
# 3) Data audit & sanity checks (good for your report)
# ----------------------------
def basic_audit(train_df: pd.DataFrame, test_df: pd.DataFrame):
    """Validate the raw train/test frames and print a short quality report.

    Checks performed (each raises ``AssertionError`` on failure):

    * ``TARGET_COL`` exists in ``train_df`` and ``ID_COL`` exists in both frames;
    * the training columns other than the target match the test columns;
    * ids are unique within each frame;
    * the target has no missing values and only takes the values 0/1.

    The target checks run after the report is printed, so the printed
    distribution is still available when they fail.

    Parameters
    ----------
    train_df : pd.DataFrame
        Raw training data including the target column.
    test_df : pd.DataFrame
        Raw test data without the target column.

    Returns
    -------
    None
        The function only asserts and prints.
    """
    # Required columns
    assert TARGET_COL in train_df.columns, f"Missing target '{TARGET_COL}' in train.csv"
    assert ID_COL in train_df.columns and ID_COL in test_df.columns, "Missing 'id' in train/test"

    # Column alignment (except target)
    train_features = [c for c in train_df.columns if c != TARGET_COL]
    assert set(train_features) == set(test_df.columns), "Train features != Test columns (schema mismatch)"

    # ID uniqueness
    assert train_df[ID_COL].is_unique, "Train id is not unique"
    assert test_df[ID_COL].is_unique, "Test id is not unique"

    # Fully duplicated rows
    dup_train = train_df.duplicated().sum()
    dup_test = test_df.duplicated().sum()

    # Per-column missing rate, highest first
    miss_train = train_df.isnull().mean().sort_values(ascending=False)
    miss_test = test_df.isnull().mean().sort_values(ascending=False)

    # Target summary
    y = train_df[TARGET_COL]
    unique_y = sorted(y.dropna().unique().tolist())

    print("\n[Audit] duplicates:", {"train": int(dup_train), "test": int(dup_test)})
    print("[Audit] top missing rate (train):")
    print(miss_train.head(10))
    print("[Audit] top missing rate (test):")
    print(miss_test.head(10))
    print("[Audit] target unique values:", unique_y)
    print("[Audit] target distribution:\n", y.value_counts(dropna=False))

    # Enforce a clean binary target: a later integer cast would otherwise
    # truncate fractional labels silently or fail on missing ones.
    assert not y.isnull().any(), f"Target '{TARGET_COL}' contains missing values"
    assert set(unique_y) <= {0, 1}, f"Target '{TARGET_COL}' is not binary 0/1: {unique_y}"

# Run all sanity checks before touching the data.
basic_audit(train_df, test_df)

# Store the label as plain integers (0/1).
train_df = train_df.astype({TARGET_COL: int})

In [None]:

# ----------------------------
# 4) Define column groups
# ----------------------------
# Categorical columns: object, pandas "string" and "category" dtypes.
# Selecting only "object" would miss string/category columns, and the
# ColumnTransformer built later uses remainder="drop", so any column not
# assigned to a group is silently discarded.
_non_feature_cols = {ID_COL, TARGET_COL}
cat_cols = [
    c
    for c in train_df.select_dtypes(include=["object", "string", "category"]).columns
    if c not in _non_feature_cols  # id/target must never be one-hot encoded
]

# Binary columns (known 0/1 flags in this dataset); keep only those present.
bin_cols = ["family_history_diabetes", "hypertension_history", "cardiovascular_history"]
bin_cols = [c for c in bin_cols if c in train_df.columns]

# Numeric columns = all numeric columns excluding id, target and the binary flags.
_not_numeric_features = _non_feature_cols | set(bin_cols)
num_cols = [
    c
    for c in train_df.select_dtypes(include=[np.number]).columns
    if c not in _not_numeric_features
]

print("Column groups:")
print("num_cols:", num_cols)
print("bin_cols:", bin_cols)
print("cat_cols:", cat_cols)


In [None]:
# Optional: verify binary columns truly contain only 0/1
for col in bin_cols:
    # Missing values are ignored here; they are imputed later in the pipeline.
    observed = set(train_df[col].dropna().unique())
    unexpected = observed.difference({0, 1})
    if unexpected:
        raise ValueError(f"Binary col '{col}' has unexpected values: {unexpected}")

In [None]:
# ----------------------------
# 5) (Optional but nice) Range check for numeric columns
# ----------------------------
def numeric_range_report(df: pd.DataFrame, columns):
    """Summarise the value range of the selected columns.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the columns to describe.
    columns : list-like
        Column labels passed to ``df[...]``.

    Returns
    -------
    pd.DataFrame
        One row per column with ``min``, ``1%``, ``50%``, ``99%``, ``max``,
        ``mean`` and ``std``, ordered by ``max`` from largest to smallest.
    """
    wanted_stats = ["min", "1%", "50%", "99%", "max", "mean", "std"]
    summary = df[columns].describe(percentiles=[0.01, 0.5, 0.99]).transpose()
    # Keep only the compact set of statistics, in a fixed column order.
    compact = summary.loc[:, wanted_stats]
    return compact.sort_values(by="max", ascending=False)

# Describe the numeric features and show the eight with the largest maximum.
range_report = numeric_range_report(train_df, num_cols)
print("\nNumeric range report (top 8 by max):")
display(range_report.iloc[:8])


In [None]:
# ----------------------------
# 6) Split data BEFORE fitting preprocessors (avoid leakage)
# ----------------------------
# Keep the identifiers aside; they are not features.
train_ids = train_df[ID_COL].values
test_ids = test_df[ID_COL].values

# Labels as int32, features without the target and id columns.
y = train_df[TARGET_COL].values.astype(np.int32)
X = train_df.drop(columns=[TARGET_COL]).drop(columns=[ID_COL])
X_test = test_df.drop(columns=[ID_COL])

# Stratified 80/20 hold-out split; preprocessors are fitted on X_train only.
split_options = dict(test_size=0.2, random_state=SEED, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

print("\nSplit shapes:")
print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_val  :", X_val.shape,   "y_val  :", y_val.shape)
print("X_test :", X_test.shape)


In [None]:
# ----------------------------
# 7) Custom transformer: quantile clipping for numeric outliers
#    (fit on training only)
# ----------------------------
class QuantileClipper(BaseEstimator, TransformerMixin):
    """Clip every feature to quantile bounds learned during ``fit``.

    Parameters
    ----------
    lower_q : float, default 0.005
        Quantile (between 0 and 1) used as the per-feature lower bound.
    upper_q : float, default 0.995
        Quantile (between 0 and 1) used as the per-feature upper bound.

    Attributes
    ----------
    lower_, upper_ : ndarray
        Per-feature bounds computed with ``np.nanquantile`` along axis 0,
        so NaNs in the fitting data are ignored.
    n_features_in_ : int
        Number of columns seen in ``fit`` (set only for 2-D input).
    feature_names_in_ : ndarray of object
        Column labels seen in ``fit`` (set only when ``X`` has ``columns``).
    """

    def __init__(self, lower_q=0.005, upper_q=0.995):
        # Stored untouched; validation happens in fit so the estimator
        # can be cloned and re-parameterised freely.
        self.lower_q = lower_q
        self.upper_q = upper_q

    def fit(self, X, y=None):
        """Learn the per-feature clipping bounds from ``X``.

        Raises
        ------
        ValueError
            If the quantiles do not satisfy ``0 <= lower_q <= upper_q <= 1``.
        """
        if not 0.0 <= self.lower_q <= self.upper_q <= 1.0:
            raise ValueError(
                "Quantiles must satisfy 0 <= lower_q <= upper_q <= 1, "
                f"got lower_q={self.lower_q}, upper_q={self.upper_q}"
            )

        # Remember column labels (if any) before converting to a plain array.
        columns = getattr(X, "columns", None)
        if columns is not None:
            self.feature_names_in_ = np.asarray(columns, dtype=object)

        X = np.asarray(X, dtype=float)
        if X.ndim == 2:
            self.n_features_in_ = X.shape[1]

        self.lower_ = np.nanquantile(X, self.lower_q, axis=0)
        self.upper_ = np.nanquantile(X, self.upper_q, axis=0)
        return self

    def transform(self, X):
        """Return ``X`` as a float array with every feature clipped to its bounds.

        Raises
        ------
        ValueError
            If ``X`` is 2-D and its column count differs from the one seen in ``fit``.
        """
        X = np.asarray(X, dtype=float)
        expected = getattr(self, "n_features_in_", None)
        if expected is not None and X.ndim == 2 and X.shape[1] != expected:
            # Without this check a single learned bound would broadcast silently.
            raise ValueError(
                f"X has {X.shape[1]} features, but QuantileClipper was fitted with {expected}"
            )
        return np.clip(X, self.lower_, self.upper_)

    def get_feature_names_out(self, input_features=None):
        """Return output feature names; clipping keeps columns one-to-one.

        Provided so that a pipeline containing this step can report feature names.
        """
        if input_features is not None:
            return np.asarray(input_features, dtype=object)
        names = getattr(self, "feature_names_in_", None)
        if names is not None:
            return names
        return np.asarray([f"x{i}" for i in range(self.n_features_in_)], dtype=object)

In [None]:

# ----------------------------
# 8) Build preprocessing pipeline
#    - numeric: median impute -> clip -> standardize
#    - binary : most_frequent impute (keep 0/1)
#    - cate   : most_frequent impute -> one-hot (sparse)
# ----------------------------
def _build_onehot_encoder():
    """Create a sparse one-hot encoder that ignores unseen categories.

    Tries the ``sparse_output`` keyword first and falls back to the older
    ``sparse`` keyword when the constructor rejects it with ``TypeError``.
    """
    try:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=True)
    except TypeError:
        return OneHotEncoder(handle_unknown="ignore", sparse=True)


# Numeric features: median impute -> quantile clip -> standardize.
numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("clipper", QuantileClipper(lower_q=0.005, upper_q=0.995)),
    ("scaler", StandardScaler()),
])

# Binary flags: fill gaps with the most frequent value, keep 0/1 as is.
binary_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
])

# Categorical features: most-frequent impute -> sparse one-hot.
categorical_encoder = _build_onehot_encoder()
categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", categorical_encoder),
])

# Columns not listed in any group are dropped.
preprocess = ColumnTransformer(
    [
        ("num", numeric_pipe, num_cols),
        ("bin", binary_pipe, bin_cols),
        ("cat", categorical_pipe, cat_cols),
    ],
    remainder="drop",
    sparse_threshold=1.0,  # prefer sparse output so XGBoost can consume efficiently
    verbose_feature_names_out=False,
)


In [None]:

# ----------------------------
# 9) Fit on train, transform val/test
# ----------------------------
# Fit on the training split only, then reuse the fitted transformer for
# validation and test. float32 works for dense arrays and sparse matrices.
X_train_proc = preprocess.fit_transform(X_train).astype(np.float32)
X_val_proc = preprocess.transform(X_val).astype(np.float32)
X_test_proc = preprocess.transform(X_test).astype(np.float32)

print("Processed shapes:")
print("X_train_proc:", X_train_proc.shape)
print("X_val_proc  :", X_val_proc.shape)
print("X_test_proc :", X_test_proc.shape)

# Safety checks (support sparse/dense)
def assert_no_nan(arr, name):
    """Assert that ``arr`` holds no NaN values.

    For scipy sparse matrices only the explicitly stored entries
    (``arr.data``) are inspected; dense inputs are checked element-wise.

    Parameters
    ----------
    arr : array-like or scipy sparse matrix
        Data to check.
    name : str
        Label inserted into the assertion message.

    Raises
    ------
    AssertionError
        If at least one NaN is found.
    """
    values = arr.data if sp.issparse(arr) else arr
    has_nan = bool(np.isnan(values).any())
    assert not has_nan, f"NaNs remain in {name}"

# Verify each processed matrix in turn (train, validation, test).
for _label, _matrix in (
    ("X_train_proc", X_train_proc),
    ("X_val_proc", X_val_proc),
    ("X_test_proc", X_test_proc),
):
    assert_no_nan(_matrix, _label)


In [None]:
# ----------------------------
# 10) Save artifacts for reproducibility
# ----------------------------
# Bundle the fitted transformer with the column bookkeeping needed to reuse it.
artifact = dict(
    id_col=ID_COL,
    target_col=TARGET_COL,
    num_cols=num_cols,
    bin_cols=bin_cols,
    cat_cols=cat_cols,
    preprocess=preprocess,
)

joblib.dump(artifact, "preprocess_artifact.joblib")
print("\nSaved preprocess artifact -> preprocess_artifact.joblib")

# Optional: save processed arrays (may be large, enable if you want).
# NOTE(review): if the processed matrices are scipy sparse, np.save would
# pickle them as objects; scipy.sparse.save_npz is probably the better fit.
# np.save("X_train_proc.npy", X_train_proc)
# np.save("X_val_proc.npy", X_val_proc)
# np.save("X_test_proc.npy", X_test_proc)
# np.save("y_train.npy", y_train)
# np.save("y_val.npy", y_val)

print("\n✅ Ready for model training stage:")
print("Use X_train_proc, y_train, X_val_proc, y_val, X_test_proc")

In [None]:
# ----------------------------
# 11) 5-fold OOF XGBoost (AUC, sparse-friendly) — xgboost==3.1.2 compatible
# ----------------------------
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

# Raw (unprocessed) features and labels; preprocessing is refitted inside each fold.
X_full_raw = train_df.drop(columns=[TARGET_COL, ID_COL])
y_full = train_df[TARGET_COL].values.astype(np.int32)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# Out-of-fold predictions: each row is filled by the fold where it was held out.
oof_pred = np.zeros(len(y_full), dtype=np.float32)
fold_scores, best_iters = [], []

# Request the GPU only when this XGBoost build reports CUDA support, so the
# cell also runs on CPU-only installs instead of hard-coding device="cuda".
xgb_device = "cuda" if xgb.build_info().get("USE_CUDA", False) else "cpu"

# Base params (per-fold scale_pos_weight is computed inside the loop)
xgb_params = dict(
    n_estimators=5000,
    learning_rate=0.03,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    objective="binary:logistic",
    eval_metric="auc",
    tree_method="hist",
    device=xgb_device,
    n_jobs=-1,
    random_state=SEED,
    early_stopping_rounds=200,  # xgboost>=2.0/3.x: set on the constructor, not in fit()
)

for fold, (tr_idx, val_idx) in enumerate(skf.split(X_full_raw, y_full), start=1):
    X_tr_raw, X_val_raw = X_full_raw.iloc[tr_idx], X_full_raw.iloc[val_idx]
    y_tr, y_val_fold = y_full[tr_idx], y_full[val_idx]

    # 1) Fold-specific preprocessing (fit on the train fold only).
    #    The fold validation matrix gets its own name so the hold-out
    #    X_val_proc produced by the preprocessing stage is not overwritten.
    pre_fold = clone(preprocess)
    X_tr_proc = pre_fold.fit_transform(X_tr_raw).astype(np.float32)
    X_val_fold_proc = pre_fold.transform(X_val_raw).astype(np.float32)

    # 2) Fold-specific class weight (computed from the train fold only)
    pos = int((y_tr == 1).sum())
    neg = int((y_tr == 0).sum())
    scale_pos = neg / max(pos, 1)  # max() guards against a fold without positives

    # 3) Train model with early stopping on the fold's validation part
    model = xgb.XGBClassifier(**xgb_params, scale_pos_weight=scale_pos)

    model.fit(
        X_tr_proc, y_tr,
        eval_set=[(X_val_fold_proc, y_val_fold)],
        verbose=50,
    )

    # 4) Predict using best_iteration when the model exposes it
    best_iter = getattr(model, "best_iteration", None)
    if best_iter is not None:
        fold_pred = model.predict_proba(
            X_val_fold_proc, iteration_range=(0, best_iter + 1)
        )[:, 1]
    else:
        fold_pred = model.predict_proba(X_val_fold_proc)[:, 1]

    oof_pred[val_idx] = fold_pred
    fold_auc = roc_auc_score(y_val_fold, fold_pred)

    # 0 is stored as "no best iteration available" and filtered out below.
    best_iters.append(int(best_iter) if best_iter is not None else 0)
    fold_scores.append(float(fold_auc))
    print(f"Fold {fold}: AUC={fold_auc:.5f}, best_iter={best_iters[-1]}")

oof_auc = roc_auc_score(y_full, oof_pred)
valid_best_iters = [b for b in best_iters if b > 0]
# Round to the nearest integer (as the message below states) instead of truncating.
avg_best_iter = int(np.rint(np.mean(valid_best_iters))) if valid_best_iters else 500

print(f"OOF AUC: {oof_auc:.5f}")
print(f"Fold AUCs: {[round(s, 5) for s in fold_scores]}")
print(f"Avg best_iter (rounded): {avg_best_iter}")


In [None]:
# ----------------------------
# 12) Final train on full data + generate submission (XGBoost 3.1.2 compatible)
# ----------------------------
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import clone

# Ensure raw matrices exist even if the cross-validation cell wasn't run
X_full_raw = train_df.drop(columns=[TARGET_COL, ID_COL])
y_full = train_df[TARGET_COL].values.astype(np.int32)
X_test_raw = test_df.drop(columns=[ID_COL])

if 'test_ids' not in locals():
    test_ids = test_df[ID_COL].values

# Class weight on the full training set; max() avoids a division by zero
# when there are no positive labels, matching the per-fold computation.
n_pos_final = int((y_full == 1).sum())
n_neg_final = int((y_full == 0).sum())
scale_pos_final = n_neg_final / max(n_pos_final, 1)

# Reuse xgb_params from cross-validation when available, otherwise rebuild them.
if 'xgb_params' not in locals():
    xgb_params = dict(
        n_estimators=5000,
        learning_rate=0.03,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        objective="binary:logistic",
        eval_metric="auc",
        tree_method="hist",
        scale_pos_weight=scale_pos_final,
        n_jobs=-1,
        random_state=SEED,
    )
else:
    xgb_params = dict(xgb_params)
    xgb_params['scale_pos_weight'] = scale_pos_final

# Final params: early stopping must be removed because the full fit has no eval_set.
xgb_params_final = dict(xgb_params)
xgb_params_final.pop("early_stopping_rounds", None)
xgb_params_final.pop("callbacks", None)  # just in case

# Enable the GPU only when this XGBoost build reports CUDA support.
xgb_params_final["tree_method"] = "hist"
use_cuda = bool(xgb.build_info().get("USE_CUDA", False))
if use_cuda:
    xgb_params_final["device"] = "cuda"  # or "cuda:0"
    print("[XGBoost] GPU enabled: device=cuda")
else:
    # CPU fallback: drop any device setting inherited from xgb_params
    xgb_params_final.pop("device", None)
    print("[XGBoost] GPU not available, using CPU")

# Fit preprocessing on full train, transform test
preprocess_final = clone(preprocess)
X_full_proc = preprocess_final.fit_transform(X_full_raw).astype(np.float32)
X_test_proc_final = preprocess_final.transform(X_test_raw).astype(np.float32)

# Use avg_best_iter from OOF if available (at least 50 rounds); otherwise fall back to 500
final_n_estimators = int(max(avg_best_iter, 50)) if 'avg_best_iter' in locals() else 500

# n_estimators is overwritten inside the dict rather than passed as an extra
# keyword, which would collide with the n_estimators entry already in it.
xgb_params_final['n_estimators'] = final_n_estimators

model_final = xgb.XGBClassifier(**xgb_params_final)
model_final.fit(X_full_proc, y_full, verbose=100)

# Predict + submission
test_pred = model_final.predict_proba(X_test_proc_final)[:, 1]

submission_path = "submission.csv"
submission = pd.DataFrame({ID_COL: test_ids, TARGET_COL: test_pred})
submission.to_csv(submission_path, index=False)

# Report the file name that was actually written.
print(f"Saved {submission_path} with shape {submission.shape} and n_estimators={final_n_estimators}")
display(submission.head())


In [None]:
# Sanity checks on the submission, derived from the loaded test set and the
# notebook constants instead of a hard-coded row count and column names.
assert len(submission) == len(test_df), (
    f"Submission has {len(submission)} rows, expected {len(test_df)} (one per test row)"
)
assert list(submission.columns) == [ID_COL, TARGET_COL], (
    f"Unexpected submission columns: {list(submission.columns)}"
)
assert submission[TARGET_COL].between(0, 1).all(), "Predictions must be probabilities in [0, 1]"
assert submission[ID_COL].is_unique, "Submission ids are not unique"
assert (submission[ID_COL].values == test_df[ID_COL].values).all(), (
    "Submission ids do not match the test set order"
)
