# 数据处理

In [3]:
import numpy as np
import pandas as pd

df = pd.read_excel('基差.xls')
df.set_index('日期', inplace=True)

df = df.diff(1)  # 全部先差分，后续再细化
df_clean = df.dropna().copy()
# 标签：下一期 Δ(基差) 是否 > 0
df_clean['value_sort'] = df_clean['基差'].shift(-1).apply(lambda x: 1 if x > 0 else 0)
df_clean = df_clean.iloc[:-1]

In [4]:
print(df_clean['value_sort'].value_counts())

value_sort
1    1324
0    1219
Name: count, dtype: int64


In [5]:
df_clean.head()

Unnamed: 0_level_0,收益率(%),净价,全价,期货价格,发票价格,转换因子,基差,期现价差,IRR(%),十债主连成交量,十债主连持仓量,置信区间上限,置信区间下限,value_sort
日期,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2025-09-04,-0.035,-0.3109,-0.4428,0.382,-0.3802,-0.0057,-0.0688,0.063,0.2044,-8599.0,-523.0,0.001,-0.0003,1
2025-09-03,0.0142,0.4564,0.5737,-0.218,0.5411,0.0057,0.053,-0.0326,-0.1271,2944.0,-4154.0,0.0035,0.0327,1
2025-09-02,0.0208,-0.1384,-0.1457,-0.15,-0.1472,0.0,0.0091,-0.0018,-0.0156,-26100.0,-7855.0,0.0266,-0.0033,0
2025-09-01,0.0109,-0.6229,-0.7545,0.017,-0.7364,-0.0057,-0.0241,0.0179,0.0555,23432.0,1747.0,0.0444,-0.0086,1
2025-08-29,0.0016,0.5478,0.6501,-0.102,0.653,0.0057,0.0324,0.0031,-0.0318,-10472.0,-7482.0,-0.0105,0.0332,1


# LIGHTGBM

In [9]:
# ========= 全量可运行脚本：漂移检查 + 阈值策略 + 贝叶斯超参搜索（静默日志，兼容新版LightGBM） =========
# 依赖：
#   pip install numpy pandas scipy scikit-learn lightgbm optuna

import os
os.environ["LIGHTGBM_VERBOSITY"] = "-1"  # 兜底关闭底层日志

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import numpy as np
import pandas as pd

from scipy.stats import ks_2samp, chi2_contingency
from sklearn.metrics import (
    roc_auc_score, accuracy_score, precision_score, recall_score, f1_score,
    average_precision_score, log_loss
)
from sklearn.inspection import permutation_importance  # NEW
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
import optuna
import matplotlib.pyplot as plt  # NEW

# ========= 公共工具 =========

def set_seed(seed=42):
    import random
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

set_seed(42)

def safe_auc(y_true, y_prob):
    y_true = np.asarray(y_true).astype(int)
    if len(np.unique(y_true)) < 2:
        return np.nan
    return roc_auc_score(y_true, y_prob)

# ========= 1) 特征稳定性 / 漂移检查 =========

def psi_for_series(train_s: pd.Series, test_s: pd.Series, bins=10):
    train_s = pd.to_numeric(train_s, errors='coerce')
    test_s  = pd.to_numeric(test_s,  errors='coerce')
    tr = train_s.dropna(); te = test_s.dropna()
    if tr.empty or te.empty:
        return np.nan
    quantiles = np.linspace(0, 1, bins + 1)
    cuts = np.unique(np.nanquantile(tr, quantiles))
    if len(cuts) <= 2:
        return np.nan
    tr_bins = pd.cut(train_s, bins=cuts, include_lowest=True)
    te_bins = pd.cut(test_s,  bins=cuts, include_lowest=True)
    tr_ratio = tr_bins.value_counts(normalize=True).sort_index()
    te_ratio = te_bins.value_counts(normalize=True).sort_index()
    te_ratio = te_ratio.reindex(tr_ratio.index).fillna(0.0)
    tr_ratio = tr_ratio.fillna(0.0)
    tr_ratio = tr_ratio.replace(0, 1e-8)
    te_ratio = te_ratio.replace(0, 1e-8)
    psi = np.sum((te_ratio - tr_ratio) * np.log(te_ratio / tr_ratio))
    return float(psi)

def cat_psi(train_s: pd.Series, test_s: pd.Series):
    tr_p = train_s.value_counts(normalize=True)
    te_p = test_s.value_counts(normalize=True)
    idx = tr_p.index.union(te_p.index)
    tr_p = tr_p.reindex(idx).fillna(0.0).replace(0, 1e-8)
    te_p = te_p.reindex(idx).fillna(0.0).replace(0, 1e-8)
    psi = np.sum((te_p - tr_p) * np.log(te_p / tr_p))
    return float(psi)

def two_sample_drift(train_s: pd.Series, test_s: pd.Series, is_categorical=False):
    if is_categorical:
        idx = pd.Index(pd.concat([train_s.astype(str), test_s.astype(str)], ignore_index=True).unique())
        tr_counts = train_s.astype(str).value_counts().reindex(idx, fill_value=0).astype(float)
        te_counts = test_s.astype(str).value_counts().reindex(idx, fill_value=0).astype(float)
        table = np.vstack([tr_counts.values, te_counts.values])
        try:
            chi2, p, dof, exp = chi2_contingency(table)
        except ValueError:
            p = 1.0
        return {"stat": None, "pvalue": float(p)}
    else:
        tr = pd.to_numeric(train_s, errors='coerce').dropna()
        te = pd.to_numeric(test_s,  errors='coerce').dropna()
        if len(tr) < 2 or len(te) < 2:
            return {"stat": None, "pvalue": np.nan}
        ks = ks_2samp(tr, te, alternative='two-sided', mode='auto')
        return {"stat": float(ks.statistic), "pvalue": float(ks.pvalue)}

def drift_report(df_ref: pd.DataFrame, df_new: pd.DataFrame,
                 categorical_cols=None, topk=15):
    categorical_cols = set(categorical_cols or [])
    rows = []
    for c in df_ref.columns:
        is_cat = c in categorical_cols or (df_ref[c].dtype.name in ["category", "object"])
        psi = cat_psi(df_ref[c], df_new[c]) if is_cat else psi_for_series(df_ref[c], df_new[c])
        stat = two_sample_drift(df_ref[c], df_new[c], is_categorical=is_cat)
        miss_ref = df_ref[c].isna().mean()
        miss_new = df_new[c].isna().mean()
        rows.append({
            "feature": c,
            "is_categorical": is_cat,
            "PSI": psi,
            "KS/Chi2_p": stat["pvalue"],
            "KS_stat": stat["stat"],
            "missing_ref": miss_ref,
            "missing_new": miss_new,
            "missing_diff": miss_new - miss_ref,
        })
    rep = pd.DataFrame(rows)
    rep = rep.sort_values(by=["PSI", "KS/Chi2_p"], ascending=[False, True]).reset_index(drop=True)
    return rep.iloc[:topk]

# ========= 2) 阈值策略 =========

def choose_threshold(
    y_true, y_prob,
    method="f1",
    grid=None,
    min_precision=None,
    min_recall=None,
    target_pos_rate=None
):
    if grid is None:
        grid = np.linspace(0.01, 0.99, 99)
    y_true = np.asarray(y_true).astype(int)

    out_rows = []
    best_thr, best_key = 0.5, (-1e9, -1e9)

    for t in grid:
        pred = (y_prob >= t).astype(int)
        P  = precision_score(y_true, pred, zero_division=0)
        R  = recall_score(y_true, pred, zero_division=0)
        F1 = f1_score(y_true, pred, zero_division=0)
        tn = np.sum((pred==0)&(y_true==0))
        fp = np.sum((pred==1)&(y_true==0))
        fn = np.sum((pred==0)&(y_true==1))
        tp = np.sum((pred==1)&(y_true==1))
        TNR = tn / max(1, (tn+fp))
        J = R + TNR - 1
        pos_rate = pred.mean()

        out_rows.append({"thr": t, "precision": P, "recall": R, "f1": F1,
                         "youdenJ": J, "pos_rate": pos_rate, "tp": tp, "fp": fp, "tn": tn, "fn": fn})

        if method == "f1":
            key = (F1, 0.0)
        elif method == "youden":
            key = (J, 0.0)
        elif method == "posrate" and target_pos_rate is not None:
            key = (-abs(pos_rate - target_pos_rate), 0.0)
        elif method == "constraint":
            if (min_precision is not None and P < min_precision) or (min_recall is not None and R < min_recall):
                key = (-1e9, -1e9)
            else:
                key = (R, F1)
        else:
            key = (F1, 0.0)

        if key > best_key:
            best_key = key
            best_thr = t

    table = pd.DataFrame(out_rows).sort_values("thr").reset_index(drop=True)
    best_row = table.loc[table["thr"].sub(best_thr).abs().idxmin()].to_dict()
    return float(best_thr), best_row, table

# ========= 3) 分数 PSI =========

def score_psi(ref_scores, new_scores, bins=10):
    ref = pd.Series(ref_scores)
    new = pd.Series(new_scores)
    return psi_for_series(ref, new, bins=bins)

# ========= 4) 数据准备（日期阈值 / 比例切分） =========

def temporal_split(df_clean: pd.DataFrame,
                   label_col="value_sort",
                   cutoff_date=None,
                   test_size_ratio=0.2,
                   val_size_ratio=0.2):
    assert label_col in df_clean.columns
    df = df_clean.copy().sort_index()

    feat_cols = df.columns.drop([label_col]).tolist()
    X_all = df[feat_cols].values
    y_all = df[label_col].astype(int).values

    if cutoff_date is not None:
        assert isinstance(df.index, pd.DatetimeIndex), "需 DatetimeIndex 才能按日期切分"
        mask_trainval = (df.index <= pd.to_datetime(cutoff_date))
        X_trainval, y_trainval = X_all[mask_trainval], y_all[mask_trainval]
        X_test, y_test = X_all[~mask_trainval], y_all[~mask_trainval]

        n_tv = len(X_trainval)
        n_val = max(1, int(n_tv * val_size_ratio))
        X_tr, y_tr = X_trainval[:-n_val], y_trainval[:-n_val]
        X_val, y_val = X_trainval[-n_val:], y_trainval[-n_val:]
        return X_tr, y_tr, X_val, y_val, X_test, y_test, feat_cols

    N = len(X_all)
    n_test = max(1, int(N * test_size_ratio))
    X_tv, y_tv = X_all[:-n_test], y_all[:-n_test]
    X_test, y_test = X_all[-n_test:], y_all[-n_test:]

    n_tv = len(X_tv)
    n_val = max(1, int(n_tv * val_size_ratio))
    X_tr, y_tr = X_tv[:-n_val], y_tv[:-n_val]
    X_val, y_val = X_tv[-n_val:], y_tv[-n_val:]
    return X_tr, y_tr, X_val, y_val, X_test, y_test, feat_cols

# ========= 2.5) 统一评估接口 =========

def _compute_metrics_at_thr(y_true, y_prob, thr):
    y_pred = (y_prob >= thr).astype(int)
    tn = np.sum((y_pred == 0) & (y_true == 0))
    fp = np.sum((y_pred == 1) & (y_true == 0))
    fn = np.sum((y_pred == 0) & (y_true == 1))
    tp = np.sum((y_pred == 1) & (y_true == 1))
    tnr = tn / max(1, (tn + fp))
    youdenJ = (recall_score(y_true, y_pred, zero_division=0) + tnr - 1.0)
    return {
        "accuracy":  accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall":    recall_score(y_true, y_pred, zero_division=0),
        "f1":        f1_score(y_true, y_pred, zero_division=0),
        "youden":    youdenJ,
        "pos_rate":  y_pred.mean()
    }

def evaluate_with_optional_threshold(
    y_true, y_prob,
    optimize_metric="f1",
    thr_source="auto",
    fixed_thr=0.5,
    constraint_min_precision=None,
    constraint_min_recall=None,
    target_pos_rate=None
):
    y_true = np.asarray(y_true).astype(int)

    if optimize_metric in {"auc", "ap", "logloss"}:
        if len(np.unique(y_true)) < 2:
            return (-np.inf if optimize_metric == "logloss" else np.nan), None, {}
        if optimize_metric == "auc":
            return float(roc_auc_score(y_true, y_prob)), None, {}
        elif optimize_metric == "ap":
            return float(average_precision_score(y_true, y_prob)), None, {}
        else:
            ll = log_loss(y_true, np.vstack([1 - y_prob, y_prob]).T, labels=[0, 1])
            return float(-ll), None, {"raw_logloss": ll}

    if thr_source == "auto":
        thr_source = "youden" if optimize_metric == "youden" else "f1"

    if   thr_source == "f1":
        thr, row, _ = choose_threshold(y_true, y_prob, method="f1")
    elif thr_source == "youden":
        thr, row, _ = choose_threshold(y_true, y_prob, method="youden")
    elif thr_source == "constraint":
        thr, row, _ = choose_threshold(
            y_true, y_prob, method="constraint",
            min_precision=constraint_min_precision, min_recall=constraint_min_recall
        )
    elif thr_source == "posrate":
        thr, row, _ = choose_threshold(
            y_true, y_prob, method="posrate", target_pos_rate=target_pos_rate
        )
    elif thr_source == "fixed":
        thr = float(fixed_thr)
        row = _compute_metrics_at_thr(y_true, y_prob, thr)
    else:
        thr, row, _ = choose_threshold(y_true, y_prob, method="f1")

    if optimize_metric not in {"accuracy", "precision", "recall", "f1", "youden"}:
        optimize_metric = "f1"
    score = float(row[optimize_metric])
    return score, float(thr), row

# ========= 5) Optuna + LightGBM 搜索（静默） =========

def run_optuna_lgbm(
    X_tr, y_tr, X_val, y_val,
    n_trials=50,
    method_for_thr="f1",
    constraint_min_precision=None, constraint_min_recall=None, target_pos_rate=None,
    optimize_metric="f1",
    thr_source="auto",
    fixed_thr=0.5
):
    if thr_source == "auto":
        thr_source = method_for_thr

    def objective(trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 200, 2000),
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.2, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 16, 512, log=True),
            "min_child_samples": trial.suggest_int("min_child_samples", 5, 200),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
            "random_state": 42,
            "n_jobs": -1,
            "objective": "binary",
            "verbosity": -1,
        }

        model = LGBMClassifier(**params)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            eval_metric="auc",
            callbacks=[
                early_stopping(stopping_rounds=100, verbose=False),
                log_evaluation(period=0)
            ]
        )

        val_prob = model.predict_proba(X_val)[:, 1]
        score, used_thr, row = evaluate_with_optional_threshold(
            y_val, val_prob,
            optimize_metric=optimize_metric,
            thr_source=thr_source,
            fixed_thr=fixed_thr,
            constraint_min_precision=constraint_min_precision,
            constraint_min_recall=constraint_min_recall,
            target_pos_rate=target_pos_rate
        )

        trial.set_user_attr("metric", optimize_metric)
        trial.set_user_attr("thr_source", thr_source)
        trial.set_user_attr("thr", used_thr)
        for k in ["precision", "recall", "f1", "accuracy", "youden", "pos_rate"]:
            if k in row:
                trial.set_user_attr(k, row[k])
        if "raw_logloss" in row:
            trial.set_user_attr("raw_logloss", row["raw_logloss"])

        return float(score)

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
    return study

# ========= 6) 主流程 =========

if __name__ == "__main__":
    # 你需要先准备好 df_clean（含列 value_sort，索引为日期）
    # 这里默认你已在外部构建好 df_clean 变量
    # 例如：
    # df_clean = pd.read_excel('基差_diff版.xlsx', parse_dates=['日期'], index_col='日期')
    # assert 'value_sort' in df_clean.columns

    # === 切分 ===
    X_tr_fit_raw, y_tr_fit, X_val_fit_raw, y_val_fit, X_te_raw, y_te, feat_cols = temporal_split(
        df_clean, label_col="value_sort", cutoff_date=None,
        test_size_ratio=0.2, val_size_ratio=0.2
    )

    # === 漂移检查 ===
    df_tr_fit = pd.DataFrame(X_tr_fit_raw, columns=feat_cols)
    df_val    = pd.DataFrame(X_val_fit_raw, columns=feat_cols)
    df_te     = pd.DataFrame(X_te_raw,     columns=feat_cols)

    rep_tr_te = drift_report(df_tr_fit, df_te, categorical_cols=[], topk=30)
    print("\n==== Top Drifted Features (TrainFit vs Test) ====")
    pd.set_option('display.max_rows', 200)
    print(rep_tr_te.to_string(index=False, float_format=lambda x: f"{x:.4f}"))

    # === 贝叶斯优化 ===
    study = run_optuna_lgbm(
        X_tr_fit_raw, y_tr_fit,
        X_val_fit_raw, y_val_fit,
        n_trials=100,
        optimize_metric="f1",
        thr_source="f1",
        fixed_thr=0.5,
        constraint_min_precision=None,
        constraint_min_recall=None,
        target_pos_rate=None,
        method_for_thr="precision",
    )

    print("\n==== Optuna Best (VAL by chosen metric) ====")
    print("Optimize metric:", study.best_trial.user_attrs.get("metric"))
    print("Best score:", study.best_value)
    print("Best params:", study.best_trial.params)
    print("Best VAL thr_source:", study.best_trial.user_attrs.get("thr_source"))
    print("Best VAL thr:", study.best_trial.user_attrs.get("thr"))
    for k in ["precision", "recall", "f1", "accuracy", "youden", "pos_rate", "raw_logloss"]:
        if k in study.best_trial.user_attrs:
            print(f"Best VAL {k}:", study.best_trial.user_attrs[k])

    # === 以最优参数训练最终模型 ===
    best_params = study.best_trial.params
    final_model = LGBMClassifier(
        **best_params, objective="binary", random_state=42, n_jobs=-1, verbosity=-1
    )
    final_model.fit(
        X_tr_fit_raw, y_tr_fit,
        eval_set=[(X_val_fit_raw, y_val_fit)],
        eval_metric="auc",
        callbacks=[
            early_stopping(stopping_rounds=100, verbose=False),
            log_evaluation(period=0)
        ]
    )

    # === 在验证集上选“生产用阈值” ===
    val_prob = final_model.predict_proba(X_val_fit_raw)[:, 1]
    _, chosen_thr, row_any = evaluate_with_optional_threshold(
        y_val_fit, val_prob,
        optimize_metric="precision",
        thr_source="precision"
    )
    print(f"\n[Threshold] VAL selected via F1: t*={chosen_thr:.3f}, "
          f"P={row_any.get('precision', np.nan):.4f}, R={row_any.get('recall', np.nan):.4f}, "
          f"F1={row_any.get('f1', np.nan):.4f}, Acc={row_any.get('accuracy', np.nan):.4f}, "
          f"Youden={row_any.get('youden', np.nan):.4f}")

    # === 测试集评估（固定 chosen_thr） ===
    y_te_prob = final_model.predict_proba(X_te_raw)[:, 1]
    y_te_pred = (y_te_prob >= chosen_thr).astype(int)

    test_auc  = safe_auc(y_te, y_te_prob)
    test_ap   = average_precision_score(y_te, y_te_prob) if len(np.unique(y_te)) > 1 else np.nan
    test_logloss = log_loss(y_te, np.vstack([1 - y_te_prob, y_te_prob]).T, labels=[0,1]) if len(np.unique(y_te)) > 1 else np.nan
    test_acc  = accuracy_score(y_te, y_te_pred)
    test_prec = precision_score(y_te, y_te_pred, zero_division=0)
    test_rec  = recall_score(y_te, y_te_pred, zero_division=0)
    test_f1   = f1_score(y_te, y_te_pred, zero_division=0)

    print("\n==== Test Performance (held-out, with chosen threshold) ====")
    print(f"AUC:           {test_auc:.6f}")
    print(f"AveragePrecision(PR-AUC): {test_ap:.6f}")
    print(f"LogLoss:       {test_logloss:.6f}")
    print(f"Accuracy:      {test_acc:.6f}")
    print(f"Precision@t*:  {test_prec:.6f}")
    print(f"Recall@t*:     {test_rec:.6f}")
    print(f"F1@t*:         {test_f1:.6f}")
    print(f"(t* chosen on VAL: {chosen_thr:.3f})")

    # === 分数 PSI（可选） ===
    tr_scores  = final_model.predict_proba(X_tr_fit_raw)[:, 1]
    val_scores = final_model.predict_proba(X_val_fit_raw)[:, 1]
    te_scores  = y_te_prob
    print("\nScore PSI (TrainFit→Val): ", f"{score_psi(tr_scores, val_scores):.4f}")
    print("Score PSI (TrainFit→Test):", f"{score_psi(tr_scores, te_scores):.4f}")

    # ========== NEW (A): 因子重要性 ==========
    feat_cols_series = pd.Index(feat_cols, dtype=str)
    # 1) LGBM 原生重要性
    split_imp = final_model.booster_.feature_importance(importance_type="split")
    gain_imp  = final_model.booster_.feature_importance(importance_type="gain")
    imp_df = pd.DataFrame({
        "feature": feat_cols_series,
        "lgb_split": split_imp,
        "lgb_gain": gain_imp
    })
    imp_df["lgb_gain_norm"] = imp_df["lgb_gain"] / (imp_df["lgb_gain"].sum() + 1e-12)
    imp_df = imp_df.sort_values("lgb_gain", ascending=False).reset_index(drop=True)

    # 2) Permutation Importance（在验证集上）
    perm = permutation_importance(
        final_model, X_val_fit_raw, y_val_fit,
        scoring="f1", n_repeats=10, random_state=42, n_jobs=-1
    )
    imp_df["perm_importance_mean"] = perm.importances_mean
    imp_df["perm_importance_std"]  = perm.importances_std

    print("\n==== Feature Importance (top 20 by LGB Gain) ====")
    print(imp_df.head(20).to_string(index=False, float_format=lambda x: f"{x:.6f}"))
    imp_df.to_csv("feature_importance.csv", index=False, encoding="utf-8-sig")

    # 简单柱状图（按 Gain 排）
    topk = 15
    plot_df = imp_df.head(topk).iloc[::-1]
    plt.figure(figsize=(8, 6))
    plt.barh(plot_df["feature"], plot_df["lgb_gain_norm"])
    plt.xlabel("Normalized Gain Importance")
    plt.title(f"Top-{topk} Feature Importance (LightGBM Gain)")
    plt.tight_layout()
    plt.savefig("feature_importance_bar.png", dpi=150)
    plt.close()

[I 2025-09-10 09:36:05,362] A new study created in memory with name: no-name-743faec4-0e46-421f-8c45-2a4fd7098662



==== Top Drifted Features (TrainFit vs Test) ====
feature  is_categorical    PSI  KS/Chi2_p  KS_stat  missing_ref  missing_new  missing_diff
   期现价差           False 0.8140     0.0000   0.1833       0.0000       0.0000        0.0000
 IRR(%)           False 0.8045     0.0000   0.1897       0.0000       0.0000        0.0000
十债主连持仓量           False 0.7268     0.0000   0.1936       0.0000       0.0000        0.0000
     基差           False 0.6009     0.0000   0.1884       0.0000       0.0000        0.0000
     净价           False 0.4835     0.0000   0.1542       0.0000       0.0000        0.0000
 置信区间上限           False 0.4779     0.0000   0.1436       0.0000       0.0000        0.0000
 置信区间下限           False 0.4329     0.0000   0.1532       0.0000       0.0000        0.0000
   发票价格           False 0.4185     0.0000   0.1547       0.0000       0.0000        0.0000
十债主连成交量           False 0.3992     0.0000   0.1549       0.0000       0.0000        0.0000
     全价           False 0.3792     0.00

[I 2025-09-10 09:36:05,794] Trial 0 finished with value: 0.7140600315955766 and parameters: {'n_estimators': 1424, 'learning_rate': 0.00184456687425243, 'num_leaves': 110, 'min_child_samples': 75, 'subsample': 0.8412014500110103, 'colsample_bytree': 0.5488239470061993, 'reg_alpha': 3.8804425708981016, 'reg_lambda': 0.0005604986873053644}. Best is trial 0 with value: 0.7140600315955766.
[I 2025-09-10 09:36:06,455] Trial 1 finished with value: 0.7664670658682635 and parameters: {'n_estimators': 1785, 'learning_rate': 0.003982908557807532, 'num_leaves': 71, 'min_child_samples': 17, 'subsample': 0.6397661809047013, 'colsample_bytree': 0.7175662567583736, 'reg_alpha': 0.09630601503340519, 'reg_lambda': 0.0005380596544962404}. Best is trial 1 with value: 0.7664670658682635.
[I 2025-09-10 09:36:06,934] Trial 2 finished with value: 0.7652495378927912 and parameters: {'n_estimators': 1628, 'learning_rate': 0.025271766588726815, 'num_leaves': 74, 'min_child_samples': 39, 'subsample': 0.783562772


==== Optuna Best (VAL by chosen metric) ====
Optimize metric: f1
Best score: 0.781021897810219
Best params: {'n_estimators': 725, 'learning_rate': 0.003403294193734849, 'num_leaves': 19, 'min_child_samples': 28, 'subsample': 0.9580489585456795, 'colsample_bytree': 0.8880496038248439, 'reg_alpha': 0.00034642111177091286, 'reg_lambda': 9.309572597338924e-07}
Best VAL thr_source: f1
Best VAL thr: 0.4
Best VAL precision: 0.6645962732919255
Best VAL recall: 0.9469026548672567
Best VAL f1: 0.781021897810219
Best VAL pos_rate: 0.7911547911547911

[Threshold] VAL selected via F1: t*=0.400, P=0.6646, R=0.9469, F1=0.7810, Acc=nan, Youden=nan

==== Test Performance (held-out, with chosen threshold) ====
AUC:           0.737195
AveragePrecision(PR-AUC): 0.753538
LogLoss:       0.636761
Accuracy:      0.612205
Precision@t*:  0.589074
Recall@t*:     0.911765
F1@t*:         0.715729
(t* chosen on VAL: 0.400)

Score PSI (TrainFit→Val):  0.0098
Score PSI (TrainFit→Test): 0.2173

==== Feature Importanc

In [13]:
# ========== NEW (B): 累计价格损益（下一期结算，实盘口径，稳健对齐） ==========
# 规则：t 日生成仓位 pos_t（预测涨=+1，预测跌=-1），在 t→t+1 期间以
# Δbasis_{t+1} = basis_{t+1} - basis_t 结算：PnL_t = pos_t * Δbasis_{t+1}

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# --- 1) 读取“未差分”的原始基差 ---
basis_excel_path = "基差.xls"  # 如路径不同请修改
basis_series = None
if os.path.exists(basis_excel_path):
    try:
        raw_basis = pd.read_excel(basis_excel_path).copy()
        # 自动识别日期列
        date_col = None
        for c in raw_basis.columns:
            if str(c).strip() in ["日期", "date", "Date", "交易日"]:
                date_col = c; break
        if date_col is not None:
            raw_basis[date_col] = pd.to_datetime(raw_basis[date_col], errors="coerce")
            raw_basis.set_index(date_col, inplace=True)
        else:
            # 若无日期列，假设与 df_clean 索引一致
            raw_basis.index = df_clean.index
        # 自动识别“基差”列（若列名不是“基差”，请手动指定）
        basis_col = None
        for c in raw_basis.columns:
            if "基差" in str(c):
                basis_col = c; break
        if basis_col is not None:
            basis_series = raw_basis[basis_col].astype(float).sort_index()
    except Exception as e:
        print(f"[WARN] 读取原始基差失败，将用差分累加近似重构。err={e}")

# --- 兜底：若未找到，则用 df_clean 中“基差差分”累加近似 ---
if basis_series is None:
    print("[WARN] 未找到未差分基差，将用 df_clean['基差(差分)'] 累加近似（可能与真实值有偏差）。")
    diff_basis_col = None
    for c in df_clean.columns:
        if "基差" in str(c):
            diff_basis_col = c; break
    if diff_basis_col is not None:
        basis_series = df_clean[diff_basis_col].astype(float).sort_index().cumsum()
    else:
        raise RuntimeError("未在 df_clean 中找到含“基差”的列，无法计算累计损益。")

# --- 2) 稳健对齐测试期索引与基差序列 ---
def _normalize_dtindex(idx) -> pd.DatetimeIndex:
    idx = pd.to_datetime(idx, errors="coerce")
    if getattr(idx, "tz", None) is not None:
        idx = idx.tz_localize(None)
    return idx.normalize()  # 归一化到“日期”粒度

# 规范化基差索引
basis_series = basis_series.copy()
basis_series.index = _normalize_dtindex(basis_series.index)
basis_series = basis_series.sort_index()

# 取得“测试期日期索引”：若 df_te.index 不是日期或大多为 NaT，就用 df_clean 末尾 N_te 天
N_te = len(df_te)
test_index_raw = df_te.index
test_index_norm = _normalize_dtindex(test_index_raw)

if (not isinstance(test_index_raw, pd.DatetimeIndex)) or (pd.isna(test_index_norm).mean() > 0.5):
    idx_all = _normalize_dtindex(df_clean.sort_index().index)
    assert len(idx_all) >= N_te, "df_clean 行数少于测试集样本数，无法回推测试期日期。"
    test_index = idx_all[-N_te:]
else:
    test_index = test_index_norm

# 先做“精确对齐”
basis_test = basis_series.reindex(test_index)

# 若大量 NaN，使用 merge_asof 向前对齐（容忍 5 天）
if basis_test.isna().mean() > 0.5:
    lo, hi = test_index.min() - pd.Timedelta(days=10), test_index.max() + pd.Timedelta(days=10)
    basis_clip = basis_series.loc[(basis_series.index >= lo) & (basis_series.index <= hi)]
    # left: 测试期日期（强制列名为 'date'）
    left = pd.DataFrame({"date": pd.to_datetime(test_index)})
    # right: 基差（两列，强制命名为 'date' 和 'basis'）
    right = basis_clip.reset_index()
    right.columns = ["date", "basis"]
    right["date"] = pd.to_datetime(right["date"])
    aligned = pd.merge_asof(
        left.sort_values("date"),
        right.sort_values("date"),
        on="date",
        direction="backward",
        tolerance=pd.Timedelta(days=5)
    )
    basis_test = aligned.set_index("date")["basis"]

# 若仍全 NaN，再次回退到 df_clean 的差分累加近似并强制对齐
if basis_test.isna().all():
    print("[WARN] 基差文件与测试期日期完全不匹配；使用 df_clean['基差'] 的差分累加近似。")
    diff_basis_col = None
    for c in df_clean.columns:
        if "基差" in str(c):
            diff_basis_col = c; break
    assert diff_basis_col is not None, "回退失败：df_clean 中未找到‘基差’列。"
    approx_series = df_clean[diff_basis_col].astype(float).sort_index().cumsum()
    approx_series.index = _normalize_dtindex(approx_series.index)
    basis_test = approx_series.reindex(test_index).ffill().bfill()

# 最终再做一次前后填补，确保无 NaN
basis_test = basis_test.ffill().bfill()
assert not basis_test.isna().all(), "对齐失败：basis_test 仍为全 NaN，请检查基差文件日期或 df_clean 的索引。"

# --- 3) 生成仓位（预测明天涨→做多 +1；预测明天跌→做空 -1） ---
pred_te = (y_te_prob >= chosen_thr).astype(int)    # 长度 N_te
pos = np.where(pred_te == 1, 1, -1)                # +1/-1

# --- 4) 计算 Δbasis_{t+1} 并用 t 的仓位结算 ---
delta_next = basis_test.diff()                     # Δbasis_t
pnl = pos[:-1] * delta_next.values[1:]            # 长度 N_te - 1

# --- 5) 可选：加入换手成本 ---
fee_per_change = 0.0                               # 单位与“基差”一致；如有手续费/滑点在此设置
turnover = (pos[:-1] != pos[1:]).astype(int)       # 长度 N_te - 1
pnl = pnl - turnover * fee_per_change

# --- 6) 累计 & 保存结果 ---
cum_pnl = np.cumsum(pnl)
N = len(basis_test)
assert len(pred_te) == N, f"预测长度 {len(pred_te)} 与测试期 {N} 不一致"
assert len(pnl) == N-1 == len(turnover)

pnl_df = pd.DataFrame({
    "date":        basis_test.index[:-1],
    "basis_t":     basis_test.values[:-1],
    "basis_t1":    basis_test.values[1:],
    "delta_t1":    delta_next.values[1:],
    "prob":        np.asarray(y_te_prob)[:-1],
    "pred":        np.asarray(pred_te)[:-1],
    "pos":         np.asarray(pos)[:-1],
    "turnover":    turnover,
    "pnl":         pnl,
    "cum_pnl":     cum_pnl
}).set_index("date")

pnl_df.to_csv("pnl_detail.csv", encoding="utf-8-sig")

plt.figure(figsize=(10, 5))
plt.plot(pnl_df.index, pnl_df["cum_pnl"].values, linewidth=1.5)
plt.title("Result on test")
plt.xlabel("Date"); plt.ylabel("Cumulative P&L")
# 若曲线为常数（例如全 0），强制展开 y 轴
if np.nanmin(pnl_df["cum_pnl"].values) == np.nanmax(pnl_df["cum_pnl"].values):
    y0 = float(pnl_df["cum_pnl"].iloc[-1])
    plt.ylim(y0 - 1, y0 + 1)
plt.tight_layout(); plt.savefig("cum_pnl.png", dpi=150); plt.close()

print("\n==== Cumulative P&L (Test, next-day realized) ====")
print(f"Final Cum P&L: {float(cum_pnl[-1]) if len(cum_pnl)>0 else float('nan'):.6f}")
print("Saved: pnl_detail.csv, cum_pnl.png")

# （可选）快速自检
# print("[CHECK] test_index[:5]:", test_index[:5])
# print("[CHECK] basis_test.head():\n", basis_test.head())
# print("[CHECK] NaN % in basis_test:", float(basis_test.isna().mean())*100, "%")
# print("[CHECK] pnl_df.head():\n", pnl_df.head())



==== Cumulative P&L (Test, next-day realized) ====
Final Cum P&L: 4.419200
Saved: pnl_detail.csv, cum_pnl.png
