# 1 加载数据与所需包

In [1]:
import pandas as pd
import numpy as np
import joblib  # 用于加载.pkl模型
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_curve, f1_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# Read the raw dataset and hold out a stratified 20% test split.
# NOTE(review): presumably this mirrors the preprocessing used when the
# LightGBM model was trained -- confirm against the training notebook.
data = pd.read_csv("creditcard.csv")
y = data['Class']
X = data.drop(['Class', 'Time'], axis=1)  # every column except the label and 'Time'
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 2 加载预训练 LightGBM 模型

In [8]:
# Restore the previously trained LightGBM model from disk.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
lgb_model_path = 'lgb_optimized_model.pkl'
lgb_booster = joblib.load(lgb_model_path)

In [None]:
import lightgbm  
from lightgbm import LGBMClassifier  

# Normalize whatever was unpickled into an object that exposes the two members
# the rest of the notebook relies on: `booster_` and `predict_proba`.
class _BoosterClassifierAdapter:
    """Minimal sklearn-style wrapper around a raw ``lightgbm.Booster``.

    Injecting a Booster into an unfitted ``LGBMClassifier`` by setting private
    attributes is fragile: the private names differ from what the wrapper reads
    and the feature-count bookkeeping is never initialised. This adapter relies
    only on the public ``Booster.predict`` API instead.

    Assumes the booster was trained with a binary objective, so that
    ``Booster.predict`` returns the positive-class probability for each row
    -- TODO confirm for this model file.
    """

    def __init__(self, booster):
        self.booster_ = booster  # same attribute name LGBMClassifier exposes
        self.classes_ = np.array([0, 1])  # binary labels

    def predict_proba(self, X):
        """Return an (n_samples, 2) array: column 0 = P(class 0), column 1 = P(class 1)."""
        positive = np.asarray(self.booster_.predict(X), dtype=float)
        return np.column_stack((1.0 - positive, positive))


if isinstance(lgb_booster, lightgbm.Booster):
    # Raw Booster: wrap it so it can be used like a classifier.
    lgb_model = _BoosterClassifierAdapter(lgb_booster)
else:
    # Already a scikit-learn style estimator: use it as-is.
    lgb_model = lgb_booster

# 3 按模型期望的特征重新准备数据

In [16]:
# Ask the loaded model which feature columns it was trained with.
expected_features = lgb_model.booster_.feature_name()
print(f"Model expects features: {expected_features}")

# Rebuild X from exactly those columns and redo the stratified 80/20 split
# (same seed as the initial split), replacing the earlier X / train / test variables.
y = data['Class']
X = data.loc[:, expected_features]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

Model expects features: ['V1', 'V3', 'V4', 'V7', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V18']


# 4 定义其他基模型

In [17]:
# Class counts in the training labels, used to weight the positive class.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())

# XGBoost base learner: positives are up-weighted by the negative/positive ratio.
xgb_model = XGBClassifier(
    random_state=42,
    eval_metric='aucpr',
    scale_pos_weight=n_negative / n_positive,
    n_estimators=100,
)

# Random Forest base learner with scikit-learn's 'balanced' class weighting.
rf_model = RandomForestClassifier(
    random_state=42,
    class_weight='balanced',
    n_estimators=100,
)

# 5 训练 XGBoost 和 Random Forest

In [18]:
# Fit the two base learners defined in this notebook on the training split.
# The LightGBM model is loaded from disk already trained, so it is not fit here.
print("Training XGBoost and Random Forest...")
xgb_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

Training XGBoost and Random Forest...


# 5 软投票集成（Soft Voting）

In [21]:
def soft_voting_predict(models, weights, X, threshold=0.5):
    """Predict binary labels from a weighted average of model probabilities.

    Args:
        models: Sequence of fitted estimators exposing ``predict_proba(X)``;
            column 1 of the result is taken as the positive-class probability.
        weights: One weight per model, passed to ``np.average``.
        X: Feature matrix forwarded unchanged to every model.
        threshold: Blended probabilities greater than or equal to this value
            are labelled 1, the rest 0.

    Returns:
        Integer array of 0/1 labels, one per row of ``X``.
    """
    positive_scores = []
    for estimator in models:
        class_probs = estimator.predict_proba(X)
        positive_scores.append(class_probs[:, 1])

    blended = np.average(positive_scores, weights=weights, axis=0)
    is_positive = blended >= threshold
    return is_positive.astype(int)

In [22]:
# Ensemble members paired with their voting weight (LightGBM > XGBoost > RF).
# Keeping each model next to its weight prevents the two lists drifting out of order.
weighted_members = [
    (lgb_model, 0.6),
    (xgb_model, 0.3),
    (rf_model, 0.1),
]
model_weights = [weight for _, weight in weighted_members]
models = [member for member, _ in weighted_members]

# 6 阈值调优

In [23]:
# Weighted-average positive-class probability of the ensemble on the test split.
member_scores = [member.predict_proba(X_test)[:, 1] for member in models]
y_proba = np.average(member_scores, weights=model_weights, axis=0)

In [24]:
# Pick the decision threshold that maximizes F1 along the precision-recall curve.
# NOTE(review): y_test is also used for the final evaluation, so a threshold tuned
# here makes the reported metrics optimistic; tune on a separate validation split
# if one is available.
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)

# precision/recall carry one extra trailing point (precision=1, recall=0) that has
# no matching threshold; drop it so the F1 array lines up index-for-index with
# `thresholds` and the argmax can never point past its end.
f1_scores = 2 * precision[:-1] * recall[:-1] / (precision[:-1] + recall[:-1] + 1e-9)
best_threshold = thresholds[np.argmax(f1_scores)]
print(f"Optimal Threshold for F1: {best_threshold:.4f}")

Optimal Threshold for F1: 0.3788


# 7 评估

In [25]:
# Label the test split with the tuned threshold and report per-class metrics.
y_pred = soft_voting_predict(
    models=models,
    weights=model_weights,
    X=X_test,
    threshold=best_threshold,
)
print("\nSoft Voting Ensemble Performance:")
ensemble_report = classification_report(y_test, y_pred)
print(ensemble_report)


Soft Voting Ensemble Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.84      0.85      0.84        98

    accuracy                           1.00     56962
   macro avg       0.92      0.92      0.92     56962
weighted avg       1.00      1.00      1.00     56962



In [26]:
from sklearn.metrics import average_precision_score, precision_recall_curve, auc

# Area under the ensemble's precision-recall curve on the test split.
# NOTE(review): auc() integrates with the trapezoidal rule; scikit-learn's
# average_precision_score is the non-interpolated alternative and may give a
# slightly different (typically lower) value -- confirm which one is intended.
pr_curve = precision_recall_curve(y_test, y_proba)
precision, recall, thresholds = pr_curve
auprc = auc(x=recall, y=precision)
print(f"Ensemble AUPRC: {auprc:.4f}")

Ensemble AUPRC: 0.8529
