In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, precision_score, recall_score, f1_score, \
    brier_score_loss, roc_curve, average_precision_score
from scipy.stats import ks_2samp
import os
import pickle
from lightgbm import LGBMClassifier

# 设置路径 (保持不变)
root_path = 'D:/study/Credit(1)/Credit/'
params_path = r'D:\study\Credit(1)\Credit\params/'
dataset_path = r'D:\study\credit_scoring_datasets/'
shuffle_path = r'D:\study\Credit(1)\Credit\shuffle_index/'
save_path = r'D:\study\second\outcome/'
os.makedirs(save_path, exist_ok=True)

# 加载新的数据集
data = pd.read_csv(r'D:\study\credit_scroing_datasets\lending club/2012-2013.csv', low_memory=True)
features = data.drop('loan_status', axis=1)
labels = data['loan_status']

# 手动shuffle索引
indices = np.arange(features.shape[0])
np.random.shuffle(indices)

# 分割数据集
train_size = int(features.shape[0] * 0.8)
valid_size = int(features.shape[0] * 0.1)
test_size = valid_size  # 假设测试集大小与验证集相同

train_index = indices[:train_size]
valid_index = indices[train_size:(train_size + valid_size)]
test_index = indices[(train_size + valid_size):(train_size + valid_size + test_size)]
remaining_index = indices[(train_size + valid_size + test_size):]

# 确保索引范围正确
print(f"Total data size: {features.shape[0]}")
print(f"Train indices: {train_index[:5]}...{train_index[-5:]}")
print(f"Valid indices: {valid_index[:5]}...{valid_index[-5:]}")
print(f"Test indices: {test_index[:5]}...{test_index[-5:]}")
print(f"Remaining indices: {remaining_index[:5]}...{remaining_index[-5:]}")

train_x, train_y = features.iloc[train_index, :], labels.iloc[train_index]
valid_x, valid_y = features.iloc[valid_index], labels.iloc[valid_index]
test_x, test_y = features.iloc[test_index], labels.iloc[test_index]
remaining_x, remaining_y = features.iloc[remaining_index], labels.iloc[remaining_index]

# 将训练集和验证集合并用于交叉验证
full_train_x = pd.concat([train_x, valid_x], axis=0)
full_train_y = pd.concat([train_y, valid_y], axis=0)

# 如果需要对数据进行预处理（例如标准化），可以在这里进行
# full_train_x_transformed = some_preprocessing_function(full_train_x)
# test_x_transformed = some_preprocessing_function(test_x)

# 如果不需要任何预处理，则直接使用原始数据
full_train_x_transformed = full_train_x
test_x_transformed = test_x

# 定义参数网格
xgb_param_grid = {
    'n_estimators': [500],
    'max_depth': [4],
    'learning_rate': [0.05],
    'reg_alpha': [5]
}

# 获取所有参数组合
param_combinations = list(ParameterGrid(xgb_param_grid))
results = []

for params_set in param_combinations:
    params = {
        'n_estimators': params_set['n_estimators'],
        'learning_rate': params_set['learning_rate'],
        'max_depth': params_set['max_depth'],
        'reg_alpha': params_set['reg_alpha'],
        'objective': 'binary',
        'eval_metric': 'logloss',
        'use_label_encoder': False,
        'verbosity': 0
    }

    # 训练模型
    lgb_model = LGBMClassifier(**params)
    lgb_model.fit(full_train_x_transformed, full_train_y)

    # 预测和评估模型
    preds_proba = lgb_model.predict_proba(test_x_transformed)[:, 1]
    preds = lgb_model.predict(test_x_transformed)

    # 计算评估指标
    auc_score = roc_auc_score(test_y, preds_proba)
    logloss = log_loss(test_y, preds_proba)
    ks = ks_2samp(preds_proba[test_y == 1], preds_proba[test_y != 1]).statistic
    accuracy = accuracy_score(test_y, preds)
    precision = precision_score(test_y, preds)
    recall = recall_score(test_y, preds)
    f1 = f1_score(test_y, preds)
    brier_score = brier_score_loss(test_y, preds_proba)
    average_precision = average_precision_score(test_y, preds_proba)
    fprs, tprs, thresholds = roc_curve(test_y, preds_proba)
    true_positive_rate = tprs
    true_negative_rate = 1 - fprs
    gmean = np.sqrt(true_positive_rate * true_negative_rate)

    # 计算 H-mean 和其他自定义指标
    def h_mean(precision, recall):
        if precision + recall == 0:
            return 0
        return 2 * (precision * recall) / (precision + recall)

    hm = h_mean(precision, recall)

    # 计算 type1error 和 type2error
    def type1error(y_proba, y_true, threshold=0.5):
        y_pred = (y_proba >= threshold).astype(int)
        fp = ((y_pred == 1) & (y_true == 0)).sum()
        return fp / (y_true == 0).sum()

    def type2error(y_proba, y_true, threshold=0.5):
        y_pred = (y_proba >= threshold).astype(int)
        fn = ((y_pred == 0) & (y_true == 1)).sum()
        return fn / (y_true == 1).sum()

    type1_error = type1error(preds_proba, test_y)
    type2_error = type2error(preds_proba, test_y)

    # 计算 Acc AUC Prec Rec 的平均值
    average_score = (accuracy + auc_score + precision + recall) / 4

    # 将结果存入列表
    results.append({
        'params': params_set,
        'accuracy': accuracy,
        'auc_score': auc_score,
        'logloss': logloss,
        'ks_stat': ks,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'brier_score': brier_score,
        'average_precision': average_precision,
        'hm': hm,
        'gmean': gmean,
        'type1_error': type1_error,
        'type2_error': type2_error,
        'average_score': average_score
    })

    # 输出每个参数组合的结果
    print(f"Params: {params_set}, Accuracy: {accuracy}, AUC: {auc_score}, Average Score: {average_score}")

# 输出所有结果
print("所有结果：")
for result in results:
    print(result)

# 输出最佳结果 (基于平均得分)
best_result = max(results, key=lambda x: x['average_score'])
print("最佳结果：", best_result)

# 保存所有结果到字典并写入文件
results_dict = {'results': results, 'best_result': best_result}

dataset = 'bankfear'
method = 'VIP'
file_path = os.path.join(save_path, f'{dataset}\\{method}_res.pickle')
os.makedirs(os.path.dirname(file_path), exist_ok=True)

with open(file_path, 'wb') as f:
    pickle.dump(results_dict, f)

print(f'This is ACC: {best_result["accuracy"]}')
print(f'This is AUC: {best_result["auc_score"]}')
print(f'The results of {method} on {dataset} have been calculated and saved.\n\n')


Total data size: 235314
Train indices: [160007  17179  21258 206000 173854]...[190864  49130 120248 140099  71040]
Valid indices: [185626 145947  58487 161900 100644]...[142983  74111 124454 209501  58216]
Test indices: [ 44266    389  70815 156534 184434]...[206784  31054  56679  24565  30624]
Remaining indices: [129596]...[129596]


Params: {'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 500, 'reg_alpha': 5}, Accuracy: 0.8680039097360928, AUC: 0.9147249175283935, Average Score: 0.7372989353505853
所有结果：
{'params': {'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 500, 'reg_alpha': 5}, 'accuracy': 0.8680039097360928, 'auc_score': 0.9147249175283935, 'logloss': 0.26365838514158874, 'ks_stat': 0.6830977160637849, 'precision': 0.6613722998729352, 'recall': 0.50509461426492, 'f1': 0.5727647867950482, 'brier_score': 0.0879587373046223, 'average_precision': 0.6604085604972226, 'hm': 0.5727647867950482, 'gmean': array([0.        , 0.01557564, 0.06789266, ..., 0.67525287, 0.67533479,
       0.        ]), 'type1_error': 0.05492297387809779, 'type2_error': 0.4949053857350801, 'average_score': 0.7372989353505853}
最佳结果： {'params': {'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 500, 'reg_alpha': 5}, 'accuracy': 0.8680039097360928, 'auc_score': 0.9147249175283935, 'logloss': 0.26365838514158874, 'ks_stat': 