In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import balanced_accuracy_score, classification_report
from lightgbm import LGBMClassifier
import lightgbm as lgb
import matplotlib.pyplot as plt
import time
import re


In [2]:
def remove_columns_with_keywords(df, keywords):
    columns_to_drop = [col for col in df.columns if any(keyword in col for keyword in keywords)]
    return df.drop(columns=columns_to_drop)

def get_columns_with_keywords(df, keywords):
    columns_to_get = [col for col in df.columns if any(keyword in col for keyword in keywords)]
    return columns_to_get

def clean_feature_names(X):
    # 函数用于清理特征名称
    def clean_name(name):
        # 移除或替换特殊字符
        name = re.sub(r'[^\w\s-]', '_', name)
        # 确保名称不以数字开头
        if name and name[0].isdigit():
            name = 'f_' + name
        return name

    X.columns = [clean_name(col) for col in X.columns]
    return X

def process_categorical_features(df, max_unique=10):
    """
    检测和处理数据框中的类别变量。

    参数：
    - df (pd.DataFrame): 输入的数据框。
    - max_unique (int): 判定为类别变量的最大唯一值数量。

    返回：
    - cat_idxs (list of int): 类别特征的索引。
    - cat_dims (list of int): 每个类别特征的模态数。
    - df (pd.DataFrame): 经过编码后的数据框。
    """
    cat_cols = [col for col in df.columns if df[col].nunique() <= max_unique]
    cat_dims = []
    cat_idxs = []

    for col in cat_cols:
        print(f"处理类别特征: {col}，唯一值数量: {df[col].nunique()}")
        # 使用 LabelEncoder
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str).fillna('NaN'))
        cat_dims.append(len(le.classes_))
        cat_idxs.append(df.columns.get_loc(col))

    return cat_idxs, cat_dims, df

In [3]:
# 读取数据
X_y_group_train = pd.read_csv('./mid_data/X_y_group_train_updated_v13.4_rolling.csv')

print("Adding numeric labels y")
le = LabelEncoder()
X_y_group_train["y"] = le.fit_transform(X_y_group_train["label"])
# 重新排列列
X_y_group_train = X_y_group_train[["dataset", "variable"] + X_y_group_train.columns.drop(["dataset", "variable", "label", "y"]).tolist() + ["label", "y"]]

# 定义要删除的列
blacklist = [
    "ttest(v,X)", 
    "pvalue(ttest(v,X))<=0.05", 
    "ttest(v,Y)", 
    "pvalue(ttest(v,Y))<=0.05", 
    "ttest(X,Y)", 
    "pvalue(ttest(X,Y))<=0.05",
    "square_dimension", 
    "max(PPS(v,others))",
    "TLI_Collider",
    "TLI_Confounder",
    "RMSEA_Collider",
    "RMSEA_Confounder"
]
columns_to_drop = [col for col in blacklist if col in X_y_group_train.columns]
X_y_group_train = X_y_group_train.drop(columns=columns_to_drop)

# 处理数值列的缺失值
numeric_columns = X_y_group_train.select_dtypes(include=[np.number]).columns
X_y_group_train[numeric_columns] = X_y_group_train[numeric_columns].fillna(X_y_group_train[numeric_columns].mean())

# 清理特征名称
X_y_group_train = clean_feature_names(X_y_group_train)

print("Extracting X_train, y_train, and group")
# 分离数据集ID、特征和标签
group_train = X_y_group_train["dataset"]
X = X_y_group_train.drop(["variable", "dataset", "label", "y"], axis="columns")
y = X_y_group_train["y"]


# 分割数据集为训练集和测试集
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("y_train 唯一值:", np.unique(y_train_full))
print("y_test 唯一值:", np.unique(y_test))


Adding numeric labels y
Extracting X_train, y_train, and group
y_train 唯一值: [0 1 2 3 4 5 6 7]
y_test 唯一值: [0 1 2 3 4 5 6 7]


In [4]:
from lightgbm import LGBMClassifier, log_evaluation, early_stopping
from sklearn.metrics import balanced_accuracy_score, classification_report
def create_model():
    return LGBMClassifier(
        n_estimators=2000,            # 减少树的数量以防止过拟合
        learning_rate=0.05,           # 保持较低的学习率

        num_leaves=21,                # 减少叶子数量以控制复杂度
        max_depth=5,                  # 减小树的深度以简化模型

        min_child_samples=50,         # 增加叶子节点的最小样本数
        colsample_bytree=0.6,         # 减少特征采样比例

        reg_alpha=25.0,               # 增加L1正则化
        reg_lambda=25.0,              # 增加L2正则化

        random_state=42,
        n_jobs=-1,
        class_weight='balanced',
        device='cpu',
        verbose=-1,                   # 设置详细程度
    )

# 定义回调函数
callbacks = [log_evaluation(period=500), early_stopping(stopping_rounds=10)]

# 初始化变量
selected_features = X_train_full.columns.tolist()
best_test_score = 0
best_features = selected_features.copy()
train_scores = []
test_scores = []
feature_elimination_steps = []
iteration = 0

# 最少保留的特征数量
min_features_to_select = 160

print("\n开始按特征重要性从低到高的反向特征消除...\n")
start_time = time.time()

# 初始训练以获取特征重要性
initial_model = create_model()
initial_model.fit(
    X_train_full[selected_features], y_train_full,
    eval_set=[(X_test[selected_features], y_test)],
    callbacks=callbacks,
)

# 获取特征重要性并排序（从低到高）
feature_importances = initial_model.feature_importances_
feature_importance_dict = dict(zip(selected_features, feature_importances))
sorted_features = sorted(feature_importance_dict, key=feature_importance_dict.get)

print("特征按重要性排序（从低到高）:")
for idx, feature in enumerate(sorted_features, 1):
    print(f"{idx}. {feature} (重要性: {feature_importance_dict[feature]})")

# 设定一个阈值，仅在提升超过该阈值时才保留删除
improvement_threshold = 0.0005

# 标记是否有特征被移除，用于控制循环
features_removed_in_last_pass = True

while features_removed_in_last_pass and len(selected_features) > min_features_to_select:
    features_removed_in_last_pass = False
    # 复制一份排序好的特征列表，避免在循环中修改原列表
    current_sorted_features = sorted_features.copy()
    
    for feature in current_sorted_features:
        if len(selected_features) <= min_features_to_select:
            break

        print(f"\n迭代 {iteration + 1}: 尝试移除特征 '{feature}'")

        temp_features = selected_features.copy()
        temp_features.remove(feature)

        X_train_temp = X_train_full[temp_features]
        X_test_temp = X_test[temp_features]

        model = create_model()

        # 训练模型
        model.fit(
            X_train_temp, y_train_full,
            eval_set=[(X_test_temp, y_test)],
            callbacks=callbacks,
        )

        # 预测
        y_train_pred = model.predict(X_train_temp)
        y_test_pred = model.predict(X_test_temp)

        # 计算平衡准确率
        train_score = balanced_accuracy_score(y_train_full, y_train_pred)
        test_score = balanced_accuracy_score(y_test, y_test_pred)

        print(f"当前特征数量: {len(temp_features)}")
        print(f"训练集平衡准确率: {train_score:.6f}")
        print(f"测试集平衡准确率: {test_score:.6f}")

        # 检查是否有提升
        if test_score >= best_test_score + improvement_threshold:
            print(f"移除特征 '{feature}' 后测试集平衡准确率提升至 {test_score:.6f}，保留删除。")
            selected_features.remove(feature)
            best_test_score = test_score
            best_features = selected_features.copy()
            train_scores.append(train_score)
            test_scores.append(test_score)
            feature_elimination_steps.append(feature)
            features_removed_in_last_pass = True  # 标记有特征被移除
        else:
            print(f"移除特征 '{feature}' 后测试集平衡准确率未提升，保留该特征。")

    iteration += 1  # 增加总迭代次数

end_time = time.time()
elapsed_time = end_time - start_time
print(f"\n按特征重要性反向特征消除完成，总耗时: {elapsed_time/60:.2f} 分钟")

# 绘制特征数量与测试集平衡准确率的关系
plt.figure(figsize=(10, 6))
feature_counts = list(range(len(selected_features) + len(test_scores), len(best_features), -1))
plt.plot(range(len(best_features), len(best_features) + len(test_scores)), test_scores, marker='o', label='Test Balanced Accuracy')
plt.xlabel('Number of Features')
plt.ylabel('Balanced Accuracy')
plt.title('Feature Elimination Performance (按重要性从低到高)')
plt.legend()
plt.gca().invert_xaxis()
plt.grid(True)
plt.show()

print(f"\n最佳特征数量: {len(best_features)}")
print(f"最佳特征列表: {best_features}")

# 使用最佳特征重新训练最终模型
print("\n训练最终模型...")
final_model = create_model()

X_train_best = X_train_full[best_features]
X_test_best = X_test[best_features]

final_model.fit(
    X_train_best, y_train_full,
    eval_set=[(X_test_best, y_test)],
    callbacks=callbacks,
)

# 最终模型评估
y_test_pred_final = final_model.predict(X_test_best)
test_score_final = balanced_accuracy_score(y_test, y_test_pred_final)

print(f"\n最终模型在测试集上的平衡准确率: {test_score_final:.6f}")
print("\n最终模型的分类报告:")
print(classification_report(y_test, y_test_pred_final))


开始按特征重要性从低到高的反向特征消除...

Training until validation scores don't improve for 10 rounds
[500]	valid_0's multi_logloss: 0.75432
[1000]	valid_0's multi_logloss: 0.712997
[1500]	valid_0's multi_logloss: 0.700509
Early stopping, best iteration is:
[1891]	valid_0's multi_logloss: 0.697674
特征按重要性排序（从低到高）:
1. RMSEA_Cause of Y (重要性: 4)
2. PC_v_X_ (重要性: 28)
3. spearman_corr_v_Y_ (重要性: 55)
4. PC_Y_v_ (重要性: 60)
5. PC_v_Y_ (重要性: 81)
6. PC_X_Y_ (重要性: 105)
7. GRaSP_X_Y_ (重要性: 112)
8. PC_X_v_ (重要性: 115)
9. ExactSearch_X_Y_ (重要性: 119)
10. corr_v_Y_ (重要性: 123)
11. spearman_corr_v_X_ (重要性: 170)
12. kendall_corr_v_Y_ (重要性: 237)
13. GRaSP_X_v_ (重要性: 257)
14. v_Y_sliding_coef3 (重要性: 263)
15. Y_v_sliding_coef2 (重要性: 268)
16. ExactSearch_Y_v_ (重要性: 273)
17. spearman_corr_X_Y_ (重要性: 287)
18. FCI_X_v_ (重要性: 299)
19. GRaSP_v_Y_ (重要性: 302)
20. Y_v_sliding_coef3 (重要性: 303)
21. Y_v_sliding_coef1 (重要性: 304)
22. Y_v_sliding_coef5 (重要性: 304)
23. GRaSP_v_X_ (重要性: 313)
24. GRaSP_Y_v_ (重要性: 313)
25. BIC_Consequence of X (

ValueError: list.remove(x): x not in list