In [1]:
import pandas as pd
import numpy as np
import re
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

import lightgbm as lgb
from lightgbm import LGBMClassifier

In [2]:
def remove_columns_with_keywords(df, keywords):
    """Return ``df`` without the columns whose name contains any of ``keywords``.

    Matching is a plain substring test (``keyword in column_name``), so the
    column labels are expected to be strings. The result comes from
    ``DataFrame.drop``; ``df`` itself is left untouched.

    Parameters:
    - df (pd.DataFrame): frame to filter.
    - keywords (iterable of str): substrings that mark a column for removal.

    Returns:
    - pd.DataFrame: ``df`` minus every matching column.
    """
    matched = []
    for column in df.columns:
        for keyword in keywords:
            if keyword in column:
                matched.append(column)
                # One hit is enough; stop scanning keywords for this column.
                break
    return df.drop(columns=matched)

def get_columns_with_keywords(df, keywords):
    """List the column names of ``df`` that contain at least one of ``keywords``.

    Matching is a plain substring test (``keyword in column_name``); the
    returned names keep the order in which they appear in ``df.columns``.

    Parameters:
    - df (pd.DataFrame): frame whose columns are inspected.
    - keywords (iterable of str): substrings to look for.

    Returns:
    - list: names of the matching columns.
    """
    def _has_keyword(column):
        for keyword in keywords:
            if keyword in column:
                return True
        return False

    return [column for column in df.columns if _has_keyword(column)]

def recovery_dataset_id(X_y_group_train):
    """Rewrite the ``dataset`` column as zero-padded, five-character strings.

    Each value is passed through ``int()`` and then formatted with width 5 and
    zero fill (for example ``7`` becomes ``'00007'``). The column is replaced
    on the frame that was passed in, and that same frame is returned.

    Parameters:
    - X_y_group_train (pd.DataFrame): frame holding a ``dataset`` column.

    Returns:
    - pd.DataFrame: the input frame, with ``dataset`` converted.
    """
    def _as_padded_id(raw_id):
        return format(int(raw_id), '05')

    padded_ids = X_y_group_train['dataset'].apply(_as_padded_id)
    X_y_group_train['dataset'] = padded_ids
    return X_y_group_train

def clean_feature_names(X):
    """Sanitise the column labels of ``X`` and return the same frame.

    Every label is converted to ``str``; any character that is not a word
    character, whitespace or a hyphen is replaced by ``_``; and a label that
    starts with a digit is prefixed with ``f_``. The new labels are assigned
    to ``X.columns``, so the caller's frame is modified in place.

    Labels are coerced with ``str()`` first because ``re.sub`` only accepts
    strings: without the coercion a frame with non-string labels (such as
    the integer labels pandas assigns by default) raises ``TypeError``.
    String labels are cleaned exactly as before.

    Parameters:
    - X (pd.DataFrame): frame whose column labels should be cleaned.

    Returns:
    - pd.DataFrame: ``X`` itself, with the cleaned labels.
    """
    def clean_name(name):
        # Non-string labels (ints, floats, ...) must become text before re.sub.
        name = str(name)
        # Replace everything except word characters, whitespace and '-'.
        name = re.sub(r'[^\w\s-]', '_', name)
        # Make sure the name does not start with a digit.
        if name and name[0].isdigit():
            name = 'f_' + name
        return name

    X.columns = [clean_name(col) for col in X.columns]
    return X

def process_categorical_features(df, max_unique=10):
    """
    Detect low-cardinality columns in a data frame and label-encode them.

    A column counts as categorical when ``df[col].nunique()`` is at most
    ``max_unique``. Each such column is cast to ``str``, passed through
    ``fillna('NaN')`` and replaced, on ``df`` itself, by the integer codes
    produced by a fresh ``LabelEncoder``.

    Parameters:
    - df (pd.DataFrame): input data frame; its categorical columns are overwritten.
    - max_unique (int): largest number of distinct values for a column to be
      treated as categorical.

    Returns:
    - cat_idxs (list of int): positional index of every categorical column.
    - cat_dims (list of int): number of encoder classes for each of them.
    - df (pd.DataFrame): the same data frame, after encoding.
    """
    # Count distinct values once per column; the count drives both the
    # selection below and the progress message.
    unique_counts = [(col, df[col].nunique()) for col in df.columns]
    selected = [(col, n_unique) for col, n_unique in unique_counts if n_unique <= max_unique]

    cat_idxs, cat_dims = [], []
    for col, n_unique in selected:
        print(f"     处理类别特征: {col}，唯一值数量: {n_unique}")
        as_text = df[col].astype(str).fillna('NaN')
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(as_text)
        cat_dims.append(len(encoder.classes_))
        cat_idxs.append(df.columns.get_loc(col))

    return cat_idxs, cat_dims, df

In [9]:
# Load the feature table; it is expected to contain at least the "dataset",
# "variable" and "label" columns used below.
X_y_group_train = pd.read_csv('mid_data/X_y_group_train_updated_v13.4_rolling.csv')

print("##### Adding numeric labels y")
# Add the numeric target "y": the integer code of each "label" value.
# `le` stays bound to the fitted encoder.
le = LabelEncoder()
X_y_group_train["y"] = le.fit_transform(X_y_group_train["label"])
# Reorder the columns: identifiers first, then the features, then "label" and "y".
X_y_group_train = X_y_group_train[["dataset", "variable"] + X_y_group_train.columns.drop(["dataset", "variable", "label", "y"]).tolist() + ["label", "y"]]

print("##### Data Preprocessing...")
# Columns to delete. Names are matched exactly and only those actually present
# in the frame are dropped.
blacklist = [
    "ttest(v,X)", 
    "pvalue(ttest(v,X))<=0.05", 
    "ttest(v,Y)", 
    "pvalue(ttest(v,Y))<=0.05", 
    "ttest(X,Y)", 
    "pvalue(ttest(X,Y))<=0.05",
    "square_dimension", 
    "max(PPS(v,others))",
    "TLI_Collider",
    "TLI_Confounder",
    "RMSEA_Collider",
    "RMSEA_Confounder",
    # "RMSEA_Cause of Y",
]
columns_to_drop = [col for col in blacklist if col in X_y_group_train.columns]
X_y_group_train = X_y_group_train.drop(columns=columns_to_drop)
print('     删除多余列后样本量', X_y_group_train.shape)


# Fill missing values in every numeric column with that column's mean.
# NOTE(review): the means are computed over the whole frame, before the
# train/test split below, so test rows contribute to the imputed values.
# NOTE(review): this runs before process_categorical_features, so a
# low-cardinality column that has missing values gains its mean as one more
# distinct value - confirm this is intended.
# NOTE(review): the numeric selection also covers "y" (and "dataset" if it was
# parsed as a number) - TODO confirm those columns never contain NaN.
numeric_columns = X_y_group_train.select_dtypes(include=[np.number]).columns
X_y_group_train[numeric_columns] = X_y_group_train[numeric_columns].fillna(X_y_group_train[numeric_columns].mean())

# Clean the feature names (applied to every column of the frame, in place).
X_y_group_train = clean_feature_names(X_y_group_train)

print("##### Extracting X_train, y_train, and group")
# Separate the dataset ids, the features and the target.
# NOTE(review): group_train is not used anywhere in the visible code; the split
# below is stratified on y only, not grouped by dataset - TODO confirm rows of
# one dataset are allowed on both sides of the split.
group_train = X_y_group_train["dataset"]
X = X_y_group_train.drop(["variable", "dataset", "label", "y"], axis="columns")
y = X_y_group_train["y"]

# Label-encode the categorical features; cat_idxs holds their column positions in X.
cat_idxs, cat_dims, X = process_categorical_features(X)
print(f"     类别特征索引 (cat_idxs): {cat_idxs}")
print(f"     类别特征模态数 (cat_dims): {cat_dims}")

# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("     分割数据集后X_train & X_test shape:", X_train.shape, X_test.shape)
print("     y_train 唯一值:", np.unique(y_train))
print("     y_test 唯一值:", np.unique(y_test))

##### Adding numeric labels y
##### Data Preprocessing...
     删除多余列后样本量 (142910, 211)
##### Extracting X_train, y_train, and group
     处理类别特征: dimension，唯一值数量: 8
     处理类别特征: ExactSearch_v_X_，唯一值数量: 2
     处理类别特征: ExactSearch_X_v_，唯一值数量: 2
     处理类别特征: ExactSearch_v_Y_，唯一值数量: 2
     处理类别特征: ExactSearch_Y_v_，唯一值数量: 2
     处理类别特征: ExactSearch_X_Y_，唯一值数量: 2
     处理类别特征: PC_v_X_，唯一值数量: 2
     处理类别特征: PC_X_v_，唯一值数量: 2
     处理类别特征: PC_v_Y_，唯一值数量: 2
     处理类别特征: PC_Y_v_，唯一值数量: 2
     处理类别特征: PC_X_Y_，唯一值数量: 2
     处理类别特征: FCI_v_X_，唯一值数量: 4
     处理类别特征: FCI_X_v_，唯一值数量: 4
     处理类别特征: FCI_v_Y_，唯一值数量: 4
     处理类别特征: FCI_Y_v_，唯一值数量: 4
     处理类别特征: FCI_X_Y_，唯一值数量: 4
     处理类别特征: GRaSP_v_X_，唯一值数量: 3
     处理类别特征: GRaSP_X_v_，唯一值数量: 3
     处理类别特征: GRaSP_v_Y_，唯一值数量: 3
     处理类别特征: GRaSP_Y_v_，唯一值数量: 3
     处理类别特征: GRaSP_X_Y_，唯一值数量: 3
     类别特征索引 (cat_idxs): [0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 137, 138, 139, 140, 141]
     类别特征模态数 (cat_dims): [8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

In [11]:
# NOTE(review): balanced_accuracy_score, classification_report and numpy are
# already imported in the first cell; only StratifiedKFold is new here.
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, classification_report
import numpy as np

# Define the model. n_estimators is an upper bound on the boosting rounds; the
# early-stopping callback passed to fit() below can end training sooner.
model = LGBMClassifier(
    n_estimators=3000,
    learning_rate=0.05,
    num_leaves=21,
    max_depth=5,
    min_child_samples=50,
    colsample_bytree=0.6,
    reg_alpha=25.0,
    reg_lambda=25.0,
    random_state=42,
    n_jobs=-1,
    class_weight='balanced',
    device='cpu',
    verbose=-1,
)

# Define the 5-fold stratified cross-validation (shuffled, fixed seed).
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold scores. Note that test_scores collects the *validation*-fold scores.
train_scores = []
test_scores = []

# Run the 5-fold cross-validation.
# NOTE(review): the folds are drawn from the full X / y, i.e. including the rows
# that the earlier hold-out split placed in X_test / y_test.
for fold, (train_index, val_index) in enumerate(skf.split(X, y), 1):
    
    # NOTE(review): this rebinds X_train / y_train, replacing the variables
    # created by the earlier train_test_split cell.
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    # Callbacks, created anew for every fold: log the metric every 100 rounds
    # and stop after 10 rounds without improvement on the eval set.
    callbacks = [lgb.log_evaluation(period=100), lgb.early_stopping(stopping_rounds=10)]
    
    # Train the model. The same `model` object is fitted again in every fold.
    # NOTE(review): early stopping monitors the same validation fold that is
    # scored below, so the validation score may be optimistic.
    model.fit(X_train, y_train, 
              callbacks=callbacks, 
              eval_set=[(X_val, y_val)],
              categorical_feature=cat_idxs)  
    
    # Predict on both the training part and the validation part of the fold.
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    
    # Compute the balanced accuracy.
    train_score = balanced_accuracy_score(y_train, y_train_pred)
    val_score = balanced_accuracy_score(y_val, y_val_pred)
    
    train_scores.append(train_score)
    test_scores.append(val_score)
    
    print(f"Fold {fold}:")
    print(f"  Train balanced accuracy: {train_score:.6f}")
    print(f"  Validation balanced accuracy: {val_score:.6f}")
    

# Print the mean and standard deviation of the scores across folds.
print("\nAverage Scores:")
print(f"  Train balanced accuracy: {np.mean(train_scores):.6f} (+/- {np.std(train_scores):.6f})")
print(f"  Validation balanced accuracy: {np.mean(test_scores):.6f} (+/- {np.std(test_scores):.6f})")

     剔除离群样本后X_train & X_val shape: (114203, 207) (28582, 207)
     删除多余列后X_train & X_val shape: (114203, 203) (28582, 203)
Training until validation scores don't improve for 10 rounds
[100]	valid_0's multi_logloss: 0.905664
[200]	valid_0's multi_logloss: 0.832034
[300]	valid_0's multi_logloss: 0.796924
[400]	valid_0's multi_logloss: 0.77459
[500]	valid_0's multi_logloss: 0.758362
[600]	valid_0's multi_logloss: 0.746004
[700]	valid_0's multi_logloss: 0.736635
[800]	valid_0's multi_logloss: 0.729625
[900]	valid_0's multi_logloss: 0.723541
[1000]	valid_0's multi_logloss: 0.718717
[1100]	valid_0's multi_logloss: 0.71452
[1200]	valid_0's multi_logloss: 0.711193
[1300]	valid_0's multi_logloss: 0.708841
[1400]	valid_0's multi_logloss: 0.707141
[1500]	valid_0's multi_logloss: 0.706008
[1600]	valid_0's multi_logloss: 0.705084
Early stopping, best iteration is:
[1662]	valid_0's multi_logloss: 0.704752
Fold 1:
  Train balanced accuracy: 0.899174
  Validation balanced accuracy: 0.672930
     剔除离群样