In [6]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

import xgboost as xgb
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_class_weight

from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

In [7]:
def remove_columns_with_keywords(df, keywords):
    columns_to_drop = [col for col in df.columns if any(keyword in col for keyword in keywords)]
    return df.drop(columns=columns_to_drop)

def get_columns_with_keywords(df, keywords):
    columns_to_get = [col for col in df.columns if any(keyword in col for keyword in keywords)]
    return columns_to_get

def recovery_dataset_id(X_y_group_train):
    # 假设'dataset'列是需要转换的列
    X_y_group_train['dataset'] = X_y_group_train['dataset'].apply(lambda x: f'{int(x):05}')
    return X_y_group_train

def clean_feature_names(X):
    # 函数用于清理特征名称
    def clean_name(name):
        # 移除或替换特殊字符
        name = re.sub(r'[^\w\s-]', '_', name)
        # 确保名称不以数字开头
        if name and name[0].isdigit():
            name = 'f_' + name
        return name

    X.columns = [clean_name(col) for col in X.columns]
    return X

def process_categorical_features(df, max_unique=10):
    """
    检测和处理数据框中的类别变量。

    参数：
    - df (pd.DataFrame): 输入的数据框。
    - max_unique (int): 判定为类别变量的最大唯一值数量。

    返回：
    - cat_idxs (list of int): 类别特征的索引。
    - cat_dims (list of int): 每个类别特征的模态数。
    - df (pd.DataFrame): 经过编码后的数据框。
    """
    cat_cols = [col for col in df.columns if df[col].nunique() <= max_unique]
    cat_dims = []
    cat_idxs = []

    for col in cat_cols:
        print(f"     处理类别特征: {col}，唯一值数量: {df[col].nunique()}")
        # 使用 LabelEncoder
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str).fillna('NaN'))
        cat_dims.append(len(le.classes_))
        cat_idxs.append(df.columns.get_loc(col))

    return cat_idxs, cat_dims, df

In [8]:
X_y_group_train = pd.read_csv('mid_data/X_y_group_train_updated_v13.4_rolling.csv')

print("##### Adding numeric labels y")
# 添加数值标签 y
le = LabelEncoder()
X_y_group_train["y"] = le.fit_transform(X_y_group_train["label"])
# 重新排列列
X_y_group_train = X_y_group_train[["dataset", "variable"] + X_y_group_train.columns.drop(["dataset", "variable", "label", "y"]).tolist() + ["label", "y"]]

print("##### Data Preprocessing...")
# 定义要删除的列
blacklist = [
    "ttest(v,X)", 
    "pvalue(ttest(v,X))<=0.05", 
    "ttest(v,Y)", 
    "pvalue(ttest(v,Y))<=0.05", 
    "ttest(X,Y)", 
    "pvalue(ttest(X,Y))<=0.05",
    "square_dimension", 
    "max(PPS(v,others))",
    "TLI_Collider",
    "TLI_Confounder",
    "RMSEA_Collider",
    "RMSEA_Confounder",
    # "RMSEA_Cause of Y",
]
columns_to_drop = [col for col in blacklist if col in X_y_group_train.columns]
X_y_group_train = X_y_group_train.drop(columns=columns_to_drop)
print('     删除多余列后样本量', X_y_group_train.shape)

# 处理无穷值：转换为NaN
X_y_group_train = X_y_group_train.replace([np.inf, -np.inf], np.nan)

# 处理数值列的缺失值
numeric_columns = X_y_group_train.select_dtypes(include=[np.number]).columns
X_y_group_train[numeric_columns] = X_y_group_train[numeric_columns].fillna(X_y_group_train[numeric_columns].mean())

# 清理特征名称
X_y_group_train = clean_feature_names(X_y_group_train)

print("##### Extracting X_train, y_train, and group")
# 分离数据集ID、特征和标签
group_train = X_y_group_train["dataset"]
X = X_y_group_train.drop(["variable", "dataset", "label", "y"], axis="columns")
y = X_y_group_train["y"]

# 处理类别特征
cat_idxs, cat_dims, X = process_categorical_features(X)
print(f"     类别特征索引 (cat_idxs): {cat_idxs}")
print(f"     类别特征模态数 (cat_dims): {cat_dims}")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("     分割数据集后X_train & X_test shape:", X_train.shape, X_test.shape)
print("     y_train 唯一值:", np.unique(y_train))
print("     y_test 唯一值:", np.unique(y_test))

classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = list(class_weights)  # 转换为列表
print(f"类别权重: {class_weights}")

##### Adding numeric labels y
##### Data Preprocessing...
     删除多余列后样本量 (142910, 207)
##### Extracting X_train, y_train, and group
     处理类别特征: dimension，唯一值数量: 8
     处理类别特征: ExactSearch_v_X_，唯一值数量: 2
     处理类别特征: ExactSearch_X_v_，唯一值数量: 2
     处理类别特征: ExactSearch_v_Y_，唯一值数量: 2
     处理类别特征: ExactSearch_Y_v_，唯一值数量: 2
     处理类别特征: ExactSearch_X_Y_，唯一值数量: 2
     处理类别特征: PC_v_X_，唯一值数量: 2
     处理类别特征: PC_X_v_，唯一值数量: 2
     处理类别特征: PC_v_Y_，唯一值数量: 2
     处理类别特征: PC_Y_v_，唯一值数量: 2
     处理类别特征: PC_X_Y_，唯一值数量: 2
     处理类别特征: FCI_v_X_，唯一值数量: 4
     处理类别特征: FCI_X_v_，唯一值数量: 4
     处理类别特征: FCI_v_Y_，唯一值数量: 4
     处理类别特征: FCI_Y_v_，唯一值数量: 4
     处理类别特征: FCI_X_Y_，唯一值数量: 4
     处理类别特征: GRaSP_v_X_，唯一值数量: 3
     处理类别特征: GRaSP_X_v_，唯一值数量: 3
     处理类别特征: GRaSP_v_Y_，唯一值数量: 3
     处理类别特征: GRaSP_Y_v_，唯一值数量: 3
     处理类别特征: GRaSP_X_Y_，唯一值数量: 3
     类别特征索引 (cat_idxs): [0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 137, 138, 139, 140, 141]
     类别特征模态数 (cat_dims): [8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

In [15]:


# 计算类别权重
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
weight_dict = dict(zip(classes, class_weights))
sample_weights = y_train.map(weight_dict)

# 设置XGBoost参数
params = {
    'max_depth': 5,
    'learning_rate': 0.05,
    'objective': 'multi:softmax',
    'num_class': 8,
    'eval_metric': 'mlogloss',
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 1,
    'gamma': 0.3,
    'reg_lambda': 10,
    'reg_alpha': 10,
    'tree_method': 'gpu_hist',  # 使用GPU
    'max_delta_step': 0.5,
    'n_estimators': 2000,
    'early_stopping_rounds': 50,
    'verbose': 100
}

# 创建XGBClassifier
xgb_model = XGBClassifier(**params)

# 训练模型
xgb_model.fit(
    X_train, 
    y_train,
    sample_weight=sample_weights,
    eval_set=[(X_test, y_test)],
    verbose=100
)

# 预测和评估
y_test_pred = xgb_model.predict(X_test)
test_score = balanced_accuracy_score(y_test, y_test_pred)
print(f"Test set balanced accuracy: {test_score:.6f}")


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "verbose" } are not used.



[0]	validation_0-mlogloss:2.00697
[100]	validation_0-mlogloss:0.95882
[200]	validation_0-mlogloss:0.86604
[300]	validation_0-mlogloss:0.82351
[400]	validation_0-mlogloss:0.79710
[500]	validation_0-mlogloss:0.77741
[600]	validation_0-mlogloss:0.76252
[700]	validation_0-mlogloss:0.75050
[800]	validation_0-mlogloss:0.74055
[900]	validation_0-mlogloss:0.73211
[1000]	validation_0-mlogloss:0.72469
[1100]	validation_0-mlogloss:0.71852
[1200]	validation_0-mlogloss:0.71294
[1300]	validation_0-mlogloss:0.70818
[1400]	validation_0-mlogloss:0.70344
[1500]	validation_0-mlogloss:0.69925
[1600]	validation_0-mlogloss:0.69568
[1700]	validation_0-mlogloss:0.69219
[1800]	validation_0-mlogloss:0.68913
[1900]	validation_0-mlogloss:0.68618
[1999]	validation_0-mlogloss:0.68373



    E.g. tree_method = "hist", device = "cuda"



Test set balanced accuracy: 0.679183


Test set balanced accuracy: 0.677989


In [11]:
# 创建Pool对象
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_idxs)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_idxs)

# 计算类别权重（使用每个类别的逆频率）
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = list(class_weights)  # 转换为列表
print(f"类别权重: {class_weights}")

cat_model = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.05,
    depth=7,
    
    l2_leaf_reg=0.1,
    model_size_reg=0.1,
    classes_count=8,
    class_weights=class_weights,
    cat_features=cat_idxs,
    random_seed=42,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    verbose=100,
    early_stopping_rounds=50,
    task_type='GPU',
    devices='0:1',  # 如果使用GPU,指定GPU设备
    save_snapshot=False,
    train_dir="./tmp",
    leaf_estimation_method='Newton',  # 默认方法
)


# 训练模型
cat_model.fit(
    train_pool,
    eval_set=test_pool,
    use_best_model=True
)

y_test_pred = cat_model.predict(test_pool)
test_score = balanced_accuracy_score(y_test, y_test_pred)
print(f"Test set balanced accuracy: {test_score:.6f}")


类别权重: [1.450715663384428, 0.6722329366386002, 3.281515499425947, 1.975532209012994, 0.9026654876200101, 2.093305990918412, 0.32411040301181593, 2.930285011277425]
0:	learn: 0.5276560	test: 0.5308194	best: 0.5308194 (0)	total: 62.8ms	remaining: 2m 5s
100:	learn: 0.6035374	test: 0.5852222	best: 0.5852222 (100)	total: 5.65s	remaining: 1m 46s
200:	learn: 0.6714016	test: 0.6297306	best: 0.6297306 (200)	total: 11.1s	remaining: 1m 39s
300:	learn: 0.7135642	test: 0.6460039	best: 0.6464452 (298)	total: 16.5s	remaining: 1m 32s
400:	learn: 0.7460634	test: 0.6555250	best: 0.6558477 (399)	total: 21.6s	remaining: 1m 25s
500:	learn: 0.7746371	test: 0.6617700	best: 0.6620216 (498)	total: 26s	remaining: 1m 17s
600:	learn: 0.7973425	test: 0.6645677	best: 0.6649570 (598)	total: 30.4s	remaining: 1m 10s
700:	learn: 0.8158744	test: 0.6676719	best: 0.6678708 (692)	total: 34.8s	remaining: 1m 4s
800:	learn: 0.8316080	test: 0.6703876	best: 0.6704599 (786)	total: 39.3s	remaining: 58.8s
900:	learn: 0.8465332	test

In [13]:
lgb_model = lgb.LGBMClassifier(
    n_estimators=3000,
    learning_rate=0.05,
    num_leaves=21,
    max_depth=5,
    min_child_samples=50,
    colsample_bytree=0.6,
    reg_alpha=25.0,
    reg_lambda=25.0,
    random_state=42,
    class_weight='balanced',
    device='cpu',
    verbose=-1,
)


# 定义回调函数
callbacks = [lgb.log_evaluation(period=100), lgb.early_stopping(stopping_rounds=10)]

# 训练模型
lgb_model.fit(X_train, y_train, 
          callbacks=callbacks, 
          eval_set=[(X_test, y_test)],
          categorical_feature=cat_idxs)

y_test_pred = lgb_model.predict(X_test)
test_score = balanced_accuracy_score(y_test, y_test_pred)
print(f"Test set balanced accuracy: {test_score:.6f}")


[1]	valid_0's multi_logloss: 1.96233
Training until validation scores don't improve for 10 rounds
[2]	valid_0's multi_logloss: 1.86499
[3]	valid_0's multi_logloss: 1.78514
[4]	valid_0's multi_logloss: 1.71353
[5]	valid_0's multi_logloss: 1.6524
[6]	valid_0's multi_logloss: 1.59702
[7]	valid_0's multi_logloss: 1.5475
[8]	valid_0's multi_logloss: 1.50319
[9]	valid_0's multi_logloss: 1.465
[10]	valid_0's multi_logloss: 1.43019
[11]	valid_0's multi_logloss: 1.39837
[12]	valid_0's multi_logloss: 1.36864
[13]	valid_0's multi_logloss: 1.34192
[14]	valid_0's multi_logloss: 1.31653
[15]	valid_0's multi_logloss: 1.29363
[16]	valid_0's multi_logloss: 1.27198
[17]	valid_0's multi_logloss: 1.25242
[18]	valid_0's multi_logloss: 1.23477
[19]	valid_0's multi_logloss: 1.21799
[20]	valid_0's multi_logloss: 1.20293
[21]	valid_0's multi_logloss: 1.18903
[22]	valid_0's multi_logloss: 1.17606
[23]	valid_0's multi_logloss: 1.16395
[24]	valid_0's multi_logloss: 1.15217
[25]	valid_0's multi_logloss: 1.14185
[2

In [28]:
from sklearn.base import BaseEstimator, ClassifierMixin

class PretrainedVotingClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, estimators, voting='soft', weights=None):
        self.estimators = estimators
        self.voting = voting
        self.weights = weights
        
    def fit(self, X, y=None):
        # 已经预训练，无需再训练
        return self
    
    def predict(self, X):
        if self.voting == 'soft':
            # 对于概率投票
            probas = np.asarray([clf.predict_proba(X) for clf in self.estimators])
            avg_proba = np.average(probas, axis=0, weights=self.weights)
            return np.argmax(avg_proba, axis=1)
        else:
            # 对于硬投票
            predictions = np.asarray([clf.predict(X) for clf in self.estimators]).T
            maj_vote = np.apply_along_axis(
                lambda x: np.argmax(np.bincount(x, weights=self.weights)), axis=1, arr=predictions
            )
            return maj_vote

# 初始化自定义投票分类器
voting_clf = PretrainedVotingClassifier(
    estimators=[xgb_model, cat_model, lgb_model],
    voting='soft'
)

# 直接进行预测
y_pred_voting = voting_clf.predict(X_test)

# 评估结果
print("Voting Classifier Results:")
print(f"Balanced Accuracy: {balanced_accuracy_score(y_test, y_pred_voting):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_voting))



Voting Classifier Results:
Balanced Accuracy: 0.6815

Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.71      0.66      2463
           1       0.83      0.80      0.82      5315
           2       0.51      0.59      0.55      1089
           3       0.59      0.62      0.61      1809
           4       0.70      0.68      0.69      3958
           5       0.62      0.73      0.67      1706
           6       0.90      0.83      0.86     11023
           7       0.43      0.48      0.46      1219

    accuracy                           0.75     28582
   macro avg       0.65      0.68      0.66     28582
weighted avg       0.76      0.75      0.76     28582



In [21]:
# 获取基模型在训练集上的预测概率
xgb_train_pred = xgb_model.predict_proba(X_train)
cat_train_pred = cat_model.predict_proba(X_train)
lgb_train_pred = lgb_model.predict_proba(X_train)

# 获取基模型在测试集上的预测概率
xgb_test_pred = xgb_model.predict_proba(X_test)
cat_test_pred = cat_model.predict_proba(X_test)
lgb_test_pred = lgb_model.predict_proba(X_test)


In [22]:
import numpy as np

# 将预测概率水平堆叠，形成新的特征集
X_meta_train = np.hstack((xgb_train_pred, cat_train_pred, lgb_train_pred))
X_meta_test = np.hstack((xgb_test_pred, cat_test_pred, lgb_test_pred))


In [26]:
from sklearn.linear_model import LogisticRegression

meta_params = {
    'max_depth': 3,
    'learning_rate': 0.05,
    'n_estimators': 100,
    'objective': 'multi:softprob',
    'num_class': 8,
    'eval_metric': 'mlogloss',
    'use_label_encoder': False,
    'verbosity': 1,
    'random_state': 42,
}

# 初始化元模型
meta_model = XGBClassifier(**meta_params)

meta_model.fit(
    X_meta_train,
    y_train,
    eval_set=[(X_meta_test, y_test)],
    verbose=True
)

[0]	validation_0-mlogloss:1.94361
[1]	validation_0-mlogloss:1.83151
[2]	validation_0-mlogloss:1.73598


Parameters: { "use_label_encoder" } are not used.



[3]	validation_0-mlogloss:1.65308
[4]	validation_0-mlogloss:1.57995
[5]	validation_0-mlogloss:1.51477
[6]	validation_0-mlogloss:1.45619
[7]	validation_0-mlogloss:1.40301
[8]	validation_0-mlogloss:1.35472
[9]	validation_0-mlogloss:1.31052
[10]	validation_0-mlogloss:1.26989
[11]	validation_0-mlogloss:1.23253
[12]	validation_0-mlogloss:1.19805
[13]	validation_0-mlogloss:1.16608
[14]	validation_0-mlogloss:1.13646
[15]	validation_0-mlogloss:1.10891
[16]	validation_0-mlogloss:1.08326
[17]	validation_0-mlogloss:1.05950
[18]	validation_0-mlogloss:1.03734
[19]	validation_0-mlogloss:1.01657
[20]	validation_0-mlogloss:0.99729
[21]	validation_0-mlogloss:0.97927
[22]	validation_0-mlogloss:0.96242
[23]	validation_0-mlogloss:0.94673
[24]	validation_0-mlogloss:0.93208
[25]	validation_0-mlogloss:0.91844
[26]	validation_0-mlogloss:0.90567
[27]	validation_0-mlogloss:0.89371
[28]	validation_0-mlogloss:0.88257
[29]	validation_0-mlogloss:0.87214
[30]	validation_0-mlogloss:0.86233
[31]	validation_0-mlogloss:

In [27]:
# 在测试集上进行预测
y_pred_stacking = meta_model.predict(X_meta_test)

# 评估结果
from sklearn.metrics import balanced_accuracy_score, classification_report

print("\nStacking Classifier Results:")
print(f"Balanced Accuracy: {balanced_accuracy_score(y_test, y_pred_stacking):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_stacking))



Stacking Classifier Results:
Balanced Accuracy: 0.6545

Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.64      0.66      2463
           1       0.82      0.81      0.81      5315
           2       0.59      0.51      0.55      1089
           3       0.61      0.61      0.61      1809
           4       0.69      0.71      0.70      3958
           5       0.73      0.66      0.70      1706
           6       0.85      0.89      0.87     11023
           7       0.49      0.39      0.44      1219

    accuracy                           0.76     28582
   macro avg       0.68      0.65      0.67     28582
weighted avg       0.76      0.76      0.76     28582

