In [1]:
import pandas as pd
import numpy as np
import torch
import re
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, classification_report
from pytorch_tabnet.tab_model import TabNetClassifier

In [2]:
import torch

def get_device():
    if torch.cuda.is_available():
        device = 'cuda'
        print("CUDA 可用，使用 GPU 进行训练。")
    else:
        device = 'cpu'
        print("CUDA 不可用，使用 CPU 进行训练。")
    return device

device = get_device()

def clean_feature_names(X):
    # 函数用于清理特征名称
    def clean_name(name):
        # 移除或替换特殊字符
        name = re.sub(r'[^\w\s-]', '_', name)
        # 确保名称不以数字开头
        if name[0].isdigit():
            name = 'f_' + name
        return name

    X.columns = [clean_name(col) for col in X.columns]
    return X

# 定义函数以自动检测和处理类别特征
def process_categorical_features(df, max_unique=10):
    """
    自动检测和处理数据框中的类别变量。

    参数：
    - df (pd.DataFrame): 输入的数据框。
    - max_unique (int): 判定为类别变量的最大唯一值数量。

    返回：
    - cat_idxs (list of int): 类别特征的索引。
    - cat_dims (list of int): 每个类别特征的模态数。
    - encoder_dict (dict): 存储每个类别特征的 LabelEncoder 实例。
    """
    cat_cols = [col for col in df.columns if df[col].nunique() <= max_unique]
    cat_dims = []
    cat_idxs = []
    encoder_dict = {}

    for col in cat_cols:
        print(f"处理类别特征: {col}，唯一值数量: {df[col].nunique()}")
        # 使用 LabelEncoder
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].values)
        cat_dims.append(len(le.classes_))
        cat_idxs.append(df.columns.get_loc(col))

    return cat_idxs, cat_dims, df

CUDA 不可用，使用 CPU 进行训练。
CUDA 不可用，使用 CPU 进行训练。


In [3]:
X_y_group_train = pd.read_csv('./mid_data/X_y_group_train_updated_v12.2_piecewise.csv')

print("Adding numeric labels y")
le = LabelEncoder()
X_y_group_train["y"] = le.fit_transform(X_y_group_train["label"])
# reordering columns:
X_y_group_train = X_y_group_train[["dataset", "variable"] + X_y_group_train.columns.drop(["dataset", "variable", "label", "y"]).tolist() + ["label", "y"]]

blacklist = ["ttest(v,X)", "pvalue(ttest(v,X))<=0.05", "ttest(v,Y)", "pvalue(ttest(v,Y))<=0.05", "ttest(X,Y)", "pvalue(ttest(X,Y))<=0.05"\
    "square_dimension", "max(PPS(v,others))"]
columns_to_drop = [col for col in blacklist if col in X_y_group_train.columns]
X_y_group_train = X_y_group_train.drop(columns=columns_to_drop)

numeric_columns = X_y_group_train.select_dtypes(include=[np.number]).columns
X_y_group_train[numeric_columns] = X_y_group_train[numeric_columns].fillna(X_y_group_train[numeric_columns].mean())

display(X_y_group_train)

# 清理特征名称
X_y_group_train = clean_feature_names(X_y_group_train)

print("Extracting X_train, y_train, and group")
# 分离数据集ID、特征和标签
group_train = X_y_group_train["dataset"]
X = X_y_group_train.drop(["variable", "dataset", "label", "y"], axis="columns")
y = X_y_group_train["y"]

# 处理类别特征
cat_idxs, cat_dims, X = process_categorical_features(X)
print(f"类别特征索引 (cat_idxs): {cat_idxs}")
print(f"类别特征模态数 (cat_dims): {cat_dims}")

Adding numeric labels y


Unnamed: 0,dataset,variable,dimension,"corr(v,X)","corr(v,Y)","max(corr(v, others))","min(corr(v, others))","mean(corr(v, others))","std(corr(v, others))","corr(X,Y)",...,v~X_piecewise_coef1,v~X_piecewise_coef2,v~Y_piecewise_coef1,v~Y_piecewise_coef2,X~v_piecewise_coef1,X~v_piecewise_coef2,Y~v_piecewise_coef1,Y~v_piecewise_coef2,label,y
0,0,0,8,0.169735,-0.113595,0.791467,0.027355,0.282376,0.298969,-0.771058,...,0.087250,0.164971,-0.112374,-0.002442,0.012587,0.314297,-0.312407,0.397623,Consequence of Y,5
1,0,2,8,-0.072334,0.139419,0.230139,0.003233,0.090511,0.082201,-0.771058,...,-0.051409,-0.041849,0.163493,-0.048149,-0.056995,-0.030678,0.293044,-0.307249,Independent,6
2,0,3,8,0.123115,-0.238769,0.781051,0.012225,0.257037,0.288165,-0.771058,...,0.208082,-0.169934,-0.288890,0.100243,0.213582,-0.180935,0.080068,-0.637675,Cause of Y,1
3,0,4,8,-0.001935,0.013921,0.147408,0.001935,0.052451,0.053457,-0.771058,...,0.016450,-0.036772,0.098802,-0.169762,-0.006993,0.010116,-0.369786,0.767414,Cause of Y,1
4,0,5,8,0.284323,-0.466570,0.791467,0.064815,0.426306,0.250048,-0.771058,...,0.463463,-0.358279,-0.669777,0.406414,0.454525,-0.340405,-0.229665,-0.473811,Mediator,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142905,46997,6,7,0.014740,0.697089,0.748635,0.013858,0.376630,0.352655,-0.089641,...,0.087745,-0.146010,0.575781,0.242615,0.056678,-0.083877,0.726976,-0.059773,Cause of Y,1
142906,46997,7,7,-0.007397,-0.116481,0.998843,0.007397,0.180741,0.333308,-0.089641,...,0.123670,-0.262133,-0.110020,-0.012921,0.002044,-0.018881,-0.397732,0.562503,Cause of Y,1
142907,46997,8,7,-0.011658,0.908029,0.970364,0.011658,0.469777,0.441105,-0.089641,...,-0.071578,0.119841,1.012838,-0.209618,0.122501,-0.268318,0.997623,-0.179187,Cause of Y,1
142908,46998,0,2,0.083546,-0.019665,0.083546,0.016856,0.040022,0.037719,0.036862,...,0.163719,-0.160346,0.014060,-0.067450,0.510875,-0.854658,-0.053777,0.068223,Consequence of X,4


Extracting X_train, y_train, and group
处理类别特征: dimension，唯一值数量: 8
处理类别特征: pvalue_ttest_X_Y____0_05，唯一值数量: 2
处理类别特征: square_dimension，唯一值数量: 8
处理类别特征: ExactSearch_v_X_，唯一值数量: 2
处理类别特征: ExactSearch_X_v_，唯一值数量: 2
处理类别特征: ExactSearch_v_Y_，唯一值数量: 2
处理类别特征: ExactSearch_Y_v_，唯一值数量: 2
处理类别特征: ExactSearch_X_Y_，唯一值数量: 2
处理类别特征: PC_v_X_，唯一值数量: 2
处理类别特征: PC_X_v_，唯一值数量: 2
处理类别特征: PC_v_Y_，唯一值数量: 2
处理类别特征: PC_Y_v_，唯一值数量: 2
处理类别特征: PC_X_Y_，唯一值数量: 2
处理类别特征: FCI_v_X_，唯一值数量: 4
处理类别特征: FCI_X_v_，唯一值数量: 4
处理类别特征: FCI_v_Y_，唯一值数量: 4
处理类别特征: FCI_Y_v_，唯一值数量: 4
处理类别特征: FCI_X_Y_，唯一值数量: 4
类别特征索引 (cat_idxs): [0, 8, 33, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62]
类别特征模态数 (cat_dims): [8, 2, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4]


In [4]:
# 分割数据集为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("y_train 唯一值:", np.unique(y_train))
print("y_test 唯一值:", np.unique(y_test))

y_train 唯一值: [0 1 2 3 4 5 6 7]
y_test 唯一值: [0 1 2 3 4 5 6 7]


In [5]:
# 定义 TabNetClassifier
clf = TabNetClassifier(
    n_d=64,                  # 决策层的宽度（小心过拟合）
    n_a=64,                  # 注意力嵌入的宽度（一般与n_d一致）
    n_steps=5,               # 决策步骤数（3-10）
    gamma=1.5,               # 特征重用系数（值接近 1 会减少层间的特征选择相关性，1.0-2.0）
    cat_idxs=cat_idxs,       # 类别特征的索引列表
    cat_dims=cat_dims,       # 每个类别特征的模态数（即类别数量）
    cat_emb_dim=1,           # 类别特征的嵌入维度
    n_independent=2,         # 每个步骤中独立的 Gated Linear Units (GLU) 层的数量（1-5）
    n_shared=2,              # 每个步骤中共享的 GLU 层的数量（1-5）
    epsilon=1e-5,            # 防止除以零的常数
    seed=42,                 # 随机种子
    momentum=0.02,           # 批量归一化的动量参数（0.01-0.4）
    clip_value=None,         # 如果设置为浮点数，将梯度剪裁到该值
    lambda_sparse=1e-3,      # 额外的稀疏性损失系数，值越大，模型在特征选择上越稀疏（1e-3-1e-1）
    optimizer_fn=torch.optim.Adam,                      # 优化器
    optimizer_params=dict(lr=2e-2),                     # 优化器的参数
    scheduler_fn=torch.optim.lr_scheduler.StepLR,       # 学习率调度器
    scheduler_params=dict(step_size=50, gamma=0.9),     # 学习率调度器的参数
    mask_type='sparsemax',   # 特征选择的掩码类型（'sparsemax' 或 'entmax'）
    # grouped_features=None,   # 将特征分组，使模型在同一组内共享注意力。这在特征预处理生成相关或依赖特征时尤其有用，例如使用 TF-IDF 或 PCA。特征重要性在同组内将相同。
    verbose=1,               # 是否打印训练过程中的信息（0 或 1）
    device_name=device,       # 使用 GPU
)

# 训练模型
clf.fit(
    X_train=X_train.values,  # 训练集的特征矩阵（np.array）
    y_train=y_train.values,  # 训练集的目标标签（np.array，对于多分类任务，标签应为整数编码）
    eval_set=[(X_train.values, y_train.values), (X_test.values, y_test.values)],   # 验证集列表
    eval_name=['train', 'valid'],                      # 验证集的名称
    eval_metric=['accuracy', 'balanced_accuracy'],     # 评估指标列表
    max_epochs=2000,         # 最大训练轮数
    patience=10,             # 早停的耐心轮数
    batch_size=512,         # 批量大小
    virtual_batch_size=128,  # 用于 Ghost Batch Normalization 的虚拟批次大小（应能被 batch_size 整除）
    num_workers=0,           # 用于 torch.utils.data.DataLoader 的工作线程数
    drop_last=False,         # 是否在训练过程中丢弃最后一个不完整的批次
    callbacks=None,          # 回调函数列表
    compute_importance=True,   # 是否计算特征重要性
)

# 预测
y_train_pred = clf.predict(X_train.values)
y_test_pred = clf.predict(X_test.values)

# 计算平衡准确率
train_score = balanced_accuracy_score(y_train, y_train_pred)
test_score = balanced_accuracy_score(y_test, y_test_pred)

print(f"训练集平衡准确率: {train_score:.6f}")
print(f"测试集平衡准确率: {test_score:.6f}")

# 输出详细的分类报告
print("\n测试集分类报告:")
print(classification_report(y_test, y_test_pred))



epoch 0  | loss: 1.51206 | train_accuracy: 0.56376 | train_balanced_accuracy: 0.29578 | valid_accuracy: 0.56091 | valid_balanced_accuracy: 0.29552 |  0:01:09s


KeyboardInterrupt: 