In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Load the pre-computed feature table; the code below reads its "dataset",
# "variable" and "label" columns, and treats every other column as a feature.
X_y_group_train = pd.read_csv('./mid_data/X_y_group_train_updated_v12.2_优化线性.csv')

print("Adding numeric labels y")
le = LabelEncoder()
X_y_group_train["y"] = le.fit_transform(X_y_group_train["label"])

# Column layout: identifiers first, features in their original order, targets last.
leading_columns = ["dataset", "variable"]
trailing_columns = ["label", "y"]
feature_columns = X_y_group_train.columns.drop(leading_columns + trailing_columns).tolist()
X_y_group_train = X_y_group_train[leading_columns + feature_columns + trailing_columns]

# Features that must not reach the model; only the ones actually present are dropped.
blacklist = [
    "ttest(v,X)",
    "pvalue(ttest(v,X))<=0.05",
    "ttest(v,Y)",
    "pvalue(ttest(v,Y))<=0.05",
    "ttest(X,Y)",
    "pvalue(ttest(X,Y))<=0.05",
]
columns_to_drop = [name for name in blacklist if name in X_y_group_train.columns]
X_y_group_train = X_y_group_train.drop(columns=columns_to_drop)

# Mean-impute missing values in every numeric column (column-wise means over all rows).
numeric_columns = X_y_group_train.select_dtypes(include=[np.number]).columns
numeric_block = X_y_group_train[numeric_columns]
X_y_group_train[numeric_columns] = numeric_block.fillna(numeric_block.mean())

display(X_y_group_train)

print("Extracting X_train, y_train, and group")
# Features only: identifiers and both target columns are removed.
X_train = X_y_group_train.drop(columns=["variable", "dataset", "label", "y"])
y_train = X_y_group_train["y"]
group_train = X_y_group_train["dataset"]

In [None]:
import re

def clean_feature_names(X):
    """Sanitize the column names of ``X`` in place and return ``X``.

    Every character that is not a word character, whitespace, or a hyphen is
    replaced with an underscore, and a name that starts with a digit is
    prefixed with ``f_``.

    Args:
        X: Object exposing an iterable, assignable ``columns`` attribute
            (e.g. a pandas DataFrame). Its ``columns`` are overwritten.

    Returns:
        The same object ``X`` that was passed in, with cleaned column names.
    """
    def clean_name(name):
        # Column labels are not always strings (e.g. integer labels) and
        # re.sub only accepts str, so normalise to str first.
        name = re.sub(r'[^\w\s-]', '_', str(name))
        # Guard against an empty label before inspecting its first character.
        if name and name[0].isdigit():
            name = 'f_' + name
        return name

    X.columns = [clean_name(col) for col in X.columns]
    return X

# Clean the feature names: pass X_train through clean_feature_names and rebind the result.
X_train = clean_feature_names(X_train)

In [None]:
# Split the data: hold out 20% of the rows, with a fixed seed for repeatability.
# NOTE(review): X_train / y_train are rebound to the training subset, so
# re-running this cell on its own splits the already-split data again.
# NOTE(review): the split is row-wise and does not use `group_train`; rows
# sharing a "dataset" value can land on both sides -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [None]:
import torch
from sklearn.metrics import balanced_accuracy_score, classification_report
from pytorch_tabnet.tab_model import TabNetClassifier

# Define the TabNetClassifier
clf = TabNetClassifier(
    n_d=64,                  # width of the decision layer
    n_a=64,                  # width of the attention embedding
    n_steps=5,               # number of steps
    gamma=1.5,               # feature-reuse coefficient
    cat_idxs=[],             # indices of categorical features (none here)
    cat_dims=[],             # number of modalities per categorical feature (none here)
    cat_emb_dim=1,           # embedding dimension for categorical features
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params=dict(step_size=50, gamma=0.9),
    mask_type='sparsemax',   # or 'entmax'
    verbose=1,
    # 'gpu' is not a valid torch device type and is rejected when the device
    # is created; 'auto' uses CUDA when it is available and otherwise the CPU.
    device_name='auto'
)

# pytorch-tabnet works on NumPy arrays and rejects pandas DataFrame/Series
# inputs, so convert once and use the arrays for fitting, prediction and scoring.
X_train_np, y_train_np = np.asarray(X_train), np.asarray(y_train)
X_test_np, y_test_np = np.asarray(X_test), np.asarray(y_test)

# Train the model
# NOTE(review): pytorch-tabnet early-stops on the last eval set / last metric,
# i.e. 'valid' balanced accuracy here; that same split is reported as the test
# score below, so the reported figure is likely optimistic -- confirm.
clf.fit(
    X_train=X_train_np,
    y_train=y_train_np,
    eval_set=[(X_train_np, y_train_np), (X_test_np, y_test_np)],
    eval_name=['train', 'valid'],
    eval_metric=['accuracy', 'balanced_accuracy'],
    max_epochs=2000,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)

# Predict
y_train_pred = clf.predict(X_train_np)
y_test_pred = clf.predict(X_test_np)

# Compute balanced accuracy
train_score = balanced_accuracy_score(y_train_np, y_train_pred)
test_score = balanced_accuracy_score(y_test_np, y_test_pred)

print(f"训练集平衡准确率: {train_score:.6f}")
print(f"测试集平衡准确率: {test_score:.6f}")

# Print the detailed classification report for the held-out split
print("\n测试集分类报告:")
print(classification_report(y_test_np, y_test_pred))