In [1]:
import pandas as pd
import numpy as np
import re
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

In [2]:
def remove_columns_with_keywords(df, keywords):
    """Return ``df`` without the columns whose name contains any keyword.

    Matching is a plain substring test (``keyword in column_name``).

    Parameters:
    - df (pd.DataFrame): frame whose columns are screened.
    - keywords (iterable of str): substrings that mark a column for removal.

    Returns:
    - pd.DataFrame: the result of ``df.drop(columns=...)`` for the matching
      columns.
    """
    matched = []
    for column_name in df.columns:
        for keyword in keywords:
            if keyword in column_name:
                matched.append(column_name)
                # One hit is enough; do not list the same column twice.
                break
    return df.drop(columns=matched)

def get_columns_with_keywords(df, keywords):
    """List the column names of ``df`` that contain at least one keyword.

    Matching is a plain substring test (``keyword in column_name``); the
    result keeps the column order of ``df``.

    Parameters:
    - df (pd.DataFrame): frame whose columns are screened.
    - keywords (iterable of str): substrings to look for.

    Returns:
    - list: the matching column names.
    """
    def _has_keyword(column_name):
        for keyword in keywords:
            if keyword in column_name:
                return True
        return False

    return [column_name for column_name in df.columns if _has_keyword(column_name)]

def recovery_dataset_id(X_y_group_train):
    """Rewrite the 'dataset' column as zero-padded id strings.

    Every value is converted with ``int`` and formatted with a minimum width
    of five characters (``7`` becomes ``'00007'``); longer ids are not
    truncated.

    Parameters:
    - X_y_group_train (pd.DataFrame): frame with a 'dataset' column.

    Returns:
    - pd.DataFrame: the same frame object, modified in place.
    """
    def _pad(raw_id):
        return format(int(raw_id), '05')

    X_y_group_train['dataset'] = X_y_group_train['dataset'].apply(_pad)
    return X_y_group_train

def clean_feature_names(X):
    """Sanitise the column names of ``X`` in place and return ``X``.

    Every character that is not a word character, whitespace or ``-`` is
    replaced by ``_``; a name that then starts with a digit is prefixed with
    ``f_``.

    Parameters:
    - X (pd.DataFrame): frame whose ``columns`` are rewritten.

    Returns:
    - pd.DataFrame: the same frame object with the cleaned column names.
    """
    def clean_name(name):
        # Swap out anything other than word characters, whitespace and '-'.
        cleaned = re.sub(r'[^\w\s-]', '_', name)
        # Names must not begin with a digit.
        needs_prefix = bool(cleaned) and cleaned[0].isdigit()
        return 'f_' + cleaned if needs_prefix else cleaned

    X.columns = list(map(clean_name, X.columns))
    return X

def process_categorical_features(df, max_unique=10):
    """
    Detect low-cardinality columns and replace them with integer codes.

    A column counts as categorical when it has at most ``max_unique``
    distinct non-missing values (``Series.nunique`` ignores missing values).
    Each such column is converted to strings, missing entries become the
    explicit token ``'NaN'``, and the strings are numbered in sorted order.
    The order is lexicographic (``'10.0'`` sorts before ``'2.0'``), so the
    codes are category identifiers, not an ordinal scale.

    Parameters:
    - df (pd.DataFrame): input frame. It is modified in place (categorical
      columns are overwritten with their codes) and also returned.
    - max_unique (int): largest number of distinct values for a column to be
      treated as categorical.

    Returns:
    - cat_idxs (list of int): positional index of every categorical column.
    - cat_dims (list of int): number of distinct codes per categorical column
      (includes the ``'NaN'`` token when the column had missing values).
    - df (pd.DataFrame): the encoded frame (same object as the input).
    """
    # Count once per column; the value is reused for selection and logging.
    unique_counts = {col: df[col].nunique() for col in df.columns}
    cat_cols = [col for col, count in unique_counts.items() if count <= max_unique]
    cat_dims = []
    cat_idxs = []

    for col in cat_cols:
        print(f"     处理类别特征: {col}，唯一值数量: {unique_counts[col]}")
        column = df[col]
        # Tag missing entries explicitly. Calling fillna() after astype(str)
        # has no effect because the missing values have already been turned
        # into ordinary strings by then.
        tokens = column.astype(str).mask(column.isna(), 'NaN')
        # Sorted unique values plus the position of every row's value in that
        # list: the same numbering LabelEncoder.fit_transform produces, without
        # using a target encoder on feature columns.
        categories, codes = np.unique(tokens.to_numpy(dtype=object), return_inverse=True)
        df[col] = codes
        cat_dims.append(len(categories))
        cat_idxs.append(df.columns.get_loc(col))

    return cat_idxs, cat_dims, df

In [3]:
# Data-preparation cell: load the training table, encode the label, drop
# blacklisted columns, impute, clean column names, encode low-cardinality
# features, split into train/test and compute balanced class weights.
X_y_group_train = pd.read_csv('mid_data/X_y_group_train_updated_v13.4_rolling.csv')

print("##### Adding numeric labels y")
# Add the numeric label column y (fit_transform derives the codes from "label")
le = LabelEncoder()
X_y_group_train["y"] = le.fit_transform(X_y_group_train["label"])
# Reorder the columns: dataset/variable first, features in between, label and y last
X_y_group_train = X_y_group_train[["dataset", "variable"] + X_y_group_train.columns.drop(["dataset", "variable", "label", "y"]).tolist() + ["label", "y"]]

print("##### Data Preprocessing...")
# Columns to remove (only those actually present are dropped below)
blacklist = [
    "ttest(v,X)", 
    "pvalue(ttest(v,X))<=0.05", 
    "ttest(v,Y)", 
    "pvalue(ttest(v,Y))<=0.05", 
    "ttest(X,Y)", 
    "pvalue(ttest(X,Y))<=0.05",
    "square_dimension", 
    "max(PPS(v,others))",
    "TLI_Collider",
    "TLI_Confounder",
    "RMSEA_Collider",
    "RMSEA_Confounder",
    # "RMSEA_Cause of Y",
]
columns_to_drop = [col for col in blacklist if col in X_y_group_train.columns]
X_y_group_train = X_y_group_train.drop(columns=columns_to_drop)
print('     删除多余列后样本量', X_y_group_train.shape)

# Infinite values are turned into NaN so they are imputed together with the
# other missing values
X_y_group_train = X_y_group_train.replace([np.inf, -np.inf], np.nan)

# Fill missing values in numeric columns with the column mean.
# NOTE(review): the means are computed over all rows, before the train/test
# split further down, so held-out rows contribute to the imputation values.
# The selection is by dtype, so it also covers the numeric "y" column.
numeric_columns = X_y_group_train.select_dtypes(include=[np.number]).columns
X_y_group_train[numeric_columns] = X_y_group_train[numeric_columns].fillna(X_y_group_train[numeric_columns].mean())

# Clean the feature names (must happen before the columns are referenced by
# their cleaned names; "dataset", "variable", "label" and "y" are unaffected
# by the substitution pattern)
X_y_group_train = clean_feature_names(X_y_group_train)

print("##### Extracting X_train, y_train, and group")
# Separate the dataset id, the features and the label.
# NOTE(review): group_train is extracted but not used anywhere else in this
# cell; the split below is stratified on y only.
group_train = X_y_group_train["dataset"]
X = X_y_group_train.drop(["variable", "dataset", "label", "y"], axis="columns")
y = X_y_group_train["y"]

# Encode the low-cardinality (categorical) features
cat_idxs, cat_dims, X = process_categorical_features(X)
print(f"     类别特征索引 (cat_idxs): {cat_idxs}")
print(f"     类别特征模态数 (cat_dims): {cat_dims}")

# 80/20 split, stratified on the label, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("     分割数据集后X_train & X_test shape:", X_train.shape, X_test.shape)
print("     y_train 唯一值:", np.unique(y_train))
print("     y_test 唯一值:", np.unique(y_test))

# Balanced class weights computed from the training labels only
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = list(class_weights)  # convert the array to a plain list
print(f"类别权重: {class_weights}")

##### Adding numeric labels y
##### Data Preprocessing...
     删除多余列后样本量 (142910, 207)
##### Extracting X_train, y_train, and group
     处理类别特征: dimension，唯一值数量: 8
     处理类别特征: ExactSearch_v_X_，唯一值数量: 2
     处理类别特征: ExactSearch_X_v_，唯一值数量: 2
     处理类别特征: ExactSearch_v_Y_，唯一值数量: 2
     处理类别特征: ExactSearch_Y_v_，唯一值数量: 2
     处理类别特征: ExactSearch_X_Y_，唯一值数量: 2
     处理类别特征: PC_v_X_，唯一值数量: 2
     处理类别特征: PC_X_v_，唯一值数量: 2
     处理类别特征: PC_v_Y_，唯一值数量: 2
     处理类别特征: PC_Y_v_，唯一值数量: 2
     处理类别特征: PC_X_Y_，唯一值数量: 2
     处理类别特征: FCI_v_X_，唯一值数量: 4
     处理类别特征: FCI_X_v_，唯一值数量: 4
     处理类别特征: FCI_v_Y_，唯一值数量: 4
     处理类别特征: FCI_Y_v_，唯一值数量: 4
     处理类别特征: FCI_X_Y_，唯一值数量: 4
     处理类别特征: GRaSP_v_X_，唯一值数量: 3
     处理类别特征: GRaSP_X_v_，唯一值数量: 3
     处理类别特征: GRaSP_v_Y_，唯一值数量: 3
     处理类别特征: GRaSP_Y_v_，唯一值数量: 3
     处理类别特征: GRaSP_X_Y_，唯一值数量: 3
     类别特征索引 (cat_idxs): [0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 137, 138, 139, 140, 141]
     类别特征模态数 (cat_dims): [8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

In [26]:
import xgboost as xgb
from xgboost import XGBClassifier


# Train a multi-class XGBoost model with the native API, evaluate it with
# balanced accuracy and save it to disk. Relies on X_train, X_test, y_train,
# y_test and classes defined by the data-preparation cell.
print("##### Training XGBoost model")

# Balanced class weights, expanded to one weight per training row
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
weight_dict = dict(zip(classes, class_weights))
sample_weights = y_train.map(weight_dict)

# Create DMatrix for XGBoost (only the training rows carry weights)
dtrain = xgb.DMatrix(X_train, label=y_train, weight=sample_weights)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set up XGBoost parameters
params = {
    'max_depth': 5,
    'eta': 0.05,
    'objective': 'multi:softmax',
    # Derived from the labels instead of a hard-coded 8 so the cell keeps
    # working if the label set changes.
    'num_class': len(classes),
    'eval_metric': 'mlogloss',
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 1,
    'gamma': 0.3,
    'lambda': 10,
    'alpha': 10,
    'device': 'cuda',  
    'max_delta_step': 0.5
}

# Train XGBoost model.
# NOTE(review): early stopping monitors the last entry of `evals`, i.e. the
# same hold-out split that is scored below, so the reported test metrics are
# optimistic. A separate validation split would avoid that.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=2000,
    evals=[(dtrain, 'train'), (dtest, 'test')],
    early_stopping_rounds=50,
    verbose_eval=100
)

# Predict with the rounds up to the best early-stopping iteration. Passing
# the range explicitly keeps the result independent of what the installed
# XGBoost version does by default. multi:softmax returns the class ids as
# floats, so they are cast back to integers to match y_train / y_test.
best_rounds = (0, model.best_iteration + 1)
y_train_pred = model.predict(dtrain, iteration_range=best_rounds).astype(int)
y_test_pred = model.predict(dtest, iteration_range=best_rounds).astype(int)

# Calculate balanced accuracy
train_score = balanced_accuracy_score(y_train, y_train_pred)
test_score = balanced_accuracy_score(y_test, y_test_pred)
print(f"Training set balanced accuracy: {train_score:.6f}")
print(f"Test set balanced accuracy: {test_score:.6f}")

# Print classification report
print("Test set classification report:")
print(classification_report(y_test, y_test_pred))

# Save the model
model.save_model('xgboost_model.json')
print("Model saved as 'xgboost_model.json'")

##### Training XGBoost model
[0]	train-mlogloss:2.01973	test-mlogloss:2.00699
[100]	train-mlogloss:1.05900	test-mlogloss:0.95913
[200]	train-mlogloss:0.91333	test-mlogloss:0.86587
[300]	train-mlogloss:0.83154	test-mlogloss:0.82299
[400]	train-mlogloss:0.77205	test-mlogloss:0.79654
[500]	train-mlogloss:0.72327	test-mlogloss:0.77691
[600]	train-mlogloss:0.68188	test-mlogloss:0.76221
[700]	train-mlogloss:0.64579	test-mlogloss:0.75060
[800]	train-mlogloss:0.61307	test-mlogloss:0.74007
[900]	train-mlogloss:0.58374	test-mlogloss:0.73147
[1000]	train-mlogloss:0.55650	test-mlogloss:0.72422
[1100]	train-mlogloss:0.53170	test-mlogloss:0.71781
[1200]	train-mlogloss:0.50895	test-mlogloss:0.71211
[1300]	train-mlogloss:0.48801	test-mlogloss:0.70729
[1400]	train-mlogloss:0.46804	test-mlogloss:0.70254
[1500]	train-mlogloss:0.44972	test-mlogloss:0.69850
[1600]	train-mlogloss:0.43220	test-mlogloss:0.69501
[1700]	train-mlogloss:0.41594	test-mlogloss:0.69138
[1800]	train-mlogloss:0.40057	test-mlogloss:0.6

In [3]:
# Data-preparation cell: load the training table, map the labels to fixed
# integer ids, drop blacklisted columns, impute, clean column names, encode
# low-cardinality features, split into train/test and derive class/sample
# weights.
X_y_group_train = pd.read_csv('mid_data/X_y_group_train_updated_v13.4_rolling.csv')

print("##### Adding numeric labels y")
# Add the numeric label column y using a fixed, hand-written class order.
# NOTE(review): classes_ is normally produced by fit(); assigning an unsorted
# array here relies on transform() mapping each label to its position in this
# list — confirm with the installed scikit-learn version.
le = LabelEncoder()
le.classes_ = np.array([
    'Confounder', 'Collider',
    'Mediator', 'Independent',
    'Cause of X', 'Consequence of X', 
    'Cause of Y', 'Consequence of Y',
])
X_y_group_train["y"] = le.transform(X_y_group_train["label"])
# Reorder the columns: dataset/variable first, features in between, label and y last
X_y_group_train = X_y_group_train[["dataset", "variable"] + X_y_group_train.columns.drop(["dataset", "variable", "label", "y"]).tolist() + ["label", "y"]]

print("##### Data Preprocessing...")
# Columns to remove (only those actually present are dropped below)
blacklist = [
    "ttest(v,X)", 
    "pvalue(ttest(v,X))<=0.05", 
    "ttest(v,Y)", 
    "pvalue(ttest(v,Y))<=0.05", 
    "ttest(X,Y)", 
    "pvalue(ttest(X,Y))<=0.05",
    "square_dimension", 
    "max(PPS(v,others))",
    "TLI_Collider",
    "TLI_Confounder",
    "RMSEA_Collider",
    "RMSEA_Confounder",
]
columns_to_drop = [col for col in blacklist if col in X_y_group_train.columns]
X_y_group_train = X_y_group_train.drop(columns=columns_to_drop)
print('     1.删除多余列后样本量', X_y_group_train.shape)
# Fill missing values: infinities become NaN, then numeric columns are filled
# with the column mean.
# NOTE(review): the means are computed over all rows, before the train/test
# split further down, so held-out rows contribute to the imputation values.
X_y_group_train = X_y_group_train.replace([np.inf, -np.inf], np.nan)
numeric_columns = X_y_group_train.select_dtypes(include=[np.number]).columns
X_y_group_train[numeric_columns] = X_y_group_train[numeric_columns].fillna(X_y_group_train[numeric_columns].mean())
print('     2.填充缺失值后样本量', X_y_group_train.shape)
# Clean the feature names
X_y_group_train = clean_feature_names(X_y_group_train)
print('     3.清理特征名称后样本量', X_y_group_train.shape)

print("##### Extracting X_train, y_train")
# Features are everything except the id columns and the two label columns
X = X_y_group_train.drop(["variable", "dataset", "label", "y"], axis="columns")
y = X_y_group_train["y"]

# Encode the low-cardinality (categorical) features
cat_idxs, cat_dims, X = process_categorical_features(X)
print("     ->类别特征索引 (cat_idxs):", cat_idxs)
print("     ->类别特征模态数 (cat_dims):", cat_dims)
print('     4.处理类别特征后样本量', X.shape, y.shape)

print("##### Extracting X_train, y_train, and group")
# Split into training and test sets: 90/10, stratified on the label, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)
print("     5.分割数据集后样本量:", X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print("     ->y_train 唯一值:", np.unique(y_train))
print("     ->y_test 唯一值:", np.unique(y_test))

print("##### Computing class weights")
# Balanced class weights computed from the training labels only
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
# Class weights as a plain list
class_weights = list(class_weights) 
print("     ->分类目标权重列表:", class_weights)
# Class weights as a {class id: weight} dictionary
weight_dict = dict(zip(classes, class_weights))
print("     ->分类目标权重字典:", weight_dict)
# Per-row sample weights for the training set
sample_weights = y_train.map(weight_dict)

##### Adding numeric labels y
##### Data Preprocessing...
     1.删除多余列后样本量 (142910, 207)
     2.填充缺失值后样本量 (142910, 207)
     3.清理特征名称后样本量 (142910, 207)
##### Extracting X_train, y_train
     处理类别特征: dimension，唯一值数量: 8
     处理类别特征: ExactSearch_v_X_，唯一值数量: 2
     处理类别特征: ExactSearch_X_v_，唯一值数量: 2
     处理类别特征: ExactSearch_v_Y_，唯一值数量: 2
     处理类别特征: ExactSearch_Y_v_，唯一值数量: 2
     处理类别特征: ExactSearch_X_Y_，唯一值数量: 2
     处理类别特征: PC_v_X_，唯一值数量: 2
     处理类别特征: PC_X_v_，唯一值数量: 2
     处理类别特征: PC_v_Y_，唯一值数量: 2
     处理类别特征: PC_Y_v_，唯一值数量: 2
     处理类别特征: PC_X_Y_，唯一值数量: 2
     处理类别特征: FCI_v_X_，唯一值数量: 4
     处理类别特征: FCI_X_v_，唯一值数量: 4
     处理类别特征: FCI_v_Y_，唯一值数量: 4
     处理类别特征: FCI_Y_v_，唯一值数量: 4
     处理类别特征: FCI_X_Y_，唯一值数量: 4
     处理类别特征: GRaSP_v_X_，唯一值数量: 3
     处理类别特征: GRaSP_X_v_，唯一值数量: 3
     处理类别特征: GRaSP_v_Y_，唯一值数量: 3
     处理类别特征: GRaSP_Y_v_，唯一值数量: 3
     处理类别特征: GRaSP_X_Y_，唯一值数量: 3
     ->类别特征索引 (cat_idxs): [0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 137, 138, 139, 140, 141]
  

In [5]:
from xgboost import XGBClassifier, callback

# Train a multi-class XGBClassifier with early stopping, evaluate it with
# balanced accuracy and save it to disk. Relies on X_train, X_test, y_train,
# y_test and classes defined by the data-preparation cell.

# Balanced class weights, expanded to one weight per training row
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
weight_dict = dict(zip(classes, class_weights))
sample_weights = y_train.map(weight_dict)

# Early-stopping callback: stop after 20 rounds without a lower mlogloss and
# keep the best model.
early_stop = callback.EarlyStopping(
    rounds=20,
    save_best=True,
    maximize=False,
    metric_name='mlogloss'
)

# Define the XGBClassifier.
# `use_label_encoder` is no longer passed: it is a deprecated constructor
# argument and y is already integer-encoded.
xgb_model = XGBClassifier(
    max_depth=5,
    learning_rate=0.05,
    objective='multi:softmax',
    # Derived from the labels instead of a hard-coded 8 so the cell keeps
    # working if the label set changes.
    num_class=len(classes),
    eval_metric='mlogloss',
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_weight=1,
    gamma=0.3,
    reg_lambda=10,
    reg_alpha=10,
    tree_method='hist',#'gpu_hist',      # 'hist' also works when no GPU is available
    max_delta_step=0.5,
    n_estimators=2000,
    verbosity=3,
    random_state=42,
    callbacks=[early_stop]
)

# NOTE(review): the only eval_set is the hold-out split that is scored below,
# so early stopping is tuned on the data used for the reported test metrics.
# A separate validation split would avoid that.
xgb_model.fit(
    X_train, y_train,
    sample_weight=sample_weights,
    eval_set=[(X_test, y_test)],
    verbose=True
)

# Predict
y_train_pred = xgb_model.predict(X_train)
y_test_pred = xgb_model.predict(X_test)

# Calculate balanced accuracy
train_score = balanced_accuracy_score(y_train, y_train_pred)
test_score = balanced_accuracy_score(y_test, y_test_pred)
print(f"Training set balanced accuracy: {train_score:.6f}")
print(f"Test set balanced accuracy: {test_score:.6f}")

# Print classification report
print("Test set classification report:")
print(classification_report(y_test, y_test_pred))

# Save the model
xgb_model.save_model('xgboost_model.json')
print("Model saved as 'xgboost_model.json'")

[21:32:04] AllReduce: 0.038344s, 1 calls @ 38344us

[21:32:04] MakeCuts: 0.038778s, 1 calls @ 38778us

[21:32:04] DEBUG: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0b3782d1791676daf-1\xgboost\xgboost-ci-windows\src\gbm\gbtree.cc:130: Using tree method: 3
[0]	validation_0-mlogloss:2.05365
[1]	validation_0-mlogloss:2.02799
[2]	validation_0-mlogloss:2.00282
[3]	validation_0-mlogloss:1.97827
[4]	validation_0-mlogloss:1.95315
[5]	validation_0-mlogloss:1.92883
[6]	validation_0-mlogloss:1.90458
[7]	validation_0-mlogloss:1.88071
[8]	validation_0-mlogloss:1.85750
[9]	validation_0-mlogloss:1.83395


KeyboardInterrupt: 