In [1]:
'''
Author: gu guguoqin66@gmail.com
Date: 2024-10-11 08:18:26
LastEditors: gu guguoqin66@gmail.com
LastEditTime: 2024-10-11 08:30:22
FilePath: \ADIA-Lab-Causal-Discovery\try_catboost.ipynb
Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
'''
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

from catboost import CatBoostClassifier, Pool
import matplotlib.pyplot as plt

In [2]:
def clean_feature_names(X):
    """Sanitize the column labels of ``X`` in place and return the same frame.

    Every label is converted to ``str``; each character that is not a word
    character, whitespace, or a hyphen is replaced by ``_``; and a label that
    starts with a digit is prefixed with ``f_``.

    Parameters
    ----------
    X : pd.DataFrame
        Frame whose ``columns`` attribute is overwritten.

    Returns
    -------
    pd.DataFrame
        The very same object that was passed in, now with cleaned labels.

    Notes
    -----
    Distinct labels can collapse onto the same cleaned name (for example
    ``"a(b"`` and ``"a)b"`` both become ``"a_b"``); no de-duplication is done.
    """
    def clean_name(name):
        # Column labels are not guaranteed to be strings (e.g. integer
        # labels); re.sub would raise TypeError on them, so coerce first.
        name = str(name)
        # Replace every special character with an underscore.
        name = re.sub(r'[^\w\s-]', '_', name)
        # Make sure the name does not start with a digit.
        if name and name[0].isdigit():
            name = 'f_' + name
        return name

    X.columns = [clean_name(col) for col in X.columns]
    return X

# Automatically detect and label-encode categorical features.
def process_categorical_features(df, max_unique=10):
    """Detect low-cardinality columns and label-encode them in place.

    A column is treated as categorical when ``df[col].nunique()`` (which does
    not count missing values) is at most ``max_unique``. Each such column is
    converted to strings, with missing values mapped to the explicit
    placeholder ``'NaN'``, and then replaced by the integer codes produced by
    a fresh ``LabelEncoder``.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. It is modified in place: every detected categorical
        column is overwritten with its integer codes.
    max_unique : int, default 10
        Maximum number of distinct values for a column to be considered
        categorical.

    Returns
    -------
    cat_idxs : list of int
        Positional indices (within ``df.columns``) of the categorical columns.
    cat_dims : list of int
        Number of encoded classes for each categorical column, in the same
        order as ``cat_idxs``. This can be one more than the printed unique
        count when the column contains missing values.
    df : pd.DataFrame
        The same frame object that was passed in, after encoding.
    """
    cat_dims = []
    cat_idxs = []

    for col in list(df.columns):
        n_unique = df[col].nunique()
        if n_unique > max_unique:
            continue

        print(f"处理类别特征: {col}，唯一值数量: {n_unique}")

        # Substitute the placeholder BEFORE stringifying: once astype(str) has
        # run, missing values are already the strings 'nan'/'None' and a
        # later fillna() can no longer see them.
        values = df[col].astype(object)
        values = values.where(values.notna(), 'NaN').astype(str)

        le = LabelEncoder()
        df[col] = le.fit_transform(values)
        cat_dims.append(len(le.classes_))
        cat_idxs.append(df.columns.get_loc(col))

    return cat_idxs, cat_dims, df

In [3]:
# Load the pre-computed feature table (one row per dataset/variable pair,
# judging by the "dataset" and "variable" columns used below).
X_y_group_train = pd.read_csv('./mid_data/X_y_group_train_updated_v13.4_rolling.csv')

print("Adding numeric labels y")
# Encode the string labels in "label" as integer class ids in a new column "y".
le = LabelEncoder()
X_y_group_train["y"] = le.fit_transform(X_y_group_train["label"])
# Reorder the columns: identifiers first, then all features, then label and y.
X_y_group_train = X_y_group_train[["dataset", "variable"] + X_y_group_train.columns.drop(["dataset", "variable", "label", "y"]).tolist() + ["label", "y"]]

# Columns to remove from the feature set (only those actually present are dropped).
blacklist = [
    "ttest(v,X)", 
    "pvalue(ttest(v,X))<=0.05", 
    "ttest(v,Y)", 
    "pvalue(ttest(v,Y))<=0.05", 
    "ttest(X,Y)", 
    "pvalue(ttest(X,Y))<=0.05",
    "square_dimension", 
    "max(PPS(v,others))",
    "TLI_Collider",
    "TLI_Confounder",
    "RMSEA_Collider",
    "RMSEA_Confounder"
]
columns_to_drop = [col for col in blacklist if col in X_y_group_train.columns]
X_y_group_train = X_y_group_train.drop(columns=columns_to_drop)

# Fill missing values in every numeric column with that column's mean.
# NOTE(review): the means are computed on the full frame, i.e. before the
# train/test split further down, so held-out rows contribute to the imputation
# values. Confirm this is acceptable for the reported test score.
numeric_columns = X_y_group_train.select_dtypes(include=[np.number]).columns
X_y_group_train[numeric_columns] = X_y_group_train[numeric_columns].fillna(X_y_group_train[numeric_columns].mean())

# Sanitize the column names (special characters are replaced by underscores).
X_y_group_train = clean_feature_names(X_y_group_train)

print("Extracting X_train, y_train, and group")
# Separate the dataset id, the feature matrix and the target.
# NOTE(review): group_train is not used by any later cell visible in this
# notebook; the split below is stratified on y only, so rows that share a
# "dataset" value can land on both sides of the split. TODO confirm intent.
group_train = X_y_group_train["dataset"]
X = X_y_group_train.drop(["variable", "dataset", "label", "y"], axis="columns")
y = X_y_group_train["y"]

# Detect and label-encode the categorical features (default max_unique threshold).
cat_idxs, cat_dims, X = process_categorical_features(X)
print(f"类别特征索引 (cat_idxs): {cat_idxs}")
print(f"类别特征模态数 (cat_dims): {cat_dims}")

# Split into training and test sets (80/20, stratified on y, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("y_train 唯一值:", np.unique(y_train))
print("y_test 唯一值:", np.unique(y_test))

# Wrap both splits in CatBoost Pool objects, marking the categorical columns
# by their positional indices.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_idxs)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_idxs)

# Compute per-class weights ('balanced' mode, i.e. inverse class frequency)
# from the training labels; the weights follow the sorted order of `classes`.
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = list(class_weights)  # convert the ndarray to a plain list
print(f"类别权重: {class_weights}")

Adding numeric labels y
Extracting X_train, y_train, and group
处理类别特征: dimension，唯一值数量: 8
处理类别特征: ExactSearch_v_X_，唯一值数量: 2
处理类别特征: ExactSearch_X_v_，唯一值数量: 2
处理类别特征: ExactSearch_v_Y_，唯一值数量: 2
处理类别特征: ExactSearch_Y_v_，唯一值数量: 2
处理类别特征: ExactSearch_X_Y_，唯一值数量: 2
处理类别特征: PC_v_X_，唯一值数量: 2
处理类别特征: PC_X_v_，唯一值数量: 2
处理类别特征: PC_v_Y_，唯一值数量: 2
处理类别特征: PC_Y_v_，唯一值数量: 2
处理类别特征: PC_X_Y_，唯一值数量: 2
处理类别特征: FCI_v_X_，唯一值数量: 4
处理类别特征: FCI_X_v_，唯一值数量: 4
处理类别特征: FCI_v_Y_，唯一值数量: 4
处理类别特征: FCI_Y_v_，唯一值数量: 4
处理类别特征: FCI_X_Y_，唯一值数量: 4
处理类别特征: GRaSP_v_X_，唯一值数量: 3
处理类别特征: GRaSP_X_v_，唯一值数量: 3
处理类别特征: GRaSP_v_Y_，唯一值数量: 3
处理类别特征: GRaSP_Y_v_，唯一值数量: 3
处理类别特征: GRaSP_X_Y_，唯一值数量: 3
类别特征索引 (cat_idxs): [0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 137, 138, 139, 140, 141]
类别特征模态数 (cat_dims): [8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3]
y_train 唯一值: [0 1 2 3 4 5 6 7]
y_test 唯一值: [0 1 2 3 4 5 6 7]
类别权重: [1.450715663384428, 0.6722329366386002, 3.281515499425947, 1.975532209012994, 0.902665

In [7]:

# Hyper-parameters of the multi-class CatBoost model, gathered in one mapping
# so they can be inspected or tuned in a single place.
catboost_params = {
    # Boosting schedule
    "iterations": 1000,
    "learning_rate": 0.05,
    # Tree shape and feature sub-sampling
    "depth": 5,
    "rsm": 0.95,
    # Regularisation
    "l2_leaf_reg": 0.5,
    "model_size_reg": 0.5,
    # Problem definition (class_weights / cat_idxs come from the data cell)
    "classes_count": 8,
    "class_weights": class_weights,
    "cat_features": cat_idxs,
    "random_seed": 42,
    "loss_function": 'MultiClass',
    "eval_metric": 'Accuracy',
    # Logging, early stopping and hardware
    "verbose": 100,
    "early_stopping_rounds": 20,
    "task_type": 'CPU',
}
model = CatBoostClassifier(**catboost_params)
display(model)

# Train on the training pool. The held-out pool acts as the evaluation set for
# early stopping, and the best iteration on it is the one that is kept.
model.fit(train_pool, eval_set=test_pool, use_best_model=True)

# Predict class labels for both splits.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Balanced accuracy on each split.
train_score = balanced_accuracy_score(y_train, y_train_pred)
test_score = balanced_accuracy_score(y_test, y_test_pred)
print(f"训练集平衡准确率: {train_score:.6f}")
print(f"测试集平衡准确率: {test_score:.6f}")

# Per-class precision / recall / F1 on the held-out split.
print("测试集分类报告:")
print(classification_report(y_test, y_test_pred))

<catboost.core.CatBoostClassifier at 0x20c3a2c2710>

0:	learn: 0.4854429	test: 0.4876836	best: 0.4876836 (0)	total: 677ms	remaining: 11m 16s
100:	learn: 0.5640418	test: 0.5597444	best: 0.5597444 (100)	total: 1m 22s	remaining: 12m 16s
200:	learn: 0.6027238	test: 0.5934830	best: 0.5934830 (200)	total: 3m 10s	remaining: 12m 38s
300:	learn: 0.6333405	test: 0.6196817	best: 0.6201642 (296)	total: 5m 36s	remaining: 13m 1s
400:	learn: 0.6542064	test: 0.6324389	best: 0.6326699 (398)	total: 7m 17s	remaining: 10m 53s
Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.6382148426
bestIteration = 462

Shrink model to first 463 iterations.
训练集平衡准确率: 0.663321
测试集平衡准确率: 0.638219
测试集分类报告:
              precision    recall  f1-score   support

           0       0.55      0.66      0.60      2463
           1       0.81      0.75      0.78      5315
           2       0.39      0.57      0.46      1089
           3       0.55      0.55      0.55      1809
           4       0.69      0.58      0.63      3958
           5       0.54      0.

In [None]:
# Feature importances of the fitted CatBoost model.
# The estimator trained in the previous cell is bound to `model`; no `clf`
# is defined by any cell in this notebook, so referencing it raises NameError
# on a fresh Restart & Run All.
feature_importances = model.get_feature_importance()
feature_names = X_train.columns
feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importances
}).sort_values('importance', ascending=False)

# Show the 20 most important features.
print(feature_importance_df.head(20))

# Horizontal bar chart of the same 20 features; the order is reversed so the
# most important feature ends up at the top of the plot.
top_features = feature_importance_df.head(20).iloc[::-1]
fig, ax = plt.subplots(figsize=(10, 12))
ax.barh(top_features['feature'], top_features['importance'])
ax.set_xlabel('Importance')
ax.set_title('Top 20 Feature Importances in CatBoost Model')
fig.tight_layout()
plt.show()