In [8]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

import xgboost as xgb
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_class_weight

from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

In [9]:
def clean_feature_names(X):
    def clean_name(name):
        # 将空格替换为下划线
        name = name.replace(' ', '_')
        # 将逗号替换为下划线
        name = name.replace(',', '_')
        # 移除或替换其他特殊字符
        name = re.sub(r'[^\w\-]', '_', name)
        # 确保名称不以数字开头
        if name and name[0].isdigit():
            name = 'f_' + name
        # 移除连续的下划线
        name = re.sub(r'_+', '_', name)
        # 移除开头和结尾的下划线
        name = name.strip('_')
        return name

    X.columns = [clean_name(col) for col in X.columns]
    return X


def remove_columns_with_keywords(df, keywords):
    columns_to_drop = [col for col in df.columns if any(keyword in col for keyword in keywords)]
    return df.drop(columns=columns_to_drop)

def get_columns_with_keywords(df, keywords):
    columns_to_get = [col for col in df.columns if any(keyword in col for keyword in keywords)]
    return columns_to_get

def recovery_dataset_id(X_y_group_train):
    # 假设'dataset'列是需要转换的列
    X_y_group_train['dataset'] = X_y_group_train['dataset'].apply(lambda x: f'{int(x):05}')
    return X_y_group_train


def process_categorical_features(df, max_unique=10):
    """
    检测和处理数据框中的类别变量。

    参数：
    - df (pd.DataFrame): 输入的数据框。
    - max_unique (int): 判定为类别变量的最大唯一值数量。

    返回：
    - cat_idxs (list of int): 类别特征的索引。
    - cat_dims (list of int): 每个类别特征的模态数。
    - df (pd.DataFrame): 经过编码后的数据框。
    """
    cat_cols = [col for col in df.columns if df[col].nunique() <= max_unique]
    cat_dims = []
    cat_idxs = []

    for col in cat_cols:
        print(f"     处理类别特征: {col}，唯一值数量: {df[col].nunique()}")
        # 使用 LabelEncoder
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str).fillna('NaN'))
        cat_dims.append(len(le.classes_))
        cat_idxs.append(df.columns.get_loc(col))

    return cat_idxs, cat_dims, df

In [10]:
X_y_group_train = pd.read_csv('mid_data/X_y_group_train_updated_v13.4_rolling.csv')

print("##### Adding numeric labels y")
# 添加数值标签 y
le = LabelEncoder()
X_y_group_train["y"] = le.fit_transform(X_y_group_train["label"])
# 重新排列列
X_y_group_train = X_y_group_train[["dataset", "variable"] + X_y_group_train.columns.drop(["dataset", "variable", "label", "y"]).tolist() + ["label", "y"]]

print("##### Data Preprocessing...")
# 定义要删除的列
blacklist = [
    "ttest(v,X)", 
    "pvalue(ttest(v,X))<=0.05", 
    "ttest(v,Y)", 
    "pvalue(ttest(v,Y))<=0.05", 
    "ttest(X,Y)", 
    "pvalue(ttest(X,Y))<=0.05",
    "square_dimension", 
    "max(PPS(v,others))",
    "TLI_Collider",
    "TLI_Confounder",
    "RMSEA_Collider",
    "RMSEA_Confounder",
    # "RMSEA_Cause of Y",
]
columns_to_drop = [col for col in blacklist if col in X_y_group_train.columns]
X_y_group_train = X_y_group_train.drop(columns=columns_to_drop)
print('     删除多余列后样本量', X_y_group_train.shape)

# 处理无穷值：转换为NaN
X_y_group_train = X_y_group_train.replace([np.inf, -np.inf], np.nan)

# 处理数值列的缺失值
numeric_columns = X_y_group_train.select_dtypes(include=[np.number]).columns
X_y_group_train[numeric_columns] = X_y_group_train[numeric_columns].fillna(X_y_group_train[numeric_columns].mean())

# 清理特征名称
X_y_group_train = clean_feature_names(X_y_group_train)

print("##### Extracting X_train, y_train, and group")
# 分离数据集ID、特征和标签
group_train = X_y_group_train["dataset"]
X = X_y_group_train.drop(["variable", "dataset", "label", "y"], axis="columns")
y = X_y_group_train["y"]

# 处理类别特征
cat_idxs, cat_dims, X = process_categorical_features(X)
print(f"     类别特征索引 (cat_idxs): {cat_idxs}")
print(f"     类别特征模态数 (cat_dims): {cat_dims}")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train = clean_feature_names(X_train)
X_test = clean_feature_names(X_test)

print("     分割数据集后X_train & X_test shape:", X_train.shape, X_test.shape)
print("     y_train 唯一值:", np.unique(y_train))
print("     y_test 唯一值:", np.unique(y_test))

classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = list(class_weights)  # 转换为列表
print(f"类别权重: {class_weights}")

##### Adding numeric labels y
##### Data Preprocessing...
     删除多余列后样本量 (142910, 207)
##### Extracting X_train, y_train, and group
     处理类别特征: dimension，唯一值数量: 8
     处理类别特征: ExactSearch_v_X，唯一值数量: 2
     处理类别特征: ExactSearch_X_v，唯一值数量: 2
     处理类别特征: ExactSearch_v_Y，唯一值数量: 2
     处理类别特征: ExactSearch_Y_v，唯一值数量: 2
     处理类别特征: ExactSearch_X_Y，唯一值数量: 2
     处理类别特征: PC_v_X，唯一值数量: 2
     处理类别特征: PC_X_v，唯一值数量: 2
     处理类别特征: PC_v_Y，唯一值数量: 2
     处理类别特征: PC_Y_v，唯一值数量: 2
     处理类别特征: PC_X_Y，唯一值数量: 2
     处理类别特征: FCI_v_X，唯一值数量: 4
     处理类别特征: FCI_X_v，唯一值数量: 4
     处理类别特征: FCI_v_Y，唯一值数量: 4
     处理类别特征: FCI_Y_v，唯一值数量: 4
     处理类别特征: FCI_X_Y，唯一值数量: 4
     处理类别特征: GRaSP_v_X，唯一值数量: 3
     处理类别特征: GRaSP_X_v，唯一值数量: 3
     处理类别特征: GRaSP_v_Y，唯一值数量: 3
     处理类别特征: GRaSP_Y_v，唯一值数量: 3
     处理类别特征: GRaSP_X_Y，唯一值数量: 3
     类别特征索引 (cat_idxs): [0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 137, 138, 139, 140, 141]
     类别特征模态数 (cat_dims): [8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 3, 3

In [11]:


# 计算类别权重
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
weight_dict = dict(zip(classes, class_weights))
sample_weights = y_train.map(weight_dict)

# 设置XGBoost参数
params = {
    'max_depth': 5,
    'learning_rate': 0.05,
    'objective': 'multi:softmax',
    'num_class': 8,
    'eval_metric': 'mlogloss',
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 1,
    'gamma': 0.3,
    'reg_lambda': 10,
    'reg_alpha': 10,
    'tree_method': 'gpu_hist',  # 使用GPU
    'max_delta_step': 0.5,
    'n_estimators': 2000,
    'early_stopping_rounds': 50,
    'verbose': 100
}

# 创建XGBClassifier
xgb_model = XGBClassifier(**params)

# 训练模型
xgb_model.fit(
    X_train, 
    y_train,
    sample_weight=sample_weights,
    eval_set=[(X_test, y_test)],
    verbose=100
)

# 预测和评估
y_test_pred = xgb_model.predict(X_test)
test_score = balanced_accuracy_score(y_test, y_test_pred)
print(f"Test set balanced accuracy: {test_score:.6f}")


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "verbose" } are not used.



[0]	validation_0-mlogloss:2.00697
[100]	validation_0-mlogloss:0.95882
[200]	validation_0-mlogloss:0.86604
[300]	validation_0-mlogloss:0.82351
[400]	validation_0-mlogloss:0.79710
[500]	validation_0-mlogloss:0.77741
[600]	validation_0-mlogloss:0.76252
[700]	validation_0-mlogloss:0.75050
[800]	validation_0-mlogloss:0.74055
[900]	validation_0-mlogloss:0.73211
[1000]	validation_0-mlogloss:0.72469
[1100]	validation_0-mlogloss:0.71852
[1200]	validation_0-mlogloss:0.71294
[1300]	validation_0-mlogloss:0.70818
[1400]	validation_0-mlogloss:0.70344
[1500]	validation_0-mlogloss:0.69925
[1600]	validation_0-mlogloss:0.69568
[1700]	validation_0-mlogloss:0.69219
[1800]	validation_0-mlogloss:0.68913
[1900]	validation_0-mlogloss:0.68618
[1999]	validation_0-mlogloss:0.68373



    E.g. tree_method = "hist", device = "cuda"



Test set balanced accuracy: 0.679183


In [12]:
# 创建Pool对象
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_idxs)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_idxs)

# 计算类别权重（使用每个类别的逆频率）
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = list(class_weights)  # 转换为列表
print(f"类别权重: {class_weights}")

cat_model = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.05,
    depth=7,
    
    l2_leaf_reg=0.1,
    model_size_reg=0.1,
    classes_count=8,
    class_weights=class_weights,
    cat_features=cat_idxs,
    random_seed=42,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    verbose=100,
    early_stopping_rounds=50,
    task_type='GPU',
    devices='0:1',  # 如果使用GPU,指定GPU设备
    save_snapshot=False,
    train_dir="./tmp",
    leaf_estimation_method='Newton',  # 默认方法
)


# 训练模型
cat_model.fit(
    train_pool,
    eval_set=test_pool,
    use_best_model=True
)

y_test_pred = cat_model.predict(test_pool)
test_score = balanced_accuracy_score(y_test, y_test_pred)
print(f"Test set balanced accuracy: {test_score:.6f}")


类别权重: [1.450715663384428, 0.6722329366386002, 3.281515499425947, 1.975532209012994, 0.9026654876200101, 2.093305990918412, 0.32411040301181593, 2.930285011277425]
0:	learn: 0.5276560	test: 0.5308194	best: 0.5308194 (0)	total: 53ms	remaining: 1m 46s
100:	learn: 0.6035374	test: 0.5852222	best: 0.5852222 (100)	total: 4.13s	remaining: 1m 17s
200:	learn: 0.6714016	test: 0.6297306	best: 0.6297306 (200)	total: 8.34s	remaining: 1m 14s
300:	learn: 0.7135642	test: 0.6460039	best: 0.6464452 (298)	total: 12.4s	remaining: 1m 9s
400:	learn: 0.7460634	test: 0.6555250	best: 0.6558477 (399)	total: 16.4s	remaining: 1m 5s
500:	learn: 0.7746371	test: 0.6617700	best: 0.6620216 (498)	total: 20.4s	remaining: 1m 1s
600:	learn: 0.7973425	test: 0.6645677	best: 0.6649570 (598)	total: 25s	remaining: 58.3s
700:	learn: 0.8158744	test: 0.6676719	best: 0.6678708 (692)	total: 30.2s	remaining: 55.9s
800:	learn: 0.8316080	test: 0.6703876	best: 0.6704599 (786)	total: 35.7s	remaining: 53.4s
900:	learn: 0.8465332	test: 0.6

In [13]:
lgb_model = lgb.LGBMClassifier(
    n_estimators=3000,
    learning_rate=0.05,
    num_leaves=21,
    max_depth=5,
    min_child_samples=50,
    colsample_bytree=0.6,
    reg_alpha=25.0,
    reg_lambda=25.0,
    random_state=42,
    class_weight='balanced',
    device='cpu',
    verbose=-1,
)


# 定义回调函数
callbacks = [lgb.log_evaluation(period=100), lgb.early_stopping(stopping_rounds=10)]

# 训练模型
lgb_model.fit(X_train, y_train, 
          callbacks=callbacks, 
          eval_set=[(X_test, y_test)],
          categorical_feature=cat_idxs)

y_test_pred = lgb_model.predict(X_test)
test_score = balanced_accuracy_score(y_test, y_test_pred)
print(f"Test set balanced accuracy: {test_score:.6f}")


Training until validation scores don't improve for 10 rounds
[100]	valid_0's multi_logloss: 0.901446
[200]	valid_0's multi_logloss: 0.826832
[300]	valid_0's multi_logloss: 0.792176
[400]	valid_0's multi_logloss: 0.769355
[500]	valid_0's multi_logloss: 0.752858
[600]	valid_0's multi_logloss: 0.7406
[700]	valid_0's multi_logloss: 0.731287
[800]	valid_0's multi_logloss: 0.723438
[900]	valid_0's multi_logloss: 0.717331
[1000]	valid_0's multi_logloss: 0.712296
[1100]	valid_0's multi_logloss: 0.708135
[1200]	valid_0's multi_logloss: 0.7053
[1300]	valid_0's multi_logloss: 0.702788
[1400]	valid_0's multi_logloss: 0.701154
[1500]	valid_0's multi_logloss: 0.699705
[1600]	valid_0's multi_logloss: 0.69855
[1700]	valid_0's multi_logloss: 0.697572
[1800]	valid_0's multi_logloss: 0.697151
Early stopping, best iteration is:
[1845]	valid_0's multi_logloss: 0.697067
Test set balanced accuracy: 0.677288


In [22]:

def filter_features(X_test, model):
    """
    过滤测试数据集，只保留模型训练时使用的特征。

    参数:
    X_test : pandas.DataFrame 或 numpy.array
        需要进行预测的测试数据
    model : 已训练的模型
        包含 feature_names_in_ 属性的模型（如sklearn的大多数模型）

    返回:
    pandas.DataFrame 或 numpy.array
        只包含模型训练时使用的特征的测试数据
    """
    if hasattr(model, 'feature_names_in_'):
        # 获取模型训练时使用的特征名称
        model_features = model.feature_names_in_
        
        if isinstance(X_test, pd.DataFrame):
            # 对于DataFrame，我们可以直接使用列名
            common_features = list(set(X_test.columns) & set(model_features))
            missing_features = set(model_features) - set(X_test.columns)
            if missing_features:
                print(f"警告: 测试数据缺少 {len(missing_features)} 个训练时使用的特征: {missing_features}")
            extra_features = set(X_test.columns) - set(model_features)
            if extra_features:
                print(f"警告: 移除了 {len(extra_features)} 个在训练时未使用的特征: {extra_features}")
            return X_test[common_features]
        elif isinstance(X_test, np.ndarray):
            # 对于numpy数组，我们假设特征的顺序与训练时相同
            if X_test.shape[1] > len(model_features):
                print(f"警告: 测试数据包含额外的特征。只使用前 {len(model_features)} 个特征。")
                return X_test[:, :len(model_features)]
            elif X_test.shape[1] < len(model_features):
                missing_count = len(model_features) - X_test.shape[1]
                print(f"错误: 测试数据的特征数 ({X_test.shape[1]}) 少于模型训练时的特征数 ({len(model_features)})")
                print(f"缺少的特征数量: {missing_count}")
                raise ValueError("特征数量不匹配")
            return X_test
    else:
        print("警告: 模型没有 feature_names_in_ 属性。无法验证特征。")
        return X_test
def align_features(X, model):
    """
    调整输入特征的顺序，使其与模型训练时的特征顺序一致。

    参数:
    X : pandas.DataFrame 或 numpy.ndarray
        需要调整顺序的输入特征
    model : 已训练的模型
        包含 feature_names_in_ 属性的模型（如sklearn的大多数模型）

    返回:
    pandas.DataFrame 或 numpy.ndarray
        特征顺序调整后的数据
    """
    if not hasattr(model, 'feature_names_in_'):
        print("警告: 模型没有 feature_names_in_ 属性。无法调整特征顺序。")
        return X

    model_features = model.feature_names_in_

    if isinstance(X, pd.DataFrame):
        # 对于DataFrame，我们可以直接使用列名重新排序
        if set(X.columns) != set(model_features):
            raise ValueError("输入特征与模型特征不完全匹配。")
        return X.reindex(columns=model_features)

    elif isinstance(X, np.ndarray):
        if X.shape[1] != len(model_features):
            raise ValueError("输入特征数量与模型特征数量不匹配。")
        
        # 对于numpy数组，我们需要创建一个映射来重新排序
        current_features = [f"feature_{i}" for i in range(X.shape[1])]
        df = pd.DataFrame(X, columns=current_features)
        feature_mapping = dict(zip(current_features, X.columns if isinstance(X, pd.DataFrame) else model_features))
        df = df.rename(columns=feature_mapping)
        return df.reindex(columns=model_features).values

    else:
        raise ValueError("输入X必须是pandas.DataFrame或numpy.ndarray。")
    

X_test = filter_features(X_test, lgb_model)
X_test = align_features(X_test, lgb_model)


In [15]:


X_test = clean_feature_names(X_test)


In [23]:
X_test

Unnamed: 0,dimension,corr_v_X,corr_v_Y,max_corr_v_others,min_corr_v_others,mean_corr_v_others,std_corr_v_others,corr_X_Y,MI_v_X,MI_v_Y,...,RMSEA_Cause_of_Y,RMSEA_Consequence_of_X,RMSEA_Consequence_of_Y,RMSEA_Mediator,max_abs_rolling_corr_v_X,min_abs_rolling_corr_v_X,mean_abs_rolling_corr_v_X,max_abs_rolling_corr_v_Y,min_abs_rolling_corr_v_Y,mean_abs_rolling_corr_v_Y
136099,7,0.341253,-0.157367,0.496141,0.023983,0.246368,0.186375,-0.465101,0.075056,0.001524,...,0.0,0.000000,0.312787,0.466795,0.253211,5.541766e-04,0.111567,0.220188,6.257377e-05,0.073648
36308,7,0.696308,-0.375458,0.696308,0.002284,0.203092,0.258201,-0.548561,0.405580,0.068996,...,0.0,0.000000,0.714819,0.453169,0.533402,8.013066e-02,0.266000,0.244580,1.881518e-02,0.116924
67155,7,-0.405111,0.381615,0.784996,0.000095,0.362493,0.235215,-0.954001,0.098940,0.113460,...,0.0,0.000000,0.145635,1.501144,0.465064,2.309441e-03,0.156133,0.451935,4.302290e-05,0.149570
139667,7,-0.436874,-0.417682,0.998679,0.060183,0.441799,0.292780,0.945741,0.209797,0.165993,...,0.0,0.000000,0.139013,1.434547,0.348513,2.637234e-07,0.136821,0.320583,5.351454e-04,0.129992
90225,7,-0.018641,-0.474062,0.931332,0.009865,0.335768,0.389524,0.655671,0.000000,0.194857,...,0.0,0.684328,0.462437,0.880916,0.158563,1.275578e-04,0.056825,0.437010,4.307383e-04,0.149810
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126017,6,0.034864,0.005932,0.323883,0.003022,0.053098,0.110149,-0.437274,0.000000,0.001670,...,0.0,0.000000,0.027150,0.460376,0.182198,9.961062e-06,0.051960,0.148898,8.806279e-05,0.057662
102276,6,-0.999634,-0.371966,1.000000,0.019411,0.699054,0.397170,0.372256,5.428899,0.128630,...,0.0,0.000000,2.660290,0.000000,1.000000,9.864447e-01,0.999605,0.231589,3.871368e-02,0.121605
28719,4,0.374925,-0.818327,0.881467,0.374925,0.638338,0.218356,-0.770087,0.164214,0.812113,...,0.0,1.272677,0.814003,1.187810,0.497023,2.529979e-03,0.138950,0.736255,4.056999e-02,0.371701
34401,7,0.085004,0.037934,0.403726,0.006593,0.117929,0.117966,-0.449794,0.197079,0.268503,...,0.0,0.079749,0.110350,0.480818,0.163934,1.361479e-04,0.058189,0.234133,2.437745e-04,0.061884


In [24]:
from sklearn.base import BaseEstimator, ClassifierMixin

class PretrainedVotingClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, estimators, voting='soft', weights=None):
        self.estimators = estimators
        self.voting = voting
        self.weights = weights
        
    def fit(self, X, y=None):
        # 已经预训练，无需再训练
        return self
    
    def predict(self, X):
        if self.voting == 'soft':
            # 对于概率投票
            probas = np.asarray([clf.predict_proba(X) for clf in self.estimators])
            avg_proba = np.average(probas, axis=0, weights=self.weights)
            return np.argmax(avg_proba, axis=1)
        else:
            # 对于硬投票
            predictions = np.asarray([clf.predict(X) for clf in self.estimators]).T
            maj_vote = np.apply_along_axis(
                lambda x: np.argmax(np.bincount(x, weights=self.weights)), axis=1, arr=predictions
            )
            return maj_vote

# 初始化自定义投票分类器
voting_clf = PretrainedVotingClassifier(
    estimators=[xgb_model, cat_model, lgb_model],
    voting='soft'
)

# 直接进行预测
y_pred_voting = voting_clf.predict(X_test)

# 评估结果
print("Voting Classifier Results:")
print(f"Balanced Accuracy: {balanced_accuracy_score(y_test, y_pred_voting):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_voting))



Voting Classifier Results:
Balanced Accuracy: 0.6815

Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.71      0.66      2463
           1       0.83      0.80      0.82      5315
           2       0.51      0.59      0.55      1089
           3       0.59      0.62      0.61      1809
           4       0.70      0.68      0.69      3958
           5       0.62      0.73      0.67      1706
           6       0.90      0.83      0.86     11023
           7       0.43      0.48      0.46      1219

    accuracy                           0.75     28582
   macro avg       0.65      0.68      0.66     28582
weighted avg       0.76      0.75      0.76     28582



In [18]:
# 获取基模型在训练集上的预测概率
xgb_train_pred = xgb_model.predict_proba(X_train)
cat_train_pred = cat_model.predict_proba(X_train)
lgb_train_pred = lgb_model.predict_proba(X_train)

# 获取基模型在测试集上的预测概率
xgb_test_pred = xgb_model.predict_proba(X_test)
cat_test_pred = cat_model.predict_proba(X_test)
lgb_test_pred = lgb_model.predict_proba(X_test)


In [19]:
import numpy as np

# 将预测概率水平堆叠，形成新的特征集
X_meta_train = np.hstack((xgb_train_pred, cat_train_pred, lgb_train_pred))
X_meta_test = np.hstack((xgb_test_pred, cat_test_pred, lgb_test_pred))


In [20]:
from sklearn.linear_model import LogisticRegression

meta_params = {
    'max_depth': 3,
    'learning_rate': 0.05,
    'n_estimators': 100,
    'objective': 'multi:softprob',
    'num_class': 8,
    'eval_metric': 'mlogloss',
    'use_label_encoder': False,
    'verbosity': 1,
    'random_state': 42,
}

# 初始化元模型
meta_model = XGBClassifier(**meta_params)

meta_model.fit(
    X_meta_train,
    y_train,
    eval_set=[(X_meta_test, y_test)],
    verbose=True
)

[0]	validation_0-mlogloss:1.94361
[1]	validation_0-mlogloss:1.83151
[2]	validation_0-mlogloss:1.73598
[3]	validation_0-mlogloss:1.65308
[4]	validation_0-mlogloss:1.57995


Parameters: { "use_label_encoder" } are not used.



[5]	validation_0-mlogloss:1.51477
[6]	validation_0-mlogloss:1.45619
[7]	validation_0-mlogloss:1.40301
[8]	validation_0-mlogloss:1.35472
[9]	validation_0-mlogloss:1.31052
[10]	validation_0-mlogloss:1.26989
[11]	validation_0-mlogloss:1.23253
[12]	validation_0-mlogloss:1.19805
[13]	validation_0-mlogloss:1.16608
[14]	validation_0-mlogloss:1.13646
[15]	validation_0-mlogloss:1.10891
[16]	validation_0-mlogloss:1.08326
[17]	validation_0-mlogloss:1.05950
[18]	validation_0-mlogloss:1.03734
[19]	validation_0-mlogloss:1.01657
[20]	validation_0-mlogloss:0.99729
[21]	validation_0-mlogloss:0.97927
[22]	validation_0-mlogloss:0.96242
[23]	validation_0-mlogloss:0.94673
[24]	validation_0-mlogloss:0.93208
[25]	validation_0-mlogloss:0.91844
[26]	validation_0-mlogloss:0.90567
[27]	validation_0-mlogloss:0.89371
[28]	validation_0-mlogloss:0.88257
[29]	validation_0-mlogloss:0.87214
[30]	validation_0-mlogloss:0.86233
[31]	validation_0-mlogloss:0.85326
[32]	validation_0-mlogloss:0.84477
[33]	validation_0-mloglos

In [21]:
# 在测试集上进行预测
y_pred_stacking = meta_model.predict(X_meta_test)

# 评估结果
from sklearn.metrics import balanced_accuracy_score, classification_report

print("\nStacking Classifier Results:")
print(f"Balanced Accuracy: {balanced_accuracy_score(y_test, y_pred_stacking):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_stacking))



Stacking Classifier Results:
Balanced Accuracy: 0.6545

Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.64      0.66      2463
           1       0.82      0.81      0.81      5315
           2       0.59      0.51      0.55      1089
           3       0.61      0.61      0.61      1809
           4       0.69      0.71      0.70      3958
           5       0.73      0.66      0.70      1706
           6       0.85      0.89      0.87     11023
           7       0.49      0.39      0.44      1219

    accuracy                           0.76     28582
   macro avg       0.68      0.65      0.67     28582
weighted avg       0.76      0.76      0.76     28582

