In [1]:
import numpy as np
import pandas as pd

import os
for dirname, _, filenames in os.walk('../kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


../kaggle/input\sample_submission.csv
../kaggle/input\test.csv
../kaggle/input\train.csv


In [2]:
# define files path
sample_path = "../kaggle/input/sample_submission.csv"
train_path = "../kaggle/input/train.csv"
test_path = "../kaggle/input/test.csv"

In [3]:
# read csv file
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [4]:
# review train file
print(f"head of train_df:")
display(train_df.head())

print(f"head of test_df:")
display(test_df.head())

print(f"shape of train_df:{train_df.shape}")
print(f"info of train_df:")
train_df.info()

head of train_df:


Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.8,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w
2,2,e,6.94,f,s,b,f,x,c,w,...,,s,n,,,f,f,,l,w
3,3,e,3.88,f,y,g,f,s,,g,...,,,w,,,f,f,,d,u
4,4,e,5.85,x,l,w,f,d,,w,...,,,w,,,f,f,,g,a


head of test_df:


Unnamed: 0,id,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,3116945,8.64,x,,n,t,,,w,11.13,...,b,,w,u,w,t,g,,d,a
1,3116946,6.9,o,t,o,f,,c,y,1.27,...,,,n,,,f,f,,d,a
2,3116947,2.0,b,g,n,f,,c,n,6.18,...,,,n,,,f,f,,d,s
3,3116948,3.47,x,t,n,f,s,c,n,4.98,...,,,w,,n,t,z,,d,u
4,3116949,6.17,x,h,y,f,p,,y,6.73,...,,,y,,y,t,,,d,u


shape of train_df:(3116945, 22)
info of train_df:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3116945 entries, 0 to 3116944
Data columns (total 22 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   class                 object 
 2   cap-diameter          float64
 3   cap-shape             object 
 4   cap-surface           object 
 5   cap-color             object 
 6   does-bruise-or-bleed  object 
 7   gill-attachment       object 
 8   gill-spacing          object 
 9   gill-color            object 
 10  stem-height           float64
 11  stem-width            float64
 12  stem-root             object 
 13  stem-surface          object 
 14  stem-color            object 
 15  veil-type             object 
 16  veil-color            object 
 17  has-ring              object 
 18  ring-type             object 
 19  spore-print-color     object 
 20  habitat               object 
 21  season                objec

In [5]:
# remove id
train_df.drop(columns=['id'], inplace=True)
test_ids = test_df['id']
test_df.drop(columns=['id'], inplace=True)

print(f"shape of train: {train_df.shape}")
print(f"shape of test: {test_df.shape}")
print(f"columns names: {train_df.columns[:].tolist()}")

shape of train: (3116945, 21)
shape of test: (2077964, 20)
columns names: ['class', 'cap-diameter', 'cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-height', 'stem-width', 'stem-root', 'stem-surface', 'stem-color', 'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color', 'habitat', 'season']


In [6]:
# target variable y
if 'class' in train_df.columns:
    
    y = train_df['class'].map({'e': 0, 'p': 1})
    train_df.drop(columns=['class'], inplace=True)

In [7]:
from sklearn.preprocessing import LabelEncoder
import joblib

cat_cols = train_df.select_dtypes(include=['object']).columns
# 创建一个字典来存储每一列的编码器
encoders_dict = {}

for col in cat_cols:
    train_df[col] = train_df[col].astype(str).fillna('missing')
    test_df[col] = test_df[col].astype(str).fillna('missing')
    
    le = LabelEncoder()
    full_data = pd.concat([train_df[col], test_df[col]], axis=0)
    le.fit(full_data)
    
    train_df[col] = le.transform(train_df[col])
    test_df[col] = le.transform(test_df[col])
    
    # 将训练好的编码器存入字典
    encoders_dict[col] = le

# 保存字典到文件
joblib.dump(encoders_dict, '../models/label_encoders.pkl')
print("Label Encoding finish and encoders saved")

Label Encoding finish and encoders saved


In [8]:
# split train and val
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    train_df, y, test_size=0.2, random_state=42, stratify=y
)

print(f"shape of X_train: {X_train.shape}")
print(f"shape of X_val: {X_val.shape}")

shape of X_train: (2493556, 20)
shape of X_val: (623389, 20)


In [9]:
# set LightGBM model
import optuna
import lightgbm as lgb
from sklearn.metrics import matthews_corrcoef

def objective_lgb(trial):
    # 1. 定义参数搜索空间
    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'device': 'gpu',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 255),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'n_estimators': 500 # 固定树的数量，或使用 early stopping
    }

    # 2. 训练模型
    # 假设你已经定义好了 X_train, X_val, y_train, y_val
    model = lgb.LGBMClassifier(**param)
    model.fit(X_train, y_train)

    # 3. 预测并计算指标 (S4E8 推荐 MCC)
    preds = model.predict(X_val)
    mcc = matthews_corrcoef(y_val, preds)
    
    # Optuna 默认寻找最小值，所以如果是优化 MCC（越大越好），直接返回 mcc 即可（下面设为 maximize）
    return mcc

# 4. 创建研究对象并开始优化
study_lgb = optuna.create_study(direction='maximize') # 我们的目标是最大化 MCC
study_lgb.optimize(objective_lgb, n_trials=50) # 尝试 50 组不同的参数

print("最优参数: ", study_lgb.best_params)
print("最高 MCC: ", study_lgb.best_value)

  from .autonotebook import tqdm as notebook_tqdm
[I 2026-01-10 00:03:43,538] A new study created in memory with name: no-name-df32cca6-f793-41ec-a643-ca42a81ae90a
[I 2026-01-10 00:03:55,760] Trial 0 finished with value: 0.8531445698416484 and parameters: {'learning_rate': 0.025201796322315617, 'num_leaves': 77, 'max_depth': 3, 'min_child_samples': 29, 'subsample': 0.9151362729174755, 'colsample_bytree': 0.6770921149373297}. Best is trial 0 with value: 0.8531445698416484.
[I 2026-01-10 00:04:28,630] Trial 1 finished with value: 0.984259627845792 and parameters: {'learning_rate': 0.04663326238325611, 'num_leaves': 246, 'max_depth': 9, 'min_child_samples': 85, 'subsample': 0.6204789489429494, 'colsample_bytree': 0.51627825423054}. Best is trial 1 with value: 0.984259627845792.
[I 2026-01-10 00:04:37,347] Trial 2 finished with value: 0.7092792234447324 and parameters: {'learning_rate': 0.01186550081798698, 'num_leaves': 61, 'max_depth': 3, 'min_child_samples': 40, 'subsample': 0.738519800

最优参数:  {'learning_rate': 0.04571020127727049, 'num_leaves': 254, 'max_depth': 12, 'min_child_samples': 98, 'subsample': 0.8867030158242971, 'colsample_bytree': 0.5005452506291571}
最高 MCC:  0.984685714445832


In [10]:
# 保存最优参数到文件
import json

# 1. 获取最优参数字典
best_lgb_params = study_lgb.best_params

# 2. 建议手动加入一些非搜索的固定参数，确保文件完整
best_lgb_params.update({
    'objective': 'binary',
    'metric': 'binary_logloss',
    'device': 'gpu',
    'verbosity': -1
})

# 3. 写入文件
with open('../models_params/best_lgb_params.json', 'w') as f:
    json.dump(best_lgb_params, f, indent=4)

print("LightGBM 参数已保存至 ../models_params/best_lgb_params.json")


LightGBM 参数已保存至 ../models_params/best_lgb_params.json


In [11]:
# 保存全量训练模型
import joblib

lgb_mode = lgb.LGBMClassifier(**best_lgb_params)
lgb_mode.fit(train_df, y)

joblib.dump(lgb_mode, '../models/lgb_model.pkl')

print("LightGBM 模型已保存至 ../models/lgb_model.pkl")

LightGBM 模型已保存至 ../models/lgb_model.pkl


In [12]:
# set XGboost model
import xgboost as xgb
import optuna
from sklearn.metrics import matthews_corrcoef

def objective_xgb(trial):
    # 1. 定义 XGBoost 参数搜索空间
    param = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        # GPU 加速关键参数
        'tree_method': 'hist',
        'device': 'cuda', 
        
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'n_estimators': 500
    }

    # 2. 训练模型
    model = xgb.XGBClassifier(**param)
    model.fit(X_train, y_train)

    # 3. 预测并计算 MCC
    preds = model.predict(X_val)
    mcc = matthews_corrcoef(y_val, preds)
    
    return mcc

# 4. 优化
study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=50)

print("XGBoost 最优参数: ", study_xgb.best_params)
print("最高 MCC: ", study_xgb.best_value)

[I 2026-01-10 00:27:30,538] A new study created in memory with name: no-name-cb7fec5b-1bf3-43f6-a391-63313ebd93a9
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
[I 2026-01-10 00:27:34,853] Trial 0 finished with value: 0.9559461807258882 and parameters: {'learning_rate': 0.024581060555249303, 'max_depth': 5, 'min_child_weight': 9, 'subsample': 0.9125100889786149, 'colsample_bytree': 0.5598868855402692, 'gamma': 0.003020576153090284}. Best is trial 0 with value: 0.9559461807258882.
[I 2026-01-10 00:27:38,229] Trial 1 finished with value: 0.8810281660699565 and parameters: {'learning_rate': 0.03187772596158397, 'max_depth': 3, 'min_child_weight': 9, 'subsample': 0.5713468647369206, 'colsample_bytree': 0.5496233376588354, 'gamma': 1.5613380073314121e-06}. Best is trial 0 with value: 0.9559461807258882.
[I 2026-01-10 00:27:41,901] Trial 2 finished with value: 0

XGBoost 最优参数:  {'learning_rate': 0.05466374315217404, 'max_depth': 12, 'min_child_weight': 2, 'subsample': 0.8696167658517203, 'colsample_bytree': 0.5018326548621551, 'gamma': 5.130606854224363e-08}
最高 MCC:  0.9848340421692562


In [13]:
import json

# 1. 获取 XGBoost 最优参数
best_xgb_params = study_xgb.best_params

# 2. 补充运行必需的固定参数（确保加载后可直接运行）
best_xgb_params.update({
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'tree_method': 'hist',
    'device': 'cuda',
    'n_estimators': 500  # 也可以根据需要调高，如 1000
})

# 3. 写入文件
with open('../models_params/best_xgb_params.json', 'w') as f:
    json.dump(best_xgb_params, f, indent=4)

print("XGBoost 参数已成功保存至 ../models_params/best_xgb_params.json")


XGBoost 参数已成功保存至 ../models_params/best_xgb_params.json


In [14]:
# 保存全量训练模型
import joblib
xgb_model = xgb.XGBClassifier(**best_xgb_params)
xgb_model.fit(train_df, y)

joblib.dump(xgb_model, '../models/xgb_model.pkl')

print("XGBoost 模型已保存至 ../models/xgb_model.pkl")

XGBoost 模型已保存至 ../models/xgb_model.pkl


In [15]:
import json
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import matthews_corrcoef

# --- 1. 从文件中加载参数 ---
def load_params(filename):
    with open(filename, 'r') as f:
        return json.load(f)

print("正在加载参数文件...")
lgb_params = load_params('../models_params/best_lgb_params.json')
xgb_params = load_params('../models_params/best_xgb_params.json')

# 确保必要的运行参数（如 GPU）已经存在
lgb_params.update({'device': 'gpu', 'verbosity': -1})
xgb_params.update({'tree_method': 'hist', 'device': 'cuda'})

# --- 2. 定义 OOF 函数 ---
def get_oof_preds(model_class, model_params, X, y, X_test, n_splits=5):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    oof_preds = np.zeros(len(X))
    test_preds_accumulated = np.zeros(len(X_test))
    
    # 转换为 numpy 提升速度 (可选)
    X_np = X.values
    y_np = y.values
    X_test_np = X_test.values
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
        
        model = model_class(**model_params)
        model.fit(X_train_fold, y_train_fold)
        
        # 记录验证集预测
        oof_preds[val_idx] = model.predict_proba(X_val_fold)[:, 1]
        
        # 记录测试集预测
        test_preds_accumulated += model.predict_proba(X_test)[:, 1] / n_splits
        
        # 打印当前 Fold 的 MCC 以便监控
        fold_mcc = matthews_corrcoef(y_val_fold, (oof_preds[val_idx] > 0.5).astype(int))
        print(f"Fold {fold+1} 完成 | MCC: {fold_mcc:.4f}")
        
    return oof_preds, test_preds_accumulated

# --- 3. 执行 OOF 流程 ---
print("\n[开始 LightGBM OOF]")
oof_lgb, test_lgb = get_oof_preds(lgb.LGBMClassifier, lgb_params, train_df, y, test_df)

print("\n[开始 XGBoost OOF]")
oof_xgb, test_xgb = get_oof_preds(xgb.XGBClassifier, xgb_params, train_df, y, test_df)

# --- 4. 计算整体 OOF 分数 ---
mcc_lgb = matthews_corrcoef(y, (oof_lgb > 0.5).astype(int))
mcc_xgb = matthews_corrcoef(y, (oof_xgb > 0.5).astype(int))

print(f"\n整体 OOF MCC (LightGBM): {mcc_lgb:.4f}")
print(f"整体 OOF MCC (XGBoost): {mcc_xgb:.4f}")

正在加载参数文件...

[开始 LightGBM OOF]
Fold 1 完成 | MCC: 0.9822
Fold 2 完成 | MCC: 0.9820
Fold 3 完成 | MCC: 0.9822
Fold 4 完成 | MCC: 0.9817
Fold 5 完成 | MCC: 0.9821

[开始 XGBoost OOF]
Fold 1 完成 | MCC: 0.9847
Fold 2 完成 | MCC: 0.9846
Fold 3 完成 | MCC: 0.9846
Fold 4 完成 | MCC: 0.9845
Fold 5 完成 | MCC: 0.9847

整体 OOF MCC (LightGBM): 0.9820
整体 OOF MCC (XGBoost): 0.9846


In [16]:
from sklearn.linear_model import LogisticRegression

# 创建元特征矩阵
X_meta = np.column_stack([oof_lgb, oof_xgb])
X_test_meta = np.column_stack([test_lgb, test_xgb])

# 使用逻辑回归作为 Meta-Model
meta_model = LogisticRegression()
meta_model.fit(X_meta, y)

# 保存元模型
joblib.dump(meta_model, '../models/meta_model.pkl')

# 最终预测
final_preds_prob = meta_model.predict_proba(X_test_meta)[:, 1]
final_stacking_mcc = matthews_corrcoef(y, (meta_model.predict(X_meta)))

print(f"Stacking OOF MCC: {final_stacking_mcc:.4f}")

Stacking OOF MCC: 0.9843


In [17]:
# --- 修正后的 Stacking 预测与提交 ---

# 1. 获取 Meta-Model 在测试集上的类别预测 (0 或 1)
# 也可以通过概率手动设置阈值：(final_preds_prob > 0.5).astype(int)
final_stacking_preds = meta_model.predict(X_test_meta)

# 2. 映射回原始标签 'e' 和 'p'
submission = pd.DataFrame({
    'id': test_ids,
    'class': pd.Series(final_stacking_preds).map({0: 'e', 1: 'p'})
})

# 3. 保存
submission.to_csv('submission_stacking.csv', index=False)
print("✅ 成功！Stacking 提交文件已保存。")

✅ 成功！Stacking 提交文件已保存。


In [18]:
# 1. 查看权重 (每个特征对应一个权重)
print(f"特征权重 (Weights): {meta_model.coef_}")

# 2. 查看截距
print(f"截距 (Intercept): {meta_model.intercept_}")

特征权重 (Weights): [[4.98548622 5.77009522]]
截距 (Intercept): [-5.12134849]
