In [1]:
# 在这个文档中，我们将使用Optuna库来优化我们的模型。
import optuna
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.impute import KNNImputer
from xgboost import XGBClassifier

In [2]:
SEED = 42

数据预处理部分和 baseline 采用一样的方案

In [3]:
def DataPreprocessing(df):
    # 异常值处理
    df['Fare'] = np.where(df['Fare']<7.5, np.nan, df['Fare'])
        
    # 特征丢弃
    df.drop(columns=['Cabin','PassengerId','Name','Ticket'],inplace=True)
    
    # 特征编码
    genders = {'male': 0, 'female': 1}
    start_pos = {'S': 0, 'C': 1, 'Q': 2}
    df['Sex'] = df['Sex'].map(genders)
    df['Embarked'] = df['Embarked'].map(start_pos)
    
    # 缺失值处理
    df['Fare'] = df.groupby('Pclass')['Fare'].transform(lambda x: x.fillna(x.mean()))

    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    
    imputer = KNNImputer(n_neighbors=5)
    df['Age'] = imputer.fit_transform(df[['Age']])
    
    # 特征构造
    df['family'] = df['SibSp'] + df['Parch']
    return df

In [4]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
train = DataPreprocessing(train)
test = DataPreprocessing(test)

这里下面我们将使用 Optuna 来优化我们的 XGBoost 模型。

并在最后，我们会与没有使用优化参数的 XGBoost 模型进行比较。

In [5]:
def n_cross_validata(model, data, skf):
    X = data.drop(columns=['Survived'])
    y = data['Survived']
    
    score = []
    
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)[:, 1]
        score.append(roc_auc_score(y_test, y_pred))
    
    return np.mean(score)

def CV_Objective(trial, data, skf):
    params = {
        'objective': trial.suggest_categorical('objective', ['reg:tweedie', 'reg:pseudohubererror']),
        'random_state': SEED,
        'num_parallel_tree': trial.suggest_int('num_parallel_tree', 2, 30),
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 4),
        'learning_rate': trial.suggest_float('learning_rate', 0.02, 0.05, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 0.8),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.8),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 1e-1, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 1e-1, log=True),
    }

    model = XGBClassifier(**params)
    score = n_cross_validata(model,data, skf)
    
    return score
    
    
def RunOptimization(data):
    study = optuna.create_study(direction='maximize')
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)
    study.optimize(lambda trial: CV_Objective(trial, data, skf), n_trials=100)
    
    print(f"Best parameters: {study.best_params}")
    print(f"Best score: {study.best_value}")
    return study.best_params

In [6]:
params = RunOptimization(train)

[I 2025-01-16 20:06:51,512] A new study created in memory with name: no-name-65ca657a-9810-43ee-ae5c-caae7ac442b1
[I 2025-01-16 20:06:56,682] Trial 0 finished with value: 0.8705141724986419 and parameters: {'objective': 'reg:pseudohubererror', 'num_parallel_tree': 25, 'n_estimators': 237, 'max_depth': 4, 'learning_rate': 0.04855523615460717, 'subsample': 0.7933290260290711, 'colsample_bytree': 0.6635975047485981, 'reg_alpha': 0.014674387959879173, 'reg_lambda': 0.02455411962734914}. Best is trial 0 with value: 0.8705141724986419.
[I 2025-01-16 20:07:00,312] Trial 1 finished with value: 0.8722717540664048 and parameters: {'objective': 'reg:pseudohubererror', 'num_parallel_tree': 23, 'n_estimators': 221, 'max_depth': 4, 'learning_rate': 0.036072019675830465, 'subsample': 0.6687323157197322, 'colsample_bytree': 0.7254904421827106, 'reg_alpha': 0.00012822218372933531, 'reg_lambda': 0.00241332353281404}. Best is trial 1 with value: 0.8722717540664048.
[I 2025-01-16 20:07:04,964] Trial 2 fin

Best parameters: {'objective': 'reg:pseudohubererror', 'num_parallel_tree': 21, 'n_estimators': 157, 'max_depth': 4, 'learning_rate': 0.030667230642422286, 'subsample': 0.5715179242734815, 'colsample_bytree': 0.7144645784573894, 'reg_alpha': 0.0064916301134209375, 'reg_lambda': 2.14465845959358e-05}
Best score: 0.8759307193302016


In [7]:
print(params)

{'objective': 'reg:pseudohubererror', 'num_parallel_tree': 21, 'n_estimators': 157, 'max_depth': 4, 'learning_rate': 0.030667230642422286, 'subsample': 0.5715179242734815, 'colsample_bytree': 0.7144645784573894, 'reg_alpha': 0.0064916301134209375, 'reg_lambda': 2.14465845959358e-05}


In [11]:
# 使用最优参数训练模型
model = XGBClassifier(**params)
model.fit(train.drop(columns=['Survived']), train['Survived'])
res_pro = model.predict(test)

# 查看结果(假设 gender_submission.csv 是 Kaggle 提供的答案)
ans = pd.read_csv('data/gender_submission.csv')
ac_pro = (ans['Survived'] == res_pro).sum()/len(res_pro)
print(ac_pro)

0.9210526315789473


In [12]:
# 使用原始模型查看结果
org_model = XGBClassifier()
org_model.fit(train.drop(columns=['Survived']), train['Survived'])
res_org = org_model.predict(test)

# 查看结果(假设 gender_submission.csv 是 Kaggle 提供的答案)
ans = pd.read_csv('data/gender_submission.csv')
ac_org = (ans['Survived'] == res_org).sum()/len(res_org)
print(ac_org)


0.8277511961722488


In [13]:
increase = (ac_pro - ac_org) / ac_org * 100
print(f"The increase is {increase:.2f}%")


The increase is 11.27%


可以看到，有接近10%左右的提升，这是非常夸张的

In [14]:
# 保存下来提交看看
submission = pd.DataFrame({
    'PassengerId': ans['PassengerId'],
    'Survived': res_pro
})

submission.to_csv('data/ProSubmission.csv', index=False)
print("OK!")

OK!


In [15]:
submission = pd.DataFrame({
    'PassengerId': ans['PassengerId'],
    'Survived': res_org
})

submission.to_csv('data/OrgSubmission.csv', index=False)
print("OK!")

OK!


在 Kaggle 上提交结果如下：

ProSubmission.csv ---> 0.77751

OrgSubmission.csv ---> 0.76076

在 Kaggle 分数上，提升有 1% 左右。