In [3]:
# 导入所需的库
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV


# Feature columns, grouped by the preprocessing they receive.
num_features = ['Age', 'SibSp', 'Parch', 'Fare']
cat_features = ['Pclass', 'Sex', 'Embarked']

# Seed for the random forest. Without a fixed random_state every fit (and
# therefore every cross-validation score) changes from run to run, which makes
# hyper-parameter comparisons noisy and the notebook non-reproducible.
MODEL_SEED = 42

# Numeric features: fill missing values with the column mean, then standardize.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform() from raising
# on categories that were not present when the encoder was fitted.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features)
])

# Full pipeline: preprocessing followed by a (seeded) random forest classifier.
# The step name 'model' is what the 'model__*' search parameters refer to.
model = RandomForestClassifier(random_state=MODEL_SEED)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

In [15]:
# Seed shared by the CV splitter and the random search so that the folds and
# the sampled hyper-parameter candidates are the same on every run.
SEARCH_SEED = 42

# 3-fold cross-validation with shuffling. random_state is fixed because, with
# shuffle=True and no seed, each call to split() produces different folds, so
# searches that reuse `kf` would be scored on different data partitions.
kf = KFold(n_splits=3, shuffle=True, random_state=SEARCH_SEED)

# Search space for the random forest step ('model') of the pipeline:
# n_estimators, max_depth and min_samples_split.
param_grid = {
    'model__n_estimators': range(100, 1000, 50),
    'model__max_depth': range(10, 100, 2),
    'model__min_samples_split': range(2, 10)
}

# Randomized search over the pipeline: 100 sampled candidates, scored by
# accuracy, evaluated on the `kf` folds, using all available cores (n_jobs=-1).
# random_state makes the set of sampled candidates reproducible.
random_cv = RandomizedSearchCV(
    pipeline,
    param_grid,
    n_iter=100,
    scoring='accuracy',
    n_jobs=-1,
    cv=kf,
    random_state=SEARCH_SEED,
)

In [None]:
# Load the training data and split it into the feature matrix (the numeric and
# categorical columns used by the pipeline) and the 'Survived' target column.
df = pd.read_csv('/kaggle/input/titanic/train.csv')
X, y = df.loc[:, num_features + cat_features], df.loc[:, 'Survived']

# Run the randomized hyper-parameter search on the training data.
random_cv.fit(X, y)

# Report the best parameter combination and its mean cross-validated accuracy.
print('Best parameters:', random_cv.best_params_)
print('Best score:', random_cv.best_score_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits

[CV 1/3] END model__max_depth=66, model__min_samples_split=5, model__n_estimators=250;, score=0.822 total time=   0.6s
[CV 3/3] END model__max_depth=66, model__min_samples_split=5, model__n_estimators=250;, score=0.811 total time=   0.6s
[CV 1/3] END model__max_depth=12, model__min_samples_split=7, model__n_estimators=800;, score=0.825 total time=   1.7s
[CV 1/3] END model__max_depth=84, model__min_samples_split=9, model__n_estimators=500;, score=0.818 total time=   1.1s
[CV 3/3] END model__max_depth=84, model__min_samples_split=9, model__n_estimators=500;, score=0.825 total time=   1.4s
[CV 1/3] END model__max_depth=44, model__min_samples_split=7, model__n_estimators=700;, score=0.818 total time=   1.5s
[CV 1/3] END model__max_depth=26, model__min_samples_split=5, model__n_estimators=150;, score=0.822 total time=   0.3s
[CV 1/3] END model__max_depth=90, model__min_samples_split=2, model__n_estimators=900;, score=0.811 tot

In [22]:
# Second tuning pass: an exhaustive grid search over a narrower set of values
# for the random forest step of the pipeline.
# NOTE(review): min_samples_split (90-99) and max_depth (3-9) lie outside the
# ranges sampled by the randomized search (2-9 and 10-98 respectively), so this
# grid is not a refinement around that search's result - confirm the values.
param_grid = dict(
    model__n_estimators=range(400, 600, 50),
    model__max_depth=range(3, 10, 2),
    model__min_samples_split=range(90, 100),
)

# Grid search over the pipeline, scored by accuracy on the same `kf` splitter.
grid_cv = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=kf,
)

# Fit the grid search (not the randomized search) on the training data.
grid_cv.fit(X, y)

# Report the best parameter combination and its mean cross-validated accuracy.
print('Best parameters:', grid_cv.best_params_)
print('Best score:', grid_cv.best_score_)

Best parameters: {'model__max_depth': 7, 'model__min_samples_split': 90, 'model__n_estimators': 400}
Best score: 0.8024691358024691


In [25]:
# Predict with the fitted grid search directly. grid_cv was created with the
# default refit behaviour, so after fit() it holds the best pipeline
# (preprocessor + random forest) refit on the full training data, and
# grid_cv.predict() delegates to it.
# It must NOT be wrapped in another Pipeline behind `preprocessor`: that outer
# pipeline would be unfitted (the search fits clones, never the module-level
# `preprocessor`), and the data would be preprocessed twice.

# Read the held-out test set (not the training file) and select the same
# feature columns the model was trained on.
test_df = pd.read_csv('/kaggle/input/titanic/test.csv')
X_test = test_df[num_features + cat_features]
y_pred = grid_cv.predict(X_test)

# Pair each test passenger's id with its predicted label and save the result
# as the submission file.
output = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': y_pred})
output.to_csv('submission.csv', index=False)