In [4]:
import pandas as pd
import numpy as np # 常用库，虽然此预览中可能不直接大量使用，但数据分析常备

# --- 1. 加载数据 ---
file_path = 'train.csv' # 确保 train.csv 文件在当前工作目录，或者提供完整路径
try:
    df = pd.read_csv(file_path)
    print(f"成功加载数据: {file_path}\n")
except FileNotFoundError:
    print(f"错误: 文件 '{file_path}' 未找到。请确保文件路径正确。")
    exit()
except Exception as e:
    print(f"加载数据时发生错误: {e}")
    print("提示: 如果是编码问题，可以尝试指定编码，例如：")
    print("df = pd.read_csv(file_path, encoding='latin1') # 或者 'gbk', 'utf-8-sig' 等")
    exit()

# --- 2. 数据基本概览 ---
print("--- 1. 数据基本信息 ---")
print("DataFrame的形状 (行数, 列数):")
print(df.shape)

print("\n前5行数据:")
print(df.head())

print("\n后5行数据 (可选，检查末尾是否有异常):")
print(df.tail())

print("\n列名:")
print(df.columns.tolist())

print("\n数据类型及非空值计数:")
df.info() # 这个函数会直接打印信息，非常方便

# --- 3. 描述性统计 ---
print("\n\n--- 2. 描述性统计 ---")
print("数值型特征的描述性统计:")
# include=np.number 只选择数值型列
# transpose() 可以让输出更易读，如果列很多的话
print(df.describe(include=np.number).transpose())

print("\n对象/类别型特征的描述性统计:")
# include='object' 只选择对象类型（通常是字符串）
# include=['object', 'category'] 如果你已经转换了某些列为category类型
print(df.describe(include=['object', 'category']).transpose())

# --- 4. 缺失值检查 ---
print("\n\n--- 3. 缺失值检查 ---")
missing_values_count = df.isnull().sum()
missing_values_percentage = (missing_values_count / len(df)) * 100

missing_data_summary = pd.DataFrame({
    '缺失值数量': missing_values_count,
    '缺失值百分比': missing_values_percentage
})

# 只显示包含缺失值的列，并按百分比降序排列
print("各列缺失值统计 (仅显示含缺失值的列):")
print(missing_data_summary[missing_data_summary['缺失值数量'] > 0].sort_values(by='缺失值百分比', ascending=False))

if missing_data_summary['缺失值数量'].sum() == 0:
    print("数据集中没有缺失值。")
else:
    print(f"\n数据集中总共有 {missing_data_summary['缺失值数量'].sum()} 个缺失值。")

# --- 5. 重复行检查 ---
print("\n\n--- 4. 重复行检查 ---")
num_duplicate_rows = df.duplicated().sum()
print(f"数据集中重复行的数量: {num_duplicate_rows}")

if num_duplicate_rows > 0:
    print("示例重复行 (只显示部分，如果存在):")
    # df[df.duplicated(keep=False)] # 显示所有重复的行
    print(df[df.duplicated()].head()) # 只显示重复项中的后项

# --- 6. 唯一值分析 (主要针对对象/类别型特征) ---
print("\n\n--- 5. 唯一值分析 (对象/类别型特征) ---")
object_columns = df.select_dtypes(include=['object', 'category']).columns
if not object_columns.empty:
    for col in object_columns:
        print(f"\n列名: {col}")
        num_unique = df[col].nunique()
        print(f"  唯一值数量: {num_unique}")

        # 如果唯一值数量不多，可以打印出来看看具体值和它们的频次
        if num_unique < 20: # 这个阈值可以根据你的数据调整
            print(f"  具体唯一值及频次 (最多显示前10个, dropna=False会统计NaN):")
            print(df[col].value_counts(dropna=False).head(10))
        elif num_unique == len(df) and 'id' not in col.lower() and 'name' not in col.lower():
             print(f"  注意: 列 '{col}' 的唯一值数量等于行数 ({len(df)})。这可能是一个ID列，或者是一个高基数文本列，请仔细检查。")
        else:
            print(f"  唯一值数量较多 ({num_unique}), 不逐一显示具体值。可使用 df['{col}'].value_counts() 单独查看。")
else:
    print("数据集中没有检测到对象/类别型特征。")

# --- 7. 总结与清洗建议提示 ---
print("\n\n--- 6. 数据预览总结与清洗建议 ---")
print("请仔细检查以上输出，这将为你的数据清洗提供方向：")
print("1. **缺失值**: 关注 `缺失值统计` 部分。思考如何处理这些缺失值（删除行/列，填充均值、中位数、众数，或使用更复杂的插补方法）。")
print("2. **数据类型**: 检查 `df.info()` 的输出。是否有列的类型不正确？（例如，数字被识别为字符串，日期被识别为对象）。")
print("3. **重复数据**: 检查 `重复行检查` 部分。如果存在重复行，需要判断它们是否是真实的重复并决定是否删除。")
print("4. **异常值/不合理值**: 查看数值型特征的 `描述性统计`（min, max, mean, std）。是否存在不合逻辑的最小值或最大值？标准差是否过大？")
print("5. **类别特征基数与一致性**: 查看对象/类别型特征的 `唯一值分析` 和 `描述性统计`。")
print("   - `nunique` (唯一值数量): 唯一值过多（高基数）的类别特征可能需要特殊处理（如合并稀有类别、目标编码等）。")
print("   - `value_counts()`: 检查是否有不一致的标签（例如 'Male', 'male', 'M'），或者是否有错别字、不必要的空格等。")
print("   - `top` 和 `freq`: 对于类别特征，最常见的类别是什么，它的频率如何？")
print("6. **列名**: 检查列名是否清晰、规范，是否含有不必要的空格或特殊字符。")

print("\n代码执行完毕。现在你可以根据这些预览信息开始制定数据清洗策略了。")

# 可选：如果你想将预览信息保存到文件
# with open('data_preview_report.txt', 'w', encoding='utf-8') as f:
#     # 你可以将上面的 print 语句重定向到这个文件对象
#     # 例如: print("DataFrame的形状:", df.shape, file=f)
#     # 但更简单的方法是捕获 print 输出或重新组织代码以写入字符串
#     # 这里仅作提示，实际实现会更复杂一些
#     pass

成功加载数据: train.csv

--- 1. 数据基本信息 ---
DataFrame的形状 (行数, 列数):
(891, 12)

前5行数据:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53

In [5]:
import pandas as pd
import numpy as np

# 假设 df 已经加载了 'train.csv'
# 例如: df = pd.read_csv('train.csv')
# 为了代码可独立运行，我们先加载它
try:
    df = pd.read_csv('train.csv')
    print("成功加载数据: train.csv")
except FileNotFoundError:
    print("错误: train.csv 未找到。请确保文件在当前目录下。")
    exit()

# 1. 创建副本
df_processed = df.copy()
print("\n--- 1. 创建数据副本 ---")
print("副本创建完毕。")

# --- 2. 处理缺失值 ---
print("\n--- 2. 处理缺失值 ---")

# Embarked: 用众数填充
embarked_mode = df_processed['Embarked'].mode()[0]
df_processed['Embarked'].fillna(embarked_mode, inplace=True)
print(f"Embarked 缺失值已用众数 '{embarked_mode}' 填充。")

# Age: 按 Pclass 和 Sex 分组，用各组的中位数填充
# 使用 transform 可以将聚合结果广播回原始 DataFrame 的索引
df_processed['Age'] = df_processed.groupby(['Pclass', 'Sex'])['Age'].transform(lambda x: x.fillna(x.median()))
# 如果分组填充后仍有缺失 (例如某个 Pclass+Sex 组合完全没有 Age 数据), 用全局中位数填充
if df_processed['Age'].isnull().any():
    age_global_median = df_processed['Age'].median()
    df_processed['Age'].fillna(age_global_median, inplace=True)
    print(f"Age 缺失值已按 Pclass 和 Sex 分组中位数填充，剩余部分用全局中位数 {age_global_median:.2f} 填充。")
else:
    print("Age 缺失值已按 Pclass 和 Sex 分组中位数填充。")


# Cabin: 直接删除该列
df_processed.drop('Cabin', axis=1, inplace=True)
print("Cabin 列已删除。")

# --- 3. 数据类型转换与编码 ---
print("\n--- 3. 数据类型转换与编码 ---")

# Sex: 转换为 0 和 1
df_processed['Sex'] = df_processed['Sex'].map({'male': 0, 'female': 1}).astype(int)
print("Sex 列已转换为数值型 (male:0, female:1)。")

# Embarked: 进行独热编码
df_processed = pd.get_dummies(df_processed, columns=['Embarked'], prefix='Embarked', drop_first=True)
print("Embarked 列已进行独热编码。") # drop_first=True 避免多重共线性

# --- 4. 特征工程 ---
print("\n--- 4. 特征工程 ---")

# Name: 提取 Title
df_processed['Title'] = df_processed['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
# 合并稀有称谓
common_titles = ['Mr', 'Miss', 'Mrs', 'Master']
df_processed['Title'] = df_processed['Title'].apply(lambda x: x if x in common_titles else 'Rare')
# 对 Title进行独热编码
df_processed = pd.get_dummies(df_processed, columns=['Title'], prefix='Title', drop_first=True)
print("从 Name 中提取并编码了 Title 特征。")

# SibSp 和 Parch: 合并为 FamilySize，并创建 IsAlone
df_processed['FamilySize'] = df_processed['SibSp'] + df_processed['Parch'] + 1
df_processed['IsAlone'] = 0
df_processed.loc[df_processed['FamilySize'] == 1, 'IsAlone'] = 1
# 删除原始 SibSp 和 Parch
df_processed.drop(['SibSp', 'Parch'], axis=1, inplace=True)
print("创建了 FamilySize 和 IsAlone 特征，并删除了 SibSp 和 Parch。")

# Fare:
# 处理票价为0的情况：用对应 Pclass 的票价中位数填充
median_fares_by_pclass = df_processed.groupby('Pclass')['Fare'].median()
for p_class in median_fares_by_pclass.index:
    # 仅对 Fare 为 0 且 Pclass 匹配的行进行填充
    condition = (df_processed['Fare'] == 0) & (df_processed['Pclass'] == p_class)
    if median_fares_by_pclass[p_class] > 0: #确保中位数不是0
        df_processed.loc[condition, 'Fare'] = median_fares_by_pclass[p_class]
    else: # 如果某个舱位的中位数也是0，用一个极小值代替，避免log(0)
        df_processed.loc[condition, 'Fare'] = np.finfo(float).eps

# 如果在上述处理后，仍然有 Fare 为 0 的情况（例如，某个Pclass的所有票价都是0，非常罕见）
# 则用一个极小的正数替换，以避免log(0)错误
if (df_processed['Fare'] == 0).any():
    df_processed.loc[df_processed['Fare'] == 0, 'Fare'] = np.finfo(float).eps
    print("Fare 列中剩余的0值已用极小正数替换。")

# 进行对数转换 (np.log1p 处理 x=0 的情况, log1p(x) = log(1+x))
df_processed['Fare'] = np.log1p(df_processed['Fare'])
print("Fare 列已处理0值并进行了对数转换。")


# --- 5. 删除不再需要的列 ---
print("\n--- 5. 删除不再需要的列 ---")
columns_to_drop = ['PassengerId', 'Name', 'Ticket']
df_processed.drop(columns_to_drop, axis=1, inplace=True)
print(f"已删除列: {', '.join(columns_to_drop)}。")

# --- 6. 验证 ---
print("\n--- 6. 验证处理后的数据 ---")
print("\n处理后 DataFrame 的信息:")
df_processed.info()

print("\n处理后 DataFrame 的缺失值统计:")
print(df_processed.isnull().sum())

print("\n处理后 DataFrame 的前5行:")
print(df_processed.head())

print("\n处理后 DataFrame 的描述性统计 (包含所有类型):")
print(df_processed.describe(include='all').T) # .T 转置方便查看

成功加载数据: train.csv

--- 1. 创建数据副本 ---
副本创建完毕。

--- 2. 处理缺失值 ---
Embarked 缺失值已用众数 'S' 填充。
Age 缺失值已按 Pclass 和 Sex 分组中位数填充。
Cabin 列已删除。

--- 3. 数据类型转换与编码 ---
Sex 列已转换为数值型 (male:0, female:1)。
Embarked 列已进行独热编码。

--- 4. 特征工程 ---
从 Name 中提取并编码了 Title 特征。
创建了 FamilySize 和 IsAlone 特征，并删除了 SibSp 和 Parch。
Fare 列已处理0值并进行了对数转换。

--- 5. 删除不再需要的列 ---
已删除列: PassengerId, Name, Ticket。

--- 6. 验证处理后的数据 ---

处理后 DataFrame 的信息:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Pclass      891 non-null    int64  
 2   Sex         891 non-null    int64  
 3   Age         891 non-null    float64
 4   Fare        891 non-null    float64
 5   Embarked_Q  891 non-null    bool   
 6   Embarked_S  891 non-null    bool   
 7   Title_Miss  891 non-null    bool   
 8   Title_Mr    891 non-null    bool   
 9   Title_Mrs   891 non-null   

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed['Embarked'].fillna(embarked_mode, inplace=True)


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
# 注意：不再需要 matplotlib 和 seaborn，以及 FontProperties

# --- 假设 df_processed 是上一步处理好的 DataFrame ---
# (保持之前的模拟/加载 df_processed 的代码不变)
try:
    df = pd.read_csv('train.csv')
    df_processed = df.copy()
    embarked_mode = df_processed['Embarked'].mode()[0]
    df_processed['Embarked'] = df_processed['Embarked'].fillna(embarked_mode)
    df_processed['Age'] = df_processed.groupby(['Pclass', 'Sex'])['Age'].transform(lambda x: x.fillna(x.median()))
    if df_processed['Age'].isnull().any():
        age_global_median = df_processed['Age'].median()
        df_processed['Age'] = df_processed['Age'].fillna(age_global_median)
    df_processed.drop('Cabin', axis=1, inplace=True)
    df_processed['Sex'] = df_processed['Sex'].map({'male': 0, 'female': 1}).astype(int)
    df_processed = pd.get_dummies(df_processed, columns=['Embarked'], prefix='Embarked', drop_first=True)
    df_processed['Title'] = df_processed['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    common_titles = ['Mr', 'Miss', 'Mrs', 'Master']
    df_processed['Title'] = df_processed['Title'].apply(lambda x: x if x in common_titles else 'Rare')
    df_processed['Title'] = df_processed['Title'].replace(['Mlle', 'Ms'], 'Miss')
    df_processed['Title'] = df_processed['Title'].replace('Mme', 'Mrs')
    rare_titles = ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    df_processed['Title'] = df_processed['Title'].replace(rare_titles, 'Rare')
    df_processed = pd.get_dummies(df_processed, columns=['Title'], prefix='Title', drop_first=True)
    df_processed['FamilySize'] = df_processed['SibSp'] + df_processed['Parch'] + 1
    df_processed['IsAlone'] = 0
    df_processed.loc[df_processed['FamilySize'] == 1, 'IsAlone'] = 1
    df_processed.drop(['SibSp', 'Parch'], axis=1, inplace=True)
    median_fares_by_pclass = df_processed.groupby('Pclass')['Fare'].median()
    for p_class in median_fares_by_pclass.index:
        condition = (df_processed['Fare'] == 0) & (df_processed['Pclass'] == p_class)
        if median_fares_by_pclass[p_class] > 0:
            df_processed.loc[condition, 'Fare'] = median_fares_by_pclass[p_class]
        else:
            df_processed.loc[condition, 'Fare'] = np.finfo(float).eps
    if (df_processed['Fare'] == 0).any():
        df_processed.loc[df_processed['Fare'] == 0, 'Fare'] = np.finfo(float).eps
    df_processed['Fare'] = np.log1p(df_processed['Fare'])
    columns_to_drop = ['PassengerId', 'Name', 'Ticket']
    df_processed.drop(columns_to_drop, axis=1, inplace=True)
    print("成功模拟/加载并预处理数据，df_processed 已准备好。")
except FileNotFoundError:
    print("错误: train.csv 未找到。请确保文件在当前目录下，或者 df_processed 已在环境中定义。")
    exit()
except Exception as e:
    print(f"预处理过程中发生错误: {e}")
    exit()

# 1. 准备数据
print("\n--- 1. 准备数据 ---")
if 'Survived' not in df_processed.columns:
    print("错误: 目标变量 'Survived' 不在 df_processed 中。请检查预处理步骤。")
    exit()

X = df_processed.drop('Survived', axis=1)
y = df_processed['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"训练集样本数: {X_train.shape[0]}, 测试集样本数: {X_test.shape[0]}")

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 将scaled array转换回DataFrame，以保持列名，方便理解 (可选，但对打印有帮助)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns, index=X_test.index)
print("特征已标准化。")

# 2. 选择并训练模型
print("\n--- 2. 选择并训练模型 ---")
models = {
    "逻辑回归 (Logistic Regression)": LogisticRegression(random_state=42, solver='liblinear', max_iter=1000),
    "随机森林 (Random Forest)": RandomForestClassifier(random_state=42, n_estimators=100),
    "梯度提升机 (Gradient Boosting)": GradientBoostingClassifier(random_state=42, n_estimators=100)
}

evaluation_results = {} # 用于存储最终的总结数据

for name, model in models.items():
    print(f"\n--- 训练并评估模型: {name} ---")
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1] # 获取正类的预测概率

    accuracy = accuracy_score(y_test, y_pred)
    # 使用 target_names 使分类报告更易读
    report = classification_report(y_test, y_pred, target_names=['未生还 (0)', '生还 (1)'], zero_division=0)
    cm = confusion_matrix(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)

    evaluation_results[name] = {
        'accuracy': accuracy,
        'roc_auc': auc,
        'classification_report_str': report, # 存储字符串形式的报告
        'confusion_matrix_arr': cm.tolist() # 存储为列表，方便打印
    }

    print(f"准确率 (Accuracy): {accuracy:.4f}")
    print(f"ROC AUC 分数: {auc:.4f}")
    print("分类报告 (Classification Report):\n", report)
    print("混淆矩阵 (Confusion Matrix):\n", cm)
    # 打印混淆矩阵的另一种方式，更明确标签
    print("混淆矩阵 (标签化):")
    print(f"          预测未生还  预测生还")
    print(f"实际未生还    {cm[0,0]:<10} {cm[0,1]:<10}")
    print(f"实际生还      {cm[1,0]:<10} {cm[1,1]:<10}")


# 3. 总结所有模型的准确率和ROC AUC (之前是步骤4的一部分)
print("\n--- 模型性能总结 ---")
# 创建一个列表来构建DataFrame
summary_data = []
for name, res in evaluation_results.items():
    summary_data.append({
        '模型 (Model)': name,
        '准确率 (Accuracy)': f"{res['accuracy']:.4f}", # 格式化为字符串
        'ROC AUC': f"{res['roc_auc']:.4f}"  # 格式化为字符串
    })

summary_df = pd.DataFrame(summary_data)
# 打印DataFrame，可以不排序或按需排序
print(summary_df.to_string(index=False)) # to_string() 避免截断

# 如果需要，可以打印每个模型更详细的存储结果
# print("\n--- 各模型详细评估结果 (存储数据) ---")
# for model_name, results in evaluation_results.items():
#     print(f"\n模型: {model_name}")
#     print(f"  准确率: {results['accuracy']:.4f}")
#     print(f"  ROC AUC: {results['roc_auc']:.4f}")
#     print(f"  分类报告:\n{results['classification_report_str']}")
#     print(f"  混淆矩阵 (列表形式): {results['confusion_matrix_arr']}")

print("\n模型训练和评估完成。")

成功模拟/加载并预处理数据，df_processed 已准备好。

--- 1. 准备数据 ---
训练集样本数: 712, 测试集样本数: 179
特征已标准化。

--- 2. 选择并训练模型 ---

--- 训练并评估模型: 逻辑回归 (Logistic Regression) ---
准确率 (Accuracy): 0.8380
ROC AUC 分数: 0.8740
分类报告 (Classification Report):
               precision    recall  f1-score   support

     未生还 (0)       0.85      0.89      0.87       110
      生还 (1)       0.81      0.75      0.78        69

    accuracy                           0.84       179
   macro avg       0.83      0.82      0.83       179
weighted avg       0.84      0.84      0.84       179

混淆矩阵 (Confusion Matrix):
 [[98 12]
 [17 52]]
混淆矩阵 (标签化):
          预测未生还  预测生还
实际未生还    98         12        
实际生还      17         52        

--- 训练并评估模型: 随机森林 (Random Forest) ---
准确率 (Accuracy): 0.7765
ROC AUC 分数: 0.8289
分类报告 (Classification Report):
               precision    recall  f1-score   support

     未生还 (0)       0.80      0.85      0.82       110
      生还 (1)       0.73      0.67      0.70        69

    accuracy                      

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier # 只导入需要的模型
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# --- 假设 df_processed 是上一步处理好的 DataFrame ---
# (保持之前的模拟/加载 df_processed 的代码不变)
try:
    df = pd.read_csv('train.csv')
    df_processed = df.copy()
    # ... (此处省略与之前完全相同的预处理代码) ...
    embarked_mode = df_processed['Embarked'].mode()[0]
    df_processed['Embarked'] = df_processed['Embarked'].fillna(embarked_mode)
    df_processed['Age'] = df_processed.groupby(['Pclass', 'Sex'])['Age'].transform(lambda x: x.fillna(x.median()))
    if df_processed['Age'].isnull().any():
        age_global_median = df_processed['Age'].median()
        df_processed['Age'] = df_processed['Age'].fillna(age_global_median)
    df_processed.drop('Cabin', axis=1, inplace=True)
    df_processed['Sex'] = df_processed['Sex'].map({'male': 0, 'female': 1}).astype(int)
    df_processed = pd.get_dummies(df_processed, columns=['Embarked'], prefix='Embarked', drop_first=True)
    df_processed['Title'] = df_processed['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    common_titles = ['Mr', 'Miss', 'Mrs', 'Master']
    df_processed['Title'] = df_processed['Title'].apply(lambda x: x if x in common_titles else 'Rare')
    df_processed['Title'] = df_processed['Title'].replace(['Mlle', 'Ms'], 'Miss')
    df_processed['Title'] = df_processed['Title'].replace('Mme', 'Mrs')
    rare_titles = ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    df_processed['Title'] = df_processed['Title'].replace(rare_titles, 'Rare')
    df_processed = pd.get_dummies(df_processed, columns=['Title'], prefix='Title', drop_first=True)
    df_processed['FamilySize'] = df_processed['SibSp'] + df_processed['Parch'] + 1
    df_processed['IsAlone'] = 0
    df_processed.loc[df_processed['FamilySize'] == 1, 'IsAlone'] = 1
    df_processed.drop(['SibSp', 'Parch'], axis=1, inplace=True)
    median_fares_by_pclass = df_processed.groupby('Pclass')['Fare'].median()
    for p_class in median_fares_by_pclass.index:
        condition = (df_processed['Fare'] == 0) & (df_processed['Pclass'] == p_class)
        if median_fares_by_pclass[p_class] > 0:
            df_processed.loc[condition, 'Fare'] = median_fares_by_pclass[p_class]
        else:
            df_processed.loc[condition, 'Fare'] = np.finfo(float).eps
    if (df_processed['Fare'] == 0).any():
        df_processed.loc[df_processed['Fare'] == 0, 'Fare'] = np.finfo(float).eps
    df_processed['Fare'] = np.log1p(df_processed['Fare'])
    columns_to_drop = ['PassengerId', 'Name', 'Ticket']
    df_processed.drop(columns_to_drop, axis=1, inplace=True)
    print("成功模拟/加载并预处理数据，df_processed 已准备好。")
except FileNotFoundError:
    print("错误: train.csv 未找到。请确保文件在当前目录下，或者 df_processed 已在环境中定义。")
    exit()
except Exception as e:
    print(f"预处理过程中发生错误: {e}")
    exit()


# 1. 准备数据
print("\n--- 1. 准备数据 ---")
if 'Survived' not in df_processed.columns:
    print("错误: 目标变量 'Survived' 不在 df_processed 中。请检查预处理步骤。")
    exit()

X = df_processed.drop('Survived', axis=1)
y = df_processed['Survived']

# 划分训练集和测试集 (在进行GridSearchCV时，它内部会处理交叉验证，
# 但我们仍然需要一个最终的测试集来评估最佳模型)
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"完整训练集样本数 (用于GridSearchCV): {X_train_full.shape[0]}, 测试集样本数: {X_test.shape[0]}")

# 特征标准化
# Scaler应该在完整训练集上fit，然后在完整训练集和测试集上transform
scaler = StandardScaler()
X_train_full_scaled = scaler.fit_transform(X_train_full)
X_test_scaled = scaler.transform(X_test)

# 将scaled array转换回DataFrame (可选, 但有助于理解)
X_train_full_scaled = pd.DataFrame(X_train_full_scaled, columns=X.columns, index=X_train_full.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns, index=X_test.index)
print("特征已标准化。")


# 2. 超参数调优 - 梯度提升机 (Gradient Boosting)
print("\n--- 2. 超参数调优: 梯度提升机 ---")

# 定义参数网格
param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'subsample': [0.7, 0.8, 0.9],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
# 注意：这个参数网格比较大，GridSearchCV会运行较长时间。
# 可以先用一个较小的网格或者使用 RandomizedSearchCV。
# 为了演示，我们先用一个精简的网格：
param_grid_gb_simple = {
    'n_estimators': [100, 150],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 4],
    # 'subsample': [0.8], # 可以固定一些参数减少搜索空间
    # 'min_samples_split': [2],
    # 'min_samples_leaf': [1]
}


# 使用 StratifiedKFold 进行交叉验证，保持类别比例
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 初始化 GridSearchCV
# scoring='accuracy' 或 'roc_auc'
grid_search_gb = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=param_grid_gb_simple, # 使用简化的网格进行快速演示
    scoring='roc_auc', # 更关注AUC
    cv=cv_strategy,
    n_jobs=-1, # 使用所有可用的CPU核心
    verbose=1 # 显示进度
)

print("开始 GridSearchCV 搜索最佳参数...")
grid_search_gb.fit(X_train_full_scaled, y_train_full)

print("\nGridSearchCV 完成。")
print(f"最佳参数组合: {grid_search_gb.best_params_}")
print(f"使用这些参数在交叉验证中的最佳 ROC AUC 分数: {grid_search_gb.best_score_:.4f}")

# 3. 使用最佳参数评估模型
print("\n--- 3. 使用最佳参数在测试集上评估梯度提升机 ---")
best_gb_model = grid_search_gb.best_estimator_ # 这已经是训练好的最佳模型

y_pred_best_gb = best_gb_model.predict(X_test_scaled)
y_pred_proba_best_gb = best_gb_model.predict_proba(X_test_scaled)[:, 1]

accuracy_best_gb = accuracy_score(y_test, y_pred_best_gb)
report_best_gb = classification_report(y_test, y_pred_best_gb, target_names=['未生还 (0)', '生还 (1)'], zero_division=0)
cm_best_gb = confusion_matrix(y_test, y_pred_best_gb)
auc_best_gb = roc_auc_score(y_test, y_pred_proba_best_gb)

print(f"调优后梯度提升机 - 准确率 (Accuracy): {accuracy_best_gb:.4f}")
print(f"调优后梯度提升机 - ROC AUC 分数: {auc_best_gb:.4f}")
print("调优后梯度提升机 - 分类报告 (Classification Report):\n", report_best_gb)
print("调优后梯度提升机 - 混淆矩阵 (Confusion Matrix):\n", cm_best_gb)

# (可选) 对逻辑回归也进行调优
print("\n--- (可选) 超参数调优: 逻辑回归 ---")
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'] # liblinear 支持 l1 和 l2
}
grid_search_lr = GridSearchCV(
    estimator=LogisticRegression(random_state=42, max_iter=2000),
    param_grid=param_grid_lr,
    scoring='roc_auc',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
print("开始逻辑回归 GridSearchCV 搜索最佳参数...")
grid_search_lr.fit(X_train_full_scaled, y_train_full)

print("\n逻辑回归 GridSearchCV 完成。")
print(f"最佳参数组合 (LR): {grid_search_lr.best_params_}")
print(f"使用这些参数在交叉验证中的最佳 ROC AUC 分数 (LR): {grid_search_lr.best_score_:.4f}")

best_lr_model = grid_search_lr.best_estimator_
y_pred_best_lr = best_lr_model.predict(X_test_scaled)
y_pred_proba_best_lr = best_lr_model.predict_proba(X_test_scaled)[:, 1]
auc_best_lr = roc_auc_score(y_test, y_pred_proba_best_lr)
accuracy_best_lr = accuracy_score(y_test, y_pred_best_lr)
print(f"调优后逻辑回归 - 准确率: {accuracy_best_lr:.4f}, ROC AUC: {auc_best_lr:.4f}")


print("\n模型提升尝试完成。")

成功模拟/加载并预处理数据，df_processed 已准备好。

--- 1. 准备数据 ---
完整训练集样本数 (用于GridSearchCV): 712, 测试集样本数: 179
特征已标准化。

--- 2. 超参数调优: 梯度提升机 ---
开始 GridSearchCV 搜索最佳参数...
Fitting 5 folds for each of 8 candidates, totalling 40 fits

GridSearchCV 完成。
最佳参数组合: {'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 150}
使用这些参数在交叉验证中的最佳 ROC AUC 分数: 0.8930

--- 3. 使用最佳参数在测试集上评估梯度提升机 ---
调优后梯度提升机 - 准确率 (Accuracy): 0.8156
调优后梯度提升机 - ROC AUC 分数: 0.8569
调优后梯度提升机 - 分类报告 (Classification Report):
               precision    recall  f1-score   support

     未生还 (0)       0.82      0.89      0.86       110
      生还 (1)       0.80      0.70      0.74        69

    accuracy                           0.82       179
   macro avg       0.81      0.79      0.80       179
weighted avg       0.81      0.82      0.81       179

调优后梯度提升机 - 混淆矩阵 (Confusion Matrix):
 [[98 12]
 [21 48]]

--- (可选) 超参数调优: 逻辑回归 ---
开始逻辑回归 GridSearchCV 搜索最佳参数...
Fitting 5 folds for each of 10 candidates, totalling 50 fits

逻辑回归 GridSearchCV 完成。
最佳参数组

In [8]:
# --- 代码接续之前的输出 ---

# --- 1. 回退特征集到第一次成功调优GBM时的状态 ---
# 我们需要重新加载和预处理数据，但不包括 Deck 和 Age_Group
print("\n--- 1. 重新加载和预处理基础特征集 ---")
try:
    df_base = pd.read_csv('train.csv')
    df_processed_base = df_base.copy() # 使用新名字以区分
    # ... (此处为第一次成功的预处理代码，不包括Deck和Age_Group的添加) ...
    embarked_mode = df_processed_base['Embarked'].mode()[0]
    df_processed_base['Embarked'] = df_processed_base['Embarked'].fillna(embarked_mode)
    df_processed_base['Age'] = df_processed_base.groupby(['Pclass', 'Sex'])['Age'].transform(lambda x: x.fillna(x.median()))
    if df_processed_base['Age'].isnull().any():
        age_global_median = df_processed_base['Age'].median()
        df_processed_base['Age'] = df_processed_base['Age'].fillna(age_global_median)
    df_processed_base.drop('Cabin', axis=1, inplace=True)
    df_processed_base['Sex'] = df_processed_base['Sex'].map({'male': 0, 'female': 1}).astype(int) # male:0, female:1
    df_processed_base = pd.get_dummies(df_processed_base, columns=['Embarked'], prefix='Embarked', drop_first=True)
    df_processed_base['Title'] = df_processed_base['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    common_titles = ['Mr', 'Miss', 'Mrs', 'Master']
    df_processed_base['Title'] = df_processed_base['Title'].apply(lambda x: x if x in common_titles else 'Rare')
    df_processed_base['Title'] = df_processed_base['Title'].replace(['Mlle', 'Ms'], 'Miss')
    df_processed_base['Title'] = df_processed_base['Title'].replace('Mme', 'Mrs')
    rare_titles = ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    df_processed_base['Title'] = df_processed_base['Title'].replace(rare_titles, 'Rare')
    df_processed_base = pd.get_dummies(df_processed_base, columns=['Title'], prefix='Title', drop_first=True)
    df_processed_base['FamilySize'] = df_processed_base['SibSp'] + df_processed_base['Parch'] + 1
    df_processed_base['IsAlone'] = 0
    df_processed_base.loc[df_processed_base['FamilySize'] == 1, 'IsAlone'] = 1
    df_processed_base.drop(['SibSp', 'Parch'], axis=1, inplace=True)
    median_fares_by_pclass = df_processed_base.groupby('Pclass')['Fare'].median()
    for p_class in median_fares_by_pclass.index:
        condition = (df_processed_base['Fare'] == 0) & (df_processed_base['Pclass'] == p_class)
        if median_fares_by_pclass[p_class] > 0:
            df_processed_base.loc[condition, 'Fare'] = median_fares_by_pclass[p_class]
        else:
            df_processed_base.loc[condition, 'Fare'] = np.finfo(float).eps
    if (df_processed_base['Fare'] == 0).any():
        df_processed_base.loc[df_processed_base['Fare'] == 0, 'Fare'] = np.finfo(float).eps
    df_processed_base['Fare'] = np.log1p(df_processed_base['Fare'])
    columns_to_drop_base = ['PassengerId', 'Name', 'Ticket'] # Ticket暂时删除，后面会基于原始Ticket创建新特征
    df_processed_base.drop(columns_to_drop_base, axis=1, inplace=True)
    print("基础特征集预处理完成。")
except Exception as e:
    print(f"基础特征集预处理过程中发生错误: {e}")
    exit()

# --- 2. 添加新的特征工程 (Ticket, Sex*Pclass) ---
print("\n--- 2. 添加新的特征工程 (Ticket, Sex*Pclass) ---")
df_original_for_new_ft = pd.read_csv('train.csv') # 原始数据用于提取Ticket等

# Ticket Prefix
def extract_ticket_prefix(ticket_str):
    parts = ticket_str.upper().replace('.', '').replace('/', '').split(' ')
    prefix = []
    for part in parts:
        if not part.isdigit():
            # 移除所有数字，只保留字母部分
            cleaned_part = ''.join(filter(str.isalpha, part))
            if cleaned_part:
                prefix.append(cleaned_part)
    if not prefix:
        return 'NUM'
    return '_'.join(prefix)

df_processed_base['Ticket_Prefix'] = df_original_for_new_ft['Ticket'].apply(extract_ticket_prefix)
# 处理稀有前缀 (例如，只保留出现次数 > N 的前缀)
prefix_counts = df_processed_base['Ticket_Prefix'].value_counts()
common_ticket_prefixes = prefix_counts[prefix_counts > 5].index # 保留出现超过5次的前缀
df_processed_base['Ticket_Prefix'] = df_processed_base['Ticket_Prefix'].apply(lambda x: x if x in common_ticket_prefixes else 'OTHER_TICKET')
df_processed_base = pd.get_dummies(df_processed_base, columns=['Ticket_Prefix'], prefix='TicketPrefix', drop_first=True)
print("Ticket_Prefix 特征已创建并编码。")

# Shared Ticket Count
ticket_counts_map = df_original_for_new_ft['Ticket'].value_counts().to_dict()
df_processed_base['Shared_Ticket_Count'] = df_original_for_new_ft['Ticket'].map(ticket_counts_map).fillna(1)
print("Shared_Ticket_Count 特征已创建。")

# Sex * Pclass Interaction (Pclass是1,2,3; Sex是0(male),1(female))
# 为了避免数值过大，可以先对Pclass进行某种映射或直接使用，或将Pclass也独热编码后再交互
# 这里简单用乘积，如果Pclass是类别，更好的方式是独热后交互
df_processed_base['Sex_Pclass_Interact'] = df_processed_base['Sex'] * df_processed_base['Pclass']
# 另一种交互：为每个 Pclass 创建一个 Sex_in_PclassX 特征
# df_processed_base['Female_In_Pclass1'] = (df_processed_base['Sex'] == 1) & (df_processed_base['Pclass'] == 1).astype(int)
# df_processed_base['Female_In_Pclass2'] = (df_processed_base['Sex'] == 1) & (df_processed_base['Pclass'] == 2).astype(int)
# df_processed_base['Female_In_Pclass3'] = (df_processed_base['Sex'] == 1) & (df_processed_base['Pclass'] == 3).astype(int)
print("Sex_Pclass_Interact 特征已创建。")


# --- 3. 重新准备数据 (加入新特征后) ---
print("\n--- 3. 重新准备数据 (再次加入新特征后) ---")
X_v3 = df_processed_base.drop('Survived', axis=1)
y_v3 = df_processed_base['Survived']

X_train_full_v3, X_test_v3, y_train_full_v3, y_test_v3 = train_test_split(
    X_v3, y_v3, test_size=0.2, random_state=42, stratify=y_v3
)
print(f"V3 特征 - 完整训练集样本数: {X_train_full_v3.shape[0]}, 测试集样本数: {X_test_v3.shape[0]}")
print(f"V3 特征数量: {X_train_full_v3.shape[1]}")

scaler_v3 = StandardScaler()
X_train_full_scaled_v3 = scaler_v3.fit_transform(X_train_full_v3)
X_test_scaled_v3 = scaler_v3.transform(X_test_v3)
X_train_full_scaled_v3 = pd.DataFrame(X_train_full_scaled_v3, columns=X_v3.columns, index=X_train_full_v3.index)
X_test_scaled_v3 = pd.DataFrame(X_test_scaled_v3, columns=X_v3.columns, index=X_test_v3.index)
print("V3 特征集已标准化。")

# --- 4. 重新调优梯度提升机 (V3特征集) ---
print("\n--- 4. 重新调优梯度提升机 (V3 特征集) ---")
param_grid_gb_v3 = { # 可以从上次GBM调优的最佳参数附近开始搜索
    'n_estimators': [100, 150, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4], # 鉴于上次max_depth=3表现不错
    'subsample': [0.7, 0.8, 0.9] # 重新加入subsample
}
cv_strategy_v3 = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search_gb_v3 = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=param_grid_gb_v3,
    scoring='roc_auc', cv=cv_strategy_v3, n_jobs=-1, verbose=1
)
print("开始V3特征集上的 GridSearchCV 搜索...")
grid_search_gb_v3.fit(X_train_full_scaled_v3, y_train_full_v3)

print("\nV3特征集 GridSearchCV 完成。")
print(f"最佳参数组合 (V3特征): {grid_search_gb_v3.best_params_}")
print(f"使用这些参数在交叉验证中的最佳 ROC AUC 分数 (V3特征): {grid_search_gb_v3.best_score_:.4f}")

best_gb_model_v3 = grid_search_gb_v3.best_estimator_
y_pred_best_gb_v3 = best_gb_model_v3.predict(X_test_scaled_v3)
y_pred_proba_best_gb_v3 = best_gb_model_v3.predict_proba(X_test_scaled_v3)[:, 1]
accuracy_best_gb_v3 = accuracy_score(y_test_v3, y_pred_best_gb_v3)
report_best_gb_v3 = classification_report(y_test_v3, y_pred_best_gb_v3, target_names=['未生还 (0)', '生还 (1)'], zero_division=0)
cm_best_gb_v3 = confusion_matrix(y_test_v3, y_pred_best_gb_v3)
auc_best_gb_v3 = roc_auc_score(y_test_v3, y_pred_proba_best_gb_v3)

print(f"\nV3特征+调优后梯度提升机 - 准确率 (Accuracy): {accuracy_best_gb_v3:.4f}")
print(f"V3特征+调优后梯度提升机 - ROC AUC 分数: {auc_best_gb_v3:.4f}")
print("V3特征+调优后梯度提升机 - 分类报告:\n", report_best_gb_v3)
print("V3特征+调优后梯度提升机 - 混淆矩阵:\n", cm_best_gb_v3)

print("\n新一轮特征工程与模型再调优尝试完成。")


--- 1. 重新加载和预处理基础特征集 ---
基础特征集预处理完成。

--- 2. 添加新的特征工程 (Ticket, Sex*Pclass) ---
Ticket_Prefix 特征已创建并编码。
Shared_Ticket_Count 特征已创建。
Sex_Pclass_Interact 特征已创建。

--- 3. 重新准备数据 (再次加入新特征后) ---
V3 特征 - 完整训练集样本数: 712, 测试集样本数: 179
V3 特征数量: 23
V3 特征集已标准化。

--- 4. 重新调优梯度提升机 (V3 特征集) ---
开始V3特征集上的 GridSearchCV 搜索...
Fitting 5 folds for each of 54 candidates, totalling 270 fits

V3特征集 GridSearchCV 完成。
最佳参数组合 (V3特征): {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
使用这些参数在交叉验证中的最佳 ROC AUC 分数 (V3特征): 0.8986

V3特征+调优后梯度提升机 - 准确率 (Accuracy): 0.8156
V3特征+调优后梯度提升机 - ROC AUC 分数: 0.8521
V3特征+调优后梯度提升机 - 分类报告:
               precision    recall  f1-score   support

     未生还 (0)       0.82      0.90      0.86       110
      生还 (1)       0.81      0.68      0.74        69

    accuracy                           0.82       179
   macro avg       0.81      0.79      0.80       179
weighted avg       0.82      0.82      0.81       179

V3特征+调优后梯度提升机 - 混淆矩阵:
 [[99 11]
 [22 47]]

新一轮

In [9]:
# --- 代码接续之前的V3特征集输出 ---
import re # 确保导入
from sklearn.feature_selection import RFECV # 确保导入
from sklearn.model_selection import StratifiedKFold # 确保导入

# --- 1. 回退到 V3 之前的 df_processed_base，然后添加 V3 特征和改进 Sex*Pclass ---
print("\n--- 1. 构建 V3.1 特征集 (改进 Sex*Pclass) ---")
try:
    df_base = pd.read_csv('train.csv')
    df_processed_v31 = df_base.copy() # 使用新名字
    # ... (此处为第一次成功的预处理代码，不包括Deck和Age_Group的添加) ...
    embarked_mode = df_processed_v31['Embarked'].mode()[0]
    df_processed_v31['Embarked'] = df_processed_v31['Embarked'].fillna(embarked_mode)
    df_processed_v31['Age'] = df_processed_v31.groupby(['Pclass', 'Sex'])['Age'].transform(lambda x: x.fillna(x.median()))
    if df_processed_v31['Age'].isnull().any():
        age_global_median = df_processed_v31['Age'].median()
        df_processed_v31['Age'] = df_processed_v31['Age'].fillna(age_global_median)
    df_processed_v31.drop('Cabin', axis=1, inplace=True)
    df_processed_v31['Sex'] = df_processed_v31['Sex'].map({'male': 0, 'female': 1}).astype(int)
    df_processed_v31 = pd.get_dummies(df_processed_v31, columns=['Embarked'], prefix='Embarked', drop_first=True)
    df_processed_v31['Title'] = df_processed_v31['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    common_titles = ['Mr', 'Miss', 'Mrs', 'Master']
    df_processed_v31['Title'] = df_processed_v31['Title'].apply(lambda x: x if x in common_titles else 'Rare')
    df_processed_v31['Title'] = df_processed_v31['Title'].replace(['Mlle', 'Ms'], 'Miss')
    df_processed_v31['Title'] = df_processed_v31['Title'].replace('Mme', 'Mrs')
    rare_titles = ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    df_processed_v31['Title'] = df_processed_v31['Title'].replace(rare_titles, 'Rare')
    df_processed_v31 = pd.get_dummies(df_processed_v31, columns=['Title'], prefix='Title', drop_first=True)
    df_processed_v31['FamilySize'] = df_processed_v31['SibSp'] + df_processed_v31['Parch'] + 1
    df_processed_v31['IsAlone'] = 0
    df_processed_v31.loc[df_processed_v31['FamilySize'] == 1, 'IsAlone'] = 1
    df_processed_v31.drop(['SibSp', 'Parch'], axis=1, inplace=True)
    # Fare处理保持不变 (log1p)
    median_fares_by_pclass_v31 = df_processed_v31.groupby('Pclass')['Fare'].median()
    for p_class_v31 in median_fares_by_pclass_v31.index:
        condition_v31 = (df_processed_v31['Fare'] == 0) & (df_processed_v31['Pclass'] == p_class_v31)
        if median_fares_by_pclass_v31[p_class_v31] > 0:
            df_processed_v31.loc[condition_v31, 'Fare'] = median_fares_by_pclass_v31[p_class_v31]
        else:
            df_processed_v31.loc[condition_v31, 'Fare'] = np.finfo(float).eps
    if (df_processed_v31['Fare'] == 0).any():
        df_processed_v31.loc[df_processed_v31['Fare'] == 0, 'Fare'] = np.finfo(float).eps
    df_processed_v31['Fare'] = np.log1p(df_processed_v31['Fare'])

    # Ticket Prefix (与V3相同)
    def extract_ticket_prefix_v31(ticket_str):
        parts = ticket_str.upper().replace('.', '').replace('/', '').split(' ')
        prefix = []
        for part in parts:
            if not part.isdigit():
                cleaned_part = ''.join(filter(str.isalpha, part))
                if cleaned_part:
                    prefix.append(cleaned_part)
        if not prefix: return 'NUM'
        return '_'.join(prefix)
    df_processed_v31['Ticket_Prefix'] = df_base['Ticket'].apply(extract_ticket_prefix_v31) # 使用原始df_base获取Ticket
    prefix_counts_v31 = df_processed_v31['Ticket_Prefix'].value_counts()
    common_ticket_prefixes_v31 = prefix_counts_v31[prefix_counts_v31 > 5].index
    df_processed_v31['Ticket_Prefix'] = df_processed_v31['Ticket_Prefix'].apply(lambda x: x if x in common_ticket_prefixes_v31 else 'OTHER_TICKET')
    df_processed_v31 = pd.get_dummies(df_processed_v31, columns=['Ticket_Prefix'], prefix='TicketPrefix', drop_first=True)

    # Shared Ticket Count (与V3相同)
    ticket_counts_map_v31 = df_base['Ticket'].value_counts().to_dict()
    df_processed_v31['Shared_Ticket_Count'] = df_base['Ticket'].map(ticket_counts_map_v31).fillna(1)

    # 改进 Sex * Pclass Interaction
    pclass_dummies_v31 = pd.get_dummies(df_processed_v31['Pclass'], prefix='PclassD', drop_first=False) # PclassD to avoid clash if Pclass exists
    df_processed_v31 = pd.concat([df_processed_v31, pclass_dummies_v31], axis=1)
    df_processed_v31['Female_In_Pclass1'] = df_processed_v31['Sex'] * df_processed_v31['PclassD_1']
    df_processed_v31['Female_In_Pclass2'] = df_processed_v31['Sex'] * df_processed_v31['PclassD_2']
    df_processed_v31['Female_In_Pclass3'] = df_processed_v31['Sex'] * df_processed_v31['PclassD_3']
    # 现在可以删除原始的 Pclass 列，因为它已被独热编码和交互特征所代表
    # 也删除独热编码的PclassD列，因为它们的作用已经通过交互特征体现
    df_processed_v31.drop(['Pclass', 'PclassD_1', 'PclassD_2', 'PclassD_3'], axis=1, inplace=True, errors='ignore')
    print("V3.1 特征集 (含改进的Sex*Pclass交互) 构建完成。")

    # (可选) Fare_Per_Person
    df_processed_v31['Fare_Raw'] = df_base['Fare'] # 原始Fare
    df_processed_v31['Fare_Per_Person'] = df_processed_v31['Fare_Raw'] / df_processed_v31['Shared_Ticket_Count']
    df_processed_v31['Fare_Per_Person'].replace([np.inf, -np.inf], np.nan, inplace=True)
    fare_per_person_median_v31 = df_processed_v31.groupby(df_base['Pclass'])['Fare_Per_Person'].transform('median') # Group by original Pclass
    df_processed_v31['Fare_Per_Person'].fillna(fare_per_person_median_v31, inplace=True)
    df_processed_v31['Fare_Per_Person'].fillna(df_processed_v31['Fare_Per_Person'].median(), inplace=True)
    df_processed_v31['Fare_Per_Person'] = np.log1p(df_processed_v31['Fare_Per_Person'])
    df_processed_v31.drop('Fare_Raw', axis=1, inplace=True)
    print("Fare_Per_Person 特征已添加。")


    # 删除原始标识符和不再需要的列
    columns_to_drop_v31 = ['PassengerId', 'Name', 'Ticket']
    df_processed_v31.drop(columns_to_drop_v31, axis=1, inplace=True, errors='ignore')

except Exception as e:
    print(f"V3.1 特征集构建过程中发生错误: {e}")
    raise e # 重新抛出异常以便调试


# --- 2. 准备数据 (V3.1 特征集) ---
print("\n--- 2. 准备数据 (V3.1 特征集) ---")
X_v31 = df_processed_v31.drop('Survived', axis=1)
y_v31 = df_processed_v31['Survived']

X_train_full_v31, X_test_v31, y_train_full_v31, y_test_v31 = train_test_split(
    X_v31, y_v31, test_size=0.2, random_state=42, stratify=y_v31
)
print(f"V3.1 特征 - 完整训练集样本数: {X_train_full_v31.shape[0]}, 测试集样本数: {X_test_v31.shape[0]}")
current_num_features_v31 = X_train_full_v31.shape[1]
print(f"V3.1 当前特征数量: {current_num_features_v31}")


scaler_v31 = StandardScaler()
X_train_full_scaled_v31 = scaler_v31.fit_transform(X_train_full_v31)
X_test_scaled_v31 = scaler_v31.transform(X_test_v31)
# 保留列名，方便RFECV后查看
X_train_full_scaled_df_v31 = pd.DataFrame(X_train_full_scaled_v31, columns=X_v31.columns, index=X_train_full_v31.index)
X_test_scaled_df_v31 = pd.DataFrame(X_test_scaled_v31, columns=X_v31.columns, index=X_test_v31.index)
print("V3.1 特征集已标准化。")


# --- 3. 应用 RFECV 进行特征选择 ---
print("\n--- 3. 应用 RFECV (V3.1 特征集) ---")
# 使用上一轮V3调优的GBM最佳参数作为RFECV的评估器
# 或者使用一组鲁棒的默认参数，如果担心过拟合到特定参数集
estimator_for_rfe_v31 = GradientBoostingClassifier(
    learning_rate=0.05, max_depth=3, n_estimators=200, subsample=0.8, # 来自V3最佳参数
    random_state=42
)
# 如果特征数量过多，RFECV会很慢。可以考虑先用SelectKBest等方法初筛。
# min_features_to_select 可以设置一个下限
selector_v31 = RFECV(
    estimator_for_rfe_v31,
    step=1,
    cv=StratifiedKFold(3, shuffle=True, random_state=42), # 用3折加速，实际可调整
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
    min_features_to_select=max(1, int(current_num_features_v31 * 0.5)) # 至少选择一半特征，或至少1个
)

print("开始 RFECV 特征选择...")
selector_v31.fit(X_train_full_scaled_df_v31, y_train_full_v31) # 使用DataFrame以保留列名

print("\nRFECV 完成。")
print(f"RFECV 选择了 {selector_v31.n_features_} 个特征。")
selected_features_mask_v31 = selector_v31.support_
selected_feature_names_v31 = X_v31.columns[selected_features_mask_v31]
print("选择的特征名:", selected_feature_names_v31.tolist())

# 获取筛选后的特征数据
X_train_full_selected_v31 = X_train_full_scaled_df_v31[selected_feature_names_v31]
X_test_selected_v31 = X_test_scaled_df_v31[selected_feature_names_v31]


# --- 4. 在筛选后的特征集 (V3.1-selected) 上重新调优GBM ---
print("\n--- 4. 重新调优GBM (V3.1-selected 特征集) ---")
# 参数网格可以与之前相似，或者根据特征数量调整
param_grid_gb_v31_selected = {
    'n_estimators': [100, 150, 200, 250],
    'learning_rate': [0.01, 0.025, 0.05, 0.1],
    'max_depth': [2, 3, 4], # 减少特征后，深度可能不需要太深
    'subsample': [0.7, 0.8, 0.9, 1.0]
}
cv_strategy_v31_selected = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search_gb_v31_selected = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=param_grid_gb_v31_selected,
    scoring='roc_auc', cv=cv_strategy_v31_selected, n_jobs=-1, verbose=1
)
print("开始 V3.1-selected 特征集上的 GridSearchCV 搜索...")
grid_search_gb_v31_selected.fit(X_train_full_selected_v31, y_train_full_v31)

print("\nV3.1-selected 特征集 GridSearchCV 完成。")
print(f"最佳参数组合 (V3.1-selected): {grid_search_gb_v31_selected.best_params_}")
print(f"CV中的最佳 ROC AUC (V3.1-selected): {grid_search_gb_v31_selected.best_score_:.4f}")

best_gb_model_v31_selected = grid_search_gb_v31_selected.best_estimator_
y_pred_gb_v31_selected = best_gb_model_v31_selected.predict(X_test_selected_v31)
y_pred_proba_gb_v31_selected = best_gb_model_v31_selected.predict_proba(X_test_selected_v31)[:, 1]
accuracy_gb_v31_selected = accuracy_score(y_test_v31, y_pred_gb_v31_selected)
report_gb_v31_selected = classification_report(y_test_v31, y_pred_gb_v31_selected, target_names=['未生还 (0)', '生还 (1)'], zero_division=0)
cm_gb_v31_selected = confusion_matrix(y_test_v31, y_pred_gb_v31_selected)
auc_gb_v31_selected = roc_auc_score(y_test_v31, y_pred_proba_gb_v31_selected)

print(f"\nV3.1-selected + 调优后GBM - 准确率: {accuracy_gb_v31_selected:.4f}")
print(f"V3.1-selected + 调优后GBM - ROC AUC: {auc_gb_v31_selected:.4f}")
print("V3.1-selected + 调优后GBM - 分类报告:\n", report_gb_v31_selected)
print("V3.1-selected + 调优后GBM - 混淆矩阵:\n", cm_gb_v31_selected)

print("\n又一轮特征工程、选择与模型再调优尝试完成。")


--- 1. 构建 V3.1 特征集 (改进 Sex*Pclass) ---
V3.1 特征集 (含改进的Sex*Pclass交互) 构建完成。
Fare_Per_Person 特征已添加。

--- 2. 准备数据 (V3.1 特征集) ---
V3.1 特征 - 完整训练集样本数: 712, 测试集样本数: 179
V3.1 当前特征数量: 25
V3.1 特征集已标准化。

--- 3. 应用 RFECV (V3.1 特征集) ---
开始 RFECV 特征选择...
Fitting estimator with 25 features.
Fitting estimator with 25 features.
Fitting estimator with 25 features.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed_v31['Fare_Per_Person'].replace([np.inf, -np.inf], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed_v31['Fare_Per_Person'].fillna(fare_per_person_median_v31, inplace=True)
The behavior will change in pandas 3.0. This inplace method wi

Fitting estimator with 24 features.
Fitting estimator with 24 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 23 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 22 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 21 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 20 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 19 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 18 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 17 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 16 features.
Fitting estimator with 16 features.
Fitting estimator with 15 fe

In [10]:
# --- 代码接续之前的V3.1特征集输出 ---
import xgboost as xgb # 确保导入
# import lightgbm as lgb # 如果想尝试LightGBM

# --- Plan A: 在 V3 特征集上尝试 XGBoost ---
print("\n--- Plan A: 在 V3 特征集上尝试 XGBoost ---")

# V3 特征集数据 (X_train_full_scaled_v3, y_train_full_v3, X_test_scaled_v3, y_test_v3)
# 这些是在上一轮的第3步准备好的数据

# 定义XGBoost的参数网格
# 这个网格可以根据需要调整，XGBoost参数很多
param_grid_xgb_v3 = {
    'n_estimators': [100, 150, 200, 250],
    'learning_rate': [0.01, 0.025, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0], # 每棵树的列采样比例
    'gamma': [0, 0.1, 0.2], # 最小分割增益
    # 'reg_alpha': [0, 0.01, 0.1], # L1 正则化
    # 'reg_lambda': [0.1, 1, 10], # L2 正则化
}
# 为了快速演示，使用一个更小的网格
param_grid_xgb_v3_simple = {
    'n_estimators': [150, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 4],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
    # 'gamma': [0.1]
}


xgb_model_v3 = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',      # 使用AUC作为评估指标 (XGBoost内部的)
    use_label_encoder=False,# 推荐设置为False避免警告
    random_state=42
)

cv_strategy_xgb_v3 = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) # 5折

grid_search_xgb_v3 = GridSearchCV(
    estimator=xgb_model_v3,
    param_grid=param_grid_xgb_v3_simple, # 使用简化网格
    scoring='roc_auc', # Scikit-learn GridSearchCV的评估指标
    cv=cv_strategy_xgb_v3,
    n_jobs=-1,
    verbose=1
)

print("开始 XGBoost (V3特征集) 的 GridSearchCV 搜索...")
# X_train_full_scaled_v3 是Numpy数组，XGBoost可以直接处理
grid_search_xgb_v3.fit(X_train_full_scaled_v3, y_train_full_v3)

print("\nXGBoost (V3特征集) GridSearchCV 完成。")
print(f"最佳参数组合 (XGBoost V3): {grid_search_xgb_v3.best_params_}")
print(f"CV中的最佳 ROC AUC (XGBoost V3): {grid_search_xgb_v3.best_score_:.4f}")

best_xgb_model_v3 = grid_search_xgb_v3.best_estimator_
y_pred_xgb_v3 = best_xgb_model_v3.predict(X_test_scaled_v3)
y_pred_proba_xgb_v3 = best_xgb_model_v3.predict_proba(X_test_scaled_v3)[:, 1]
accuracy_xgb_v3 = accuracy_score(y_test_v3, y_pred_xgb_v3)
report_xgb_v3 = classification_report(y_test_v3, y_pred_xgb_v3, target_names=['未生还 (0)', '生还 (1)'], zero_division=0)
cm_xgb_v3 = confusion_matrix(y_test_v3, y_pred_xgb_v3)
auc_xgb_v3 = roc_auc_score(y_test_v3, y_pred_proba_xgb_v3)

print(f"\nXGBoost (V3特征集) + 调优 - 准确率: {accuracy_xgb_v3:.4f}")
print(f"XGBoost (V3特征集) + 调优 - ROC AUC: {auc_xgb_v3:.4f}")
print("XGBoost (V3特征集) + 调优 - 分类报告:\n", report_xgb_v3)
print("XGBoost (V3特征集) + 调优 - 混淆矩阵:\n", cm_xgb_v3)

print("\n尝试XGBoost模型完成。")


--- Plan A: 在 V3 特征集上尝试 XGBoost ---
开始 XGBoost (V3特征集) 的 GridSearchCV 搜索...


AttributeError: 'super' object has no attribute '__sklearn_tags__'

In [8]:
# --- 代码接续之前的XGBoost输出 ---
from sklearn.ensemble import VotingClassifier # 确保导入
# 假设我们有以下之前训练好的最佳模型和它们对应的scaler及特征列名：

# 模型1: 第一次调优的逻辑回归 (假设保存在 best_lr_model_initial)
# 特征集: X_train_full_scaled, X_test_scaled (最开始的特征集)
# scaler: scaler (最开始的scaler)
# (你需要从之前的代码段中获取这些对象，或者重新运行那部分以得到它们)
# 为了演示，我们先假设这些对象存在

# 模型2: V3特征集上调优的XGBoost (best_xgb_model_v3)
# 特征集: X_train_full_scaled_v3, X_test_scaled_v3 (V3特征集)
# scaler: scaler_v3 (V3特征集的scaler)

# --- 准备投票所需的模型和数据 ---
# 你需要确保这些模型和数据是之前运行得到的，并且与它们训练时一致

# 示例：重新获取/定义第一次调优的逻辑回归 (你需要替换成你实际的代码/对象)
try:
    # --- 模拟获取第一次调优的逻辑回归 ---
    # (这部分代码需要你根据你之前的脚本调整或确保对象已存在)
    # 假设 df_processed_initial 是第一次成功预处理的基础特征集 (不含V3等后续添加的)
    # scaler_initial 是对应的scaler
    # X_train_full_initial, X_test_initial, y_train_full_initial, y_test_initial 是对应的数据划分
    # best_lr_model_initial 是调优后的逻辑回归模型

    # 为了代码能跑通，我们简单地重新跑一次逻辑回归的训练，用它之前的最佳参数
    # 实际中，你应该加载已保存的模型或确保对象在内存中
    print("\n--- (模拟)准备第一次调优的逻辑回归模型 ---")
    df_initial_temp = pd.read_csv('train.csv')
    df_processed_initial = df_initial_temp.copy()
    embarked_mode_init = df_processed_initial['Embarked'].mode()[0]
    df_processed_initial['Embarked'] = df_processed_initial['Embarked'].fillna(embarked_mode_init)
    df_processed_initial['Age'] = df_processed_initial.groupby(['Pclass', 'Sex'])['Age'].transform(lambda x: x.fillna(x.median()))
    if df_processed_initial['Age'].isnull().any():
        df_processed_initial['Age'] = df_processed_initial['Age'].fillna(df_processed_initial['Age'].median())
    df_processed_initial.drop('Cabin', axis=1, inplace=True)
    df_processed_initial['Sex'] = df_processed_initial['Sex'].map({'male': 0, 'female': 1}).astype(int)
    df_processed_initial = pd.get_dummies(df_processed_initial, columns=['Embarked'], prefix='Embarked', drop_first=True)
    df_processed_initial['Title'] = df_processed_initial['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    common_titles_init = ['Mr', 'Miss', 'Mrs', 'Master']
    df_processed_initial['Title'] = df_processed_initial['Title'].apply(lambda x: x if x in common_titles_init else 'Rare')
    df_processed_initial['Title'] = df_processed_initial['Title'].replace(['Mlle', 'Ms'], 'Miss')
    df_processed_initial['Title'] = df_processed_initial['Title'].replace('Mme', 'Mrs')
    rare_titles_init = ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    df_processed_initial['Title'] = df_processed_initial['Title'].replace(rare_titles_init, 'Rare')
    df_processed_initial = pd.get_dummies(df_processed_initial, columns=['Title'], prefix='Title', drop_first=True)
    df_processed_initial['FamilySize'] = df_processed_initial['SibSp'] + df_processed_initial['Parch'] + 1
    df_processed_initial['IsAlone'] = 0
    df_processed_initial.loc[df_processed_initial['FamilySize'] == 1, 'IsAlone'] = 1
    df_processed_initial.drop(['SibSp', 'Parch'], axis=1, inplace=True)
    median_fares_by_pclass_init = df_processed_initial.groupby('Pclass')['Fare'].median()
    for p_class_init in median_fares_by_pclass_init.index:
        condition_init = (df_processed_initial['Fare'] == 0) & (df_processed_initial['Pclass'] == p_class_init)
        if median_fares_by_pclass_init[p_class_init] > 0: df_processed_initial.loc[condition_init, 'Fare'] = median_fares_by_pclass_init[p_class_init]
        else: df_processed_initial.loc[condition_init, 'Fare'] = np.finfo(float).eps
    if (df_processed_initial['Fare'] == 0).any(): df_processed_initial.loc[df_processed_initial['Fare'] == 0, 'Fare'] = np.finfo(float).eps
    df_processed_initial['Fare'] = np.log1p(df_processed_initial['Fare'])
    df_processed_initial.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)

    X_initial = df_processed_initial.drop('Survived', axis=1)
    y_initial = df_processed_initial['Survived']
    X_train_full_initial, X_test_initial, y_train_full_initial, y_test_initial = train_test_split(X_initial, y_initial, test_size=0.2, random_state=42, stratify=y_initial)
    scaler_initial = StandardScaler().fit(X_train_full_initial)
    X_train_full_scaled_initial = scaler_initial.transform(X_train_full_initial)
    X_test_scaled_initial = scaler_initial.transform(X_test_initial)

    # 使用之前找到的最佳LR参数 {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
    from sklearn.linear_model import LogisticRegression # 确保导入
    best_lr_model_initial = LogisticRegression(C=1, penalty='l2', solver='liblinear', random_state=42, max_iter=2000)
    best_lr_model_initial.fit(X_train_full_scaled_initial, y_train_full_initial)
    print("模拟的初始最佳逻辑回归模型已准备。")

except Exception as e:
    print(f"准备初始逻辑回归模型时出错: {e}. 请确保相关对象和数据可用。")
    best_lr_model_initial = None # 设置为None，后续会跳过投票

# 模型2: V3特征集上调优的XGBoost (best_xgb_model_v3)
# 这个模型和对应的数据 (X_train_full_scaled_v3, y_train_full_v3, X_test_scaled_v3, y_test_v3)
# 应该直接从你上一段代码的运行结果中获得。

if best_lr_model_initial and 'best_xgb_model_v3' in locals():
    print("\n--- 实施 Voting Classifier ---")
    # 注意：VotingClassifier的fit方法需要所有基础模型使用相同的训练数据（或至少是相同长度和索引的y）
    # 这里我们用 y_train_full_initial (或 y_train_full_v3，它们应该是一样的，因为random_state和stratify相同)
    # 但是，每个基础模型在内部fit时，会使用它自己对应的X_train_scaled
    # 更安全的做法是，让VotingClassifier在未缩放的数据上fit，并在Pipeline中包含缩放
    # 或者，确保所有模型在predict_proba时接收到它们各自对应的X_test_scaled

    # 为了简单起见，我们先创建一个包装器，确保每个模型用自己的数据预测
    # 但标准的VotingClassifier.fit() 是在一个统一的X,y上训练的
    # 一个更标准的做法是，如果特征集不同，则不能直接用Sklearn的VotingClassifier
    # 除非我们只在测试集上做投票，而不是重新fit VotingClassifier

    # 策略：在测试集上手动进行软投票 (因为特征集不同，不能直接用VotingClassifier.fit)
    print("由于特征集不同，将进行手动软投票...")

    # 预测概率
    # 逻辑回归在它的测试集 X_test_scaled_initial 上预测
    proba_lr = best_lr_model_initial.predict_proba(X_test_scaled_initial)
    # XGBoost 在它的测试集 X_test_scaled_v3 上预测
    proba_xgb = best_xgb_model_v3.predict_proba(X_test_scaled_v3)

    # 确保y_test是一致的 (y_test_initial 和 y_test_v3 应该是一样的)
    y_test_for_voting = y_test_initial # 或 y_test_v3

    # 软投票: 平均概率 (只取正类的概率)
    avg_proba = (proba_lr[:, 1] + proba_xgb[:, 1]) / 2.0
    # 将平均概率转换为类别预测 (阈值0.5)
    y_pred_vote = (avg_proba >= 0.5).astype(int)

    accuracy_vote = accuracy_score(y_test_for_voting, y_pred_vote)
    report_vote = classification_report(y_test_for_voting, y_pred_vote, target_names=['未生还 (0)', '生还 (1)'], zero_division=0)
    cm_vote = confusion_matrix(y_test_for_voting, y_pred_vote)
    auc_vote = roc_auc_score(y_test_for_voting, avg_proba) # 用平均概率计算AUC

    print(f"\n手动软投票 - 准确率: {accuracy_vote:.4f}")
    print(f"手动软投票 - ROC AUC: {auc_vote:.4f}")
    print("手动软投票 - 分类报告:\n", report_vote)
    print("手动软投票 - 混淆矩阵:\n", cm_vote)
else:
    print("\n跳过Voting Classifier，因为一个或多个基础模型未成功准备。")

print("\nVoting Classifier 尝试完成。")


--- (模拟)准备第一次调优的逻辑回归模型 ---
模拟的初始最佳逻辑回归模型已准备。

--- 实施 Voting Classifier ---
由于特征集不同，将进行手动软投票...

手动软投票 - 准确率: 0.8324
手动软投票 - ROC AUC: 0.8685
手动软投票 - 分类报告:
               precision    recall  f1-score   support

     未生还 (0)       0.84      0.90      0.87       110
      生还 (1)       0.82      0.72      0.77        69

    accuracy                           0.83       179
   macro avg       0.83      0.81      0.82       179
weighted avg       0.83      0.83      0.83       179

手动软投票 - 混淆矩阵:
 [[99 11]
 [19 50]]

Voting Classifier 尝试完成。


In [9]:
# --- 代码接续之前的Voting Classifier输出 ---
from sklearn.linear_model import LogisticRegression # 确保导入
from sklearn.model_selection import GridSearchCV, StratifiedKFold # 确保导入

# --- 尝试将V3特征集应用于逻辑回归并调优 ---
print("\n--- 尝试将V3特征集应用于逻辑回归并调优 ---")

# 使用V3特征集的数据 (X_train_full_scaled_v3, y_train_full_v3, X_test_scaled_v3, y_test_v3)
# 这些是在尝试XGBoost之前准备好的

param_grid_lr_v3 = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100], # 与之前LR调优的网格相似
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'] # liblinear 支持 l1 和 l2
}

lr_model_v3 = LogisticRegression(random_state=42, max_iter=3000) # 增加max_iter以防不收敛

cv_strategy_lr_v3 = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search_lr_v3 = GridSearchCV(
    estimator=lr_model_v3,
    param_grid=param_grid_lr_v3,
    scoring='roc_auc',
    cv=cv_strategy_lr_v3,
    n_jobs=-1,
    verbose=1
)

print("开始逻辑回归 (V3特征集) 的 GridSearchCV 搜索...")
grid_search_lr_v3.fit(X_train_full_scaled_v3, y_train_full_v3) # 使用V3特征集训练

print("\n逻辑回归 (V3特征集) GridSearchCV 完成。")
print(f"最佳参数组合 (LR V3): {grid_search_lr_v3.best_params_}")
print(f"CV中的最佳 ROC AUC (LR V3): {grid_search_lr_v3.best_score_:.4f}")

best_lr_model_v3 = grid_search_lr_v3.best_estimator_
y_pred_lr_v3 = best_lr_model_v3.predict(X_test_scaled_v3) # 在V3特征集上测试
y_pred_proba_lr_v3 = best_lr_model_v3.predict_proba(X_test_scaled_v3)[:, 1]
accuracy_lr_v3 = accuracy_score(y_test_v3, y_pred_lr_v3) # y_test_v3 与 y_test_initial 应该相同
report_lr_v3 = classification_report(y_test_v3, y_pred_lr_v3, target_names=['未生还 (0)', '生还 (1)'], zero_division=0)
cm_lr_v3 = confusion_matrix(y_test_v3, y_pred_lr_v3)
auc_lr_v3 = roc_auc_score(y_test_v3, y_pred_proba_lr_v3)

print(f"\n逻辑回归 (V3特征集) + 调优 - 准确率: {accuracy_lr_v3:.4f}")
print(f"逻辑回归 (V3特征集) + 调优 - ROC AUC: {auc_lr_v3:.4f}")
print("逻辑回归 (V3特征集) + 调优 - 分类报告:\n", report_lr_v3)
print("逻辑回归 (V3特征集) + 调优 - 混淆矩阵:\n", cm_lr_v3)

print("\n尝试将V3特征集用于逻辑回归完成。")


--- 尝试将V3特征集应用于逻辑回归并调优 ---
开始逻辑回归 (V3特征集) 的 GridSearchCV 搜索...
Fitting 5 folds for each of 12 candidates, totalling 60 fits

逻辑回归 (V3特征集) GridSearchCV 完成。
最佳参数组合 (LR V3): {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
CV中的最佳 ROC AUC (LR V3): 0.8694

逻辑回归 (V3特征集) + 调优 - 准确率: 0.8324
逻辑回归 (V3特征集) + 调优 - ROC AUC: 0.8680
逻辑回归 (V3特征集) + 调优 - 分类报告:
               precision    recall  f1-score   support

     未生还 (0)       0.84      0.90      0.87       110
      生还 (1)       0.82      0.72      0.77        69

    accuracy                           0.83       179
   macro avg       0.83      0.81      0.82       179
weighted avg       0.83      0.83      0.83       179

逻辑回归 (V3特征集) + 调优 - 混淆矩阵:
 [[99 11]
 [19 50]]

尝试将V3特征集用于逻辑回归完成。


In [10]:
# --- 代码接续之前的LR(V3)输出 ---
import xgboost as xgb # 确保导入
from sklearn.model_selection import GridSearchCV, StratifiedKFold # 确保导入

# --- 在基础特征集上调优 XGBoost ---
print("\n--- 在基础特征集上调优 XGBoost ---")

# 使用初始的基础特征集数据 (X_train_full_scaled_initial, y_train_full_initial, X_test_scaled_initial, y_test_initial)
# 这些是在 (模拟)准备第一次调优的逻辑回归模型 时准备好的

# 定义XGBoost的参数网格 (与之前V3上的XGBoost网格类似或根据需要调整)
param_grid_xgb_initial_simple = {
    'n_estimators': [100, 150, 200], # 减少一些，因为特征可能更简单
    'learning_rate': [0.025, 0.05, 0.1],
    'max_depth': [3, 4, 5], # 基础特征可能不需要太深的树
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    # 'gamma': [0, 0.1]
}

xgb_model_initial = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    use_label_encoder=False,
    random_state=42
)

cv_strategy_xgb_initial = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search_xgb_initial = GridSearchCV(
    estimator=xgb_model_initial,
    param_grid=param_grid_xgb_initial_simple,
    scoring='roc_auc',
    cv=cv_strategy_xgb_initial,
    n_jobs=-1,
    verbose=1
)

print("开始 XGBoost (基础特征集) 的 GridSearchCV 搜索...")
# X_train_full_scaled_initial 是Numpy数组
grid_search_xgb_initial.fit(X_train_full_scaled_initial, y_train_full_initial)

print("\nXGBoost (基础特征集) GridSearchCV 完成。")
print(f"最佳参数组合 (XGBoost Initial): {grid_search_xgb_initial.best_params_}")
print(f"CV中的最佳 ROC AUC (XGBoost Initial): {grid_search_xgb_initial.best_score_:.4f}")

best_xgb_model_initial = grid_search_xgb_initial.best_estimator_
y_pred_xgb_initial = best_xgb_model_initial.predict(X_test_scaled_initial)
y_pred_proba_xgb_initial = best_xgb_model_initial.predict_proba(X_test_scaled_initial)[:, 1]
accuracy_xgb_initial = accuracy_score(y_test_initial, y_pred_xgb_initial)
report_xgb_initial = classification_report(y_test_initial, y_pred_xgb_initial, target_names=['未生还 (0)', '生还 (1)'], zero_division=0)
cm_xgb_initial = confusion_matrix(y_test_initial, y_pred_xgb_initial)
auc_xgb_initial = roc_auc_score(y_test_initial, y_pred_proba_xgb_initial)

print(f"\nXGBoost (基础特征集) + 调优 - 准确率: {accuracy_xgb_initial:.4f}")
print(f"XGBoost (基础特征集) + 调优 - ROC AUC: {auc_xgb_initial:.4f}")
print("XGBoost (基础特征集) + 调优 - 分类报告:\n", report_xgb_initial)
print("XGBoost (基础特征集) + 调优 - 混淆矩阵:\n", cm_xgb_initial)

print("\n尝试在基础特征集上调优XGBoost完成。")


--- 在基础特征集上调优 XGBoost ---
开始 XGBoost (基础特征集) 的 GridSearchCV 搜索...
Fitting 5 folds for each of 243 candidates, totalling 1215 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.



XGBoost (基础特征集) GridSearchCV 完成。
最佳参数组合 (XGBoost Initial): {'colsample_bytree': 0.9, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.9}
CV中的最佳 ROC AUC (XGBoost Initial): 0.8924

XGBoost (基础特征集) + 调优 - 准确率: 0.8212
XGBoost (基础特征集) + 调优 - ROC AUC: 0.8642
XGBoost (基础特征集) + 调优 - 分类报告:
               precision    recall  f1-score   support

     未生还 (0)       0.82      0.90      0.86       110
      生还 (1)       0.81      0.70      0.75        69

    accuracy                           0.82       179
   macro avg       0.82      0.80      0.81       179
weighted avg       0.82      0.82      0.82       179

XGBoost (基础特征集) + 调优 - 混淆矩阵:
 [[99 11]
 [21 48]]

尝试在基础特征集上调优XGBoost完成。


In [11]:
# --- 代码接续之前的XGBoost(基础特征集)输出 ---
from sklearn.ensemble import VotingClassifier # 确保导入

# 模型1: 初始调优的逻辑回归 (best_lr_model_initial)
#        是在 X_train_full_scaled_initial, y_train_full_initial 上训练的
# 模型2: 基础特征集上调优的XGBoost (best_xgb_model_initial)
#        也是在 X_train_full_scaled_initial, y_train_full_initial 上训练的

# 确保这两个模型对象存在并且是训练好的
if 'best_lr_model_initial' in locals() and 'best_xgb_model_initial' in locals():
    print("\n--- 实施 Voting Classifier (LR + XGBoost, 基础特征集) ---")

    # 创建 VotingClassifier
    # 'lr' 和 'xgb' 是任意给分类器的名字
    # best_lr_model_initial 和 best_xgb_model_initial 应该是已经用 best_params_ 训练好的模型
    voting_clf_final = VotingClassifier(
        estimators=[
            ('lr', best_lr_model_initial),
            ('xgb', best_xgb_model_initial)
        ],
        voting='soft' # 使用软投票，基于预测概率的平均值
        # weights=[0.6, 0.4] # (可选) 可以尝试给表现更好的LR更高权重, 需要在验证集上调优权重
    )

    # 在完整的训练数据上训练VotingClassifier
    # (它会分别调用每个基础模型的fit，如果它们还没被fit的话，
    #  但由于我们传入的是已训练模型，这里主要是设置集成结构)
    # 或者更准确地说，如果传入的是已训练模型，VotingClassifier可以直接用于predict/predict_proba
    # 但为了确保一致性，通常会fit，或者确保传入的是未训练但已配置好参数的分类器
    # 为了简单起见，假设 best_lr_model_initial 和 best_xgb_model_initial 是已经fit好的。
    # 如果要严格地fit VotingClassifier，基础模型应该在clone后传入，或者在fit时传入未训练的模型
    # 这里我们直接用它们来进行预测概率的组合，因为它们已经在相同的训练集上训练过了。

    # 我们将直接在测试集上评估这个组合的预测概率，而不是重新fit VotingClassifier本身。
    # 这是因为 best_lr_model_initial 和 best_xgb_model_initial 已经是调优后在完整训练集上训练的模型。

    # 获取概率预测
    proba_lr_final = best_lr_model_initial.predict_proba(X_test_scaled_initial)[:, 1]
    proba_xgb_final = best_xgb_model_initial.predict_proba(X_test_scaled_initial)[:, 1]

    # 软投票: 平均概率
    # (如果设置了weights，这里需要加权平均)
    # weights = [0.6, 0.4] # 示例权重
    # avg_proba_final = np.average([proba_lr_final, proba_xgb_final], axis=0, weights=weights)
    avg_proba_final = (proba_lr_final + proba_xgb_final) / 2.0


    y_pred_vote_final = (avg_proba_final >= 0.5).astype(int)

    accuracy_vote_final = accuracy_score(y_test_initial, y_pred_vote_final)
    report_vote_final = classification_report(y_test_initial, y_pred_vote_final, target_names=['未生还 (0)', '生还 (1)'], zero_division=0)
    cm_vote_final = confusion_matrix(y_test_initial, y_pred_vote_final)
    auc_vote_final = roc_auc_score(y_test_initial, avg_proba_final)

    print(f"\nVoting Classifier (LR+XGBoost, 基础特征集, 手动软投票) - 准确率: {accuracy_vote_final:.4f}")
    print(f"Voting Classifier (LR+XGBoost, 基础特征集, 手动软投票) - ROC AUC: {auc_vote_final:.4f}")
    print("Voting Classifier (LR+XGBoost, 基础特征集, 手动软投票) - 分类报告:\n", report_vote_final)
    print("Voting Classifier (LR+XGBoost, 基础特征集, 手动软投票) - 混淆矩阵:\n", cm_vote_final)

else:
    print("\n跳过Voting Classifier，因为一个或多个基础模型 (best_lr_model_initial, best_xgb_model_initial) 未成功准备。")

print("\nVoting Classifier (基础特征集) 尝试完成。")

# --- (可选后续) 对XGBoost (基础特征集) 进行带早停的深度调优 ---
# 这个部分可以作为独立的步骤来进一步优化XGBoost单模型
# print("\n--- (可选后续) XGBoost (基础特征集) 早停深度调优 ---")
# from sklearn.model_selection import train_test_split as train_test_split_for_early_stopping
#
# # 从完整训练集中分出一部分作为早停的验证集
# X_train_es, X_val_es, y_train_es, y_val_es = train_test_split_for_early_stopping(
#     X_train_full_scaled_initial, y_train_full_initial, test_size=0.15, random_state=42, stratify=y_train_full_initial
# )
#
# xgb_es_model = xgb.XGBClassifier(
#     objective='binary:logistic', eval_metric='auc', use_label_encoder=False, random_state=42,
#     # 使用之前GridSearch找到的最佳参数 (除了n_estimators, learning_rate可以调整)
#     colsample_bytree=best_xgb_model_initial.get_params()['colsample_bytree'],
#     max_depth=best_xgb_model_initial.get_params()['max_depth'],
#     subsample=best_xgb_model_initial.get_params()['subsample'],
#     learning_rate=0.01, # 尝试更小的学习率
#     n_estimators=1000 # 设置一个较大的初始值
#     # 也可以加入 gamma, reg_alpha, reg_lambda 等进行正则化
# )
#
# print("开始XGBoost早停训练...")
# xgb_es_model.fit(X_train_es, y_train_es,
#                  early_stopping_rounds=50,
#                  eval_set=[(X_val_es, y_val_es)],
#                  verbose=False)
#
# print(f"XGBoost with early stopping - Best N Estimators: {xgb_es_model.best_iteration}")
#
# y_pred_xgb_es = xgb_es_model.predict(X_test_scaled_initial)
# y_pred_proba_xgb_es = xgb_es_model.predict_proba(X_test_scaled_initial)[:, 1]
# accuracy_xgb_es = accuracy_score(y_test_initial, y_pred_xgb_es)
# auc_xgb_es = roc_auc_score(y_test_initial, y_pred_proba_xgb_es)
# print(f"XGBoost (基础特征集, 早停) - 准确率: {accuracy_xgb_es:.4f}, ROC AUC: {auc_xgb_es:.4f}")


--- 实施 Voting Classifier (LR + XGBoost, 基础特征集) ---

Voting Classifier (LR+XGBoost, 基础特征集, 手动软投票) - 准确率: 0.8547
Voting Classifier (LR+XGBoost, 基础特征集, 手动软投票) - ROC AUC: 0.8714
Voting Classifier (LR+XGBoost, 基础特征集, 手动软投票) - 分类报告:
               precision    recall  f1-score   support

     未生还 (0)       0.86      0.92      0.89       110
      生还 (1)       0.85      0.75      0.80        69

    accuracy                           0.85       179
   macro avg       0.85      0.84      0.84       179
weighted avg       0.85      0.85      0.85       179

Voting Classifier (LR+XGBoost, 基础特征集, 手动软投票) - 混淆矩阵:
 [[101   9]
 [ 17  52]]

Voting Classifier (基础特征集) 尝试完成。


In [12]:
# --- 代码接续之前的Voting Classifier输出 ---
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, train_test_split as new_train_test_split # 避免与之前的变量名冲突
# 假设 X_train_full_scaled_initial, y_train_full_initial, X_test_scaled_initial, y_test_initial
# 是我们之前定义的“基础特征集”及其对应的数据

print("\n--- 深度调优 XGBoost (基础特征集) 使用早停和更细致的参数 ---")

# 从完整的基础训练集中分出一部分作为早停的验证集
X_train_es_base, X_val_es_base, y_train_es_base, y_val_es_base = new_train_test_split(
    X_train_full_scaled_initial, y_train_full_initial,
    test_size=0.2, # 例如用20%作为验证集
    random_state=123, # 用不同的random_state确保与主train/test划分不同
    stratify=y_train_full_initial
)

# 定义一个XGBoost模型实例，这次我们将手动进行类似GridSearch的过程，但结合早停
# 或者，我们可以先用GridSearchCV找到一个不错的参数起点，然后细化
# 这里我们直接尝试一些参数组合，并依赖早停

# 最佳参数组合 (XGBoost Initial from previous run):
# {'colsample_bytree': 0.9, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.9}

xgb_deep_tune_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    use_label_encoder=False,
    random_state=42,
    # 从之前的最佳参数开始，但调整learning_rate和n_estimators以配合早停
    colsample_bytree=0.9, # from previous best
    max_depth=3,          # from previous best
    subsample=0.9,        # from previous best
    learning_rate=0.01,   # 尝试更小的学习率
    n_estimators=2000,    # 设置一个很大的初始值，让早停来决定
    gamma=0.1,            # 尝试加入gamma正则化
    reg_alpha=0.01,       # 尝试加入L1正则化
    reg_lambda=0.1        # 尝试加入L2正则化
)

print("开始 XGBoost 早停训练 (基础特征集)...")
xgb_deep_tune_model.fit(
    X_train_es_base, y_train_es_base,
    early_stopping_rounds=100, # 如果100轮内验证AUC没有提升则停止
    eval_set=[(X_val_es_base, y_val_es_base)],
    verbose=False # 设置为True可以看到每轮的输出
)

print(f"XGBoost with early stopping - Best N Estimators: {xgb_deep_tune_model.best_iteration}")

# 在测试集上评估
y_pred_xgb_deep = xgb_deep_tune_model.predict(X_test_scaled_initial) # 使用主测试集
y_pred_proba_xgb_deep = xgb_deep_tune_model.predict_proba(X_test_scaled_initial)[:, 1]

accuracy_xgb_deep = accuracy_score(y_test_initial, y_pred_xgb_deep)
report_xgb_deep = classification_report(y_test_initial, y_pred_xgb_deep, target_names=['未生还 (0)', '生还 (1)'], zero_division=0)
cm_xgb_deep = confusion_matrix(y_test_initial, y_pred_xgb_deep)
auc_xgb_deep = roc_auc_score(y_test_initial, y_pred_proba_xgb_deep)

print(f"\nXGBoost (基础特征集, 早停深度调优) - 准确率: {accuracy_xgb_deep:.4f}")
print(f"XGBoost (基础特征集, 早停深度调优) - ROC AUC: {auc_xgb_deep:.4f}")
print("XGBoost (基础特征集, 早停深度调优) - 分类报告:\n", report_xgb_deep)
print("XGBoost (基础特征集, 早停深度调优) - 混淆矩阵:\n", cm_xgb_deep)

print("\nXGBoost 早停深度调优尝试完成。")

# ----------------------------------------------------------------------------------
# 如果上面的手动调优效果不错，你可以考虑将这个过程包装到 Optuna 或 Hyperopt 中进行更系统的搜索
# ----------------------------------------------------------------------------------


--- 深度调优 XGBoost (基础特征集) 使用早停和更细致的参数 ---
开始 XGBoost 早停训练 (基础特征集)...




XGBoost with early stopping - Best N Estimators: 256

XGBoost (基础特征集, 早停深度调优) - 准确率: 0.8268
XGBoost (基础特征集, 早停深度调优) - ROC AUC: 0.8641
XGBoost (基础特征集, 早停深度调优) - 分类报告:
               precision    recall  f1-score   support

     未生还 (0)       0.82      0.92      0.87       110
      生还 (1)       0.84      0.68      0.75        69

    accuracy                           0.83       179
   macro avg       0.83      0.80      0.81       179
weighted avg       0.83      0.83      0.82       179

XGBoost (基础特征集, 早停深度调优) - 混淆矩阵:
 [[101   9]
 [ 22  47]]

XGBoost 早停深度调优尝试完成。


In [16]:
import pandas as pd
import numpy as np
import re # 用于Ticket Prefix (如果决定最终版本包含它，目前最佳是不含)

# Sklearn 模型和工具
from sklearn.model_selection import train_test_split # StratifiedKFold, GridSearchCV 不再直接用于最终训练
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import xgboost as xgb # XGBoost
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# 用于保存和加载模型
import joblib
import os # 用于检查文件是否存在


# --- 1. 数据加载和基础特征预处理函数 ---
def preprocess_titanic_data(df_input, is_train=True, scaler_obj=None, training_age_medians=None, training_fare_pclass_medians=None, training_fare_global_median=None, training_embarked_mode=None):
    """
    对Titanic数据进行预处理。
    对于训练数据，会fit scaler并计算统计量。
    对于测试数据，会使用传入的scaler和统计量进行transform。
    """
    df_processed = df_input.copy()

    # Embarked: 填充缺失值，然后独热编码
    if is_train:
        embarked_mode_to_use = df_processed['Embarked'].mode()[0]
        training_embarked_mode = embarked_mode_to_use # 保存供测试集使用
    else:
        embarked_mode_to_use = training_embarked_mode if training_embarked_mode else 'S' # 'S'是最常见的
    df_processed['Embarked'].fillna(embarked_mode_to_use, inplace=True)
    df_processed = pd.get_dummies(df_processed, columns=['Embarked'], prefix='Embarked', drop_first=True)

    # Age: 中位数填充 (按Pclass和Sex分组)
    if is_train:
        # 计算并保存用于测试集的分组中位数
        age_group_medians_map = df_processed.groupby(['Pclass', 'Sex'])['Age'].median().to_dict()
        training_age_medians = age_group_medians_map # 保存供测试集使用
        df_processed['Age'] = df_processed.groupby(['Pclass', 'Sex'])['Age'].transform(lambda x: x.fillna(x.median()))
    else:
        if training_age_medians:
            # 应用训练集的分组中位数
            for key, median_val in training_age_medians.items():
                pclass, sex_str = key # Sex在原始数据中是字符串
                sex_val = 0 if sex_str == 'male' else 1 # 假设预处理前Sex是字符串
                # 注意：如果传入的df_input中的Sex列已经是0/1，则需要调整这里的sex_str逻辑
                # 为了简化，假设传入的df_input的Sex列还是字符串 'male'/'female'
                # 或者，确保training_age_medians的key与df_processed中的Pclass和Sex格式一致
                # 为了此函数通用性，这里假设df_processed['Sex']在填充Age前还是字符串，后面会转为0/1
                # 但实际上，Sex转为0/1应该在Age填充前或后保持一致
                # --> 修正：假设Sex已提前转换或training_age_medians的key与转换后的Sex一致
                current_sex_val_in_df = df_processed['Sex'].map({'male':0, 'female':1}) if df_processed['Sex'].dtype == 'object' else df_processed['Sex']

                condition = (df_processed['Pclass'] == pclass) & \
                            (current_sex_val_in_df == sex_val) & \
                            (df_processed['Age'].isnull())
                df_processed.loc[condition, 'Age'] = median_val
        else:
            print("警告: 测试数据处理Age时未提供训练集的分组中位数。")

    # 对于测试集或训练集中分组后仍有缺失的情况，用全局中位数填充
    if is_train:
        age_global_median_to_use = df_processed['Age'].median()
        training_fare_global_median = age_global_median_to_use # 保存供测试集使用
    else:
        age_global_median_to_use = training_fare_global_median if training_fare_global_median else (df_processed['Age'].median() if not df_processed['Age'].isnull().all() else 28) # 28是常见中位数
    df_processed['Age'].fillna(age_global_median_to_use, inplace=True)


    # Sex: 转换为数值
    df_processed['Sex'] = df_processed['Sex'].map({'male': 0, 'female': 1}).astype(int)

    # Title: 从Name中提取并编码
    df_processed['Title'] = df_processed['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    common_titles = ['Mr', 'Miss', 'Mrs', 'Master']
    df_processed['Title'] = df_processed['Title'].apply(lambda x: x if x in common_titles else 'Rare')
    df_processed['Title'] = df_processed['Title'].replace(['Mlle', 'Ms'], 'Miss')
    df_processed['Title'] = df_processed['Title'].replace('Mme', 'Mrs')
    rare_titles_to_map = ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    df_processed['Title'] = df_processed['Title'].replace(rare_titles_to_map, 'Rare')
    df_processed = pd.get_dummies(df_processed, columns=['Title'], prefix='Title', drop_first=True)

    # FamilySize 和 IsAlone
    df_processed['FamilySize'] = df_processed['SibSp'] + df_processed['Parch'] + 1
    df_processed['IsAlone'] = 0
    df_processed.loc[df_processed['FamilySize'] == 1, 'IsAlone'] = 1

    # Fare: 填充缺失和0值，然后log转换
    if 'Fare' in df_processed.columns:
        if df_processed['Fare'].isnull().any(): # 处理测试集可能的Fare缺失
            if is_train: # 训练时计算并应用
                fare_pclass_medians_map = df_processed.groupby('Pclass')['Fare'].median().to_dict()
                training_fare_pclass_medians = fare_pclass_medians_map # 保存
                for p_class, median_val in fare_pclass_medians_map.items():
                    df_processed.loc[(df_processed['Pclass'] == p_class) & (df_processed['Fare'].isnull()), 'Fare'] = median_val
            elif training_fare_pclass_medians: # 测试时应用保存的值
                 for p_class, median_val in training_fare_pclass_medians.items():
                    df_processed.loc[(df_processed['Pclass'] == p_class) & (df_processed['Fare'].isnull()), 'Fare'] = median_val
            # 全局填充仍然存在的Fare NaN (如果分组中位数也是NaN或分组不存在)
            df_processed['Fare'].fillna(df_processed['Fare'].median() if not df_processed['Fare'].isnull().all() else 0, inplace=True)


        # 填充Fare为0的情况 (用对应Pclass的中位数)
        # 这部分与上面Fare缺失值填充逻辑可以合并，但为了清晰分开
        if is_train:
            # 如果 training_fare_pclass_medians 还没计算（例如Fare之前没有NaN），这里计算
            if training_fare_pclass_medians is None:
                 training_fare_pclass_medians = df_processed.groupby('Pclass')['Fare'].median().to_dict()
            fare_pclass_medians_to_use = training_fare_pclass_medians
        elif training_fare_pclass_medians:
            fare_pclass_medians_to_use = training_fare_pclass_medians
        else: # 测试集且未提供训练中位数，则在测试集上计算（不理想但作为后备）
            fare_pclass_medians_to_use = df_processed.groupby('Pclass')['Fare'].median().to_dict()
            print("警告: 测试数据处理Fare为0时未提供训练集Pclass中位数票价。")

        for p_class_val, median_val in fare_pclass_medians_to_use.items():
            condition = (df_processed['Fare'] == 0) & (df_processed['Pclass'] == p_class_val)
            df_processed.loc[condition, 'Fare'] = median_val if median_val > 0 else np.finfo(float).eps

        if (df_processed['Fare'] == 0).any():
             df_processed.loc[df_processed['Fare'] == 0, 'Fare'] = np.finfo(float).eps
        df_processed['Fare'] = np.log1p(df_processed['Fare'])


    # 选择基础特征列
    base_feature_cols = ['Pclass', 'Sex', 'Age', 'Fare', 'FamilySize', 'IsAlone',
                         'Embarked_Q', 'Embarked_S',
                         'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Rare']

    # 确保所有预期的特征列都存在
    for col in base_feature_cols:
        if col not in df_processed.columns:
            df_processed[col] = 0 # 用0填充缺失的独热编码列
    df_processed = df_processed[base_feature_cols] # 保证列的顺序和存在

    # 特征标准化
    numerical_cols_to_scale = ['Age', 'Fare', 'FamilySize', 'Pclass'] # Pclass也进行标准化
    # IsAlone是0/1，Sex是0/1，独热编码列也是0/1，通常不需要标准化，但标准化了也无大碍
    if is_train:
        current_scaler = StandardScaler()
        df_processed[numerical_cols_to_scale] = current_scaler.fit_transform(df_processed[numerical_cols_to_scale])
        scaler_to_return = current_scaler
    else:
        if scaler_obj:
            df_processed[numerical_cols_to_scale] = scaler_obj.transform(df_processed[numerical_cols_to_scale])
            scaler_to_return = scaler_obj
        else:
            print("警告: 测试数据预处理时未提供scaler，将基于测试数据自身进行标准化。")
            temp_scaler = StandardScaler()
            df_processed[numerical_cols_to_scale] = temp_scaler.fit_transform(df_processed[numerical_cols_to_scale])
            scaler_to_return = temp_scaler # 这种情况下返回的是基于测试集fit的scaler

    if is_train:
        return df_processed, scaler_to_return, training_age_medians, training_fare_pclass_medians, training_fare_global_median, training_embarked_mode
    else:
        return df_processed, scaler_to_return


# --- 主训练流程 ---
def train_final_ensemble_model(train_file_path='train.csv'):
    print("--- 1. 加载和预处理训练数据 ---")
    df_train_raw = pd.read_csv(train_file_path)
    y_train_full = df_train_raw['Survived']
    X_train_full_raw = df_train_raw.drop('Survived', axis=1)

    X_train_full_processed, scaler, age_medians, fare_pclass_medians, fare_global_median, embarked_mode = \
        preprocess_titanic_data(X_train_full_raw, is_train=True)

    print("训练数据预处理完成。")
    print(f"预处理后训练特征数量: {X_train_full_processed.shape[1]}")

    print("\n--- 2. 训练逻辑回归模型 (使用已知最佳参数) ---")
    lr_best_params = {'C': 1, 'penalty': 'l2', 'solver': 'liblinear', 'random_state': 42, 'max_iter': 2000}
    final_lr_model = LogisticRegression(**lr_best_params)
    final_lr_model.fit(X_train_full_processed, y_train_full)
    print("逻辑回归模型训练完成。")

    print("\n--- 3. 训练XGBoost模型 (使用已知最佳参数) ---")
    xgb_best_params = {
        'objective': 'binary:logistic', 'eval_metric': 'auc', 'use_label_encoder': False, 'random_state': 42,
        'colsample_bytree': 0.9, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.9
    }
    final_xgb_model = xgb.XGBClassifier(**xgb_best_params)
    final_xgb_model.fit(X_train_full_processed, y_train_full) # XGBoost可以直接处理Numpy或DataFrame
    print("XGBoost模型训练完成。")

    print("\n--- 4. 保存模型和预处理所需的统计量 ---")
    joblib.dump(scaler, 'final_titanic_scaler.pkl')
    joblib.dump(final_lr_model, 'final_lr_model.pkl')
    joblib.dump(final_xgb_model, 'final_xgb_model.pkl')
    # 保存用于测试集预处理的统计量
    preprocessing_stats = {
        'age_medians': age_medians,
        'fare_pclass_medians': fare_pclass_medians,
        'fare_global_median': fare_global_median,
        'embarked_mode': embarked_mode,
        'feature_columns': X_train_full_processed.columns.tolist() # 保存特征列顺序
    }
    joblib.dump(preprocessing_stats, 'final_preprocessing_stats.pkl')
    print("Scaler, LR模型, XGBoost模型及预处理统计量已保存。")

    return final_lr_model, final_xgb_model, scaler, preprocessing_stats


def evaluate_ensemble_on_test_split(final_lr_model, final_xgb_model, scaler, preprocessing_stats, train_file_path='train.csv'):
    print("\n--- 评估集成模型在测试分割上的性能 ---")
    df_train_raw_eval = pd.read_csv(train_file_path)
    _, X_test_raw_eval, _, y_test_eval = train_test_split(
        df_train_raw_eval.drop('Survived', axis=1), df_train_raw_eval['Survived'],
        test_size=0.2, random_state=42, stratify=df_train_raw_eval['Survived']
    )

    X_test_processed_eval, _ = preprocess_titanic_data(
        X_test_raw_eval, is_train=False, scaler_obj=scaler,
        training_age_medians=preprocessing_stats['age_medians'],
        training_fare_pclass_medians=preprocessing_stats['fare_pclass_medians'],
        training_fare_global_median=preprocessing_stats['fare_global_median'],
        training_embarked_mode=preprocessing_stats['embarked_mode']
    )
    # 确保测试集的列与训练时一致
    X_test_processed_eval = X_test_processed_eval[preprocessing_stats['feature_columns']]


    print("测试分割数据预处理完成.")

    proba_lr_test = final_lr_model.predict_proba(X_test_processed_eval)[:, 1]
    proba_xgb_test = final_xgb_model.predict_proba(X_test_processed_eval)[:, 1]

    avg_proba_test = (proba_lr_test + proba_xgb_test) / 2.0
    y_pred_vote_test = (avg_proba_test >= 0.5).astype(int)

    accuracy_vote_test = accuracy_score(y_test_eval, y_pred_vote_test)
    report_vote_test = classification_report(y_test_eval, y_pred_vote_test, target_names=['未生还 (0)', '生还 (1)'], zero_division=0)
    cm_vote_test = confusion_matrix(y_test_eval, y_pred_vote_test)
    auc_vote_test = roc_auc_score(y_test_eval, avg_proba_test)

    print(f"\n集成模型 (LR+XGBoost, 手动软投票) 在测试分割上:")
    print(f"  准确率: {accuracy_vote_test:.4f}")
    print(f"  ROC AUC: {auc_vote_test:.4f}")
    print("  分类报告:\n", report_vote_test)
    print("  混淆矩阵:\n", cm_vote_test)


def predict_on_new_data(new_data_df,
                        lr_model_path='final_lr_model.pkl',
                        xgb_model_path='final_xgb_model.pkl',
                        scaler_path='final_titanic_scaler.pkl',
                        stats_path='final_preprocessing_stats.pkl'):
    print("\n--- 预测新数据 ---")
    if not all(os.path.exists(p) for p in [lr_model_path, xgb_model_path, scaler_path, stats_path]):
        print("错误: 一个或多个模型/scaler/统计量文件未找到。请先运行训练流程。")
        return None, None

    scaler = joblib.load(scaler_path)
    lr_model = joblib.load(lr_model_path)
    xgb_model = joblib.load(xgb_model_path)
    preprocessing_stats = joblib.load(stats_path)
    print("模型, Scaler及预处理统计量已加载。")

    X_new_processed, _ = preprocess_titanic_data(
        new_data_df, is_train=False, scaler_obj=scaler,
        training_age_medians=preprocessing_stats['age_medians'],
        training_fare_pclass_medians=preprocessing_stats['fare_pclass_medians'],
        training_fare_global_median=preprocessing_stats['fare_global_median'],
        training_embarked_mode=preprocessing_stats['embarked_mode']
    )
    # 确保新数据的列与训练时一致
    X_new_processed = X_new_processed[preprocessing_stats['feature_columns']]

    print("新数据预处理完成。")

    proba_lr_new = lr_model.predict_proba(X_new_processed)[:, 1]
    proba_xgb_new = xgb_model.predict_proba(X_new_processed)[:, 1]

    avg_proba_new = (proba_lr_new + proba_xgb_new) / 2.0
    y_pred_vote_new = (avg_proba_new >= 0.5).astype(int)

    print("新数据预测完成。")
    return y_pred_vote_new, avg_proba_new


# --- 执行流程 ---
if __name__ == '__main__':
    # 1. 训练最终模型并保存
    # 确保train.csv在同目录下
    if os.path.exists('train.csv'):
        final_lr, final_xgb, final_scaler, stats = train_final_ensemble_model()

        # 2. 在训练数据的测试分割上重新评估 (验证流程)
        if final_lr and final_xgb and final_scaler and stats:
            evaluate_ensemble_on_test_split(final_lr, final_xgb, final_scaler, stats)

        # 3. 对Kaggle的test.csv进行预测 (确保test.csv在同目录下)
        if os.path.exists('test.csv'):
            df_test_kaggle_raw = pd.read_csv('test.csv')
            passenger_ids_kaggle = df_test_kaggle_raw['PassengerId']

            predictions_kaggle, probabilities_kaggle = predict_on_new_data(df_test_kaggle_raw)

            if predictions_kaggle is not None:
                submission_df = pd.DataFrame({'PassengerId': passenger_ids_kaggle, 'Survived': predictions_kaggle})
                submission_df.to_csv('final_titanic_submission.csv', index=False)
                print("\nKaggle提交文件 'final_titanic_submission.csv' 已生成。")
        else:
            print("\n未找到 'test.csv'，跳过对Kaggle测试数据的预测。")
    else:
        print("错误: 'train.csv' 未找到。无法开始训练流程。")

--- 1. 加载和预处理训练数据 ---
训练数据预处理完成。
预处理后训练特征数量: 13

--- 2. 训练逻辑回归模型 (使用已知最佳参数) ---
逻辑回归模型训练完成。

--- 3. 训练XGBoost模型 (使用已知最佳参数) ---
XGBoost模型训练完成。

--- 4. 保存模型和预处理所需的统计量 ---
Scaler, LR模型, XGBoost模型及预处理统计量已保存。

--- 评估集成模型在测试分割上的性能 ---
测试分割数据预处理完成.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed['Embarked'].fillna(embarked_mode_to_use, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed['Age'].fillna(age_global_median_to_use, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermedi


集成模型 (LR+XGBoost, 手动软投票) 在测试分割上:
  准确率: 0.8827
  ROC AUC: 0.9007
  分类报告:
               precision    recall  f1-score   support

     未生还 (0)       0.89      0.93      0.91       110
      生还 (1)       0.88      0.81      0.84        69

    accuracy                           0.88       179
   macro avg       0.88      0.87      0.87       179
weighted avg       0.88      0.88      0.88       179

  混淆矩阵:
 [[102   8]
 [ 13  56]]

--- 预测新数据 ---
模型, Scaler及预处理统计量已加载。
新数据预处理完成。
新数据预测完成。

Kaggle提交文件 'final_titanic_submission.csv' 已生成。


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed['Embarked'].fillna(embarked_mode_to_use, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed['Age'].fillna(age_global_median_to_use, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermedi