# Titanic

In [93]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

## 1.数据详情

In [84]:
def data_overview(df, head=True):
    '''数据详情'''
    # 基本信息
    if head:
        display(df.head())
    # 缺失值
    missing_values = df.isnull().sum()
    missing_percentage = (df.isnull().sum() / len(df)) * 100
    overview = pd.DataFrame({
        'Missing Count': missing_values,
        'Missing(%)': missing_percentage.round(1)
    })
    # 各变量取值数量
    unique_counts = df.nunique()
    overview['Unique_counts'] = df.nunique()
    # 各变量取值范围
    unique_values = {col: df[col].unique() for col in df.columns}
    unique_value = []
    for col in unique_counts.index:
        if unique_counts[col] <= 20:
            unique_value.append(unique_values[col])
        else:
            unique_value.append('Much')
    overview['Unique_values'] = unique_value
    # 数据类型
    overview['Dtype'] = df.dtypes
    display(overview)
    # 重复值
    duplicate_count = df.duplicated().sum()
    print(f"Duplicate Rows Count: {duplicate_count} duplicate rows found" 
          if duplicate_count > 0 else "No duplicate rows found")

In [85]:
# 合并训练集与测试集
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
train["DatasetType"] = "train"
test["DatasetType"] = "test"
combined = pd.concat([train, test], axis=0).reset_index(drop=True)

In [86]:
data_overview(combined)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,DatasetType
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,train
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,train


Unnamed: 0,Missing Count,Missing(%),Unique_counts,Unique_values,Dtype
PassengerId,0,0.0,1309,Much,int64
Survived,418,31.9,2,"[0.0, 1.0, nan]",float64
Pclass,0,0.0,3,"[3, 1, 2]",int64
Name,0,0.0,1307,Much,object
Sex,0,0.0,2,"[male, female]",object
Age,263,20.1,98,Much,float64
SibSp,0,0.0,7,"[1, 0, 3, 4, 2, 5, 8]",int64
Parch,0,0.0,8,"[0, 1, 2, 5, 3, 4, 6, 9]",int64
Ticket,0,0.0,929,Much,object
Fare,1,0.1,281,Much,float64


No duplicate rows found


## 2.特征工程1

In [87]:
# 从名字中提取称呼
def extract_title(Name):
    title_search = re.search(r',\s*([^\.]*)\.', Name)
    if title_search:
        return title_search.group(1).strip()
    return ""
combined["Title"] = combined["Name"].apply(extract_title).astype('category')
print(combined["Title"].unique())

# 处理称呼
def encode_title(title):
    if title in ['Mr', 'Mrs', 'Miss', 'Master']:
        return title
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title in ['Mme']:
        return 'Mrs'
    elif title in ['Dr', 'Major', 'Col', 'Capt', 'Rev']:
        return 'Officer'
    elif title in ['Don', 'Sir', 'the Countess', 'Lady', 'Jonkheer']:
        return 'Noble'
    else:
        return 'Other'

combined["TitleEncoded"] = combined["Title"].apply(encode_title).astype('category')
print(combined["TitleEncoded"].unique())

['Mr', 'Mrs', 'Miss', 'Master', 'Don', ..., 'Col', 'Capt', 'the Countess', 'Jonkheer', 'Dona']
Length: 18
Categories (18, object): ['Capt', 'Col', 'Don', 'Dona', ..., 'Ms', 'Rev', 'Sir', 'the Countess']
['Mr', 'Mrs', 'Miss', 'Master', 'Noble', 'Officer', 'Other']
Categories (7, object): ['Master', 'Miss', 'Mr', 'Mrs', 'Noble', 'Officer', 'Other']


In [88]:
# 创建特征：FamilySize
combined["FamilySize"] = combined["SibSp"] + combined["Parch"] + 1

# 创建特征："FamilyGroup"
def categorize_family_size(size):
    if size == 1:
        return "Solo"
    elif size <= 4:
        return "Small"
    else:
        return "Large"

combined["FamilyGroup"] = combined["FamilySize"].apply(categorize_family_size).astype('category')
combined["FamilyGroup"].unique()

['Small', 'Solo', 'Large']
Categories (3, object): ['Large', 'Small', 'Solo']

## 3.数据清洗

In [89]:
# 转换分类变量
combined['Survived'] = combined['Survived'].astype('category')
combined['Sex'] = combined['Sex'].astype('category')
combined['Embarked'] = combined['Embarked'].astype('category')
combined['DatasetType'] = combined['DatasetType'].astype('category')

# 缺失数据的填补
# Cabin
combined["Cabin"] = combined["Cabin"].fillna("Unknown")

# Fare
fare_median = combined.groupby(["Pclass", "Embarked"])["Fare"].median()
combined["Fare"] = combined.apply(lambda row: fare_median.loc[row["Pclass"], row["Embarked"]] if pd.isna(row["Fare"]) else row["Fare"], axis=1)

# Embark
mode_embarked = combined.groupby(["Pclass", "Fare"])["Embarked"].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else "S")
combined["Embarked"] = combined.apply(lambda row: mode_embarked.loc[row["Pclass"], row["Fare"]] if pd.isna(row["Embarked"]) else row["Embarked"], axis=1)


In [91]:
def impute_num(df, columns_X, columns_y, model):
    '''
    使用模型填充的方式填补数值型缺失值\n
    return: 训练好的预测模型
    df: 含有缺失值的数据框
    columns: 填补缺失值时使用的特征X的列名，数据类型为列表
    model: 填补时使用的模型
    '''
    X = df[columns_X]
    y = df[columns_y]
    # 划分训练集与测试集
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2025)
    # 训练模型
    model.fit(X_train, y_train)
    # 模型效果
    y_pred = model.predict(X_test)
    print(f'Mean Squared Error: {mean_squared_error(y_test, y_pred)}')

In [90]:
data_overview(combined)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,DatasetType,Title,TitleEncoded,FamilySize,FamilyGroup
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Unknown,S,train,Mr,Mr,2,Small
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train,Mrs,Mrs,2,Small
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,Unknown,S,train,Miss,Miss,1,Solo
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train,Mrs,Mrs,2,Small
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,Unknown,S,train,Mr,Mr,1,Solo


Unnamed: 0,Missing Count,Missing(%),Unique_counts,Unique_values,Dtype
PassengerId,0,0.0,1309,Much,int64
Survived,418,31.9,2,"[0.0, 1.0, NaN] Categories (2, float64): [0.0,...",category
Pclass,0,0.0,3,"[3, 1, 2]",int64
Name,0,0.0,1307,Much,object
Sex,0,0.0,2,"['male', 'female'] Categories (2, object): ['f...",category
Age,263,20.1,98,Much,float64
SibSp,0,0.0,7,"[1, 0, 3, 4, 2, 5, 8]",int64
Parch,0,0.0,8,"[0, 1, 2, 5, 3, 4, 6, 9]",int64
Ticket,0,0.0,929,Much,object
Fare,0,0.0,281,Much,float64


No duplicate rows found


## 4.特征工程2