# 使用Keras对泰坦尼克号旅客生存进行预测

In [1]:
import os
import urllib.request

import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from sklearn import preprocessing

In [2]:
## 旅客数据集准备
data_url = "https://biostat.app.vumc.org/wiki/pub/Main/DataSets/titanic3.xls"

data_file_path = "Data/titanic3.xls"

if not os.path.isfile(data_file_path):
    result = urllib.request.urlretrieve(data_url, data_file_path)
    print("Downloaded: ", result)
else:
    print(data_file_path, "Data file already exists")

Data/titanic3.xls Data file already exists


## 使用 Pandas 对数据进行处理

In [3]:
# 读取数据，结果为 DataFrame 格式
df_data = pd.read_excel(data_file_path)

In [4]:
# 查看数据摘要
df_data.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0,121.0
mean,2.294882,0.381971,29.881135,0.498854,0.385027,33.295479,160.809917
std,0.837836,0.486055,14.4135,1.041658,0.86556,51.758668,97.696922
min,1.0,0.0,0.1667,0.0,0.0,0.0,1.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958,72.0
50%,3.0,0.0,28.0,0.0,0.0,14.4542,155.0
75%,3.0,1.0,39.0,1.0,0.0,31.275,256.0
max,3.0,1.0,80.0,8.0,9.0,512.3292,328.0


In [5]:
df_data

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,


## 筛选特征字段

| **字段**   | **字段说明**        | **数据说明**                               |
|----------|-----------------|----------------------------------------|
| pclass   | 舱等              | 1：头等舱，2：二等舱，3：三等舱                      |
| survival | 是否生存            | 0：否，1：是                                |
| name     | 姓名              |                                        |
| sex      | 性别              | female：女性，male：男                       |
| age      | 年龄              |                                        |
| slibsp   | 兄弟姐妹或者配偶也在船上的数量 |                                        |
| parch    | 双亲或者子女也在船上的数量   |                                        |
| ticket   | 船票号码            |                                        |
| fare     | 船票费用            |                                        |
| cabin    | 舱位号码            |                                        |
| embarked | 登船港口            | C=Cherbourg，Q=Queenstown，S=Southampton |


In [6]:
# survival（是否生存）是标签字段，其他是候选特征字段
# 筛选提取需要的特征字段，去掉ticket，cabin等
selected_cols = ["survived", "name", "pclass", "sex", "age", "sibsp", "parch", "fare", "embarked"]

selected_df_data = df_data[selected_cols]

In [7]:
selected_df_data

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked
0,1,"Allen, Miss. Elisabeth Walton",1,female,29.0000,0,0,211.3375,S
1,1,"Allison, Master. Hudson Trevor",1,male,0.9167,1,2,151.5500,S
2,0,"Allison, Miss. Helen Loraine",1,female,2.0000,1,2,151.5500,S
3,0,"Allison, Mr. Hudson Joshua Creighton",1,male,30.0000,1,2,151.5500,S
4,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",1,female,25.0000,1,2,151.5500,S
...,...,...,...,...,...,...,...,...,...
1304,0,"Zabour, Miss. Hileni",3,female,14.5000,1,0,14.4542,C
1305,0,"Zabour, Miss. Thamine",3,female,,1,0,14.4542,C
1306,0,"Zakarian, Mr. Mapriededer",3,male,26.5000,0,0,7.2250,C
1307,0,"Zakarian, Mr. Ortin",3,male,27.0000,0,0,7.2250,C


In [8]:
# Pandas 判断缺失值一般采用isnull()，生成所有数据的True/False矩阵
# 这是元素级别的判断，把对应的所有元素的位置都列出来，元素为空或者为NA就显示为True，否则为False
selected_df_data.isnull()

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
1304,False,False,False,False,False,False,False,False,False
1305,False,False,False,False,True,False,False,False,False
1306,False,False,False,False,False,False,False,False,False
1307,False,False,False,False,False,False,False,False,False


In [9]:
# 判断哪一列有空值
selected_df_data.isnull().any()

survived    False
name        False
pclass      False
sex         False
age          True
sibsp       False
parch       False
fare         True
embarked     True
dtype: bool

In [10]:
# 判断每一列中有空值的个数
selected_df_data.isnull().sum()

survived      0
name          0
pclass        0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64

In [11]:
# 显示存在空值的列，确定缺失位置
selected_df_data[selected_df_data.isnull().values == True]

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked
15,0,"Baumann, Mr. John D",1,male,,0,0,25.9250,S
37,1,"Bradley, Mr. George (""George Arthur Brayton"")",1,male,,0,0,26.5500,S
40,0,"Brewe, Dr. Arthur Jackson",1,male,,0,0,39.6000,C
46,0,"Cairns, Mr. Alexander",1,male,,0,0,31.0000,S
59,1,"Cassebeer, Mrs. Henry Arthur Jr (Eleanor Genev...",1,female,,0,0,27.7208,C
...,...,...,...,...,...,...,...,...,...
1293,0,"Williams, Mr. Howard Hugh ""Harry""",3,male,,0,0,8.0500,S
1297,0,"Wiseman, Mr. Phillippe",3,male,,0,0,7.2500,S
1302,0,"Yousif, Mr. Wazli",3,male,,0,0,7.2250,C
1303,0,"Yousseff, Mr. Gerious",3,male,,0,0,14.4583,C


## 数据预处理函数

In [12]:
def prepare_data(df_data):
    # 删除姓名列
    # drop不改变原有df中的数据，而是返回另一个DataFrame来存放删除后的数据
    # axis = 1：代表删除列
    df = df_data.drop(["name"], axis=1)

    # 填充null值（缺失值）
    # 此处均填充为平均值
    # -------------------------------------------------------------------------
    # 为缺失值age记录填充值
    age_mean = df["age"].mean()
    df["age"] = df["age"].fillna(age_mean)
    # -------------------------------------------------------------------------
    # 为缺失值fare记录填充值
    fare_mean = df["fare"].mean()
    df["fare"] = df["fare"].fillna(fare_mean)
    # -------------------------------------------------------------------------
    # 性别sex由字符转换成数字编码
    df["sex"] = df["sex"].map({"female": 0, "male": 1}).astype(int)
    # -------------------------------------------------------------------------
    # 为缺失值embarked记录填充值
    df["embarked"] = df["embarked"].fillna("S")
    # 港口embarked由字母表示转换为数字编码
    df["embarked"] = df["embarked"].map({"C": 0, "Q": 1, "S": 2}).astype(int)

    # 转换为ndarray数组（numpy格式数组）
    ndarray_data = df.values

    # 分离特征值和标签值
    # ------------------------------------
    # 第0列是标签值（survival列）
    # [行，列]
    # 保留所有行，列值从第1列（第0列）开始填充
    label = ndarray_data[:, 0]
    # ------------------------------------
    # 后 1 ~ 8列为特征值（性别等）
    # [行，列]
    # 保留所有行，列值从第2列（第1列）开始填充
    features = ndarray_data[:, 1:]

    # 如有以下例子：
    # 制作一份鸡汤，需要用到鸡、水、姜、葱、蒜、盐等材料，但材料间的取值范围均不相同
    # 水的取值范围可能在2000克到3000克，但相比较之下盐可能仅需要1克2克，这样的取值范围在多元线性回归中是不合理的
    # ----------------------------------------------------------------------------------------
    # 为防止不同特征值取值范围之间的差异性，需要对特征数据进行归一化
    # 归一化过程：[特征值 / max(特征值) - min(特征值)]
    # ----------------------------------------------------------------------------------------
    # 对特征数据列进行归一化（缩小差异区间至0~1）处理
    # 利用sklearn封装API特征值标准化
    minmax_scale = preprocessing.MinMaxScaler(feature_range={0, 1})
    norm_features = minmax_scale.fit_transform(features)

    return norm_features, label

## 数据准备

In [13]:
# shuffle，打乱数据顺序，为后面训练做准备
# 通过Pandas的抽样函数sample实现，frac为百分比
# selected_df_data数据保持不变
# frac=1代表全部数据都需要进行打乱
shuffle_df_data = selected_df_data.sample(frac=1)

In [14]:
# 处理后的数据集
# x_data = norm_features
# y_data = label

# FIX THIS
x_data, y_data = prepare_data(shuffle_df_data)

TypeError: 'set' object does not support indexing

In [None]:
# 划分训练集和测试集
train_size = int(len(x_data) * 0.8)

x_train = x_data[:train_size]
y_train = y_data[:train_size]

x_test = x_data[train_size:]
y_test = y_data[train_size:]

## 建立模型结构

In [None]:
# 建立Keras序列模型
# 7（本例中有7个特征，故输入层为7个神经元） -> 64 -> 32 -> 1（本例中生存值只有0与1两个值，故输出层只有一个神经元）
model = tf.keras.models.Sequential()

In [None]:
# 第一层隐层
model.add(tf.keras.layers.Dense(
    units=64,  # 神经元个数
    input_dim=7,  # 输入层传递个数，也可不指定，等效于input_shape
    use_bias=True,  # 是否使用偏置项
    kernel_initializer="uniform",  # 权重初始化方式，此处为unifrom分布
    bias_initializer="zeros",  # 偏置项初始化值
    activation="relu",  # 激活函数
))

In [None]:
# 防止层级间过拟合
# model.add(tf.keras.layers.Dropout(rate=0.3))

In [None]:
# 第二层隐层
model.add(tf.keras.layers.Dense(
    units=32,
    activation="sigmoid"
))

In [None]:
# 防止层级间过拟合
# model.add(tf.keras.layers.Dropout(rate=0.3))

In [None]:
# 输出层
model.add(tf.keras.layers.Dense(
    units=1,
    activation="sigmoid"
))

In [None]:
model.summary()

## 模型设置

In [None]:
model.compile(optimizer=tf.keras.optimizers.Adam(0.003),
              # optimizer可以是优化器的名字，也可以是优化器的实例，传入学习率
              loss="binary_crossentropy",  # 损失函数名
              # 用sigmoid作为激活函数，一般损失函数使用binary_crossentropy（二分交叉熵）
              # 用softmax作为激活函数，一般损失函数使用categorical_crossentropy（分类交叉熵）
              metrics=["accuracy"]
              # 模型要训练和评估的度量值
              )

## 模型训练

In [None]:
train_history = model.fit(x=x_train,  # 训练数据
                          y=y_train,  # 目标数据（标签数据）
                          epochs=100,  # 训练迭代次数
                          batch_size=40,  # 批次大小
                          validation_split=0.2,  # 验证集占比
                          verbose=2,  # 训练过程的日志显示，
                          # verbose=2
                          # 0：不在标准输出流输出日志信息
                          # 1：输出进度条显示
                          # 2：每个epoch输出一行记录
                          )

In [None]:
train_history.history

In [None]:
train_history.history.keys()

## 训练过程可视化

In [None]:
def show_train_history(train_history, train_metric, val_metric):
    plt.plot(train_history.history[train_metric])
    plt.plot(train_history.history[val_metric])

    plt.title("Train History")
    plt.xlabel("Epochs")
    plt.ylabel(train_metric)

    plt.legend(["Train", "Validation"], loc="upper left")

    plt.show()

In [None]:
show_train_history(train_history, "loss", "val_loss")

In [None]:
show_train_history(train_history, "acc", "val_acc")

## 评估模型

In [None]:
result = model.evaluate(x=x_test, y=y_test)

In [None]:
result

In [None]:
model.metrics_names

## 模型应用：预测Jack和Rose的生存概率

In [None]:
use_selected_cols = ["survived", "name", "pclass", "sex", "age", "sibsp", "parch", "fare", "embarked"]
use_selected_df_data = df_data[selected_cols]

In [None]:
use_selected_df_data

In [None]:
# 旅客信息字段
use_selected_cols

In [None]:
# 添加Jack和Rose的旅客信息
Jack_Info = [0, "Jack", 3, "male", 23, 1, 0, 5.0000, "S"]
Rose_Info = [1, "Rose", 1, "female", 20, 1, 0, 100.0000, "S"]

In [None]:
# 创建新的旅客DataFrame
new_passenger_pd = pd.DataFrame([Jack_Info, Rose_Info], columns=use_selected_cols)

In [None]:
# 在老数据后追加新旅客信息#
all_passenger_pd = use_selected_df_data.append(new_passenger_pd)

In [None]:
# 新加入数据将加到最后，此处显示倒数3条数据
all_passenger_pd[-3:]

In [None]:
all_passenger_pd

In [None]:
# 数据准备
x_features, y_label = prepare_data(all_passenger_pd)

In [None]:
# 利用模型计算旅客生存概率
sur_probability = model.predict(x_features)

In [None]:
sur_probability[:5]

In [None]:
all_passenger_pd.insert(len(all_passenger_pd.colums), "sur_probability", sur_probability)

In [None]:
all_passenger_pd[-5:]