<a href="https://colab.research.google.com/github/XingxinHE/Kaggle_Practice/blob/master/Titanic_kerastuner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Set up and Install

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
import os
os.chdir("/content/gdrive/My Drive/Kaggle/Titanic")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [2]:
!pip install git+https://github.com/keras-team/keras-tuner.git -q

  Building wheel for keras-tuner (setup.py) ... [?25l[?25hdone
  Building wheel for terminaltables (setup.py) ... [?25l[?25hdone


In [17]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers

#Preprocessing函数给后面处理features
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras.layers.experimental.preprocessing import CategoryEncoding
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
from tensorflow.keras.layers.experimental.preprocessing import IntegerLookup

#加载KerasTuner
import kerastuner as kt

### Get the data

In [4]:
full_train_dataframe = pd.read_csv('train.csv')
test_dataframe = pd.read_csv('test.csv')

In [5]:
full_train_dataframe.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 分训练集和验证集

假设只有这些feature起作用： Pclass, Sex, Age, SibSp, Parch, Fare.

所以会丢掉Cabin, Name, PassengerId, Ticket, Embarked.

数据预处理Age，因为有NaN

In [7]:
#划分train和val数据集
val_dataframe = full_train_dataframe.sample(frac=0.2, random_state=1337)
train_dataframe = full_train_dataframe.drop(val_dataframe.index)


print("Total number of training samples: %d" % (len(full_train_dataframe)))
print("Total number of test samples: %d" % (len(test_dataframe)))
print(
    "Using %d samples for training and %d for validation"
    % (len(train_dataframe), len(val_dataframe))
)

Total number of training samples: 891
Total number of test samples: 418
Using 713 samples for training and 178 for validation


In [8]:
#定义一个function来填充NaN
def fill_nan(df, mean_age):
    df['Age'].fillna(value=mean_age, inplace=True)

#定义填充NaN的平均年龄
mean_age = np.mean(train_dataframe['Age'])

#填充NaN
fill_nan(train_dataframe, mean_age)
fill_nan(val_dataframe, mean_age)
fill_nan(full_train_dataframe, mean_age)
fill_nan(test_dataframe, mean_age)

### 将DF变成TF数据集的function

In [9]:
#定义DF->TF的function
#这个函数处理训练集和测试集，因为测试集是没有标签的，所以后面的train=True是只给训练集的
#这一点很重要，之前我用structure data的思路去做，老是做不到，就是这个步骤缺了
def dataframe_to_dataset(dataframe, train=True):
    dataframe = dataframe.copy()

    #把上面假设无关的标签去除
    dataframe.pop("Cabin")
    dataframe.pop("Name")
    dataframe.pop("Ticket")
    dataframe.pop("Embarked")
    dataframe.pop("PassengerId")
    
    if train:
        #定义预测用的标签
        labels = dataframe.pop("Survived")
        #创建数据集
        ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
        #打乱
        ds = ds.shuffle(buffer_size=len(dataframe))
    else:
        ds = tf.data.Dataset.from_tensor_slices(dict(dataframe))
    return ds

### 正式将DF->TF.ds

In [10]:
train_ds = dataframe_to_dataset(train_dataframe)
val_ds = dataframe_to_dataset(val_dataframe)
test_ds = dataframe_to_dataset(test_dataframe, train=False)
full_train_ds = dataframe_to_dataset(full_train_dataframe)

In [11]:
#取样看看有没有差错
for sample in train_ds.take(1):
    for key in sample[0].keys():
    #为何是sample[0]？因为sample[0]是训练集，sample[1]是label
        print('Feature:', key, '- dtype:', sample[0][key].dtype.name)

Feature: Pclass - dtype: int64
Feature: Sex - dtype: string
Feature: Age - dtype: float64
Feature: SibSp - dtype: int64
Feature: Parch - dtype: int64
Feature: Fare - dtype: float64


In [12]:
#分批次和预取数据
train_ds = train_ds.batch(32).prefetch(32)
val_ds = val_ds.batch(32).prefetch(32)
test_ds = test_ds.batch(32).prefetch(32)
full_train_ds = full_train_ds.batch(32).prefetch(32)

### 设置网络的输入set up input of the network

In [13]:
# Numerical features
age = keras.Input(shape=(1,), name='Age')
fare = keras.Input(shape=(1,), name='Fare')

# Integer categorical features
pclass = keras.Input(shape=(1,), name='Pclass', dtype='int64')
sibsp = keras.Input(shape=(1,), name='SibSp', dtype='int64')
parch = keras.Input(shape=(1,), name='Parch', dtype='int64')

# String categorical features
sex = keras.Input(shape=(1,), name='Sex', dtype='string')

We set up utilities to encode these features, using Keras Preprocessing Layers.

In [15]:
#numerical encoder
def encode_numerical_feature(feature, name, dataset):
    # Create a Normalization layer for our feature
    normalizer = Normalization()

    #指定只是该feature
    feature_ds = dataset.map(lambda x, y: x[name]) #y是label，x是原来的df
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

    # Learn the statistics of the data
    normalizer.adapt(feature_ds)

    #normalize输入
    encoded_feature = normalizer(feature)
    return encoded_feature

#categorical encoder
def encode_categorical_feature(feature, name, dataset):

    #因为训练集有两种categories，一种是数字的，一种是文字的
    #为了省时间，创建的这个函数可以容纳两种数据，只需specify即可
    if feature.dtype.name == 'string':
        index = StringLookup()
    else:
        index = IntegerLookup()

    #指定只是该feature
    feature_ds = dataset.map(lambda x, y: x[name]) #y是label，x是原来的df
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

    #找出有多少个categories
    index.adapt(feature_ds)

    #将categories转成数字
    encoded_feature = index(feature)

    # Create a CategoryEncoding for our integer indices
    encoder = CategoryEncoding(output_mode="binary")

    # Prepare a dataset of indices
    feature_ds = feature_ds.map(index)

    # Learn the space of possible indices
    encoder.adapt(feature_ds)

    # Apply one-hot encoding to our indices
    encoded_feature = encoder(encoded_feature)
    return encoded_feature


### Encoding features

In [16]:
# Numerical features
encoded_age = encode_numerical_feature(age, name='Age', dataset=train_ds)
encoded_fare = encode_numerical_feature(fare, name='Fare', dataset=train_ds)

# Integer categorical features
encoded_pclass = encode_categorical_feature(pclass, name='Pclass', dataset=train_ds)
encoded_sibsp = encode_categorical_feature(sibsp, name='SibSp', dataset=train_ds)
encoded_parch = encode_categorical_feature(parch, name='Parch', dataset=train_ds)

# String categorical features
encoded_sex = encode_categorical_feature(sex, name='Sex', dataset=train_ds)

### 准备 KerasTuner

简单来说，Tuner就是帮我们找到最合适的模型

In [18]:
#说好input和feature都是啥
inputs = [age, fare, pclass, sibsp, parch, sex]
features = layers.concatenate([encoded_age, encoded_fare, encoded_pclass, encoded_sibsp, encoded_parch, encoded_sex])

In [19]:
#做模型的function
def make_model(hp):
    num_dense = hp.Int('num_dense', min_value=1, max_value=3, step=1) #在这些范围内找出最佳的模型
    x = features
    for i in range(num_dense):
        units = hp.Int('units_{i}'.format(i=i), min_value=8, max_value=256, step=8) #在这些范围内找出倒数第二层的Dense Layer要多少neuron
        x = layers.Dense(units, activation='relu')(x)
    outputs = layers.Dense(1, activation='sigmoid')(x) #因为是0和1的输出，所以是sigmoid
    model = keras.Model(inputs, outputs) #输入和输出

    learning_rate = hp.Float('learning_rate', min_value=3e-4, max_value=3e-3)
    optimizer = keras.optimizers.Adam(learning_rate=1e-3)
    model.compile(loss=keras.losses.BinaryCrossentropy(from_logits=True), #损失函数
                  optimizer=optimizer, #优化器
                  metrics=[keras.metrics.BinaryAccuracy(name='acc')]) #量化的metrics
    model.summary()
    return model

### 运行kt，来找最优解的模型

We run random search over this hyperparmater search space.

In [20]:
tuner = kt.tuners.RandomSearch(
    make_model,
    objective='val_acc',
    max_trials=100,
    overwrite=True)

callbacks=[keras.callbacks.EarlyStopping(monitor='val_acc', mode='max', patience=3)]
tuner.search(train_ds, validation_data=val_ds, callbacks=callbacks, epochs=100)

Trial 100 Complete [00h 00m 02s]
val_acc: 0.8202247023582458

Best val_acc So Far: 0.8426966071128845
Total elapsed time: 00h 04m 23s
INFO:tensorflow:Oracle triggered exit


### 找到最优解的epoch

Now, we can retrieve the best hyperparameters, use them to build the best model, and train the model for 100 epochs to find at which epoch training should stop.

In [21]:
best_hp = tuner.get_best_hyperparameters()[0]
model = make_model(best_hp)
history = model.fit(train_ds, validation_data=val_ds, epochs=100)

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Pclass (InputLayer)             [(None, 1)]          0                                            
__________________________________________________________________________________________________
SibSp (InputLayer)              [(None, 1)]          0                                            
__________________________________________________________________________________________________
Parch (InputLayer)              [(None, 1)]          0                                            
__________________________________________________________________________________________________
Sex (InputLayer)                [(None, 1)]          0                                            
_______________________________________________________________________________________

### 把上述最优解的参数来训练最优模型

Finally, we can train the best model configuration from scratch for the optimal number of epochs.

This time, we train on the entirety of the training data -- no validation split. Our model parameters are already validated.

In [22]:
val_acc_per_epoch = history.history['val_acc']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))
model = make_model(best_hp)
model.fit(full_train_ds, epochs=best_epoch)

Best epoch: 30
Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Pclass (InputLayer)             [(None, 1)]          0                                            
__________________________________________________________________________________________________
SibSp (InputLayer)              [(None, 1)]          0                                            
__________________________________________________________________________________________________
Parch (InputLayer)              [(None, 1)]          0                                            
__________________________________________________________________________________________________
Sex (InputLayer)                [(None, 1)]          0                                            
________________________________________________________________________

<tensorflow.python.keras.callbacks.History at 0x7f21f2cf6668>

### 用模型预测结果

In [23]:
#用模型预测测试集的结果
predictions = model.predict(test_ds)

### 将结果弄成DF

In [24]:

passenger_ids = test_dataframe.pop("PassengerId")
submission = pd.DataFrame({"PassengerId": passenger_ids,
                           "Survived": np.ravel(np.round(predictions))})

In [25]:
#发现target不是int
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0.0
1,893,0.0
2,894,0.0
3,895,0.0
4,896,0.0


In [26]:
#化为整数
submission['Survived']=submission.Survived.astype(int)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


### 把DF输出为csv提交文件

In [27]:
submission.to_csv("submission.csv", index=False)