In [1]:
%matplotlib inline
import random
import tensorflow as tf
from d2l import tensorflow as d2l

In [11]:
import math
import numpy as np
import pandas as pd

# Numeric columns; transform()/transform_v2() pass each of these to bucket().
features_1 = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
# Categorical columns; transform()/transform_v2() pass each of these to category().
features_2 = ["Sex", "Embarked"]

# Load the Kaggle Titanic train / test splits from the local data folder.
train_data = pd.read_csv("/home/xiangzi/repository/kaggle/titanic/data/train.csv")
test_data = pd.read_csv("/home/xiangzi/repository/kaggle/titanic/data/test.csv")


def bucket(num_list, low, high, size):
    """Encode numeric values as cumulative ("thermometer") 0/1 bucket flags.

    Every value becomes a 0/1 vector whose first
    ``floor((value - low) / size) + 1`` entries are 1 and the rest 0.
    NaN values and values outside ``[low, high]`` are encoded as ``high``.

    Args:
        num_list: iterable of values convertible with ``float()``.
        low: lower bound of the accepted range.
        high: upper bound of the accepted range.
        size: bucket width.

    Returns:
        A list with ``ceil((high - low) / size) + 1`` rows, one per bucket.
        Row ``i`` holds the i-th flag of every input value, in input order
        (i.e. the encoding is returned feature-major, not sample-major).
    """
    n_buckets = math.ceil((high - low) / size) + 1
    encoded = []
    for raw in num_list:
        value = float(raw)
        # Missing and out-of-range values are both mapped to the upper bound.
        if math.isnan(value) or value < low or value > high:
            value = high
        flags = [0] * n_buckets
        n_ones = math.floor((value - low) / size) + 1
        for pos in range(n_ones):
            flags[pos] = 1
        encoded.append(flags)
    # Transpose from one-vector-per-sample to one-row-per-bucket.
    return [[flags[col] for flags in encoded] for col in range(n_buckets)]
    
def category(f_name, feature_list):
    """One-hot encode a categorical column, returned one row per category.

    Supported columns and their category order:
        "Sex":      "male", "female"
        "Embarked": "S", "C", "Q"

    A value that matches none of the known categories (for example a NaN
    for a missing entry) is encoded as all zeros. Previously such a value
    silently reused the vector of the preceding sample, or raised
    ``UnboundLocalError`` when it was the first sample.

    Args:
        f_name: name of the column being encoded ("Sex" or "Embarked").
        feature_list: iterable with the raw category value of every sample.

    Returns:
        A list with one row per category (2 for "Sex", 3 for "Embarked").
        Row ``i`` holds, for every sample in input order, 1 if the sample
        equals the i-th category and 0 otherwise. For an empty
        ``feature_list`` each row is an empty list.

    Raises:
        ValueError: if ``f_name`` is not a supported column name.
    """
    levels_by_feature = {
        "Sex": ("male", "female"),
        "Embarked": ("S", "C", "Q"),
    }
    if f_name not in levels_by_feature:
        raise ValueError(f"unsupported categorical feature: {f_name!r}")
    levels = levels_by_feature[f_name]
    # Materialise once so a generator argument can be scanned per category.
    values = list(feature_list)
    # NaN never compares equal to a string, so missing values yield 0 everywhere.
    return [[1 if value == level else 0 for value in values] for level in levels]


def transform(data):
    """Encode the raw passenger table into a DataFrame of 0/1 feature columns.

    Each column listed in ``features_1`` is encoded with ``bucket`` using a
    per-column ``(low, high, size)`` setting; columns without a setting are
    skipped. Each column listed in ``features_2`` is encoded with
    ``category``. Every encoded row becomes a column named
    ``"<feature>_<index>"``.

    Args:
        data: table indexable by column name (the notebook passes a pandas
            DataFrame).

    Returns:
        A pandas DataFrame with one column per encoded feature row.
    """
    # (low, high, size) bucket settings per numeric column.
    bucket_specs = {
        "Pclass": (1, 3, 1),
        "Age": (0, 120, 5),
        "SibSp": (0, 8, 1),
        "Parch": (0, 6, 1),
        "Fare": (0, 160, 5),
    }
    columns = {}
    for f_name in features_1:
        values = np.array(data[f_name])
        spec = bucket_specs.get(f_name)
        if spec is None:
            continue
        low, high, size = spec
        for idx, column in enumerate(bucket(values, low, high, size)):
            columns[f_name + "_" + str(idx)] = column
    for f_name in features_2:
        labels = np.array(data[f_name])
        for idx, column in enumerate(category(f_name, labels)):
            columns[f_name + "_" + str(idx)] = column
    return pd.DataFrame(columns)

def transform_v2(data):
    """Encode the raw passenger table into a float32 design matrix.

    Each column listed in ``features_1`` is encoded with ``bucket`` using a
    per-column ``(low, high, size)`` setting; columns without a setting are
    skipped. Each column listed in ``features_2`` is encoded with
    ``category``. A final constant-1 feature is appended.

    Args:
        data: pandas DataFrame holding the raw columns; ``len(data)`` must
            be the number of samples.

    Returns:
        A ``numpy.float32`` array of shape ``(n_samples, n_encoded_features)``.
    """
    # (low, high, size) bucket settings per numeric column.
    bucket_specs = {
        "Pclass": (1, 3, 1),
        "Age": (0, 120, 5),
        "SibSp": (0, 8, 1),
        "Parch": (0, 6, 1),
        "Fare": (0, 160, 5),
    }
    rows = []
    for f_name in features_1:
        values = np.array(data[f_name])
        spec = bucket_specs.get(f_name)
        if spec is None:
            continue
        low, high, size = spec
        rows.extend(bucket(values, low, high, size))
    for f_name in features_2:
        labels = np.array(data[f_name])
        rows.extend(category(f_name, labels))
    # Append a constant-1 feature. Its length comes from the table itself
    # rather than from a variable left over from the loop above, which would
    # be undefined if features_1 were empty.
    rows.append([1] * len(data))
    # rows is feature-major; transpose to one row per sample.
    return np.transpose(np.array(rows, dtype=np.float32))


# Encode both splits with the same feature pipeline.
X_train, X_test = transform_v2(train_data), transform_v2(test_data)

# Training targets: the "Survived" column as float32.
y_train = np.array(train_data["Survived"], dtype=np.float32)


In [12]:
def data_iter(batch_size, features, labels):
    """Yield shuffled minibatches of ``(features, labels)``.

    The sample order is reshuffled (with the ``random`` module) each time
    the generator is started. The final batch is smaller than
    ``batch_size`` when the number of samples is not a multiple of it.

    Args:
        batch_size: maximum number of samples per batch.
        features: indexable collection of samples; ``len(features)`` is the
            number of samples.
        labels: labels aligned with ``features``.

    Yields:
        Tuples ``(tf.gather(features, idx), tf.gather(labels, idx))`` for
        each batch of shuffled indices ``idx``.
    """
    order = list(range(len(features)))
    # Samples are read in random order, with no particular sequence.
    random.shuffle(order)
    for start in range(0, len(order), batch_size):
        # Slicing clamps at the end of the list, so the last batch may be short.
        batch_idx = tf.constant(order[start:start + batch_size])
        yield tf.gather(features, batch_idx), tf.gather(labels, batch_idx)

In [13]:
# Rows are samples, columns are encoded features.
n_samples, n_features = X_train.shape[0], X_train.shape[1]

In [14]:
# Weights: one per encoded feature, drawn from N(0, 0.01^2).
w = tf.Variable(
    initial_value=tf.random.normal((n_features, 1), mean=0, stddev=0.01),
    trainable=True,
)
# Bias: a single value initialised to zero.
b = tf.Variable(initial_value=tf.zeros(shape=(1,)), trainable=True)

In [23]:
def linreg(X, w, b):
    """Linear model: return ``tf.matmul(X, w) + b``.

    Args:
        X: input matrix passed as the left operand of ``tf.matmul``.
        w: weight matrix passed as the right operand of ``tf.matmul``.
        b: bias added to the matrix product.

    Returns:
        The result of ``tf.matmul(X, w) + b``.
    """
    projection = tf.matmul(X, w)
    return projection + b

In [24]:
def squared_loss(y_hat, y):
    """Element-wise halved squared error, ``(y_hat - y) ** 2 / 2``.

    Args:
        y_hat: predictions; their shape defines the shape of the result.
        y: targets, reshaped to ``y_hat.shape`` before subtraction.

    Returns:
        A tensor shaped like ``y_hat`` with one loss value per element
        (no reduction is applied).
    """
    targets = tf.reshape(y, y_hat.shape)
    residual = y_hat - targets
    return residual ** 2 / 2

In [26]:
def sgd(params, grads, lr, batch_size):
    """Apply one minibatch SGD step in place.

    Each parameter is updated with
    ``param.assign_sub(lr * grad / batch_size)``.

    Args:
        params: iterable of objects exposing ``assign_sub``.
        grads: iterable of gradients, paired positionally with ``params``.
        lr: learning rate.
        batch_size: divisor applied to every gradient.
    """
    for variable, gradient in zip(params, grads):
        step = lr * gradient / batch_size
        variable.assign_sub(step)

In [33]:
# Model and loss used by the training loop.
net = linreg
loss = squared_loss

# Optimisation hyper-parameters.
num_epochs = 200  # full passes over the training set
batch_size = 100  # samples per minibatch
lr = 0.01  # learning rate

In [34]:
# Minibatch SGD training loop; reports the mean training loss after each epoch.
for epoch in range(num_epochs):
    for X, y in data_iter(batch_size, X_train, y_train):
        with tf.GradientTape() as g:
            # Per-example losses for this minibatch (not reduced).
            l = loss(net(X, w, b), y)
        # For a non-scalar target, gradient() differentiates the sum of its
        # elements, so dw/db are sums over the batch.
        dw, db = g.gradient(l, [w, b])
        # Divide by the number of rows actually in this batch: the last batch
        # of an epoch can be smaller than `batch_size`, and dividing by the
        # configured size would under-scale its update.
        sgd([w, b], [dw, db], lr, X.shape[0])
    train_l = loss(net(X_train, w, b), y_train)
    print(f'epoch {epoch + 1}, loss {float(tf.reduce_mean(train_l)):f}')

epoch 1, loss 0.113627
epoch 2, loss 0.106740
epoch 3, loss 0.102839
epoch 4, loss 0.099831
epoch 5, loss 0.097364
epoch 6, loss 0.095301
epoch 7, loss 0.093553
epoch 8, loss 0.092001
epoch 9, loss 0.090705
epoch 10, loss 0.089410
epoch 11, loss 0.088323
epoch 12, loss 0.087247
epoch 13, loss 0.086276
epoch 14, loss 0.085416
epoch 15, loss 0.084565
epoch 16, loss 0.083803
epoch 17, loss 0.083078
epoch 18, loss 0.082438
epoch 19, loss 0.081837
epoch 20, loss 0.081189
epoch 21, loss 0.080639
epoch 22, loss 0.080123
epoch 23, loss 0.079620
epoch 24, loss 0.079182
epoch 25, loss 0.078723
epoch 26, loss 0.078313
epoch 27, loss 0.077923
epoch 28, loss 0.077558
epoch 29, loss 0.077216
epoch 30, loss 0.076887
epoch 31, loss 0.076582
epoch 32, loss 0.076286
epoch 33, loss 0.076019
epoch 34, loss 0.075745
epoch 35, loss 0.075497
epoch 36, loss 0.075261
epoch 37, loss 0.075035
epoch 38, loss 0.074828
epoch 39, loss 0.074626
epoch 40, loss 0.074424
epoch 41, loss 0.074238
epoch 42, loss 0.074073
e

In [36]:
# Raw model outputs for the test set, one value per row of X_test.
r = net(X_test, w, b)

In [41]:
# Turn each raw model output into a 0/1 label: 1 when it exceeds 0.5.
predictions = [1 if value > 0.5 else 0 for value in r]

In [42]:
# Pair every test passenger id with its predicted label and save as CSV.
output = pd.DataFrame(
    {
        'PassengerId': test_data["PassengerId"],
        'Survived': predictions,
    }
)
output.to_csv(
    '/home/xiangzi/repository/kaggle/titanic/data/submission_tf_lr.csv',
    index=False,
)
print("Your submission was successfully saved!")

Your submission was successfully saved!
