# Titanic FFM

## 概要：
- 运行时间比较长的训练，还是应该弄个TensorBoard，方便监控结果，免得每次都需要用鼠标手动托页面查看最新的运行结果。
- 模型为全链接神经网络和统计学习方法。

## Result:


Reference: 
1. https://www.kaggle.com/c/titanic#tutorials
2. https://www.kaggle.com/sinakhorami/titanic-best-working-classifier
3. https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python/notebook
4. https://github.com/aksnzhy/xlearn

## 1. Preprocess

### Import pkgs

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from IPython.display import display

import os
import gc
import time
import random
import zipfile
import h5py
import pickle
import math
from PIL import Image
import shutil

from tqdm import tqdm
import multiprocessing

  from ._conv import register_converters as _register_converters


In [2]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

random_num = np.random.randint(10000)
print('random_num: %s' % random_num)

random_num: 8020


In [3]:
project_name = 'Titanic'
step_name = 'FFM'
date_str = time.strftime("%Y%m%d", time.localtime())
time_str = time.strftime("%Y%m%d_%H%M%S", time.localtime())
run_name = project_name + '_' + step_name + '_' + time_str
print('run_name: ' + run_name)

cwd = os.getcwd()
log_path = os.path.join(cwd, 'log')
model_path = os.path.join(cwd, 'model')
output_path = os.path.join(cwd, 'output')
print('model_path: ' + log_path)
print('model_path: ' + model_path)
print('model_path: ' + output_path)

run_name: Titanic_FFM_20180620_101229
model_path: /data1/github/Kaggle/titanic/log
model_path: /data1/github/Kaggle/titanic/model
model_path: /data1/github/Kaggle/titanic/output


### Import original data as DataFrame

In [4]:
data_train = pd.read_csv('./input/train.csv')
data_test = pd.read_csv('./input/test.csv')

display(data_train.head(2))
display(data_test.head(2))
data_train.loc[2, 'Ticket']

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


'STON/O2. 3101282'

### Show columns of dataframe

In [5]:
data_train_original_col = data_train.columns
data_test_original_col = data_test.columns
print(data_train_original_col)
print(data_test_original_col)
# data_train0 = data_train.drop(data_train_original_col, axis = 1)
# data_test0  = data_test.drop(data_test_original_col, axis = 1)
# display(data_train0.head(2))
# display(data_test0.head(2))

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


### Preprocess features

In [6]:
full_data = [data_train, data_test]

In [7]:
# Pclass
for dataset in full_data:
    temp = dataset[dataset['Pclass'].isnull()]
    if len(temp) == 0:
        print('Do not have null value!')
    else:
        temp.head(2)
        
for dataset in full_data:
    dataset['a_Pclass'] = dataset['Pclass']
#     display(dataset.head())

Do not have null value!
Do not have null value!


In [8]:
# Name
for dataset in full_data:
    dataset['a_Name_Length'] = dataset['Name'].apply(len)
#     display(dataset.head(2))

In [9]:
# Sex
for dataset in full_data:
    dataset['a_Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)
#     display(dataset.head(2))

In [10]:
# Age
def is_child(age):
    if age >= 0 and age <=15:
        return 1
    return 0

for dataset in full_data:
    dataset['a_Age'] = dataset['Age'].fillna(-1)
    dataset['a_Have_Age'] = dataset['Age'].isnull().map({True: 0, False: 1}).astype(int)
    dataset['a_Is_Child'] = dataset['a_Age'].apply(is_child)
#     display(dataset[dataset['Age'].isnull()].head(2))
    display(dataset[dataset['Age']<=15].head(2))
    display(dataset.head(2))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,a_Pclass,a_Name_Length,a_Sex,a_Age,a_Have_Age,a_Is_Child
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,3,30,1,2.0,1,1
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,2,35,0,14.0,1,1


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,a_Pclass,a_Name_Length,a_Sex,a_Age,a_Have_Age,a_Is_Child
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,3,23,1,22.0,1,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,51,0,38.0,1,0


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,a_Pclass,a_Name_Length,a_Sex,a_Age,a_Have_Age,a_Is_Child
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S,3,26,1,14.0,1,1
21,913,3,"Olsen, Master. Artur Karl",male,9.0,0,1,C 17368,3.1708,,S,3,25,1,9.0,1,1


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,a_Pclass,a_Name_Length,a_Sex,a_Age,a_Have_Age,a_Is_Child
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,3,16,1,34.5,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,3,32,0,47.0,1,0


In [11]:
# SibSp and Parch
for dataset in full_data:
    dataset['a_FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    dataset['a_IsAlone'] = dataset['a_FamilySize'].apply(lambda x: 1 if x<=1 else 0)
#     display(dataset.head(2))

In [12]:
# Ticket(Very one have a ticket)
for dataset in full_data:
    dataset['a_Have_Ticket'] = dataset['Ticket'].isnull().map({True: 0, False: 1}).astype(int)
#     display(dataset[dataset['Ticket'].isnull()].head(2))
#     display(dataset.head(2))

In [13]:
# Fare
for dataset in full_data:
    dataset['a_Fare'] = dataset['Fare'].fillna(-1)
    dataset['a_Have_Fare'] = dataset['Fare'].isnull().map({True: 0, False: 1}).astype(int)
#     display(dataset[dataset['Fare'].isnull()].head(2))
#     display(dataset.head(2))

In [14]:
# Cabin
for dataset in full_data:
    dataset['a_Have_Cabin'] = dataset['Cabin'].isnull().map({True: 0, False: 1}).astype(int)
#     display(dataset[dataset['Cabin'].isnull()].head(2))
#     display(dataset.head(2))

In [15]:
# Embarked
for dataset in full_data:
#     dataset['Embarked'] = dataset['Embarked'].fillna('N')
    dataset['a_Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2, None: 3} ).astype(int)
    dataset['a_Have_Embarked'] = dataset['Embarked'].isnull().map({True: 0, False: 1}).astype(int)
#     display(dataset[dataset['Embarked'].isnull()].head(2))
#     display(dataset.head(2))

Name words segmentation and one-hote

In [16]:
# Name words segmentation
import re
name_words = []

# Inorder to allign columns of data_train and data_test, only data_train to fetch word
for name in data_train['Name']:
#     print(name)
    words = re.findall(r"[\w']+", name)
#     print(len(words))
#     print(words)
    for w in words:
        if w not in name_words:
            name_words.append(w)
# print(len(name_words))
name_words.sort()
# print(name_words)

In [17]:
# Add columns
for dataset in full_data:
    for w in name_words:
        col_name = 'a_Name_' + w
        dataset[col_name] = 0
    dataset.head(1)

In [18]:
# Name words one-hote
for dataset in full_data:
    for i, row in dataset.iterrows():
    #     print(row['Name'])
        words = re.findall(r"[\w']+", row['Name'])
        for w in words:
            if w in name_words:
                col_name = 'a_Name_' + w
                dataset.loc[i, col_name] = 1
#     display(dataset[dataset['a_Name_Braund'] == 1])

Cabin segmentation and one-hote

In [19]:
# Get cabin segmentation words
import re
cabin_words = []

# Inorder to allign columns of data_train and data_test, only data_train to fetch number
for c in data_train['Cabin']:
#     print(c)
    if c is not np.nan:
        word = re.findall(r"[a-zA-Z]", c)
#         print(words[0])
        cabin_words.append(word[0])
print(len(cabin_words))
cabin_words.sort()
print(np.unique(cabin_words))
cabin_words_unique = list(np.unique(cabin_words))

204
['A' 'B' 'C' 'D' 'E' 'F' 'G' 'T']


In [20]:
def get_cabin_word(cabin):
    if cabin is not np.nan:
        word = re.findall(r"[a-zA-Z]", cabin)
        if word:
            return cabin_words_unique.index(word[0])
    return -1

for dataset in full_data:
    dataset['a_Cabin_Word'] = dataset['Cabin'].apply(get_cabin_word)
    # dataset['a_Cabin_Word'].head(100)

In [21]:
def get_cabin_number(cabin):
    if cabin is not np.nan:
        word = re.findall(r"[0-9]+", cabin)
        if word:
            return int(word[0])
    return -1

for dataset in full_data:
    dataset['a_Cabin_Number'] = dataset['Cabin'].apply(get_cabin_number)
    # dataset['a_Cabin_Number'].head(100)

In [22]:
# Clean data
# Reference: 
#    1. https://www.kaggle.com/sinakhorami/titanic-best-working-classifier
#    2. https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python/notebook
# full_data = [data_train, data_test]
# for dataset in full_data:
#     dataset['a_Name_length'] = dataset['Name'].apply(len)
#     #dataset['Sex'] = (dataset['Sex']=='male').astype(int)
#     dataset['a_Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
#     dataset['a_Age'] = dataset['Age'].fillna(0)
#     dataset['a_Age_IsNull'] = dataset['Age'].isnull()
#     dataset['a_FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
#     dataset['a_IsAlone'] = dataset['a_FamilySize'].apply(lambda x: 1 if x<=1 else 0)
#     dataset['a_Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())
#     #dataset['Has_Cabin'] = dataset['Cabin'].apply(lambda x: 1 if type(x) == str else 0) # same as below
#     dataset['a_Has_Cabin'] = dataset['Cabin'].apply(lambda x: 0 if type(x) == float else 1)
#     dataset['a_Has_Embarked'] = dataset['Embarked'].isnull()
#     dataset['Embarked'] = dataset['Embarked'].fillna('N')
#     dataset['a_Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2, 'N': 3} ).astype(int)
#     dataset['Embarked'] = dataset['Embarked'].fillna('S')
    
# display(data_train.head(2))
# display(data_test.head(2))

In [23]:
survived = data_train['Survived']
data_train0 = data_train.drop(data_train_original_col, axis = 1)
data_test0  = data_test.drop(data_test_original_col, axis = 1)
display(data_train0.head(2))
display(data_test0.head(2))

features = data_train0
display(features.head(2))

Unnamed: 0,a_Pclass,a_Name_Length,a_Sex,a_Age,a_Have_Age,a_Is_Child,a_FamilySize,a_IsAlone,a_Have_Ticket,a_Fare,...,a_Name_de,a_Name_del,a_Name_der,a_Name_hoef,a_Name_of,a_Name_the,a_Name_van,a_Name_y,a_Cabin_Word,a_Cabin_Number
0,3,23,1,22.0,1,0,2,0,1,7.25,...,0,0,0,0,0,0,0,0,-1,-1
1,1,51,0,38.0,1,0,2,0,1,71.2833,...,0,0,0,0,0,0,0,0,2,85


Unnamed: 0,a_Pclass,a_Name_Length,a_Sex,a_Age,a_Have_Age,a_Is_Child,a_FamilySize,a_IsAlone,a_Have_Ticket,a_Fare,...,a_Name_de,a_Name_del,a_Name_der,a_Name_hoef,a_Name_of,a_Name_the,a_Name_van,a_Name_y,a_Cabin_Word,a_Cabin_Number
0,3,16,1,34.5,1,0,1,1,1,7.8292,...,0,0,0,0,0,0,0,0,-1,-1
1,3,32,0,47.0,1,0,2,0,1,7.0,...,0,0,0,0,0,0,0,0,-1,-1


Unnamed: 0,a_Pclass,a_Name_Length,a_Sex,a_Age,a_Have_Age,a_Is_Child,a_FamilySize,a_IsAlone,a_Have_Ticket,a_Fare,...,a_Name_de,a_Name_del,a_Name_der,a_Name_hoef,a_Name_of,a_Name_the,a_Name_van,a_Name_y,a_Cabin_Word,a_Cabin_Number
0,3,23,1,22.0,1,0,2,0,1,7.25,...,0,0,0,0,0,0,0,0,-1,-1
1,1,51,0,38.0,1,0,2,0,1,71.2833,...,0,0,0,0,0,0,0,0,2,85


Check and confirm all columns is proccessed

In [24]:
for col in features.columns:
    if not col.startswith('a_'):
        print(col)

In [25]:
# Shuffle and split the train_data into train, crossvalidation and testing subsets
x_data = features
y_data = survived
x_test = data_test0

scaler = StandardScaler()
x_data = scaler.fit_transform(x_data)
x_test = scaler.transform(x_test)

x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.1, random_state=random_num)

In [26]:
# Show distribute of abave data sets
print(x_train.shape)
print(x_val.shape)
print(y_train.shape)
print(y_val.shape)

(801, 1545)
(90, 1545)
(801,)
(90,)


### Neuron network

In [27]:
import xlearn as xl

fm_model = xl.FMModel(
    task='binary', 
    init=0.01, 
    epoch=1000, 
    k=50, 
    lr=0.01, 
    reg_lambda=0.1, 
    opt='sgd', 
    metric='acc'
)
# Start to train
fm_model.fit(x_train, y_train,  eval_set=[x_val, y_val])

In [28]:
y_val_prob = fm_model.predict(x_val)
print(y_val_prob.shape)
print(y_val_prob[:10])
val_pred_test = (y_val_prob>=0.5).astype(int)
print(val_pred_test[:10])
val_acc = accuracy_score(val_pred_test, y_val)
print('val_acc: %.3f' % val_acc)
print('*' * 60)

y_test_proba = fm_model.predict(x_test)
# y_pred = np.argmax(y_pred, axis=1)
print(y_test_proba.shape)
print(y_test_proba[:10])
y_test_pred = (y_test_proba>=0.5).astype(int)
print(y_test_pred[:10])

y_data_proba = fm_model.predict(x_data)
# y_pred = np.argmax(y_pred, axis=1)
print(y_data_proba.shape)
print(y_data_proba[:10])
y_data_pred = (y_data_proba>=0.5).astype(int)
print(y_data_pred[:10])
data_acc = accuracy_score(y_data_pred, y_data)
print('data_acc: %.3f' % data_acc)
print('*' * 60)

(90,)
[0.387895 0.385067 0.385314 0.385697 0.382345 0.381286 0.383919 0.382161
 0.384618 0.380715]
[0 0 0 0 0 0 0 0 0 0]
val_acc: 0.644
************************************************************
(418,)
[0.380998 0.38655  0.379544 0.377183 0.387224 0.377377 0.387439 0.383099
 0.387186 0.379209]
[0 0 0 0 0 0 0 0 0 0]
(891,)
[0.379979 0.389503 0.385931 0.388138 0.379621 0.379881 0.382149 0.382087
 0.387389 0.386533]
[0 0 0 0 0 0 0 0 0 0]
data_acc: 0.616
************************************************************


### Predict and Export pred.csv file

In [29]:
train_cols = data_train.columns
for col in data_test0.columns:
    if col not in train_cols:
        print(col)

In [30]:
random_num = str(int(random_num)).zfill(4)
print(random_num)

run_name_acc = run_name + '_' + str(int(val_acc*10000)).zfill(4)
print(run_name_acc)

8020
Titanic_FFM_20180620_101229_6444


In [31]:
def save_proba(y_data_proba, y_data, y_test_proba, file_name):
    if os.path.exists(file_name):
        os.remove(file_name)
        print('Remove file: %s' % file_name)
    with h5py.File(file_name) as h:
        h.create_dataset('y_data_proba', data=y_data_proba)
        h.create_dataset('y_data', data=y_data)
        h.create_dataset('y_test_proba', data=y_test_proba)
    print('Save file: %s' % file_name)

def load_proba(file_name):
    with h5py.File(file_name, 'r') as h:
        y_data_proba = np.array(h['y_data_proba'])
        y_data = np.array(h['y_data'])
        y_test_proba = np.array(h['y_test_proba'])
    print('Load file: %s' % file_name)
    return y_data_proba, y_data, y_test_proba

In [32]:
y_proba_file = os.path.join(output_path, 'titanic_proba_%s_%s.p' % (run_name_acc, random_num))
save_proba(y_data_proba, y_data, y_test_proba, y_proba_file)
y_data_proba, y_data, y_test_proba = load_proba(y_proba_file)

print(y_data_proba.shape)
print(y_data.shape)
print(y_test_proba.shape)

Save file: /data1/github/Kaggle/titanic/output/titanic_proba_Titanic_FFM_20180620_101229_6444_8020.p
Load file: /data1/github/Kaggle/titanic/output/titanic_proba_Titanic_FFM_20180620_101229_6444_8020.p
(891,)
(891,)
(418,)


In [33]:
passenger_id = data_test['PassengerId']
output = pd.DataFrame( { 'PassengerId': passenger_id , 'Survived': y_test_pred })

output_csv_file = os.path.join(output_path, '%s_%s.csv' % (run_name_acc, random_num))
# output.to_csv(output_csv_file, index = False)
print(output_csv_file)
print('\n%s_%s' % (run_name_acc, random_num))

/data1/github/Kaggle/titanic/output/Titanic_FFM_20180620_101229_6444_8020.csv

Titanic_FFM_20180620_101229_6444_8020


In [34]:
print(run_name_acc)
print('Done!')

Titanic_FFM_20180620_101229_6444
Done!
