# Titanic LightGBM

Kaggle score: 

重要：
- 因为model.fit(features.as_matrix(), survived.as_matrix(), batch_size = 2, epochs = 20)需要numpy.array输入，而不是pandas.DataFrame，这里需要DataFrame.as_matrix()转换
- 因为使用了kernel_initializer = 'uniform'，导致报错：InternalError: Blas GEMM launch failed

Reference: 
1. https://www.kaggle.com/c/titanic#tutorials
2. https://www.kaggle.com/sinakhorami/titanic-best-working-classifier
3. https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python/notebook


### Import pkgs

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from IPython.display import display

import os
import gc
import time
import random
import zipfile
import h5py
import pickle
import math
from PIL import Image
import shutil

from tqdm import tqdm
import multiprocessing

## Run name

In [2]:
project_name = 'Titanic'
step_name = 'LightGBM'
time_str = time.strftime("%Y%m%d_%H%M%S", time.localtime())
run_name = project_name + '_' + step_name + '_' + time_str
print('run_name: ' + run_name)

run_name: Titanic_LightGBM_20180409_004325


## Project folders

In [3]:
cwd = os.getcwd()
date_str = '20180409_0040'
input_folder = os.path.join(cwd, 'input')
output_folder = os.path.join(cwd, 'output')
output_temp_folder = os.path.join(cwd, 'output', date_str)
model_folder = os.path.join(cwd, 'model')
model_temp_folder = os.path.join(cwd, 'model', date_str)
feature_folder = os.path.join(cwd, 'feature')
log_folder = os.path.join(cwd, 'log')
print('input_folder: \t\t\t%s' % input_folder)
print('output_folder: \t\t\t%s' % output_folder)
print('output_temp_folder: \t\t%s' % output_temp_folder)
print('model_folder: \t\t\t%s' % model_folder)
print('model_temp_folder: \t\t%s' % model_temp_folder)
print('feature_folder: \t\t%s' % feature_folder)
print('log_folder: \t\t\t%s' % log_folder)

if not os.path.exists(output_temp_folder):
    os.mkdir(output_temp_folder)
    print('Create folder: %s' % output_temp_folder)
if not os.path.exists(model_temp_folder):
    os.mkdir(model_temp_folder)
    print('Create folder: %s' % model_temp_folder)

train_csv_file = os.path.join(input_folder, 'train.csv')
test_csv_file = os.path.join(input_folder, 'test.csv')

print(train_csv_file)
print(test_csv_file)

input_folder: 			D:\Kaggle\titanic\input
output_folder: 			D:\Kaggle\titanic\output
output_temp_folder: 		D:\Kaggle\titanic\output\20180409_0040
model_folder: 			D:\Kaggle\titanic\model
model_temp_folder: 		D:\Kaggle\titanic\model\20180409_0040
feature_folder: 		D:\Kaggle\titanic\feature
log_folder: 			D:\Kaggle\titanic\log
D:\Kaggle\titanic\input\train.csv
D:\Kaggle\titanic\input\test.csv


### Import original data as DataFrame

In [4]:
data_train = pd.read_csv(train_csv_file)
data_test = pd.read_csv(test_csv_file)

display(data_train.head(20))
display(data_test.head(20))
data_train.loc[2, 'Ticket']

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


'STON/O2. 3101282'

### Show columns of dataframe

In [5]:
data_train_original_col = data_train.columns
data_test_original_col = data_test.columns
print(data_train_original_col)
print(data_test_original_col)
# data_train0 = data_train.drop(data_train_original_col, axis = 1)
# data_test0  = data_test.drop(data_test_original_col, axis = 1)
# display(data_train0.head(2))
# display(data_test0.head(2))

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


### Preprocess features

In [6]:
full_data = [data_train, data_test]

In [7]:
# Pclass
for dataset in full_data:
    temp = dataset[dataset['Pclass'].isnull()]
    if len(temp) == 0:
        print('Do not have null value!')
    else:
        temp.head(2)
        
for dataset in full_data:
    dataset['a_Pclass'] = dataset['Pclass']
#     display(dataset.head())

Do not have null value!
Do not have null value!


In [8]:
# Name
for dataset in full_data:
    dataset['a_Name_Length'] = dataset['Name'].apply(len)
#     display(dataset.head(2))

In [9]:
# Sex
for dataset in full_data:
    dataset['a_Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)
#     display(dataset.head(2))

In [10]:
# Age
for dataset in full_data:
    dataset['a_Age'] = dataset['Age'].fillna(-1)
    dataset['a_Have_Age'] = dataset['Age'].isnull().map({True: 0, False: 1}).astype(int)
#     display(dataset[dataset['Age'].isnull()].head(2))
#     display(dataset.head(2))

In [11]:
# SibSp and Parch
for dataset in full_data:
    dataset['a_FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    dataset['a_IsAlone'] = dataset['a_FamilySize'].apply(lambda x: 1 if x<=1 else 0)
#     display(dataset.head(2))

In [12]:
# Ticket(Very one have a ticket)
for dataset in full_data:
    dataset['a_Have_Ticket'] = dataset['Ticket'].isnull().map({True: 0, False: 1}).astype(int)
#     display(dataset[dataset['Ticket'].isnull()].head(2))
#     display(dataset.head(2))

In [13]:
# Fare
for dataset in full_data:
    dataset['a_Fare'] = dataset['Fare'].fillna(-1)
    dataset['a_Have_Fare'] = dataset['Fare'].isnull().map({True: 0, False: 1}).astype(int)
#     display(dataset[dataset['Fare'].isnull()].head(2))
#     display(dataset.head(2))

In [14]:
# Cabin
for dataset in full_data:
    dataset['a_Have_Cabin'] = dataset['Cabin'].isnull().map({True: 0, False: 1}).astype(int)
#     display(dataset[dataset['Cabin'].isnull()].head(2))
#     display(dataset.head(2))

In [15]:
# Embarked
for dataset in full_data:
#     dataset['Embarked'] = dataset['Embarked'].fillna('N')
    dataset['a_Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2, None: 3} ).astype(int)
    dataset['a_Have_Embarked'] = dataset['Embarked'].isnull().map({True: 0, False: 1}).astype(int)
#     display(dataset[dataset['Embarked'].isnull()].head(2))
#     display(dataset.head(2))

Name words segmentation and one-hote

In [16]:
# Name words segmentation
import re
name_words = []

# Inorder to allign columns of data_train and data_test, only data_train to fetch word
for name in data_train['Name']:
#     print(name)
    words = re.findall(r"[\w']+", name)
#     print(len(words))
#     print(words)
    for w in words:
        if w not in name_words:
            name_words.append(w)
# print(len(name_words))
name_words.sort()
# print(name_words)

In [17]:
# Add columns
for dataset in full_data:
    for w in name_words:
        col_name = 'a_Name_' + w
        dataset[col_name] = 0
    dataset.head(1)

In [18]:
# Name words one-hote
for dataset in full_data:
    for i, row in dataset.iterrows():
    #     print(row['Name'])
        words = re.findall(r"[\w']+", row['Name'])
        for w in words:
            if w in name_words:
                col_name = 'a_Name_' + w
                dataset.loc[i, col_name] = 1
#     display(dataset[dataset['a_Name_Braund'] == 1])

Cabin segmentation and one-hote

In [19]:
# Get cabin segmentation words
import re
cabin_words = []

# Inorder to allign columns of data_train and data_test, only data_train to fetch number
for c in data_train['Cabin']:
#     print(c)
    if c is not np.nan:
        word = re.findall(r"[a-zA-Z]", c)
#         print(words[0])
        cabin_words.append(word[0])
print(len(cabin_words))
cabin_words.sort()
print(np.unique(cabin_words))
cabin_words_unique = list(np.unique(cabin_words))

204
['A' 'B' 'C' 'D' 'E' 'F' 'G' 'T']


In [20]:
def get_cabin_word(cabin):
    if cabin is not np.nan:
        word = re.findall(r"[a-zA-Z]", cabin)
        if word:
            return cabin_words_unique.index(word[0])
    return -1

for dataset in full_data:
    dataset['a_Cabin_Word'] = dataset['Cabin'].apply(get_cabin_word)
    # dataset['a_Cabin_Word'].head(100)

In [21]:
def get_cabin_number(cabin):
    if cabin is not np.nan:
        word = re.findall(r"[0-9]+", cabin)
        if word:
            return int(word[0])
    return -1

for dataset in full_data:
    dataset['a_Cabin_Number'] = dataset['Cabin'].apply(get_cabin_number)
    # dataset['a_Cabin_Number'].head(100)

In [22]:
# Clean data
# Reference: 
#    1. https://www.kaggle.com/sinakhorami/titanic-best-working-classifier
#    2. https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python/notebook

full_data = [data_train, data_test]
for dataset in full_data:
    dataset['a_Name_length'] = dataset['Name'].apply(len)
    #dataset['Sex'] = (dataset['Sex']=='male').astype(int)
    dataset['a_Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    dataset['a_Age'] = dataset['Age'].fillna(0)
    dataset['a_Age_IsNull'] = dataset['Age'].isnull()
    dataset['a_FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    dataset['a_IsAlone'] = dataset['a_FamilySize'].apply(lambda x: 1 if x<=1 else 0)
    dataset['a_Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())
    #dataset['Has_Cabin'] = dataset['Cabin'].apply(lambda x: 1 if type(x) == str else 0) # same as below
    dataset['a_Has_Cabin'] = dataset['Cabin'].apply(lambda x: 0 if type(x) == float else 1)
    dataset['a_Has_Embarked'] = dataset['Embarked'].isnull()
    dataset['Embarked'] = dataset['Embarked'].fillna('N')
    dataset['a_Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2, 'N': 3} ).astype(int)
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
    
display(data_train.head(2))
display(data_test.head(2))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,a_Name_of,a_Name_the,a_Name_van,a_Name_y,a_Cabin_Word,a_Cabin_Number,a_Name_length,a_Age_IsNull,a_Has_Cabin,a_Has_Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,0,0,0,0,-1,-1,23,False,0,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,0,0,0,0,2,85,51,False,1,False


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,a_Name_of,a_Name_the,a_Name_van,a_Name_y,a_Cabin_Word,a_Cabin_Number,a_Name_length,a_Age_IsNull,a_Has_Cabin,a_Has_Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,...,0,0,0,0,-1,-1,16,False,0,False
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,...,0,0,0,0,-1,-1,32,False,0,False


In [23]:
survived = data_train['Survived']
data_train0 = data_train.drop(data_train_original_col, axis = 1)
data_test0  = data_test.drop(data_test_original_col, axis = 1)
display(data_train0.head(2))
display(data_test0.head(2))

features = data_train0
display(features.head(2))

Unnamed: 0,a_Pclass,a_Name_Length,a_Sex,a_Age,a_Have_Age,a_FamilySize,a_IsAlone,a_Have_Ticket,a_Fare,a_Have_Fare,...,a_Name_of,a_Name_the,a_Name_van,a_Name_y,a_Cabin_Word,a_Cabin_Number,a_Name_length,a_Age_IsNull,a_Has_Cabin,a_Has_Embarked
0,3,23,1,22.0,1,2,0,1,7.25,1,...,0,0,0,0,-1,-1,23,False,0,False
1,1,51,0,38.0,1,2,0,1,71.2833,1,...,0,0,0,0,2,85,51,False,1,False


Unnamed: 0,a_Pclass,a_Name_Length,a_Sex,a_Age,a_Have_Age,a_FamilySize,a_IsAlone,a_Have_Ticket,a_Fare,a_Have_Fare,...,a_Name_of,a_Name_the,a_Name_van,a_Name_y,a_Cabin_Word,a_Cabin_Number,a_Name_length,a_Age_IsNull,a_Has_Cabin,a_Has_Embarked
0,3,16,1,34.5,1,1,1,1,7.8292,1,...,0,0,0,0,-1,-1,16,False,0,False
1,3,32,0,47.0,1,2,0,1,7.0,1,...,0,0,0,0,-1,-1,32,False,0,False


Unnamed: 0,a_Pclass,a_Name_Length,a_Sex,a_Age,a_Have_Age,a_FamilySize,a_IsAlone,a_Have_Ticket,a_Fare,a_Have_Fare,...,a_Name_of,a_Name_the,a_Name_van,a_Name_y,a_Cabin_Word,a_Cabin_Number,a_Name_length,a_Age_IsNull,a_Has_Cabin,a_Has_Embarked
0,3,23,1,22.0,1,2,0,1,7.25,1,...,0,0,0,0,-1,-1,23,False,0,False
1,1,51,0,38.0,1,2,0,1,71.2833,1,...,0,0,0,0,2,85,51,False,1,False


Check and confirm all columns is proccessed

In [24]:
for col in features.columns:
    if not col.startswith('a_'):
        print(col)

## 2. Build model

In [25]:
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

import lightgbm as lgb

x_data = features
y_data = survived
x_test = data_test0

n_components = random.choice([50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150])
print('n_components: %s' % n_components)

pca = PCA(n_components=n_components)
pca.fit(x_data)
pca.fit(x_test)

x_data = pca.transform(x_data)
x_test = pca.transform(x_test)

random_num = np.random.randint(10000)
print('random_num: %s' % random_num)
x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.1, random_state=random_num)
print(x_train.shape)
print(x_val.shape)
print(y_train.shape)
print(y_val.shape)
print(x_test.shape)


lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_val, y_val, reference=lgb_train)

# LightGBM parameters
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': ['auc', 'binary_logloss'],
#         'num_class': 3,
    'learning_rate': random.choice([0.03, 0.1]),
    'num_leaves': random.choice([5, 10, 20, 30, 40]),
    'max_depth': random.choice([4, 5, 6, 7, 8, 9, 10, 11, 12]),
    'n_estimators': random.choice([2000, 5000, 10000]),
    'min_data_in_leaf': random.choice([5, 10, 20, 30, 40]),
    'num_iteration': random.choice([30, 40, 60, 80]),
    'verbose': 0
}

print('params: %s' % params)

# train
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=500,
    valid_sets=lgb_eval,
    early_stopping_rounds=10
)

y_val_prob = gbm.predict(x_val, num_iteration=gbm.best_iteration)
print(y_val_prob.shape)
print(y_val_prob[:10])
val_pred_test = (y_val_prob>=0.5).astype(int)
print(val_pred_test[:10])
val_acc = accuracy_score(val_pred_test, y_val)
print('val_acc: %.3f' % val_acc)
print('*' * 60)

y_test_proba = gbm.predict(x_test, num_iteration=gbm.best_iteration)
# y_pred = np.argmax(y_pred, axis=1)
print(y_test_proba.shape)
print(y_test_proba[:10])
y_test_pred = (y_test_proba>=0.5).astype(int)
print(y_test_pred[:10])

y_data_proba = gbm.predict(x_data, num_iteration=gbm.best_iteration)
# y_pred = np.argmax(y_pred, axis=1)
print(y_data_proba.shape)
print(y_data_proba[:10])
y_data_pred = (y_data_proba>=0.5).astype(int)
print(y_data_pred[:10])

n_components: 120
random_num: 16
(801, 120)
(90, 120)
(801,)
(90,)
(418, 120)
params: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': ['auc', 'binary_logloss'], 'learning_rate': 0.1, 'num_leaves': 20, 'max_depth': 10, 'n_estimators': 10000, 'min_data_in_leaf': 10, 'num_iteration': 40, 'verbose': 0}




[1]	valid_0's auc: 0.745283	valid_0's binary_logloss: 0.666031
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's auc: 0.704233	valid_0's binary_logloss: 0.644785
[3]	valid_0's auc: 0.712647	valid_0's binary_logloss: 0.624744
[4]	valid_0's auc: 0.726415	valid_0's binary_logloss: 0.60429
[5]	valid_0's auc: 0.779959	valid_0's binary_logloss: 0.581411
[6]	valid_0's auc: 0.806986	valid_0's binary_logloss: 0.562374
[7]	valid_0's auc: 0.818205	valid_0's binary_logloss: 0.545015
[8]	valid_0's auc: 0.808516	valid_0's binary_logloss: 0.537643
[9]	valid_0's auc: 0.796787	valid_0's binary_logloss: 0.538319
[10]	valid_0's auc: 0.810046	valid_0's binary_logloss: 0.529314
[11]	valid_0's auc: 0.798062	valid_0's binary_logloss: 0.526021
[12]	valid_0's auc: 0.824324	valid_0's binary_logloss: 0.512894
[13]	valid_0's auc: 0.822285	valid_0's binary_logloss: 0.509881
[14]	valid_0's auc: 0.814125	valid_0's binary_logloss: 0.512067
[15]	valid_0's auc: 0.810301	valid_0's binary_loglos

In [26]:
print('random_num: %s' % random_num)
print('val_acc: %.3f' % val_acc)

random_num: 16
val_acc: 0.789


## 4. Predict and Export titanic_pred.csv file

In [27]:
random_num = str(int(random_num)).zfill(4)
print(random_num)

run_name_acc = run_name + '_' + str(int(val_acc*10000)).zfill(4)
print(run_name_acc)

0016
Titanic_LightGBM_20180409_004325_7888


In [28]:
def save_proba(y_data_proba, y_data, y_test_proba, file_name):
    if os.path.exists(file_name):
        os.remove(file_name)
        print('Remove file: %s' % file_name)
    with h5py.File(file_name) as h:
        h.create_dataset('y_data_proba', data=y_data_proba)
        h.create_dataset('y_data', data=y_data)
        h.create_dataset('y_test_proba', data=y_test_proba)
    print('Save file: %s' % file_name)

def load_proba(file_name):
    with h5py.File(file_name, 'r') as h:
        y_data_proba = np.array(h['y_data_proba'])
        y_data = np.array(h['y_data'])
        y_test_proba = np.array(h['y_test_proba'])
    print('Load file: %s' % file_name)
    return y_data_proba, y_data, y_test_proba

In [29]:
y_proba_file = os.path.join(model_temp_folder, 'titanic_proba_%s_%s.p' % (run_name_acc, random_num))
save_proba(y_data_proba, y_data, y_test_proba, y_proba_file)
y_data_proba, y_data, y_test_proba = load_proba(y_proba_file)

print(y_data_proba.shape)
print(y_data.shape)
print(y_test_proba.shape)

Save file: D:\Kaggle\titanic\model\20180409_0040\titanic_proba_Titanic_LightGBM_20180409_004325_7888_0016.p
Load file: D:\Kaggle\titanic\model\20180409_0040\titanic_proba_Titanic_LightGBM_20180409_004325_7888_0016.p
(891,)
(891,)
(418,)


In [30]:
passenger_id = data_test['PassengerId']
output = pd.DataFrame( { 'PassengerId': passenger_id , 'Survived': y_test_pred })

output_csv_file = os.path.join(output_temp_folder, '%s_%s.csv' % (run_name_acc, random_num))
output.to_csv(output_csv_file, index = False)
print(output_csv_file)
print('\n%s_%s' % (run_name_acc, random_num))

D:\Kaggle\titanic\output\20180409_0040\Titanic_LightGBM_20180409_004325_7888_0016.csv

Titanic_LightGBM_20180409_004325_7888_0016


In [31]:
print('Done!')

Done!
