# Titanic Ensemble

Kaggle score: 0.79425

## Import packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from IPython.display import display

import os
import gc
import time
import zipfile
import h5py
import pickle
import math
from PIL import Image
import shutil

from tqdm import tqdm
import multiprocessing

## Run names

In [2]:
# Build a timestamped identifier for this run: <project>_<step>_<YYYYmmdd_HHMMSS>.
project_name = 'Titanic'
step_name = 'Ensemble'
# strftime falls back to the current local time when no time tuple is passed.
time_str = time.strftime("%Y%m%d_%H%M%S")
run_name = '_'.join([project_name, step_name, time_str])
print('run_name: ' + run_name)

run_name: Titanic_Ensemble_20180409_201925


## Project folders

In [3]:
# Resolve every project directory relative to the notebook's working directory.
cwd = os.getcwd()
# Name of the per-run sub-folder used under both 'output' and 'model'.
date_str = '20180409_0040'
input_folder = os.path.join(cwd, 'input')
output_folder = os.path.join(cwd, 'output')
output_temp_folder = os.path.join(cwd, 'output', date_str)
model_folder = os.path.join(cwd, 'model')
model_temp_folder = os.path.join(cwd, 'model', date_str)
feature_folder = os.path.join(cwd, 'feature')
log_folder = os.path.join(cwd, 'log')
print('input_folder: \t\t\t%s' % input_folder)
print('output_folder: \t\t\t%s' % output_folder)
print('output_temp_folder: \t\t%s' % output_temp_folder)
print('model_folder: \t\t\t%s' % model_folder)
print('model_temp_folder: \t\t%s' % model_temp_folder)
print('feature_folder: \t\t%s' % feature_folder)
print('log_folder: \t\t\t%s' % log_folder)

# Create the per-run folders on demand. os.makedirs also creates a missing
# parent ('output' / 'model'), which os.mkdir would reject with
# FileNotFoundError on a fresh checkout; exist_ok=True tolerates the folder
# appearing between the existence check and the creation call.
for temp_folder in (output_temp_folder, model_temp_folder):
    if not os.path.exists(temp_folder):
        os.makedirs(temp_folder, exist_ok=True)
        print('Create folder: %s' % temp_folder)

train_csv_file = os.path.join(input_folder, 'train.csv')
test_csv_file = os.path.join(input_folder, 'test.csv')

print(train_csv_file)
print(test_csv_file)

input_folder: 			D:\Kaggle\titanic\input
output_folder: 			D:\Kaggle\titanic\output
output_temp_folder: 		D:\Kaggle\titanic\output\20180409_0040
model_folder: 			D:\Kaggle\titanic\model
model_temp_folder: 		D:\Kaggle\titanic\model\20180409_0040
feature_folder: 		D:\Kaggle\titanic\feature
log_folder: 			D:\Kaggle\titanic\log
D:\Kaggle\titanic\input\train.csv
D:\Kaggle\titanic\input\test.csv


## Import original data as DataFrame

In [4]:
# Read the train and test tables from the input folder.
data_train, data_test = (
    pd.read_csv(csv_path) for csv_path in (train_csv_file, test_csv_file)
)

# Preview the first two rows of each table.
for frame in (data_train, data_test):
    display(frame.head(2))
# Show a single raw 'Ticket' value as the cell's output.
data_train.loc[2, 'Ticket']

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


'STON/O2. 3101282'

## Load predicted probability files

In [5]:
def is_npy(s):
    """Tell whether a file name is one of the LightGBM probability files.

    Despite the function's name, the check is for a '.p' suffix: the name
    must start with 'titanic_proba_Titanic_LightGBM_20180409' and end with
    '.p'.

    Args:
        s: File name (string) to test.

    Returns:
        True when both the prefix and the suffix match, otherwise False.
    """
    prefix = 'titanic_proba_Titanic_LightGBM_20180409'
    suffix = '.p'
    if not s.startswith(prefix):
        return False
    return s.endswith(suffix)

def is_larger_than(x, lower=8300, upper=8500):
    """Tell whether the score embedded in a file name lies strictly inside a band.

    The score is the second-to-last underscore-separated field of ``x``,
    parsed as an integer (e.g. ``7888`` in ``..._004504_7888_8490.p``).
    NOTE(review): presumably an accuracy scaled by 10000 -- confirm against
    the notebook that writes these files.

    Args:
        x: File name containing at least two underscore-separated fields.
        lower: Exclusive lower bound for the score (default 8300).
        upper: Exclusive upper bound for the score (default 8500).

    Returns:
        True when ``lower < score < upper``, otherwise False.

    Raises:
        IndexError: If ``x`` has fewer than two underscore-separated fields.
        ValueError: If the second-to-last field is not an integer.
    """
    splits = x.split('_')
    acc = int(splits[-2])
    return lower < acc < upper

# Sanity check of the score filter on a known file name.
print(is_larger_than('titanic_proba_Titanic_LightGBM_20180409_004504_7888_8490.p'))

# os.listdir returns entries in arbitrary order; sorting makes file_names[0]
# and the summation order of the ensemble reproducible across machines.
file_names = sorted(os.listdir(model_temp_folder))
file_names = list(filter(is_npy, file_names))
print(len(file_names))

# Keep only the files whose embedded score passes the band filter.
file_names = list(filter(is_larger_than, file_names))
print(len(file_names))

# Fail with a descriptive message instead of a bare IndexError when nothing
# is left to ensemble.
if not file_names:
    raise FileNotFoundError(
        'No probability files in %s passed the filters' % model_temp_folder)
print(file_names[0])

False
10000
1328
titanic_proba_Titanic_LightGBM_20180409_004545_8444_8614.p


In [6]:
def load_proba(file_name):
    """Read the three prediction arrays stored in one HDF5 file.

    Args:
        file_name: Path of the HDF5 file to open read-only.

    Returns:
        A tuple ``(y_data_proba, y_data, y_test_proba)`` of numpy arrays,
        copied from the datasets of the same names.
    """
    dataset_keys = ('y_data_proba', 'y_data', 'y_test_proba')
    with h5py.File(file_name, 'r') as h5_file:
        # Copy into memory while the file is still open.
        arrays = [np.array(h5_file[key]) for key in dataset_keys]
    return tuple(arrays)

# Load the first selected file and report the shape of each array it holds.
y_proba_file = os.path.join(model_temp_folder, file_names[0])
y_data_proba, y_data, y_test_proba = load_proba(y_proba_file)

for loaded in (y_data_proba, y_data, y_test_proba):
    print(loaded.shape)

(891,)
(891,)
(418,)


## Mean ensemble: average the predicted probabilities

In [7]:
from sklearn.metrics import accuracy_score

# Seed the running sums with the first file; its labels (y_datas) serve as
# the reference that every other file must match.
y_proba_file = os.path.join(model_temp_folder, file_names[0])
y_data_probas, y_datas, y_test_probas = load_proba(y_proba_file)

print(y_data_probas.shape)
print(y_datas.shape)
print(y_test_probas.shape)

# Accumulate the probabilities of the remaining files.
for file_name in file_names[1:]:
    file_path = os.path.join(model_temp_folder, file_name)
    y_data_proba, y_data, y_test_proba = load_proba(file_path)
    # Averaging is only meaningful when every file describes the same rows;
    # refuse to continue if the stored labels differ from the reference.
    if not np.array_equal(y_data, y_datas):
        raise ValueError(
            'Labels in %s differ from those in %s' % (file_name, file_names[0]))
    y_data_probas = y_data_probas + y_data_proba
    y_test_probas = y_test_probas + y_test_proba

# Bind the verified labels explicitly so later cells that score against
# y_data do not depend on whatever the last loop iteration left behind.
y_data = y_datas

# Turn the sums into an unweighted mean over all selected files.
n_models = len(file_names)
y_data_probas = y_data_probas / n_models
y_test_probas = y_test_probas / n_models

print(len(y_data_probas))
print(y_data_probas[:10])

(891,)
(891,)
(418,)
891
[ 0.21476616  0.74225822  0.52878833  0.69269695  0.23318174  0.21861981
  0.32295565  0.3565228   0.68674115  0.7629447 ]


In [8]:
# Turn the averaged probabilities into 0/1 predictions with a 0.5 cut-off.
threshold = 0.5
y_data_pred = (y_data_probas >= threshold).astype(int)
y_test_pred = (y_test_probas >= threshold).astype(int)

# Accuracy of the ensemble on the rows that carry labels.
train_acc = accuracy_score(y_data, y_data_pred)
print('train_acc: %.3f' % train_acc)

train_acc: 0.919


In [9]:
# Append the accuracy as a zero-padded, truncated four-digit integer
# (accuracy * 10000) to the run name.
acc_tag = str(int(train_acc*10000)).zfill(4)
run_name_acc = '_'.join((run_name, acc_tag))
print(run_name_acc)

Titanic_Ensemble_20180409_201925_9191


In [10]:
# Assemble the submission table: one row per test passenger.
passenger_id = data_test['PassengerId']
submission_columns = {
    'PassengerId': passenger_id,
    'Survived': y_test_pred,
}
output = pd.DataFrame(submission_columns)

# Write it to the output folder under the accuracy-tagged run name.
output_csv_file = os.path.join(output_folder, '%s.csv' % run_name_acc)
output.to_csv(output_csv_file, index=False)
print(output_csv_file)
print('\n%s' % run_name_acc)

D:\Kaggle\titanic\output\Titanic_Ensemble_20180409_201925_9191.csv

Titanic_Ensemble_20180409_201925_9191
