# Titanic Tutorial


Kaggle score: 

重要：
- 因为model.fit(features.as_matrix(), survived.as_matrix(), batch_size = 2, epochs = 20)需要numpy.array输入，而不是pandas.DataFrame，这里需要DataFrame.as_matrix()转换
- 因为使用了kernel_initializer = 'uniform'，导致报错：InternalError: Blas GEMM launch failed

Reference: 
1. https://www.kaggle.com/c/titanic#tutorials
2. https://www.kaggle.com/sinakhorami/titanic-best-working-classifier
3. https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python/notebook


## 1. Preprocess

### Import pkgs

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display

import lightgbm as lgb

%matplotlib inline

### Import original data as DataFrame

In [2]:
data_train = pd.read_csv('./input/train.csv')
data_test = pd.read_csv('./input/test.csv')

display(data_train.head(200))
display(data_test.head(20))
data_train.loc[2, 'Ticket']

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


'STON/O2. 3101282'

### Show columns of dataframe

In [3]:
data_train_original_col = data_train.columns
data_test_original_col = data_test.columns
print(data_train_original_col)
print(data_test_original_col)
# data_train0 = data_train.drop(data_train_original_col, axis = 1)
# data_test0  = data_test.drop(data_test_original_col, axis = 1)
# display(data_train0.head(2))
# display(data_test0.head(2))

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


### Preprocess features

In [4]:
full_data = [data_train, data_test]

In [5]:
# Pclass
for dataset in full_data:
    temp = dataset[dataset['Pclass'].isnull()]
    if len(temp) == 0:
        print('Do not have null value!')
    else:
        temp.head(2)
        
for dataset in full_data:
    dataset['a_Pclass'] = dataset['Pclass']
#     display(dataset.head())

Do not have null value!
Do not have null value!


In [6]:
# Name
for dataset in full_data:
    dataset['a_Name_Length'] = dataset['Name'].apply(len)
#     display(dataset.head(2))

In [7]:
# Sex
for dataset in full_data:
    dataset['a_Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)
#     display(dataset.head(2))

In [8]:
# Age
for dataset in full_data:
    dataset['a_Age'] = dataset['Age'].fillna(-1)
    dataset['a_Have_Age'] = dataset['Age'].isnull().map({True: 0, False: 1}).astype(int)
#     display(dataset[dataset['Age'].isnull()].head(2))
#     display(dataset.head(2))

In [9]:
# SibSp and Parch
for dataset in full_data:
    dataset['a_FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    dataset['a_IsAlone'] = dataset['a_FamilySize'].apply(lambda x: 1 if x<=1 else 0)
#     display(dataset.head(2))

In [10]:
# Ticket(Very one have a ticket)
for dataset in full_data:
    dataset['a_Have_Ticket'] = dataset['Ticket'].isnull().map({True: 0, False: 1}).astype(int)
#     display(dataset[dataset['Ticket'].isnull()].head(2))
#     display(dataset.head(2))

In [11]:
# Fare
for dataset in full_data:
    dataset['a_Fare'] = dataset['Fare'].fillna(-1)
    dataset['a_Have_Fare'] = dataset['Fare'].isnull().map({True: 0, False: 1}).astype(int)
#     display(dataset[dataset['Fare'].isnull()].head(2))
#     display(dataset.head(2))

In [12]:
# Cabin
for dataset in full_data:
    dataset['a_Have_Cabin'] = dataset['Cabin'].isnull().map({True: 0, False: 1}).astype(int)
#     display(dataset[dataset['Cabin'].isnull()].head(2))
#     display(dataset.head(2))

In [13]:
# Embarked
for dataset in full_data:
#     dataset['Embarked'] = dataset['Embarked'].fillna('N')
    dataset['a_Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2, None: 3} ).astype(int)
    dataset['a_Have_Embarked'] = dataset['Embarked'].isnull().map({True: 0, False: 1}).astype(int)
#     display(dataset[dataset['Embarked'].isnull()].head(2))
#     display(dataset.head(2))

Name words segmentation and one-hote

In [14]:
# Name words segmentation
import re
name_words = []

# Inorder to allign columns of data_train and data_test, only data_train to fetch word
for name in data_train['Name']:
#     print(name)
    words = re.findall(r"[\w']+", name)
#     print(len(words))
#     print(words)
    for w in words:
        if w not in name_words:
            name_words.append(w)
# print(len(name_words))
name_words.sort()
# print(name_words)

In [15]:
# Add columns
for dataset in full_data:
    for w in name_words:
        col_name = 'a_Name_' + w
        dataset[col_name] = 0
    dataset.head(1)

In [16]:
# Name words one-hote
for dataset in full_data:
    for i, row in dataset.iterrows():
    #     print(row['Name'])
        words = re.findall(r"[\w']+", row['Name'])
        for w in words:
            if w in name_words:
                col_name = 'a_Name_' + w
                dataset.loc[i, col_name] = 1
#     display(dataset[dataset['a_Name_Braund'] == 1])

Cabin segmentation and one-hote

In [17]:
# Get cabin segmentation words
import re
cabin_words = []

# Inorder to allign columns of data_train and data_test, only data_train to fetch number
for c in data_train['Cabin']:
#     print(c)
    if c is not np.nan:
        word = re.findall(r"[a-zA-Z]", c)
#         print(words[0])
        cabin_words.append(word[0])
print(len(cabin_words))
cabin_words.sort()
print(np.unique(cabin_words))
cabin_words_unique = list(np.unique(cabin_words))

204
['A' 'B' 'C' 'D' 'E' 'F' 'G' 'T']


In [18]:
def get_cabin_word(cabin):
    if cabin is not np.nan:
        word = re.findall(r"[a-zA-Z]", cabin)
        if word:
            return cabin_words_unique.index(word[0])
    return -1

for dataset in full_data:
    dataset['a_Cabin_Word'] = dataset['Cabin'].apply(get_cabin_word)
    # dataset['a_Cabin_Word'].head(100)

In [19]:
def get_cabin_number(cabin):
    if cabin is not np.nan:
        word = re.findall(r"[0-9]+", cabin)
        if word:
            return int(word[0])
    return -1

for dataset in full_data:
    dataset['a_Cabin_Number'] = dataset['Cabin'].apply(get_cabin_number)
    # dataset['a_Cabin_Number'].head(100)

In [20]:
# Clean data
# Reference: 
#    1. https://www.kaggle.com/sinakhorami/titanic-best-working-classifier
#    2. https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python/notebook

full_data = [data_train, data_test]
for dataset in full_data:
    dataset['a_Name_length'] = dataset['Name'].apply(len)
    #dataset['Sex'] = (dataset['Sex']=='male').astype(int)
    dataset['a_Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    dataset['a_Age'] = dataset['Age'].fillna(0)
    dataset['a_Age_IsNull'] = dataset['Age'].isnull()
    dataset['a_FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    dataset['a_IsAlone'] = dataset['a_FamilySize'].apply(lambda x: 1 if x<=1 else 0)
    dataset['a_Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())
    #dataset['Has_Cabin'] = dataset['Cabin'].apply(lambda x: 1 if type(x) == str else 0) # same as below
    dataset['a_Has_Cabin'] = dataset['Cabin'].apply(lambda x: 0 if type(x) == float else 1)
    dataset['a_Has_Embarked'] = dataset['Embarked'].isnull()
    dataset['Embarked'] = dataset['Embarked'].fillna('N')
    dataset['a_Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2, 'N': 3} ).astype(int)
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
    
display(data_train.head(2))
display(data_test.head(2))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,a_Name_of,a_Name_the,a_Name_van,a_Name_y,a_Cabin_Word,a_Cabin_Number,a_Name_length,a_Age_IsNull,a_Has_Cabin,a_Has_Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,0,0,0,0,-1,-1,23,False,0,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,0,0,0,0,2,85,51,False,1,False


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,a_Name_of,a_Name_the,a_Name_van,a_Name_y,a_Cabin_Word,a_Cabin_Number,a_Name_length,a_Age_IsNull,a_Has_Cabin,a_Has_Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,...,0,0,0,0,-1,-1,16,False,0,False
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,...,0,0,0,0,-1,-1,32,False,0,False


In [21]:
survived = data_train['Survived']
data_train0 = data_train.drop(data_train_original_col, axis = 1)
data_test0  = data_test.drop(data_test_original_col, axis = 1)
display(data_train0.head(2))
display(data_test0.head(2))

features = data_train0
display(features.head(2))

Unnamed: 0,a_Pclass,a_Name_Length,a_Sex,a_Age,a_Have_Age,a_FamilySize,a_IsAlone,a_Have_Ticket,a_Fare,a_Have_Fare,...,a_Name_of,a_Name_the,a_Name_van,a_Name_y,a_Cabin_Word,a_Cabin_Number,a_Name_length,a_Age_IsNull,a_Has_Cabin,a_Has_Embarked
0,3,23,1,22.0,1,2,0,1,7.25,1,...,0,0,0,0,-1,-1,23,False,0,False
1,1,51,0,38.0,1,2,0,1,71.2833,1,...,0,0,0,0,2,85,51,False,1,False


Unnamed: 0,a_Pclass,a_Name_Length,a_Sex,a_Age,a_Have_Age,a_FamilySize,a_IsAlone,a_Have_Ticket,a_Fare,a_Have_Fare,...,a_Name_of,a_Name_the,a_Name_van,a_Name_y,a_Cabin_Word,a_Cabin_Number,a_Name_length,a_Age_IsNull,a_Has_Cabin,a_Has_Embarked
0,3,16,1,34.5,1,1,1,1,7.8292,1,...,0,0,0,0,-1,-1,16,False,0,False
1,3,32,0,47.0,1,2,0,1,7.0,1,...,0,0,0,0,-1,-1,32,False,0,False


Unnamed: 0,a_Pclass,a_Name_Length,a_Sex,a_Age,a_Have_Age,a_FamilySize,a_IsAlone,a_Have_Ticket,a_Fare,a_Have_Fare,...,a_Name_of,a_Name_the,a_Name_van,a_Name_y,a_Cabin_Word,a_Cabin_Number,a_Name_length,a_Age_IsNull,a_Has_Cabin,a_Has_Embarked
0,3,23,1,22.0,1,2,0,1,7.25,1,...,0,0,0,0,-1,-1,23,False,0,False
1,1,51,0,38.0,1,2,0,1,71.2833,1,...,0,0,0,0,2,85,51,False,1,False


Check and confirm all columns is proccessed

In [22]:
for col in features.columns:
    if not col.startswith('a_'):
        print(col)

## 2. Build model
Adaboost+GridSearch+CV

In [23]:
x_data = features
y_data = survived
x_test = data_test0

n_components = 60

pca = PCA(n_components=n_components)
pca.fit(x_data)
pca.fit(x_test)

x_data = pca.transform(x_data)
x_test = pca.transform(x_test)
x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.1, random_state=2017)
print(x_train.shape)
print(y_train.shape)
print(x_val.shape)
print(y_val.shape)

print(x_data.shape)
print(y_data.shape)
print(x_test.shape)

lgb_train = lgb.Dataset(x_data, y_data)

(801, 60)
(801,)
(90, 60)
(90,)
(891, 60)
(891,)
(418, 60)


In [24]:
%%time
# param_grid = {
#     #"reg_alpha": [0.3, 0.7, 0.9, 1.1],
#     'boosting_type': ['gbdt', 'rf'],
#     "learning_rate": [0.06, 0.1, 0.13],
#     'n_estimators': [75, 80, 85],
#     'max_depth': [7, 8, 9],
#     'min_child_samples': [15, 20, 30],
#     'num_leaves': [27, 30, 33]
# }

param_grid = {
    'boosting_type': ['gbdt'],
    "learning_rate": [0.06],
    'n_estimators': [50, 60],
    'max_depth': [5, 6, 7],
    'min_child_samples': [10],
    'num_leaves': [20]
}

clf = lgb.LGBMClassifier(
    objective="binary",
    metric=['auc', 'binary_logloss'],
    max_bin=255,
    n_jobs=1,
    seed=42
)


grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=10, n_jobs=1)
grid_search.fit(x_data, y_data)
y_data_pred = grid_search.predict(x_data)
print('train acc: %.3f' % accuracy_score(y_data_pred, y_data))
print('val   acc: %.3f' % grid_search.best_score_)
print(grid_search.best_estimator_)

train acc: 0.963
val   acc: 0.811
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.06, max_bin=255, max_depth=5,
        metric=['auc', 'binary_logloss'], min_child_samples=10,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=60,
        n_jobs=1, num_leaves=20, objective='binary', random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, seed=42, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)
Wall time: 15.8 s


In [25]:
import numpy as np
import math
import pdb

# 【p25，公式2.2】
def sign(x):
    if x > 0.5:
        return 1
    else:
        return 0

In [26]:
%%time
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import math

class Adaboost(object):
    def __init__(self, classifier_count = 10):
        self._x_train = None
        self._y_train = None
        # 【p138，算法8.1，第（2）步】这里M是分类器的数量，【140】中部，有提到
        # “步骤（3）线性组合f(x)实现M个基本分类器的加权表决”
        self._classifier_count = classifier_count
        

    def fit(self, x_train: np.array, y_train: np.array):
        self._x_train = x_train
        self._y_train = y_train
        self._m = len(self._x_train)
        # self._n = len(self._x_train[0])

        self._clf = None
        self._error = 0
        # self._weight = None
        self._alpha = 0

        self._classifiers = []
        self._errors = []
        self._weights = []
        self._alphas = []
        org_clfs = [
            DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10),
            GradientBoostingClassifier(n_estimators=14, max_depth=5),
#             RandomForestClassifier(n_estimators=17, max_depth=17)
        ]
        # 【p138，第（1）步，初始化权值】
        self._weight = np.ones((self._m,)) / self._m
        for i in range(self._classifier_count):
#             self._clf = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10)
#             self._clf = GradientBoostingClassifier(n_estimators=8, max_depth=8)
            self._clf = GridSearchCV(estimator=clf, param_grid=param_grid, cv=10, n_jobs=1)
#             self._clf = org_clfs[i % len(org_clfs)]
#             pdb.set_trace()
            self._clf.fit(x_train, y_train, self._weight)
            print(grid_search.best_score_)
            print(self._clf.best_estimator_)
            err = self._clf.predict(x_train) != y_train
            err = err.astype(int)
            err = err * self._weight
            print('err: %s' % err[:5])
            self._error = sum(err) / len(y_train)
            # 【p139，公式8.2】
            self._alpha = 1. / 2 * math.log((1 - self._error) / self._error)
            self._weight = self._get_weight()

            self._classifiers.append(self._clf)
            self._errors.append(self._error)
            self._alphas.append(self._alpha)
            self._weights.append(self._weight)
            print('error: %s' % self._error)
            print('alpha: %s' % self._alpha)
            print('weight:%s' % self._weight[:5])
            print('*'*30)
            # 终止条件，没有找到书中对应的内容，属于个人添加
            if(all(self.predict(x_train) == y_train)):
                print('全部正确分类，满足终止条件：%s of %s' %(i, self._classifier_count))
                break
            pass


    def predict_prob(self, x_test):
        y_preds = []
        for i, clf in enumerate(self._classifiers):
            y_preds.append(self._alphas[i] * clf.predict(x_test))
        y_pred = sum(y_preds)
        return y_pred
    def predict(self, x_test):
        # 【p8.7，公式8.7】这里把probability和sign分开成两个函数来实现，
        # 便于需要probability的情况
        y_preds = self.predict_prob(x_test)
        result = np.ones((len(y_preds),))
        for i in range(len(y_preds)):
            result[i] = sign(y_preds[i])
        return result


    @property
    def x_fit(self):
        return self._x_train
    @property
    def y_fit(self):
        return self._y_train
    
    def _get_weight(self):
        # 【p139，公式8.3，8.4，8.5】
        weight_factors = self._weight*np.exp(-self._alpha*self._y_train*self._clf.predict(self._x_train))
        z = sum(weight_factors)
        new_wf = weight_factors / z
        return new_wf

Wall time: 1.01 ms


In [27]:
%%time
ada = Adaboost(8)
ada.fit(x_train, y_train)
y_train_pred = ada.predict(x_train)
print('用x_train数据测试整体模型：%.3f' % accuracy_score(y_train_pred, y_train))
# y_val_pred = ada.predict(x_val)
val_acc = accuracy_score(y_train_pred, y_train)
# print('用x_val数据测试整体模型：%.3f' % val_acc)

0.811447811448
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.06, max_bin=255, max_depth=6,
        metric=['auc', 'binary_logloss'], min_child_samples=10,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=60,
        n_jobs=1, num_leaves=20, objective='binary', random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, seed=42, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)
err: 742    0.0
206    0.0
332    0.0
824    0.0
257    0.0
Name: Survived, dtype: float64
error: 3.27306223026e-05
alpha: 5.1635833626276675
weight:742    0.000011
206    0.001974
332    0.001974
824    0.001974
257    0.000011
Name: Survived, dtype: float64
******************************
0.811447811448
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.06, max_bin=255, max_depth=6,
        metric=['auc', 'binary_logloss'], min_child_samples=10,
        min_child_

## 4. Predict and Export titanic_pred.csv file

In [28]:
y_test_pred = ada.predict(x_test)
y_test_pred = (y_test_pred>=0.5).astype(int)
print(y_test_pred[:10])

[0 0 0 0 1 0 1 0 1 0]


In [29]:
import time
import os

project_name = 'Titanic'
step_name = 'LightGBM_LightGBM_GridSearchCV_Adaboost'
time_str = time.strftime("%Y%m%d_%H%M%S", time.localtime())
final_acc_str = '{0:0>4}'.format(int(val_acc*10000))
run_name_acc = project_name + '_' + step_name + str(n_components) + '_' + time_str + '_' + final_acc_str
print(run_name_acc)

cwd = os.getcwd()
pred_file = os.path.join(cwd, 'output', run_name_acc + '.csv')
print(pred_file)

Titanic_LightGBM_LightGBM_GridSearchCV_Adaboost60_20180320_001159_9737
D:\Kaggle\titanic\output\Titanic_LightGBM_LightGBM_GridSearchCV_Adaboost60_20180320_001159_9737.csv


In [30]:
passenger_id = data_test['PassengerId']
output = pd.DataFrame({ 'PassengerId': passenger_id, 'Survived': y_test_pred})
output.to_csv(pred_file, index = False )

In [31]:
print('Done!')

Done!


#### 