# Titanic Tutorial
https://www.kaggle.com/c/titanic#tutorials

## 1. Preprocess

In [1]:
# Import libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display

%matplotlib inline

In [2]:
# Import original data as DataFrame
data_train = pd.read_csv('./input/train.csv')
data_test = pd.read_csv('./input/test.csv')

display(data_train.head(3))
display(data_test.head(3))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


In [3]:
# Clean data
# Reference: 
#    1. https://www.kaggle.com/sinakhorami/titanic-best-working-classifier
#    2. https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python/notebook
full_data = [data_train, data_test]
for dataset in full_data:
    dataset['Name_length'] = dataset['Name'].apply(len)
    #dataset['Sex'] = (dataset['Sex']=='male').astype(int)
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    dataset['IsAlone'] = dataset['FamilySize'].apply(lambda x: 1 if x<=1 else 0)
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())
    #dataset['Has_Cabin'] = dataset["Cabin"].apply(lambda x: 1 if type(x) == str else 0) # same as below
    dataset['Has_Cabin'] = dataset["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    
display(data_train.head(2))
display(data_test.head(2))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_length,FamilySize,IsAlone,Has_Cabin
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,0,23,2,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,1,51,2,0,1


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_length,FamilySize,IsAlone,Has_Cabin
0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,2,16,1,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0,,0,32,2,0,0


In [4]:
# Feature selection
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
data_train0 = data_train.drop(drop_elements, axis = 1)
data_test0  = data_test.drop(drop_elements, axis = 1)
display(data_train0.head(2))
display(data_test0.head(2))

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,Fare,Embarked,Name_length,FamilySize,IsAlone,Has_Cabin
0,0,3,1,22.0,0,7.25,0,23,2,0,0
1,1,1,0,38.0,0,71.2833,1,51,2,0,1


Unnamed: 0,Pclass,Sex,Age,Parch,Fare,Embarked,Name_length,FamilySize,IsAlone,Has_Cabin
0,3,1,34.5,0,7.8292,2,16,1,1,0
1,3,0,47.0,0,7.0,0,32,2,0,0


In [5]:
# Scale
#scaler = MinMaxScaler()
#numerical = ['Age', 'Parch', 'Fare', 'capital-loss', 'hours-per-week']
#features_raw[numerical] = scaler.fit_transform(data[numerical])

In [6]:
survived = data_train0['Survived']
features = data_train0.drop('Survived', axis = 1)

In [7]:
# Shuffle and split the train_data into train, crossvalidation and testing subsets
# X_traincv    : X_test = 0.8       : 0.2
# X_train: X_cv: X_test = 0.64: 0.16: 0.2
X_traincv, X_test, y_traincv, y_test = train_test_split(features, survived, test_size=0.2, random_state=30)
X_train, X_cv, y_train, y_cv = train_test_split(X_traincv, y_traincv, test_size=0.2, random_state=30)

In [8]:
# Show distribute of abave data sets
display(X_train.head(2))
display(y_train.head(2))

Unnamed: 0,Pclass,Sex,Age,Parch,Fare,Embarked,Name_length,FamilySize,IsAlone,Has_Cabin
515,1,1,47.0,0,34.0208,0,28,1,1,1
410,3,1,28.0,0,7.8958,0,18,1,1,0


515    0
410    0
Name: Survived, dtype: int64

## 2. Build model

In [9]:
from sklearn import svm
model_svc = svm.SVC()
model_svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [10]:
from sklearn.ensemble import AdaBoostClassifier
model_ada = AdaBoostClassifier(n_estimators=100)
model_ada.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=100, random_state=None)

In [11]:
from sklearn.ensemble import RandomForestClassifier
model_rfc = RandomForestClassifier(n_estimators=10, max_depth=10)
model_rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [12]:
from sklearn.ensemble import GradientBoostingClassifier
model_gbc = GradientBoostingClassifier(n_estimators=100, max_depth=10, learning_rate=1.0)
model_gbc.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=1.0, loss='deviance', max_depth=10,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [13]:
class adaptXgb:
    def __init__(self, est):
        self.est = est
    def predict(self, X):
        return self.est.predict_proba(X)[:,1][:,np.newaxis]
    def fit(self, X, y, sample_weight):
        self.est.fit(X, y, sample_weight)

In [14]:
model_adaptXgb = adaptXgb(model_rfc)

In [15]:
%pdb off
model_gbc_rfc = GradientBoostingClassifier(init=model_adaptXgb, max_depth=10)
model_gbc_rfc.fit(X_train, y_train)

Automatic pdb calling has been turned OFF


GradientBoostingClassifier(criterion='friedman_mse',
              init=<__main__.adaptXgb object at 0x000001BF2170C7B8>,
              learning_rate=0.1, loss='deviance', max_depth=10,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [16]:
from sklearn.ensemble import VotingClassifier
model_vote = VotingClassifier(estimators=[('rfc', model_rfc), ('gbc', model_gbc)], voting='soft')
model_vote.fit(X_train, y_train)

VotingClassifier(estimators=[('rfc', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_wei...      presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [17]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
model_ada = AdaBoostClassifier(
    model_gbc,
    n_estimators=100,
    learning_rate=0.1)
model_ada.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=1.0, loss='deviance', max_depth=10,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
          learning_rate=0.1, n_estimators=100, random_state=None)

## 3. Train and CrossValidation and Test

In [18]:
def accuracy_score(truth, pred):
    """ Returns accuracy score for input truth and predictions. """
    # Ensure that the number of predictions matches number of outcomes
    if len(truth) == len(pred): 
        # Calculate and return the accuracy as a percent
        return "Predictions have an accuracy of {:.2f}%.".format((truth == pred).mean()*100)
    else:
        return "Number of predictions does not match number of outcomes!"

In [19]:
y_test_pred1 = model_svc.predict(X_test)
print(accuracy_score(y_test, y_test_pred1))

y_test_pred2 = model_ada.predict(X_test)
print(accuracy_score(y_test, y_test_pred2))
################################################
y_test_pred3 = model_rfc.predict(X_test)
print(accuracy_score(y_test, y_test_pred3))

y_test_pred4 = model_gbc.predict(X_test)
print(accuracy_score(y_test, y_test_pred4))

y_test_pred5 = model_gbc_rfc.predict(X_test)
print(accuracy_score(y_test, y_test_pred5))

y_test_pred6 = model_vote.predict(X_test)
print(accuracy_score(y_test, y_test_pred6))

y_test_pred7 = model_ada.predict(X_test)
print(accuracy_score(y_test, y_test_pred7))

Predictions have an accuracy of 67.04%.
Predictions have an accuracy of 78.77%.
Predictions have an accuracy of 81.56%.
Predictions have an accuracy of 79.89%.
Predictions have an accuracy of 79.33%.
Predictions have an accuracy of 79.33%.
Predictions have an accuracy of 78.77%.


## 4. Predict and Export titanic_pred.csv file

In [20]:
display(data_test0.head(2))
y_data_pred = model_ada.predict(data_test0)
passenger_id = data_test['PassengerId']
output = pd.DataFrame( { 'PassengerId': passenger_id , 'Survived': y_data_pred })
output.to_csv( '.\\output\\titanic_pred.csv' , index = False )

Unnamed: 0,Pclass,Sex,Age,Parch,Fare,Embarked,Name_length,FamilySize,IsAlone,Has_Cabin
0,3,1,34.5,0,7.8292,2,16,1,1,0
1,3,0,47.0,0,7.0,0,32,2,0,0
