In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [7]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## X label-encoded

In [256]:
X = train.drop(['PassengerId', 'Name', 'Cabin', 'Ticket', 'Survived'], axis=1)

In [257]:
from sklearn.preprocessing import LabelEncoder

In [258]:
lbl1 = LabelEncoder()
X['Sex'] = lbl1.fit_transform(X['Sex'])
lbl2 = LabelEncoder()
X['Embarked'] = X['Embarked'].fillna('N')
X['Embarked'] = lbl2.fit_transform(X['Embarked'])
y = train['Survived']

In [160]:
X_nan = X.copy()
X_nan['Age'] = X_nan['Age'].fillna(X_nan['Age'].median()).astype(float)

## Model selection

In [102]:
from sklearn.model_selection import cross_val_score
import time

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [71]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold

In [105]:
kfolds = StratifiedKFold(shuffle=True, random_state=1)

In [136]:
def run_model(model, X, y):
    model_name = str(model.__class__).split('.')[-1].replace('>','').replace("'",'')
    print(model_name)
    accuracy = []
    f1 = []
    start = int(time.time() * 1000)
    for train_idx, test_idx in kfolds.split(X, y):
        model.fit(X.iloc[train_idx,:], y[train_idx])
        y_ = model.predict(X.iloc[test_idx,:])
        accuracy += [accuracy_score(y.iloc[test_idx], y_)]
        f1 += [f1_score(y.iloc[test_idx], y_)]
    end = int(time.time() * 1000)
    results.loc[len(results)] = [model_name, np.mean(accuracy), np.std(accuracy), 
                                 np.mean(f1), np.std(f1), str(end - start)]

In [202]:
def run_linear_models(X, y):
    run_model(LogisticRegression(), X, y)
    run_model(RidgeClassifier(), X, y)
    run_model(SGDClassifier(max_iter=1000), X, y)
    run_model(PassiveAggressiveClassifier(max_iter=1000), X, y)
    run_model(SVC(), X, y)
    run_model(LinearSVC(), X, y)
    run_model(MLPClassifier(max_iter=1000), X, y)
    
def run_misc_models(X, y):
#     run_model(GaussianNB(), X, y)
#     run_model(MultinomialNB(), X, y)
    run_model(BernoulliNB(), X, y)
    run_model(GaussianProcessClassifier(), X, y)
    run_model(KNeighborsClassifier(), X, y)
    run_model(DecisionTreeClassifier(), X, y)
    
def run_ensemble_models(X, y, X_nan):
    run_model(RandomForestClassifier(), X_nan, y)
    run_model(ExtraTreesClassifier(), X_nan, y)
    run_model(AdaBoostClassifier(), X_nan, y)
    run_model(GradientBoostingClassifier(), X_nan, y)    
    run_model(XGBClassifier(), X, y)
    run_model(LGBMClassifier(), X, y)
    run_model(CatBoostClassifier(), X, y)

In [127]:
results = pd.DataFrame(columns=['model', 'accuracy', 'accuracy-std', 'f1-score', 'f1-score-std', 'time'])

In [128]:
run_ensemble_models(X, y, X_nan)

RandomForestClassifier
ExtraTreesClassifier
AdaBoostClassifier
GradientBoostingClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier


In [149]:
run_misc_models(X_nan, y)

BernoulliNB
GaussianProcessClassifier
KNeighborsClassifier
DecisionTreeClassifier


In [152]:
run_model(SGDClassifier(max_iter=100), X_nan, y)

SGDClassifier


In [162]:
results


Unnamed: 0,model,accuracy,accuracy-std,f1-score,f1-score-std,time
0,RandomForestClassifier,0.806958,0.014108,0.731689,0.024212,111
1,ExtraTreesClassifier,0.805836,0.001587,0.735611,0.00758,64
2,AdaBoostClassifier,0.789001,0.008399,0.723547,0.021509,256
3,GradientBoostingClassifier,0.824916,0.019244,0.757814,0.025988,202
4,XGBClassifier,0.821549,0.014547,0.75628,0.022376,116
5,LGBMClassifier,0.801347,0.012598,0.727761,0.019489,49
6,CatBoostClassifier,0.826038,0.011111,0.76261,0.015169,26554
7,LogisticRegression,0.795735,0.019309,0.717724,0.022204,44
8,RidgeClassifier,0.795735,0.011446,0.719034,0.017241,74
9,SGDClassifier,0.585859,0.016722,0.458404,0.126533,27


## Normalize

In [164]:
from sklearn.preprocessing import MinMaxScaler

In [170]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,3
1,1,0,38.0,1,0,71.2833,0
2,3,0,26.0,0,0,7.925,3
3,1,0,35.0,1,0,53.1,3
4,3,1,35.0,0,0,8.05,3


In [179]:
X_norm = X_nan.copy()

In [182]:
sc1 = MinMaxScaler()
X_norm['Age'] = sc1.fit_transform(X_norm['Age'].values.reshape((-1, 1)))
sc2 = MinMaxScaler()
X_norm['Fare'] = sc2.fit_transform(X_norm['Fare'].values.reshape((-1, 1)))

In [185]:
run_linear_models(X_norm, y)

LogisticRegression
RidgeClassifier
SGDClassifier
PassiveAggressiveClassifier
SVC




LinearSVC
MLPClassifier




In [186]:
results

Unnamed: 0,model,accuracy,accuracy-std,f1-score,f1-score-std,time
0,RandomForestClassifier,0.806958,0.014108,0.731689,0.024212,111
1,ExtraTreesClassifier,0.805836,0.001587,0.735611,0.00758,64
2,AdaBoostClassifier,0.789001,0.008399,0.723547,0.021509,256
3,GradientBoostingClassifier,0.824916,0.019244,0.757814,0.025988,202
4,XGBClassifier,0.821549,0.014547,0.75628,0.022376,116
5,LGBMClassifier,0.801347,0.012598,0.727761,0.019489,49
6,CatBoostClassifier,0.826038,0.011111,0.76261,0.015169,26554
7,LogisticRegression,0.795735,0.019309,0.717724,0.022204,44
8,RidgeClassifier,0.795735,0.011446,0.719034,0.017241,74
9,SGDClassifier,0.585859,0.016722,0.458404,0.126533,27


In [262]:
X_all_norm = X_nan.copy()

In [263]:
sc = MinMaxScaler()
X_all_norm = pd.DataFrame(sc.fit_transform(X_all_norm), columns=X_norm.columns)

In [203]:
run_linear_models(X_all_norm, y)

LogisticRegression
RidgeClassifier
SGDClassifier
PassiveAggressiveClassifier
SVC
LinearSVC
MLPClassifier


In [204]:
results

Unnamed: 0,model,accuracy,accuracy-std,f1-score,f1-score-std,time
0,RandomForestClassifier,0.806958,0.014108,0.731689,0.024212,111
1,ExtraTreesClassifier,0.805836,0.001587,0.735611,0.00758,64
2,AdaBoostClassifier,0.789001,0.008399,0.723547,0.021509,256
3,GradientBoostingClassifier,0.824916,0.019244,0.757814,0.025988,202
4,XGBClassifier,0.821549,0.014547,0.75628,0.022376,116
5,LGBMClassifier,0.801347,0.012598,0.727761,0.019489,49
6,CatBoostClassifier,0.826038,0.011111,0.76261,0.015169,26554
7,LogisticRegression,0.795735,0.019309,0.717724,0.022204,44
8,RidgeClassifier,0.795735,0.011446,0.719034,0.017241,74
9,SGDClassifier,0.585859,0.016722,0.458404,0.126533,27


In [248]:
results[results['model'].apply(lambda x: x in ['CatBoostClassifier', 'XGBClassifier'])]

Unnamed: 0,model,accuracy,accuracy-std,f1-score,f1-score-std,time
4,XGBClassifier,0.821549,0.014547,0.75628,0.022376,116
6,CatBoostClassifier,0.826038,0.011111,0.76261,0.015169,26554
20,XGBClassifier,0.819304,0.013561,0.750959,0.020572,153
21,XGBClassifier,0.824916,0.012598,0.757413,0.021329,134
47,XGBClassifier,0.824916,0.012598,0.757413,0.021329,125
49,CatBoostClassifier,0.820426,0.017675,0.752752,0.027106,30811


In [221]:
run_ensemble_models(X_all_norm, y, X_all_norm)

RandomForestClassifier
ExtraTreesClassifier
AdaBoostClassifier
GradientBoostingClassifier
XGBClassifier
LGBMClassifier
CatBoostClassifier


In [222]:
results

Unnamed: 0,model,accuracy,accuracy-std,f1-score,f1-score-std,time
0,RandomForestClassifier,0.806958,0.014108,0.731689,0.024212,111
1,ExtraTreesClassifier,0.805836,0.001587,0.735611,0.00758,64
2,AdaBoostClassifier,0.789001,0.008399,0.723547,0.021509,256
3,GradientBoostingClassifier,0.824916,0.019244,0.757814,0.025988,202
4,XGBClassifier,0.821549,0.014547,0.75628,0.022376,116
5,LGBMClassifier,0.801347,0.012598,0.727761,0.019489,49
6,CatBoostClassifier,0.826038,0.011111,0.76261,0.015169,26554
7,LogisticRegression,0.795735,0.019309,0.717724,0.022204,44
8,RidgeClassifier,0.795735,0.011446,0.719034,0.017241,74
9,SGDClassifier,0.585859,0.016722,0.458404,0.126533,27


In [227]:
results[results['model']=='GradientBoostingClassifier']

Unnamed: 0,model,accuracy,accuracy-std,f1-score,f1-score-std,time
3,GradientBoostingClassifier,0.824916,0.019244,0.757814,0.025988,202
46,GradientBoostingClassifier,0.824916,0.012598,0.758562,0.024533,194


In [228]:
results.iloc[results['accuracy'].sort_values(ascending=False).index].head(10)

Unnamed: 0,model,accuracy,accuracy-std,f1-score,f1-score-std,time
6,CatBoostClassifier,0.826038,0.011111,0.76261,0.015169,26554
47,XGBClassifier,0.824916,0.012598,0.757413,0.021329,125
46,GradientBoostingClassifier,0.824916,0.012598,0.758562,0.024533,194
3,GradientBoostingClassifier,0.824916,0.019244,0.757814,0.025988,202
21,XGBClassifier,0.824916,0.012598,0.757413,0.021329,134
4,XGBClassifier,0.821549,0.014547,0.75628,0.022376,116
49,CatBoostClassifier,0.820426,0.017675,0.752752,0.027106,30811
20,XGBClassifier,0.819304,0.013561,0.750959,0.020572,153
35,MLPClassifier,0.81257,0.012397,0.733779,0.014096,1723
42,MLPClassifier,0.81257,0.013837,0.732228,0.014426,1668


In [235]:
top = results.iloc[results['accuracy'].sort_values(ascending=False).index].drop_duplicates('model')

In [238]:
top

Unnamed: 0,model,accuracy,accuracy-std,f1-score,f1-score-std,time
6,CatBoostClassifier,0.826038,0.011111,0.76261,0.015169,26554
47,XGBClassifier,0.824916,0.012598,0.757413,0.021329,125
46,GradientBoostingClassifier,0.824916,0.012598,0.758562,0.024533,194
35,MLPClassifier,0.81257,0.012397,0.733779,0.014096,1723
26,SVC,0.81257,0.011111,0.735369,0.01492,61
0,RandomForestClassifier,0.806958,0.014108,0.731689,0.024212,111
1,ExtraTreesClassifier,0.805836,0.001587,0.735611,0.00758,64
48,LGBMClassifier,0.803591,0.020264,0.719468,0.02351,50
22,LogisticRegression,0.800224,0.018305,0.723695,0.024161,34
34,LinearSVC,0.79798,0.010997,0.725477,0.017092,38


In [239]:
top_f1 = results.iloc[results['f1-score'].sort_values(ascending=False).index].drop_duplicates('model')

In [240]:
top_f1

Unnamed: 0,model,accuracy,accuracy-std,f1-score,f1-score-std,time
6,CatBoostClassifier,0.826038,0.011111,0.76261,0.015169,26554
46,GradientBoostingClassifier,0.824916,0.012598,0.758562,0.024533,194
47,XGBClassifier,0.824916,0.012598,0.757413,0.021329,125
1,ExtraTreesClassifier,0.805836,0.001587,0.735611,0.00758,64
26,SVC,0.81257,0.011111,0.735369,0.01492,61
45,AdaBoostClassifier,0.79798,0.008247,0.734231,0.015643,250
35,MLPClassifier,0.81257,0.012397,0.733779,0.014096,1723
0,RandomForestClassifier,0.806958,0.014108,0.731689,0.024212,111
5,LGBMClassifier,0.801347,0.012598,0.727761,0.019489,49
18,DecisionTreeClassifier,0.792368,0.013837,0.727593,0.016951,24


In [329]:
run_model(RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_leaf=2), X_all_norm, y)

RandomForestClassifier


In [330]:
results

Unnamed: 0,model,accuracy,accuracy-std,f1-score,f1-score-std,time
0,RandomForestClassifier,0.806958,0.014108,0.731689,0.024212,111
1,ExtraTreesClassifier,0.805836,0.001587,0.735611,0.00758,64
2,AdaBoostClassifier,0.789001,0.008399,0.723547,0.021509,256
3,GradientBoostingClassifier,0.824916,0.019244,0.757814,0.025988,202
4,XGBClassifier,0.821549,0.014547,0.75628,0.022376,116
5,LGBMClassifier,0.801347,0.012598,0.727761,0.019489,49
6,CatBoostClassifier,0.826038,0.011111,0.76261,0.015169,26554
7,LogisticRegression,0.795735,0.019309,0.717724,0.022204,44
8,RidgeClassifier,0.795735,0.011446,0.719034,0.017241,74
9,SGDClassifier,0.585859,0.016722,0.458404,0.126533,27


In [241]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


## Test

In [268]:
X_test = test.drop(['PassengerId', 'Name', 'Cabin', 'Ticket'], axis=1)

In [269]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null int64
Sex         418 non-null object
Age         332 non-null float64
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        417 non-null float64
Embarked    418 non-null object
dtypes: float64(2), int64(3), object(2)
memory usage: 22.9+ KB


In [274]:
X_test_norm = X_test.copy()
X_test_norm['Age'] = X_test_norm['Age'].fillna(X_test_norm['Age'].median()).astype(float)
X_test_norm['Fare'] = X_test_norm['Fare'].fillna(X_test_norm['Fare'].median()).astype(float)
X_test_norm['Embarked'] = X_test_norm['Embarked'].fillna('N')

X_test_norm['Sex'] = lbl1.transform(X_test_norm['Sex'])
X_test_norm['Embarked'] = lbl2.transform(X_test_norm['Embarked'])

In [275]:
X_test_norm = pd.DataFrame(sc.transform(X_test_norm), columns=X_test_norm.columns)

In [276]:
X_test_norm.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,0.632775,0.636364,0.366666,0.055921,0.065391,0.069441,0.719298
std,0.420919,0.481622,0.159635,0.112095,0.163571,0.109012,0.421757
min,0.0,0.0,-0.003141,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.28374,0.0,0.0,0.015412,0.666667
50%,1.0,1.0,0.334004,0.0,0.0,0.028213,1.0
75%,1.0,1.0,0.443956,0.125,0.0,0.061429,1.0
max,1.0,1.0,0.949736,1.0,1.5,1.0,1.0


In [279]:
X_all_norm.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.654321,0.647587,0.363679,0.065376,0.063599,0.062858,0.781145
std,0.418036,0.47799,0.163605,0.137843,0.134343,0.096995,0.389133
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.5,0.0,0.271174,0.0,0.0,0.01544,0.666667
50%,1.0,1.0,0.346569,0.0,0.0,0.028213,1.0
75%,1.0,1.0,0.434531,0.125,0.0,0.060508,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Submission

In [323]:
model = ExtraTreesClassifier(n_estimators=100, max_depth=5, min_samples_leaf=2)
model.fit(X_all_norm, train['Survived'])

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=5, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [324]:
y_pred = model.predict(X_test_norm)

In [325]:
submit = pd.DataFrame()
submit['PassengerId'] = test['PassengerId']
submit['Survived'] = y_pred.astype(int)

In [327]:
submit.to_csv('ExtraTrees.csv', index=False)

## More testing

In [331]:
results.columns


Index(['model', 'accuracy', 'accuracy-std', 'f1-score', 'f1-score-std',
       'time'],
      dtype='object')

In [334]:
std = results.iloc[results['accuracy-std'].sort_values().index].drop_duplicates('model')

In [335]:
std

Unnamed: 0,model,accuracy,accuracy-std,f1-score,f1-score-std,time
1,ExtraTreesClassifier,0.805836,0.001587,0.735611,0.00758,64
52,RandomForestClassifier,0.820426,0.003174,0.743435,0.017484,541
37,RidgeClassifier,0.794613,0.005498,0.722274,0.008471,41
45,AdaBoostClassifier,0.79798,0.008247,0.734231,0.015643,250
27,LinearSVC,0.79798,0.008247,0.724658,0.013497,149
31,SGDClassifier,0.795735,0.008837,0.698333,0.035907,36
26,SVC,0.81257,0.011111,0.735369,0.01492,61
6,CatBoostClassifier,0.826038,0.011111,0.76261,0.015169,26554
35,MLPClassifier,0.81257,0.012397,0.733779,0.014096,1723
36,LogisticRegression,0.79798,0.012598,0.724725,0.017808,33
