In [85]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold

from sklearn.tree import export_graphviz
from io import StringIO
import pydotplus
from ipywidgets import Image
import warnings
warnings.filterwarnings('ignore')

In [2]:
RANDOM_STATE = 6

# Analysing csv`s

In [3]:
train_df = pd.read_csv('train.csv', index_col='PassengerId')
train_df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test_df = pd.read_csv('test.csv', index_col='PassengerId')
test_df.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
train_df.sample(5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
785,0,3,"Ali, Mr. William",male,25.0,0,0,SOTON/O.Q. 3101312,7.05,,S
706,0,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0,0,250655,26.0,,S
365,0,3,"O'Brien, Mr. Thomas",male,,1,0,370365,15.5,,Q
309,0,2,"Abelson, Mr. Samuel",male,30.0,1,0,P/PP 3381,24.0,,C
338,1,1,"Burns, Miss. Elizabeth Margaret",female,41.0,0,0,16966,134.5,E40,C


In [6]:
train_df.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [7]:
test_df.isnull().sum()

Pclass        0
Name          0
Sex           0
Age          86
SibSp         0
Parch         0
Ticket        0
Fare          1
Cabin       327
Embarked      0
dtype: int64

In [8]:
train_df.describe(include='all')

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,891,2,,,,681.0,,147,3
top,,,"Mudd, Mr. Thomas Charles",male,,,,347082.0,,B96 B98,S
freq,,,1,577,,,,7.0,,4,644
mean,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [9]:
# Filling missing values
train_df['Age'].fillna(train_df['Age'].median(), inplace=True)
test_df['Age'].fillna(test_df['Age'].median(), inplace=True)

train_df['Embarked'].fillna(train_df['Embarked'].mode()[0], inplace=True)

test_df['Fare'].fillna(test_df['Fare'].median(), inplace=True)

# Droping columns
y = train_df['Survived']
train_df.drop(['Survived', 'Cabin', 'Ticket'], axis=1, inplace = True)
test_df.drop(['Ticket', 'Cabin'], axis=1, inplace=True)

In [10]:
test_df.head(1)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,Q


In [11]:
test_df.head(1)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,Q


In [12]:
test_df.isnull().sum()
train_df.isnull().sum()

Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [13]:
test_df.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,S


In [14]:
test_df.loc[892, 'Name'].split(', ')[1].split('.')[0]

'Mr'

In [15]:
test_df['Rank'] = test_df['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
train_df['Rank'] = train_df['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
test_df['Rank'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Ms', 'Col', 'Rev', 'Dr', 'Dona'],
      dtype=object)

In [16]:
test_df['Sex'] =test_df['Sex'].map({'male':0, 'female':1})
train_df['Sex'] = train_df['Sex'].map({'male':0, 'female':1})

In [17]:
test_df.drop('Name', axis=1, inplace=True)
test_df.head(1)

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Rank
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,3,0,34.5,0,0,7.8292,Q,Mr


In [18]:
train_df.drop('Name', axis=1, inplace=True)
train_df.head(1)

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Rank
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3,0,22.0,1,0,7.25,S,Mr


In [19]:
le = LabelEncoder()
le.fit(train_df['Embarked'])
train_df['Embarked'] = le.transform(train_df['Embarked'])
test_df['Embarked'] = le.transform(test_df['Embarked'])

In [20]:
ranks = ['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer', 'Dona']

In [21]:
le2 = LabelEncoder()
le2.fit(ranks)
train_df['Rank'] = le2.transform(train_df['Rank'])
test_df['Rank'] = le2.transform(test_df['Rank'])

# Fiting

In [22]:
def write_submission(estimator, filename, X_test):
    pred = estimator.predict(X_test)
    df = pd.DataFrame({'Survived': pred}, index=test_df.index)
    df.to_csv(filename)

In [23]:
X_train, x_ho, y_train, y_ho = train_test_split(train_df, y, test_size=0.2, random_state=RANDOM_STATE)

In [24]:
X_train.shape, x_ho.shape

((712, 8), (179, 8))

# Baseline

In [30]:
def cv_accuracy(estimator, X_train, y_train, skf):
    acc = cross_val_score(estimator, X_train, y_train, scoring='accuracy', cv=skf, verbose=1, n_jobs=-1)
    print("{0} +-{1}".format(acc.mean(), acc.std()))

In [89]:
def gridsearch(estimator, params, X_train, y_train, skf):
    gsearch = GridSearchCV(estimator, params, cv=skf, n_jobs=-1, verbose=True)
    gsearch.fit(X_train, y_train)
    print("best_params: ", gsearch.best_params_)
    print("best estemator: ", gsearch.best_estimator_)
    print('{0}+-{1}'.format(gsearch.best_score_, gsearch.cv_results_['std_test_score'][gsearch.best_index_]))

## Decision tree

In [60]:
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
cv_accuracy(dt, X_train, y_train, 6)

0.7823315766984761 +-0.0278412339843142


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.0s finished


* Hyperparms

In [40]:
tree_params = {
    'max_depth': np.arange(1, 22),
    'min_samples_leaf': np.arange(1, 20)
}

In [None]:
{'max_depth': 10, 'min_samples_leaf': 8}
0.8132022471910112+-0.04342303333528202

In [90]:
gridsearch(dt, tree_params, X_train, y_train, 6)

Fitting 6 folds for each of 399 candidates, totalling 2394 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    6.9s


best_params:  {'max_depth': 8, 'min_samples_leaf': 7}
best estemator:  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=7, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=6, splitter='best')
0.8146067415730337+-0.03149526525222514


[Parallel(n_jobs=-1)]: Done 2394 out of 2394 | elapsed:   13.4s finished


* SKF

In [92]:
skf = StratifiedKFold(n_splits=6, shuffle=True, random_state=RANDOM_STATE)

In [93]:
gridsearch(dt, tree_params, X_train, y_train, skf)

Fitting 6 folds for each of 399 candidates, totalling 2394 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 484 tasks      | elapsed:    1.4s


best_params:  {'max_depth': 8, 'min_samples_leaf': 5}
best estemator:  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=6, splitter='best')
0.8398876404494382+-0.031357801582924526


[Parallel(n_jobs=-1)]: Done 2394 out of 2394 | elapsed:    6.5s finished


* Accuracy scores

In [None]:
dt = DecisionTreeClassifier(random_state=RANDOM_STATE, max_depth=8, max_features=7, n_estimators=24)
rf.fit(train_df, y)
accuracy_score(rf.predict(x_ho), y_ho)

## Random forest

In [79]:
rf = RandomForestClassifier(random_state=RANDOM_STATE)
cv_accuracy(rf, X_train, y_train, 9)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    0.1s finished


0.8271962566266364 +-0.04699431422194508


* Hyperparams

In [131]:
rf_params = {
    "max_depth": np.arange(2, 7),
    "n_estimators": (22, 30),
    "max_features": np.arange(1, 8),
    "min_samples_split": np.arange(1,4),
    "min_samples_leaf": (1, 6),
}

In [None]:
best_params:  {'max_depth': 9, 'max_features': 2, 'n_estimators': 10}
0.8384831460674157+-0.024504011918896555

In [125]:
gridsearch(rf, rf_params, X_train, y_train, 9)

Fitting 9 folds for each of 700 candidates, totalling 6300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   29.0s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   49.4s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 6042 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 6300 out of 6300 | elapsed:  8.4min finished


best_params:  {'max_depth': 6, 'max_features': 7, 'n_estimators': 105}
best estemator:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=6, max_features=7, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=105,
                       n_jobs=None, oob_score=False, random_state=6, verbose=0,
                       warm_start=False)
0.8300561797752809+-0.04341416991379918


* SKF

In [101]:
skf = StratifiedKFold(n_splits=9, shuffle=True, random_state=RANDOM_STATE)
gridsearch(rf, rf_params, X_train, y_train, skf)

Fitting 9 folds for each of 4375 candidates, totalling 39375 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 348 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 848 tasks      | elapsed:   26.0s
[Parallel(n_jobs=-1)]: Done 1548 tasks      | elapsed:   43.0s
[Parallel(n_jobs=-1)]: Done 2448 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 3548 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 4848 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 6348 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 8048 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 9948 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 12048 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 14348 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 16848 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 19548 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done 22448 tasks  

best_params:  {'max_depth': 7, 'max_features': 7, 'n_estimators': 24}
best estemator:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=7, max_features=7, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=24,
                       n_jobs=None, oob_score=False, random_state=6, verbose=0,
                       warm_start=False)
0.8469101123595506+-0.02535414138273166


* Accuracy scores

In [130]:
rf = RandomForestClassifier(random_state=RANDOM_STATE, max_depth=7, max_features=7, n_estimators=24)
rf.fit(train_df, y)
accuracy_score(rf.predict(x_ho), y_ho)

0.9106145251396648

In [129]:
write_submission(rf, 'tuned_rf_skf2.csv', test_df)

## Knn

In [35]:
scaler = StandardScaler()
scaler.fit(X_train, y_train)
X_train_scaled = scaler.transform(X_train)
knn = KNeighborsClassifier()

In [36]:
cv_accuracy(knn, X_train_scaled, y_train, 5)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


0.7864827856025038 +-0.01492478056354656


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished


* Hyperparams

In [46]:
knn_params = {
    'n_neighbours': np.arange(1, 20)
}

In [52]:
gridsearch(knn, knn_params, X_train_scaled, y_train, 5)

Fitting 5 folds for each of 19 candidates, totalling 95 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


ValueError: Invalid parameter n_neighbours for estimator KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform'). Check the list of available parameters with `estimator.get_params().keys()`.