In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import copy
import random
import graphviz
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [3]:
# Read the cleaned training table; the first CSV column becomes the DataFrame index.
data = pd.read_csv(filepath_or_buffer='train_clean.csv', index_col=0)
# Display one record as a quick look at the available columns.
data.head(n=1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Unknown,S


In [8]:
# Target vector: the 'Survived' column, copied so it is independent of `data`.
y = data['Survived'].copy()
# Feature table: everything except the target and the PassengerId/Name/Ticket columns.
X = data.drop(columns=['PassengerId', 'Survived', 'Name', 'Ticket'])
# Hold out 30% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=111,
)
# Show the (rows, columns) of both partitions.
(X_train.shape, X_test.shape)

((623, 8), (268, 8))

In [10]:
def shift_0_age(age, fill_value=30):
    """Return ``age`` unchanged unless it equals 0, in which case return ``fill_value``.

    Parameters
    ----------
    age : number
        A single age value.
        NOTE(review): 0 presumably marks a missing age in the cleaned
        data set -- confirm against the cleaning step.
    fill_value : number, default 30
        Value substituted for an age of 0. The default keeps the behavior
        of the original hard-coded constant.

    Returns
    -------
    number
        ``fill_value`` when ``age == 0``, otherwise ``age`` itself.
        NaN compares unequal to 0 and is therefore passed through as-is.
    """
    return fill_value if age == 0 else age

def toNumbers(data):
    """Build an all-numeric/boolean feature table from the raw passenger frame.

    The result has the columns, in this order:
    'Pclass', 'Age', 'SibSp', 'Parch', 'Fare' (taken from ``data``; every
    'Age' value is passed through ``shift_0_age``), then
    'Sex' (True where ``data['Sex']`` equals 'female'),
    'isCabin' (True where ``data['Cabin']`` differs from 'Unknown') and
    'Embarked' (``ord`` of each value, i.e. the character's code point).

    ``data`` itself is not modified. ``ord`` only accepts one-character
    strings, so any other 'Embarked' value raises ``TypeError``.
    """
    numeric_part = data[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]
    # `assign` overwrites 'Age' in place and appends the new columns in keyword order.
    return numeric_part.assign(
        Sex=data['Sex'].eq('female'),
        Age=numeric_part['Age'].apply(shift_0_age),
        isCabin=data['Cabin'].ne('Unknown'),
        Embarked=data['Embarked'].apply(ord),
    )

In [11]:
# Run both partitions through the same encoding so their columns match.
X1_train, X1_test = (toNumbers(part) for part in (X_train, X_test))
# Preview the encoded training features.
X1_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex,isCabin,Embarked
680,3,30.0,0,0,8.1375,True,False,81
810,3,26.0,0,0,7.8875,False,False,83
727,3,30.0,0,0,7.7375,True,False,81
775,3,18.0,0,0,7.75,False,False,83
795,2,39.0,0,0,13.0,False,False,83


In [26]:
# Baseline forest: 1000 trees, depth capped at 7, splits need an impurity drop of at least 0.02.
# `fit` returns the estimator itself, so construction and training can be chained.
rf1 = RandomForestClassifier(
    n_estimators=1000,
    max_depth=7,
    min_impurity_decrease=0.02,
    random_state=10,
).fit(X1_train, y_train)
# Mean accuracy on the held-out partition.
rf1.score(X1_test, y_test)

0.7910447761194029

In [32]:
# Hyper-parameter grid for the forest: 19 depths x 19 split sizes x 5 impurity thresholds.
params = {'max_depth': range(1, 20), 'min_samples_split': range(2, 21),
          'min_impurity_decrease': [0, 0.01, 0.02, 0.03, 0.04]}

# The base estimator is seeded: without `random_state` every candidate is fitted with a
# different random forest on each run, so the CV scores (and the "best" parameters
# picked from them) would change from one execution of this cell to the next.
clf = GridSearchCV(RandomForestClassifier(random_state=10), params, return_train_score=True, cv=5)
clf.fit(X1_train, y_train)
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
# The loop variable must not be called `params`: that would overwrite the grid dict above
# with the last candidate and leave misleading state for later cells.
for mean, std, candidate in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, candidate))
print('Best parameters:', clf.best_params_)
# Best mean CV accuracy, followed by the accuracy of the refitted best model on the test split.
print(clf.best_score_, accuracy_score(y_test, clf.predict(X1_test)))

0.745 (+/-0.083) for {'max_depth': 1, 'min_impurity_decrease': 0, 'min_samples_split': 2}
0.722 (+/-0.076) for {'max_depth': 1, 'min_impurity_decrease': 0, 'min_samples_split': 3}
0.724 (+/-0.081) for {'max_depth': 1, 'min_impurity_decrease': 0, 'min_samples_split': 4}
0.740 (+/-0.065) for {'max_depth': 1, 'min_impurity_decrease': 0, 'min_samples_split': 5}
0.746 (+/-0.092) for {'max_depth': 1, 'min_impurity_decrease': 0, 'min_samples_split': 6}
0.745 (+/-0.079) for {'max_depth': 1, 'min_impurity_decrease': 0, 'min_samples_split': 7}
0.726 (+/-0.034) for {'max_depth': 1, 'min_impurity_decrease': 0, 'min_samples_split': 8}
0.714 (+/-0.046) for {'max_depth': 1, 'min_impurity_decrease': 0, 'min_samples_split': 9}
0.722 (+/-0.067) for {'max_depth': 1, 'min_impurity_decrease': 0, 'min_samples_split': 10}
0.730 (+/-0.102) for {'max_depth': 1, 'min_impurity_decrease': 0, 'min_samples_split': 11}
0.796 (+/-0.093) for {'max_depth': 1, 'min_impurity_decrease': 0, 'min_samples_split': 12}
0.735 (

In [37]:
# Second forest: default tree count, depth capped at 5, at least 7 samples needed to split a node.
# `fit` returns the estimator itself, so construction and training can be chained.
rf2 = RandomForestClassifier(
    max_depth=5,
    min_samples_split=7,
    random_state=50,
).fit(X1_train, y_train)
# Mean accuracy on the held-out partition.
rf2.score(X1_test, y_test)

0.7835820895522388

In [39]:
# Importance scores of the fitted forest `rf2`, one per feature; the positions
# correspond to the columns of `X1_train`, the frame `rf2` was fitted on.
rf2.feature_importances_

array([0.12592568, 0.10858254, 0.06962965, 0.05239097, 0.22075994,
       0.3608533 , 0.03756861, 0.02428931])

In [40]:
# Column names of the encoded training frame, in the order the model saw them.
X1_train.columns

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex', 'isCabin',
       'Embarked'],
      dtype='object')

In [42]:
# (rows, columns) of the training rows whose Age is strictly below 11.
X1_train.loc[X1_train['Age'].lt(11)].shape

(48, 8)

In [44]:
# Reduced feature set: the same frames without the 'isCabin' and 'Embarked' columns.
X2_train, X2_test = (
    frame.drop(columns=['isCabin', 'Embarked'])
    for frame in (X1_train, X1_test)
)
# Preview the reduced training features.
X2_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex
680,3,30.0,0,0,8.1375,True
810,3,26.0,0,0,7.8875,False
727,3,30.0,0,0,7.7375,True
775,3,18.0,0,0,7.75,False
795,2,39.0,0,0,13.0,False


In [45]:
# Hyper-parameter grid for the reduced feature set: 10 depths x 19 split sizes x 5 impurity thresholds.
params = {'max_depth': range(5, 15), 'min_samples_split': range(2, 21),
          'min_impurity_decrease': [0, 0.01, 0.02, 0.03, 0.04]}

# The base estimator is seeded: without `random_state` every candidate is fitted with a
# different random forest on each run, so the CV scores (and the "best" parameters
# picked from them) would change from one execution of this cell to the next.
clf2 = GridSearchCV(RandomForestClassifier(random_state=10), params, return_train_score=True, cv=5)
clf2.fit(X2_train, y_train)
means = clf2.cv_results_['mean_test_score']
stds = clf2.cv_results_['std_test_score']
# The loop variable must not be called `params`: that would overwrite the grid dict above
# with the last candidate and leave misleading state for later cells.
for mean, std, candidate in zip(means, stds, clf2.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, candidate))
# Report the winner of THIS search (clf2); the earlier search object `clf` was
# fitted on a different feature set and its best parameters do not apply here.
print('Best parameters:', clf2.best_params_)
# Best mean CV accuracy, followed by the accuracy of the refitted best model on the test split.
print(clf2.best_score_, accuracy_score(y_test, clf2.predict(X2_test)))

0.820 (+/-0.065) for {'max_depth': 5, 'min_impurity_decrease': 0, 'min_samples_split': 2}
0.811 (+/-0.074) for {'max_depth': 5, 'min_impurity_decrease': 0, 'min_samples_split': 3}
0.809 (+/-0.074) for {'max_depth': 5, 'min_impurity_decrease': 0, 'min_samples_split': 4}
0.799 (+/-0.077) for {'max_depth': 5, 'min_impurity_decrease': 0, 'min_samples_split': 5}
0.809 (+/-0.074) for {'max_depth': 5, 'min_impurity_decrease': 0, 'min_samples_split': 6}
0.814 (+/-0.072) for {'max_depth': 5, 'min_impurity_decrease': 0, 'min_samples_split': 7}
0.822 (+/-0.048) for {'max_depth': 5, 'min_impurity_decrease': 0, 'min_samples_split': 8}
0.817 (+/-0.049) for {'max_depth': 5, 'min_impurity_decrease': 0, 'min_samples_split': 9}
0.815 (+/-0.057) for {'max_depth': 5, 'min_impurity_decrease': 0, 'min_samples_split': 10}
0.815 (+/-0.071) for {'max_depth': 5, 'min_impurity_decrease': 0, 'min_samples_split': 11}
0.828 (+/-0.069) for {'max_depth': 5, 'min_impurity_decrease': 0, 'min_samples_split': 12}
0.819 (