In [114]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer

In [115]:
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
%config InlineBackend.figure_format = 'retina'
from pylab import rcParams
rcParams['figure.figsize'] = (9, 6)

In [116]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [117]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [118]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [119]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [120]:
train.Sex.value_counts(dropna=False)

male      577
female    314
Name: Sex, dtype: int64

In [121]:
train.groupby('Sex')['Pclass'].value_counts(normalize=True)

Sex     Pclass
female  3         0.458599
        1         0.299363
        2         0.242038
male    3         0.601386
        1         0.211438
        2         0.187175
Name: Pclass, dtype: float64

In [122]:
y_train = train.Survived
train.drop('Survived', axis=1, inplace=True)

In [123]:
train.columns == test.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

In [124]:
train['is_test'] = 0
test['is_test'] = 1

In [125]:
df = pd.concat([train, test])

In [126]:
df["isMale"] = df.Sex.replace({"male": 1, "female":0})
df["treatment"] = df.Name.apply(lambda s: (s.split(",")[1].strip()).split(" ")[0])
df.drop(["Sex", "Cabin", "Ticket", "Name", "PassengerId"], axis=1, inplace=True)

In [127]:
df.Pclass.value_counts()

3    709
1    323
2    277
Name: Pclass, dtype: int64

In [128]:
df_dummies = pd.get_dummies(df, columns=['Pclass', 'Embarked', 'treatment'])

In [129]:
df_dummies.head(10)

Unnamed: 0,Age,SibSp,Parch,Fare,is_test,isMale,Pclass_1,Pclass_2,Pclass_3,Embarked_C,...,treatment_Master.,treatment_Miss.,treatment_Mlle.,treatment_Mme.,treatment_Mr.,treatment_Mrs.,treatment_Ms.,treatment_Rev.,treatment_Sir.,treatment_the
0,22.0,1,0,7.25,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
1,38.0,1,0,71.2833,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,26.0,0,0,7.925,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
3,35.0,1,0,53.1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,35.0,0,0,8.05,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
5,,0,0,8.4583,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
6,54.0,0,0,51.8625,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
7,2.0,3,1,21.075,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
8,27.0,0,2,11.1333,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
9,14.0,1,0,30.0708,0,0,0,1,0,1,...,0,0,0,0,0,1,0,0,0,0


In [130]:
df_dummies.isnull().sum()

Age                    263
SibSp                    0
Parch                    0
Fare                     1
is_test                  0
isMale                   0
Pclass_1                 0
Pclass_2                 0
Pclass_3                 0
Embarked_C               0
Embarked_Q               0
Embarked_S               0
treatment_Capt.          0
treatment_Col.           0
treatment_Don.           0
treatment_Dona.          0
treatment_Dr.            0
treatment_Jonkheer.      0
treatment_Lady.          0
treatment_Major.         0
treatment_Master.        0
treatment_Miss.          0
treatment_Mlle.          0
treatment_Mme.           0
treatment_Mr.            0
treatment_Mrs.           0
treatment_Ms.            0
treatment_Rev.           0
treatment_Sir.           0
treatment_the            0
dtype: int64

In [131]:
X_train = df_dummies[df_dummies.is_test==0].drop('is_test', axis=1)
X_test = df_dummies[df_dummies.is_test==1].drop('is_test', axis=1)

In [132]:
columns = X_train.columns

In [133]:
X_train.head(10)

Unnamed: 0,Age,SibSp,Parch,Fare,isMale,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,...,treatment_Master.,treatment_Miss.,treatment_Mlle.,treatment_Mme.,treatment_Mr.,treatment_Mrs.,treatment_Ms.,treatment_Rev.,treatment_Sir.,treatment_the
0,22.0,1,0,7.25,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
1,38.0,1,0,71.2833,0,1,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
2,26.0,0,0,7.925,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
3,35.0,1,0,53.1,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,35.0,0,0,8.05,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
5,,0,0,8.4583,1,0,0,1,0,1,...,0,0,0,0,1,0,0,0,0,0
6,54.0,0,0,51.8625,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
7,2.0,3,1,21.075,1,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
8,27.0,0,2,11.1333,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
9,14.0,1,0,30.0708,0,0,1,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [134]:
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=True)



In [135]:
imputer.fit(X_train)

Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)

In [136]:
X_train_imputed = imputer.transform(X_train)
X_train_imputed = pd.DataFrame(X_train_imputed, columns=columns)

In [137]:
X_train_imputed.head(10)

Unnamed: 0,Age,SibSp,Parch,Fare,isMale,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,...,treatment_Master.,treatment_Miss.,treatment_Mlle.,treatment_Mme.,treatment_Mr.,treatment_Mrs.,treatment_Ms.,treatment_Rev.,treatment_Sir.,treatment_the
0,22.0,1.0,0.0,7.25,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,38.0,1.0,0.0,71.2833,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,26.0,0.0,0.0,7.925,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,35.0,1.0,0.0,53.1,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,35.0,0.0,0.0,8.05,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,29.699118,0.0,0.0,8.4583,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6,54.0,0.0,0.0,51.8625,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,2.0,3.0,1.0,21.075,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,27.0,0.0,2.0,11.1333,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,14.0,1.0,0.0,30.0708,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


### Разделение на обучающую и тестирующую выборки

In [202]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [203]:
scaler = StandardScaler()
scaler.fit(X_train_imputed)
X_train_imputed_scaled = scaler.transform(X_train_imputed)
X_train_imputed_scaled = pd.DataFrame(X_train_imputed_scaled, columns=columns)
X_train_fin, X_val, y_train_fin, y_val = train_test_split(X_train_imputed_scaled, y_train, test_size=0.2)
X_test_imputed_scaled = scaler.transform(imputer.transform(X_test))


In [204]:
X_train_fin.shape

(712, 29)

In [205]:
X_val.shape

(179, 29)

In [206]:
X_test_imputed_scaled.shape

(418, 29)

In [207]:
y_train_fin.shape

(712,)

In [208]:
y_val.shape

(179,)

In [218]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier

In [219]:
cs = np.arange(1, 6, 1)
cs

array([1, 2, 3, 4, 5])

In [224]:
grid = {'max_depth': cs}
gridsearch = GridSearchCV(DecisionTreeClassifier(), grid, scoring='accuracy', cv=10)


In [225]:
%%time
gridsearch.fit(X_train_fin, y_train_fin)

Wall time: 173 ms


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [226]:
gridsearch.cv_results_



{'mean_fit_time': array([0.00149617, 0.00140042, 0.00149603, 0.0018003 , 0.00189478]),
 'mean_score_time': array([0.00079768, 0.00089521, 0.00069823, 0.00059865, 0.00069578]),
 'mean_test_score': array([0.78792135, 0.78651685, 0.8258427 , 0.82865169, 0.82865169]),
 'mean_train_score': array([0.78792105, 0.8061838 , 0.83754137, 0.85720259, 0.86843945]),
 'param_max_depth': masked_array(data=[1, 2, 3, 4, 5],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 1},
  {'max_depth': 2},
  {'max_depth': 3},
  {'max_depth': 4},
  {'max_depth': 5}],
 'rank_test_score': array([4, 5, 3, 1, 1]),
 'split0_test_score': array([0.73611111, 0.77777778, 0.77777778, 0.80555556, 0.80555556]),
 'split0_train_score': array([0.79375  , 0.825    , 0.83125  , 0.8515625, 0.8546875]),
 'split1_test_score': array([0.73611111, 0.73611111, 0.73611111, 0.79166667, 0.79166667]),
 'split1_train_score': array([0.79375  , 0.8296875, 0.8359

In [227]:
gridsearch.best_params_

{'max_depth': 4}

In [228]:
best_C = gridsearch.best_params_["max_depth"]

In [229]:
from sklearn.metrics import accuracy_score

In [230]:
clf = gridsearch.best_estimator_
clf.feature_importances_

array([0.05320525, 0.        , 0.00904952, 0.16191692, 0.01665248,
       0.        , 0.        , 0.13296899, 0.00298028, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.57885888, 0.        ,
       0.        , 0.04436769, 0.        , 0.        ])

In [231]:
clf.fit(X_train_fin, y_train_fin)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [232]:
y_val_pred = clf.predict(X_val)

In [233]:
accuracy_score(y_val_pred, y_val)

0.7877094972067039

# Финальное предсказание

In [234]:
clf.fit(X_train_imputed_scaled, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

предсказание вероятностей принадлежности классу 0 и 1:

In [235]:
clf.predict_proba(X_test_imputed_scaled)[:10]

array([[1.        , 0.        ],
       [0.85714286, 0.14285714],
       [0.90625   , 0.09375   ],
       [0.90145985, 0.09854015],
       [0.3553719 , 0.6446281 ],
       [0.90145985, 0.09854015],
       [0.3553719 , 0.6446281 ],
       [1.        , 0.        ],
       [0.3553719 , 0.6446281 ],
       [0.90145985, 0.09854015]])

In [236]:
from sklearn.tree import export_graphviz

def get_tree_dot_view(clf, feature_names=None, class_names=None):
    print(export_graphviz(clf, out_file=None, filled=True, feature_names=feature_names, class_names=class_names))

In [237]:
clf = DecisionTreeClassifier(max_depth=3)
clf.fit(X_train_fin, y_train_fin)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [238]:
get_tree_dot_view(clf, list(X_train_fin.columns), str(y_train_fin))

digraph Tree {
node [shape=box, style="filled", color="black"] ;
0 [label="treatment_Mr. <= -0.163\ngini = 0.475\nsamples = 712\nvalue = [436, 276]\nclass = 8", fillcolor="#e581395e"] ;
1 [label="Pclass_3 <= -0.103\ngini = 0.414\nsamples = 301\nvalue = [88, 213]\nclass = 8", fillcolor="#399de596"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="treatment_Rev. <= 6.031\ngini = 0.194\nsamples = 165\nvalue = [18, 147]\nclass = 8", fillcolor="#399de5e0"] ;
1 -> 2 ;
3 [label="gini = 0.149\nsamples = 160\nvalue = [13, 147]\nclass = 8", fillcolor="#399de5e8"] ;
2 -> 3 ;
4 [label="gini = 0.0\nsamples = 5\nvalue = [5, 0]\nclass = 8", fillcolor="#e58139ff"] ;
2 -> 4 ;
5 [label="Fare <= -0.229\ngini = 0.5\nsamples = 136\nvalue = [70, 66]\nclass = 8", fillcolor="#e581390f"] ;
1 -> 5 ;
6 [label="gini = 0.453\nsamples = 95\nvalue = [33, 62]\nclass = 8", fillcolor="#399de577"] ;
5 -> 6 ;
7 [label="gini = 0.176\nsamples = 41\nvalue = [37, 4]\nclass = 8", fillcolor="#e58139e3"