In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer

In [7]:
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
%config InlineBackend.figure_format = 'retina'
from pylab import rcParams
rcParams['figure.figsize'] = (9, 6)

In [8]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [9]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [11]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [12]:
train.Sex.value_counts(dropna=False)

male      577
female    314
Name: Sex, dtype: int64

In [13]:
train.groupby('Sex')['Pclass'].value_counts(normalize=True)

Sex     Pclass
female  3         0.458599
        1         0.299363
        2         0.242038
male    3         0.601386
        1         0.211438
        2         0.187175
Name: Pclass, dtype: float64

In [14]:
y_train = train.Survived
train.drop('Survived', axis=1, inplace=True)

In [15]:
train.columns == test.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

In [16]:
train['is_test'] = 0
test['is_test'] = 1

In [17]:
df = pd.concat([train, test])

In [18]:
df["isMale"] = df.Sex.replace({"male": 1, "female":0})
df["treatment"] = df.Name.apply(lambda s: (s.split(",")[1].strip()).split(" ")[0])
df.drop(["Sex", "Cabin", "Ticket", "Name", "PassengerId"], axis=1, inplace=True)

In [19]:
df.Pclass.value_counts()

3    709
1    323
2    277
Name: Pclass, dtype: int64

In [20]:
df_dummies = pd.get_dummies(df, columns=['Pclass', 'Embarked', 'treatment'])

In [21]:
df_dummies.head(10)

Unnamed: 0,Age,SibSp,Parch,Fare,is_test,isMale,Pclass_1,Pclass_2,Pclass_3,Embarked_C,...,treatment_Master.,treatment_Miss.,treatment_Mlle.,treatment_Mme.,treatment_Mr.,treatment_Mrs.,treatment_Ms.,treatment_Rev.,treatment_Sir.,treatment_the
0,22.0,1,0,7.25,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
1,38.0,1,0,71.2833,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,26.0,0,0,7.925,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
3,35.0,1,0,53.1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,35.0,0,0,8.05,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
5,,0,0,8.4583,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
6,54.0,0,0,51.8625,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
7,2.0,3,1,21.075,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
8,27.0,0,2,11.1333,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
9,14.0,1,0,30.0708,0,0,0,1,0,1,...,0,0,0,0,0,1,0,0,0,0


In [22]:
df_dummies.isnull().sum()

Age                    263
SibSp                    0
Parch                    0
Fare                     1
is_test                  0
isMale                   0
Pclass_1                 0
Pclass_2                 0
Pclass_3                 0
Embarked_C               0
Embarked_Q               0
Embarked_S               0
treatment_Capt.          0
treatment_Col.           0
treatment_Don.           0
treatment_Dona.          0
treatment_Dr.            0
treatment_Jonkheer.      0
treatment_Lady.          0
treatment_Major.         0
treatment_Master.        0
treatment_Miss.          0
treatment_Mlle.          0
treatment_Mme.           0
treatment_Mr.            0
treatment_Mrs.           0
treatment_Ms.            0
treatment_Rev.           0
treatment_Sir.           0
treatment_the            0
dtype: int64

In [23]:
X_train = df_dummies[df_dummies.is_test==0].drop('is_test', axis=1)
X_test = df_dummies[df_dummies.is_test==1].drop('is_test', axis=1)

In [24]:
columns = X_train.columns

In [25]:
X_train.head(10)

Unnamed: 0,Age,SibSp,Parch,Fare,isMale,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,...,treatment_Master.,treatment_Miss.,treatment_Mlle.,treatment_Mme.,treatment_Mr.,treatment_Mrs.,treatment_Ms.,treatment_Rev.,treatment_Sir.,treatment_the
0,22.0,1,0,7.25,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
1,38.0,1,0,71.2833,0,1,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
2,26.0,0,0,7.925,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
3,35.0,1,0,53.1,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,35.0,0,0,8.05,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
5,,0,0,8.4583,1,0,0,1,0,1,...,0,0,0,0,1,0,0,0,0,0
6,54.0,0,0,51.8625,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
7,2.0,3,1,21.075,1,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
8,27.0,0,2,11.1333,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
9,14.0,1,0,30.0708,0,0,1,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [26]:
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=True)



In [27]:
imputer.fit(X_train)

Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)

In [28]:
X_train_imputed = imputer.transform(X_train)
X_train_imputed = pd.DataFrame(X_train_imputed, columns=columns)

In [29]:
X_train_imputed.head(10)

Unnamed: 0,Age,SibSp,Parch,Fare,isMale,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,...,treatment_Master.,treatment_Miss.,treatment_Mlle.,treatment_Mme.,treatment_Mr.,treatment_Mrs.,treatment_Ms.,treatment_Rev.,treatment_Sir.,treatment_the
0,22.0,1.0,0.0,7.25,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,38.0,1.0,0.0,71.2833,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,26.0,0.0,0.0,7.925,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,35.0,1.0,0.0,53.1,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,35.0,0.0,0.0,8.05,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,29.699118,0.0,0.0,8.4583,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6,54.0,0.0,0.0,51.8625,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,2.0,3.0,1.0,21.075,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,27.0,0.0,2.0,11.1333,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,14.0,1.0,0.0,30.0708,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


### Разделение на обучающую и тестирующую выборки

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [31]:
scaler = StandardScaler()
scaler.fit(X_train_imputed)
X_train_imputed_scaled = scaler.transform(X_train_imputed)
X_train_imputed_scaled = pd.DataFrame(X_train_imputed_scaled, columns=columns)
X_train_fin, X_val, y_train_fin, y_val = train_test_split(X_train_imputed_scaled, y_train, test_size=0.2)
X_test_imputed_scaled = scaler.transform(imputer.transform(X_test))


In [32]:
X_train_fin.shape

(712, 29)

In [33]:
X_val.shape

(179, 29)

In [34]:
X_test_imputed_scaled.shape

(418, 29)

In [35]:
y_train_fin.shape

(712,)

In [36]:
y_val.shape

(179,)

In [37]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier

In [38]:
cs = np.arange(1, 6, 1)
cs

array([1, 2, 3, 4, 5])

In [39]:
grid = {'max_depth': cs, 'max_features': cs}
gridsearch = GridSearchCV(DecisionTreeClassifier(), grid, scoring='accuracy', cv=10)


In [40]:
%%time
gridsearch.fit(X_train_fin, y_train_fin)

Wall time: 721 ms




GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5]), 'max_features': array([1, 2, 3, 4, 5])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [41]:
gridsearch.cv_results_



{'mean_fit_time': array([0.00160339, 0.00159221, 0.0010998 , 0.00109718, 0.00119753,
        0.00129449, 0.00119476, 0.00149279, 0.00109799, 0.00129471,
        0.00139315, 0.0012953 , 0.00119917, 0.00109615, 0.00129728,
        0.00129409, 0.00149791, 0.00139015, 0.00119452, 0.00159442,
        0.00129371, 0.00189433, 0.00130284, 0.00119555, 0.00149677]),
 'mean_score_time': array([0.00049584, 0.00069764, 0.00069821, 0.00080113, 0.00049932,
        0.00039866, 0.00069957, 0.00039926, 0.0005981 , 0.00059841,
        0.00040233, 0.0005013 , 0.00069637, 0.0006027 , 0.0004957 ,
        0.00049796, 0.00039866, 0.00100076, 0.00060613, 0.00029917,
        0.00080063, 0.00060132, 0.00059805, 0.00070314, 0.00039864]),
 'mean_test_score': array([0.64325843, 0.67696629, 0.62921348, 0.70926966, 0.68398876,
        0.67275281, 0.69803371, 0.74157303, 0.67977528, 0.74578652,
        0.67275281, 0.74438202, 0.75702247, 0.71488764, 0.76544944,
        0.73174157, 0.73735955, 0.73033708, 0.78230337, 0

In [42]:
gridsearch.best_params_

{'max_depth': 5, 'max_features': 5}

In [43]:
best_C = gridsearch.best_params_["max_depth"]

In [44]:
from sklearn.metrics import accuracy_score

In [45]:
clf = gridsearch.best_estimator_
clf.feature_importances_

array([0.05654656, 0.03574632, 0.00660861, 0.01759469, 0.01464721,
       0.01913866, 0.05148176, 0.07012102, 0.06585645, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.04471996, 0.        , 0.        , 0.38721756, 0.22418997,
       0.        , 0.00613123, 0.        , 0.        ])

In [46]:
clf.fit(X_train_fin, y_train_fin)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=5, max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [47]:
y_val_pred = clf.predict(X_val)

In [48]:
accuracy_score(y_val_pred, y_val)

0.7821229050279329

# Финальное предсказание

In [49]:
clf.fit(X_train_imputed_scaled, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=5, max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

предсказание вероятностей принадлежности классу 0 и 1:

In [50]:
clf.predict_proba(X_test_imputed_scaled)[:10]

array([[0.89615385, 0.10384615],
       [0.50909091, 0.49090909],
       [0.90909091, 0.09090909],
       [0.89615385, 0.10384615],
       [0.50909091, 0.49090909],
       [0.89615385, 0.10384615],
       [0.27272727, 0.72727273],
       [0.875     , 0.125     ],
       [0.36363636, 0.63636364],
       [0.84      , 0.16      ]])

In [51]:
from sklearn.tree import export_graphviz

def get_tree_dot_view(clf, feature_names=None, class_names=None):
    print(export_graphviz(clf, out_file=None, filled=True, feature_names=feature_names, class_names=class_names))

In [52]:
clf = DecisionTreeClassifier(max_depth=3)
clf.fit(X_train_fin, y_train_fin)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [53]:
get_tree_dot_view(clf, list(X_train_fin.columns), str(y_train_fin))

digraph Tree {
node [shape=box, style="filled", color="black"] ;
0 [label="treatment_Mr. <= -0.163\ngini = 0.477\nsamples = 712\nvalue = [433, 279]\nclass = 8", fillcolor="#e581395b"] ;
1 [label="Pclass_3 <= -0.103\ngini = 0.428\nsamples = 309\nvalue = [96, 213]\nclass = 1", fillcolor="#399de58c"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="isMale <= -0.309\ngini = 0.212\nsamples = 166\nvalue = [20, 146]\nclass = 1", fillcolor="#399de5dc"] ;
1 -> 2 ;
3 [label="gini = 0.083\nsamples = 138\nvalue = [6, 132]\nclass = 1", fillcolor="#399de5f3"] ;
2 -> 3 ;
4 [label="gini = 0.5\nsamples = 28\nvalue = [14, 14]\nclass = 8", fillcolor="#e5813900"] ;
2 -> 4 ;
5 [label="Fare <= -0.178\ngini = 0.498\nsamples = 143\nvalue = [76, 67]\nclass = 8", fillcolor="#e581391e"] ;
1 -> 5 ;
6 [label="gini = 0.484\nsamples = 107\nvalue = [44, 63]\nclass = 1", fillcolor="#399de54d"] ;
5 -> 6 ;
7 [label="gini = 0.198\nsamples = 36\nvalue = [32, 4]\nclass = 8", fillcolor="#e58139df"] 