**Kaggle "Titanic: Machine Learning from Disaster".**

In [1]:
# отключим предупреждения Anaconda
import warnings
warnings.simplefilter('ignore')
import pandas as pd
import numpy as np
%pylab inline
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression

Populating the interactive namespace from numpy and matplotlib


In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/agconti/kaggle-titanic/master/data/train.csv",index_col='PassengerId')  

In [3]:
df=df.drop(['Ticket','Cabin','Embarked','Name'],axis=1)

In [4]:
holdout = df.sample(100) # рандомная отложенная часть
df = df.loc[~df.index.isin(holdout.index)] #наш датасет без отложенной части

In [5]:
#целевое событие
y = df.Survived

In [6]:
#датасет без целевого события
df= df.drop(['Survived'],axis=1)

In [7]:
df.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3,male,22.0,1,0,7.25
2,1,female,38.0,1,0,71.2833
3,3,female,26.0,0,0,7.925
4,1,female,35.0,1,0,53.1
5,3,male,35.0,0,0,8.05


**ЗАДАНИЕ**

1) Разбейте данные на train и test (пропорция 80/20, укажите в параметрах random_state = 17 и stratify = y)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, train_size=0.8, random_state=17, stratify=y)

In [9]:
y_test.value_counts()

0    98
1    61
Name: Survived, dtype: int64

2) Примените к обеим частям выборки пайплайн vec(задан ниже) 

In [10]:
from sklearn.pipeline import make_union, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction import DictVectorizer

def get_categ_col(df):
    return df[['Sex','Pclass']]

def get_num_cols(df):
    return df[['Age', 'Fare', 'SibSp', 'Parch']]

vec = make_union(*[
   make_pipeline(FunctionTransformer(get_categ_col, validate=False),  OneHotEncoder(sparse=False)),
   make_pipeline(FunctionTransformer(get_num_cols, validate=False), SimpleImputer(strategy='median'), StandardScaler()),
])

In [11]:
vec.fit(X_train)

X_train_transformed = pd.DataFrame(vec.transform(X_train))
X_test_transformed = pd.DataFrame(vec.transform(X_test))

In [14]:
X_train_transformed

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.0,0.0,1.0,0.0,0.374960,-0.223260,-0.486852,-0.468787
1,1.0,0.0,1.0,0.0,0.0,1.804636,0.971649,0.420790,-0.468787
2,1.0,0.0,1.0,0.0,0.0,-0.001270,1.143442,-0.486852,-0.468787
3,0.0,1.0,0.0,0.0,1.0,-0.076516,-0.499730,-0.486852,-0.468787
4,0.0,1.0,0.0,1.0,0.0,0.901683,-0.390185,-0.486852,-0.468787
...,...,...,...,...,...,...,...,...,...
627,0.0,1.0,0.0,0.0,1.0,-0.076516,-0.500164,-0.486852,-0.468787
628,0.0,1.0,0.0,0.0,1.0,-0.151763,-0.496688,-0.486852,-0.468787
629,0.0,1.0,1.0,0.0,0.0,0.374960,-0.107455,-0.486852,-0.468787
630,0.0,1.0,0.0,0.0,1.0,-0.076516,-0.499730,0.420790,-0.468787


3) С помощью GridSearchCV обучите модель LogisticRegression: сбалансируйте классы, переберите значения параметра С [0.01, 0.05, 0.1, 0.5, 1, 5, 10], задайте 3 фолда на кроссвалидации

Посчитайте средний roc_auc на тестовых фолдах (возьмите 3 значения из cv_results).

In [15]:
cv = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid={
        'C' : [0.01, 0.05, 0.1, 0.5, 1, 5, 10],
        'class_weight': ['balanced']
    },
    cv=3
).fit(X_train_transformed, y_train)

In [16]:
np.mean(cv.cv_results_['mean_test_score'])

0.7866492568591418

4) Посчитайте roc_auc для выборки holdout.

In [17]:
from sklearn.metrics import roc_auc_score

holdout_X = holdout.drop(axis=1, labels=['Survived'])
holdout_y = holdout[['Survived']]
holdout_X_transformed = pd.DataFrame(vec.transform(holdout_X))

roc_auc_score(holdout_y, cv.best_estimator_.predict(holdout_X_transformed))

0.7705663497312939

5) Посчитайте долю влияния фичи Pclass на итоговое предсказание.

In [19]:
holdout_X

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
623,3,male,20.0,1,1,15.7417
388,2,female,36.0,0,0,13.0000
730,3,female,25.0,1,0,7.9250
487,1,female,35.0,1,0,90.0000
408,2,male,3.0,1,1,18.7500
...,...,...,...,...,...,...
178,1,female,50.0,0,0,28.7125
424,3,female,28.0,1,1,14.4000
455,3,male,,0,0,8.0500
820,3,male,10.0,3,2,27.9000


In [21]:
holdout_X_transformed.columns = cols = ['is_female', 'is_male', 'pclass_1', 'pclass_2', 'pclass_3', 'age', 'sibsp', 'parch', 'fare']
holdout_X_transformed

Unnamed: 0,is_female,is_male,pclass_1,pclass_2,pclass_3,age,sibsp,parch,fare
0,0.0,1.0,0.0,0.0,1.0,-0.753731,-0.332978,0.420790,0.791951
1,1.0,0.0,0.0,1.0,0.0,0.450206,-0.390185,-0.486852,-0.468787
2,1.0,0.0,0.0,0.0,1.0,-0.377501,-0.496078,0.420790,-0.468787
3,1.0,0.0,1.0,0.0,0.0,0.374960,1.216472,0.420790,-0.468787
4,0.0,1.0,0.0,1.0,0.0,-2.032915,-0.270207,0.420790,0.791951
...,...,...,...,...,...,...,...,...,...
95,1.0,0.0,1.0,0.0,0.0,1.503651,-0.062333,-0.486852,-0.468787
96,1.0,0.0,0.0,0.0,1.0,-0.151763,-0.360973,0.420790,0.791951
97,0.0,1.0,0.0,0.0,1.0,-0.076516,-0.493470,-0.486852,-0.468787
98,0.0,1.0,0.0,0.0,1.0,-1.506192,-0.079287,2.236074,2.052690


In [22]:
contribution = pd.DataFrame((np.exp(cv.best_estimator_.coef_) / np.exp(cv.best_estimator_.coef_).sum()), columns=holdout_X_transformed.columns)
contribution

Unnamed: 0,is_female,is_male,pclass_1,pclass_2,pclass_3,age,sibsp,parch,fare
0,0.24131,0.041295,0.167425,0.105765,0.056176,0.075697,0.13745,0.077296,0.097586


In [23]:
contribution[['pclass_1', 'pclass_2', 'pclass_3']].iloc[0].sum()

0.3293653092958109

6) Теперь попробуйте применить к тем же данным дерево решений. 
Переберите значения для 5 разных параметров дерева решений (глубина, число листьев и т.д.) с помощью RandomizedSearch(n_iter = 100). Посчитайте roc_auc

In [28]:
X_train_transformed.columns = cols
X_train_transformed

Unnamed: 0,is_female,is_male,pclass_1,pclass_2,pclass_3,age,sibsp,parch,fare
0,1.0,0.0,0.0,1.0,0.0,0.374960,-0.223260,-0.486852,-0.468787
1,1.0,0.0,1.0,0.0,0.0,1.804636,0.971649,0.420790,-0.468787
2,1.0,0.0,1.0,0.0,0.0,-0.001270,1.143442,-0.486852,-0.468787
3,0.0,1.0,0.0,0.0,1.0,-0.076516,-0.499730,-0.486852,-0.468787
4,0.0,1.0,0.0,1.0,0.0,0.901683,-0.390185,-0.486852,-0.468787
...,...,...,...,...,...,...,...,...,...
627,0.0,1.0,0.0,0.0,1.0,-0.076516,-0.500164,-0.486852,-0.468787
628,0.0,1.0,0.0,0.0,1.0,-0.151763,-0.496688,-0.486852,-0.468787
629,0.0,1.0,1.0,0.0,0.0,0.374960,-0.107455,-0.486852,-0.468787
630,0.0,1.0,0.0,0.0,1.0,-0.076516,-0.499730,0.420790,-0.468787


In [30]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

tree_params = {
    'max_depth': range(2, 6),
    'min_samples_leaf': range(2, 5),
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_features': ['auto', 'sqrt', 'log2']
}

locally_best_tree = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42),
    tree_params,
    n_iter=100,
).fit(X_train_transformed, y_train)

In [33]:
np.mean(locally_best_tree.cv_results_['mean_test_score'])

0.792604674415698

7) Возьмите лучшее дерево из п.6 и визуализируйте его с помощью graphviz 

In [37]:
import sklearn
from graphviz import Source
from IPython.display import Image

best = locally_best_tree.best_estimator_

graph = Source(sklearn.tree.export_graphviz(locally_best_tree.best_estimator_, out_file=None, feature_names=X_train_transformed.columns.values))
png_bytes = graph.pipe(format='png')

Image(png_bytes)

ExecutableNotFound: failed to execute ['dot', '-Kdot', '-Tpng'], make sure the Graphviz executables are on your systems' PATH

8) Запустите в цикле обучение дерева решений для X_train с разными значениями max_depth от 1 до 10. 
Нарисуйте график как меняется roc auc в зависимости от max_depth для выборок X_train и X_test, сравните насколько графики похожи.