# Классификация. Дерево решений.

In [2]:
import pandas as pd
import numpy as np

In [31]:
# Read the bank dataset from a relative path and preview the first five rows.
df = pd.read_csv('../data/bank.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0


![Types of machine learning](https://i.vas3k.ru/7r6.jpg)
[Коротко о машинном обучении](https://vas3k.ru/blog/machine_learning/)

## Дерево решений

In [5]:
# Preview a few feature columns next to the 'Exited' target.
preview_columns = ['Age', 'EstimatedSalary', 'Gender', 'Exited']
df[preview_columns].head(10)

Unnamed: 0,Age,EstimatedSalary,Gender,Exited
0,42,101348.88,Female,1
1,41,112542.58,Female,0
2,42,113931.57,Female,1
3,39,93826.63,Female,0
4,43,79084.1,Female,0
5,44,149756.71,Male,1
6,50,10062.8,Male,0
7,29,119346.88,Female,1
8,44,74940.5,Male,0
9,27,71725.73,Male,0


In [9]:
# (rows, columns) of the loaded frame.
df.shape

(10000, 14)

In [10]:
# Positional hold-out split: the first 7000 rows are used for training and
# the remaining rows for evaluation. The slices must not overlap; otherwise
# the test score is measured on rows the model has already seen.
train = df[:7000]
test = df[7000:]

In [12]:
# Feature matrices: every column of each slice except the 'Exited' target.
X_train, X_test = (part.drop(columns='Exited') for part in (train, test))

print(X_train.shape, X_test.shape)

(7000, 13) (3000, 13)


In [13]:
# Target vectors taken from the same two slices.
y_train, y_test = train['Exited'], test['Exited']

print(y_train.shape, y_test.shape)

(7000,) (3000,)


![Классическое дерево решений](https://habrastorage.org/getpro/habr/upload_files/476/3a4/023/4763a4023eecc0c33289aa195cf6e147)

### Энтропия Шеннона

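$$S = -\sum_{i=1}^{N} p_i \log_2 p_i,$$

где $p_i$ — вероятность $i$-го состояния (в задаче классификации — доля объектов $i$-го класса в узле), а $N$ — число возможных состояний.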

![img](https://habrastorage.org/r/w1560/storage2/785/21c/7c6/78521c7c61114d0c433d76cb4f282f15.png)


![Пример с шариками](https://habrastorage.org/r/w1560/storage2/173/96f/27f/17396f27f81e9bb312f2f01aa1254dbe.png)

[Подробнее](https://habr.com/ru/post/171759/)

![](https://www.researchgate.net/publication/340567535/figure/fig2/AS:880966289588226@1587050139118/Train-test-cross-validation-split-methodology-used-in-this-paper-The-first-operation.jpg)

![](https://i.stack.imgur.com/XJZve.png)

![](https://miro.medium.com/max/1200/1*GH7h526OcmtDy3Q1jeOjHw.png)

Плюсы:
* Высокая скорость
* Интерпретируемость
* Устойчивость к выбросам

## Метод k ближайших соседей (kNN, метрическая классификация)

![](https://upload.wikimedia.org/wikipedia/commons/thumb/d/d2/Iris_Flowers_Clustering_kMeans_ru.svg/1920px-Iris_Flowers_Clustering_kMeans_ru.svg.png)

In [44]:
# Display the frame; the notebook shows the head and tail rows with an
# ellipsis row in between.
df

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,0,42,2.0,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,0,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,0,42,8.0,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,0,39,1.0,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,0,43,2.0,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,1,39,5.0,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,1,35,10.0,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,0,36,7.0,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,1,42,3.0,75075.31,2,1,0,92888.52,1


In [40]:
# Replace missing Tenure values with the column median.
tenure_median = df['Tenure'].median()
df['Tenure'] = df['Tenure'].fillna(tenure_median)

In [43]:
# Encode Gender as 0/1; any value that is not a key of the mapping becomes NaN.
gender_codes = {'Female': 0, 'Male': 1}
df['Gender'] = df['Gender'].map(gender_codes)

In [47]:
# One-hot encode Geography into one indicator column per country.
# dtype=int pins the indicators to 0/1 integers; without it, recent pandas
# versions return boolean True/False columns instead.
df_dummies = pd.get_dummies(df['Geography'], dtype=int)
df_dummies.head()

Unnamed: 0,France,Germany,Spain
0,1,0,0
1,0,0,1
2,1,0,0
3,1,0,0
4,0,0,1


In [48]:
# Remove the identifier columns and the raw Geography text column.
unused_columns = ['RowNumber', 'CustomerId', 'Surname', 'Geography']
df = df.drop(columns=unused_columns)
df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,0,42,2.0,0.0,1,1,1,101348.88,1
1,608,0,41,1.0,83807.86,1,0,1,112542.58,0
2,502,0,42,8.0,159660.8,3,1,0,113931.57,1
3,699,0,39,1.0,0.0,2,0,0,93826.63,0
4,850,0,43,2.0,125510.82,1,1,1,79084.1,0


In [51]:
# Append the one-hot Geography columns to the right of the feature table.
df = pd.concat(objs=[df, df_dummies], axis='columns')
df

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,France,Germany,Spain
0,619,0,42,2.0,0.00,1,1,1,101348.88,1,1,0,0
1,608,0,41,1.0,83807.86,1,0,1,112542.58,0,0,0,1
2,502,0,42,8.0,159660.80,3,1,0,113931.57,1,1,0,0
3,699,0,39,1.0,0.00,2,0,0,93826.63,0,1,0,0
4,850,0,43,2.0,125510.82,1,1,1,79084.10,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,1,39,5.0,0.00,2,1,0,96270.64,0,1,0,0
9996,516,1,35,10.0,57369.61,1,1,1,101699.77,0,1,0,0
9997,709,0,36,7.0,0.00,1,0,1,42085.58,1,1,0,0
9998,772,1,42,3.0,75075.31,2,1,0,92888.52,1,0,1,0


In [52]:
from sklearn.model_selection import train_test_split

In [54]:
# Separate the target column from the feature matrix.
y = df['Exited']
X = df.drop(columns='Exited')

In [56]:
# Shuffled 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=10,
)
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

(7000, 12) (7000,) (3000, 12) (3000,)


In [57]:
# Preview the training features; the row labels are the original
# (now shuffled) index values.
X_train.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,France,Germany,Spain
441,790,0,31,9.0,0.0,2,1,0,84126.75,1,0,0
8216,704,1,41,4.0,109026.8,2,1,1,43117.1,0,1,0
6197,581,0,54,2.0,152508.99,1,1,0,187597.98,0,1,0
6911,705,0,54,3.0,125889.3,3,1,0,96013.5,0,1,0
3938,483,1,41,1.0,118334.44,1,0,0,163147.99,0,1,0


In [60]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
# Baseline decision tree: only the random seed is set, every other
# hyperparameter keeps its library default.
first_tree = DecisionTreeClassifier(random_state=10)

In [65]:
# Mean score of the baseline tree over 5 cross-validation folds.
cross_val_score(first_tree, X_train, y_train, cv=5).mean()

0.798

In [66]:
from sklearn.neighbors import KNeighborsClassifier
# Baseline k-nearest-neighbours classifier with library defaults.
# NOTE(review): no feature scaling is applied anywhere above, so distances
# are dominated by large-magnitude columns such as Balance and
# EstimatedSalary. Consider standardising the features before using kNN.
first_knn = KNeighborsClassifier()

In [67]:
# Mean score of the baseline kNN over 5 cross-validation folds.
cross_val_score(first_knn, X_train, y_train, cv=5).mean()

0.7638571428571429

In [80]:
from sklearn.model_selection import GridSearchCV
# Hyperparameter grid for the decision tree.
# max_features depends on the value's type: a float is a fraction of the
# columns, an int is an absolute count. 1.0 therefore means "all features",
# whereas the integer 1 would limit every split to a single feature.
tree_params = {'max_depth': range(1, 16), 'max_features': [0.5, 0.7, 1.0]}
tree_grid = GridSearchCV(first_tree, tree_params, cv=5, n_jobs=-1)

In [81]:
%%time
# Run the grid search on the training split; %%time reports the wall time.
tree_grid.fit(X_train,y_train)

Wall time: 7.88 s


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=10), n_jobs=-1,
             param_grid={'max_depth': range(1, 16),
                         'max_features': [0.5, 0.7, 1]})

In [82]:
# Best mean cross-validation score and the parameter combination that achieved it.
tree_grid.best_score_, tree_grid.best_params_

(0.8548571428571428, {'max_depth': 7, 'max_features': 0.7})

In [84]:
# List the hyperparameters of the baseline kNN and their current values.
first_knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [85]:
%%time
knn_params = {'n_neighbors': range(5,30,5)}
knn_grid = GridSearchCV(first_knn,knn_params,cv=5, n_jobs=-1)
knn_grid.fit(X_train,y_train)

Wall time: 819 ms


GridSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'n_neighbors': range(5, 30, 5)})

In [86]:
# Best mean cross-validation score and the selected neighbour count.
knn_grid.best_score_, knn_grid.best_params_

(0.7979999999999998, {'n_neighbors': 25})

In [87]:
# Predict the hold-out labels with the best tree refit by the grid search.
y_preds = tree_grid.best_estimator_.predict(X_test)

In [88]:
from sklearn.metrics import accuracy_score
# Hold-out accuracy. Arguments follow the (y_true, y_pred) order of the
# scikit-learn metrics API, matching the confusion_matrix call below.
accuracy_score(y_test, y_preds)

0.8516666666666667

In [102]:
from sklearn.metrics import confusion_matrix
# IPython help lookup: shows the signature and docstring of confusion_matrix.
confusion_matrix?

[1;31mSignature:[0m
[0mconfusion_matrix[0m[1;33m([0m[1;33m
[0m    [0my_true[0m[1;33m,[0m[1;33m
[0m    [0my_pred[0m[1;33m,[0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mlabels[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0msample_weight[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mnormalize[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Compute confusion matrix to evaluate the accuracy of a classification.

By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}`
is equal to the number of observations known to be in group :math:`i` and
predicted to be in group :math:`j`.

Thus in binary classification, the count of true negatives is
:math:`C_{0,0}`, false negatives is :math:`C_{1,0}`, true positives is
:math:`C_{1,1}` and false positives is :math:`C_{0,1}`.

Read more in the :ref:`User Guide <confusion_matrix>`.

Parameters
----------
y_tr

In [103]:
# Rows are the true classes, columns are the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_preds)

array([[2276,   96],
       [ 349,  279]], dtype=int64)

In [106]:
# Re-create the tree with the hyperparameters selected by the grid search
# instead of hard-coding them, so this cell stays in sync with tree_grid.
second_tree = DecisionTreeClassifier(random_state=10, **tree_grid.best_params_)
second_tree.fit(X_train, y_train)
# Mean accuracy on the hold-out split.
second_tree.score(X_test, y_test)

0.8516666666666667

In [None]:
from sklearn.tree import export_graphviz
# Write the fitted tree to tree.dot in Graphviz DOT format. feature_names is
# passed as a plain list because some scikit-learn releases reject a pandas
# Index for this parameter.
export_graphviz(second_tree, out_file="tree.dot", feature_names=list(X.columns), filled=True)

🛠 Построить дерево решений для датасета пассажиров «Титаника», подобрать гиперпараметры, дающие наилучший accuracy_score, 
и отобразить матрицу ошибок.

In [None]:
# Exercise starter: load the Titanic dataset and take 'survived' as the target.
# Note that this rebinds `df`, replacing the bank data used above.
df = pd.read_csv('../data/titanic.csv')
y=df['survived']
df.head()
# Your code here