## Классификация картинок

In [20]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_digits
import plotly as plt

In [21]:
digits = load_digits()

In [22]:
digits.keys()

dict_keys(['data', 'target', 'target_names', 'images', 'DESCR'])

In [23]:
# признаки

X = pd.DataFrame( digits['data'] )
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0


In [24]:
# целевая переменная - цифра от 0 до 9

Y = pd.Series( digits['target'] )
Y.head()

0    0
1    1
2    2
3    3
4    4
dtype: int32

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size = 0.3, random_state = 0 )

In [27]:
from sklearn.tree import DecisionTreeClassifier

In [28]:
# зададим пока одно значение параметра модели max_depth
depth = 5

In [29]:
from sklearn.model_selection import cross_val_score

In [30]:
np.mean(cross_val_score( DecisionTreeClassifier( max_depth = depth ), X, Y, cv = 5, scoring = 'accuracy' ))

0.6197189306263637

In [31]:
max_value = [0, 0]
means = []
stds = []
for val in range(1,20):
    value = np.mean(cross_val_score( DecisionTreeClassifier( max_depth = val ), X, Y, cv = 5, scoring = 'accuracy' ))
    means.append(value)
    stds.append(np.std(cross_val_score( DecisionTreeClassifier( max_depth = val ), X, Y, cv = 5, scoring = 'accuracy' )))
    if value > max_value[1]:
        max_value[0] = val
        max_value[1] = value
    print(val, value)
print(max_value)   

1 0.19753067821046075
2 0.3116298339479968
3 0.4274291908290249
4 0.5422528839168287
5 0.6224567851304241
6 0.714782666374148
7 0.7463722715153781
8 0.7709720591467502
9 0.7821034573613679
10 0.7726017763124613
11 0.7837396664561405
12 0.7892182553825113
13 0.786484004118658
14 0.7826223832395275
15 0.7798017653347602
16 0.7826169938346105
17 0.7747790676651939
18 0.7776053032674657
19 0.7825224422437532
[12, 0.7892182553825113]


In [32]:
plt.err

AttributeError: module 'plotly' has no attribute 'err'

### Спойлер: можно то же самое с помощью grid search

In [33]:
from sklearn.model_selection import GridSearchCV

In [34]:
params = [ {'max_depth': list( range(1, 20) )} ]

In [35]:
gs = GridSearchCV( DecisionTreeClassifier(), param_grid = params, cv = 5, scoring = 'accuracy', return_train_score = False )

In [36]:
gs.fit( X, Y )

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='accuracy', verbose=0)

In [37]:
gs.cv_results_

{'mean_fit_time': array([0.00640044, 0.00620041, 0.00820041, 0.00940061, 0.0114007 ,
        0.01340075, 0.01540089, 0.01680102, 0.01760116, 0.01840096,
        0.01920109, 0.01920118, 0.01980109, 0.01980119, 0.01920104,
        0.02000113, 0.01980114, 0.01960115, 0.01980114]),
 'mean_score_time': array([0.00040002, 0.0006001 , 0.00040011, 0.0006    , 0.0006    ,
        0.00100007, 0.00100007, 0.00039997, 0.00079999, 0.00100017,
        0.00080009, 0.00079999, 0.        , 0.00100002, 0.00080013,
        0.0006001 , 0.00059996, 0.00079994, 0.00100002]),
 'mean_test_score': array([0.19755147, 0.3116305 , 0.42737896, 0.54368392, 0.62437396,
        0.71396772, 0.74457429, 0.77518086, 0.78742348, 0.78742348,
        0.77907624, 0.77685031, 0.78074569, 0.78519755, 0.77851976,
        0.78575403, 0.78074569, 0.77462437, 0.77573734]),
 'param_max_depth': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                    17, 18, 19],
              mask=[False, False

In [38]:
gs.best_params_

{'max_depth': 9}