## Классификация картинок

In [40]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_digits
import plotly as plt

In [2]:
# Load the digits dataset that ships with scikit-learn.
digits = load_digits()

In [3]:
# List the fields available in the loaded dataset object.
digits.keys()

dict_keys(['data', 'target', 'target_names', 'images', 'DESCR'])

In [4]:
# Features: the dataset's 'data' array wrapped in a DataFrame
# (the preview shows 64 numeric columns labelled 0-63).
X = pd.DataFrame(data=digits['data'])
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0


In [5]:
# Target variable: the digit (0 to 9) that each sample represents.
Y = pd.Series(data=digits['target'])
Y.head()

0    0
1    1
2    2
3    3
4    4
dtype: int32

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 30% of the samples as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

In [8]:
from sklearn.tree import DecisionTreeClassifier

In [9]:
# For now, fix a single value of the model's max_depth parameter.
depth = 5

In [10]:
from sklearn.model_selection import cross_val_score

In [21]:
# Mean 5-fold cross-validated accuracy of a tree limited to `depth` levels,
# evaluated on the full (X, Y).
cross_val_score(
    DecisionTreeClassifier(max_depth=depth),
    X,
    Y,
    cv=5,
    scoring='accuracy',
).mean()

0.6252210032483113

In [29]:
# Sweep max_depth from 1 to 19 and record, for each depth, the mean and the
# standard deviation of the 5-fold cross-validated accuracy on the full (X, Y).
# Each depth is cross-validated exactly once, so the mean and the std describe
# the same five fold scores (and the sweep costs half as many fits).
# The trees are not seeded, so exact numbers can differ slightly between runs.
max_value = [0, 0]  # [best depth so far, its mean accuracy]
means = []
stds = []
for val in range(1, 20):
    scores = cross_val_score(
        DecisionTreeClassifier(max_depth=val), X, Y, cv=5, scoring='accuracy'
    )
    value = np.mean(scores)
    means.append(value)
    stds.append(np.std(scores))
    # Strict '>' keeps the shallowest depth when several depths tie.
    if value > max_value[1]:
        max_value[0] = val
        max_value[1] = value
    print(val, value)
print(max_value)

1 0.19753067821046075
2 0.3116298339479968
3 0.4274291908290249
4 0.5422870557654165
5 0.6230310595059592
6 0.7108594172930136
7 0.7513882800856535
8 0.7792921547996239
9 0.7832504311582105
10 0.7725982019551403
11 0.7826020469654006
12 0.7798149629744382
13 0.7775667493663762
14 0.7826041631255597
15 0.7759327895526207
16 0.7792531576469756
17 0.7793401132275048
18 0.784217135915674
19 0.7870954658688939
[19, 0.7870954658688939]


In [20]:
# Plot the mean cross-validated accuracy for each max_depth with error bars of
# one standard deviation. Expects `means` and `stds` from the depth-sweep cell,
# one entry per depth starting at max_depth = 1.
# The chart is drawn through pandas' plotting backend (matplotlib must be
# installed), so it does not rely on the `plt` alias, which this notebook binds
# to plotly.
depths = range(1, len(means) + 1)
ax = pd.Series(means, index=depths).plot(yerr=stds, marker='o')
ax.set_xlabel('max_depth')
ax.set_ylabel('mean CV accuracy')
ax.set_title('Decision tree: 5-fold CV accuracy vs. max_depth');

### Спойлер: можно то же самое с помощью grid search

In [30]:
from sklearn.model_selection import GridSearchCV

In [31]:
# Search grid: every integer max_depth from 1 to 19.
params = [{'max_depth': [*range(1, 20)]}]

In [37]:
# Grid search over `params` with 5-fold cross-validation, scored by accuracy;
# training-fold scores are not recorded.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=params,
    cv=5,
    scoring='accuracy',
    return_train_score=False,
)

In [38]:
# Run the search on the full feature matrix and target.
gs.fit(X, y=Y)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='accuracy', verbose=0)

In [34]:
# Show the per-candidate cross-validation results collected by the grid search.
# NOTE(review): the saved output of this cell looks stale. Its execution count
# (34) is lower than the fit cell's (38), and its highest mean_test_score sits
# at max_depth 16 while the saved best_params_ reports 10. Re-run after fitting.
gs.cv_results_

{'mean_fit_time': array([0.00933385, 0.00966732, 0.01300073, 0.01600091, 0.01900117,
        0.02200127, 0.0246679 , 0.02700146, 0.02866824, 0.02933494,
        0.03033519, 0.03066834, 0.03166842, 0.03600216, 0.03033503,
        0.03000172, 0.03066842, 0.03166842, 0.03166842]),
 'mean_score_time': array([0.0013334 , 0.0013334 , 0.00100001, 0.00100009, 0.00100001,
        0.00100001, 0.00166694, 0.00133348, 0.00100009, 0.00100009,
        0.00099993, 0.00200017, 0.00166678, 0.00133332, 0.00133348,
        0.00100009, 0.00100001, 0.00133348, 0.00100009]),
 'mean_test_score': array([0.19810796, 0.3116305 , 0.4490818 , 0.54590985, 0.64218141,
        0.70673344, 0.75236505, 0.76015582, 0.77239844, 0.77629382,
        0.77796327, 0.77629382, 0.77462437, 0.77462437, 0.77239844,
        0.78241514, 0.77072899, 0.77128548, 0.77239844]),
 'param_max_depth': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                    17, 18, 19],
              mask=[False, False

In [39]:
# Parameter setting that achieved the best mean cross-validated score.
gs.best_params_

{'max_depth': 10}