## Классификация картинок

In [38]:
import pandas as pd
from sklearn.datasets import load_digits
import numpy as np

In [39]:
digits = load_digits()

In [40]:
digits.keys()

dict_keys(['data', 'target', 'target_names', 'images', 'DESCR'])

In [41]:
# признаки

X = pd.DataFrame( digits['data'] )
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0


In [42]:
# целевая переменная - цифра от 0 до 9

Y = pd.Series( digits['target'] )
Y.head()

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [43]:
from sklearn.model_selection import train_test_split

In [44]:
X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size = 0.3, random_state = 0 )

In [45]:
from sklearn.tree import DecisionTreeClassifier

In [46]:
# зададим пока одно значение параметра модели max_depth
depth = 5

In [47]:
from sklearn.model_selection import cross_val_score

In [48]:
score = cross_val_score( DecisionTreeClassifier( max_depth = depth ), X, Y, cv = 5, scoring = 'accuracy' )

In [49]:
np.mean(score)

0.6247101857035251

In [50]:
results = []
for depth in range (1, 26):
    for cv in range (2, 10):
        score = cross_val_score( DecisionTreeClassifier( max_depth = depth ), X, Y, cv = cv, scoring = 'accuracy' )
        results.append( { 'Depth': depth, 'cv': cv, 'mean_acc': np.mean(score) } )
        print("Depth = {}, cv = {}, mean accuracy = {}".format(depth, cv, np.mean(score)))

Depth = 1, cv = 2, mean accuracy = 0.19644157781036942
Depth = 1, cv = 3, mean accuracy = 0.19810013140861105
Depth = 1, cv = 4, mean accuracy = 0.19810014817285126
Depth = 1, cv = 5, mean accuracy = 0.19753067821046075
Depth = 1, cv = 6, mean accuracy = 0.19753278357506868
Depth = 1, cv = 7, mean accuracy = 0.1975432540492827
Depth = 1, cv = 8, mean accuracy = 0.19753790808370494
Depth = 1, cv = 9, mean accuracy = 0.19752864519869492
Depth = 2, cv = 2, mean accuracy = 0.33718487394957986
Depth = 2, cv = 3, mean accuracy = 0.31168036582235575
Depth = 2, cv = 4, mean accuracy = 0.3093948442068247
Depth = 2, cv = 5, mean accuracy = 0.3116298339479968
Depth = 2, cv = 6, mean accuracy = 0.3133790902536171
Depth = 2, cv = 7, mean accuracy = 0.31282602921895947
Depth = 2, cv = 8, mean accuracy = 0.31620759998186565
Depth = 2, cv = 9, mean accuracy = 0.31618727171450706
Depth = 3, cv = 2, mean accuracy = 0.4100831665213255
Depth = 3, cv = 3, mean accuracy = 0.4491141318535344
Depth = 3, cv = 

Depth = 20, cv = 4, mean accuracy = 0.781989331006397
Depth = 20, cv = 5, mean accuracy = 0.7809947860305695
Depth = 20, cv = 6, mean accuracy = 0.8047517766569227
Depth = 20, cv = 7, mean accuracy = 0.7997781382525387
Depth = 20, cv = 8, mean accuracy = 0.8104430218934002
Depth = 20, cv = 9, mean accuracy = 0.8164081298124232
Depth = 21, cv = 2, mean accuracy = 0.7412473244014587
Depth = 21, cv = 3, mean accuracy = 0.7757610236379285
Depth = 21, cv = 4, mean accuracy = 0.7936685961854602
Depth = 21, cv = 5, mean accuracy = 0.7809000486111599
Depth = 21, cv = 6, mean accuracy = 0.7881167922283083
Depth = 21, cv = 7, mean accuracy = 0.8024355604108306
Depth = 21, cv = 8, mean accuracy = 0.8091602247668613
Depth = 21, cv = 9, mean accuracy = 0.8256799046099368
Depth = 22, cv = 2, mean accuracy = 0.7590734996828921
Depth = 22, cv = 3, mean accuracy = 0.7762980944665704
Depth = 22, cv = 4, mean accuracy = 0.7791169683700853
Depth = 22, cv = 5, mean accuracy = 0.7798240071017349
Depth = 22,

In [51]:
means = []
for dict in results:
    means.append(dict['mean_acc'])

In [52]:
results[means.index(max(means))]

{'Depth': 14, 'cv': 9, 'mean_acc': 0.8296797224278486}

### Спойлер: можно то же самое с помощью grid search

In [53]:
from sklearn.model_selection import GridSearchCV

In [54]:
params = [ {'max_depth': list( range(1, 20) )} ]

In [60]:
gs = GridSearchCV( DecisionTreeClassifier(), param_grid = params, cv = 5, scoring = 'accuracy', return_train_score = False )

In [61]:
gs.fit( X, Y )

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='accuracy', verbose=0)

In [62]:
gs.cv_results_

{'mean_fit_time': array([0.00327697, 0.00485888, 0.00580683, 0.00654693, 0.00733132,
        0.00951605, 0.00980873, 0.01085711, 0.01134243, 0.01237006,
        0.01245775, 0.01330919, 0.01402664, 0.01391959, 0.01367059,
        0.01292248, 0.01324306, 0.01325555, 0.01366086]),
 'mean_score_time': array([0.0004631 , 0.00043397, 0.00041261, 0.00029564, 0.00026727,
        0.0003736 , 0.00029478, 0.00033188, 0.00029182, 0.00034375,
        0.00031109, 0.00039344, 0.00049138, 0.00046134, 0.00044756,
        0.0003067 , 0.00032043, 0.00041409, 0.00041914]),
 'mean_test_score': array([0.19755147, 0.3116305 , 0.42737896, 0.54479688, 0.62493044,
        0.71396772, 0.74624374, 0.7801892 , 0.7801892 , 0.77685031,
        0.77796327, 0.78074569, 0.78853645, 0.77963272, 0.78130217,
        0.78185865, 0.786867  , 0.77685031, 0.7801892 ]),
 'param_max_depth': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                    17, 18, 19],
              mask=[False, False

In [63]:
gs.best_params_

{'max_depth': 13}