# Classificação

In [19]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

## 1. Aquisição de dados

In [20]:
from sklearn.datasets import fetch_openml
# Fetch the MNIST dataset ('mnist_784', version 1) from OpenML.
mnist = fetch_openml('mnist_784', version=1, cache=True)
mnist.target = mnist.target.astype(np.int8) # convert the labels from strings to integers

In [21]:
mnist.data.shape

(70000, 784)

In [22]:
# Feature matrix and label vector, taken from the fetched dataset.
X, y = mnist.data, mnist.target

In [42]:
mnist.details

{'id': '554',
 'name': 'mnist_784',
 'version': '1',
 'description_version': '1',
 'format': 'ARFF',
 'creator': ['Yann LeCun', 'Corinna Cortes', 'Christopher J.C. Burges'],
 'upload_date': '2014-09-29T03:28:38',
 'language': 'English',
 'licence': 'Public',
 'url': 'https://www.openml.org/data/v1/download/52667/mnist_784.arff',
 'file_id': '52667',
 'default_target_attribute': 'class',
 'tag': ['AzurePilot',
  'OpenML-CC18',
  'OpenML100',
  'study_1',
  'study_123',
  'study_41',
  'study_99',
  'vision'],
 'visibility': 'public',
 'minio_url': 'http://openml1.win.tue.nl/dataset554/dataset_554.pq',
 'status': 'active',
 'processing_date': '2020-11-20 20:12:09',
 'md5_checksum': '0298d579eb1b86163de7723944c7e495'}

## 3. Pré-processamento

In [43]:
# The first 60,000 rows are used for training; everything after them is held out for testing.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

In [44]:
# Sanity-check the split sizes, one shape per line.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

(60000, 784)
(60000,)
(10000, 784)
(10000,)


## 4. Seleção e treinamento do modelo

In [45]:
from sklearn.neighbors import KNeighborsClassifier

In [46]:
# Baseline: k-NN with the library's default hyperparameters; only n_jobs is set explicitly.
# NOTE(review): per the scikit-learn glossary, n_jobs=-2 means "all CPUs but one" — confirm for the installed version.
kn_clf = KNeighborsClassifier(n_jobs=-2)
kn_clf.fit(X_train,y_train)

KNeighborsClassifier(n_jobs=-2)

In [47]:
predictions = kn_clf.predict(X_test)

In [48]:
# Accuracy = fraction of test samples whose predicted label equals the true label.
# np.mean over the boolean comparison is vectorized; the builtin sum() would
# iterate over every element in Python.
acc = np.mean(predictions == y_test)

In [49]:
print('KNN stardard params',acc)

KNN stardard params 0.9688


## 5. Ajustando o Modelo

In [50]:
from sklearn.model_selection import GridSearchCV

In [51]:
kn_clf = KNeighborsClassifier(n_jobs=-2)

In [52]:
# Hyperparameter grid: 4 neighbourhood sizes x 2 weighting schemes = 8 candidates.
param_grid = dict(
    n_neighbors=[3, 5, 11, 19],
    weights=['uniform', 'distance'],
)

In [53]:
# 3-fold cross-validated search over every combination in param_grid;
# verbose=1 prints the total number of fits when the search starts.
grid_search = GridSearchCV(
    estimator=kn_clf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-2,
    verbose=1,
)

In [54]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


GridSearchCV(cv=3, estimator=KNeighborsClassifier(n_jobs=-2), n_jobs=-2,
             param_grid={'n_neighbors': [3, 5, 11, 19],
                         'weights': ['uniform', 'distance']},
             verbose=1)

In [55]:
grid_search.best_estimator_

KNeighborsClassifier(n_jobs=-2, n_neighbors=3, weights='distance')

In [56]:
grid_search.best_params_

{'n_neighbors': 3, 'weights': 'distance'}

#### Melhor Parâmetro

In [57]:
# Fresh, unfitted estimator using the winning configuration reported by the
# grid search (n_neighbors=3, weights='distance').
kn_clf_best = KNeighborsClassifier(
    n_jobs=-2,
    n_neighbors=3,
    weights='distance',
)

In [58]:
kn_clf_best.fit(X_train,y_train)

KNeighborsClassifier(n_jobs=-2, n_neighbors=3, weights='distance')

In [59]:
predictions = kn_clf_best.predict(X_test)

In [60]:
# Accuracy of the tuned model: fraction of test samples predicted correctly.
# np.mean over the boolean comparison is vectorized; the builtin sum() would
# iterate over every element in Python.
acc = np.mean(predictions == y_test)

In [61]:
print('KNN Best Parameters',acc )

KNN Best Parameters 0.9717


In [84]:
#Ref: https://stackoverflow.com/questions/61175315/is-it-possible-to-use-vector-methods-to-shift-images-stored-in-a-numpy-ndarray-f
def OneToFourImagesX(X_train, image_shape=(28, 28)):
    """Augment flattened images with four one-pixel-shifted copies of each.

    Every row of ``X_train`` is interpreted as a flattened image of
    ``image_shape``. The result stacks, in this order: the original rows,
    then all images shifted one pixel down, up, left and right.

    The shift uses ``np.roll``, so the row/column pushed off one edge
    re-enters on the opposite edge (it is not replaced by zeros).

    Parameters
    ----------
    X_train : array-like of shape (n_samples, height * width)
        Flattened images, e.g. a NumPy array or a pandas DataFrame.
    image_shape : tuple of int, default (28, 28)
        ``(height, width)`` of each individual image.

    Returns
    -------
    numpy.ndarray of shape (5 * n_samples, height * width)
        Originals followed by the down, up, left and right shifted sets.
    """
    # Convert to an ndarray exactly once: converting a DataFrame copies the
    # whole set, so it must not happen inside a per-image loop.
    flat = np.asarray(X_train)
    n_samples = flat.shape[0]

    # View the data as a stack of 2-D images so one np.roll call shifts them all.
    images = flat.reshape((n_samples, *image_shape))

    # (step, axis) on the stacked array: axis 1 is image rows, axis 2 is image columns.
    shifts = (
        (1, 1),   # one pixel down
        (-1, 1),  # one pixel up
        (-1, 2),  # one pixel left
        (1, 2),   # one pixel right
    )
    # Reshaping back to flat.shape (rather than using -1) also works for empty input.
    shifted = [
        np.roll(images, step, axis=axis).reshape(flat.shape)
        for step, axis in shifts
    ]
    return np.concatenate([flat, *shifted])

def OneToFourImagesY(Y_train):
    """Repeat the labels five times, end to end.

    Produces one copy of ``Y_train`` for the original images plus one copy
    for each of the four shifted sets, concatenated in that order, so the
    result lines up with the output of ``OneToFourImagesX``.
    """
    # 1 original + 4 shifted variants
    replicas = [Y_train for _ in range(5)]
    return np.concatenate(replicas)

In [85]:
X_train_new = OneToFourImagesX(X_train)

In [101]:
X_train_new.shape

(300000, 784)

In [87]:
y_train_new = OneToFourImagesY(y_train)

In [102]:
y_train_new.shape

(300000,)

In [103]:
# Same hyperparameters as the best grid-search configuration; this separate
# instance is the one trained on the shift-augmented training set.
kn_clf_best_new = KNeighborsClassifier(
    n_jobs=-2,
    n_neighbors=3,
    weights='distance',
)

In [104]:
kn_clf_best_new.fit(X_train_new,y_train_new)

KNeighborsClassifier(n_jobs=-2, n_neighbors=3, weights='distance')

In [110]:
predictions = kn_clf_best_new.predict(X_test)



In [111]:
# Accuracy of the model trained on the augmented set: fraction of test samples
# predicted correctly. np.mean over the boolean comparison is vectorized; the
# builtin sum() would iterate over every element in Python.
acc = np.mean(predictions == y_test)

print('KNN best ',acc )

KNN best  0.9763


## 6. Avaliando o Modelo

In [None]:
y_test

In [108]:
from sklearn.metrics import accuracy_score, recall_score

In [109]:
from sklearn.metrics import confusion_matrix
# y_test is passed as the true labels, so rows are true digits and columns are
# predicted digits; the diagonal counts the correct predictions.
print("Matriz de Confusão")
confusion_matrix(y_test, predictions)

Matriz de Confusão


array([[ 974,    1,    1,    0,    0,    1,    2,    1,    0,    0],
       [   0, 1132,    2,    0,    1,    0,    0,    0,    0,    0],
       [   5,    2, 1006,    2,    2,    0,    2,   12,    1,    0],
       [   0,    2,    2,  983,    1,   10,    0,    6,    4,    2],
       [   1,    4,    0,    0,  951,    0,    4,    3,    0,   19],
       [   2,    2,    0,    9,    0,  867,    5,    1,    3,    3],
       [   5,    4,    0,    0,    2,    2,  945,    0,    0,    0],
       [   0,   18,    3,    0,    3,    0,    0,  997,    0,    7],
       [   6,    0,    2,    9,    4,    9,    3,    3,  933,    5],
       [   1,    4,    1,    5,    6,    5,    1,   10,    1,  975]],
      dtype=int64)

In [95]:
# scikit-learn metrics expect (y_true, y_pred). Accuracy happens to be symmetric,
# so the value is unchanged, but the order now matches the API and the other
# metric cells in this notebook.
print("Acurácia")
accuracy_score(y_test, predictions)

Acurácia


0.9763

In [97]:
# With average='weighted', each class's recall is weighted by its number of true
# samples, which algebraically reduces to overall accuracy — hence the same
# value as the accuracy cell.
print("Revocação")
recall_score(y_test, predictions, average='weighted')

Revocação


0.9763

In [98]:
from sklearn.metrics import f1_score
# Weighted F1: the per-class F1 scores averaged using each class's number of
# true samples as the weight.
print("F1-Score")
f1_score(y_test,predictions, average='weighted')

F1-Score


0.976283120244064