# MNIST
Modified National Institute of Standards and Technology

28*28 픽셀의 0~9 사이 숫자 이미지와 그 레이블로 구성된 데이터셋이다.

In [None]:
import time
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Load the MNIST train/test CSV files into DataFrames.
train_csv = '/home/han/Desktop/Han_ws/00.Data/07.ML/data/mnist_train.csv'
test_csv = '/home/han/Desktop/Han_ws/00.Data/07.ML/data/mnist_test.csv'
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)
# Cell output: (rows, columns) of each frame.
(df_train.shape, df_test.shape)

In [None]:
# Split each frame into a feature matrix (every column after the first) and
# a label vector (the 'label' column), both converted to NumPy arrays.
train_features = df_train.iloc[:, 1:]
test_features = df_test.iloc[:, 1:]
x_train, y_train = np.array(train_features), np.array(df_train['label'])
x_test, y_test = np.array(test_features), np.array(df_test['label'])

# Cell output: shapes of the four arrays.
(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

In [None]:
# Draw 16 random training images and show each with its label as the title.
# The index range is taken from the array itself rather than a hard-coded
# 60000, and sampling is without replacement so no image appears twice.
n_train = x_train.shape[0]
samples = random.sample(range(n_train), k=min(16, n_train))
plt.figure(figsize=(14, 12))
for idx, n in enumerate(samples):
    plt.subplot(4, 4, idx + 1)
    # Each row is a flattened image; restore the 28x28 grid for display.
    plt.imshow(x_train[n].reshape(28, 28), cmap='Greys', interpolation='nearest')
    plt.title(y_train[n])

plt.show()

In [None]:
# Measure the wall-clock time needed to fit a 5-nearest-neighbours
# classifier on the raw training arrays.
start_time = time.time()
# fit() returns the estimator itself, so construction and fitting can chain.
clf = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
elapsed = time.time() - start_time
print('Fit time : ', elapsed)

In [None]:
# Time prediction on the test set with the fitted classifier, then print
# the resulting accuracy.
start_time = time.time()
pred = clf.predict(x_test)
# This interval covers predict(), not fit(), so the message says so.
print('Predict time : ', time.time() - start_time)
print(accuracy_score(y_test, pred))

In [None]:
# Reduce dimensionality with PCA before KNN, and grid-search both the number
# of principal components and the number of neighbours.
pipe = Pipeline(steps=[
    ('pca', PCA()),
    ('clf', KNeighborsClassifier()),
])
parameters = {
    'pca__n_components': [2, 5, 10],
    'clf__n_neighbors': [5, 10, 15],
}
# Shuffled, stratified 5-fold split with a fixed seed.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)
grid = GridSearchCV(estimator=pipe, param_grid=parameters, cv=kf, n_jobs=-1, verbose=1)
grid.fit(x_train, y_train)

In [None]:
# Summarise the grid search: its best score, then the selected value of every
# tuned hyper-parameter, listed in sorted key order.
best_parameters = grid.best_estimator_.get_params()
summary = ["Best score: %0.3f" % grid.best_score_, "Best parameters set : "]
summary.extend(
    "\t%s : %r " % (param_name, best_parameters[param_name])
    for param_name in sorted(parameters)
)
print("\n".join(summary))

In [None]:
# Test-set accuracy of the best pipeline found by the grid search (cell output).
best_test_pred = grid.best_estimator_.predict(x_test)
accuracy_score(y_test, best_test_pred)

In [None]:
def result(y_pred, y_test):
    """Print a classification report for the given predictions.

    Args:
        y_pred: Predicted labels.
        y_test: True labels, aligned with ``y_pred``.

    Note that the argument order here is (predicted, true); the two are
    swapped when forwarded, because the report is produced by calling
    ``classification_report(y_test, y_pred)``.
    """
    # Imported locally because the file-level imports do not include it.
    from sklearn.metrics import classification_report

    print(classification_report(y_test, y_pred))

# Print the report for predictions made on the training data.
# NOTE(review): this evaluates on x_train/y_train, not the test set — confirm
# that a training-set report is what is wanted here.
train_pred = grid.predict(x_train)
result(train_pred, y_train)

In [None]:
# Inspect one test image: display it, then print the model's prediction
# next to the true label.
n = 4893
digit = x_test[n]
plt.imshow(digit.reshape(28, 28), cmap='Greys', interpolation='nearest')
plt.show()

# predict() is given a single-row 2-D array, hence the reshape to (1, 784).
answer = grid.best_estimator_.predict(digit.reshape(1, 784))
print('Answer is : ', answer)
print('Real Label is : ', y_test[n])

In [None]:
# Predictions of the best pipeline for the whole test set.
best_model = grid.best_estimator_
preds = best_model.predict(x_test)


In [None]:
# Show up to 16 test images that the tuned model misclassified, each titled
# with the (wrong) predicted label.
wrong_mask = y_test != preds
wrong_results = x_test[wrong_mask]
# Reuse the predictions already computed instead of re-predicting each image.
wrong_preds = preds[wrong_mask]
# Sample without replacement, capped at the number of errors, so the cell
# neither repeats images nor raises when there are no mistakes at all.
n_wrong = wrong_results.shape[0]
samples = random.sample(range(n_wrong), k=min(16, n_wrong))
plt.figure(figsize=(14, 12))

for idx, n in enumerate(samples):
    plt.subplot(4, 4, idx + 1)
    plt.imshow(wrong_results[n].reshape(28, 28), cmap='Greys', interpolation='nearest')
    plt.title(wrong_preds[n])

plt.show()