<a href="https://colab.research.google.com/github/Redwoods/Py/blob/master/pdm2020/my-note/py-tensorflow/tf2_2_mnist_ML_sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Keras example: MNIST analysis by simple NN
- NN: Neural Network
- ANN: Artificial Neural Network

In [None]:
# Numerical, deep-learning, data-frame and plotting libraries.
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import matplotlib.pyplot as plt
# %matplotlib inline

# Show which TensorFlow version this run is using.
print(tf.__version__)

In [None]:
# Load the MNIST train/test images and labels.
# NOTE(review): this imports from the standalone `keras` package, while the
# first cell binds `keras` from `tensorflow` — confirm both resolve to the
# same installation in the target environment.
from keras.datasets import mnist
(X_train0, y_train0), (X_test0, y_test0) = mnist.load_data()

In [None]:
# Report shape and element type of every loaded array, in the order:
# train images, train labels, test images, test labels.
for loaded in (X_train0, y_train0, X_test0, y_test0):
    print(loaded.shape, loaded.dtype)

In [None]:
# Pick one training sample at random and show it, titled with its label.
class_names = [str(digit) for digit in range(10)]

n_train = len(X_train0)
idx = np.random.randint(0, n_train)
print(idx)
image, label = X_train0[idx], y_train0[idx]
plt.imshow(image)
plt.title(class_names[label])
plt.xticks([])
plt.yticks([])
plt.show()

### Show images of numbers

In [None]:
# Draw the first training example of each digit class in a 2 x 5 grid and
# print the shape of every per-class image subset along the way.
fig = plt.figure(figsize=(10, 6))
for digit in range(10):
    samples = X_train0[y_train0 == digit]
    print(samples.shape)
    plt.subplot(2, 5, digit + 1)
    plt.imshow(samples[0])
    plt.title("Class %d" % (digit))
    plt.xticks([])
    plt.yticks([])


### 데이터를 float 타입으로 바꾸고 스케일링한다.
- 2차원 영상을 1차원 벡터로 변환
- (28,28) => (784,)
- Scaling: (0 ~ 255)/255.0

In [None]:
# Flatten every 2-D image into a 1-D feature vector, convert to float32 and
# divide by 255.0 to scale the pixel values.
# The row count is taken from the array itself and -1 lets NumPy infer the
# flattened width, so neither the dataset size nor the image size is hard-coded.
X_train = X_train0.reshape(len(X_train0), -1).astype('float32') / 255.0
X_test = X_test0.reshape(len(X_test0), -1).astype('float32') / 255.0
print(X_train.shape, X_train.dtype)
print(X_test.shape, X_test.dtype)

In [None]:
# Compare the original image-array shape with the flattened feature-matrix shape.
print(X_train0.shape, X_train.shape)

### y 데이터는 One-Hot-Encoding 을 한다.

In [None]:
# Peek at the first five raw training labels before one-hot encoding.
y_train0[:5]

In [None]:
# One-hot encode both label vectors with to_categorical()
# (also reachable as tf.keras.utils.to_categorical), then preview five rows.
from keras.utils import to_categorical

num_classes = 10
y_train, y_test = (
    to_categorical(labels, num_classes) for labels in (y_train0, y_test0)
)
y_train[:5]

> One-Hot-Encoding을 하는 이유
- 머신러닝의 목표가 특정 숫자 그림의 숫자를 정확하게 맞추는 것이 아니다.
    1. 일단, 0에서 9까지의 숫자로 판단될 10개의 확률을 계산한다.
    2. 그리고 특정 숫자로 판단될 확률이 제일 큰 수로 결정한다.
- 3개 이상의 다중클래스를 구분하는 지도학습의 경우 One-Hot-Encoding은 필요한 과정이다.

***

## ML of mnist
- ML classifier 선택
    * SGDClassifier (OneVsOneClassifier로 감싸서 다중 클래스 분류에 사용)
- fit 메서드로 트레이닝 (ML)

### SGD classifier

In [None]:
# scikit-learn: build an SGD classifier, wrap it in a one-vs-one multiclass
# scheme and train it on the flattened images with the integer labels.
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsOneClassifier

sgd = SGDClassifier(max_iter=100, random_state=42)
model = OneVsOneClassifier(sgd)
model.fit(X_train, y_train0)

In [None]:
# Display the first test image (original, unscaled array) as a small
# grayscale thumbnail without grid or tick marks.
first_test_image = X_test0[0]
plt.figure(figsize=(2, 2))
plt.imshow(first_test_image, cmap='gray')
plt.grid(False)
plt.xticks([])
plt.yticks([])
plt.show()

In [None]:
# Shape of the flattened test feature matrix.
X_test.shape

In [None]:
# X_test[0, :].shape,X_test[:1, :].shape

In [None]:
# Predict the first number in the test set using the ML model.
# scikit-learn estimators expect a 2-D (n_samples, n_features) array, so the
# sample is sliced with [:1, :] to keep the leading sample axis.
# X_test[0, :] would be 1-D and makes predict() raise
# "ValueError: Expected 2D array, got 1D array instead".
model.predict(X_test[:1, :])

In [None]:
# True label of the first test sample, for comparison with the prediction.
y_test0[0]

### 테스트 데이터에 대한 예측 정확도 계산 

In [None]:
# Predict a label for every test sample and check the shape of the result.
y_pred = model.predict(X_test)
y_pred.shape

In [None]:
# Count matches and mismatches between the predictions and the true labels.
# These are overall correct / incorrect counts, not per-class
# true-positive / false-positive counts.
t_count = np.sum(y_pred == y_test0)  # correctly classified samples
f_count = np.sum(y_pred != y_test0)  # misclassified samples
# Sanity check: both counts must add up to the number of test samples.
f_count == len(y_test0) - t_count

In [None]:
# Display the (correct, incorrect) prediction counts.
t_count,f_count

In [None]:
# Accuracy in percent. The denominator is the length of y_test0, the label
# vector the predictions were actually compared with, rather than the
# one-hot encoded y_test.
accuracy = t_count / len(y_test0) * 100
accuracy

### sklearn의 ML 모델 정확도 관련 함수
> from sklearn.metrics import accuracy_score, confusion_matrix
- 모델 평가 : accuracy_score(y_true, y_pred)
- 혼동 행렬(confusion matrix) : confusion_matrix(y_true, y_pred)

> https://ysyblog.tistory.com/72

In [None]:
# Same accuracy computed by scikit-learn; the result is a fraction, so
# multiply by 100 to compare with the percentage computed by hand.
from sklearn.metrics import accuracy_score, confusion_matrix
accuracy_score(y_test0, y_pred)  #*100

In [None]:
# Confusion matrix: rows are true labels, columns are predicted labels.
confusion_matrix(y_test0, y_pred)

### Accuracy of predicting test numbers is around 93% in ML using SGD classifier.

In [None]:
# Positions where the prediction matches the label; np.nonzero returns a
# tuple holding one index array per dimension.
np.nonzero(y_pred == y_test0)  # return tuple

In [None]:
# Positions where the prediction differs from the label (same tuple form).
np.nonzero(y_pred != y_test0)

In [None]:
# Split the test-set positions into correctly and incorrectly predicted ones,
# then report how many fall into each group.
is_correct = y_pred == y_test0
correct_indices = np.flatnonzero(is_correct)
incorrect_indices = np.flatnonzero(~is_correct)
print()
print(len(correct_indices)," classified correctly")
print(len(incorrect_indices)," classified incorrectly")

In [None]:
# Accuracy in percent derived from the index list. The denominator is the
# length of y_test0, the label vector the predictions were compared with,
# rather than the one-hot encoded y_test.
len(correct_indices) / len(y_test0) * 100

### [DIY: 설명]

In [None]:
# First nine positions of each group; these are the samples plotted below.
correct_indices[:9], incorrect_indices[:9]

In [None]:
# Plot the first 9 correct and the first 9 incorrect predictions in a
# 6 x 3 grid (18 subplots).
# The figure size is set on this figure only, instead of through
# plt.rcParams, so it does not silently change the size of figures drawn by
# other cells or by re-running earlier cells.
plt.figure(figsize=(7, 14))

# Each entry: (sample indices, subplot offset, title colour).
# Correct predictions fill subplots 1-9 (blue), incorrect ones 10-18 (red).
panels = [
    (correct_indices[:9], 0, 'blue'),
    (incorrect_indices[:9], 9, 'red'),
]
for indices, offset, color in panels:
    for i, sample in enumerate(indices):
        plt.subplot(6, 3, offset + i + 1)
        # X_test rows are flattened vectors; restore the 28 x 28 image to draw it.
        plt.imshow(X_test[sample].reshape(28, 28), cmap='gray', interpolation='none')
        # Same title format for both groups.
        plt.title(
            "Predicted: {}, Truth: {}".format(y_pred[sample], y_test0[sample]),
            color=color)
        plt.xticks([])
        plt.yticks([])

plt.suptitle('First 9 correct and incorrect predictions')
# Render the figure explicitly so the cell does not echo a Text(...) repr.
plt.show()

---