## 이 페이지는 다중 클래스(multi-class) 문자 라벨 데이터를 원-핫(one-hot) 인코딩으로 처리하는 방법을 정리한다.

In [10]:
import glob
import os

import keras.utils.np_utils as kutils
import numpy as np
import pandas as pd
from skimage.io import imread, imsave
from sklearn.preprocessing import LabelEncoder

In [3]:
# Load the label table (one row per sample: ID and its Class character)
# and preview the first five rows.
trainLabels = pd.read_csv('trainLabels.csv')
trainLabels.head(5)

Unnamed: 0,ID,Class
0,1,n
1,2,8
2,3,T
3,4,I
4,5,R


In [12]:
# Load every image under myTrainResized/ together with its class label.
# Results: trainX is an array of stacked images, trainY an array of the
# matching string labels (same order as trainFiles).
trainX = []
trainY_array = []
trainFiles = glob.glob("myTrainResized/*")
# Build the ID -> Class lookup once; filtering the whole DataFrame with a
# boolean mask for every file would cost O(files * rows).
labelById = dict(zip(trainLabels['ID'], trainLabels['Class']))
# glob returns the files in arbitrary order; that is harmless here because
# each label is looked up from the ID encoded in the file's own name.
for nameFile in trainFiles:
    # The file name up to the first '.' is the numeric sample ID.
    # os.path.basename strips the directory using the platform's path rules
    # instead of assuming '/' as the separator.
    id_col = int(os.path.basename(nameFile).split('.')[0])
    # Raises KeyError if the image has no row in trainLabels.
    trainY_array.append(labelById[id_col])
    image = imread(nameFile)
    trainX.append(image)
trainX = np.array(trainX)
print("trainX.shape", trainX.shape)
trainY = np.array(trainY_array)
print("trainY.shape", trainY.shape)

trainX.shape (6283, 20, 20, 3)
trainY.shape (6283,)


### 여기서 trainY 에 담긴 문자 라벨(multi-class)을 원-핫(one-hot) 형태의 범주형 데이터로 변환해야 한다.

In [14]:
# Inspect the raw string labels before encoding them.
trainY

array(['n', 'G', 'Z', ..., 'e', 'N', 'e'], 
      dtype='<U1')

### 방법1. pandas 의 get_dummies() 를 이용한다.

In [23]:
# One-hot encode the string labels with pandas: the result is a DataFrame
# with one column per distinct label and one row per sample.
dummy_from_pd = pd.get_dummies(trainY)
print(type(dummy_from_pd))
dummy_from_pd.head(5)

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,q,r,s,t,u,v,w,x,y,z
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##### 가능은 하지만, 나중에 cnn.predict_classes() 를 호출하면 정수 값이 반환되는데, 그 정수가 어떤 라벨에 대응하는지 확인하기가 힘들었다.

### 방법2. sklearn 의 LabelEncoder() 를 이용한다.
http://machinelearningmastery.com/multi-class-classification-tutorial-keras-deep-learning-library/

In [15]:
# Create the (not yet fitted) encoder that will map string labels to
# integer class indices and back.
encoder = LabelEncoder()
encoder

LabelEncoder()

In [17]:
# Learn the set of distinct labels. A label's position in classes_ is the
# integer code it will be encoded as.
encoder.fit(trainY)
encoder.classes_

array(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C',
       'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
       'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c',
       'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
       'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'], 
      dtype='<U1')

In [24]:
# Replace each string label with its integer index in encoder.classes_.
encoded_Y = encoder.transform(trainY)
encoded_Y

array([49, 16, 35, ..., 40, 23, 40])

In [25]:
# Expand the integer class indices into one-hot rows (one row per sample).
dummy_from_keras = kutils.to_categorical(encoded_Y)
dummy_from_keras

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

##### 여기있는 dummy_from_keras 를 이용하면 된다.
### dummy_from_pd, dummy_from_keras 를 비교하면 다음과 같다.

In [28]:
# Element-wise comparison of the first sample's one-hot row from both methods.
dummy_from_pd.values[0] == dummy_from_keras[0]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True], dtype=bool)

In [29]:
# Same element-wise comparison for the second sample.
dummy_from_pd.values[1] == dummy_from_keras[1]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True], dtype=bool)

#### 즉 두 방법 모두 클래스를 정렬된 순서(0-9, A-Z, a-z)로 열에 대응시키므로, 카테고리화된 결과가 서로 같게 만들어진다. (해시값을 사용하는 것은 아니다.)
### 만약 encoder 를 이용해서 만들었을 경우 다음과 같이 복호화 할 수 있다
예측의 결과가 [23, 12, 29, 14, 53, 48, 49, 49, 10, 53] 로 나왔다고 가정하겠다.

In [30]:
# Decode predicted integer class indices back into the original string labels.
encoder.inverse_transform([23, 12, 29, 14, 53, 48, 49, 49, 10, 53])

array(['N', 'C', 'T', 'E', 'r', 'm', 'n', 'n', 'A', 'r'], 
      dtype='<U1')

#### 바로 위의 값으로 예측했음을 알아 낼 수 있다.