### KoNLPy를 사용한 한국어 원-핫-인코딩

In [1]:
from konlpy.tag import Okt

In [12]:
# Split the Korean sample sentence into tokens with Okt.morphs; the printed
# result is a flat list of token strings in sentence order.
okt = Okt()
text = "쿠키는 고양이고 잭은 주인입니다"
tokens = okt.morphs(text)
print(tokens)

['쿠키', '는', '고양이', '고', '잭', '은', '주인', '입니다']


In [13]:
# Map each distinct token to a contiguous 0-based index, in order of first
# appearance. Deduplicating first matters (dict.fromkeys preserves insertion
# order): enumerating the raw token list would leave gaps whenever a token
# repeats, so some indices would be >= len(w_index) and a one-hot vector sized
# by len(w_index) could not hold them.
w_index = {word: index for index, word in enumerate(dict.fromkeys(tokens))}
print(w_index)

{'쿠키': 0, '는': 1, '고양이': 2, '고': 3, '잭': 4, '은': 5, '주인': 6, '입니다': 7}


In [14]:
def one_hot_encoding(word, w_index):
    """Return the one-hot list for `word` using the vocabulary `w_index`.

    Args:
        word: The token to encode; it must be a key of `w_index`.
        w_index: Mapping from token to its integer position.

    Returns:
        A list of `len(w_index)` integers that is all zeros except for a 1
        at position `w_index[word]`.

    Raises:
        KeyError: If `word` is not present in `w_index`.
    """
    vocab_size = len(w_index)
    encoded = [0 for _ in range(vocab_size)]
    # Flip only the slot that belongs to this word.
    encoded[w_index[word]] = 1
    return encoded

In [15]:
# Example: encode a single token from the sentence; the cell output is an
# 8-element list with a 1 at that token's index in w_index.
one_hot_encoding("잭", w_index)

[0, 0, 0, 0, 1, 0, 0, 0]

### Keras를 이용한 원-핫-인코딩

In [20]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [24]:
# English sample sentence used for the Keras tokenizer cells that follow.
text = 'Cookie is cat. He looks just like his brother, Jack'

In [25]:
# Build the vocabulary from the sample sentence. The printed word_index shows
# lowercased words with punctuation removed, numbered starting from 1.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
print(tokenizer.word_index)

{'cookie': 1, 'is': 2, 'cat': 3, 'he': 4, 'looks': 5, 'just': 6, 'like': 7, 'his': 8, 'brother': 9, 'jack': 10}


In [28]:
# Re-declare the sample sentence (same string as before) and convert it into
# its sequence of integer word indices using the fitted tokenizer.
text = 'Cookie is cat. He looks just like his brother, Jack'
# texts_to_sequences takes a list of texts; [0] selects the single result.
encoded = tokenizer.texts_to_sequences([text])[0]
print(encoded)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [29]:
# One-hot encode the index sequence with an explicit vector width of
# vocabulary size + 1. Keras word indices start at 1, so column 0 is never set.
# Without num_classes, to_categorical infers the width from the largest index
# present in `encoded`, so a text missing the highest-index word would yield
# narrower vectors than the vocabulary requires.
one_hot = to_categorical(encoded, num_classes=len(tokenizer.word_index) + 1)
print(one_hot)

[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
