# 사전학습 임베딩 사용하기

In [6]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Toy sentiment corpus: one short text per entry.
sentences = [
    'nice great best amazing',
    'stop lies',
    'pitiful nerd',
    'excellent work',
    'supreme quality',
    'bad',
    'highly respectable',
]
# One binary label per sentence, aligned by position with `sentences`.
y_train = [
    1,
    0,
    0,
    1,
    1,
    0,
    1,
]

In [7]:
# Build the word -> integer index mapping from the toy corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
# Word indices start at 1, so +1 leaves room for index 0 (the value used for padding later).
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)
# Display the learned word -> index mapping.
tokenizer.word_index

16


{'nice': 1,
 'great': 2,
 'best': 3,
 'amazing': 4,
 'stop': 5,
 'lies': 6,
 'pitiful': 7,
 'nerd': 8,
 'excellent': 9,
 'work': 10,
 'supreme': 11,
 'quality': 12,
 'bad': 13,
 'highly': 14,
 'respectable': 15}

In [8]:
# Replace every word of every sentence with its integer index from tokenizer.word_index.
x_encoded = tokenizer.texts_to_sequences(sentences)
print(x_encoded)

[[1, 2, 3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13], [14, 15]]


In [9]:
# Pad every sequence with trailing zeros ('post') to a fixed length of 4,
# which is the length of the longest encoded sentence.
x_train = pad_sequences(x_encoded, maxlen=4, padding='post')
# Convert the label list into a NumPy array.
y_train = np.array(y_train)
print(x_train)

[[ 1  2  3  4]
 [ 5  6  0  0]
 [ 7  8  0  0]
 [ 9 10  0  0]
 [11 12  0  0]
 [13  0  0  0]
 [14 15  0  0]]


In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

# Size of each word vector; no pre-trained weights are passed to this embedding layer.
embedding_dim = 4

# Embedding -> Flatten -> single sigmoid unit for binary classification.
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=4),
    Flatten(),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)
model.fit(x_train, y_train, epochs=10, verbose=2)

Epoch 1/10
1/1 - 0s - loss: 0.6919 - acc: 0.5714 - 328ms/epoch - 328ms/step
Epoch 2/10
1/1 - 0s - loss: 0.6904 - acc: 0.5714 - 5ms/epoch - 5ms/step
Epoch 3/10
1/1 - 0s - loss: 0.6889 - acc: 0.5714 - 5ms/epoch - 5ms/step
Epoch 4/10
1/1 - 0s - loss: 0.6875 - acc: 0.5714 - 9ms/epoch - 9ms/step
Epoch 5/10
1/1 - 0s - loss: 0.6860 - acc: 0.5714 - 5ms/epoch - 5ms/step
Epoch 6/10
1/1 - 0s - loss: 0.6846 - acc: 0.5714 - 12ms/epoch - 12ms/step
Epoch 7/10
1/1 - 0s - loss: 0.6831 - acc: 0.7143 - 12ms/epoch - 12ms/step
Epoch 8/10
1/1 - 0s - loss: 0.6816 - acc: 0.7143 - 12ms/epoch - 12ms/step
Epoch 9/10
1/1 - 0s - loss: 0.6802 - acc: 0.8571 - 12ms/epoch - 12ms/step
Epoch 10/10
1/1 - 0s - loss: 0.6787 - acc: 0.8571 - 13ms/epoch - 13ms/step


<keras.callbacks.History at 0x2636b730e50>

In [6]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 4)              64        
                                                                 
 flatten (Flatten)           (None, 16)                0         
                                                                 
 dense (Dense)               (None, 1)                 17        
                                                                 
Total params: 81
Trainable params: 81
Non-trainable params: 0
_________________________________________________________________


## 사전 학습 임베딩
- GloVe 다운로드 : http://nlp.stanford.edu/data/glove.6B.zip
- Word2Vec 다운로드 : https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM  

1) GloVe 사용

In [9]:
from urllib.request import urlretrieve, urlopen
import gzip
import zipfile

# Download the GloVe 6B archive into the current working directory.
urlretrieve('http://nlp.stanford.edu/data/glove.6B.zip', filename='glove.6B.zip')
# Extract all members next to the archive; the context manager closes the
# archive even if extraction raises.
with zipfile.ZipFile('glove.6B.zip') as zf:
    zf.extractall()

In [10]:
# Map every GloVe token to its vector, parsed from the 100-dimensional text file.
embedding_dict = dict()

# `with` guarantees the file is closed even if a line fails to parse.
with open('glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        # Each line is whitespace-separated: the token first, then its vector components.
        word_vector = line.split()
        if not word_vector:
            # Skip blank lines instead of failing on word_vector[0].
            continue
        word = word_vector[0]

        word_vector_arr = np.asarray(word_vector[1:], dtype='float32')
        embedding_dict[word] = word_vector_arr

print(len(embedding_dict))

400000


In [11]:
# Allocate an all-zero matrix with one 100-dimensional row per vocabulary index
# (row 0 corresponds to the padding index).
embedding_matrix = np.zeros((vocab_size, 100))
# Display the matrix shape.
np.shape(embedding_matrix)

(16, 100)

In [12]:
# Display the (word, index) pairs of the fitted tokenizer.
tokenizer.word_index.items()

dict_items([('nice', 1), ('great', 2), ('best', 3), ('amazing', 4), ('stop', 5), ('lies', 6), ('pitiful', 7), ('nerd', 8), ('excellent', 9), ('work', 10), ('supreme', 11), ('quality', 12), ('bad', 13), ('highly', 14), ('respectable', 15)])

In [13]:
# Display the GloVe vector stored for the word 'great'.
embedding_dict['great']

array([-0.013786 ,  0.38216  ,  0.53236  ,  0.15261  , -0.29694  ,
       -0.20558  , -0.41846  , -0.58437  , -0.77355  , -0.87866  ,
       -0.37858  , -0.18516  , -0.128    , -0.20584  , -0.22925  ,
       -0.42599  ,  0.3725   ,  0.26077  , -1.0702   ,  0.62916  ,
       -0.091469 ,  0.70348  , -0.4973   , -0.77691  ,  0.66045  ,
        0.09465  , -0.44893  ,  0.018917 ,  0.33146  , -0.35022  ,
       -0.35789  ,  0.030313 ,  0.22253  , -0.23236  , -0.19719  ,
       -0.0053125, -0.25848  ,  0.58081  , -0.10705  , -0.17845  ,
       -0.16206  ,  0.087086 ,  0.63029  , -0.76649  ,  0.51619  ,
        0.14073  ,  1.019    , -0.43136  ,  0.46138  , -0.43585  ,
       -0.47568  ,  0.19226  ,  0.36065  ,  0.78987  ,  0.088945 ,
       -2.7814   , -0.15366  ,  0.01015  ,  1.1798   ,  0.15168  ,
       -0.050112 ,  1.2626   , -0.77527  ,  0.36031  ,  0.95761  ,
       -0.11385  ,  0.28035  , -0.02591  ,  0.31246  , -0.15424  ,
        0.3778   , -0.13599  ,  0.2946   , -0.31579  ,  0.4294

In [14]:
# For every word in the vocabulary, copy its pre-trained GloVe vector into
# the row of the embedding matrix given by the word's index.
for word, index in tokenizer.word_index.items():
    vector_value = embedding_dict.get(word)
    if vector_value is None:
        # Word not present in GloVe: its row keeps the zeros it was created with.
        continue
    embedding_matrix[index] = vector_value

In [15]:
# Row 2 belongs to 'great' (index 2 in tokenizer.word_index); display it to
# compare against embedding_dict['great'].
embedding_matrix[2]

array([-0.013786  ,  0.38216001,  0.53236002,  0.15261   , -0.29694   ,
       -0.20558   , -0.41846001, -0.58437002, -0.77354997, -0.87866002,
       -0.37858   , -0.18516   , -0.12800001, -0.20584001, -0.22925   ,
       -0.42598999,  0.3725    ,  0.26076999, -1.07019997,  0.62915999,
       -0.091469  ,  0.70348001, -0.4973    , -0.77691001,  0.66044998,
        0.09465   , -0.44893   ,  0.018917  ,  0.33146   , -0.35021999,
       -0.35789001,  0.030313  ,  0.22253001, -0.23236001, -0.19719   ,
       -0.0053125 , -0.25848001,  0.58081001, -0.10705   , -0.17845   ,
       -0.16205999,  0.087086  ,  0.63028997, -0.76648998,  0.51618999,
        0.14072999,  1.01900005, -0.43136001,  0.46138   , -0.43584999,
       -0.47567999,  0.19226   ,  0.36065   ,  0.78987002,  0.088945  ,
       -2.78139997, -0.15366   ,  0.01015   ,  1.17980003,  0.15167999,
       -0.050112  ,  1.26259995, -0.77526999,  0.36030999,  0.95761001,
       -0.11385   ,  0.28035   , -0.02591   ,  0.31246001, -0.15

In [19]:
# 사전 학습된 임베딩 사용하여 학습하기
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

# The pre-trained vectors used here are 100-dimensional, so the embedding
# output size is set to match.
output_dim=100

# Initialise the embedding layer from the pre-trained matrix.
# trainable=False uses the pre-trained embeddings as they are, without
# training them any further.
e = Embedding(vocab_size, output_dim, weights=[embedding_matrix], input_length=4, trainable=False)
model = Sequential([
    e,
    Flatten(),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)
model.fit(x_train, y_train, epochs=10, verbose=2)

Epoch 1/10
1/1 - 0s - loss: 0.7040 - acc: 0.5714 - 274ms/epoch - 274ms/step
Epoch 2/10
1/1 - 0s - loss: 0.6829 - acc: 0.5714 - 4ms/epoch - 4ms/step
Epoch 3/10
1/1 - 0s - loss: 0.6625 - acc: 0.5714 - 9ms/epoch - 9ms/step
Epoch 4/10
1/1 - 0s - loss: 0.6429 - acc: 0.7143 - 4ms/epoch - 4ms/step
Epoch 5/10
1/1 - 0s - loss: 0.6241 - acc: 0.7143 - 4ms/epoch - 4ms/step
Epoch 6/10
1/1 - 0s - loss: 0.6060 - acc: 0.7143 - 8ms/epoch - 8ms/step
Epoch 7/10
1/1 - 0s - loss: 0.5886 - acc: 0.7143 - 4ms/epoch - 4ms/step
Epoch 8/10
1/1 - 0s - loss: 0.5719 - acc: 0.7143 - 12ms/epoch - 12ms/step
Epoch 9/10
1/1 - 0s - loss: 0.5560 - acc: 0.7143 - 4ms/epoch - 4ms/step
Epoch 10/10
1/1 - 0s - loss: 0.5406 - acc: 0.7143 - 4ms/epoch - 4ms/step


<keras.callbacks.History at 0x2665cd2cdf0>

2) 사전 훈련 Word2Vec 사용

In [2]:
import gensim
from urllib.request import urlretrieve, urlopen
import gzip
import zipfile


# NOTE(review): machine-specific absolute path; point this at the local copy
# of the GoogleNews word2vec binary before running.
data_path = 'C:/Users/Myeong/dding/data/GoogleNews-vectors-negative300.bin'
# Load the vectors from the binary word2vec format.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(data_path, binary=True)

print('모델의 크기(shape) :',word2vec_model.vectors.shape) # check the size (shape) of the loaded model

모델의 크기(shape) : (3000000, 300)


In [4]:
import numpy as np

# Derive the vocabulary size from the fitted tokenizer instead of hard-coding
# it, so the matrix stays in sync if the corpus changes. Word indices start
# at 1, hence the +1 for index 0 (padding).
vocab_size = len(tokenizer.word_index) + 1
# One 300-dimensional row per vocabulary index, matching the word2vec vector size.
embedding_matrix = np.zeros((vocab_size, 300))
print(np.shape(embedding_matrix))

(16, 300)


In [10]:
def get_vector(word, model=None):
    """Return the embedding stored for *word*, or ``None`` if it is unknown.

    Args:
        word: Token to look up.
        model: Mapping-like object supporting ``in`` and ``[]`` lookups.
            Defaults to the module-level ``word2vec_model``.

    Returns:
        ``model[word]`` when ``word`` is contained in ``model``,
        otherwise ``None``.
    """
    if model is None:
        # Fall back to the globally loaded model so existing one-argument
        # calls keep working unchanged.
        model = word2vec_model
    return model[word] if word in model else None

# Copy the pre-trained vector of every vocabulary word into the row of the
# embedding matrix given by the word's index.
for word, index in tokenizer.word_index.items():
    # Pre-trained embedding vector mapped to this word (None when unknown).
    vector_value = get_vector(word)
    if vector_value is None:
        # Unknown word: leave its row untouched.
        continue
    embedding_matrix[index] = vector_value

In [11]:
# Display the pre-trained word2vec vector for 'nice'.
word2vec_model['nice']

array([ 0.15820312,  0.10595703, -0.18945312,  0.38671875,  0.08349609,
       -0.26757812,  0.08349609,  0.11328125, -0.10400391,  0.17871094,
       -0.12353516, -0.22265625, -0.01806641, -0.25390625,  0.13183594,
        0.0859375 ,  0.16113281,  0.11083984, -0.11083984, -0.0859375 ,
        0.0267334 ,  0.34570312,  0.15136719, -0.00415039,  0.10498047,
        0.04907227, -0.06982422,  0.08642578,  0.03198242, -0.02844238,
       -0.15722656,  0.11865234,  0.36132812,  0.00173187,  0.05297852,
       -0.234375  ,  0.11767578,  0.08642578, -0.01123047,  0.25976562,
        0.28515625, -0.11669922,  0.38476562,  0.07275391,  0.01147461,
        0.03466797,  0.18164062, -0.03955078,  0.04199219,  0.01013184,
       -0.06054688,  0.09765625,  0.06689453,  0.14648438, -0.12011719,
        0.08447266, -0.06152344,  0.06347656,  0.3046875 , -0.35546875,
       -0.2890625 ,  0.19628906, -0.33203125, -0.07128906,  0.12792969,
        0.09619141, -0.12158203, -0.08691406, -0.12890625,  0.27

In [12]:
# Look up the integer index assigned to 'nice' by the tokenizer.
tokenizer.word_index['nice']

1

In [13]:
# Row 1 belongs to 'nice'; display it to compare against word2vec_model['nice'].
embedding_matrix[1]

array([ 0.15820312,  0.10595703, -0.18945312,  0.38671875,  0.08349609,
       -0.26757812,  0.08349609,  0.11328125, -0.10400391,  0.17871094,
       -0.12353516, -0.22265625, -0.01806641, -0.25390625,  0.13183594,
        0.0859375 ,  0.16113281,  0.11083984, -0.11083984, -0.0859375 ,
        0.0267334 ,  0.34570312,  0.15136719, -0.00415039,  0.10498047,
        0.04907227, -0.06982422,  0.08642578,  0.03198242, -0.02844238,
       -0.15722656,  0.11865234,  0.36132812,  0.00173187,  0.05297852,
       -0.234375  ,  0.11767578,  0.08642578, -0.01123047,  0.25976562,
        0.28515625, -0.11669922,  0.38476562,  0.07275391,  0.01147461,
        0.03466797,  0.18164062, -0.03955078,  0.04199219,  0.01013184,
       -0.06054688,  0.09765625,  0.06689453,  0.14648438, -0.12011719,
        0.08447266, -0.06152344,  0.06347656,  0.3046875 , -0.35546875,
       -0.2890625 ,  0.19628906, -0.33203125, -0.07128906,  0.12792969,
        0.09619141, -0.12158203, -0.08691406, -0.12890625,  0.27

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Input

# Fixed length of every padded input sequence.
max_len = 4

# Embedding layer initialised from the pre-trained 300-dimensional matrix;
# trainable=False keeps those weights fixed during fitting.
e = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=max_len, trainable=False)
model = Sequential([
    Input(shape=(max_len, ), dtype='int32'),
    e,
    Flatten(),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)
model.fit(x_train, y_train, epochs=100, verbose=2)

Epoch 1/100
1/1 - 2s - loss: 0.6784 - acc: 0.5714 - 2s/epoch - 2s/step
Epoch 2/100
1/1 - 0s - loss: 0.6604 - acc: 0.7143 - 4ms/epoch - 4ms/step
Epoch 3/100
1/1 - 0s - loss: 0.6429 - acc: 0.7143 - 13ms/epoch - 13ms/step
Epoch 4/100
1/1 - 0s - loss: 0.6259 - acc: 0.8571 - 4ms/epoch - 4ms/step
Epoch 5/100
1/1 - 0s - loss: 0.6095 - acc: 1.0000 - 5ms/epoch - 5ms/step
Epoch 6/100
1/1 - 0s - loss: 0.5936 - acc: 1.0000 - 4ms/epoch - 4ms/step
Epoch 7/100
1/1 - 0s - loss: 0.5782 - acc: 1.0000 - 4ms/epoch - 4ms/step
Epoch 8/100
1/1 - 0s - loss: 0.5634 - acc: 1.0000 - 4ms/epoch - 4ms/step
Epoch 9/100
1/1 - 0s - loss: 0.5490 - acc: 1.0000 - 12ms/epoch - 12ms/step
Epoch 10/100
1/1 - 0s - loss: 0.5351 - acc: 1.0000 - 9ms/epoch - 9ms/step
Epoch 11/100
1/1 - 0s - loss: 0.5217 - acc: 1.0000 - 4ms/epoch - 4ms/step
Epoch 12/100
1/1 - 0s - loss: 0.5087 - acc: 1.0000 - 13ms/epoch - 13ms/step
Epoch 13/100
1/1 - 0s - loss: 0.4962 - acc: 1.0000 - 14ms/epoch - 14ms/step
Epoch 14/100
1/1 - 0s - loss: 0.4840 - ac

<keras.callbacks.History at 0x1ae2301eac0>