<a href="https://colab.research.google.com/github/RogerHeederer/NLP_entry/blob/master/PretrainedWordEmbedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reference Source : wikidocs.net 유영준님 자료

스스로 학습하면서 필요한 부분에는 추가적 설명, 소스 코드 삽입 및 수정 등이 있습니다. 영리적 목적이 아닌, 자기 계발 목적으로 정리한 자료입니다.

# 케라스 임베딩 층 사용하기

In [21]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [22]:
# Toy sentiment corpus: one short sentence per entry.
sentences = [
    'nice great best amazing',
    'stop lies',
    'pitiful nerd',
    'excellent work',
    'supreme quality',
    'bad',
    'highly respectable',
]
# Binary label for each sentence above, in the same order.
y_train = [
    1,
    0,
    0,
    1,
    1,
    0,
    1,
]

In [23]:
# Build the word -> integer index mapping from the training sentences.
t = Tokenizer()
t.fit_on_texts(sentences)

# Word indices start at 1, so reserve one extra slot for index 0.
vocab_size = 1 + len(t.word_index)
print(vocab_size)

16


In [24]:
# Integer encoding: turn each sentence into a list of word indices.
X_encoded = t.texts_to_sequences(sentences)
X_encoded

[[1, 2, 3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13], [14, 15]]

In [25]:
# Length (in tokens) of the longest encoded sentence.
max_len = max(map(len, X_encoded))
print(max_len)

4


In [26]:
# The longest sentence has max_len tokens; pad every shorter sequence at the
# end ('post') so that all rows share that length.
y_train = np.array(y_train)
X_train = pad_sequences(
    X_encoded,
    padding='post',
    maxlen=max_len,
)
print(X_train)

[[ 1  2  3  4]
 [ 5  6  0  0]
 [ 7  8  0  0]
 [ 9 10  0  0]
 [11 12  0  0]
 [13  0  0  0]
 [14 15  0  0]]


## 모델 설계

In [27]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

# Embedding -> Flatten -> single sigmoid unit, declared as one layer list.
model = Sequential([
    # Each word index is mapped to a trainable 4-dimensional vector.
    Embedding(vocab_size, 4, input_length=max_len),
    # Collapse the per-token vectors into one flat vector for the Dense layer.
    Flatten(),
    # One sigmoid output neuron for binary classification.
    Dense(1, activation='sigmoid'),
])

In [28]:
# Binary cross-entropy matches the single sigmoid output; 'acc' reports accuracy.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)
# verbose=2 prints one summary line per epoch.
model.fit(
    X_train,
    y_train,
    epochs=100,
    verbose=2,
)

Epoch 1/100
1/1 - 0s - loss: 0.6987 - acc: 0.4286
Epoch 2/100
1/1 - 0s - loss: 0.6968 - acc: 0.4286
Epoch 3/100
1/1 - 0s - loss: 0.6949 - acc: 0.4286
Epoch 4/100
1/1 - 0s - loss: 0.6931 - acc: 0.4286
Epoch 5/100
1/1 - 0s - loss: 0.6913 - acc: 0.4286
Epoch 6/100
1/1 - 0s - loss: 0.6894 - acc: 0.4286
Epoch 7/100
1/1 - 0s - loss: 0.6876 - acc: 0.4286
Epoch 8/100
1/1 - 0s - loss: 0.6858 - acc: 0.4286
Epoch 9/100
1/1 - 0s - loss: 0.6840 - acc: 0.4286
Epoch 10/100
1/1 - 0s - loss: 0.6821 - acc: 0.4286
Epoch 11/100
1/1 - 0s - loss: 0.6803 - acc: 0.4286
Epoch 12/100
1/1 - 0s - loss: 0.6785 - acc: 0.4286
Epoch 13/100
1/1 - 0s - loss: 0.6767 - acc: 0.5714
Epoch 14/100
1/1 - 0s - loss: 0.6749 - acc: 0.7143
Epoch 15/100
1/1 - 0s - loss: 0.6731 - acc: 0.7143
Epoch 16/100
1/1 - 0s - loss: 0.6712 - acc: 0.8571
Epoch 17/100
1/1 - 0s - loss: 0.6694 - acc: 0.8571
Epoch 18/100
1/1 - 0s - loss: 0.6676 - acc: 0.8571
Epoch 19/100
1/1 - 0s - loss: 0.6658 - acc: 0.8571
Epoch 20/100
1/1 - 0s - loss: 0.6640 - a

<tensorflow.python.keras.callbacks.History at 0x7ff74812bb38>

# 프리트레인 워드 임베딩 사용

In [29]:
# Show the padded inputs and their labels before reusing them in this section.
print(X_train)
print(y_train)

[[ 1  2  3  4]
 [ 5  6  0  0]
 [ 7  8  0  0]
 [ 9 10  0  0]
 [11 12  0  0]
 [13  0  0  0]
 [14 15  0  0]]
[1 0 0 1 1 0 1]


In [30]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

--2020-09-12 16:55:34--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-09-12 16:55:34--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-09-12 16:55:34--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’


2020

**glove.6B.100d.txt 파일의 각 줄은 공백으로 구분된 101개의 값(단어 1개 + 100차원 벡터 값 100개)으로 이루어져 있음**

In [31]:
# Peek at the first two lines of the GloVe file to see its layout.
# The `with` block closes the file even if reading or printing raises.
n = 0  # keeps `n` defined even when the file has no lines
with open('glove.6B.100d.txt', encoding="utf-8") as f:
  for n, line in enumerate(f, start=1):
    word_vector = line.split()  # whitespace-separated tokens of this line
    print(word_vector)
    word = word_vector[0]  # the first token is the word itself
    print(word)
    if n == 2:  # two lines are enough for a preview
      break

['the', '-0.038194', '-0.24487', '0.72812', '-0.39961', '0.083172', '0.043953', '-0.39141', '0.3344', '-0.57545', '0.087459', '0.28787', '-0.06731', '0.30906', '-0.26384', '-0.13231', '-0.20757', '0.33395', '-0.33848', '-0.31743', '-0.48336', '0.1464', '-0.37304', '0.34577', '0.052041', '0.44946', '-0.46971', '0.02628', '-0.54155', '-0.15518', '-0.14107', '-0.039722', '0.28277', '0.14393', '0.23464', '-0.31021', '0.086173', '0.20397', '0.52624', '0.17164', '-0.082378', '-0.71787', '-0.41531', '0.20335', '-0.12763', '0.41367', '0.55187', '0.57908', '-0.33477', '-0.36559', '-0.54857', '-0.062892', '0.26584', '0.30205', '0.99775', '-0.80481', '-3.0243', '0.01254', '-0.36942', '2.2167', '0.72201', '-0.24978', '0.92136', '0.034514', '0.46745', '1.1079', '-0.19358', '-0.074575', '0.23353', '-0.052062', '-0.22044', '0.057162', '-0.15806', '-0.30798', '-0.41625', '0.37972', '0.15006', '-0.53212', '-0.2055', '-1.2526', '0.071624', '0.70565', '0.49744', '-0.42063', '0.26148', '-1.538', '-0.30223

In [32]:
# word_vector still holds the tokens of the last line read by the preview loop.
print(type(word_vector))
print(len(word_vector))

<class 'list'>
101


In [33]:
import numpy as np
# Map every word in the GloVe file to its vector.
embedding_dict = dict()

# The `with` block closes the file even if a line fails to parse.
with open('glove.6B.100d.txt', encoding="utf-8") as f:
  for line in f:
    word_vector = line.split()
    word = word_vector[0]  # the word
    # Remaining tokens are the vector components, stored as float32.
    word_vector_arr = np.asarray(word_vector[1:], dtype='float32')
    embedding_dict[word] = word_vector_arr

print('%s개의 임베딩 벡터가 있습니다.' % len(embedding_dict))

400000개의 임베딩 벡터가 있습니다.


In [34]:
# Spot-check one word from the training vocabulary: its vector and the vector length.
print(embedding_dict['respectable'])
print(len(embedding_dict['respectable']))

[-0.049773   0.19903    0.10585    0.1391    -0.32395    0.44053
  0.3947    -0.22805   -0.25793    0.49768    0.15384   -0.08831
  0.0782    -0.8299    -0.037788   0.16772   -0.45197   -0.17085
  0.74756    0.98256    0.81872    0.28507    0.16178   -0.48626
 -0.006265  -0.92469   -0.30625   -0.067318  -0.046762  -0.76291
 -0.0025264 -0.018795   0.12882   -0.52457    0.3586     0.43119
 -0.89477   -0.057421  -0.53724    0.25587    0.55195    0.44698
 -0.24252    0.29946    0.25776   -0.8717     0.68426   -0.05688
 -0.1848    -0.59352   -0.11227   -0.57692   -0.013593   0.18488
 -0.32507   -0.90171    0.17672    0.075601   0.54896   -0.21488
 -0.54018   -0.45882   -0.79536    0.26331    0.18879   -0.16363
  0.3975     0.1099     0.1164    -0.083499   0.50159    0.35802
  0.25677    0.088546   0.42108    0.28674   -0.71285   -0.82915
  0.15297   -0.82712    0.022112   1.067     -0.31776    0.1211
 -0.069755  -0.61327    0.27308   -0.42638   -0.085084  -0.17694
 -0.0090944  0.1109     0.

In [35]:
# One row per vocabulary index, 100 columns to hold a GloVe vector; rows stay
# all-zero until they are filled in.
embedding_matrix = np.zeros(shape=(vocab_size, 100))
embedding_matrix.shape

(16, 100)

In [36]:
# Show the (word, index) pairs of the tokenizer vocabulary.
print(t.word_index.items())

dict_items([('nice', 1), ('great', 2), ('best', 3), ('amazing', 4), ('stop', 5), ('lies', 6), ('pitiful', 7), ('nerd', 8), ('excellent', 9), ('work', 10), ('supreme', 11), ('quality', 12), ('bad', 13), ('highly', 14), ('respectable', 15)])


In [38]:
# Copy the pretrained GloVe vector of every vocabulary word into its matrix row.
for word, i in t.word_index.items():
  temp = embedding_dict.get(word)  # None when GloVe has no entry for this word
  if temp is None:
    continue  # leave the row as zeros
  embedding_matrix[i] = temp

In [39]:
#훈련 데이터 단어 집합의 모든 단어들에 대해서 사전 훈련된 글로브의 임베딩 벡터를 맵핑했다.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

from tensorflow.keras.initializers import Constant

model = Sequential()
# Initialise the embedding table from the pretrained matrix through
# `embeddings_initializer` rather than the legacy `weights=` constructor argument,
# which not every Keras release accepts.
e = Embedding(
    vocab_size,
    100,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=max_len,
    trainable=False,  # pretrained vectors are used as-is, so the layer is frozen
)

In [40]:
# Start from a fresh Sequential here so that re-running this cell does not stack
# a second Embedding/Flatten/Dense group onto an already populated `model`.
model = Sequential()
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# verbose=0 trains silently; the History object is the cell output.
model.fit(X_train, y_train, epochs=100, verbose=0)

<tensorflow.python.keras.callbacks.History at 0x7ff743ce04e0>

## 사전 훈련된 Word2Vec

In [41]:
import numpy as np
import gensim

In [42]:
!wget "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2020-09-12 17:26:08--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.112.85
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.112.85|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2020-09-12 17:26:31 (68.4 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [43]:
# Load the downloaded GoogleNews vectors; the file is in binary word2vec format.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [44]:
# Shape of the loaded vector table.
print(word2vec_model.vectors.shape)

(3000000, 300)


In [45]:
# One row per vocabulary index, 300 columns to hold a word2vec vector; rows stay
# all-zero until they are filled in.
embedding_matrix = np.zeros(shape=(vocab_size, 300))
embedding_matrix.shape

(16, 300)

In [46]:
def get_vector(word):
  """Return the vector stored in `word2vec_model` for `word`, or None if the word is absent."""
  return word2vec_model[word] if word in word2vec_model else None

In [47]:
# Copy the word2vec vector of every vocabulary word into its matrix row.
for word, i in t.word_index.items():
  temp = get_vector(word)  # None when the word is not in the word2vec model
  if temp is None:
    continue  # leave the row as zeros
  embedding_matrix[i] = temp

In [48]:
# Confirm the matrix shape after filling it.
embedding_matrix.shape

(16, 300)

In [51]:
# Print the word2vec vector stored for 'nice'.
print(word2vec_model['nice'])

[ 0.15820312  0.10595703 -0.18945312  0.38671875  0.08349609 -0.26757812
  0.08349609  0.11328125 -0.10400391  0.17871094 -0.12353516 -0.22265625
 -0.01806641 -0.25390625  0.13183594  0.0859375   0.16113281  0.11083984
 -0.11083984 -0.0859375   0.0267334   0.34570312  0.15136719 -0.00415039
  0.10498047  0.04907227 -0.06982422  0.08642578  0.03198242 -0.02844238
 -0.15722656  0.11865234  0.36132812  0.00173187  0.05297852 -0.234375
  0.11767578  0.08642578 -0.01123047  0.25976562  0.28515625 -0.11669922
  0.38476562  0.07275391  0.01147461  0.03466797  0.18164062 -0.03955078
  0.04199219  0.01013184 -0.06054688  0.09765625  0.06689453  0.14648438
 -0.12011719  0.08447266 -0.06152344  0.06347656  0.3046875  -0.35546875
 -0.2890625   0.19628906 -0.33203125 -0.07128906  0.12792969  0.09619141
 -0.12158203 -0.08691406 -0.12890625  0.27734375  0.265625    0.1796875
  0.12695312  0.06298828 -0.34375    -0.05908203  0.0456543   0.171875
  0.08935547  0.14648438 -0.04638672 -0.00842285 -0.0279

In [54]:
# Look up the tokenizer's integer index for 'nice'.
print('단어 nice의 정수 인덱스 :', t.word_index['nice'])

단어 nice의 정수 인덱스 : 1


In [55]:
# Display row 1 of the embedding matrix.
embedding_matrix[1]

array([ 0.15820312,  0.10595703, -0.18945312,  0.38671875,  0.08349609,
       -0.26757812,  0.08349609,  0.11328125, -0.10400391,  0.17871094,
       -0.12353516, -0.22265625, -0.01806641, -0.25390625,  0.13183594,
        0.0859375 ,  0.16113281,  0.11083984, -0.11083984, -0.0859375 ,
        0.0267334 ,  0.34570312,  0.15136719, -0.00415039,  0.10498047,
        0.04907227, -0.06982422,  0.08642578,  0.03198242, -0.02844238,
       -0.15722656,  0.11865234,  0.36132812,  0.00173187,  0.05297852,
       -0.234375  ,  0.11767578,  0.08642578, -0.01123047,  0.25976562,
        0.28515625, -0.11669922,  0.38476562,  0.07275391,  0.01147461,
        0.03466797,  0.18164062, -0.03955078,  0.04199219,  0.01013184,
       -0.06054688,  0.09765625,  0.06689453,  0.14648438, -0.12011719,
        0.08447266, -0.06152344,  0.06347656,  0.3046875 , -0.35546875,
       -0.2890625 ,  0.19628906, -0.33203125, -0.07128906,  0.12792969,
        0.09619141, -0.12158203, -0.08691406, -0.12890625,  0.27

In [56]:
# Train a classifier on top of the frozen, pretrained embedding_matrix.
from tensorflow.keras.initializers import Constant

model = Sequential()
# Initialise the embedding table through `embeddings_initializer` rather than the
# legacy `weights=` constructor argument, which not every Keras release accepts.
e = Embedding(
    vocab_size,
    300,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=max_len,
    trainable=False,  # keep the pretrained vectors fixed during training
)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# verbose=2 prints one summary line per epoch.
model.fit(X_train, y_train, epochs=100, verbose=2)

Epoch 1/100
1/1 - 0s - loss: 0.6854 - acc: 0.4286
Epoch 2/100
1/1 - 0s - loss: 0.6668 - acc: 0.5714
Epoch 3/100
1/1 - 0s - loss: 0.6488 - acc: 0.7143
Epoch 4/100
1/1 - 0s - loss: 0.6314 - acc: 0.8571
Epoch 5/100
1/1 - 0s - loss: 0.6145 - acc: 0.8571
Epoch 6/100
1/1 - 0s - loss: 0.5981 - acc: 0.8571
Epoch 7/100
1/1 - 0s - loss: 0.5823 - acc: 1.0000
Epoch 8/100
1/1 - 0s - loss: 0.5671 - acc: 1.0000
Epoch 9/100
1/1 - 0s - loss: 0.5524 - acc: 1.0000
Epoch 10/100
1/1 - 0s - loss: 0.5382 - acc: 1.0000
Epoch 11/100
1/1 - 0s - loss: 0.5245 - acc: 1.0000
Epoch 12/100
1/1 - 0s - loss: 0.5113 - acc: 1.0000
Epoch 13/100
1/1 - 0s - loss: 0.4986 - acc: 1.0000
Epoch 14/100
1/1 - 0s - loss: 0.4863 - acc: 1.0000
Epoch 15/100
1/1 - 0s - loss: 0.4744 - acc: 1.0000
Epoch 16/100
1/1 - 0s - loss: 0.4630 - acc: 1.0000
Epoch 17/100
1/1 - 0s - loss: 0.4519 - acc: 1.0000
Epoch 18/100
1/1 - 0s - loss: 0.4412 - acc: 1.0000
Epoch 19/100
1/1 - 0s - loss: 0.4309 - acc: 1.0000
Epoch 20/100
1/1 - 0s - loss: 0.4209 - a

<tensorflow.python.keras.callbacks.History at 0x7ff7056a4710>