In [2]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# embedding file stored on Drive can be read by later cells.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [6]:
# Download the GloVe data.
# The shell commands are wrapped in a string literal so that they do NOT run
# when this cell executes; the cell only echoes the string. Remove the triple
# quotes to fetch the archive, unpack it, and move the 100-dimensional vectors
# into the Drive folder.
"""
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip
!mv glove.6B.100d.txt /content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter5_Words_And_TypeEmbedding/glove100d
"""

'\n!wget http://nlp.stanford.edu/data/glove.6B.zip\n!unzip glove.6B.zip\n!mv glove.6B.100d.txt /content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter5_Words_And_TypeEmbedding/glove100d\n'

In [4]:
#!pip install annoy
import numpy as np
from annoy import AnnoyIndex

Collecting annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/647.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/647.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.3-cp310-cp310-linux_x86_64.whl size=552450 sha256=5615f346fa6d3941bded45dcc71b8ae8b5bd74ba731c34a09a3833ac83801c57
  Stored in directory: /root/.cache/pip/wheels/64/8a/da/f714bcf46c5efdcfcac0559e63370c21abe961c48e3992465a
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.3


In [18]:
class PreTrainedEmbeddings(object):
  """Pre-trained word vectors with a nearest-neighbour index on top.

  An instance keeps the word -> index mapping, the reverse index -> word
  mapping, the list of vectors, and an ``AnnoyIndex`` (Euclidean metric)
  built over those vectors for neighbour queries.
  """

  def __init__(self, word_to_index, word_vectors):
    """Store the vocabulary and build the nearest-neighbour index.

    Args:
      word_to_index (dict): maps each word (str) to its integer index.
      word_vectors (list of numpy.ndarray): vectors, addressed by that index.
    """
    self.word_to_index = word_to_index
    self.word_vectors = word_vectors
    self.index_to_word = {
        index: word for word, index in self.word_to_index.items()}

    # Annoy provides the nearest-neighbour search:
    # AnnoyIndex(vector length, metric).
    self.index = AnnoyIndex(len(word_vectors[0]), metric="euclidean")
    for i in self.word_to_index.values():
      self.index.add_item(i, self.word_vectors[i])  # register vector i
    # The index is tree based (50 trees here). Once the trees are built,
    # adding a word means rebuilding the index from scratch.
    self.index.build(50)

  @classmethod
  def from_embeddings_file(cls, embedding_file):
    """Create an instance from a text file of pre-trained vectors.

    File format, one entry per line: ``word value1 value2 ... valueN``
    with single spaces between the fields.

    Blank lines are ignored. If a word appears more than once, only its
    first occurrence is kept, so indices stay dense and always address the
    vector that was read together with the word.

    Args:
      embedding_file (str): path to the embedding file.

    Returns:
      PreTrainedEmbeddings: instance holding the parsed vocabulary.
    """
    word_to_index = {}
    word_vectors = []
    # GloVe files are UTF-8; do not rely on the platform default encoding.
    with open(embedding_file, encoding="utf-8") as fp:
      # Iterate lazily instead of fp.readlines() so the whole file is never
      # held in memory on top of the parsed vectors.
      for line in fp:
        line = line.rstrip()
        if not line:
          continue
        # Split on a plain space only: str.split() with no argument would
        # also split on other Unicode whitespace inside a token.
        fields = line.split(" ")
        word = fields[0]
        if word in word_to_index:
          continue
        vec = np.array([float(x) for x in fields[1:]])

        # Index by the number of stored vectors so that the mapping and the
        # vector list can never drift apart.
        word_to_index[word] = len(word_vectors)
        word_vectors.append(vec)
    return cls(word_to_index, word_vectors)

  def get_embedding(self, word):
    """Return the vector stored for ``word``.

    Args:
      word (str): a word present in the vocabulary.

    Returns:
      numpy.ndarray: the embedding of ``word``.

    Raises:
      KeyError: if ``word`` is not in the vocabulary.
    """
    return self.word_vectors[self.word_to_index[word]]

  def get_closest_to_vector(self, vector, n=1):
    """Return the ``n`` words whose vectors are nearest to ``vector``.

    Args:
      vector (numpy.ndarray): query vector.
      n (int): how many neighbours to return.

    Returns:
      list of str: the neighbouring words, in the order the index returns
      them: [word_1, ..., word_n].
    """
    nn_indices = self.index.get_nns_by_vector(vector, n)
    return [self.index_to_word[neighbor] for neighbor in nn_indices]

  def compute_and_print_analogy(self, word1, word2, word3):
    """Print completions of the analogy ``word1 : word2 = word3 : ?``.

    The offset ``word2 - word1`` is added to ``word3`` and the four nearest
    words to the resulting vector are looked up; the three input words are
    dropped from the candidates before printing.

    Args:
      word1 (str): first word of the reference pair.
      word2 (str): second word of the reference pair.
      word3 (str): first word of the pair to complete.

    Raises:
      KeyError: if any of the three words is not in the vocabulary.
    """
    vec1 = self.get_embedding(word1)
    vec2 = self.get_embedding(word2)
    vec3 = self.get_embedding(word3)

    spatial_relationship = vec2 - vec1
    vec4 = vec3 + spatial_relationship

    closest_words = self.get_closest_to_vector(vec4, n=4)
    existing_words = {word1, word2, word3}
    closest_words = [word for word in closest_words
                     if word not in existing_words]

    if not closest_words:
      print("계산된 벡터와 가장 가까운 이웃을 찾을 수 없습니다.")
    for word4 in closest_words:
      print("{} : {} = {} : {}".format(word1, word2, word3, word4))

In [20]:
# Folder on the mounted Drive that holds the 100-dimensional GloVe vectors.
embedding_file_dir = "/content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter5_Words_And_TypeEmbedding/glove100d/"
# Parse the vector file and build the embeddings object used by the cells below.
embeddings = PreTrainedEmbeddings.from_embeddings_file(
  embedding_file_dir + "glove.6B.100d.txt"
)

In [21]:
#상위 5개 항목 출력
# Preview the start of the vocabulary: entries with index 0 through 5
# (six rows), showing the first five components of each vector.
for idx, token in embeddings.index_to_word.items():
  if idx > 5:
    break
  head = embeddings.get_embedding(token)[:5]
  print(idx, token, head)

0 the [-0.038194 -0.24487   0.72812  -0.39961   0.083172]
1 , [-0.10767  0.11053  0.59812 -0.54361  0.67396]
2 . [-0.33979  0.20941  0.46348 -0.64792 -0.38377]
3 of [-0.1529  -0.24279  0.89837  0.16996  0.53516]
4 to [-0.1897    0.050024  0.19084  -0.049184 -0.089737]
5 and [-0.071953  0.23127   0.023731 -0.50638   0.33923 ]


In [22]:
# Analogy query: man : he = woman : ?
embeddings.compute_and_print_analogy(word1="man", word2="he", word3="woman")

man : he = woman : she
man : he = woman : never
man : he = woman : her


In [23]:
# Analogy query: fly : plane = sail : ?
embeddings.compute_and_print_analogy(word1="fly", word2="plane", word3="sail")

fly : plane = sail : ship
fly : plane = sail : vessel
fly : plane = sail : boat


In [24]:
# Analogy query: cat : kitten = dog : ?
embeddings.compute_and_print_analogy(word1="cat", word2="kitten", word3="dog")

cat : kitten = dog : puppy
cat : kitten = dog : puppies
cat : kitten = dog : pug


In [25]:
# Analogy query: blue : color = dog : ?
embeddings.compute_and_print_analogy(word1="blue", word2="color", word3="dog")

blue : color = dog : animal
blue : color = dog : breed
blue : color = dog : pet


In [26]:
# Analogy query: leg : legs = hand : ?
embeddings.compute_and_print_analogy(word1="leg", word2="legs", word3="hand")

leg : legs = hand : hands
leg : legs = hand : fingers
leg : legs = hand : ears


In [28]:
# Analogy query: toe : foot = finger : ?
embeddings.compute_and_print_analogy(word1="toe", word2="foot", word3="finger")

toe : foot = finger : ground
toe : foot = finger : moving
toe : foot = finger : attached


In [29]:
# Analogy query: talk : communicate = read : ?
embeddings.compute_and_print_analogy(word1="talk", word2="communicate", word3="read")

talk : communicate = read : interpret
talk : communicate = read : memorize
talk : communicate = read : decipher


In [30]:
# Analogy query: blue : democrat = red : ?
embeddings.compute_and_print_analogy(word1="blue", word2="democrat", word3="red")

blue : democrat = red : republican
blue : democrat = red : congressman
blue : democrat = red : senator


In [31]:
# Analogy query: man : king = woman : ?
embeddings.compute_and_print_analogy(word1="man", word2="king", word3="woman")

man : king = woman : queen
man : king = woman : monarch
man : king = woman : throne


In [32]:
# Analogy query: man : doctor = woman : ?
# NOTE(review): the recorded output ranks "nurse" first, which suggests the
# vectors carry gender associations from their training text.
embeddings.compute_and_print_analogy(word1="man", word2="doctor", word3="woman")

man : doctor = woman : nurse
man : doctor = woman : physician


In [33]:
# Analogy query: fast : fastest = small : ?
embeddings.compute_and_print_analogy(word1="fast", word2="fastest", word3="small")

fast : fastest = small : smallest
fast : fastest = small : largest
fast : fastest = small : places
fast : fastest = small : registered
