# GloVe 살펴보기

In [1]:
#-*- coding: utf-8 -*-

import torch
import pprint

from torchtext.vocab import load_word_vectors

## 데이터 불러오기

In [2]:
# Load the 'glove.6B' word vectors (dimension argument 100), looking in the
# current directory ('.').  The call yields three values:
#   wv_dict - used elsewhere in this file as wv_dict[word] to get an index
#   wv_arr  - indexed with that value to obtain the word's vector
#   wv_size - presumably the vector dimensionality; not used in this file
wv_dict, wv_arr, wv_size = load_word_vectors('.', 'glove.6B', 100)

# Report the number of entries in wv_arr.
print('Loaded', len(wv_arr), 'words')

loading word vectors from ./glove.6B.100d.pt
Loaded 400000 words


In [3]:
def closest(d, n=10):
    """Return the ``n`` vocabulary entries nearest to the vector ``d``.

    Every word in ``wv_dict`` is scored with ``torch.dist`` between ``d`` and
    that word's vector (as returned by ``get_word``).

    Args:
        d: Query vector to compare against every word vector.
        n: Number of leading entries to keep after sorting (default 10).

    Returns:
        A list of ``(word, distance)`` pairs ordered by ascending distance,
        truncated with ``[:n]``.
    """
    def by_distance(pair):
        return pair[1]

    scored = []
    for word in wv_dict:
        scored.append((word, torch.dist(d, get_word(word))))

    # list.sort is stable, so equal distances keep vocabulary order.
    scored.sort(key=by_distance)
    return scored[:n]

## 가장 가까운 벡터 찾기(similarities)

In [4]:
def get_word(word):
    """Return the vector stored for ``word``.

    The word is first resolved to a position via ``wv_dict`` and that
    position is then used to index ``wv_arr``.  A word that is absent from
    ``wv_dict`` propagates whatever lookup error ``wv_dict`` raises.
    """
    index = wv_dict[word]
    return wv_arr[index]

def print_tuples(tuples):
    """Print each entry of ``tuples`` on its own line as ``(distance) word``.

    Args:
        tuples: Iterable of indexable entries; item 0 is printed with ``%s``
            and item 1 is printed as a number with four decimal places.
    """
    # Named ``entry`` rather than ``tuple`` so the builtin type is not shadowed.
    for entry in tuples:
        print('(%.4f) %s' % (entry[1], entry[0]))

In [5]:
# Print the words closest to the vector for 'frog'; closest() is called with
# its default n, so 10 entries are shown (the word itself comes first).
print_tuples(closest(get_word('frog')))

(0.0000) frog
(4.1250) toad
(4.4973) snake
(4.5834) ape
(4.6184) monkey
(4.6711) frogs
(4.6993) litoria
(4.7110) spider
(4.7218) hypsiboas
(4.7768) squirrel


In [6]:
def analogy(w1, w2, w3, n=5, filter_given=True):
    """Print the ``n`` words that best complete ``w1 : w2 :: w3 : ?``.

    The query vector is ``get_word(w2) - get_word(w1) + get_word(w3)`` and
    candidates are ranked by ``closest``.

    Args:
        w1: First word of the known pair.
        w2: Second word of the known pair.
        w3: Word whose counterpart is sought.
        n: Number of results to print (default 5).
        filter_given: When true, drop ``w1``, ``w2`` and ``w3`` from the
            candidates before printing.
    """
    print('\n[%s : %s :: %s : ?]' % (w1, w2, w3))

    # w2 - w1 + w3 = w4
    target = get_word(w2) - get_word(w1) + get_word(w3)

    given = {w1, w2, w3}

    # Ask closest() for enough candidates that ``n`` remain after filtering.
    # Relying on its default of 10 would cap the output below ``n`` for
    # larger requests.
    wanted = n + len(given) if filter_given else n
    closest_words = closest(target, n=wanted)

    # Optionally filter out given words
    if filter_given:
        closest_words = [t for t in closest_words if t[0] not in given]

    print_tuples(closest_words[:n])

In [7]:
# Run a handful of analogies.  Each call prints a "[w1 : w2 :: w3 : ?]" header
# followed by the top 5 (default n) candidates, excluding the given words.
analogy('king', 'man', 'queen')
analogy('man', 'actor', 'woman')
analogy('cat', 'kitten', 'dog')
analogy('dog', 'puppy', 'cat')
analogy('earth', 'moon', 'sun') # Interesting failure mode


[king : man :: queen : ?]
(4.0811) woman
(4.6916) girl
(5.2703) she
(5.2788) teenager
(5.3084) boy

[man : actor :: woman : ?]
(2.8133) actress
(5.0039) comedian
(5.1399) actresses
(5.2773) starred
(5.3085) screenwriter

[cat : kitten :: dog : ?]
(3.8146) puppy
(4.2944) rottweiler
(4.5888) puppies
(4.6086) pooch
(4.6520) pug

[dog : puppy :: cat : ?]
(3.8146) kitten
(4.0255) puppies
(4.1575) kittens
(4.1882) pterodactyl
(4.1945) scaredy

[earth : moon :: sun : ?]
(6.2294) lee
(6.4125) kang
(6.4644) tan
(6.4757) yang
(6.4853) lin
