In [4]:
%matplotlib notebook

import numpy as np
import numpy
from tqdm import tqdm
import gensim
import random


## Download Word2Vec Vectors from https://drive.google.com/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM&export=download

## Download analogies data from http://download.tensorflow.org/data/questions-words.txt

![alt text](https://www.tensorflow.org/images/linear-relationships.png "title")

In [5]:
model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)

In [6]:
king_vector = model['king']
print(len(king_vector))
print(king_vector)

300
[  1.25976562e-01   2.97851562e-02   8.60595703e-03   1.39648438e-01
  -2.56347656e-02  -3.61328125e-02   1.11816406e-01  -1.98242188e-01
   5.12695312e-02   3.63281250e-01  -2.42187500e-01  -3.02734375e-01
  -1.77734375e-01  -2.49023438e-02  -1.67968750e-01  -1.69921875e-01
   3.46679688e-02   5.21850586e-03   4.63867188e-02   1.28906250e-01
   1.36718750e-01   1.12792969e-01   5.95703125e-02   1.36718750e-01
   1.01074219e-01  -1.76757812e-01  -2.51953125e-01   5.98144531e-02
   3.41796875e-01  -3.11279297e-02   1.04492188e-01   6.17675781e-02
   1.24511719e-01   4.00390625e-01  -3.22265625e-01   8.39843750e-02
   3.90625000e-02   5.85937500e-03   7.03125000e-02   1.72851562e-01
   1.38671875e-01  -2.31445312e-01   2.83203125e-01   1.42578125e-01
   3.41796875e-01  -2.39257812e-02  -1.09863281e-01   3.32031250e-02
  -5.46875000e-02   1.53198242e-02  -1.62109375e-01   1.58203125e-01
  -2.59765625e-01   2.01416016e-02  -1.63085938e-01   1.35803223e-03
  -1.44531250e-01  -5.68847656

In [7]:
analogy_vector = model['king'] - model['man'] + model['queen']
print(analogy_vector)

[-0.19494629 -0.24462891 -0.09539795  0.34619141  0.01635742 -0.08374023
  0.23876953 -0.42193604 -0.28955078  0.41699982 -0.34667969 -0.43359375
  0.01025391  0.1256485   0.06689453 -0.09790039  0.20507812 -0.10098267
  0.08251953  0.32373047  0.45800781  0.24821472 -0.20288086  0.28955078
  0.10253906 -0.03857422 -0.48242188 -0.1081543   0.15527344 -0.15270996
 -0.21386719 -0.20581055  0.10595703  0.63391113 -0.58105469 -0.30175781
 -0.11279297  0.39648438 -0.09619141  0.03613281  0.21411133 -0.60888672
  0.34082031  0.12298584  0.29492188 -0.00195312 -0.03781128  0.16821289
  0.15332031  0.24481201 -0.04626465  0.13183594 -0.26269531  0.17150879
 -0.40771484  0.02772522 -0.41064453 -0.1862793   0.07617188 -0.10406494
  0.14550781  0.41992188  0.00299072  0.27587891  0.18359375 -0.39794922
 -0.07757568  0.12207031 -0.43115234  0.42651367  0.19873047  0.12451172
 -0.11230469  0.17626953 -0.01159668  0.02636719  0.0045166   0.12103271
  0.25024414 -0.10375977 -0.22241211  0.20141602  0

In [8]:
# example analogy task like king - man + woman = queen
answer = model.most_similar(positive=['woman', 'king'], negative=['man'])
print("king - man + woman = {}".format(answer))

king - man + woman = [(u'queen', 0.7118192911148071), (u'monarch', 0.6189674735069275), (u'princess', 0.5902431011199951), (u'crown_prince', 0.549946129322052), (u'prince', 0.5377321243286133), (u'kings', 0.5236844420433044), (u'Queen_Consort', 0.5235945582389832), (u'queens', 0.5181134343147278), (u'sultan', 0.5098593235015869), (u'monarchy', 0.5087411999702454)]


In [9]:
analogy_words = [line.rstrip('\n').split(' ') for line in open('data/questions-words.txt')]
analogy_words = [words for words in analogy_words if len(words) == 4]
np.random.seed(0)
analogy_words = random.sample(analogy_words, 100)
X = [words[:3] for words in analogy_words]
y = [words[3] for words in analogy_words]

In [10]:
print(X[0], y[0])
print(X[10], y[10])
print(X[50], y[50])

(['large', 'larger', 'bright'], 'brighter')
(['Egypt', 'Egyptian', 'England'], 'English')
(['Ukraine', 'hryvnia', 'Hungary'], 'forint')


In [11]:
is_correct_list = []
top_5_predictions_list = []
for i in tqdm(range(len(X))):
    components = X[i]
    answer = y[i]
    predictions = model.most_similar(positive=[components[1], components[2]], negative=[components[0]])
    top_5_predictions = [p[0].lower() for p in sorted(predictions, key=lambda x : -x[1])[:10]]
    top_5_predictions_list.append(top_5_predictions)
    is_in_top_5 = 1.0 if answer.lower() in top_5_predictions else 0.0
    is_correct_list.append(is_in_top_5)

100%|██████████| 100/100 [02:08<00:00,  2.19it/s]


In [12]:
for i in range(10):
    components = X[i]
    answer = y[i]
    top5 = top_5_predictions_list[i]
    correct = is_correct_list[i]
    print("Components: {}, Answer: {} Top5: {} is_correct: {}".format(components, answer, top5, correct))

Components: ['large', 'larger', 'bright'], Answer: brighter Top5: [u'brighter', u'shine_brighter', u'darker', u'dimmer', u'rosier', u'sunnier', u'cheerier', u'cloudier', u'bluer', u'prettier'] is_correct: 1.0
Components: ['fortunate', 'fortunately', 'swift'], Answer: swiftly Top5: [u'swifter', u'quick', u'swiftly', u'speedy', u'thankfully', u'prompt', u'swiftness', u'speedier', u'speedily', u'predictably'] is_correct: 1.0
Components: ['he', 'she', 'son'], Answer: daughter Top5: [u'daughter', u'mother', u'niece', u'husband', u'granddaughter', u'eldest_daughter', u'sister', u'daughters', u'daugther', u'grandmother'] is_correct: 1.0
Components: ['slow', 'slower', 'heavy'], Answer: heavier Top5: [u'heavier', u'heavy', u'heaviest', u'heavier', u'lighter', u'heftier', u'heaver', u'lighter', u'softer', u'weaker'] is_correct: 1.0
Components: ['Korea', 'Korean', 'Iceland'], Answer: Icelandic Top5: [u'icelandic', u'reykjavik', u'fr\xe9ttabladid_reports', u'faroese', u'icelanders', u'norwegian', 

In [13]:
print("Accuracy in Analogy Task is", np.mean(is_correct_list))

('Accuracy in Analogy Task is', 0.93000000000000005)
