In [1]:
import numpy as np

def read_glove_file(file):
    """Load word vectors from a space-delimited, GloVe-style text file.

    Each usable line has the form ``word v1 v2 ... vn`` with fields separated
    by single spaces. Lines that split into fewer than 10 fields (for example
    a short header line or a blank line) are skipped.

    Parameters
    ----------
    file : str or path-like
        Path of the UTF-8 encoded text file to read.

    Returns
    -------
    dict
        Maps each word (str) to a 1-D ``numpy`` array holding its vector
        components. If a word occurs more than once, the last line wins.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a non-empty component field cannot be parsed as a float.
    """
    print("Reading the glove file . . . this can take a bit")
    cbow_dict = {}
    # The context manager guarantees the handle is closed, even if a line
    # fails to parse part-way through the file.
    with open(file, encoding="utf8") as fin:
        for line in fin:
            items = line.replace('\r', '').replace('\n', '').split(' ')
            # Too few fields to be a real "word + vector" record.
            if len(items) < 10:
                continue
            word = items[0]
            # Empty fields come from doubled or trailing spaces; drop them.
            cbow_dict[word] = np.array([float(tok) for tok in items[1:] if tok])
    return cbow_dict

In [2]:
# Path of the pre-trained vector file, resolved relative to the working directory
GLOVE_PATH = 'glove.6B.50d.txt'
cbow_dict = read_glove_file(GLOVE_PATH)

Reading the glove file . . . this can take a bit


In [3]:
# Words in the order they were inserted into cbow_dict; peek at the first 100
vocab = list(cbow_dict)
print(vocab[:100])

['the', ',', '.', 'of', 'to', 'and', 'in', 'a', '"', "'s", 'for', '-', 'that', 'on', 'is', 'was', 'said', 'with', 'he', 'as', 'it', 'by', 'at', '(', ')', 'from', 'his', "''", '``', 'an', 'be', 'has', 'are', 'have', 'but', 'were', 'not', 'this', 'who', 'they', 'had', 'i', 'which', 'will', 'their', ':', 'or', 'its', 'one', 'after', 'new', 'been', 'also', 'we', 'would', 'two', 'more', "'", 'first', 'about', 'up', 'when', 'year', 'there', 'all', '--', 'out', 'she', 'other', 'people', "n't", 'her', 'percent', 'than', 'over', 'into', 'last', 'some', 'government', 'time', '$', 'you', 'years', 'if', 'no', 'world', 'can', 'three', 'do', ';', 'president', 'only', 'state', 'million', 'could', 'us', 'most', '_', 'against', 'u.s.']


In [4]:
# Usage example: look up and print the vectors of a few words
for word in ('king', 'man', 'queen'):
    print(cbow_dict[word])

[ 0.50451   0.68607  -0.59517  -0.022801  0.60046  -0.13498  -0.08813
  0.47377  -0.61798  -0.31012  -0.076666  1.493    -0.034189 -0.98173
  0.68229   0.81722  -0.51874  -0.31503  -0.55809   0.66421   0.1961
 -0.13495  -0.11476  -0.30344   0.41177  -2.223    -1.0756   -1.0783
 -0.34354   0.33505   1.9927   -0.04234  -0.64319   0.71125   0.49159
  0.16754   0.34344  -0.25663  -0.8523    0.1661    0.40102   1.1685
 -1.0137   -0.21585  -0.15155   0.78321  -0.91241  -1.6106   -0.64426
 -0.51042 ]
[-0.094386  0.43007  -0.17224  -0.45529   1.6447    0.40335  -0.37263
  0.25071  -0.10588   0.10778  -0.10848   0.15181  -0.65396   0.55054
  0.59591  -0.46278   0.11847   0.64448  -0.70948   0.23947  -0.82905
  1.272     0.033021  0.2935    0.3911   -2.8094   -0.70745   0.4106
  0.3894   -0.2913    2.6124   -0.34576  -0.16832   0.25154   0.31216
  0.31639   0.12539  -0.012646  0.22297  -0.56585  -0.086264  0.62549
 -0.0576    0.29375   0.66005  -0.53115  -0.48233  -0.97925   0.53135
 -0.11725 ]


In [5]:
# Analogy by vector arithmetic on the loaded embeddings: king - man + woman
king, man, woman = (cbow_dict[w] for w in ('king', 'man', 'woman'))
new_vector = king - man + woman

# Squared Euclidean distance from every vocabulary vector to the analogy
# vector, kept in vocab order so that D[i] belongs to vocab[i].
D = np.array([np.sum((cbow_dict[word] - new_vector) ** 2) for word in vocab])
    

In [6]:
# Show the array of squared distances, one entry per vocabulary word
D

array([29.14942099, 27.10428728, 26.1167605 , ..., 65.36870848,
       67.16410272, 46.96085386])

In [7]:
# Indices of the 10 smallest distances, closest first
np.argsort(D)[:10]

array([ 691, 2060, 1781, 3218, 1131, 6649, 6752, 9074, 5591, 8042],
      dtype=int32)

In [8]:
# Translate the 10 closest indices back into their words
closest_idx = np.argsort(D)[:10]
related_words = [vocab[idx] for idx in closest_idx]
print(related_words)

['king', 'queen', 'prince', 'elizabeth', 'daughter', 'widow', 'throne', 'monarch', 'cousin', 'eldest']


In [9]:
# Word exclusion test: stack the vectors of the candidate words, one row per word

word_list = [
    'bacon',
    'egg',
    'coffee',
    'horse',
]
vecs = np.array([cbow_dict[word] for word in word_list])

In [10]:
from sklearn.metrics.pairwise import pairwise_distances

# Distance between every pair of rows in vecs; 'euclidean' is the default metric
D_mat = pairwise_distances(vecs, metric='euclidean')

In [11]:
# Show the pairwise distance matrix (rows and columns follow word_list order)
D_mat


array([[0.        , 4.88187252, 5.41078844, 6.53006327],
       [4.88187252, 0.        , 5.49023294, 6.06867756],
       [5.41078844, 5.49023294, 0.        , 5.89841102],
       [6.53006327, 6.06867756, 5.89841102, 0.        ]])

In [12]:
# The odd one out is the word with the largest mean distance to the listed words
mean_dist = D_mat.mean(axis=1)
word_list[mean_dist.argmax()]

'horse'