In [1]:
import numpy as np
import scipy.spatial.distance
import sklearn.cluster

In [146]:
# Load the first `maxwords` lines of the GloVe text file into two parallel
# lists: words[i] is the token and vecs[i] its embedding as a float32 array.
# Each line has the form "<token> <v1> <v2> ...", split on whitespace.
maxwords = 10000
words, vecs = [], []
# GloVe files are UTF-8 text; naming the encoding keeps the read independent
# of the platform's default locale encoding.
with open('glove.6B.50d.txt', encoding='utf-8') as fp:
    for line in fp:
        values = line.split()
        words.append(values[0])
        vecs.append(np.asarray(values[1:], dtype='float32'))
        # Stop once the limit is reached. Comparing against the list length
        # (instead of counting `maxwords` down) leaves the setting intact.
        if len(words) == maxwords:
            break

In [166]:
def find_nearest(vec, nr=5):
    """Return the `nr` vocabulary entries closest to `vec` by cosine distance.

    The query is compared against every vector in the module-level `vecs`
    list. The result is a list of `(word, distance)` tuples ordered from
    smallest to largest distance, with words taken from the module-level
    `words` list.
    """
    # cdist works on 2-D inputs, so the query is wrapped as a one-row matrix
    # and the single resulting row of distances is pulled back out.
    distances = scipy.spatial.distance.cdist([vec], vecs, 'cosine')[0]
    closest = distances.argsort()[:nr]
    return [(words[idx], distances[idx]) for idx in closest]

def emb(word):
    """Look up the embedding vector stored for `word`.

    The word's position in the module-level `words` list selects the matching
    entry of `vecs`. `list.index` raises ValueError if the word is not loaded.
    """
    position = words.index(word)
    return vecs[position]

In [167]:
# Sanity check: the ten vocabulary entries closest to "forest" by cosine distance.
# The query word itself is expected first, at distance 0.
find_nearest(emb('forest'), nr=10)

[('forest', 0.0),
 ('forests', 0.10678214757979143),
 ('habitat', 0.19241926402944032),
 ('wildlife', 0.24355362228663868),
 ('pine', 0.24798890650159922),
 ('area', 0.27160496430845793),
 ('park', 0.2721631584416362),
 ('lakes', 0.27520908260136945),
 ('conservation', 0.29234763001228126),
 ('trees', 0.2929180807085626)]

In [168]:
# Vector addition: entries closest to the sum of the "mountain" and "river" embeddings.
find_nearest(emb('mountain') + emb('river'), nr=5)

[('river', 0.06845535073480957),
 ('mountain', 0.1017494062708616),
 ('valley', 0.10408058167021905),
 ('mountains', 0.1415148316085736),
 ('creek', 0.1537117878705343)]

In [169]:
# Vector addition: entries closest to the sum of the "rain" and "winter" embeddings.
find_nearest(emb('rain') + emb('winter'), nr=3)

[('rain', 0.07472944318597274),
 ('winter', 0.08934650773599495),
 ('snow', 0.16260857183337096)]

In [170]:
# Analogy by vector arithmetic: boy - man + woman, reporting only the single nearest entry.
find_nearest(emb('boy') - emb('man') + emb('woman'), nr=1)

[('girl', 0.052300702916973085)]

In [171]:
# Reusable offset vector pointing from "man" to "woman".
feminine = emb('woman') - emb('man')
# Apply the offset to each base word and print one result list per line.
print(*(find_nearest(emb(base) + feminine, nr=2) for base in ('boy', 'king')), sep='\n')

[('girl', 0.052300702897155826), ('boy', 0.09373664668432202)]
[('king', 0.11401654124942395), ('queen', 0.1390418809506978)]


In [102]:
# Analogy by vector arithmetic: berlin - germany + france, reporting only the single nearest entry.
find_nearest(emb('berlin') - emb('germany') + emb('france'), nr=1)

[('paris', 0.08858787899312692)]

In [103]:
# Analogy by vector arithmetic: germany - europe + asia. The query's own input
# words are not excluded from the ranking, so they can appear among the results.
find_nearest(emb('germany') - emb('europe') + emb('asia'), nr=3)

[('germany', 0.14648232203948275),
 ('asia', 0.2135826984131749),
 ('japan', 0.22114189932144135)]

In [145]:
# Offset vector pointing from the singular "forest" to the plural "forests",
# applied to "car" to look for its plural form.
# NOTE(review): the output saved with this cell ('girl', 0.0523...) is identical
# to the boy + feminine result from another cell, and the execution counts are
# out of order, so it looks stale. Restart the kernel and re-run to refresh it.
plural = emb('forests') - emb('forest')
find_nearest(emb('car') + plural, nr=1)


[('girl', 0.052300702897155826)]

In [80]:
# Cluster the loaded embeddings into 25 groups, then check whether related
# words land in the same cluster. KMeans starts from randomly chosen centroids,
# so random_state is pinned to make the assignments reproducible across runs.
# The label ids themselves are arbitrary; only which words share a label matters.
kmeans = sklearn.cluster.KMeans(n_clusters=25, random_state=0).fit(vecs)
kmeans.predict([emb('king'), emb('queen'), emb('man'), emb('woman'), emb('car'), emb('motorcycle')])

array([24, 24,  0,  0, 18, 18], dtype=int32)