In [1]:
import numpy as np

%matplotlib notebook
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# gensim: word similarity package
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [None]:
# Convert the GloVe text file to the word2vec text format that
# KeyedVectors.load_word2vec_format() reads.
# datapath() and get_tmpfile() each take a required file-name argument;
# calling them with no arguments raises TypeError.
# NOTE(review): the two names below are placeholders -- set GLOVE_FILENAME to
# the GloVe file you actually use. datapath() resolves relative names against
# gensim's bundled test-data directory, so use an absolute path for vectors
# stored elsewhere -- TODO confirm the intended file.
GLOVE_FILENAME = 'glove.6B.100d.txt'
WORD2VEC_FILENAME = 'glove.6B.100d.word2vec.txt'

glove_file = datapath(GLOVE_FILENAME)
word2vec_glove_file = get_tmpfile(WORD2VEC_FILENAME)
glove2word2vec(glove_file, word2vec_glove_file)

In [None]:
# Load the converted pre-trained vectors into a KeyedVectors model.
model = KeyedVectors.load_word2vec_format(
    fname=word2vec_glove_file,
)

In [None]:
# example1: nearest neighbours of a single query word
model.most_similar(positive=['happy'])

In [None]:
# example2: the most dissimilar words to a given word
# The word is wrapped in a list on purpose: `negative` takes a list of keys,
# and a bare string may be iterated character by character ('h', 'a', 'p', ...)
# depending on the gensim version, silently querying the wrong keys.
model.most_similar(negative=['happy'])

In [None]:
# example3: king - man + woman = queen
# `negative` is given as a list, matching `positive`; a bare string such as
# 'man' may be iterated as the single-character keys 'm', 'a', 'n' depending
# on the gensim version.
result = model.most_similar(positive=['woman', 'king'], negative=['man'])
# Show only the top-ranked (word, similarity) pair.
print("{}: {:.4f}".format(*result[0]))

In [None]:
def analogy(x1, x2, y1):
    """Complete the analogy "x1 is to x2 as y1 is to ?".

    Queries the notebook-level ``model`` with ``y1`` and ``x2`` as positive
    terms and ``x1`` as the negative term, and returns the word of the
    top-ranked match.
    """
    candidates = model.most_similar(negative=[x1], positive=[y1, x2])
    top_match = candidates[0]
    # Each match is indexed as (word, ...); only the word is returned.
    return top_match[0]

In [None]:
# japanese - japan = austrian - austria  ->  austria + japanese - japan = austrian
analogy(x1='japan', x2='japanese', y1='austria')

In [None]:
# example4: pick out the word that fits least with the others
odd_one_out = model.doesnt_match("breakfast cereal dinner lunch".split())
print(odd_one_out)

In [None]:
# example5: display word vectors in 2D space using PCA
def display_pca_scatterplot(model, words=None, sample=0):
    """Scatter-plot word vectors projected onto their first two principal components.

    Args:
        model: Word-vector model supporting ``model[word]`` lookup. Its
            vocabulary is read from ``model.key_to_index`` when that attribute
            exists, otherwise from ``model.vocab``.
        words: Iterable of words to plot (a list or a NumPy array). If
            ``None``, the words are taken from the model's vocabulary.
        sample: When ``words`` is ``None`` and ``sample > 0``, plot this many
            words drawn with ``np.random.choice`` at its default settings (so
            the same word can be drawn more than once); otherwise plot the
            whole vocabulary.

    Returns:
        None. The points and labels are drawn on a new 6x6 matplotlib figure.
    """
    # Identity check, not `== None`: comparing a NumPy array with `==` is
    # elementwise, and the resulting array cannot be used as an `if` condition.
    if words is None:
        # Prefer the newer `key_to_index` mapping; fall back to `vocab`.
        vocab = getattr(model, 'key_to_index', None)
        if vocab is None:
            vocab = model.vocab
        if sample > 0:
            words = np.random.choice(list(vocab.keys()), sample)
        else:
            words = [word for word in vocab]

    word_vectors = np.array([model[w] for w in words])
    # Fit a full PCA and keep only the first two components for plotting.
    twodim = PCA().fit_transform(word_vectors)[:, :2]

    plt.figure(figsize=(6, 6))
    plt.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='r')
    for word, (x, y) in zip(words, twodim):
        # Offset each label slightly so it does not sit on top of its marker.
        plt.text(x+0.05, y+0.05, word)