In [2]:
import gensim
from gensim.models import KeyedVectors
from operator import itemgetter
from tqdm import tqdm

In [3]:
# Short names of the embedding sets to compare; each is read from "<name>.words".
embeddings = ['deps', 'bow2', 'bow5']

# Load every embedding file (word2vec format) and index the result by its short name.
models = {}
for embedding_name in embeddings:
    vectors_path = '{}.words'.format(embedding_name)
    models[embedding_name] = KeyedVectors.load_word2vec_format(vectors_path)

In [None]:
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.palettes import d3
from bokeh.io import output_notebook
from bokeh.layouts import column

# Configure Bokeh so that the later show() call writes the plots to this HTML file.
output_file("cluster-output.html")

def get_model_plot(name, model, words, N=20, perplexity=30.0, random_state=None):
    """Build a Bokeh scatter plot of word vectors coloured by k-means cluster.

    The vectors of all ``words`` found in ``model`` are clustered with k-means
    in the original embedding space, projected to 2-D with t-SNE, and drawn as
    labelled points whose colour encodes the cluster id.

    Parameters
    ----------
    name : str
        Label of the embedding set; only used in the figure title.
    model : mapping-like
        Word-vector container supporting ``word in model`` and ``model[word]``
        (e.g. a gensim ``KeyedVectors``).
    words : iterable of str
        Candidate words; words missing from ``model`` are silently skipped.
    N : int, default 20
        Number of k-means clusters. Values above 20 are allowed; colours then
        repeat because the palette has 20 entries.
    perplexity : float, default 30.0
        Perplexity passed to t-SNE.
    random_state : int or None, default None
        Seed forwarded to both KMeans and TSNE. ``None`` keeps the original
        non-deterministic behaviour; pass an int for reproducible plots.

    Returns
    -------
    bokeh figure
        The configured figure (not yet shown).

    Raises
    ------
    ValueError
        If none of ``words`` is present in ``model``.
    """
    word_list = []
    vectors = []
    for word in words:
        # `in model` works on both gensim 3.x and 4.x, whereas the `.vocab`
        # attribute was removed in gensim 4.
        if word in model:
            word_list.append(word)
            vectors.append(model[word])

    # Fail early with a clear message instead of an obscure error from sklearn.
    if not vectors:
        raise ValueError(
            "none of the given words are present in the '{}' model".format(name))

    # Cluster in the full embedding space, before any dimensionality reduction.
    kmeans = KMeans(n_clusters=N, random_state=random_state)
    kmeans.fit(vectors)
    clusters = kmeans.labels_

    # 2-D projection used only for the plot coordinates.
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=random_state)
    tsne_vectors = tsne.fit_transform(vectors)

    # NOTE(review): the title hard-codes "2000 nouns" regardless of how many
    # words were actually found in the model.
    p = figure(tools="pan,wheel_zoom,reset,save",
               toolbar_location="above",
               title="T-SNE for 2000 nouns with {}.words".format(name))

    # Always use the full 20-colour palette: d3['Category20'] only has keys
    # 3..20, so indexing it with N raised KeyError for N < 3 or N > 20.
    # Assumes the smaller Category20 palettes are prefixes of this one, so
    # colours for N in 3..20 are unchanged - TODO confirm against Bokeh.
    palette = d3['Category20'][20]
    colors = [palette[label % len(palette)] for label in clusters]

    source = ColumnDataSource(data=dict(x1=tsne_vectors[:, 0],
                                        x2=tsne_vectors[:, 1],
                                        names=word_list,
                                        colors=colors))

    p.scatter(x="x1", y="x2", size=8, source=source, color='colors')

    # Draw each word slightly above its point.
    labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                      text_font_size="8pt", text_color="#555555",
                      source=source, text_align='center')
    p.add_layout(labels)

    return p

# Number of k-means clusters used for every embedding's plot.
N = 20

# Presumably one noun per line (verify against the data file). Strip
# surrounding whitespace and drop blank lines so that an empty string is
# never looked up in the models.
with open("2000_nouns_sorted.txt") as noun_file:
    words = [line.strip() for line in noun_file if line.strip()]

# Build one figure per embedding set and lay them out in a single column.
p_list = []
for name, model in models.items():
    # Pass N explicitly; previously the module-level N was defined but unused,
    # so changing it had no effect on the plots.
    p_list.append(get_model_plot(name, model, words, N=N))
show(column(p_list))

{'bow2': <gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7f337a263438>,
 'bow5': <gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7f336b918e48>,
 'deps': <gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7f33c17959b0>}

0.2699586525433222