## Visualising words in semantic space

## Dimensionality Reduction Algorithm - t-SNE

In [1]:
## Load the word vectors produced by the previous step.
# NOTE(review): the file is read with gensim's word2vec-format loader using its
# default arguments -- confirm the previous step saved the vectors in that format.
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format('~/imdb/models/model.wv')

In [2]:
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

%matplotlib inline

import argparse
import logging
import time
import codecs
from sklearn.cluster import KMeans


# importing bokeh library for interactive dataviz
import bokeh.plotting as bp
import bokeh.models as bmo
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook
from bokeh.palettes import d3
from bokeh.models import ColumnDataSource, CategoricalColorMapper


# Root logger: "<timestamp> : <level> : <message>" lines, WARNING and above only.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level = logging.WARNING)

In [3]:
# Collect every entry of the model's vocabulary into a plain list
vocab = [word for word in model.wv.vocab]

In [25]:
def _cluster_colormap(cluster_ids):
    """Return a ``{cluster id: colour}`` mapping taken from bokeh's d3 Category20 palette.

    Colours are assigned by the *rank* of each distinct id, not by the id itself, so a
    subset that only contains e.g. clusters {0, 3, 8} still receives valid colours.
    When every cluster 0..k-1 is present the rank equals the id.
    """
    distinct_ids = sorted(set(cluster_ids))
    # Category20 only provides palettes of size 3..20: clamp the lookup into that
    # range and wrap around when there are more than 20 distinct clusters.
    palette = d3['Category20'][min(20, max(3, len(distinct_ids)))]
    return {cluster_id: palette[rank % len(palette)]
            for rank, cluster_id in enumerate(distinct_ids)}


def plot_clust_tsne(model, clusters = 9, words_number = 400, output_dir = "~/imdb"):
    """Cluster the word vectors with k-means and plot a 2-D t-SNE map coloured by cluster.

    Parameters
    ----------
    model : gensim model exposing ``wv.syn0``, ``wv.index2word`` and ``wv.vocab``
        The trained word-embedding model.
    clusters : int, default 9
        Number of k-means clusters computed over the full vocabulary.
    words_number : int, default 400
        Number of vocabulary words projected with t-SNE and plotted.
    output_dir : str, default "~/imdb"
        Directory (``~`` is expanded) receiving ``output_cluster`` (tab-separated
        word/cluster table) and ``tsne_df_to_r.csv`` (plotted coordinates).

    Returns
    -------
    None
        The function prints progress, writes the two files above and displays an
        interactive bokeh scatter plot in the notebook.
    """
    # Local imports: neither module is imported at the top of the notebook.
    import os
    import pandas as pd

    start = time.time()
    print("Load word2vec model ... ", end = "", flush = True)
    w2v_model = model
    print("finished in {:.2f} sec.".format(time.time() - start), flush = True)
    word_vectors = w2v_model.wv.syn0
    n_words = word_vectors.shape[0]
    vec_size = word_vectors.shape[1]
    print("#words = {0}, vector size = {1}".format(n_words, vec_size))

    ####### k-means over the whole vocabulary ######
    start = time.time()
    print("Compute clustering ... ", end = "", flush = True)
    kmeans = KMeans(n_clusters = clusters, n_jobs=-1, random_state = 0)
    idx = kmeans.fit_predict(word_vectors)
    print("finished in {:.2f} sec.".format(time.time() - start), flush = True)

    ####### persist the word -> cluster table ######
    start = time.time()
    print("Generate output file ... ", end = "", flush = True)
    word_centroid_list_sort = sorted(zip(w2v_model.wv.index2word, idx),
                                     key = lambda el: el[1])

    out_dir = os.path.expanduser(output_dir)
    cluster_path = os.path.join(out_dir, "output_cluster")
    # `with` guarantees the handle is closed even if a write fails.
    with codecs.open(cluster_path, "w", encoding='utf-8', errors='ignore') as file_out:
        # A newline-terminated header keeps the first data row on its own line, so
        # readers that skip one row no longer lose the first word.
        file_out.write("words\tcluster\n")
        for word, cluster_id in word_centroid_list_sort:
            file_out.write(word + '\t' + str(cluster_id) + '\n')
    print("finished in {:.2f} sec.".format(time.time() - start), flush = True)

    # Build the lookup table in memory rather than re-reading the file: a CSV
    # round-trip would turn words such as "null" or "NA" into NaN, and it made the
    # function depend on the write and read paths being the same directory.
    cluster_df = pd.DataFrame({
        "words": [word for word, _ in word_centroid_list_sort],
        "cluster": [int(cluster_id) for _, cluster_id in word_centroid_list_sort],
    })

    ####### t-SNE ######
    # Only the first `words_number` vocabulary entries are projected and plotted.
    plotted_words = list(w2v_model.wv.vocab.keys())[:words_number]
    plotted_vectors = [w2v_model[w] for w in plotted_words]

    # Reduce the embedding to two dimensions so it can be drawn on a plane.
    tsne_model = TSNE(n_components=2, verbose = 1, random_state = 0)
    embedding = tsne_model.fit_transform(plotted_vectors)

    tsne_df = pd.DataFrame(embedding, columns=['x', 'y'])
    tsne_df['words'] = plotted_words
    tsne_df = pd.merge(tsne_df, cluster_df, how = "inner", on = "words")

    # One colour per distinct cluster that is actually present in the plotted subset.
    colormap = _cluster_colormap(tsne_df['cluster'])
    tsne_df['color'] = tsne_df['cluster'].map(colormap)

    tsne_df.to_csv(os.path.join(out_dir, 'tsne_df_to_r.csv'))
    source = ColumnDataSource(dict(tsne_df)) # columns only, the index is not exported

    # defining the chart
    output_notebook()
    plot_tfidf = bp.figure(plot_width = 700, plot_height = 600, title = "Mapping word vectors",
    tools="pan, wheel_zoom, box_zoom, reset, hover, previewsave",
    x_axis_type = None, y_axis_type = None, min_border = 1)

    # Interactivity of Bokeh: hovering a point shows the word it represents.
    plot_tfidf.scatter(x ='x',
                   y ='y',
                   color = 'color',
                   legend = 'cluster',
                   source = source)
    hover = plot_tfidf.select(dict(type = HoverTool))
    hover.tooltips = {"word": "@words"}

    show(plot_tfidf)
    

In [26]:
plot_clust_tsne(model) # words_number and clusters can be passed to override the defaults
## defaults are words_number = 400 and clusters = 9

Load word2vec model ... finished in 0.00 sec.
#words = 7048, vector size = 150
Compute clustering ... finished in 2.07 sec.
Generate output file ... finished in 0.02 sec.
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 400 samples in 0.001s...
[t-SNE] Computed neighbors for 400 samples in 0.047s...
[t-SNE] Computed conditional probabilities for sample 400 / 400
[t-SNE] Mean sigma: 0.662279
[t-SNE] KL divergence after 250 iterations with early exaggeration: 97.513031
[t-SNE] Error after 1000 iterations: 1.400862
