# Visualizing Word Embeddings with TensorBoard

In [17]:
import numpy as np
import tensorflow as tf
import os
import zipfile
from tensorflow.contrib.tensorboard.plugins import projector
import csv


## Read the GloVe file

First, download the GloVe word embeddings (`glove.6B.zip`) from this [website](https://nlp.stanford.edu/projects/glove/). We then read the first 50,000 words and their vectors from the GloVe file. We will be using 50-dimensional word vectors.

In [18]:
# Number of words (rows) to keep from the top of the GloVe file
vocabulary_size = 50000
# Dimensionality of the vectors stored in glove.6B.50d.txt
embedding_size = 50

# Pre-allocate the embedding matrix. np.empty does not initialize memory,
# so every row must be overwritten by the loop below (checked afterwards).
pret_embeddings = np.empty(shape=(vocabulary_size, embedding_size), dtype=np.float32)

# words[i] is the token whose vector is stored in pret_embeddings[i]
words = []

word_idx = 0
# Open the zip file
with zipfile.ZipFile('glove.6B.zip') as glovezip:
    # Read the file with 50 dimensional embeddings
    with glovezip.open('glove.6B.50d.txt') as glovefile:
        # Read line by line (the archive member yields bytes)
        for li, line in enumerate(glovefile):
            # Print progress: one dot per 10000 lines read
            if (li + 1) % 10000 == 0:
                print('.', end='')

            # Split on single spaces: the first token is the word,
            # the remaining tokens are the vector components
            line_tokens = line.decode('utf-8').split(' ')
            word = line_tokens[0]
            vector = [float(v) for v in line_tokens[1:]]

            assert len(vector) == embedding_size, (
                'line %d: expected %d values, got %d' % (li + 1, embedding_size, len(vector)))
            words.append(word)
            # Update the embedding matrix
            pret_embeddings[word_idx, :] = np.array(vector)
            word_idx += 1
            # Stop once the first vocabulary_size words have been read
            if word_idx == vocabulary_size:
                break

# Fail loudly if the file was shorter than expected: any rows not written
# above would still hold the uninitialized contents left by np.empty.
if word_idx != vocabulary_size:
    raise ValueError(
        'Expected %d word vectors but the file only contained %d'
        % (vocabulary_size, word_idx))

print('\tDone')

.....	Done


## Create TensorFlow Variable

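Here we create a TensorFlow variable to store the embeddings we read above and save it to disk as a checkpoint. This is necessary because the visualization reads the embeddings from the saved checkpoint. We also write a metadata file containing the word that labels each row.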

In [19]:
# Create a directory to save our model (no error if it already exists)
log_dir = 'models'
os.makedirs(log_dir, exist_ok=True)

# Clear the default graph before defining the variable below
tf.reset_default_graph()

# Create a TensorFlow variable initialized with the word embeddings we just read in
embeddings = tf.get_variable('embeddings', shape=[vocabulary_size, 50],
                             initializer=tf.constant_initializer(pret_embeddings))

session = tf.InteractiveSession()

# Run the initializer so the variable actually holds the pretrained values
tf.global_variables_initializer().run()

# Define a saver that will save the TensorFlow variable under the name 'embeddings'
saver = tf.train.Saver({'embeddings': embeddings})
# Save the checkpoint into log_dir (global step 0)
saver.save(session, os.path.join(log_dir, "model.ckpt"), 0)

# Define metadata for word embeddings: a header row followed by one
# (word, row index) line per word, in the same order as the embedding rows.
# newline='' is required by the csv module; without it the writer's '\r\n'
# line terminator is translated to '\r\r\n' on Windows, producing blank rows.
with open(os.path.join(log_dir, 'metadata.tsv'), 'w', encoding='utf-8', newline='') as csvfile:
    # With QUOTE_MINIMAL and quotechar='|', only fields containing a tab,
    # a '|' or a line-terminator character get quoted.
    writer = csv.writer(csvfile, delimiter='\t',
                        quotechar='|', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['Word', 'Word ID'])
    for wi, w in enumerate(words):
        writer.writerow([w, wi])

## Define the configuration that tells TensorBoard where to look and what to load

In [20]:
# Writer bound to the same directory the checkpoint was saved in.
summary_writer = tf.summary.FileWriter(log_dir)

# Projector configuration; several embeddings could be registered,
# but only one is added here.
config = projector.ProjectorConfig()
embedding_config = config.embeddings.add(
    # Name of the tensor to visualize.
    tensor_name=embeddings.name,
    # Metadata (labels) file for that tensor.
    # NOTE(review): presumably resolved relative to the log directory --
    # confirm against the TensorBoard version in use.
    metadata_path='metadata.tsv',
)

# Writes projector_config.pbtxt into the writer's log directory;
# TensorBoard reads this file during startup.
projector.visualize_embeddings(summary_writer, config)