This notebook is based on this TensorFlow [tutorial](https://www.tensorflow.org/versions/r0.10/tutorials/word2vec/index.html#vector-representations-of-words). It has been modified to use a SummaryWriter so we can track the training process using TensorBoard. For a good introduction to TensorBoard, see this [tutorial](https://www.tensorflow.org/versions/r0.10/how_tos/summaries_and_tensorboard/index.html).

In [None]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
%matplotlib inline
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os
import random
import time
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

from sklearn.manifold import TSNE

Download the text corpus.

In [None]:
# Base URL of the corpus host; maybe_download() appends the requested filename to it.
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Fetch `filename` unless it already exists locally, then check its size.

  When the file is missing it is retrieved from the module-level `url` joined
  with `filename` and stored under the same name.

  Args:
    filename: Local path (also the remote file name) of the file.
    expected_bytes: Exact size in bytes the file must have.

  Returns:
    The path of the verified file.

  Raises:
    Exception: If the size on disk differs from `expected_bytes`; the actual
      size is printed first.
  """
  already_present = os.path.exists(filename)
  if not already_present:
    filename, _ = urllib.request.urlretrieve(url + filename, filename)

  actual_bytes = os.stat(filename).st_size
  if actual_bytes != expected_bytes:
    # Show the size we actually got to help diagnose a truncated download.
    print(actual_bytes)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')

  print('Found and verified %s' % filename)
  return filename

# Fetch text8.zip (or reuse a local copy) and require it to be exactly 31344016 bytes.
filename = maybe_download('text8.zip', 31344016)

Read the data into a string.

In [None]:
def read_data(filename):
  """Return the first member of a zip archive as a list of whitespace-separated tokens.

  Args:
    filename: Path to the zip archive.

  Returns:
    A list of strings obtained by splitting the member's text on whitespace.
  """
  with zipfile.ZipFile(filename) as archive:
    first_member = archive.namelist()[0]
    raw_bytes = archive.read(first_member)
  return tf.compat.as_str(raw_bytes).split()
  
# Load the first archive member as a list of tokens and report how many there are.
words = read_data(filename)
print('Data size %d' % len(words))

Build the dictionary and replace rare words with the UNK token.

In [None]:
# Vocabulary size used by build_dataset: slot 0 is 'UNK', the rest are the
# (vocabulary_size - 1) most common words.
vocabulary_size = 50000

def build_dataset(words):
  """Map a token sequence to integer ids, folding rare words into 'UNK'.

  The vocabulary consists of 'UNK' (id 0) followed by the
  `vocabulary_size - 1` most common words in frequency order.

  Args:
    words: Sequence of word strings.

  Returns:
    A tuple `(data, count, dictionary, reverse_dictionary)` where
      data: list of ids, one per input word (0 for out-of-vocabulary words).
      count: `['UNK', n_unknown]` followed by `(word, frequency)` pairs.
      dictionary: word -> id.
      reverse_dictionary: id -> word.
  """
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))

  # Ids are handed out in the order the entries appear in `count`.
  dictionary = {}
  for entry in count:
    dictionary[entry[0]] = len(dictionary)

  data = []
  unk_count = 0
  for token in words:
    token_id = dictionary.get(token)
    if token_id is None:
      token_id = 0  # dictionary['UNK']
      unk_count += 1
    data.append(token_id)

  # Replace the -1 placeholder with the real number of unknown tokens.
  count[0][1] = unk_count
  reverse_dictionary = {token_id: token for token, token_id in dictionary.items()}
  return data, count, dictionary, reverse_dictionary

# Index the corpus: `data` is the corpus as word ids, `count` holds the
# word/frequency entries with UNK first, and the two dictionaries map
# word -> id and id -> word.
data, count, dictionary, reverse_dictionary = build_dataset(words)
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
del words  # Hint to reduce memory.

Function to generate a training batch for the skip-gram model.

In [None]:
# Read position in `data` shared across generate_batch() calls; the function
# advances it and wraps around at len(data).
data_index = 0

def generate_batch(batch_size, num_skips, skip_window):
  """Produce one skip-gram training batch from the global `data` stream.

  A window of `2 * skip_window + 1` word ids slides over `data`, starting at
  the global `data_index`. For each window position the centre word is
  emitted `num_skips` times, each time paired with a different randomly
  chosen context word from the same window.

  Args:
    batch_size: Number of (input, label) pairs to produce; must be a
      multiple of `num_skips`.
    num_skips: How many context words to sample per centre word; at most
      `2 * skip_window`.
    skip_window: Number of words considered on each side of the centre word.

  Returns:
    A tuple `(batch, labels)`: `batch` is an int32 array of shape
    `(batch_size,)` holding centre-word ids and `labels` is an int32 array of
    shape `(batch_size, 1)` holding the paired context-word ids.

  Side effects:
    Advances the global `data_index` (wrapping at `len(data)`) so that the
    next call continues with the centre word right after the last one used.
  """
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)
  # Prime the sliding window with the first `span` words.
  for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  for i in range(batch_size // num_skips):
    target = skip_window  # target label at the center of the buffer
    targets_to_avoid = [skip_window]
    for j in range(num_skips):
      # Re-draw until we hit a window slot that is neither the centre word
      # nor a context word already used for this centre.
      while target in targets_to_avoid:
        target = random.randint(0, span - 1)
      targets_to_avoid.append(target)
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[target]
    # Slide the window one word to the right (the deque drops the oldest).
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  # The window was read `span` words ahead of the first centre word. Rewind by
  # that amount so the next call re-primes its window right after the last
  # centre word instead of silently skipping `span` words between batches.
  data_index = (data_index + len(data) - span) % len(data)
  return batch, labels

# Demo: show the first few corpus words, then the (input, label) pairs that
# generate_batch yields for two different window settings.
print('data:', [reverse_dictionary[di] for di in data[:8]])

for num_skips, skip_window in [(2, 1), (4, 2)]:
    # Restart from the beginning of the corpus so both runs see the same words.
    data_index = 0
    batch, labels = generate_batch(batch_size=8, num_skips=num_skips, skip_window=skip_window)
    print('\nwith num_skips = %d and skip_window = %d:' % (num_skips, skip_window))
    print('    batch:', [reverse_dictionary[bi] for bi in batch])
    # labels has shape (8, 1); flatten it to look up one word per entry.
    print('    labels:', [reverse_dictionary[li] for li in labels.reshape(8)])

Build and train a skip-gram model.

In [None]:
# Hyperparameters for the skip-gram model and its periodic similarity check.
batch_size = 128      # Number of (input, label) pairs fed per training step.
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
# Draw valid_size distinct ids from [0, valid_window).
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64    # Number of negative examples to sample.

# Build the computation graph: embedding lookup, NCE loss, SGD step, a cosine
# similarity op for the validation words, and the TensorBoard loss summary.
graph = tf.Graph()

with graph.as_default():

  # Input data.
  train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

  # Ops and variables pinned to the CPU because of missing GPU implementation
  with tf.device('/cpu:0'):
    # Look up embeddings for inputs.
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Construct the variables for the NCE loss
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

  # Compute the average NCE loss for the batch.
  # tf.nn.nce_loss automatically draws a new sample of the negative labels each
  # time we evaluate the loss.
  # Arguments are passed by keyword: the positional order of `inputs` and
  # `labels` differs between TensorFlow releases, and keywords keep the call
  # correct (and readable) regardless of which order is in effect.
  loss = tf.reduce_mean(
      tf.nn.nce_loss(weights=nce_weights,
                     biases=nce_biases,
                     inputs=embed,
                     labels=train_labels,
                     num_sampled=num_sampled,
                     num_classes=vocabulary_size))

  # Construct the SGD optimizer using a learning rate of 1.0.
  optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

  # Compute the cosine similarity between the validation words and all
  # embeddings: rows are L2-normalized, so the dot product is the cosine.
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(
      normalized_embeddings, valid_dataset)
  similarity = tf.matmul(
      valid_embeddings, normalized_embeddings, transpose_b=True)

  # Define info to be used by the SummaryWriter. This will let TensorBoard
  # plot loss values during the training process.
  loss_summary = tf.scalar_summary("loss", loss)
  train_summary_op = tf.merge_summary([loss_summary])

  # Add variable initializer.
  init = tf.initialize_all_variables()
  print("finished building graph.")

In [None]:
# Begin training.
num_steps = 100001

# The session stays open after this cell: later cells call .eval() on graph
# tensors and rely on it being the default session.
session = tf.InteractiveSession(graph=graph)

# We must initialize all variables before we use them.
init.run()
print("Initialized")

# Directory in which to write summary information.
# You can point TensorBoard to this directory via:
# $ tensorboard --logdir=/tmp/word2vec_basic/summaries
# Tensorflow assumes this directory already exists, so we need to create it.
# Each run gets its own timestamped subdirectory.
timestamp = str(int(time.time()))
summary_dir = os.path.join("/tmp/word2vec_basic", "summaries", timestamp)
if not os.path.exists(summary_dir):
  os.makedirs(summary_dir)
# Create the SummaryWriter
train_summary_writer = tf.train.SummaryWriter(summary_dir, session.graph)

top_k = 8  # number of nearest neighbors
average_loss = 0
try:
  for step in xrange(num_steps):
    batch_inputs, batch_labels = generate_batch(
        batch_size, num_skips, skip_window)
    feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

    # We perform one update step by evaluating the optimizer op (including it
    # in the list of returned values for session.run()).
    # Also evaluate the training summary op.
    _, loss_val, tsummary = session.run(
        [optimizer, loss, train_summary_op],
        feed_dict=feed_dict)
    average_loss += loss_val
    # Write the evaluated summary info to the SummaryWriter. This info will
    # then show up in the TensorBoard events.
    train_summary_writer.add_summary(tsummary, step)

    if step % 2000 == 0:
      if step > 0:
        average_loss /= 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print("Average loss at step ", step, ": ", average_loss)
      average_loss = 0

    # Note that this is expensive (~20% slowdown if computed every 500 steps)
    if step % 10000 == 0:
      sim = similarity.eval()
      for i in xrange(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        # Sort by descending similarity; position 0 is skipped because it is
        # expected to be the word itself.
        nearest = (-sim[i, :]).argsort()[1:top_k + 1]
        log_str = "Nearest to %s:" % valid_word
        for k in xrange(top_k):
          close_word = reverse_dictionary[nearest[k]]
          log_str = "%s %s," % (log_str, close_word)
        print(log_str)
finally:
  # Flush and close the event file even if training is interrupted, so the
  # last summaries are not lost from TensorBoard.
  train_summary_writer.close()
final_embeddings = normalized_embeddings.eval()
print("finished training.")

# Start TensorBoard

Start TensorBoard while the training is running (or after it's done) by pointing it to the directory in which the summaries were written. The training cell writes each run to its own timestamped subdirectory of this location:

```sh
$ tensorboard --logdir=/tmp/word2vec_basic/summaries
```

Open a browser to `http://localhost:6006/`.

![TensorBoard](images/tensorboard1.png)

In [None]:
# Visualize the embeddings.
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
  """Scatter-plot 2-D points, annotate each with its label, and save the figure.

  Args:
    low_dim_embs: Array whose first `len(labels)` rows are (x, y) coordinates.
    labels: One text label per plotted row.
    filename: Path the figure is saved to.

  Raises:
    AssertionError: If there are more labels than rows in `low_dim_embs`.
  """
  assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
  plt.figure(figsize=(18, 18))  # in inches

  # Same placement for every annotation: slightly offset from the point.
  annotation_style = dict(xytext=(5, 2),
                          textcoords='offset points',
                          ha='right',
                          va='bottom')
  for row, text in enumerate(labels):
    x, y = low_dim_embs[row, :]
    plt.scatter(x, y)
    plt.annotate(text, xy=(x, y), **annotation_style)

  plt.savefig(filename)


In [None]:
# Project the most frequent words' embeddings to 2-D with t-SNE and plot them.
# Only the imports are guarded: an ImportError raised while fitting or
# plotting is a different problem and should surface instead of being reported
# as a missing package.
try:
  from sklearn.manifold import TSNE
  import matplotlib.pyplot as plt
except ImportError:
  print("Please install sklearn and matplotlib to visualize embeddings.")
else:
  tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
  plot_only = 500  # ids 0..499, i.e. UNK plus the most frequent words
  low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
  labels = [reverse_dictionary[i] for i in xrange(plot_only)]
  plot_with_labels(low_dim_embs, labels)

## How to find the 'nearby' words for a given word

Here's a function to find the 'nearby' words for a specific word.

E.g., picking "six" as the word may give a result like this after about 100K training steps (for higher accuracy, try running for 500K):

```

Found word six at index 22
Nearest to six:
seven
four
eight
five
nine
three
two
zero
```

In [None]:
# Look up the query word's id; a word missing from `dictionary` raises KeyError.
test_word = 'six'
test_word_idx = dictionary[test_word]
print("Found word {} at index {}".format(test_word, test_word_idx))

# Build ops giving the dot product between the query word's normalized
# embedding and every normalized embedding (one row, one column per word).
test_embeddings = tf.nn.embedding_lookup(normalized_embeddings, [test_word_idx])
test_similarity = tf.matmul(test_embeddings, normalized_embeddings, transpose_b=True)

top_k = 8  # number of nearest neighbors

# Extra: eval the 'test word' similarity
sim = test_similarity.eval()
# Sort by descending similarity; position 0 is skipped because it is expected
# to be the query word itself.
nearest = (-sim[0, :]).argsort()[1:top_k + 1]

print("Nearest to {}:".format(test_word))
for k in xrange(top_k):
  close_word = reverse_dictionary[nearest[k]]
  print (close_word)