Deep Learning
=============

Assignment 5
------------

The goal of this assignment is to train a Word2Vec skip-gram model over [Text8](http://mattmahoney.net/dc/textdata) data.

In [0]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE

%matplotlib inline

Download the data from the source website if necessary.

In [3]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):
    filename, _ = urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified %s' % filename)
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


Read the data into a string.

In [4]:
def read_data(filename):
  """Extract the first file enclosed in a zip file as a list of words"""
  with zipfile.ZipFile(filename) as f:
    data = tf.compat.as_str(f.read(f.namelist()[0])).split()
  return data
  
words = read_data(filename)
print('Data size %d' % len(words))

Data size 17005207


Build the dictionary and replace rare words with UNK token.

In [5]:
vocabulary_size = 200000

def build_dataset(words):
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
  dictionary = dict()
  for word, _ in count:
    dictionary[word] = len(dictionary)
  data = list()
  unk_count = 0
  for word in words:
    if word in dictionary:
      index = dictionary[word]
    else:
      index = 0  # dictionary['UNK']
      unk_count = unk_count + 1
    data.append(index)
  count[0][1] = unk_count
  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 
  return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10])
del words  # Hint to reduce memory.

Most common words (+UNK) [['UNK', 53855], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]


Function to generate a training batch for the skip-gram model.

In [6]:
data_index = 0

def generate_batch(batch_size, num_skips, skip_window):
  global data_index
  assert batch_size % num_skips == 0
    
  assert num_skips <= 2 * skip_window

  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

  span = 2 * skip_window + 1 # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)

  for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  for i in range(batch_size // num_skips):
    target = skip_window  # target label at the center of the buffer
    targets_to_avoid = [ skip_window ] #make sure we don't pick the word itself as a label
    for j in range(num_skips): #num_skips = the number of times we want to use the same input to generate a label
      #make sure we don't pick the same label twice for a word
      while target in targets_to_avoid:
        target = random.randint(0, span - 1)
      targets_to_avoid.append(target)
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[target]
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  return batch, labels

print('data:', [reverse_dictionary[di] for di in data[:8]])

for num_skips, skip_window in [(2, 1), (4, 2)]:
    data_index = 0
    batch, labels = generate_batch(batch_size=8, num_skips=num_skips, skip_window=skip_window)
    print('\nwith num_skips = %d and skip_window = %d:' % (num_skips, skip_window))
    print('    batch:', [reverse_dictionary[bi] for bi in batch])
    print('    labels:', [reverse_dictionary[li] for li in labels.reshape(8)])

data: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first']

with num_skips = 2 and skip_window = 1:
    batch: ['originated', 'originated', 'as', 'as', 'a', 'a', 'term', 'term']
    labels: ['as', 'anarchism', 'originated', 'a', 'as', 'term', 'a', 'of']

with num_skips = 4 and skip_window = 2:
    batch: ['as', 'as', 'as', 'as', 'a', 'a', 'a', 'a']
    labels: ['term', 'a', 'originated', 'anarchism', 'originated', 'as', 'of', 'term']


Train a skip-gram model.

In [7]:
batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.
num_skips = 2 # How many times to reuse an input to generate a label.
# We pick a random validation set to sample nearest neighbors. here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. 
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 64 # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default():

    # Input data.
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Variables.
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    softmax_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size],
                         stddev=1.0 / math.sqrt(embedding_size)))
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Model.
    # Look up embeddings for inputs.
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)
    # Compute the softmax loss, using a sample of the negative labels each time.
    loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases, inputs=embed,
                               labels=train_labels, num_sampled=num_sampled, num_classes=vocabulary_size))

    # Optimizer.
    # Note: The optimizer will optimize the softmax_weights AND the embeddings.
    # This is because the embeddings are defined as a variable quantity and the
    # optimizer's `minimize` method will by default modify all variable quantities 
    # that contribute to the tensor it is passed.
    # See docs on `tf.train.Optimizer.minimize()` for more details.
    optimizer = tf.train.AdagradOptimizer(1).minimize(loss)

    # Compute the similarity between minibatch examples and all embeddings.
    # We use the cosine distance:
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



In [0]:
num_steps = 100001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    average_loss = 0
    
    for step in range(num_steps):
        batch_data, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        
        feed_dict = {train_dataset : batch_data, train_labels : batch_labels}
        
        _, l = session.run([optimizer, loss], feed_dict=feed_dict)
        
        average_loss += l
        if step % 2000 == 0:
            if step > 0:
                average_loss = average_loss / 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step %d: %f' % (step, average_loss))
            average_loss = 0
            
        # note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8 # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k+1]
                log = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log = '%s %s,' % (log, close_word)
                print(log)
    final_embeddings = normalized_embeddings.eval()

In [54]:
test_embed = final_embeddings
word = 'man'
queen = test_embed[dictionary[word]]

print(test_embed.shape)

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  
  similarity = tf.matmul(queen.reshape(1,128), tf.transpose(test_embed))
 
  similarity_eval = similarity.eval()
  
  top_k = 8 # number of nearest neighbors
  nearest = (-similarity_eval[0, :]).argsort()[1:top_k+1]
  print(nearest)
  log = 'Nearest to '+word+':'
  for k in range(top_k):
      close_word = reverse_dictionary[nearest[k]]
      log = '%s %s,' % (log, close_word)
  print(log)

(200000, 128)


----------------------------------------
Exception happened during processing of request from ('::ffff:127.0.0.1', 39152, 0, 0)
Traceback (most recent call last):
  File "/usr/lib/python3.6/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.6/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.6/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python3.6/socketserver.py", line 696, in __init__
    self.handle()
  File "/usr/lib/python3.6/http/server.py", line 418, in handle
    self.handle_one_request()
  File "/usr/lib/python3.6/http/server.py", line 406, in handle_one_request
    method()
  File "/usr/lib/python3.6/http/server.py", line 639, in do_GET
    self.copyfile(f, self.wfile)
  File "/usr/lib/python3.6/http/server.py", line 800, in copyfile
    shutil.copyfil

[   398   1013   1498 127765   7917 146152  13318   6098]
Nearest to man: person, woman, boy, chanute, glasgow, programmatical, trudeau, revenge,


In [51]:
import pickle
with open('skip-gram-embeddings-200000v-128e.pickle', 'wb') as handle:
    pickle.dump(final_embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
from google.colab import files
files.download('skip-gram-embeddings-200000v-128e.pickle')

KeyboardInterrupt: ignored

In [0]:
num_points = 400

tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points+1, :])

In [0]:
def plot(embeddings, labels):
  assert embeddings.shape[0] >= len(labels), 'More labels than embeddings'
  pylab.figure(figsize=(15,15))  # in inches
  for i, label in enumerate(labels):
    x, y = embeddings[i,:]
    pylab.scatter(x, y)
    pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',
                   ha='right', va='bottom')
  pylab.show()

words = [reverse_dictionary[i] for i in range(1, num_points+1)]
plot(two_d_embeddings, words)

---

Problem
-------

An alternative to skip-gram is another Word2Vec model called [CBOW](http://arxiv.org/abs/1301.3781) (Continuous Bag of Words). In the CBOW model, instead of predicting a context word from a word vector, you predict a word from the sum of all the word vectors in its context. Implement and evaluate a CBOW model trained on the text8 dataset.

---

In [6]:
data_index = 0

def generate_batch_cow(batch_size, R):
    global data_index
    buffer_index = 0
    
    assert R > 0

    batch = np.ndarray(shape=(batch_size, R*2), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    
    data_len = batch_size + 4
    buffer = collections.deque(maxlen=data_len)
    
    #fill our buffer with enough entries from the data input
    for _ in range(data_len):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
        
    for i in range(batch_size):
        target = R  # target label at the center of the buffer
        for j in range(2*R+1):
            if j==target:#skip over the middle word
                continue
            if j<target:#set first R words
                batch[i][j] = buffer[i + j]
            else:#the next R words will be offset by 1 in the vector
                batch[i][j-1] = buffer[i + j]
        labels[i] = buffer[i+target];
    return batch, labels

print('data encoded: ', data[:16])
print('data:', [reverse_dictionary[di] for di in data[:16]])

for R in range(1,3):
    data_index = 0
    batch, labels = generate_batch_cow(batch_size=8, R=R)
    print('\nwith R= %d: ' % (R))
    for j in range(0,batch.shape[0]):
        print('    batch ',j,': ', [reverse_dictionary[bi] for bi in batch[:][j]])
    print('    labels:', [reverse_dictionary[li] for li in labels.reshape(8)])

data encoded:  [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156, 128, 742, 477, 10572, 134, 1]
data: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the']

with R= 1: 
    batch  0 :  ['anarchism', 'as']
    batch  1 :  ['originated', 'a']
    batch  2 :  ['as', 'term']
    batch  3 :  ['a', 'of']
    batch  4 :  ['term', 'abuse']
    batch  5 :  ['of', 'first']
    batch  6 :  ['abuse', 'used']
    batch  7 :  ['first', 'against']
    labels: ['originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used']

with R= 2: 
    batch  0 :  ['anarchism', 'originated', 'a', 'term']
    batch  1 :  ['originated', 'as', 'term', 'of']
    batch  2 :  ['as', 'a', 'of', 'abuse']
    batch  3 :  ['a', 'term', 'abuse', 'first']
    batch  4 :  ['term', 'of', 'first', 'used']
    batch  5 :  ['of', 'abuse', 'used', 'against']
    batch  6 :  ['abuse', 'first', 'against', 'early']
    batch  7 :  ['first', 

In [7]:
batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
cbow_r = 2#how many words in the past and future do we retain
# We pick a random validation set to sample nearest neighbors. here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. 
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.ndarray(shape=(valid_size, cbow_r*2), dtype=np.int32)
valid_labels = np.empty(shape=(valid_size), dtype=np.int32)
num_sampled = 64 # Number of negative examples to sample.


for key,i in enumerate(random.sample(range(valid_window), valid_size)):
    target = cbow_r  # target label at the center of the buffer
    for j in range(2*cbow_r+1):
        if j==target:#skip over the middle word
            valid_labels[key] = data[i + j]
            continue
        if j<target:#set first R words
            valid_examples[key][j] = data[i + j]
        else:#the next R words will be offset by 1 in the vector
            valid_examples[key][j-1] = data[i + j]
   
#print(valid_examples)
#print(valid_labels)
with tf.device('/gpu:0'):
  graph = tf.Graph()

  with graph.as_default():

      # Input data.
      train_dataset = tf.placeholder(tf.int32, shape=[batch_size,cbow_r*2])
      train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
      valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

      # Variables.
      embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
      softmax_weights = tf.Variable(
          tf.truncated_normal([vocabulary_size, embedding_size],
                               stddev=1.0 / math.sqrt(embedding_size)))
      softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))

      global_step = tf.Variable(0, trainable=False)  # count the number of steps taken.
      decay_learning_rate = tf.train.exponential_decay(0.01, global_step, 1000, 0.96)

      # Model.
      # Look up embeddings for inputs.
     #train_dataset_reshaped = tf.reshape(train_dataset,(batch_size,embedding_size * cbow_r))
      #embed = tf.nn.embedding_lookup(embeddings, train_dataset)
      #embed = tf.reshape(embed, (batch_size,-1))
      embed = None
      for i in range(2*cbow_r):
          embed_batch = tf.nn.embedding_lookup( embeddings , train_dataset[:,i])
          emb_x,emb_y = embed_batch.get_shape().as_list()

          if embed is None:
              print('embed is None, hence reshaping  from %s to  (%s,%s)' %(embed_batch.get_shape(), emb_x, emb_y) )
              embed = tf.reshape(embed_batch, [emb_x, emb_y, 1])
          else:
              print('embed is not None, hence  concating earlier embed %s with current embed_batch'\
                        %(embed.get_shape()) )
              embed = tf.concat([embed,tf.reshape(embed_batch,[emb_x,emb_y,1])],2)

      embed =  tf.reduce_mean(embed,2,keepdims=False)

      # Compute the softmax loss, using a sample of the negative labels each time.
      loss = tf.reduce_mean(
      tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases, inputs=embed,
                                 labels=train_labels, num_sampled=num_sampled, num_classes=vocabulary_size))

      # Optimizer.
      # Note: The optimizer will optimize the softmax_weights AND the embeddings.
      # This is because the embeddings are defined as a variable quantity and the
      # optimizer's `minimize` method will by default modify all variable quantities 
      # that contribute to the tensor it is passed.
      # See docs on `tf.train.Optimizer.minimize()` for more details.
      optimizer = tf.train.AdamOptimizer(0.001)

      train_op = optimizer.minimize(loss, global_step = global_step)

      # Compute the similarity between minibatch examples and all embeddings.
      # We use the cosine distance:
      norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
      normalized_embeddings = embeddings / norm

      #valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
      #valid_embeddings = tf.reshape(valid_embeddings, (valid_size,-1))

      valid_embeddings = None
      for i in range(2*cbow_r):
          valid_embeddings_batch = tf.nn.embedding_lookup( embeddings , valid_dataset[:,i])
          emb_x,emb_y = valid_embeddings_batch.get_shape().as_list()

          if valid_embeddings is None:
              print('valid_embeddings is None, hence reshaping  from %s to  (%s,%s)' 
                    % (valid_embeddings_batch.get_shape(), emb_x, emb_y) )
              valid_embeddings = tf.reshape(valid_embeddings_batch, [emb_x, emb_y, 1])
          else:
              print('valid_embeddings is not None, hence  concating earlier embed %s with current embed_batch'\
                        %(valid_embeddings.get_shape()) )
              valid_embeddings = tf.concat([valid_embeddings,tf.reshape(valid_embeddings_batch,[emb_x,emb_y,1])],2)

      valid_embeddings =  tf.reduce_mean(valid_embeddings,2,keepdims=False)

      predictions = tf.nn.softmax(tf.matmul(valid_embeddings, tf.transpose(softmax_weights)) + softmax_biases)
      #logits = tf.matmul(valid_embeddings, tf.transpose(softmax_weights))
      #logits = tf.nn.bias_add(logits, softmax_biases)


      #valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
      #similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))

embed is None, hence reshaping  from (128, 128) to  (128,128)
embed is not None, hence  concating earlier embed (128, 128, 1) with current embed_batch
embed is not None, hence  concating earlier embed (128, 128, 2) with current embed_batch
embed is not None, hence  concating earlier embed (128, 128, 3) with current embed_batch
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.

valid_embeddings is None, hence reshaping  from (16, 128) to  (16,128)
valid_embeddings is not None, hence  concating earlier embed (16, 128, 1) with current embed_batch
valid_embeddings is not None, hence  concating earlier embed (16, 128, 2) with current embed_batch
valid_embeddings is not None, hence  concating earlier embed (16, 128, 3) with current embed_batch


In [0]:
with tf.Session(graph=graph) as session:
    params = tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)
    ids = tf.constant([0,2,3,4])
    embed = tf.nn.embedding_lookup(params,ids).eval()
    print(embed[0][0]+embed[1][0]+embed[2][0]+embed[3][0])
    embed = tf.reduce_mean(embed, 0, keepdims=False)
    print(embed)
    print(embed.shape)
    


0.8401964
Tensor("Mean_19:0", shape=(128,), dtype=float32)
(128,)


In [1]:
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isnâ€™t guaranteed
gpu = GPUs[0]
def printm():
 process = psutil.Process(os.getpid())
 print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
 print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

Collecting gputil
  Downloading https://files.pythonhosted.org/packages/45/99/837428d26b47ebd6b66d6e1b180e98ec4a557767a93a81a02ea9d6242611/GPUtil-1.3.0.tar.gz
Building wheels for collected packages: gputil
  Running setup.py bdist_wheel for gputil ... [?25l- done
[?25h  Stored in directory: /content/.cache/pip/wheels/17/0f/04/b79c006972335e35472c0b835ed52bfc0815258d409f560108
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.3.0
Collecting humanize
  Downloading https://files.pythonhosted.org/packages/8c/e0/e512e4ac6d091fc990bbe13f9e0378f34cf6eecd1c6c268c9e598dcf5bb9/humanize-0.5.1.tar.gz
Building wheels for collected packages: humanize
  Running setup.py bdist_wheel for humanize ... [?25l- done
[?25h  Stored in directory: /content/.cache/pip/wheels/69/86/6c/f8b8593bc273ec4b0c653d3827f7482bb2001a2781a73b7f44
Successfully built humanize
Installing collected packages: humanize
Successfully installed humanize-0.5.1
[<GPUtil.GPUtil.GPU 

In [0]:
!kill -9 -1

In [9]:
num_steps = 100001

config = tf.ConfigProto()
config.gpu_options.allow_growth = True

with tf.device('/gpu:0'):
  with tf.Session(graph=graph, config=config) as session:
      tf.global_variables_initializer().run()
      print('Initialized')
      average_loss = 0

      for step in range(num_steps):
          batch_data, batch_labels = generate_batch_cow(batch_size, cbow_r)

          feed_dict = {train_dataset : batch_data, train_labels : batch_labels}

          _, l = session.run([train_op, loss], feed_dict=feed_dict)

          average_loss += l
          if step % 2000 == 0:
              if step > 0:
                  average_loss = average_loss / 2000
              # The average loss is an estimate of the loss over the last 2000 batches.
              print('Average loss at step %d: %f' % (step, average_loss))
              average_loss = 0

          # note that this is expensive (~20% slowdown if computed every 500 steps)
          if step % 10000 == 0:
              pred = predictions.eval()
              for j in range(0,valid_examples.shape[0]):
                  print('    valid_examples ',j,': ', [reverse_dictionary[bi] for bi in valid_examples[:][j]], 
                        ':---------',reverse_dictionary[np.argmax(pred[j])],';------:', reverse_dictionary[valid_labels[j]])
      final_embeddings = normalized_embeddings.eval()

Initialized
Average loss at step 0: 8.722609
    valid_examples  0 :  ['and', 'the', 'culottes', 'of'] :--------- insurence ;------: sans
    valid_examples  1 :  ['without', 'archons', 'chief', 'king'] :--------- infamia ;------: ruler
    valid_examples  2 :  ['a', 'positive', 'by', 'self'] :--------- penalise ;------: label
    valid_examples  3 :  ['abolished', 'although', 'are', 'differing'] :--------- susitna ;------: there
    valid_examples  4 :  ['word', 'anarchism', 'derived', 'from'] :--------- nicety ;------: is
    valid_examples  5 :  ['first', 'used', 'early', 'working'] :--------- strawpedo ;------: against
    valid_examples  6 :  ['has', 'also', 'taken', 'up'] :--------- sordid ;------: been
    valid_examples  7 :  ['it', 'has', 'been', 'taken'] :--------- transferase ;------: also
    valid_examples  8 :  ['from', 'the', 'without', 'archons'] :--------- rapoport ;------: greek
    valid_examples  9 :  ['against', 'early', 'class', 'radicals'] :--------- chevalines ;

Average loss at step 42000: 3.124991
Average loss at step 44000: 3.082338
Average loss at step 46000: 3.070993
Average loss at step 48000: 3.020359
Average loss at step 50000: 3.004976
    valid_examples  0 :  ['and', 'the', 'culottes', 'of'] :--------- the ;------: sans
    valid_examples  1 :  ['without', 'archons', 'chief', 'king'] :--------- the ;------: ruler
    valid_examples  2 :  ['a', 'positive', 'by', 'self'] :--------- is ;------: label
    valid_examples  3 :  ['abolished', 'although', 'are', 'differing'] :--------- they ;------: there
    valid_examples  4 :  ['word', 'anarchism', 'derived', 'from'] :--------- is ;------: is
    valid_examples  5 :  ['first', 'used', 'early', 'working'] :--------- in ;------: against
    valid_examples  6 :  ['has', 'also', 'taken', 'up'] :--------- been ;------: been
    valid_examples  7 :  ['it', 'has', 'been', 'taken'] :--------- has ;------: also
    valid_examples  8 :  ['from', 'the', 'without', 'archons'] :--------- ranging ;-----

Average loss at step 92000: 2.873050
Average loss at step 94000: 2.839710
Average loss at step 96000: 2.779775
Average loss at step 98000: 2.441414
Average loss at step 100000: 2.574801
    valid_examples  0 :  ['and', 'the', 'culottes', 'of'] :--------- the ;------: sans
    valid_examples  1 :  ['without', 'archons', 'chief', 'king'] :--------- the ;------: ruler
    valid_examples  2 :  ['a', 'positive', 'by', 'self'] :--------- as ;------: label
    valid_examples  3 :  ['abolished', 'although', 'are', 'differing'] :--------- they ;------: there
    valid_examples  4 :  ['word', 'anarchism', 'derived', 'from'] :--------- hebrew ;------: is
    valid_examples  5 :  ['first', 'used', 'early', 'working'] :--------- the ;------: against
    valid_examples  6 :  ['has', 'also', 'taken', 'up'] :--------- been ;------: been
    valid_examples  7 :  ['it', 'has', 'been', 'taken'] :--------- had ;------: also
    valid_examples  8 :  ['from', 'the', 'without', 'archons'] :--------- time ;--