# Word2vec with Tensorflow

https://adventuresinmachinelearning.com/word2vec-tutorial-tensorflow/

https://github.com/adventuresinML/adventures-in-ml-code/blob/master/tf_word2vec.py

In [1]:
import tensorflow as tf

## Prepare Data

In [2]:
# $ head -10000 uba.txt > uba-10000.txt
filename = '/home/francolq/tass2018/uba-10000.txt'
with open(filename) as f:
    data = f.read()

In [3]:
data2 = tf.compat.as_str(data)

In [4]:
import collections

def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

In [5]:
words = data2.split()
n_words = 10000
data, count, d, rd = build_dataset(words, n_words)

In [27]:
print(words[:10])
print([d[w] for w in words[:10]])
print(data[:10])
print([rd[i] for i in data[:10]])

['Mañana', 'me', 'voy', 'a', 'mi', 'casa', 'con', '10', 'kilos', 'más']
[172, 8, 46, 5, 27, 184, 22, 262, 5736, 89]
[172, 8, 46, 5, 27, 184, 22, 262, 5736, 89]
['Mañana', 'me', 'voy', 'a', 'mi', 'casa', 'con', '10', 'kilos', 'más']


In [7]:
import numpy as np
import random
import collections

data_index = 0
# generate batch data
def generate_batch(data, batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    context = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window input_word skip_window ]
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window  # input word at the center of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]  # this is the input word
            context[i * num_skips + j, 0] = buffer[target]  # these are the context words
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, context

In [8]:
batch_size = 128
skip_window = 1       # How many words to consider left and right.
num_skips = 2
batch, context = generate_batch(data, batch_size, num_skips, skip_window)

In [9]:
print([rd[i] for i in batch[:20]])
print([rd[i] for i in context[:20,0]])
#context[:,0].shape

['me', 'me', 'voy', 'voy', 'a', 'a', 'mi', 'mi', 'casa', 'casa', 'con', 'con', '10', '10', 'kilos', 'kilos', 'más', 'más', '😒', '😒']
['voy', 'Mañana', 'a', 'me', 'voy', 'mi', 'a', 'casa', 'mi', 'con', '10', 'casa', 'con', 'kilos', 'más', '10', '😒', 'kilos', '😂', 'más']


Here, batch will be the input (the center tokens), and context will be the output (the left and right contexts that must be predicted).

**Observations:**
- First token is not sampled. And last?
- Tweets are all together in a single sequence

## Model: step by step

In [10]:
embedding_size = 128  # Dimension of the embedding vector.
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_context = tf.placeholder(tf.int32, shape=[batch_size, 1])
# (here it wrongly said train_labels in the tutorial)
# valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

In [11]:
# Look up embeddings for inputs.
vocabulary_size = n_words  # 10000
embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
embed = tf.nn.embedding_lookup(embeddings, train_inputs)


Instructions for updating:
Colocations handled automatically by placer.


In [12]:
import math

# Construct the variables for the softmax
weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                          stddev=1.0 / math.sqrt(embedding_size)))
biases = tf.Variable(tf.zeros([vocabulary_size]))
hidden_out = tf.matmul(embed, tf.transpose(weights)) + biases

After this, we have:

- E (embed): vocab x dim
- W (weights): vocab x dim
- b (biases): vocab

out = E W^T + b


In [13]:
# convert train_context to a one-hot format
train_one_hot = tf.one_hot(train_context, vocabulary_size)
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hidden_out, 
    labels=train_one_hot))
# Construct the SGD optimizer using a learning rate of 1.0.
optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(cross_entropy)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

Instructions for updating:
Use tf.cast instead.


**And this i don't know, but it looks like it is only for validation:**

In [14]:
# Compute the cosine similarity between minibatch examples and all embeddings.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(
      normalized_embeddings, valid_dataset)
similarity = tf.matmul(
      valid_embeddings, normalized_embeddings, transpose_b=True)

Instructions for updating:
keep_dims is deprecated, use keepdims instead


NameError: name 'valid_dataset' is not defined

## Model all together

In [21]:
import math

graph = tf.Graph()
with graph.as_default():
    embedding_size = 128  # Dimension of the embedding vector.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_context = tf.placeholder(tf.int32, shape=[batch_size, 1])
    
    vocabulary_size = n_words  # 10000
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Construct the variables for the softmax
    weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                              stddev=1.0 / math.sqrt(embedding_size)))
    biases = tf.Variable(tf.zeros([vocabulary_size]))
    hidden_out = tf.matmul(embed, tf.transpose(weights)) + biases

    # convert train_context to a one-hot format
    train_one_hot = tf.one_hot(train_context, vocabulary_size)
    cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hidden_out, 
        labels=train_one_hot))
    # Construct the SGD optimizer using a learning rate of 1.0.
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(cross_entropy)

    init = tf.global_variables_initializer()


## Running

In [22]:
num_steps = 100
# init = tf.global_variables_initializer()  # already done

In [23]:
with tf.Session(graph=graph) as session:
  # We must initialize all variables before we use them.
  init.run()
  print('Initialized')

  average_loss = 0
  for step in range(num_steps):
    batch_inputs, batch_context = generate_batch(data,
        batch_size, num_skips, skip_window)
    feed_dict = {train_inputs: batch_inputs, train_context: batch_context}

    # We perform one update step by evaluating the optimizer op (including it
    # in the list of returned values for session.run()
    _, loss_val = session.run([optimizer, cross_entropy], feed_dict=feed_dict)
    average_loss += loss_val

    #if step % 2000 == 0:
    if True:
      if step > 0:
        average_loss /= 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print('Average loss at step ', step, ': ', average_loss)
      average_loss = 0

Initialized
Average loss at step  0 :  9.347723007202148
Average loss at step  1 :  0.004658615112304688
Average loss at step  2 :  0.004629480838775635
Average loss at step  3 :  0.00458070182800293
Average loss at step  4 :  0.004641133308410645
Average loss at step  5 :  0.004632509231567382
Average loss at step  6 :  0.004548101425170898
Average loss at step  7 :  0.004597013950347901
Average loss at step  8 :  0.0046786203384399415
Average loss at step  9 :  0.0046365785598754886
Average loss at step  10 :  0.004614619731903076
Average loss at step  11 :  0.004452562808990478
Average loss at step  12 :  0.0046217765808105465
Average loss at step  13 :  0.004466467380523682
Average loss at step  14 :  0.004539928436279297
Average loss at step  15 :  0.004534406661987305
Average loss at step  16 :  0.004396549224853516
Average loss at step  17 :  0.004445652961730957
Average loss at step  18 :  0.004428196907043457
Average loss at step  19 :  0.00458828067779541
Average loss at step

## Model with Negative Sampling

In [18]:
num_sampled = 64    # Number of negative examples to sample.

In [24]:
import math

with graph.as_default():
    # Construct the variables for the NCE loss
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
    nce_loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=train_context,
                       inputs=embed,
                       num_sampled=num_sampled,
                       num_classes=vocabulary_size))
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(nce_loss)

    init = tf.global_variables_initializer()


Here:

- nce_weights: vocab x dim
- nce_biases: vocab
- loss = ?

## Run again

In [25]:
with tf.Session(graph=graph) as session:
  # We must initialize all variables before we use them.
  init.run()
  print('Initialized')

  average_loss = 0
  for step in range(num_steps):
    batch_inputs, batch_context = generate_batch(data,
        batch_size, num_skips, skip_window)
    feed_dict = {train_inputs: batch_inputs, train_context: batch_context}

    # We perform one update step by evaluating the optimizer op (including it
    # in the list of returned values for session.run()
    _, loss_val = session.run([optimizer, cross_entropy], feed_dict=feed_dict)
    average_loss += loss_val

    #if step % 2000 == 0:
    if True:
      if step > 0:
        average_loss /= 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print('Average loss at step ', step, ': ', average_loss)
      average_loss = 0

Initialized
Average loss at step  0 :  9.3038330078125
Average loss at step  1 :  0.004653340339660645
Average loss at step  2 :  0.0046351137161254885
Average loss at step  3 :  0.0046837892532348635
Average loss at step  4 :  0.004674358367919922
Average loss at step  5 :  0.004685054779052735
Average loss at step  6 :  0.0047127151489257816
Average loss at step  7 :  0.004659594058990479
Average loss at step  8 :  0.004666184425354004
Average loss at step  9 :  0.004641903877258301
Average loss at step  10 :  0.004634509563446045
Average loss at step  11 :  0.004647600173950195
Average loss at step  12 :  0.004657378196716308
Average loss at step  13 :  0.00465419340133667
Average loss at step  14 :  0.004656907081604004
Average loss at step  15 :  0.004678123474121094
Average loss at step  16 :  0.004656991958618164
Average loss at step  17 :  0.004664521217346191
Average loss at step  18 :  0.0046551403999328615
Average loss at step  19 :  0.004666850090026855
Average loss at step

## Evaluation

TODO!