# Setup & Intro
Created with guidance from: https://www.tensorflow.org/tutorials/word2vec

- Import the nessary libraries
- Create the constants
- Setup the tensorflow interactive session

In [69]:
import math
import collections
import tensorflow as tf
import numpy as np

VOCABULARY_SIZE = 5000
EMBEDDING_SIZE = 128
BATCH_SIZE = 128
NUM_SAMPLED = 64

sess = tf.InteractiveSession()

def print_tensors(tensors: {str: tf.Variable}):
    init = tf.global_variables_initializer()
    sess.run(init)
    for name, tensor in tensors.items():
        print('{}:'.format(name))
        print(tensor)
        print(sess.run(tensor))

# Building the graph
- Define the `embeddings` matrix as a big random matrix to start, initalised as a uniform unit cube
- Define the `weights` between each word in the vocabulary and the embeddings
- Define the `biases` for each word in the vocabulary

In [39]:
embeddings = tf.Variable(tf.random_uniform([VOCABULARY_SIZE, EMBEDDING_SIZE], -1.0, 1.0))
weights = tf.Variable(tf.truncated_normal([VOCABULARY_SIZE, EMBEDDING_SIZE],
                                          stddev=1.0 / math.sqrt(EMBEDDING_SIZE)))
biases = tf.Variable(tf.zeros([VOCABULARY_SIZE]))

print_tensors({'Embeddings': embeddings, 'Weights': weights, 'Biases': biases})

Embeddings:
<tf.Variable 'Variable_33:0' shape=(100, 100) dtype=float32_ref>
[[-0.99959564  0.17514896  0.6023774  ... -0.7602339   0.20073867
  -0.20027566]
 [ 0.36964917 -0.20007443  0.08079648 ... -0.94977164 -0.8610573
   0.01886177]
 [-0.13537192 -0.02241683 -0.22921944 ...  0.02772141 -0.20781112
  -0.72739315]
 ...
 [ 0.76542735 -0.48729515  0.06938696 ... -0.43892407  0.47028303
   0.6520159 ]
 [ 0.32232738  0.5187137  -0.15069056 ...  0.21919918  0.78081894
  -0.6112175 ]
 [ 0.13921404  0.1475718  -0.07931185 ...  0.06942058 -0.08953094
  -0.8860445 ]]
Weights:
<tf.Variable 'Variable_34:0' shape=(100, 100) dtype=float32_ref>
[[-0.11367207  0.08951253  0.03935289 ... -0.05828243  0.07683927
   0.00589061]
 [-0.13877697 -0.03214059 -0.11897998 ... -0.04240558 -0.03036506
   0.17767867]
 [-0.16628455 -0.07077862  0.01699749 ...  0.09565924 -0.12992476
  -0.05562282]
 ...
 [-0.08442713  0.05986143  0.01399434 ... -0.02756865  0.1139672
  -0.0425541 ]
 [ 0.1206388  -0.10573912  0.0

# Define the inputs
- Each word is represented as an integer, so the the data for `train_inputs` will look like:
  `[[41, 12, 42...], [...], ...]`
- `train_labels` is a list of labels for each document in `train_inputs`. In this case it is the next word in the    sequence in the format of `[[41], [4], ...]`
- `embed` is a lookup matrix of the document vocabulary combined with the emeddings. Each row of the matrix are the embeddings for that word in the `train_input` document

In [54]:
batch_size = 3

train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

embed = tf.nn.embedding_lookup(embeddings, train_inputs)
print('Embed:\n{}'.format(embed))

Embed:
Tensor("embedding_lookup_2:0", shape=(3, 100), dtype=float32)


# Define the loss function used for training

This is too define how accurate the weights and biases are at predicting the `train_labels` given the `embed` inputs

In [75]:
loss = tf.reduce_mean(
  tf.nn.nce_loss(weights=weights,
                 biases=biases,
                 labels=train_labels,
                 inputs=embed,
                 num_sampled=NUM_SAMPLED,
                 num_classes=VOCABULARY_SIZE))

# Fetch the dataset

In [None]:
# Fetch dataset here

# Generate batches of data to use for training

In [72]:
data_index = 0

def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        context_words = [w for w in range(span) if w != skip_window]
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[context_word]
        if data_index == len(data):
            buffer[:] = data[:span]
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

# Train the model

In [73]:
for inputs, labels in generate_batch(batch_size=8, num_skips=2, skip_window=1):
  feed_dict = {train_inputs: inputs, train_labels: labels}
  _, cur_loss = session.run([optimizer, loss], feed_dict=feed_dict)

NameError: name 'data' is not defined