## A test on word embedding

In [1]:
import io
import re

import tensorflow as tf
from tensorflow.keras import layers

In [2]:
%load_ext tensorboard

### Init

In [3]:
# Fixed seed handed to the random sampling op further down so that its draws
# are repeatable.
SEED = 42
# Shorthand for tf.data's autotune sentinel.
# NOTE(review): not referenced in the visible cells; presumably used by a
# tf.data pipeline later in the notebook - confirm or remove.
AUTOTUNE = tf.data.AUTOTUNE

In [4]:
# Toy one-sentence corpus used throughout the notebook.
sentence = "The cute ginger cat sits comfortably on the mat"
# Lower-case, then split on whitespace; str.split() already returns a list.
tokens = sentence.lower().split()

In [5]:
# Word -> integer id mapping. Id 0 is reserved for the padding token, so real
# words are numbered from 1 in order of first appearance.
vocab = {'<pad>': 0}
for token in tokens:
    # setdefault leaves already-seen tokens alone; for a new token the current
    # dictionary size is exactly the next unused id.
    vocab.setdefault(token, len(vocab))

index = len(vocab)  # next unused id once the loop has finished

vocab

{'<pad>': 0,
 'the': 1,
 'cute': 2,
 'ginger': 3,
 'cat': 4,
 'sits': 5,
 'comfortably': 6,
 'on': 7,
 'mat': 8}

In [6]:
# Reverse lookup (integer id -> word), built by pairing each id with its key.
inverse_vocab = dict(zip(vocab.values(), vocab.keys()))
inverse_vocab

{0: '<pad>',
 1: 'the',
 2: 'cute',
 3: 'ginger',
 4: 'cat',
 5: 'sits',
 6: 'comfortably',
 7: 'on',
 8: 'mat'}

In [7]:
# Encode the sentence as the list of integer ids of its tokens, in order.
example_sequence = [vocab[tok] for tok in tokens]
example_sequence

[1, 2, 3, 4, 5, 6, 7, 1, 8]

### Positive pairs

In [8]:
# Each target word is paired with up to `window_size` neighbours on each side.
window_size = 2

# Build the (target, context) pairs. negative_samples=0 asks for positive
# pairs only, so the labels returned as the second element are not needed.
# NOTE(review): the pairs come back in shuffled order (see the output of the
# next cell); skipgrams shuffles by default and no seed is passed here, so
# position-based indexing such as positive_skip_grams[0] may differ per run.
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
    example_sequence,
    vocabulary_size=len(vocab),
    window_size=window_size,
    negative_samples=0,
)

# 9 tokens with a window of 2 -> 2+3+4+4+4+4+4+3+2 = 30 pairs expected.
len(positive_skip_grams)

30

In [9]:
# Show the first five pairs, both as ids and as the words they stand for.
first_pairs = positive_skip_grams[:5]
for target, context in first_pairs:
    print(f"({target}, {context}): ({inverse_vocab[target]}, {inverse_vocab[context]})")

(4, 6): (cat, comfortably)
(1, 3): (the, ginger)
(3, 5): (ginger, sits)
(2, 1): (cute, the)
(7, 6): (on, comfortably)


### Negative sampling

When training skip-gram, the model tries to predict context words given a target word. For each real pair, instead of comparing against all vocabulary words (which is expensive), we sample a few 'fake' context words and train the model to distinguish real from fake. Here we sample random words from the vocabulary for a given target word in a window.

In [11]:
# Use the first positive pair as the worked example. In the recorded run it is
# [4, 6]: target 4 ('cat') with context word 6 ('comfortably').
target_word, context_word = positive_skip_grams[0]
num_ns = 5  # number of negative context words to draw per positive pair

# The sampler takes the true classes as an int64 tensor of shape (1, num_true).
context_class = tf.constant([[context_word]], dtype="int64")

(negative_sampling_candidates,
 _true_expected_count,
 _sampled_expected_count) = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,  # the positive context id
    num_true=1,  # one positive context class per skip-gram
    num_sampled=num_ns,  # how many negatives to draw
    unique=True,  # draw without replacement
    range_max=len(vocab),  # ids come from the half-open range [0, len(vocab))
    seed=SEED,  # op-level seed for repeatable draws
    name="negative_sampling",  # name of this operation
)

# NOTE(review): the sampler is not documented to exclude `true_classes` from
# its draws, so the positive context id (6 here) can in principle show up among
# the negatives; the recorded output simply happens not to contain it. Filter
# such accidental hits if strictly negative samples are required.
negative_sampling_candidates

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([3, 1, 5, 2, 4])>

### Put everything together

For a given positive pair, we now have one positive context word and `num_ns` negatively sampled context words. Here we combine them into a single context tensor, together with a matching label tensor.

In [12]:
# Drop the size-1 second axis so the positive context id becomes a 1-D tensor
# that can be joined with the 1-D tensor of negative candidates.
squeezed_context_class = tf.squeeze(context_class, axis=1)
context = tf.concat(
    [squeezed_context_class, negative_sampling_candidates],
    axis=0,
)

# One label per context entry: 1 for the leading positive context word, then
# 0 for each of the `num_ns` negatives.
label = tf.constant([1, *([0] * num_ns)], dtype="int64")
target = target_word

In [13]:
# Human-readable summary of the training example: ids alongside their words.
# A single print call with a newline separator emits the same five lines.
print(
    f"target_index    : {target}",
    f"target_word     : {inverse_vocab[target_word]}",
    f"context_indices : {context}",
    f"context_words   : {[inverse_vocab[c.numpy()] for c in context]}",
    f"label           : {label}",
    sep="\n",
)

target_index    : 4
target_word     : cat
context_indices : [6 3 1 5 2 4]
context_words   : ['comfortably', 'ginger', 'the', 'sits', 'cute', 'cat']
label           : [1 0 0 0 0 0]


In [14]:
# Raw view of the same example, showing the tensor reprs (shape and dtype).
for caption, value in (
    ("target  :", target),
    ("context :", context),
    ("label   :", label),
):
    print(caption, value)

target  : 4
context : tf.Tensor([6 3 1 5 2 4], shape=(6,), dtype=int64)
label   : tf.Tensor([1 0 0 0 0 0], shape=(6,), dtype=int64)
