Reference: TensorFlow word2vec tutorial — https://www.tensorflow.org/text/tutorials/word2vec#compile_all_steps_into_one_function

In [1]:
import io
import re
import string
import tqdm

import numpy as np

import tensorflow as tf
from tensorflow.keras import layers

2025-12-13 04:28:39.668571: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
%load_ext tensorboard

In [3]:
# Seed passed to the skip-gram generator and to the negative sampler in the cells below.
SEED = 42
# Shorthand for tf.data's AUTOTUNE setting; not referenced by the cells visible in this part of the notebook.
AUTOTUNE = tf.data.AUTOTUNE

In [4]:
# Toy corpus: a single sentence, reduced to lower-case whitespace-separated tokens.
sentence = "The wide road shimmered in the hot sun"
tokens = [word.lower() for word in sentence.split()]
len(tokens)

8

In [5]:
# Build the token -> integer id lookup. Id 0 is reserved for the padding
# token; real tokens receive ids 1, 2, ... in order of first appearance.
vocab = {'<pad>': 0}
index = 1  # next unassigned id

for token in tokens:
    if token in vocab:
        continue  # repeated word: keep the id it already has
    vocab[token] = index
    index += 1

vocab_size = len(vocab)  # includes the '<pad>' entry
vocab_size

8

In [6]:
# Reverse lookup (integer id -> token) so ids can be turned back into words.
inverse_vocab = dict(zip(vocab.values(), vocab.keys()))
inverse_vocab

{0: '<pad>',
 1: 'the',
 2: 'wide',
 3: 'road',
 4: 'shimmered',
 5: 'in',
 6: 'hot',
 7: 'sun'}

In [7]:
# Encode the sentence as the list of vocabulary ids of its tokens, in order.
example_sequence = [vocab[token] for token in tokens]
print(example_sequence)

[1, 2, 3, 4, 5, 1, 6, 7]


In [11]:
# Collect the [target_id, context_id] pairs of words that lie within
# `window_size` positions of each other. negative_samples=0 asks for no
# negative pairs, so the second return value (the labels) is discarded.
window_size = 2
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
    example_sequence,
    vocabulary_size=vocab_size,
    window_size=window_size,
    negative_samples=0,
    seed=SEED,
)
print(len(positive_skip_grams))

26


In [15]:
# Decode every id pair back into words for readability.
[[inverse_vocab[token_id] for token_id in pair] for pair in positive_skip_grams]

[['in', 'hot'],
 ['shimmered', 'the'],
 ['shimmered', 'wide'],
 ['sun', 'hot'],
 ['the', 'hot'],
 ['road', 'wide'],
 ['road', 'the'],
 ['shimmered', 'road'],
 ['in', 'the'],
 ['hot', 'the'],
 ['shimmered', 'in'],
 ['the', 'in'],
 ['the', 'road'],
 ['in', 'shimmered'],
 ['hot', 'sun'],
 ['in', 'road'],
 ['wide', 'the'],
 ['the', 'shimmered'],
 ['sun', 'the'],
 ['wide', 'shimmered'],
 ['hot', 'in'],
 ['road', 'shimmered'],
 ['road', 'in'],
 ['the', 'wide'],
 ['wide', 'road'],
 ['the', 'sun']]

In [14]:
# Raw [target_id, context_id] pairs as returned by skipgrams().
positive_skip_grams

[[5, 6],
 [4, 1],
 [4, 2],
 [7, 6],
 [1, 6],
 [3, 2],
 [3, 1],
 [4, 3],
 [5, 1],
 [6, 1],
 [4, 5],
 [1, 5],
 [1, 3],
 [5, 4],
 [6, 7],
 [5, 3],
 [2, 1],
 [1, 4],
 [7, 1],
 [2, 4],
 [6, 5],
 [3, 4],
 [3, 5],
 [1, 2],
 [2, 3],
 [1, 7]]

In [17]:
# Use the first positive skip-gram as the worked example.
target_word, context_word = positive_skip_grams[0]
# Backward-compatible alias: this name was originally misspelled and other
# cells in the notebook still read `targe_word`.
targe_word = target_word

# Number of negative samples to draw for each positive context word.
num_ns = 4

# Shape the positive context id as a (1, 1) int64 tensor: this is the form
# passed to the candidate sampler as `true_classes` (one row, num_true=1).
context_class = tf.reshape(tf.constant(context_word, dtype="int64"), (1, 1))

context_class

<tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[6]])>

In [18]:
# Check the shape produced by the reshape to (1, 1).
context_class.shape

TensorShape([1, 1])

In [22]:
# Draw `num_ns` candidate negative context ids from a log-uniform (Zipfian)
# distribution over [0, vocab_size). Only the sampled ids are kept; the two
# expected-count outputs are discarded.
# NOTE(review): nothing here prevents a sampled id from coinciding with the
# positive id in `context_class` (or with the '<pad>' id 0) — confirm whether
# that matters for this example.
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    # What counts as "positive".
    true_classes=context_class,
    num_true=1,             # one positive context id per example
    # What to sample.
    range_max=vocab_size,   # ids are drawn from [0, range_max)
    num_sampled=num_ns,     # number of negatives to draw
    unique=True,            # no repeated ids within the draw
    # Bookkeeping.
    seed=SEED,
    name="negative_sampling_",
)
negative_sampling_candidates

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([2, 1, 4, 3])>

In [23]:
# Decode the sampled negative ids into words.
[inverse_vocab[neg_id] for neg_id in negative_sampling_candidates.numpy()]

['wide', 'the', 'shimmered', 'road']

In [27]:
# Drop the length-1 second axis, (1, 1) -> (1,), so the positive id can be
# concatenated with the 1-D tensor of negative samples.
squeezed_context_class = tf.squeeze(context_class, axis=1)
squeezed_context_class

<tf.Tensor: shape=(1,), dtype=int64, numpy=array([6])>

In [33]:
# Positive context id first, followed by the sampled negative ids.
context = tf.concat(
    values=[squeezed_context_class, negative_sampling_candidates],
    axis=0,
)

In [29]:
# One label per entry of `context`: 1 for the leading positive id, then a 0
# for each of the `num_ns` negative samples.
label = tf.constant([1, *([0] * num_ns)], dtype="int64")
# `targe_word` (sic) is the name assigned when the first skip-gram was unpacked.
target = targe_word

In [31]:
# Integer id of the target word for this example.
target

5

In [34]:
# The target id decoded back into its word.
inverse_vocab[targe_word]

'in'

In [35]:
# Context ids: the positive id followed by the negative samples.
context

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([6, 2, 1, 4, 3])>

In [36]:
# Decode the context ids (positive first, then negatives) into words.
[inverse_vocab[ctx_id] for ctx_id in context.numpy()]

['hot', 'wide', 'the', 'shimmered', 'road']

In [37]:
# Labels aligned with `context`: 1 marks the positive id, 0 marks a negative sample.
label

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0])>

In [39]:
# Human-readable summary of the single training example built above:
# the target (as id and word), the context (as ids and words) and the labels.
print(f"target_index    : {target}")
print(f"target_word     : {inverse_vocab[targe_word]}")
print(f"context_indices : {context}")
print(f"context_words   : {[inverse_vocab[c.numpy()] for c in context]}")
print(f"label           : {label}")

target_index    : 5
target_word     : in
context_indices : [6 2 1 4 3]
context_words   : ['hot', 'wide', 'the', 'shimmered', 'road']
label           : [1 0 0 0 0]


A single training example is the tuple `(target, context, label)`:

In [40]:
# Print the three components of the training example. Passing the tensors to
# print() directly shows their full tf.Tensor form (values, shape and dtype).
print("target  :", target)
print("context :", context)
print("label   :", label)

target  : 5
context : tf.Tensor([6 2 1 4 3], shape=(5,), dtype=int64)
label   : tf.Tensor([1 0 0 0 0], shape=(5,), dtype=int64)
