In [None]:
# Word2Vec (skip-gram) implementation with negative sampling
# paper: https://arxiv.org/abs/1301.3781
# negative sampling paper: https://arxiv.org/abs/1310.4546
# source: https://www.tensorflow.org/tutorials/text/word2vec

import tensorflow as tf
import io
import re 
import string
import numpy as np

In [None]:
# Configuration: hyperparameters, corpus location and random seed.
SEED = 42                   # seed for the Python, NumPy and TensorFlow random generators
BATCH = 1024                # batch size
EMB = 128                   # embedding dimensions (number of features for each word)
EPOCH = 20                  # number of epochs for training
SEQUENCE_LENGTH = 10        # output length of each vectorized sentence (output_sequence_length)
VOCAB_SIZE = 4096           # maximum vocabulary size (max_tokens of the vectorization layer)
NUM_NS = 4                  # number of negative samples per positive context.
WIN_SIZE = 2                # window size for skip-gram
BUFFER = 10000              # buffer size for shuffling
FILE = "../corpus/Turkish-English Parallel Corpus.txt"  # corpus file

# Skip-gram subsampling, negative sampling, weight initialisation and shuffling
# are all random; seed the generators once so Restart & Run All is repeatable.
tf.keras.utils.set_random_seed(SEED)

In [None]:
# Load the TensorBoard notebook extension, which provides the `%tensorboard`
# magic used later in this notebook.
%load_ext tensorboard

In [None]:
def generate_training_data(sequences, window_size, num_negative_samples, vocab_size, seed=None):
  """Build skip-gram training examples with negative sampling.

  For every positive (target, context) pair found in `sequences`,
  `num_negative_samples` negative context words are drawn, giving one example
  whose context vector is the true context followed by the negatives.

  Args:
    sequences: iterable of int-encoded sentences (one sequence of word
      indices per sentence).
    window_size: skip-gram window size passed to `skipgrams`.
    num_negative_samples: number of negative context words per positive pair.
    vocab_size: vocabulary size; used to build the sampling table and as
      `range_max` of the negative-sample candidate sampler.
    seed: optional seed forwarded to `skipgrams` and to the candidate
      sampler. The default `None` leaves both unseeded.

  Returns:
    A `(targets, contexts, labels)` tuple of equal-length lists: `targets`
    holds one target word index per example, `contexts` one int64 tensor of
    `num_negative_samples + 1` word indices (positive first), and `labels`
    the matching int64 tensor `[1, 0, ..., 0]`.
  """
  targets, contexts, labels = [], [], []

  # Rank-based sampling probabilities (Zipf's distribution); handed to
  # `skipgrams` as its `sampling_table`.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # The label vector is identical for every example, so build it only once
  # instead of allocating a new tensor per skip-gram pair.
  label = tf.constant([1] + [0] * num_negative_samples, dtype="int64")

  # Iterate over all sequences (sentences) in the dataset.
  for sequence in sequences:
    # Positive skip-gram pairs only; negatives are drawn separately below.
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
        sequence,
        vocabulary_size=vocab_size,
        sampling_table=sampling_table,
        window_size=window_size,
        negative_samples=0,
        seed=seed)

    # Turn each positive pair into one training example.
    for target_word, context_word in positive_skip_grams:
      # The sampler expects the true classes as a (batch, num_true) int64 tensor.
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_negative_samples,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")

      # Context vector for this target: positive word first, then the negatives.
      context = tf.concat(
          [tf.squeeze(context_class, 1), negative_sampling_candidates], 0)

      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

In [None]:
def custom_standardization(input_data):
  """Lowercase the text and strip ASCII punctuation.

  Args:
    input_data: string tensor holding the raw text.

  Returns:
    String tensor with the text lowercased and every character listed in
    `string.punctuation` removed.
  """
  # The corpus is Turkish-English: with the default (ASCII) encoding only
  # A-Z would be lowercased, so letters such as 'Ç', 'Ş' or 'Ö' would stay
  # upper-case and one word would end up as several vocabulary entries.
  lowercase = tf.strings.lower(input_data, encoding='utf-8')
  return tf.strings.regex_replace(
      lowercase, '[%s]' % re.escape(string.punctuation), '')

In [None]:
# Read the corpus into a list of lines and print five of them as a preview.
with open(FILE, encoding='utf-8') as corpus_file:
  lines = corpus_file.read().splitlines()

for sample_line in lines[5000:5005]:
  print(sample_line)

In [None]:
# Stream the corpus line by line, keeping only non-empty lines.
dataset = tf.data.TextLineDataset(FILE).filter(
    lambda text_line: tf.strings.length(text_line) > 0)

In [None]:
# Text vectorization layer: applies custom_standardization, keeps a vocabulary
# of at most VOCAB_SIZE tokens and emits int sequences of SEQUENCE_LENGTH.
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize=custom_standardization,
    output_mode='int',
    output_sequence_length=SEQUENCE_LENGTH,
)

# Learn the vocabulary from the corpus, one batch of lines at a time.
vectorize_layer.adapt(dataset.batch(BATCH))

In [None]:
# Vocabulary learned by the vectorization layer; preview its first ten entries.
inverse_vocab = vectorize_layer.get_vocabulary()
inverse_vocab[:10]

In [None]:
# Vectorize the sentences in `dataset` batch by batch, then unbatch so that
# each element is again a single int-encoded sentence.
text_vector_ds = (
    dataset
    .batch(BATCH)
    .prefetch(tf.data.AUTOTUNE)
    .map(vectorize_layer)
    .unbatch()
)

In [None]:
# Materialize every vectorized sentence in memory (as NumPy values) and show
# how many sentences there are.
sequences = list(text_vector_ds.as_numpy_iterator())
len(sequences)

In [None]:
# Generate the skip-gram examples and convert each returned list to a NumPy array.
targets, contexts, labels = map(
    np.array,
    generate_training_data(
        sequences=sequences,
        window_size=WIN_SIZE,
        num_negative_samples=NUM_NS,
        vocab_size=VOCAB_SIZE,
    ),
)

In [None]:
# Training input pipeline of ((target, context), label) batches.
# cache() must come before shuffle(): a cache placed after shuffle/batch stores
# the batches of the first pass and replays them unchanged, so every epoch
# would see the same order and the same batch composition.
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.cache().shuffle(BUFFER).batch(BATCH, drop_remainder=True)
dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

In [None]:
class Word2Vec(tf.keras.Model):
  """Skip-gram word2vec model scored by target/context dot products.

  Holds two embedding layers built with (vocab_size, embedding_dim): one for
  target words, named "w2v_embedding", and one for context words. The output
  is the dot product of the target embedding with each context embedding.
  """

  def __init__(self, vocab_size, embedding_dim, num_ns=4):
    """Create the target and context embedding layers.

    Args:
      vocab_size: number of word indices each embedding layer supports.
      embedding_dim: size of each embedding vector.
      num_ns: number of negative samples per positive context. Accepted for
        backward compatibility only; the layers do not need it because the
        context length comes from the input at call time.
    """
    super().__init__()
    # `input_length` is deliberately not passed: the argument is deprecated in
    # current Keras releases and has no effect on the computation in `call`.
    self.target_embedding = tf.keras.layers.Embedding(
        vocab_size, embedding_dim, name="w2v_embedding")
    self.context_embedding = tf.keras.layers.Embedding(
        vocab_size, embedding_dim)

  def call(self, pair):
    """Score every context word against its target word.

    Args:
      pair: `(target, context)`; `target` is (batch,) or (batch, 1) word
        indices and `context` is (batch, context) word indices.

    Returns:
      Tensor of shape (batch, context) holding one dot product per context word.
    """
    target, context = pair

    # Drop a trailing singleton axis so that target is always (batch,).
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)

    word_emb = self.target_embedding(target)        # (batch, embedding)
    context_emb = self.context_embedding(context)   # (batch, context, embedding)
    # Contract the embedding axis for each (target, context word) pair.
    return tf.einsum('be,bce->bc', word_emb, context_emb)

In [None]:
# Build the model and configure training: Adam optimizer and a categorical
# cross-entropy loss that reads the model outputs as logits.
word2vec = Word2Vec(vocab_size=VOCAB_SIZE, embedding_dim=EMB, num_ns=NUM_NS)
word2vec.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# Train for EPOCH epochs, writing TensorBoard event files to the "logs" directory.
callback = tf.keras.callbacks.TensorBoard(log_dir="logs")
word2vec.fit(dataset, epochs=EPOCH, callbacks=[callback], verbose=1)

In [None]:
#docs_infra: no_execute
# Open TensorBoard inline on the "logs" directory written by the training callback.
%tensorboard --logdir logs

In [None]:
# First weight array of the layer named 'w2v_embedding' (the target-word
# embedding layer of Word2Vec) and the vocabulary of the vectorization layer.
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [None]:
# Export the embeddings as two line-aligned TSV files: vectors.tsv holds one
# tab-separated vector per line, metadata.tsv the matching word on the same line.
# The `with` block closes both files even if a write raises part-way through.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
    for index, word in enumerate(vocab):
        if index == 0:
            continue  # skip 0, it's padding.
        vec = weights[index]
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")
        out_m.write(word + "\n")