In [None]:
%pip install -r requirements.txt

Lesson derived from https://www.tensorflow.org/text/guide/word_embeddings

# Word Embeddings

Word embeddings are a way of representing words as numerical vectors. These vectors can account for a word's context, allowing us to calculate its similarity to other words in its corpus: words that appear in the same contexts might be synonyms, or they might simply be used in similar ways.

Consider the examples "run" and "walk": a good word embeddings model would have these words very close in vector-space; similarly, "mother" and "father" or "Boston" and "Massachusetts".

## One-hot encodings

One naïve way to capture this information is to encode each word as a vector with a 1 at the word's position in the vocabulary and a 0 at every other position. This is known as **one-hot encoding**, and it typically doesn't get us very far: each word's vector has to be the length of the vocabulary, even though it is almost entirely filled with 0s.

This is what is known as a "sparse" representation.

## Unique numbers

As a second attempt, you might assign each word a unique number. But this approach, while "denser" than a one-hot encoding, fails to capture any contextual information about a given word.

## Enter word embeddings

Word embeddings are trained models that produce dense representations of words in a corpus. Embeddings for smaller corpora might have 8 dimensions, while embeddings for larger corpora can have up to 1024 dimensions.

We're going to attempt to train an embeddings model on the Greek texts in `./data`.

In [None]:
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

In [None]:
import random

batch_size = 1024
# Both calls below must receive the same seed so that the "training" and
# "validation" subsets are carved out of the same shuffled file list. A fixed
# value (rather than a fresh random one) keeps that split identical after
# every kernel restart, so runs can be compared with each other.
seed = 42
train_ds = tf.keras.utils.text_dataset_from_directory(
    "data", batch_size=batch_size, validation_split=0.2, subset="training", seed=seed
)
val_ds = tf.keras.utils.text_dataset_from_directory(
    "data", batch_size=batch_size, validation_split=0.2, subset="validation", seed=seed
)

In [None]:
# Let tf.data pick the prefetch buffer size at runtime.
AUTOTUNE = tf.data.AUTOTUNE

# Cache each dataset after its first pass and prefetch upcoming batches while
# the current one is being consumed.
train_ds, val_ds = (
    dataset.cache().prefetch(buffer_size=AUTOTUNE)
    for dataset in (train_ds, val_ds)
)

In [None]:
# Toy layer for the demos below: a lookup table with 1000 rows (one per
# integer id) of 5 values each.
embedding_layer = Embedding(input_dim=1000, output_dim=5)

In [None]:
# Look up three ids in the (still untrained) layer: one 5-value row per id.
sample_ids = tf.constant([1, 2, 3])
result = embedding_layer(sample_ids)
result.numpy()

In [None]:
# A batch of 2 sequences with 3 ids each; the layer adds the embedding axis
# last, so the displayed shape should be (2, 3, 5).
batched_ids = tf.constant([[0, 1, 2], [3, 4, 5]])
result = embedding_layer(batched_ids)
result.shape

In [None]:
import re
import string

def custom_standardization(input_data):
  """Normalize raw text before TextVectorization splits it into tokens.

  Lowercases the text, replaces literal '<br />' tags with a space, and
  deletes every ASCII punctuation character.

  Args:
    input_data: String tensor holding the raw text.

  Returns:
    String tensor with the normalized text.
  """
  # tf.strings.lower treats its input as ASCII unless told otherwise; without
  # encoding='utf-8' the Greek capitals in the corpus would pass through
  # unchanged and e.g. "Λόγος" / "λόγος" would become two vocabulary entries.
  lowercase = tf.strings.lower(input_data, encoding='utf-8')
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  # NOTE(review): string.punctuation is ASCII-only, so non-ASCII marks such as
  # the Greek ano teleia or guillemets are left in place -- confirm whether
  # they should be stripped as well.
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation), '')


# Upper bound on the vocabulary, and the fixed number of token ids emitted
# per example.
vocab_size = 10000
sequence_length = 100

# The vectorization layer standardizes each string with
# custom_standardization, splits it into tokens, and maps every token to an
# integer id. Samples differ in length, so output_sequence_length forces
# every output to the same number of ids.
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode='int')

# The vocabulary is learned from the training text alone: drop the labels,
# then let adapt() scan the remaining strings.
text_ds = train_ds.map(lambda text, label: text)
vectorize_layer.adapt(text_ds)

In [None]:
# Width of each learned word vector.
embedding_dim = 16

# Raw strings -> token ids -> one vector per token -> average over the
# sequence -> small dense classifier head.
# NOTE(review): the single-unit output suits a two-class problem; confirm
# that ./data contains exactly two label directories.
model = Sequential([
  vectorize_layer,
  Embedding(input_dim=vocab_size, output_dim=embedding_dim, name="embedding"),
  GlobalAveragePooling1D(),
  Dense(units=16, activation='relu'),
  Dense(units=1),
])

In [None]:
# Callback handed to model.fit() below; it is configured to write its
# TensorBoard logs under the relative directory "logs".
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [None]:
# The final Dense layer has no activation, so the loss is told to expect raw
# logits rather than probabilities.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 15 epochs, evaluating on the validation split after each one and
# reporting progress through the TensorBoard callback.
model.fit(
    x=train_ds,
    epochs=15,
    validation_data=val_ds,
    callbacks=[tensorboard_callback],
)

In [None]:
# Pull out the vocabulary and the trained embedding matrix; the export cell
# pairs vocabulary entry i with row i of the matrix.
vocab = vectorize_layer.get_vocabulary()
weights = model.get_layer(name='embedding').get_weights()[0]

In [None]:
import io

# Export the trained embeddings as two parallel TSV files: line k of
# vectors.tsv holds the tab-separated vector for the word on line k of
# metadata.tsv.
# The with-block closes (and therefore flushes) both files even if a write
# raises part-way through, instead of relying on reaching explicit close()
# calls at the end.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
  for index, word in enumerate(vocab):
    if index == 0:
      continue  # skip 0, it's padding.
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")