# Practise Embeddings with custom layers

*Inspiration*

- [original paper](https://arxiv.org/abs/1301.3781)
- [easy implementation](https://www.kdnuggets.com/2018/04/implementing-deep-learning-methods-feature-engineering-text-data-cbow.html#:~:text=(%2018%3An14%20)-,Implementing%20Deep%20Learning%20Methods%20and%20Feature%20Engineering%20for%20Text%20Data,Continuous%20Bag%20of%20Words%20(CBOW)&text=The%20CBOW%20model%20architecture%20tries,Science%20Lead%20at%20Applied%20Materials.)

## Ideas for improvement

- Do not train on common words, i.e. use a word as a label only if a scaled uniform random number is smaller than tfidf(word).

In [2]:
import tensorflow as tf

# model
# 1. EmbedCustom: embedding
# 2. AvgInputsCustom: average them
# 3. SoftCustom: Apply linear model Softmax

class EmbedCustom(tf.keras.layers.Layer):
    """
    Embedding layer implemented as a row lookup into a trainable weight table.

    Rows are selected by index instead of multiplying an input with the table.
    :param vocab_size: integer, number of rows of the weight table
    :param embed_size: integer, number of columns of the weight table (length of one embedding)
    """
    def __init__(self, vocab_size: int, embed_size: int):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size

    def build(self, input_shape: tuple):
        # the table shape comes from the constructor arguments only;
        # input_shape is not consulted
        table_shape = (self.vocab_size, self.embed_size)
        self.W = self.add_weight(shape=table_shape,
                                 initializer="random_normal",
                                 trainable=True)

    def call(self, inputs):
        # pick one row of W for every index contained in `inputs`
        return tf.gather(params=self.W, indices=inputs)

class AvgInputsCustom(tf.keras.layers.Layer):
    """
    Averaging layer: takes the mean over axis 1 of its input, so an (n, m, k) array
    is reduced to an (n, k) array.
    """
    def __init__(self):
        super().__init__()

    def call(self, inputs):
        # axis 1 is removed from the result (no keepdims)
        averaged = tf.reduce_mean(input_tensor=inputs, axis=1)
        return averaged

class SoftCustom(tf.keras.layers.Layer):
    """
    Custom Dense layer with Softmax activation that maps an embedding to a
    distribution over the vocabulary.

    The output width is exactly the ``vocab_size`` passed to the constructor, so it
    has to match the width of the labels the model is trained against.
    :param vocab_size: Size of vocabulary (output size)
    :param embed_size: Size of the embedding (input size)
    """
    def __init__(self, vocab_size, embed_size):
        super(SoftCustom, self).__init__()
        self.embed_size = embed_size
        self.vocab_size = vocab_size

    def build(self, input_shape):
        # Size the weights from the values stored on the instance. Using the bare
        # names `embed_size` / `vocab_size` would read same-named module-level
        # variables (or raise NameError when there are none) and ignore the
        # constructor arguments.
        self.W = self.add_weight(shape=(self.embed_size, self.vocab_size),
                                 trainable=True,
                                 initializer="random_normal")
        # bias has a leading axis of 1 so it broadcasts over the batch axis
        self.b = self.add_weight(shape=(1, self.vocab_size),
                                 trainable=True,
                                 initializer="random_normal")

    def call(self, inputs):
        # linear map followed by softmax over the last axis
        logits = tf.add(tf.matmul(inputs, self.W), self.b)
        return tf.nn.softmax(logits)

class EmbedModel(tf.keras.models.Model):
    """
    Keras model chaining the three custom layers: embedding lookup, averaging
    over axis 1, and the softmax output layer.
    :param vocab_size: Size of vocabulary, forwarded to the embedding and softmax layers
    :param embed_size: Size of the embedding, forwarded to the embedding and softmax layers
    """
    def __init__(self, vocab_size, embed_size):
        super().__init__()
        self.embed_layer = EmbedCustom(vocab_size, embed_size)
        self.avg_layer = AvgInputsCustom()
        self.soft_layer = SoftCustom(vocab_size, embed_size)

    def call(self, inputs):
        # indices -> embedding vectors -> mean vector -> softmax output
        embedded = self.embed_layer(inputs)
        averaged = self.avg_layer(embedded)
        return self.soft_layer(averaged)

## Experiment on generated tokens

Instead of using real sentences, run the model on some randomly generated tokens.

In [3]:
# "data" as random uniform numbers, representing fixed sentence lengths of 10
a = tf.random.uniform(shape=(300, 10), minval=0, maxval=10, dtype=tf.int64)
# Token ids are used directly as row indices and one-hot positions, so the
# vocabulary has to span 0..max id even if some id was never sampled (counting
# unique values would then be too small). One extra slot is reserved for the
# padding token.
vocab_size = int(tf.reduce_max(a)) + 2
# the padding token takes the last id, which no real token uses
pad_token = vocab_size - 1
# define size of embedding
embed_size = 3

In [4]:
def get_training(corpus, window_size):
    """
    Build (context, label) training pairs from a corpus.

    Every position of every sentence serves once as the label. Its input is made of
    the up to ``window_size`` neighbours on each side, filled up at the end with the
    module-level ``pad_token`` to a fixed length of ``2 * window_size``.
    :param corpus: iterable of sentences; each sentence must support len(), iteration and tf.gather
    :param window_size: number of neighbours taken on each side of the label word
    :return: tuple (inputs, labels) of two equally long lists; each label is the one-hot
             encoding of the word with width ``vocab_size`` (module-level)
    """
    context_len = 2 * window_size
    inputs, labels = [], []
    for sentence in corpus:
        sent_len = len(sentence)
        # every word of the sentence is used once as the label
        for pos, target in enumerate(sentence):
            # neighbouring positions, clipped to the sentence bounds, centre excluded
            first = max(0, pos - window_size)
            last = min(sent_len, pos + window_size + 1)
            neighbours = [j for j in range(first, last) if j != pos]
            # context tokens come first, missing slots are filled with pad_token
            padding = [pad_token] * (context_len - len(neighbours))
            context = tf.concat([tf.gather(sentence, neighbours), padding], axis=0)
            inputs.append(context)
            labels.append(tf.keras.utils.to_categorical(target, vocab_size))
    return inputs, labels

In [8]:
# vocab_size already contains the slot for the padding token and is also the width
# of the one-hot labels built by get_training, so the model is sized with exactly
# that value (adding 1 again would count the padding token twice)
m = EmbedModel(vocab_size, embed_size)
m.compile(loss="categorical_crossentropy", optimizer="rmsprop")

# get training data
inputs, labels = get_training(a, 2)
inputs = tf.stack(inputs)
labels = tf.stack(labels)

# transform into a tf.data dataset and split it into batches of 10
data = tf.data.Dataset.from_tensor_slices((inputs, labels))
data_batch = data.batch(10)

In [9]:
# train the model for 10 epochs on the batched dataset
m.fit(x=data_batch, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1331f4400>

**Finding**: Judging by the loss reported during training, the model trains nicely.