In [None]:
%pip install numpy scipy plotly tqdm
%pip install --no-cache-dir --force-reinstall https://dm.cs.tu-dortmund.de/nats/nats25_09_01_attention-0.1-py3-none-any.whl
import nats25_09_01_attention

# Attention

In [None]:
import numpy as np
import plotly.graph_objects as go
from tqdm.auto import tqdm
from abc import ABC, abstractmethod
from scipy.sparse import csr_array
from copy import deepcopy
import nbbootstrap
await nbbootstrap.ensure_package("ipywidgets")

## Backpropagation Again

For this notebook, you will be asked to write an Attention-based model and train it. To do so, you will need most of the modules built in the last assignment. The code frames for the required network modules below are the same as in the last assignment and contain valid solutions.

This time, the modules must be compatible to pass multiple vectors through `forward` and `backward` at once.

E.g. the `Linear` module must work with `forward` input `X` of shape $n \times d_{in}$ and return $n \times d_{out}$ matrices. During `backward` it will receive an $n \times d_{out}$ `delta` and must return a `next_delta` of shape $n \times d_{in}$. The stored gradients should have shape $d_{in} \times d_{out}$ for the weights and $d_{out}$ for the bias. You can aggregate the $n$ individual gradients by taking their mean, which simulates $n$ individual gradient steps at once.

## Abstract and architecture modules

In [None]:
class NetworkModule(ABC):
    """Common base class of all building blocks of the network.

    Subclasses provide `_forward` and `backward`. The public `forward`
    wraps `_forward` and remembers a copy of the most recent input in
    `self.last_input` so that it is available during backpropagation.
    """
    @abstractmethod
    def _forward(self, X):
        """Compute the module output for `X` (provided by subclasses)."""
        return None
    def forward(self, X):
        """Cache a deep copy of `X` as `self.last_input` and return `self._forward(X)`."""
        snapshot = deepcopy(X)
        self.last_input = snapshot
        return self._forward(X)
    @abstractmethod
    def backward(self, delta):
        """Propagate `delta` backwards through the module (provided by subclasses)."""
    def step_gradient(self, learning_rate):
        """Apply a gradient step; does nothing unless a subclass overrides it."""
        return None
class TrainableModule(NetworkModule):
    """Base class of modules that own parameters and therefore store a gradient.

    `backward` first computes the delta for the preceding module and then
    stores the parameter gradient for the last forwarded input in `self.grad`.
    """
    def __init__(self):
        self.grad = None
        self.last_input = None
    def backward(self, delta):
        """Return the delta for the previous module and store the gradient in `self.grad`.

        Raises an `AssertionError` if no input has been forwarded yet.
        """
        if self.last_input is not None:
            # The propagated delta is computed before the gradient, in this order.
            propagated = self._next_delta(delta)
            self.grad = self._gradient_for_last_input(delta)
            return propagated
        raise AssertionError("Tried to execute backpropagation without forward feeding data.")
    @abstractmethod
    def _next_delta(self, delta):
        """Compute the delta handed to the previous module (provided by subclasses)."""
    @abstractmethod
    def _gradient_for_last_input(self, delta):
        """Compute the parameter gradient for `self.last_input` (provided by subclasses)."""
    @abstractmethod
    def step_gradient(self, learning_rate):
        """Update the parameters using the stored gradient (provided by subclasses)."""

In [None]:
class ModuleChain(NetworkModule):
    """Sequential container: feeds the output of each module into the next one."""
    def __init__(self, modules):
        super().__init__()
        self.modules = modules
    def _forward(self, X):
        """Forward `X` through all modules in order and return the final output."""
        activation = X
        for module in self.modules:
            activation = module.forward(activation)
        return activation
    def backward(self, delta):
        """Propagate `delta` through all modules in reverse order."""
        current_delta = delta
        for module in reversed(self.modules):
            current_delta = module.backward(current_delta)
        return current_delta
    def step_gradient(self, learning_rate):
        """Ask every contained module to perform its gradient step."""
        for module in self.modules:
            module.step_gradient(learning_rate)
class ModuleConcat(NetworkModule):
    """Parallel container: module `i` processes input `i`; outputs are concatenated along the last axis."""
    def __init__(self, modules):
        super().__init__()
        self.modules = modules
    def _forward(self, X):
        """Forward the i-th element of `X` through the i-th module and concatenate the results."""
        outputs = []
        for module, module_input in zip(self.modules, X):
            outputs.append(module.forward(module_input))
        return np.concatenate(outputs, axis=-1)
    def backward(self, delta):
        """Split `delta` into equally wide slices along the last axis, one per module.

        Returns the list of deltas produced by the modules, in module order.
        """
        n_modules = len(self.modules)
        assert delta.shape[-1] % n_modules == 0
        width = delta.shape[-1] // n_modules
        return [
            module.backward(delta[..., width*index: width*(index+1)])
            for index, module in enumerate(self.modules)
        ]
    def step_gradient(self, learning_rate):
        """Ask every contained module to perform its gradient step."""
        for module in self.modules:
            module.step_gradient(learning_rate)

## Trained modules

In [None]:
class Linear(TrainableModule):
    """Fully connected linear layer `f(X) = X W (+ b)` for single vectors or batches of row vectors."""
    def __init__(self, n_in, n_out, use_bias=True):
        '''
        Creates a fully connected linear layer translating from vectors of length `n_in` to vectors of length `n_out`.
        `use_bias` controls whether or not this layer should use a bias (`f(x) = x^T W+b`) or not (`f(x) = x^T W`).
        '''
        super().__init__()
        # Random non-negative weights; every column is scaled to sum to 1.
        self.weights = np.random.sample((n_in, n_out))
        self.weights /= np.sum(self.weights,axis=0,keepdims=True)
        self.use_bias = use_bias
        # The bias is always created, but only applied/updated if `use_bias` is set.
        self.bias = np.random.sample(n_out)
        self.bias /= np.sum(self.bias)
    def _forward(self, X):
        """Return `X W + b` (or `X W` without bias)."""
        if self.use_bias:
            return X.dot(self.weights) + self.bias
        return X.dot(self.weights)
    def _next_delta(self, delta):
        """Return the delta with respect to the layer input, `delta W^T`."""
        return delta.dot(self.weights.T)
    def _gradient_for_last_input(self, delta):
        """Return `(weight_grad, bias_grad)` averaged over all rows of `delta`.

        `weight_grad` has the shape of `self.weights`. A sparse `last_input` is
        converted to a dense array (and stored back in `self.last_input`).
        """
        if isinstance(self.last_input, csr_array):
            if self.last_input.shape[0] == 1:
                self.last_input = self.last_input.toarray().flatten()
            else:
                self.last_input = self.last_input.toarray()
        # Treat single vectors as batches of size one. The mean of the per-row
        # outer products equals X^T . delta / n, which avoids materializing an
        # (n, d_in, d_out) stack of outer products.
        inputs = np.atleast_2d(self.last_input)
        deltas = np.atleast_2d(delta)
        weight_grad = inputs.T.dot(deltas) / len(deltas)
        if len(delta.shape) == 1:
            bias_grad = delta.copy()
        else:
            bias_grad = np.mean(delta,axis=0)
        return (weight_grad, bias_grad)
    def step_gradient(self, learning_rate):
        """Apply the stored gradient in place and clear it afterwards."""
        # In-place updates keep weight arrays shared with linked layers in sync.
        self.weights -= learning_rate * self.grad[0]
        if self.use_bias: self.bias -= learning_rate * self.grad[1].flatten()
        self.grad = None
class LinkedLinear(Linear):
    """Linear layer that uses the very same weight array as `other_linear`."""
    def __init__(self, other_linear, use_bias=True):
        # Initialize like a regular `Linear` of matching shape (this also creates the bias) ...
        n_in, n_out = other_linear.weights.shape
        super().__init__(n_in, n_out, use_bias=use_bias)
        # ... then replace the freshly drawn weights by a reference to the other layer's weights.
        self.weights = other_linear.weights
class LinkedTransposedLinear(Linear):
    """Linear layer whose weights are the transpose of the weights of `other_linear`."""
    def __init__(self, other_linear, use_bias=True):
        # Initialize like a regular `Linear` with swapped dimensions (this also creates the bias) ...
        n_rows, n_cols = other_linear.weights.shape
        super().__init__(n_cols, n_rows, use_bias=use_bias)
        # ... then replace the freshly drawn weights by the transpose of the other layer's weights.
        self.weights = other_linear.weights.T

## Activation function modules

**Take care:** The `Softmax` class from the last assignment is renamed to `SparseSoftmax`! The *additional* `DenseSoftmax` is to be used for dense vector representations, i.e. where `delta` is a dense vector. In the Attention, we will need a dense implementation of `Softmax`.

In [None]:
class ReLU(NetworkModule):
    """Rectified linear unit, applied element-wise."""
    def _forward(self, X):
        """Return `max(0, x)` for every element of `X`."""
        return np.maximum(0, X)
    def backward(self, delta):
        """Pass `delta` through wherever the last input was strictly positive."""
        active = self.last_input > 0
        return active * delta
class Sigmoid(NetworkModule):
    """Logistic sigmoid, applied element-wise."""
    def _forward(self, X):
        """Return `1 / (1 + exp(-x))` for every element of `X`."""
        return 1/(1+np.exp(-X))
    def backward(self, delta):
        """Return `s * (1 - s) * delta`, where `s` is the sigmoid of the last input."""
        # Use `_forward` rather than `forward`: the public method would deep-copy
        # and overwrite `self.last_input` as a side effect of backpropagation.
        v = self._forward(self.last_input)
        return v * (1-v) * delta
class SparseSoftmax(NetworkModule):
    """Softmax over the last axis whose `backward` expects a sparse (`csr_array`) delta."""
    def _forward(self, X):
        """Return the numerically stabilized softmax of `X` along the last axis."""
        X = X-np.max(X, axis=-1, keepdims=True)
        X_exp = np.exp(X)
        return X_exp / np.sum(X_exp, axis=-1, keepdims=True)
    def backward(self, delta):
        """Multiply the softmax Jacobian of the last input with the sparse `delta`.

        Only the stored entries of `delta` contribute. Contributions of several
        entries in the same row are summed up.
        """
        assert type(delta) == csr_array
        # `indptr` holds row *pointers*, not row indices. Expand it into the row
        # index of every stored entry so that rows with zero or several entries
        # are handled correctly as well.
        row_of_entry = np.repeat(np.arange(len(delta.indptr) - 1), np.diff(delta.indptr))
        softmax = self._forward(self.last_input)
        next_delta = np.zeros(self.last_input.shape)
        if len(self.last_input.shape) == 1:
            # Single input vector: all entries of `delta` refer to this vector.
            for j, delta_j in zip(delta.indices, delta.data):
                softmax_deriv_j = -softmax * softmax[j]
                softmax_deriv_j[j] += softmax[j]
                next_delta += softmax_deriv_j * delta_j
            return next_delta
        for i, j, delta_j in zip(row_of_entry, delta.indices, delta.data):
            in_softmax = softmax[i]
            # Column j of the softmax Jacobian: s_j * (e_j - s)
            softmax_deriv_j = -in_softmax * in_softmax[j]
            softmax_deriv_j[j] += in_softmax[j]
            next_delta[i] += softmax_deriv_j * delta_j
        return next_delta
class DenseSoftmax(NetworkModule):
    """Softmax over the last axis whose `backward` expects a dense delta."""
    def _forward(self, X):
        """Return the numerically stabilized softmax of `X` along the last axis."""
        X = X-np.max(X, axis=-1, keepdims=True)
        X_exp = np.exp(X)
        return X_exp / np.sum(X_exp, axis=-1, keepdims=True)
    def backward(self, delta):
        """Multiply the softmax Jacobian of the last input with `delta`.

        For a softmax output `s` the Jacobian is `diag(s) - s s^T`, hence
        `J . delta = s * (delta - <s, delta>)`. Evaluating this closed form
        avoids building one dense Jacobian per row and also works for single
        vectors (1-D inputs).
        """
        s = self._forward(self.last_input)
        weighted_sum = np.sum(s * delta, axis=-1, keepdims=True)
        return s * (delta - weighted_sum)

## Loss functions

In [None]:
class Loss(ABC):
    """Interface of loss functions comparing a prediction against a target."""
    @abstractmethod
    def forward(self, prediction, target):
        """Return the loss value(s) of `prediction` with respect to `target`."""
    @abstractmethod
    def backward(self, prediction, target):
        """Return the derivative of the loss with respect to `prediction`."""
class L2Loss(Loss):
    """Half of the squared Euclidean distance, computed along the last axis."""
    def forward(self, prediction, target):
        """Return `0.5 * sum((prediction - target)^2)` over the last axis."""
        residual = prediction-target
        return 0.5 * np.sum(np.square(residual), axis=-1)
    def backward(self, prediction, target):
        """Return the derivative with respect to the prediction, `prediction - target`."""
        residual = prediction - target
        return residual
class CELoss(Loss):
    """Cross-entropy loss `-sum(target * log(prediction))` with a sparse-target `backward`."""
    def forward(self, prediction, target):
        """Return the cross entropy of a single prediction or one value per row of a batch."""
        if len(prediction.shape) == 1:
            return -target.dot(np.log(prediction))
        else:
            return np.array([-ystar.dot(np.log(y)) for ystar, y in zip(target, prediction)])
    def backward(self, prediction, target):
        """Return the derivative `-target / prediction` as a `csr_array`.

        The result has the same sparsity structure as `target`, i.e. only the
        stored entries of `target` yield (non-zero) derivatives.
        """
        assert type(target) == csr_array
        # Make prediction matrix-shaped if it was a vector
        if len(prediction.shape) == 1: prediction = prediction[None, :]
        # `indptr` holds row *pointers*, not row indices. Expand it into the row
        # index of every stored entry so that rows with zero or several entries
        # are handled correctly as well.
        row_of_entry = np.repeat(np.arange(len(target.indptr) - 1), np.diff(target.indptr))
        delta_data = -target.data / prediction[row_of_entry, target.indices]
        return csr_array((delta_data, target.indices, target.indptr), shape=prediction.shape)

## Scaled Dot-Product Attention

In the lecture, you were introduced to the Scaled Dot-Product Attention, which is defined as

$$SDPA(Q,K,V) = Softmax\left(\frac{QK^T}{\sqrt{d_k}}\right) \cdot V$$

where $d_k$ is the dimension of queries $Q$, keys $K$, and values $V$, sometimes called the latent dimension of the language model.
Note the shapes of the terms:
- $Q, K$, and $V$ are $n \times d_k$ matrices representing $n$ vectors (e.g. word embeddings) of dimension $d_k$
- $QK^T$ is an $n \times n$ matrix that denotes the importance of some key $K_j$ for some query $Q_i$.
- $SDPA(Q,K,V)$ is an $n \times d_k$ matrix that aggregates the values $V_j$ that correspond to each key $K_j$ according to their importance for the query $Q_i$.

For simplicity, we can decompose $SDPA$ into the steps
- Transposing $K$
- Dot Product of $Q$ and $K^T$
- Scalar Multiplication of $QK^T$ with $d_k^{-\frac{1}{2}}$
- Dense Softmax of $\frac{QK^T}{\sqrt{d_k}}$
- Dot Product of $Softmax\left(\frac{QK^T}{\sqrt{d_k}}\right)$ and $V$

To represent these steps in our framework, we need the additional modules `Transpose`, `DotProduct` and `ScalarProduct`.

In [None]:
class Transpose(NetworkModule):
    """Exercise stub: module meant to transpose its input (the `K -> K^T` step of the SDPA)."""
    def _forward(self, X):
        pass # Your solution here
    def backward(self, delta):
        pass # Your solution here
class ScalarProduct(NetworkModule):
    """Exercise stub: module meant to multiply its input with the constant scalar `factor`."""
    def __init__(self, factor):
        super().__init__()
        # Constant factor, e.g. d_k^(-1/2) when used inside the SDPA.
        self.factor = factor
    def _forward(self, X):
        pass # Your solution here
    def backward(self, delta):
        pass # Your solution here
class DotProduct(NetworkModule):
    """Exercise stub: module meant to compute the `numpy.dot` product of a pair of inputs."""
    def _forward(self, X):
        # Here X should be a tuple, list or similar (A, B)
        # of vectors or matrices to take the numpy.dot product of
        pass # Your solution here
    def backward(self, delta):
        # This must return a tuple (delta_A, delta_B)
        # which contains the appropriate delta for each
        # of the inputs.
        pass # Your solution here

Using these new modules, you can define the $SDPA$ as an "activation function", that receives triplets $(Q,K,V)$ during `forward` and sends triplets $(\delta_Q,\delta_K,\delta_V)$ during `backward`. Receive the latent dimension $d_k$ in the constructor to initialize the `ScalarProduct` module with.

You can use the `numpy_zip` function to join multiple arrays of the same shape into a joined array. For example `numpy_zip(A,B,C)` for arrays `A`, `B`, `C` of shape `(5,10)` will be returned as an array of shape `(3,5,10)`.

In [None]:
def numpy_zip(*arrays):
    """Join arrays of identical shape along a new leading axis.

    E.g. three arrays of shape `(5, 10)` are returned as one array of shape `(3, 5, 10)`.
    """
    # Give every array a leading axis of length one, then join along that axis.
    expanded = [array[np.newaxis, ...] for array in arrays]
    return np.concatenate(expanded, axis=0)
class SDPA(NetworkModule):
    """Exercise stub: Scaled Dot-Product Attention `Softmax(Q K^T / sqrt(d_k)) V`.

    Intended to receive triplets `(Q, K, V)` during `forward` and to return
    triplets `(delta_Q, delta_K, delta_V)` during `backward`.
    """
    def __init__(self, latent_dimension):
        super().__init__()
        # d_k, to be used for the scaling factor of the `ScalarProduct` module.
        self.latent_dimension = latent_dimension
        # self.transpose = ...
        # self.dot_product_qk = ...
        # self.normalization = ...
        # self.softmax = ...
        # self.dot_product_v = ...
        pass # Your solution here
    def _forward(self, X):
        pass # Your solution here
    def backward(self, delta):
        pass # Your solution here

Use the code below to debug your solution. The code initializes random query, key, and value matrices and a random `target` matrix.
Then back propagation is used to compute the gradients for $Q$, $K$, and $V$ such that $SDPA(Q,K,V)$ approximates the desired `target` matrix.
For a working implementation, the loss value should be approximately 0 after 2000 iterations.

In [None]:
# Sanity check for `SDPA`: optimize Q, K and V directly by gradient descent
# such that SDPA(Q, K, V) approximates a random target matrix.
Q, K, V = np.random.sample((3, 5, 10))
attention = SDPA(Q.shape[1])
target = np.random.sample((5,10))
loss = L2Loss()
learning_rate = 1e-1
loss_values = []
for _ in tqdm(range(2000)):
	predicted = attention.forward(numpy_zip(Q, K, V))
	loss_value = loss.forward(predicted, target)
	# Track the total loss over all rows for the plot below.
	loss_values.append(np.sum(loss_value))
	# `gradient` holds the deltas for Q, K and V (in that order).
	gradient = attention.backward(loss.backward(predicted, target))
	Q -= gradient[0] * learning_rate
	K -= gradient[1] * learning_rate
	V -= gradient[2] * learning_rate
# Plot the loss curve; it should approach 0 for a working implementation.
go.Figure(go.Scatter(y=loss_values)).show()

Using the helper classes `InputFork`, which repeats an input and returns it as a list, and `SkipAddNorm`, which creates a skip connection around a given module and adds and normalizes its outputs with the inputs, you can fill out the classes `PremultipliedSDPA` and `MultiHeadAttention`.

`PremultipliedSDPA` should contain the three linear modules to transform the inputs $Q$, $K$, and $V$ and an `SDPA`.
During `forward`, split the input `X` into the three inputs, apply the linear transforms and forward the results through the `SDPA`.

`MultiHeadAttention` should fork the input into `n_heads` copies, forward them through each of the `PremultipliedSDPA` heads, concatenate their results and run it through a `Linear` module to reduce the dimension back down to the `latent_dimension`.
Make use of the parent class `ModuleChain` to reduce the required code as much as possible.

In [None]:
class InputFork(NetworkModule):
    """Repeats its input `n_forks` times; the incoming deltas of all forks are summed up."""
    def __init__(self, n_forks):
        self.n_forks = n_forks
    def _forward(self, X):
        """Return a list containing `n_forks` references to `X`."""
        return [X] * self.n_forks
    def backward(self, delta):
        """Sum the deltas of all forks, i.e. along the first axis of `delta`."""
        summed = np.sum(delta, axis=0)
        return summed
class SkipAddNorm(NetworkModule):
    """Skip connection around `skipped_module`, followed by standardization.

    Computes `standardize(X + skipped_module(X))`, where every vector (last
    axis) is centered and divided by its standard deviation.
    """
    def __init__(self, skipped_module):
        super().__init__()
        self.skipped_module = skipped_module
        # Values cached by `_forward` for use in `backward`:
        self.stds = None      # standard deviation of every vector (plus epsilon)
        self.n = None         # length of the standardized vectors
        self.centered = None  # mean-free sum of input and module output
    def _forward(self, X):
        """Return the standardized sum of `X` and `skipped_module.forward(X)`."""
        X2 = self.skipped_module.forward(X)
        added = X + X2
        means = np.mean(added, axis=-1, keepdims=True)
        self.centered = added - means
        self.stds = np.sqrt(np.mean(np.square(self.centered), axis=-1, keepdims=True))
        # Adding a small epsilon to avoid division by zero
        self.stds += 1e-10
        self.n = self.centered.shape[-1]
        standardized_centered = self.centered / self.stds
        return standardized_centered
    def backward(self, delta):
        """Backpropagate through the standardization, the skip connection and the module.

        With `c` the centered vector and `s` its standard deviation, the
        (symmetric) Jacobian of the standardization is
        `(I - 1/n) / s - c c^T / (n s^3)`. It must be evaluated at the centered
        sum `c` cached during `_forward`, not at the raw module input.
        """
        assert self.n == delta.shape[-1]
        # (I - 1/n) . delta / s  ==  (delta - mean(delta)) / s
        centered_delta = delta - np.mean(delta, axis=-1, keepdims=True)
        # c c^T . delta  ==  c * <c, delta>
        projection = np.sum(self.centered * delta, axis=-1, keepdims=True)
        pre_standardized_delta = (
            centered_delta / self.stds
            - self.centered * projection / (self.n * self.stds**3)
        )
        skipped_module_delta = self.skipped_module.backward(pre_standardized_delta)
        # The delta reaches the input both directly (skip) and through the module.
        return pre_standardized_delta + skipped_module_delta
    def step_gradient(self, learning_rate):
        """Forward the gradient step to the wrapped module."""
        self.skipped_module.step_gradient(learning_rate)
class PremultipliedSDPA(NetworkModule):
    """Exercise stub: an `SDPA` whose inputs Q, K and V are each transformed by a linear module first."""
    def __init__(self, latent_dimension):
        super().__init__()
        self.latent_dimension = latent_dimension
        # self.linears = ...
        # self.sdpa = ...
        pass # Your solution here
    def _forward(self, X):
        # X is expected to hold the three inputs Q, K and V.
        Q,K,V = X
        pass # Your solution here
    def backward(self, delta):
        # Should return a triplet of deltas for Q, K, and V respectively.
        pass # Your solution here
    def step_gradient(self, learning_rate):
        pass # Your solution here
class MultiHeadAttention(ModuleChain):
    """Exercise stub: `n_heads` parallel `PremultipliedSDPA` heads whose concatenated outputs
    are reduced back to `latent_dimension` by a `Linear` module."""
    def __init__(self, latent_dimension, n_heads):
        self.latent_dimension = latent_dimension
        self.n_heads = n_heads
        # Call the super constructor
        # super().__init__([...])
        pass # Your solution here

The cell below contains some code to test, whether or not backpropagation through your `MultiHeadAttention` class works.
A single set of inputs is matched against a single output and the weights inside the `MultiHeadAttention` are changed to achieve the desired output.
The loss should approach 0 within the 10k gradient steps, although the approximation quality can vary due to the random initialization.
If the loss is far from 0 for multiple executions, you probably still have a bug in your code.

In [None]:
# Sanity check for `MultiHeadAttention`: fit the weights inside the attention
# module such that a fixed input (Q, K, V) is mapped onto a random target.
Q, K, V = np.random.normal(0,1,(3,20,5))
attention = MultiHeadAttention(Q.shape[1], 5)
target = np.random.sample(Q.shape)
loss = L2Loss()
learning_rate = 1e-1
loss_values = []
for _ in tqdm(range(10_000)):
	predicted = attention.forward(numpy_zip(Q, K, V))
	loss_value = loss.forward(predicted, target)
	# Track the total loss over all rows for the plot below.
	loss_values.append(np.sum(loss_value))
	# Here the inputs stay fixed; only the module weights are updated.
	gradient = attention.backward(loss.backward(predicted, target))
	attention.step_gradient(learning_rate)
# Plot the loss curve; it should approach 0 for a working implementation.
go.Figure(go.Scatter(y=loss_values)).show()

We will again make use of the `OneHotDict` to go from tokens to vectors and back. The class is the same as in the previous notebook but adapted to handling multiple words at once.

In [None]:
class OneHotDict():
    """Translates between tokens of a fixed vocabulary and (sparse) one-hot vectors."""
    def __init__(self, vocabulary):
        # Ensure appropriate types for ordered access
        self.vocabulary = np.array(list(vocabulary))
        # Position of every token within the vocabulary
        self.lookup = {token: index for index, token in enumerate(vocabulary)}
    def word_to_one_hot(self, word):
        """Return the one-hot encoding of a single word as a `csr_array` of shape `(1, |V|)`."""
        assert word in self.lookup
        column = self.lookup[word]
        return csr_array(([1.], ([0],[column])), shape=(1, len(self.vocabulary)))
    def words_to_one_hot(self, words):
        """Return the one-hot encodings of several words as a `csr_array` of shape `(len(words), |V|)`."""
        assert all(word in self.lookup for word in words)
        n_words = len(words)
        columns = [self.lookup[word] for word in words]
        return csr_array(
            (np.ones(n_words), (np.arange(n_words), columns)),
            shape=(n_words, len(self.vocabulary))
        )
    def one_hot_to_word(self, one_hot):
        """Return the word belonging to the first row of `one_hot`."""
        decoded = self.one_hot_to_words(one_hot)
        return decoded[0]
    def one_hot_to_words(self, one_hot):
        """Return, for every row of `one_hot`, the word at the position of its largest entry."""
        assert one_hot.shape[-1] == len(self.vocabulary)
        best_columns = one_hot.argmax(axis=-1).flatten()
        return self.vocabulary[best_columns]

Once again, we load the Hamlet text, since it contains only quite few different tokens.

In [None]:
# Download (if necessary) and read the full Hamlet text.
import nbbootstrap
file_path = await nbbootstrap.ensure_resource("https://dm.cs.tu-dortmund.de/nats/data/hamlet.txt")
with open(file_path, "rt") as file:
    full = file.read()
import re
# Sentences end at '.', '?', '!' or at one or more blank lines.
sentence_regex = re.compile(r"[.?!]|\n\n+")
# Words consist of word characters and apostrophes.
words_regex = re.compile(r"[\w']+", re.U)
# Characters that are removed from every sentence before tokenization.
special_chars = ".:,;?!-_\"'()„“”‚‘’…"
padding_token = "_" # This character can not occur in any of the words.
k_tokens = 7 # How many tokens to base the next word on.
tokenized_sentences = [] # Store your output in this list.
# First split Hamlet into sentences, then tokenize each sentence.
for sentence in sentence_regex.split(full):
    for c in special_chars: sentence = sentence.replace(c, "")
    sentence = sentence.strip()
    if sentence == "": continue
    tokenized = []
    for w in words_regex.findall(sentence):
        tokenized.append(w.lower())
    # Skip sentences with at most two tokens.
    if len(tokenized) <= 2: continue
    # Pad with `k_tokens` padding tokens in front and a single one at the end.
    tokenized_sentences.append([
        *[padding_token for _ in range(k_tokens)],
        *tokenized,
        padding_token,
    ])
# Build the one-hot dictionary from the sorted set of all tokens (including the padding token).
ohd = OneHotDict(np.unique([token for sentence in tokenized_sentences for token in sentence]))

As positional encoding, we will make use of the RoPE rotary embedding.

In [None]:
class RotaryEmbedding(NetworkModule):
    """Rotary positional embedding (RoPE).

    Every consecutive pair of vector components is multiplied with a 2x2
    rotation matrix whose angle depends on the token position and the index
    of the pair. Row `r` of the input is treated as position `self.position + r`.
    """
    def __init__(self, latent_dimension, max_pos=100):
        n_pairs = latent_dimension // 2
        # NOTE(review): the exponent is divided by the number of pairs
        # (latent_dimension // 2) rather than by latent_dimension, which the
        # usual RoPE formulation uses — confirm this is intended.
        self.thetas = 10_000**(-2*np.arange(n_pairs)/n_pairs)
        angles = np.outer(np.arange(max_pos), self.thetas)
        self.sines = np.sin(angles)
        self.cosines = np.cos(angles)
        # Shape (max_pos, n_pairs, 2, 2): entry [m, i] is [[cos, -sin], [sin, cos]].
        self.rotation_matrices = np.stack([
            np.stack([self.cosines, -self.sines], axis=-1),
            np.stack([self.sines, self.cosines], axis=-1),
        ], axis=-2)
        self.position = 0
    def set_position(self, position):
        """Set the position that the first row of the next input corresponds to."""
        self.position = position
    def _apply_rotations(self, values, transposed):
        """Multiply every component pair of every row with its (optionally transposed) rotation matrix."""
        rotated = np.zeros(values.shape)
        n_rows, n_pairs = values.shape[0], values.shape[1]//2
        for row in range(n_rows):
            for pair in range(n_pairs):
                matrix = self.rotation_matrices[self.position+row, pair]
                if transposed:
                    matrix = matrix.T
                columns = slice(2*pair, 2*(pair+1))
                rotated[row, columns] = values[row, columns].dot(matrix)
        return rotated
    def _forward(self, X):
        """Rotate the component pairs of every row of `X` according to its position."""
        return self._apply_rotations(X, transposed=False)
    def backward(self, delta):
        """Propagate `delta` by applying the transposed rotation matrices."""
        return self._apply_rotations(delta, transposed=True)

To obtain a reasonable result faster, we will start with the pretrained word vectors obtained from last week's assignment.

In [None]:
# Download (if necessary) and load the pretrained model stored as gzipped JSON.
import nbbootstrap
file_path = await nbbootstrap.ensure_resource("https://dm.cs.tu-dortmund.de/nats/data/model_2_words_100_dim.json.gz")
import gzip, json
with gzip.open(file_path) as f:
	old_model = json.load(f)
# The weights of the first nested child module serve as initial word vectors.
init_vecs = np.array(old_model["model"]["children"][0]["children"][0]["weights"])
# Overwriting the vocabulary to ensure a proper match between indices and vectors.
# As long as you did not change anything about the text parsing above, this should work.
ohd = OneHotDict(old_model["vocabulary"])

We can finally create the multi headed attention model.
The latent dimension is prescribed by the pre-trained word vectors, whilst the number of `MultiHeadAttention` blocks and the number of heads per block can be parameterized.
We will again enforce normalized word vectors to avoid diverging model weights.

In [None]:
# The latent dimension is given by the pretrained word vectors.
word_vec_dim = init_vecs.shape[1]
n_heads = 5
n_blocks = 2
# A decoder block consists of multi-head attention and a two-layer feed-forward
# network, each wrapped in a skip connection with subsequent normalization.
make_decoder_block = lambda n_heads: ModuleChain([
    SkipAddNorm(ModuleChain([
        # The same input is used as Q, K and V.
        InputFork(3),
        MultiHeadAttention(word_vec_dim, n_heads)
    ])),
    SkipAddNorm(ModuleChain([
        Linear(word_vec_dim, word_vec_dim),
        ReLU(),
        Linear(word_vec_dim, word_vec_dim),
    ])),
])
# Provide rotation matrices for twice as many positions as the longest sentence has tokens.
rotary_embedding = RotaryEmbedding(word_vec_dim, max_pos = 2*max(len(s) for s in tokenized_sentences))
# Layout: embedding -> positional encoding -> decoder blocks -> projection onto the vocabulary -> softmax
decoder_only_model = ModuleChain([
    Linear(len(ohd.vocabulary), word_vec_dim),
    rotary_embedding,
    *[make_decoder_block(n_heads) for _ in range(n_blocks)],
    Linear(word_vec_dim, len(ohd.vocabulary)),
    SparseSoftmax(),
])
# Initialize the embedding layer with (a copy of) the pretrained word vectors.
decoder_only_model.modules[0].weights = init_vecs.copy()
# Normalize embeddings
decoder_only_model.modules[0].weights /= np.linalg.norm(decoder_only_model.modules[0].weights,axis=1,keepdims=True)

Below is the training pipeline. We again sample random positions in random sentences and predict the next word by having inputs and outputs shifted by one token.
You can modify the code at your convenience, run it and see how the model slowly approaches the expected outputs, first in the padding tokens and later in (similar) tokens.

To obtain good results, you will probably have to run this for way more than $10\,000$ iterations.
This is likely not viable in the browser, but you can run it at least for long enough to observe how the model transitions from entirely random tokens to somewhat resembling the expected output.
On repeated executions of this cell, you may want to comment out the code that resets the word vectors to the pretrained ones.

In [None]:
# Training loop: predict the next token for every token of a randomly chosen window.
from ipywidgets import Label
from IPython.display import display
# Raise an exception on division by zero and invalid floating point operations instead of only warning.
np.seterr(divide='raise', invalid='raise')
loss = CELoss()
learning_rate = 1e-1
# Exponentially smoothed loss shown in the progress bar; `float_val` is the smoothing factor.
floating_loss = None
float_val = 0.9999
# Labels that are updated in place to show the current expectation and prediction.
expected_label = Label(value="Expected:",continuous_update=True)
predicted_label = Label(value="Predicted:",continuous_update=True)
probs_label = Label(value="True probabilities:",continuous_update=True)
display(expected_label)
display(predicted_label)
display(probs_label)
with tqdm(range(10_000)) as bar:
    for it in bar:
        # Choose a random sentence and a random window start within that sentence.
        sentence = tokenized_sentences[np.random.randint(len(tokenized_sentences))]
        offset = np.random.randint(min(len(sentence)-k_tokens-2, k_tokens-1),len(sentence)-k_tokens-1)
        # Set rotary encoding position
        decoder_only_model.modules[1].set_position(offset)
        # Inputs and expected outputs are the same window shifted by one token.
        inputs = ohd.words_to_one_hot(sentence[offset:][:k_tokens])
        outputs = ohd.words_to_one_hot(sentence[offset:][1:k_tokens+1])
        predicted = decoder_only_model.forward(inputs)
        decoder_only_model.backward(loss.backward(predicted, outputs))
        decoder_only_model.step_gradient(learning_rate)
        # Reset the embedding vectors to the pretrained vectors during burn-in.
        if it < 1_000:
            decoder_only_model.modules[0].weights = init_vecs.copy()
        # Normalize embeddings
        decoder_only_model.modules[0].weights /= np.linalg.norm(decoder_only_model.modules[0].weights,axis=1,keepdims=True)
        # The loss is reported for the prediction made before the gradient step.
        loss_value = loss.forward(predicted, outputs)
        floating_loss = np.mean(loss_value) if floating_loss is None else floating_loss*float_val + np.mean(loss_value)*(1-float_val)
        bar.desc = f"Floating loss: {floating_loss:>8.4f}"
        # Refresh the labels every 20 iterations only.
        if it%20 == 0:
            expected_label.value = f"Expected: '{' '.join(sentence[offset:][1:k_tokens+1])}'"
            predicted_label.value = f"Predicted: '{' '.join(ohd.one_hot_to_words(predicted))}'"
            # Predicted probability of the expected token at every position.
            probs_label.value = f"True probabilities: {', '.join(list(map(lambda v: f'{v:.5f}', (outputs * predicted).max(axis=1).toarray().flatten())))}"

## Model evaluation

Add code to the cell below to visualize the attention each head puts on each input word to predict the output words.
The token-to-token attention is the matrix produced by the `DenseSoftmax` inside `SDPA`s.
You will thus have to forward the random input through the individual parts of the model manually to obtain these matrices (or use the input caching of the modules).

Run the cell for each head (at least of the first block) multiple times and try to figure out, what each head is paying attention to.

In [None]:
# Parameters to choose what block and what head inside that block
# to visualize token-to-token attention for.
inspected_block = 0
# Selecting a random inputs-outputs-pair.
sentence = tokenized_sentences[np.random.randint(len(tokenized_sentences))]
offset = np.random.randint(min(len(sentence)-k_tokens-2, k_tokens-1), len(sentence)-k_tokens-1)
decoder_only_model.modules[1].set_position(offset)
inputs = ohd.words_to_one_hot(sentence[offset:][:k_tokens])
outputs = ohd.words_to_one_hot(sentence[offset:][1:k_tokens+1])
# Compute the attention of the specified head
# importance = ... <- this should be the matrix of token-to-token attention
#                     where each row belongs to an output word and each
#                     column belongs to an input word.
# The decoder blocks follow the embedding and the positional encoding, hence
# block `inspected_block` is located at index `2 + inspected_block` of the model.
for inspected_head in range(len(decoder_only_model.modules[2 + inspected_block].modules[0].skipped_module.modules[1].modules[1].modules)):
    pass # Your solution here
    # Preparing the input and output words for visualization.
    # The lists need to be duplicate free for plotly visualization,
    # hence, we add some whitespaces to make all tokens "different"
    words_in = sentence[offset:][:k_tokens]
    cnts = {k:0 for k in set(words_in)}
    for i in range(len(words_in)):
        cnts[words_in[i]] += 1
        words_in[i] = f"{words_in[i]}{' '*cnts[words_in[i]]}"
    words_out = sentence[offset:][1:k_tokens+1]
    cnts = {k:0 for k in set(words_out)}
    # Start counting padding tokens at one for the output words.
    cnts[padding_token] = 1
    for i in range(len(words_out)):
        cnts[words_out[i]] += 1
        words_out[i] = f"{words_out[i]}{' '*cnts[words_out[i]]}"
    # Visualize the token-to-token attention in that head.
    go.Figure(
        go.Heatmap(x=words_in,y=words_out,z=importance),
        layout_yaxis_scaleanchor="x",
        layout_yaxis_title="Expected tokens",
        layout_xaxis_title="Input tokens",
        layout_title=f"Attention of head {inspected_head} in block {inspected_block}",
    ).show()

You can also use the model to produce sentences similar to the model of last week.
Start with `k_tokens` times the `padding_token` and feed the attention model with the last `k_tokens` produced tokens.
Sample from the probability distribution *of the last output vector* to produce the next token.
Add the token to the `generated_words` and repeat.

Attention models are somewhat famous for not having a fixed length, yet being sequence-to-sequence models. What happens if you change the number of input tokens?

**Take care:** Remember to set the position of the `rotary_embedding`! The position must be the 0-based index of the first input token.

In [None]:
# Initialization
generated_words = [padding_token for _ in range(k_tokens)]
# generated_words += "hamlet and his".split(" ")
# Print the (optional) non-padding words the generation starts with.
for word in generated_words:
	if word != padding_token: print(word, end=" ")
# Generate until a padding token follows at least one real word, or until 100 tokens are reached.
while (all(v==padding_token for v in generated_words) or generated_words[-1] != padding_token) and len(generated_words) < 100:
	# Process:
	# - set the rotary embedding start position to the number of tokens *before* the first input token
	# - feed forward the last k words as one-hot vectors
	# - sample word according to predicted probabilities
	# - append word to sentence and print the word
	# The solution has to define `next_word`, which is used below.
	pass # Your solution here
	generated_words.append(next_word)
	if next_word != padding_token: print(next_word, end=" ")
print()
print("Generated",len(generated_words),"tokens")

You can of course also use evaluation on the word vectors obtained after training.
We initialized the vectors to pre-trained vectors, but the backpropagation is allowed to modify the word embeddings.
How meaningful do you think these are?

In [None]:
# Placeholder for your own evaluation of the word vectors after training
# (the embeddings are the weights of the first module of `decoder_only_model`).
pass # Your solution here

## Model IO

To save your model or load a stored state, you can use the cells below.
Pickling is a very easy-to-use way of saving models, but somewhat vulnerable to changes in the class code.
Writing a more robust IO code is generally advised (e.g. the JSON to gzip pipeline used for the pretrained vectors), but that takes significantly more effort.

In [None]:
# Saving a model
import pickle
with open("path_to_model.pkl", "wb") as f: pickle.dump(decoder_only_model, f)

In [None]:
# Loading a model
import pickle
with open("path_to_model.pkl", "rb") as f: decoder_only_model = pickle.load(f)