In [1]:
import torch
import seqgen.seq_gen as g

%load_ext autoreload
%autoreload 2

In [2]:
if torch.cuda.device_count():
    device="cuda"
else:
    device="cpu"
print("Device", device)

Device cuda


In [3]:
features, target_seqs = g.generate_synthetic_training_data(3, max_length=10, device=device)
input_seqs = torch.Tensor(features[:, :, 0]).to(torch.int64)
coordinates = torch.Tensor(features[:, :, 1:])

In [4]:
features.shape, input_seqs.shape, coordinates.shape, target_seqs.shape

(torch.Size([3, 10, 5]),
 torch.Size([3, 10]),
 torch.Size([3, 10, 4]),
 torch.Size([3, 10]))

In [5]:
# Show the first three encoded input sequences
input_seqs[0:3]

tensor([[ 0,  6,  9, 12,  1,  1,  1,  1,  1,  1],
        [ 0, 11, 13, 12, 12,  4,  8,  6,  7,  1],
        [ 0,  3,  1,  1,  1,  1,  1,  1,  1,  1]], device='cuda:0')

In [6]:
# Show the coordinates of the tokens of the first input sequence
coordinates[0]

tensor([[0.0000, 0.0000, 0.0000, 0.0000],
        [0.4963, 0.5521, 0.7857, 0.9137],
        [0.6651, 0.6081, 0.9589, 1.0000],
        [0.7272, 0.6422, 1.0000, 0.9667],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000]], device='cuda:0')

In [7]:
# Show the first three encoded output sequences
target_seqs[0:3]

tensor([[ 0,  6,  9, 12,  1,  1,  1,  1,  1,  1],
        [ 0, 11, 13, 12, 12,  4,  8,  6,  7,  1],
        [ 0,  3,  1,  1,  1,  1,  1,  1,  1,  1]], device='cuda:0')

## Embedding Layer

The embedding layers maps each token to a vector space of dimension $\mathbb{R}^{D_{emb}}$.
If we have an input sequence `[5,3,4]` and $D_{emb} = 2$ the output may look like this: `[[0.319, 0.841], [0.781, 0.682], [0.432,0.968]]`.

The embedding layer expects an input sequence of type `int` where each integer in the input sequence represents a class. The total number of distinct possible classes of the input sequence is called the vocabulary size $N_{vocab}$.

In [9]:
emb = torch.nn.Embedding(num_embeddings=17, embedding_dim=2).to(device)
x_emb = emb(input_seqs)
x_emb.shape

torch.Size([3, 10, 2])

In [10]:
# Show embedding of first input sequence
x_emb[0]

tensor([[-1.0152,  0.6046],
        [ 2.5122, -0.5076],
        [-0.0699, -1.7749],
        [ 0.8289, -0.3582],
        [-0.4849,  0.2877],
        [-0.4849,  0.2877],
        [-0.4849,  0.2877],
        [-0.4849,  0.2877],
        [-0.4849,  0.2877],
        [-0.4849,  0.2877]], device='cuda:0', grad_fn=<SelectBackward0>)

## LSTM Layer

The LSTM layer implements recursion in a neural net. It will need three hyperparameters:
- **input_size**: This is the dimension of the input vectors that are run through the LSTM layer. If the vectors have been run to an embedding layer before input_size must be equal to the argument embedding_dim of the embedding layer
- **hidden_size**: This is the dimension of the internal state vector $h_n$, which is identical to the dimension of the cell state $c_n$ and the dimension of the output vectors $out$. The hidden size can be freely chosen by you. Small values for hidden_size may leed to underfitting, but large values can cause overfitting.
- **num_layers**: This parameter defines how many layers of LSTMs are stacked in the network. The more layers you stack the more complex patterns the LSTM is able to model, but this also comes with te risk of overfitting the data.

There is also another important parameter:
- **batch_first**: If the input tensor of the LSTM layer is of shape `(batch_size, sequence_length, embedding_dim)` you will have to set this parameter to True. Otherwise if the input is of shape `(sequence_length, embedding_dim, batch_size)` you will have to set this parameter to false.

Now let's look at the outputs of the LSTM layer:
- **output**: This is the predicted tensor of the LSTM layer which will be passed to the next layer. You may add a linear classification and a softmax layer after the LSTM layer. The output tensor is of shape `(batch_size, sequence_length, hidden_size)` if `batch_first` is set to true.
- **h_n**: Hidden state, tensor of shape `(num_layers, batch_size, hidden_size)`
- **c_n**: Cell state, tensor of shape `(num_layers, batch_size, hidden_size)`

In [12]:
hidden_size=4
lstm = torch.nn.LSTM(input_size=2, hidden_size=hidden_size, num_layers=7, batch_first=True).to(device)
lstm_output, (h_n, c_n) = lstm(x_emb)

In [13]:
lstm_output.shape, h_n.shape, c_n.shape

(torch.Size([3, 10, 4]), torch.Size([7, 3, 4]), torch.Size([7, 3, 4]))

## Linear classifier

After the tensors have been passed trough the LSTM layer it is time to implement a classification of these tensors. The linear layer's task is to take the output of the LSTM layer and map it to the output classes. In language models these classes would be the characters or words of the output vocabulary. There are two hyperparameters of the linear layer that we have to set:

- **in_features**: This is the dimension of the vectors that represent the words in our sequences. When these vectors come from an LSTM layer the dimension of the input features is equal to the hidden_size value of the LSTM layer.
- **out_features**: The dimension of the output vectors of the linear layer is equal to the number of characters / words of our output vocabulary. If we want to produce englisch sentences with our model and there are 5000 possible words in our vocabulary this parameter's value would be 5000.

The output of the linear layer is of shape `(batch_size, sequence_length, target_vocab_size)`

In [14]:
output_size = 5000
linear = torch.nn.Linear(in_features=hidden_size, out_features=output_size).to(device)
linear_output = linear(lstm_output)
linear_output.shape

torch.Size([3, 10, 5000])

## Softmax function

The purpose of the softmax layer is to compute a probability for each position and each word of the output vocabulary.

In [15]:
softmax = torch.nn.LogSoftmax(dim=1)
softmax_output = softmax(linear_output)
softmax_output.shape

torch.Size([3, 10, 5000])

# The Encoder

In [16]:
from seqgen.model import seq2seq_lstm

In [23]:
lr = 1e-2
embedding_dim = 20
hidden_size=20

encoder = seq2seq_lstm.EncoderRNN(vocab_size=17, embedding_dim=embedding_dim, hidden_size=hidden_size).to(features.device)
decoder = seq2seq_lstm.DecoderRNN(hidden_size=hidden_size, vocab_size=23).to(features.device)

# Initialize optimizer for encoder and decoder
encoder_optimizer = torch.optim.SGD(encoder.parameters(), lr=lr)
decoder_optimizer = torch.optim.SGD(decoder.parameters(), lr=lr)

# Loss function
criterion = torch.nn.NLLLoss()

In [24]:
# Initialize the encoder hidden state and cell state with zeros
hn = encoder.initHidden(input_seqs.shape[0], device=features.device)
cn = encoder.initHidden(input_seqs.shape[0], device=features.device)

# Iterate over the sequence words and run every word through the encoder
for i in range(input_seqs.shape[1]):
    # Run the i-th word of the input sequence through the encoder.
    # As a result we will get the prediction (output), the hidden state and the cell state.
    # The hidden state and cell state will be used as inputs in the next round
    print(f"Run word {i+1} of all {input_seqs.shape[0]} sequences through the encoder")
    output, (hn, cn) = encoder(input_seqs[:, i].unsqueeze(dim=1), (hn, cn))

Run word 1 of all 500 sequences through the encoder
Run word 2 of all 500 sequences through the encoder
Run word 3 of all 500 sequences through the encoder
Run word 4 of all 500 sequences through the encoder
Run word 5 of all 500 sequences through the encoder
Run word 6 of all 500 sequences through the encoder
Run word 7 of all 500 sequences through the encoder
Run word 8 of all 500 sequences through the encoder
Run word 9 of all 500 sequences through the encoder
Run word 10 of all 500 sequences through the encoder
Run word 11 of all 500 sequences through the encoder
Run word 12 of all 500 sequences through the encoder
Run word 13 of all 500 sequences through the encoder
Run word 14 of all 500 sequences through the encoder
Run word 15 of all 500 sequences through the encoder
Run word 16 of all 500 sequences through the encoder
Run word 17 of all 500 sequences through the encoder
Run word 18 of all 500 sequences through the encoder
Run word 19 of all 500 sequences through the encoder
Ru

In [25]:
output.shape, hn.shape, cn.shape

(torch.Size([500, 1, 20]), torch.Size([1, 500, 20]), torch.Size([1, 500, 20]))

# The Decoder

In [26]:
loss = 0

# Iterate over words of target sequence and run words through the decoder.
# This will produce a prediction for the next word in the sequence
for i in range(0, target_seqs.size(1)):
    print(f"Run word {i+1} through decoder")
    output, (hn, cn) = decoder(target_seqs[:, 0].unsqueeze(dim=1), (hn, cn))
    loss += criterion(output.squeeze(), target_seqs[:, i])

print("LOSS", loss)

Run word 1 through decoder
Run word 2 through decoder
Run word 3 through decoder
Run word 4 through decoder
Run word 5 through decoder
Run word 6 through decoder
Run word 7 through decoder
Run word 8 through decoder
Run word 9 through decoder
Run word 10 through decoder
Run word 11 through decoder
Run word 12 through decoder
Run word 13 through decoder
Run word 14 through decoder
Run word 15 through decoder
Run word 16 through decoder
Run word 17 through decoder
Run word 18 through decoder
Run word 19 through decoder
Run word 20 through decoder
LOSS tensor(63.9882, device='cuda:0', grad_fn=<AddBackward0>)


In [31]:
history = []

for epoch in range(2000):
    features, target_seqs = g.generate_synthetic_training_data(500, max_length=20, device=device)
    features = features.to(device)
    target_seqs = target_seqs.to(device)
    input_seqs = torch.Tensor(features[:, :, 0]).to(torch.int64)
    coordinates = torch.Tensor(features[:, :, 1:])

    # Initialize the encoder hidden state and cell state with zeros
    hn = encoder.initHidden(input_seqs.shape[0], device=features.device)
    cn = encoder.initHidden(input_seqs.shape[0], device=features.device)

    # Iterate over the sequence words and run every word through the encoder
    for i in range(input_seqs.shape[1]):
        # Run the i-th word of the input sequence through the encoder.
        # As a result we will get the prediction (output), the hidden state and the cell state.
        # The hidden state and cell state will be used as inputs in the next round
        output, (hn, cn) = encoder(input_seqs[:, i].unsqueeze(dim=1), (hn, cn))

    # Set gradients of all model parameters to zero
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    loss = 0

    # Iterate over words of target sequence and run words through the decoder.
    # This will produce a prediction for the next word in the sequence
    for i in range(0, target_seqs.size(1)):
        # Run word i+1 through decoder
        output, (hn, cn) = decoder(target_seqs[:, i].unsqueeze(dim=1), (hn, cn))
        loss += criterion(output.squeeze(), target_seqs[:, i])

    history.append(loss)
    if not epoch % 100:
        print(f"LOSS after epoch {epoch}", loss)

    # Compute gradient
    loss.backward()

    # Update weights of encoder and decoder
    encoder_optimizer.step()
    decoder_optimizer.step()

LOSS after epoch 0 tensor(45.7595, device='cuda:0', grad_fn=<AddBackward0>)
LOSS after epoch 100 tensor(31.6045, device='cuda:0', grad_fn=<AddBackward0>)
LOSS after epoch 200 tensor(16.0011, device='cuda:0', grad_fn=<AddBackward0>)
LOSS after epoch 300 tensor(5.9200, device='cuda:0', grad_fn=<AddBackward0>)
LOSS after epoch 400 tensor(2.6844, device='cuda:0', grad_fn=<AddBackward0>)
LOSS after epoch 500 tensor(1.5140, device='cuda:0', grad_fn=<AddBackward0>)
LOSS after epoch 600 tensor(1.0296, device='cuda:0', grad_fn=<AddBackward0>)
LOSS after epoch 700 tensor(0.7734, device='cuda:0', grad_fn=<AddBackward0>)
LOSS after epoch 800 tensor(0.5999, device='cuda:0', grad_fn=<AddBackward0>)
LOSS after epoch 900 tensor(0.4919, device='cuda:0', grad_fn=<AddBackward0>)
LOSS after epoch 1000 tensor(0.4297, device='cuda:0', grad_fn=<AddBackward0>)
LOSS after epoch 1100 tensor(0.3630, device='cuda:0', grad_fn=<AddBackward0>)
LOSS after epoch 1200 tensor(0.3190, device='cuda:0', grad_fn=<AddBackwar

#### Save model history

In [32]:
import pickle
from datetime import datetime

model_data = {
    "history": history,
    "lr": lr,
    "embedding_dim": embedding_dim,
    "hidden_size": hidden_size
}

now = datetime.now() # current date and time
date_time = now.strftime("%Y-%m-%d_%H-%M-%S")
"training_" + date_time

with open("training_" + date_time+ '.pkl', 'wb') as f:
    pickle.dump(model_data, f)

## Make predictions

We run our input sequences through the model and get output seuences. Then we decode the output sequences with the Vocabulary class and get our final latex code.

In [33]:
from seqgen.vocabulary import *

vocab_in = Vocabulary(vocab_filename="seqgen/vocab_in.txt")
vocab_out = Vocabulary(vocab_filename="seqgen/vocab_out.txt")

predictions = torch.zeros(target_seqs.shape)

with torch.no_grad():
    # Initialize the encoder hidden state and cell state with zeros
    hn = encoder.initHidden(input_seqs.shape[0], device=features.device)
    cn = encoder.initHidden(input_seqs.shape[0], device=features.device)

    # Iterate over the sequence words and run every word through the encoder
    for i in range(input_seqs.shape[1]):
        output, (hn, cn) = encoder(input_seqs[:, i].unsqueeze(dim=1), (hn, cn))

    for i in range(0, target_seqs.size(1)):
        output, (hn, cn) = decoder(target_seqs[:, i].unsqueeze(dim=1), (hn, cn))
        predicted_char = torch.argmax(output, dim=2)
        predictions[:, i] = torch.argmax(output, dim=2).squeeze()

In [34]:
# Pick random sequence and run it through the model
import random

i = random.randint(0, predictions.size(0))
print("MODEL INPUT", vocab_in.decode_sequence(input_seqs[i].cpu().numpy()))
print("MODEL OUTPUT", vocab_out.decode_sequence(predictions[i].numpy()))

MODEL INPUT ['<start>', '7', '7', '9', '9', '8', 'op_divide', '9', '5', '3', '1', '1', '7', 'op_minus', 'op_minus', '6', '4', '7', 'op_minus', '<end>']
MODEL OUTPUT ['<start>', '7', '7', '9', '9', '8', '/', '9', '5', '3', '1', '1', '7', '-', '-', '6', '4', '7', '-', '<end>']
