# Preamble

The first attempt at using an LSTM model to decrypt data in a Seq2Seq manner is clearly not working very well, while training loss is decreasing very slowly there is little to no change to the testing loss, suggesting overfitting. This is because the Seq2Seq method is much harder to train than the classification method hence we expect to need more data and more training, one of the significant difficulties in this regard is how slow the model it.

## Variables

In [1]:
PAD_TOKEN = 27
SOS_TOKEN = 29
EOS_TOKEN = 28

In [2]:
DATA_AMOUNT = [50000, 150]
FIXED_LENGTH = int(DATA_AMOUNT[1]*5)
TRAIN_SPLIT = 0.8
STREAM = False

In [23]:
BATCH_SIZE = 64
EMBEDDING_DIM = 64
HIDDEN_DIM = 64
LAYERS = 2
ENCODE_DROPOUT, DECODE_DROPOUT = (0.5, 0.5)
VOCAB_SIZE = 30
NUM_EPOCHS = 5
LR = 0.001
CLIP = 1.0
TEACHER_FORCING_RATIO = 0

## Imports

In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
from modules import lstm_data
from modules import caesar
from modules import get_text

from tqdm.autonotebook import tqdm
import random
import os

In [6]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

## Creating Data

In [8]:
encryption = lambda x: caesar.encrypt(x, key=14)[0]

trainData, testData = lstm_data.initialise(encryption, *DATA_AMOUNT, TRAIN_SPLIT, stream=STREAM, fixed_length=FIXED_LENGTH )
train_loader, test_loader = lstm_data.data2loader(trainData, testData, BATCH_SIZE=BATCH_SIZE)

Resolving data files:   0%|          | 0/52 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/50000 [00:00<?, ?it/s]

# Model


The model used borrows heavily from the [PyTorch Seq2Seq tutorial](https://github.com/bentrevett/pytorch-seq2seq/blob/main/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb)

In [9]:
class Encoder(nn.Module):
    def __init__(self, input_dim=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, n_layers=LAYERS, dropout=ENCODE_DROPOUT):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, X):
        embedded = self.dropout(self.embedding(X))
        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

In [10]:
class Decoder(nn.Module):
    def __init__(self, output_dim=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, n_layers=LAYERS, dropout=DECODE_DROPOUT):
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

In [11]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert (
            encoder.hidden_dim == decoder.hidden_dim
        ), "Hidden dimensions of encoder and decoder must be equal"
        assert (
            encoder.n_layers == decoder.n_layers
        ), "Encoder and decoder must have equal number of layers"

    def forward(self, src, trg, teacher_forcing_ratio):
        batch_size = trg.shape[1]
        trg_length = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_length, batch_size, trg_vocab_size).to(self.device)
        hidden, cell = self.encoder(src)
        input = trg[0, :]
        for t in range(1, trg_length):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1
        return outputs


In [12]:
model = Seq2Seq(Encoder(), Decoder(), device).to(device)
if os.name != "nt":
    model = torch.compile(model)
    print("compiled")
model

compiled


OptimizedModule(
  (_orig_mod): Seq2Seq(
    (encoder): Encoder(
      (embedding): Embedding(30, 64)
      (rnn): LSTM(64, 64, num_layers=2, dropout=0.5)
      (dropout): Dropout(p=0.5, inplace=False)
    )
    (decoder): Decoder(
      (embedding): Embedding(30, 64)
      (rnn): LSTM(64, 64, num_layers=2, dropout=0.5)
      (fc_out): Linear(in_features=64, out_features=30, bias=True)
      (dropout): Dropout(p=0.5, inplace=False)
    )
  )
)

In [13]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)


model.apply(init_weights)

OptimizedModule(
  (_orig_mod): Seq2Seq(
    (encoder): Encoder(
      (embedding): Embedding(30, 64)
      (rnn): LSTM(64, 64, num_layers=2, dropout=0.5)
      (dropout): Dropout(p=0.5, inplace=False)
    )
    (decoder): Decoder(
      (embedding): Embedding(30, 64)
      (rnn): LSTM(64, 64, num_layers=2, dropout=0.5)
      (fc_out): Linear(in_features=64, out_features=30, bias=True)
      (dropout): Dropout(p=0.5, inplace=False)
    )
  )
)

# Training

In [14]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 138,910 trainable parameters


In [None]:
optimiser = optim.Adam(model.parameters(), lr=LR)
loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)

In [16]:
def train_fn(model, clip, teacher_forcing_ratio, data_loader=train_loader, optimiser=optimiser, loss_fn=loss_fn,  device=device):
    model.train()
    epoch_loss = 0
    for i, (X, y) in enumerate(data_loader):
        X, y = X.to(device).permute(1, 0), y.to(device).permute(1, 0)
        optimiser.zero_grad()
        output = model(X, y, teacher_forcing_ratio)
        output_dim = output.shape[-1]
        output = output[1:].contiguous().view(-1, output_dim)
        y = y[1:].contiguous().view(-1)
        loss = loss_fn(output, y)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimiser.step()
        epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

In [17]:
def evaluate_fn(model, data_loader=test_loader, loss_fn=loss_fn, device=device):
    model.eval()
    epoch_loss = 0
    with torch.inference_mode():
        for i, (X, y) in enumerate(data_loader):
            X, y = X.to(device).permute(1, 0), y.to(device).permute(1, 0)
            output = model(X, y, 0)  # turn off teacher forcing
            output_dim = output.shape[-1]
            output = output[1:].contiguous().view(-1, output_dim)
            y = y[1:].contiguous().view(-1)
            loss = loss_fn(output, y)
            epoch_loss += loss.item()
    return epoch_loss / len(data_loader)



In [21]:
for epoch in tqdm(range(NUM_EPOCHS)):
    train_loss = train_fn(
        model=model,
        clip=CLIP,
        teacher_forcing_ratio=TEACHER_FORCING_RATIO, 
    )
    valid_loss = evaluate_fn(
        model,
    )
    print(f"Train Loss: {train_loss:.3f} | Test Loss {valid_loss:.3f}")

  0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 2.603 | Test Loss 2.854
Train Loss: 2.579 | Test Loss 2.853
Train Loss: 2.565 | Test Loss 2.854
Train Loss: 2.554 | Test Loss 2.852
Train Loss: 2.546 | Test Loss 2.852
Train Loss: 2.541 | Test Loss 2.853
Train Loss: 2.536 | Test Loss 2.979
Train Loss: 2.533 | Test Loss 2.856
Train Loss: 2.528 | Test Loss 2.850
Train Loss: 2.522 | Test Loss 2.854


# Example useage

In [22]:
model.eval()
test_string = "The quick brown fox jumps over the lazy dog"
true_key = 14
enc_text, _ = caesar.encrypt(test_string, key=true_key)
print(f"Original Text: '{test_string}'")
print(f"Encrypted Text: '{enc_text}'")

input_tensor = torch.tensor([SOS_TOKEN] + get_text.string2_num_list(enc_text) + [EOS_TOKEN], dtype=torch.long).unsqueeze(1).to(device)

with torch.inference_mode():
    trg_tensor = torch.zeros(len(input_tensor), 1, dtype=torch.long).to(device)
    output = model(input_tensor, trg_tensor, 0) # teacher_forcing_ratio = 0
predicted_indexes = output.argmax(2).squeeze(1)
predicted_chars = []
print(predicted_indexes)
for idx in predicted_indexes:
    if idx.item() == EOS_TOKEN:
        predicted_chars.append("@")
    if idx.item() == 0:
        predicted_chars.append(" ")
    else:
        predicted_chars.append(chr(idx.item() -1 + ord('a')))

predicted_text = "".join(predicted_chars)
print(f"Model Prediction (Text): {predicted_text}")

Original Text: 'The quick brown fox jumps over the lazy dog'
Encrypted Text: 'hvs eiwqy pfckb tcl xiadg cjsf hvs zonm rcu'
tensor([ 0, 20,  8,  5,  0,  0,  1,  5,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0], device='cuda:0')
Model Prediction (Text):  the  ae                                     


# Save the Model

In [24]:
torch.save(obj=model.state_dict(), f="models/01_LSTM_attempt_1.pt")