# Preamble

## Variables

In [19]:
# Unpacked positionally into get_text.initialise when the dataset is built.
# NOTE(review): presumably (number of samples, chunk length) -- confirm against get_text.
DATA_AMOUNT = [100_000, 100]
# Also forwarded to get_text.initialise.
# NOTE(review): presumably the fraction of the data kept for training -- confirm.
TRAIN_SPLIT = 0.8

In [20]:
# --- Data pipeline ---
BATCH_SIZE = 64                 # samples per DataLoader batch
VOCAB_SIZE = 28                 # rows in the embedding table (index 27 is the padding id)
NUM_CLASSES = 25                # width of the classifier's output layer

# --- CNN architecture ---
EMBEDDING_DIM = 128             # size of each token embedding
NUM_FILTERS = 128               # output channels of every convolution branch
FILTER_SIZES = [2, 3, 4, 5, 7]  # one Conv1d branch per kernel width
DROPOUT_PROB = 0.5              # dropout applied before the final linear layer

# --- Optimisation ---
NUM_EPOCHS = 10
LR = 1e-2                       # Adam learning rate

## Imports

In [21]:
# Automatically reload imported modules before each cell runs, so edits to them are picked up.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
import get_text
import ceaser
from tqdm.autonotebook import tqdm

In [23]:
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim

In [24]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

## Creating Data

In [25]:
# Build the train/test samples with the cipher's encrypt function.
# NOTE(review): the exact meaning of the positional arguments lives in get_text -- confirm there.
trainData, testData = get_text.initialise(
    ceaser.encrypt,
    *DATA_AMOUNT,
    TRAIN_SPLIT,
    stream=False,
)

Resolving data files:   0%|          | 0/52 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/100000 [00:00<?, ?it/s]

In [26]:
# Split every sample into its input (element 0) and its label (element 1).
train, trainLabel = [], []
for pair in trainData:
    train.append(pair[0])
    trainLabel.append(pair[1])

test, testLabel = [], []
for pair in testData:
    test.append(pair[0])
    testLabel.append(pair[1])

### DataLoaders

In [27]:
class CipherDataset(Dataset):
    """Pairs each encoded chunk with its label, both served as ``torch.long`` tensors.

    Labels are stored as given and shifted down by one on access, so a stored
    label ``k`` is returned as ``k - 1``.
    """

    def __init__(self, chunks, labels):
        """Keep references to the chunk sequences and their matching labels."""
        self.chunks = chunks
        self.labels = labels

    def __len__(self):
        """Number of chunks in the dataset."""
        return len(self.chunks)

    def __getitem__(self, idx):
        """Return ``(sequence, label)`` tensors for the sample at ``idx``."""
        tokens = self.chunks[idx]
        raw_label = self.labels[idx]
        return (
            torch.tensor(tokens, dtype=torch.long),
            # Shift the stored label down by one to obtain the class index.
            torch.tensor(raw_label - 1, dtype=torch.long),
        )

In [28]:
def pad_fn(batch):
    """Collate ``(sequence, label)`` samples into one padded batch.

    Sequences are right-padded with the id 27 to the length of the longest one
    in the batch (batch dimension first); labels are stacked into one tensor.

    Returns:
        A ``(padded_sequences, labels)`` tuple of tensors.
    """
    sequences, targets = map(list, zip(*batch))
    padded = pad_sequence(sequences, batch_first=True, padding_value=27)
    return padded, torch.stack(targets)

In [29]:
train_dataset = CipherDataset(train, trainLabel)
test_dataset = CipherDataset(test, testLabel)
# Reshuffle the training samples every epoch so the optimiser does not see the
# same batches in the same order each time; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=pad_fn)

# CNN Model

In [30]:
class CNNClassifier(nn.Module):
    """Multi-width 1-D CNN classifier over sequences of token ids.

    Token ids are embedded, passed through one ``Conv1d`` branch per kernel
    width, max-pooled over the sequence axis, concatenated, regularised with
    dropout and projected to ``num_classes`` logits.

    Args:
        num_filters: Output channels of every convolution branch.
        filter_sizes: Kernel widths; one convolution branch is built per entry.
        num_classes: Number of output logits.
        dropout_prob: Dropout probability applied before the linear layer.
        embed_dim: Size of each token embedding.
        vocab_size: Number of rows in the embedding table (must exceed the
            padding id 27).
    """

    def __init__(self,
                 num_filters=NUM_FILTERS,
                 filter_sizes=FILTER_SIZES,
                 num_classes=NUM_CLASSES,
                 dropout_prob=DROPOUT_PROB,
                 embed_dim=EMBEDDING_DIM,
                 vocab_size=VOCAB_SIZE):
        super().__init__()
        # Id 27 is the padding token; its embedding is kept out of gradient updates.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=27)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embed_dim, out_channels=num_filters, kernel_size=k)
            for k in filter_sizes
        ])
        self.linear = nn.Linear(len(filter_sizes) * num_filters, num_classes)
        self.dropout = nn.Dropout(dropout_prob)
        # Shortest sequence every convolution branch can process.
        self.min_seq_len = max(filter_sizes, default=1)

    def forward(self, x):
        """Compute class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` holding unnormalised logits.
        """
        # A sequence shorter than the widest kernel would make Conv1d raise, so
        # right-pad it with the padding id up to the minimum workable length.
        shortfall = self.min_seq_len - x.size(1)
        if shortfall > 0:
            x = F.pad(x, (0, shortfall), value=self.embedding.padding_idx)

        # Conv1d expects (batch, channels, seq_len), hence the permute.
        embedded = self.embedding(x).permute(0, 2, 1)
        conved = [F.relu(conv(embedded)) for conv in self.convs]
        # Global max-pool over the sequence axis, one vector per branch.
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = torch.cat(pooled, dim=1)
        dropped = self.dropout(cat)
        return self.linear(dropped)

In [31]:
# Build the classifier with the notebook's default hyperparameters, then move it
# to the selected device. torch.compile is intentionally left disabled here.
model = CNNClassifier()
model = model.to(device)
model

CNNClassifier(
  (embedding): Embedding(28, 128, padding_idx=27)
  (convs): ModuleList(
    (0): Conv1d(128, 128, kernel_size=(2,), stride=(1,))
    (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,))
    (2): Conv1d(128, 128, kernel_size=(4,), stride=(1,))
    (3): Conv1d(128, 128, kernel_size=(5,), stride=(1,))
    (4): Conv1d(128, 128, kernel_size=(7,), stride=(1,))
  )
  (linear): Linear(in_features=640, out_features=25, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

# Training

In [32]:
# Adam over all model parameters, paired with cross-entropy on the raw logits.
optimiser = optim.Adam(params=model.parameters(), lr=LR)
loss_fn = nn.CrossEntropyLoss()

In [33]:
# One pass over the training set followed by a full evaluation on the test set,
# repeated NUM_EPOCHS times; a one-line summary is printed after every epoch.
for epoch in tqdm(range(NUM_EPOCHS)):
    # --- Training phase ---
    model.train()
    running_loss = 0

    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimiser.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimiser.step()
        running_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_train_loss = running_loss / len(train_loader)

    # --- Evaluation phase (dropout disabled, no gradient tracking) ---
    model.eval()
    test_loss = 0
    correct = 0
    total = 0

    with torch.inference_mode():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = loss_fn(outputs, labels)
            test_loss += loss.item()

            predicted = torch.argmax(outputs, dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    avg_test_loss = test_loss / len(test_loader)
    accuracy = 100 * correct / total
    # Report the training loss under "Train Loss" (the test loss used to be printed twice).
    print(f"Train Loss: {avg_train_loss:.4f} | Test Loss: {avg_test_loss:.4f} | Test Accuracy: {accuracy:.1f}%")

  0%|          | 0/10 [00:00<?, ?it/s]

Validation Loss: 0.0010 | Accuracy: 100.00%
Validation Loss: 0.0002 | Accuracy: 100.00%
Validation Loss: 0.0022 | Accuracy: 99.99%
Validation Loss: 0.0000 | Accuracy: 100.00%
Validation Loss: 0.0112 | Accuracy: 99.99%
Validation Loss: 0.0080 | Accuracy: 100.00%
Validation Loss: 0.0071 | Accuracy: 100.00%
Validation Loss: 0.0134 | Accuracy: 99.98%
Validation Loss: 0.0220 | Accuracy: 99.99%
Validation Loss: 0.0289 | Accuracy: 99.99%


# A Simple Test Case

In [37]:
# Sanity check: encrypt a known sentence with a known key and see whether the
# model recovers that key.
model.eval()

test_string = "The quick brown fox jumps over the lazy dog"
true_key = 14
enc_text, _ = ceaser.encrypt(test_string, key=true_key)
print(f"Original Text: '{test_string}'")
print(f"Encrypted Text: '{enc_text}'")

# Encode the ciphertext as token ids and add a leading batch dimension.
encoded_ids = get_text.string2_num_list(enc_text)
input_tensor = torch.tensor(encoded_ids, dtype=torch.long)
input_tensor = input_tensor.unsqueeze(0)
input_tensor = input_tensor.to(device)

with torch.inference_mode():
    pred_logits = model(input_tensor)

# Class indices are zero-based, keys are one-based; shift back by one.
pred_key = pred_logits.argmax(dim=1) + 1
print(f"Model Prediction (Key): {pred_key}")

Original Text: 'The quick brown fox jumps over the lazy dog'
Encrypted Text: 'hvs eiwqy pfckb tcl xiadg cjsf hvs zonm rcu'
Model Prediction (Key): tensor([14], device='cuda:0')
