# Preamble

The goal of this is as a first, very basic, proof of concept of the idea to use a neural network to break basic codes. The first code we attempt to break is a Caesar cipher. To do so we use a CNN to classify the encrypted text by the key that will decrypt it. This approach works well, when trained on the Guttenberg Project from [Hugging Face](https://huggingface.co/datasets/manu/project_gutenberg) we get very close to 100% accuracy with small amounts of training time and no hyperparameter tuning. The structure of the model was largely inspired by [Understanding Convolutional Neural Networks for NLP](https://dennybritz.com/posts/wildml/understanding-convolutional-neural-networks-for-nlp/).

## Variables

In [1]:
DATA_AMOUNT = [10000, 1000]
TRAIN_SPLIT = 0.8
STREAM = False

In [2]:
BATCH_SIZE = 64
EMBEDDING_DIM = 128
NUM_FILTERS = 128
FILTER_SIZES = [2, 3, 4, 5, 7]
DROPOUT_PROB = 0.5
NUM_CLASSES = 25
VOCAB_SIZE = 28
NUM_EPOCHS = 10
LR = 0.01

## Imports

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from modules.data_handling import caesar_cnn_data
from modules.data_handling import get_text
from modules.encryption import caesar

from tqdm.autonotebook import tqdm

In [5]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

## Creating Data

In [7]:
trainData, testData = caesar_cnn_data.initialise(*DATA_AMOUNT, TRAIN_SPLIT, stream=STREAM)
train_loader, test_loader = caesar_cnn_data.data2loader(trainData, testData, BATCH_SIZE=BATCH_SIZE)

Resolving data files:   0%|          | 0/52 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

# CNN Model

In [8]:
class CNNCodeBreaker(nn.Module):
    def __init__(self,
                 num_filters=NUM_FILTERS,
                 filter_sizes=FILTER_SIZES,
                 num_classes=NUM_CLASSES,
                 dropout_prob=DROPOUT_PROB,
                 embed_dim=EMBEDDING_DIM,
                 vocab_size=VOCAB_SIZE):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=27)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embed_dim, out_channels=num_filters, kernel_size=k)
            for k in filter_sizes
        ])
        self.linear = nn.Linear(len(filter_sizes) * num_filters, num_classes)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x):
        embedded = self.embedding(x).permute(0, 2, 1)
        conved = [F.relu(conv(embedded)) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = torch.cat(pooled, dim=1)
        dropped = self.dropout(cat)
        return self.linear(dropped)

In [9]:
model = CNNCodeBreaker().to(device)
model

CNNCodeBreaker(
  (embedding): Embedding(28, 128, padding_idx=27)
  (convs): ModuleList(
    (0): Conv1d(128, 128, kernel_size=(2,), stride=(1,))
    (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,))
    (2): Conv1d(128, 128, kernel_size=(4,), stride=(1,))
    (3): Conv1d(128, 128, kernel_size=(5,), stride=(1,))
    (4): Conv1d(128, 128, kernel_size=(7,), stride=(1,))
  )
  (linear): Linear(in_features=640, out_features=25, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

# Training

In [10]:
loss_fn = nn.CrossEntropyLoss()
optimiser = optim.Adam(model.parameters(), lr=LR)

In [11]:
for epoch in tqdm(range(NUM_EPOCHS)):
    model.train()
    running_loss = 0

    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimiser.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimiser.step()
        running_loss += loss.item()

    avg_train_loss = running_loss / len(train_loader)

    model.eval()
    test_loss = 0
    correct = 0
    total = 0

    with torch.inference_mode():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = loss_fn(outputs, labels)
            test_loss += loss.item()

            predicted = torch.argmax(outputs, dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    avg_test_loss = test_loss / len(test_loader)
    accuracy = 100 * correct / total
    print(f"Epoch {epoch:02d}/{NUM_EPOCHS} | Train Loss: {avg_test_loss:.4f} | Test Loss: {avg_test_loss:.4f} | Test Accuracy: {accuracy:.1f}%")

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 00/10 | Train Loss: 0.0001 | Test Loss: 0.0001 | Test Accuracy: 100.0%


KeyboardInterrupt: 

## Saving

In [None]:
torch.save(model.state_dict(), "models/00_caesar_cnn.pt")

# Example Useage

In [None]:
model.eval()
test_string = "The quick brown fox jumps over the lazy dog"
true_key = 14
enc_text, _ = caesar.encrypt(test_string, key=true_key)
print(f"Original Text: '{test_string}'")
print(f"Encrypted Text: '{enc_text}'")
input_tensor = torch.tensor(get_text.string2_num_list(enc_text), dtype=torch.long).unsqueeze(0).to(device)
with torch.inference_mode():
    pred_logits = model(input_tensor)
pred_key = torch.argmax(pred_logits, dim=1) + 1
print(f"Model Prediction (Key): {pred_key}")

Original Text: 'The quick brown fox jumps over the lazy dog'
Encrypted Text: 'hvs eiwqy pfckb tcl xiadg cjsf hvs zonm rcu'
Model Prediction (Key): tensor([22], device='cuda:0')
