# Imports and installation


In [1]:
%%capture
!pip install lightning datasets

In [2]:
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader
import torch.nn as nn
import lightning as L
import random

SEED = 124
BATCH_SIZE = 128
HIDDEN_SIZE = 512
NUM_LAYERS= 2
torch.manual_seed(SEED)
L.seed_everything(SEED)

  from .autonotebook import tqdm as notebook_tqdm
Seed set to 124


124

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

# Data Preparation

In [4]:
df = pd.read_csv('../../../Datasets/randomized_shorthex2hex.csv')
df = df[:40960]

In [5]:
df.head()

Unnamed: 0,text,text_hex,deflate_hex
0,this is not a,74686973206973206e6f742061,789c2bc9c82c5600a2bcfc1285440021fe04a7
1,"and gives a comforting,",616e64206769766573206120636f6d666f7274696e672c,789c4bcc4b5148cf2c4b2d56485448cecf4dcb2f2ac9cc...
2,killer). While some may,6b696c6c6572292e205768696c6520736f6d65206d6179,789ccbceccc9492dd2d45308cfc8cc495528cecf4d55c8...
3,in his closet &,696e2068697320636c6f7365742026,789ccbcc53c8c82c5648cec92f4e2d515003002b16052c
4,film to watch. Mr.,66696c6d20746f2077617463682e204d722e,789c4bcbccc95528c957284f2c49ced053f02dd203003d...


Instead of using the standard \<EOS> and \<SOS> tags we're using the letter S and E since they are not present in the vocabulary

In [6]:
ds = Dataset.from_pandas(df)
ds_train_test = ds.train_test_split(test_size=0.2, seed=SEED)
ds_test_dev = ds_train_test['test'].train_test_split(test_size=0.5, seed=SEED)
ds_splits = DatasetDict({
    'train': ds_train_test['train'],
    'valid': ds_test_dev['train'],
    'test': ds_test_dev['test']
})

ds_splits

DatasetDict({
    train: Dataset({
        features: ['text', 'text_hex', 'deflate_hex'],
        num_rows: 32768
    })
    valid: Dataset({
        features: ['text', 'text_hex', 'deflate_hex'],
        num_rows: 4096
    })
    test: Dataset({
        features: ['text', 'text_hex', 'deflate_hex'],
        num_rows: 4096
    })
})

# Tokenizzare in caratteri singoli o in sequenze di caratteri?

In [7]:
ds_splits['train'][0]

{'text': 'wife (Liswood), is a',
 'text_hex': '7769666520284c6973776f6f64292c2069732061',
 'deflate_hex': '789c2bcf4c4b55d0f0c92c2ecfcf4fd1d451c82c56480400492406a7'}

## Data tokenization

In [8]:
token2id = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, "a": 10, "b": 11, "c": 12, "d": 13, "e": 14, "f": 15, "P":16}

In [9]:
def create_id2token_vocab(token_to_id):
    id2token = {}
    for token, id in token_to_id.items():
        id2token[id] = token

    return id2token

id2token = create_id2token_vocab(token2id)
id2token

{0: '0',
 1: '1',
 2: '2',
 3: '3',
 4: '4',
 5: '5',
 6: '6',
 7: '7',
 8: '8',
 9: '9',
 10: 'a',
 11: 'b',
 12: 'c',
 13: 'd',
 14: 'e',
 15: 'f',
 16: 'P'}

In [12]:
# Corrected collate_fn function
def collate_fn(batch):

    texts = [elem['text_hex'] for elem in batch]
    encoded_hexs = [[token2id[char] for char in text] for text in texts]

    outputs = [elem['deflate_hex'] for elem in batch]
    encoded_outputs = [[token2id[char] for char in output] for output in outputs]

    # # Pad the sequences to 256 chars with the padding token
    padded_hex = [torch.Tensor(encoded_hex + [token2id["P"]] * (INPUT_DIM - len(encoded_hex))) for encoded_hex in encoded_hexs]
    padded_outputs = [torch.Tensor(encoded_output + [token2id["P"]] * (INPUT_DIM - len(encoded_output))) for encoded_output in encoded_outputs]

    # # Stack the sequences
    padded_hex = torch.stack(padded_hex).long()
    padded_outputs = torch.stack(padded_outputs).long()


    return {
        'inputs': padded_hex,
        'outputs': padded_outputs
    }

INPUT_DIM = 256

In [13]:
train_dataloader = DataLoader(ds_splits['train'], batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(ds_splits['valid'], batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

torch.set_printoptions(profile="full")

for batch in train_dataloader:
    print(batch['inputs'].shape)
    break

torch.Size([128, 256])


In [14]:
import nltk
from nltk.metrics.distance import edit_distance

def decode_output(output):
    return ''.join([id2token[int(id)] for id in output])

def decode_input(input):
    return ''.join([id2token[int(id)] for id in input])

def evaluate(_device, _print, _cycle):
    model.eval()
    total_distance = 0
    total = 0

    for batch in val_dataloader:
        x = batch["inputs"].to(_device)
        y = batch["outputs"].to(_device)

        y_hat = model(x)
        y_hat = torch.argmax(y_hat, dim=-1)

        output = decode_output(y[0])
        output_hat = decode_output(y_hat[0])

        output = [x for x in output if x != "P"]
        output_hat = [x for x in output_hat if x != "P"]
        
        distance = edit_distance(output, output_hat)
        
        if _print:
            print(f"output = {output}")
            print(f"output_hat = {output_hat}")

        total_distance += distance
        total += 1

        if distance == 0:
            print(f"DISTANCE = 0!")
            print(f"output = {output}")
            print(f"output_hat = {output_hat}")

        if _cycle == False:
            return distance

    return total_distance/total

In [16]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR

EMBED_DIM = 256
HIDDEN_DIM = 512
OUTPUT_DIM = len(token2id)
LEARNING_RATE = 1e-3
DROPOUT_RATE = 0.5

class FeedForward(pl.LightningModule):
    def __init__(self, input_length=INPUT_DIM, dictionary_size=17, embedding_dim=EMBED_DIM, learning_rate=1e-3, 
                 optimizer_type=AdamW, scheduler_type=StepLR, scheduler_step_size=1, scheduler_gamma=0.9):
        super().__init__()
        self.save_hyperparameters()

        # Embedding layer to transform dictionary indices into dense vectors
        self.embedding = nn.Embedding(num_embeddings=input_length, embedding_dim=embedding_dim)

        # Convolutional layers
        self.conv1 = nn.Conv1d(in_channels=embedding_dim, out_channels=embedding_dim, kernel_size=3, padding=1)

        # Fully connected layers for classification
        self.fc1 = nn.Linear(embedding_dim, embedding_dim)  
        self.fc2 = nn.Linear(embedding_dim, dictionary_size)

        # Hyperparameters
        self.learning_rate = learning_rate
        
        self.loss = nn.CrossEntropyLoss()

    def forward(self, x):
        # Embedding layer
        x = self.embedding(x) 

        # Transpose from (batch_size, sequence_length, embedding_dim) to (batch_size, embedding_dim, sequence_length)
        # x = x.permute(0, 2, 1)

        x = torch.relu(self.conv1(x))

        # x = x.permute(0, 2, 1)

        x = torch.relu(self.fc1(x))

        x = self.fc2(x)
        
        return x

    def configure_optimizers(self):
        optimizer = self.hparams.optimizer_type(self.parameters(), lr=self.hparams.learning_rate)
        scheduler = self.hparams.scheduler_type(optimizer, step_size=self.hparams.scheduler_step_size, gamma=self.hparams.scheduler_gamma)
        return [optimizer], [scheduler]

    def step(self, batch, batch_idx):
        x = batch["inputs"]
        y = batch["outputs"]
        y = y.view(y.shape[0] * y.shape[1])
        y_hat = self(x)
        y_hat = y_hat.view(y_hat.shape[0] * y_hat.shape[1], y_hat.shape[2])
        loss = self.loss(y_hat, y)
        return loss

    def training_step(self, batch, batch_idx):  
        loss = self.step(batch, batch_idx)
        self.log('train_loss', loss, prog_bar = True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        loss = self.step(batch, batch_idx)
        self.log('val_loss', loss, prog_bar = True)
        self.log("edit_distance", evaluate(_device = device, _print = False, _cycle = False), prog_bar = True)
        return loss

# Assuming device, train_dataloader, and val_dataloader are defined
model = FeedForward().to(device)

# Initialize a trainer
trainer = pl.Trainer(max_epochs=10)

# Train the model ⚡
trainer.fit(model, train_dataloader, val_dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 65.5 K
1 | conv1     | Conv1d           | 196 K 
2 | fc1       | Linear           | 65.8 K
3 | fc2       | Linear           | 4.4 K 
4 | loss      | CrossEntropyLoss | 0     
-----------------------------------------------
332 K     Trainable params
0         Non-trainable params
332 K     Total params
1.330     Total estimated model params size (MB)


Epoch 9: 100%|██████████| 256/256 [00:08<00:00, 29.92it/s, v_num=33, train_loss=0.105, val_loss=0.118, edit_distance=8.190] 

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 256/256 [00:08<00:00, 29.88it/s, v_num=33, train_loss=0.105, val_loss=0.118, edit_distance=8.190]


In [None]:
print(evaluate(_device = "cpu", _print = True, _cycle = True))

output = ['7', '8', '9', 'c', '2', 'b', 'c', '9', '4', '8', '5', '5', '4', '8', 'c', 'e', '4', '8', '2', 'c', '4', 'a', '4', 'c', '2', 'e', '4', '9', '2', 'd', '2', 'a', '5', '6', 'c', '8', '2', 'f', '5', '2', '2', '8', 'c', 'e', '2', 'c', '2', '9', '4', 'd', '2', 'c', 'c', '9', 'c', 'c', 'c', 'f', 'd', '3', '0', '3', '0', '0', '9', 'a', 'b', 'f', '0', 'a', 'b', '1']
output_hat = ['7', '8', '9', 'c', '2', 'b', 'c', '9', '4', '8', '5', '5', '4', '8', 'c', 'e', '4', '8', '2', 'c', '4', 'a', '4', 'c', '2', 'e', '4', '9', '2', 'd', '2', 'a', '5', '6', 'c', '8', '2', 'f', '5', '2', '2', '8', 'c', 'e', '2', 'c', '2', '9', '4', 'd', '2', 'c', 'c', '9', 'c', 'c', 'c', 'f', 'd', '3', '0', '3', '0', '0', '9', 'a', 'a', 'a', '0', 'a']
output = ['7', '8', '9', 'c', 'c', 'b', 'c', 'c', '4', 'b', '4', 'f', 'c', 'd', 'c', 'b', 'c', 'c', '2', 'f', '2', 'd', '5', '6', '7', '0', 'c', 'f', '4', 'c', 'c', 'c', 'c', '9', 'c', '9', '5', '7', '5', '4', '5', '0', '7', '2', 'c', '9', 'c', 'f', '5', '3', '2', '

output = ['7', '8', '9', 'c', 'c', 'b', 'c', 'f', '5', '3', '2', '8', 'c', '9', '5', '7', '2', '8', 'c', '9', 'c', '8', '2', 'c', '5', '6', '2', '8', 'c', 'e', '2', 'c', '4', '9', '0', '5', '0', '0', '2', 'b', '8', '2', '0', '5', '8', 'e']
output_hat = ['7', '8', '9', 'c', 'c', 'b', 'c', 'f', '5', '3', '2', '8', 'c', '9', '5', '7', '2', '8', 'c', '9', 'c', '8', '2', 'c', '5', '6', '2', '8', 'c', 'e', '2', 'c', '4', '9', '0', '5', '0', '0', '2', 'b', 'b', '7', '0', '5', '7', 'e']
output = ['7', '8', '9', 'c', '0', 'b', '4', '8', '2', 'd', '4', '9', '2', 'd', '5', '2', 'f', '0', '4', 'a', '4', 'c', 'c', 'e', '2', 'e', 'c', 'e', 'c', 'f', 'd', '3', '5', '1', 'f', '0', '4', '8', 'a', 'c', '4', 'c', 'c', 'c', '5', '7', 'f', '0', 'c', 'd', 'a', 'c', '4', 'c', 'a', 'c', '4', 'a', 'c', 'c', 'c', 'e', 'd', '4', 'd', '3', 'd', '3', 'd', '3', '0', '4', '0', '0', 'c', '6', 'e', 'e', '0', 'b', '3', 'a']
output_hat = ['7', '8', '9', 'c', '0', 'b', '4', '8', '2', 'd', '4', '9', '2', 'd', '5', '2', 'f

# Model

# Training