# Imports and installation


In [1]:
%%capture
!pip install lightning datasets

In [2]:
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader
import torch.nn as nn
import lightning as L
import random

SEED = 124
BATCH_SIZE = 128
HIDDEN_SIZE = 512
NUM_LAYERS= 2
torch.manual_seed(SEED)
L.seed_everything(SEED)

  from .autonotebook import tqdm as notebook_tqdm
Seed set to 124


124

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

# Data Preparation

In [6]:
df = pd.read_csv('../../../Datasets/shorthex2hex.csv')
df = df[:40960]

In [7]:
df.head()

Unnamed: 0,text,text_hex,deflate_hex
0,One of the other,4f6e65206f6620746865206f74686572,789cf3cf4b55c84f5328c9005240a208002eb405bb
1,A wonderful little production.,4120776f6e64657266756c206c6974746c652070726f64...,789c735428cfcf4b492d4a2bcd51c8c92c29c949552828...
2,I thought this was,492074686f75676874207468697320776173,789cf35428c9c82f4dcf2801d299c50ae589c5003dea06b0
3,Basically there's a family,4261736963616c6c79207468657265277320612066616d...,789c734a2cce4c4eccc9a95428c9482d4a552f56485448...
4,"Petter Mattei's ""Love in",506574746572204d6174746569277320224c6f766520696e,789c0b482d29492d52f04d045299eac50a4a3ef965a90a...


Instead of using the standard \<EOS> and \<SOS> tags we're using the letter S and E since they are not present in the vocabulary

In [8]:
ds = Dataset.from_pandas(df)
ds_train_test = ds.train_test_split(test_size=0.2, seed=SEED)
ds_test_dev = ds_train_test['test'].train_test_split(test_size=0.5, seed=SEED)
ds_splits = DatasetDict({
    'train': ds_train_test['train'],
    'valid': ds_test_dev['train'],
    'test': ds_test_dev['test']
})

ds_splits

DatasetDict({
    train: Dataset({
        features: ['text', 'text_hex', 'deflate_hex'],
        num_rows: 32768
    })
    valid: Dataset({
        features: ['text', 'text_hex', 'deflate_hex'],
        num_rows: 4096
    })
    test: Dataset({
        features: ['text', 'text_hex', 'deflate_hex'],
        num_rows: 4096
    })
})

# Tokenizzare in caratteri singoli o in sequenze di caratteri?

In [9]:
ds_splits['train'][0]

{'text': 'I wonder what audiences',
 'text_hex': '4920776f6e64657220776861742061756469656e636573',
 'deflate_hex': '789cf35428cfcf4b492d5228cf482c51482c4dc94ccd4b4e2d060063a8089e'}

## Data tokenization

In [10]:
token2id = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, "a": 10, "b": 11, "c": 12, "d": 13, "e": 14, "f": 15, "P":16}

In [11]:
def create_id2token_vocab(token_to_id):
    id2token = {}
    for token, id in token_to_id.items():
        id2token[id] = token

    return id2token

id2token = create_id2token_vocab(token2id)
id2token

{0: '0',
 1: '1',
 2: '2',
 3: '3',
 4: '4',
 5: '5',
 6: '6',
 7: '7',
 8: '8',
 9: '9',
 10: 'a',
 11: 'b',
 12: 'c',
 13: 'd',
 14: 'e',
 15: 'f',
 16: 'P'}

In [12]:
# Corrected collate_fn function
def collate_fn(batch):

    texts = [elem['text_hex'] for elem in batch]
    encoded_hexs = [[token2id[char] for char in text] for text in texts]

    outputs = [elem['deflate_hex'] for elem in batch]
    encoded_outputs = [[token2id[char] for char in output] for output in outputs]

    # # Pad the sequences to 256 chars with the padding token
    padded_hex = [torch.Tensor(encoded_hex + [token2id["P"]] * (256 - len(encoded_hex))) for encoded_hex in encoded_hexs]
    padded_outputs = [torch.Tensor(encoded_output + [token2id["P"]] * (256 - len(encoded_output))) for encoded_output in encoded_outputs]

    # # Stack the sequences
    padded_hex = torch.stack(padded_hex).long()
    padded_outputs = torch.stack(padded_outputs).long()


    return {
        'inputs': padded_hex,
        'outputs': padded_outputs
    }

MAX_SEQ_LEN = 256

In [13]:
train_dataloader = DataLoader(ds_splits['train'], batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(ds_splits['valid'], batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

torch.set_printoptions(profile="full")

for batch in train_dataloader:
    print(batch['inputs'].shape)
    break

torch.Size([128, 256])


In [14]:
import nltk
from nltk.metrics.distance import edit_distance

def decode_output(output):
    return ''.join([id2token[int(id)] for id in output])

def decode_input(input):
    return ''.join([id2token[int(id)] for id in input])

def evaluate(_device, _print):
    model.eval()
    total_distance = 0
    total = 0

    for batch in val_dataloader:
        x = batch["inputs"].to(_device)
        y = batch["outputs"].to(_device)

        y_hat = model(x)
        y_hat = torch.argmax(y_hat, dim=-1)

        output = decode_output(y[0])
        output_hat = decode_output(y_hat[0])

        output = [x for x in output if x != "P"]
        output_hat = [x for x in output_hat if x != "P"]
        
        distance = edit_distance(output, output_hat)
        
        if _print:
            print(f"output = {output}")
            print(f"output_hat = {output_hat}")

        return distance

In [15]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR

INPUT_DIM = 256
EMBED_DIM = 128
HIDDEN_DIM = 512
OUTPUT_DIM = len(token2id)
LEARNING_RATE = 1e-3
DROPOUT_RATE = 0.5

class FeedForward(pl.LightningModule):
    def __init__(self, input_dim=INPUT_DIM, embed_dim = EMBED_DIM, hidden_dim=HIDDEN_DIM, output_dim=OUTPUT_DIM, learning_rate=LEARNING_RATE,
                 dropout_rate=DROPOUT_RATE, optimizer_type=AdamW, scheduler_type=StepLR,
                 scheduler_step_size=5, scheduler_gamma=0.1):
        super().__init__()
        self.save_hyperparameters()
        self.embed = nn.Embedding(input_dim, embed_dim)
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.norm1 = nn.LayerNorm(hidden_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim//2)
        self.norm2 = nn.LayerNorm(hidden_dim//2)
        self.fc3 = nn.Linear(hidden_dim//2, output_dim)
        self.loss = nn.CrossEntropyLoss()

    def forward(self, x):
        x = self.embed(x)
        x = torch.relu(self.norm1(self.fc1(x)))
        x = self.dropout(x)
        x = torch.relu(self.norm2(self.fc2(x)))
        x = self.fc3(x)
        return x

    def configure_optimizers(self):
        optimizer = self.hparams.optimizer_type(self.parameters(), lr=self.hparams.learning_rate)
        scheduler = self.hparams.scheduler_type(optimizer, step_size=self.hparams.scheduler_step_size, gamma=self.hparams.scheduler_gamma)
        return [optimizer], [scheduler]

    def step(self, batch, batch_idx):
        x = batch["inputs"]
        y = batch["outputs"]
        y = y.view(y.shape[0] * y.shape[1])
        y_hat = self(x)
        y_hat = y_hat.view(y_hat.shape[0] * y_hat.shape[1], y_hat.shape[2])
        loss = self.loss(y_hat, y)
        return loss

    def training_step(self, batch, batch_idx):  
        loss = self.step(batch, batch_idx)
        self.log('train_loss', loss, prog_bar = True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        loss = self.step(batch, batch_idx)
        self.log('val_loss', loss, prog_bar = True)
        self.log("edit_distance", evaluate(_device = device, _print = False), prog_bar = True)
        return loss

# Assuming device, train_dataloader, and val_dataloader are defined
model = FeedForward().to(device)

# Initialize a trainer
trainer = pl.Trainer(max_epochs=50)

# Train the model ⚡
trainer.fit(model, train_dataloader, val_dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: g:\Uni\Esperimenti\Zip-generation\Notebooks\Hex2Hex\Baselines\lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 850   
1 | conv1     | Conv1d           | 19.3 K
2 | conv2     | Conv1d           | 24.6 K
3 | pool      | MaxPool1d        | 0     
4 | fc1       | Linear           | 524 K 
5 | fc2       | Linear           | 2.2 K 
6 | loss      | CrossEntropyLoss | 0     
-----------------------------------------------
571 K     Trainable params
0         Non-trainable params
571 K     Total params
2.286     Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

C:\Users\tomma\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:492: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
C:\Users\tomma\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


IndexError: tuple index out of range

In [None]:
print(evaluate(_device = device, _print = True))

output = ['7', '8', '9', 'c', '7', '3', '4', 'c', '2', 'b', '4', '9', '2', 'd', '5', '2', '0', '8', '4', 'a', '4', 'c', 'c', 'e', '2', 'c', 'c', 'e', 'd', '5', '5', '1', '0', '8', '2', 'a', '2', 'd', '4', 'a', 'c', 'c', '5', '1', '4', '8', 'a', 'd', 'c', '8', '4', 'f', '2', '9', '2', 'd', '0', '6', '0', '0', '7', 'c', '2', '5', '0', '9', '7', 'c']
output_hat = ['7', '8', '2', '2', 'c', '7', '2', '8', 'c', 'c', 'c', 'c', '8', 'c', '2', '8', '2', '9', '2', '8', 'c', '9', '2', '8', 'c', '8', 'c', 'c', '8', 'c', 'c', '8', 'c', 'c', '2', '8', '2', '8', 'c', 'c', '2', '8', 'c', 'c', '2', '8', '2', '7', 'c', '8', 'c', '9']
48


# Model

# Training