# Imports and installation


In [6]:
%%capture
!pip install lightning datasets

In [7]:
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader
import torch.nn as nn
import lightning as L
import random

# Global configuration: the random seed and the DataLoader batch size.
SEED = 999
BATCH_SIZE = 32
# Seed torch directly and through Lightning's seed_everything helper.
torch.manual_seed(SEED)
L.seed_everything(SEED)

INFO: Seed set to 999
INFO:lightning.fabric.utilities.seed:Seed set to 999


999

In [8]:
# Select the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

# Data Preparation

In [9]:
# Load the raw dataset from the CSV file in the working directory.
df = pd.read_csv('shorthex2hex.csv')

In [10]:
# Preview the first rows to check the available columns.
df.head()

Unnamed: 0,text,text_hex,deflate_hex
0,One of the other,4f6e65206f6620746865206f74686572,789cf3cf4b55c84f5328c9005240a208002eb405bb
1,A wonderful little production.,4120776f6e64657266756c206c6974746c652070726f64...,789c735428cfcf4b492d4a2bcd51c8c92c29c949552828...
2,I thought this was,492074686f75676874207468697320776173,789cf35428c9c82f4dcf2801d299c50ae589c5003dea06b0
3,Basically there's a family,4261736963616c6c79207468657265277320612066616d...,789c734a2cce4c4eccc9a95428c9482d4a552f56485448...
4,"Petter Mattei's ""Love in",506574746572204d6174746569277320224c6f766520696e,789c0b482d29492d52f04d045299eac50a4a3ef965a90a...


In [11]:
# Keep only the first 10,000 rows. The explicit copy detaches the subset from the
# full frame, so assigning columns to it later cannot raise SettingWithCopyWarning.
df = df[:10000].copy()

Instead of the standard \<SOS> and \<EOS> tags, we use the uppercase letters S and E as start-of-sequence and end-of-sequence markers, since neither of them appears in the lowercase hexadecimal alphabet (0-9, a-f) of the data.

In [12]:
# Wrap every sequence in a start ('S') and an end ('E') marker. Markers that are
# already present are stripped first, so re-running this cell does not stack them
# ('SS...EE'). This is safe because the hex payload is lowercase and therefore
# never contains an uppercase 'S' or 'E'.
for column in ('text_hex', 'deflate_hex'):
    df[column] = 'S' + df[column].str.strip('SE') + 'E'

In [13]:
# Convert the DataFrame to a Dataset and split it 80/10/10 into train/valid/test.
ds = Dataset.from_pandas(df)
# First hold out 20% of the rows ...
ds_train_test = ds.train_test_split(test_size=0.2, seed=SEED)
# ... then halve the held-out part into the validation and the test set.
ds_test_dev = ds_train_test['test'].train_test_split(test_size=0.5, seed=SEED)
ds_splits = DatasetDict({
    'train': ds_train_test['train'],
    'valid': ds_test_dev['train'],
    'test': ds_test_dev['test']
})

ds_splits

DatasetDict({
    train: Dataset({
        features: ['text', 'text_hex', 'deflate_hex'],
        num_rows: 8000
    })
    valid: Dataset({
        features: ['text', 'text_hex', 'deflate_hex'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['text', 'text_hex', 'deflate_hex'],
        num_rows: 1000
    })
})

# Tokenize into single characters or into sequences of characters?

In [14]:
# Inspect one training example, including the added S/E markers.
ds_splits['train'][0]

{'text': 'I really enjoyed this',
 'text_hex': 'S49207265616c6c7920656e6a6f7965642074686973E',
 'deflate_hex': 'S789cf354284a4dccc9a95448cdcbcaaf4c4d5128c9c82c0600533607d9E'}

## Data tokenization

In [15]:
# Vocabulary: the 16 lowercase hex digits take ids 0-15, followed by the special
# tokens P (padding), S (start of sequence) and E (end of sequence).
token2id = {digit: index for index, digit in enumerate("0123456789abcdef")}
token2id.update({"P": 16, "S": 17, "E": 18})

In [16]:
def create_id2token_vocab(token_to_id):
    """Return the inverse vocabulary, mapping every id back to its token."""
    return {token_id: token for token, token_id in token_to_id.items()}

# Build the id -> token lookup from the vocabulary and display it.
id2token = create_id2token_vocab(token2id)
id2token

{0: '0',
 1: '1',
 2: '2',
 3: '3',
 4: '4',
 5: '5',
 6: '6',
 7: '7',
 8: '8',
 9: '9',
 10: 'a',
 11: 'b',
 12: 'c',
 13: 'd',
 14: 'e',
 15: 'f',
 16: 'P',
 17: 'S',
 18: 'E'}

In [17]:
def collate_fn(batch):
  """Turn a list of dataset rows into two padded tensors of token ids.

  Args:
    batch: list of rows, each providing the strings 'text_hex' and
      'deflate_hex'. Every character must be a key of token2id.

  Returns:
    Dict with the keys 'inputs' and 'outputs'. Both values are int64 tensors
    of shape (batch_size, maxlen), where maxlen is the longest sequence found
    among the inputs and outputs of this batch. Shorter sequences are
    right-padded with the id of 'P'.

  Raises:
    KeyError: if a sequence contains a character missing from token2id.
  """

  def encode(column):
    # Map every character of every string in the column to its vocabulary id.
    return [[token2id[char] for char in elem[column]] for elem in batch]

  def pad_sequences(sequences, maxlen, value=token2id['P']):
    # Right-pad each sequence with `value` up to `maxlen` ids.
    return [seq + [value] * (maxlen - len(seq)) for seq in sequences]

  encoded_hex = encode('text_hex')
  encoded_outputs = encode('deflate_hex')

  # Inputs and outputs share one length so both tensors get the same shape.
  maxlen = max((len(seq) for seq in encoded_hex + encoded_outputs), default=0)

  return {
      'inputs': torch.tensor(pad_sequences(encoded_hex, maxlen), dtype=torch.long),
      "outputs": torch.tensor(pad_sequences(encoded_outputs, maxlen), dtype=torch.long)
  }

# Model

In [18]:
class Encoder(nn.Module):
  """LSTM encoder that reduces a source sequence to its final LSTM states."""

  def __init__(self, vocab_len, embedding_dim, hidden_dim, num_layers, bidirectional, dropout):
    super().__init__()

    # Keep the configuration on the module for later inspection.
    self.vocab_len, self.embedding_dim, self.hidden_dim = vocab_len, embedding_dim, hidden_dim

    pad_id = token2id['P']
    self.embedding = nn.Embedding(vocab_len, embedding_dim, padding_idx=pad_id)
    self.lstm = nn.LSTM(
        embedding_dim,
        hidden_dim,
        num_layers=num_layers,
        bidirectional=bidirectional,
        dropout=dropout,
    )
    self.dropout = nn.Dropout(dropout)

  def forward(self, batch):
    """Encode token ids laid out as [sentence_length, batch_size].

    Returns only the final (hidden, cell) state pair; the per-step LSTM
    outputs are dropped because the decoder is initialised from the states.
    """
    embedded = self.dropout(self.embedding(batch))
    _, (h, c) = self.lstm(embedded)
    return (h, c)


In [19]:
class Decoder(nn.Module):
  """LSTM decoder that predicts one token per call.

  Args:
    vocab_len: number of rows of the embedding table.
    embedding_dim: size of each token embedding.
    hidden_dim: LSTM hidden size per direction.
    output_dim: number of scores produced for each step.
    num_layers: number of stacked LSTM layers.
    bidirectional: whether the LSTM runs in both directions.
    dropout: probability used for the embedding dropout and between LSTM layers.
  """

  def __init__(self, vocab_len, embedding_dim, hidden_dim, output_dim, num_layers, bidirectional, dropout):
    super().__init__()
    self.embedding_dim = embedding_dim
    self.hidden_dim = hidden_dim
    self.output_dim = output_dim

    self.embedding = nn.Embedding(vocab_len, embedding_dim, padding_idx=token2id['P'])
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=bidirectional, dropout=dropout)
    self.dropout = nn.Dropout(dropout)
    # A bidirectional LSTM concatenates both directions, so its output carries
    # 2 * hidden_dim features; size the projection accordingly.
    num_directions = 2 if bidirectional else 1
    self.linear = nn.Linear(hidden_dim * num_directions, output_dim)

  def forward(self, x, h, c):
    """Run a single decoding step.

    Args:
      x: token ids of shape (N), one previous token per batch element.
      h: LSTM hidden state to continue from.
      c: LSTM cell state to continue from.

    Returns:
      Tuple (logits, h, c) with logits of shape (N, output_dim) and the
      updated LSTM states.
    """
    # x shape: (N) where N is the batch size; the LSTM wants (1, N), i.e. a
    # sequence of length 1, because one token is decoded per call.
    x = x.unsqueeze(0)

    embedding = self.dropout(self.embedding(x))
    out, (h, c) = self.lstm(embedding, (h, c))
    logits = self.linear(out)
    # Drop the length-1 sequence axis again.
    logits = logits.squeeze(0)
    return logits, h, c

# Training

In [None]:
import pytorch_lightning as pl
# Training hyperparameters.
EPOCHS = 20  # passed to the Trainer as max_epochs
LR = 1e-3  # learning rate of the Adam optimizer
# Architecture settings shared by the encoder and the decoder.
EMBEDDING_DIM = 256
HIDDEN_DIM = 1024
NUM_LAYERS = 4
DROPOUT = 0.3
BIDIRECTIONAL = False

# Combined EncoderDecoder model
class EncoderDecoder(pl.LightningModule):
    """Seq2seq LightningModule that chains the Encoder and the Decoder.

    `forward` works on time-major tensors of shape [seq_len, batch_size],
    which is the layout the LSTMs (created without batch_first) consume.
    The batches produced by collate_fn are batch-major ([batch_size, seq_len]),
    so the step methods transpose them before calling `forward`.
    """

    def __init__(self):
        super().__init__()
        self.encoder = Encoder(len(token2id), EMBEDDING_DIM, HIDDEN_DIM, NUM_LAYERS, BIDIRECTIONAL, DROPOUT)
        self.decoder = Decoder(len(token2id), EMBEDDING_DIM, HIDDEN_DIM, len(token2id), NUM_LAYERS, BIDIRECTIONAL, DROPOUT)
        # Padding positions do not contribute to the loss.
        self.criterion = nn.CrossEntropyLoss(ignore_index=token2id['P'])

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        """Decode `target` step by step, conditioned on the encoded `source`.

        Args:
            source: token ids of shape [source_len, batch_size].
            target: token ids of shape [target_len, batch_size]; row 0 holds
                the start tokens fed to the first decoding step.
            teacher_forcing_ratio: probability, drawn once per step for the
                whole batch, of feeding the ground-truth token instead of the
                model's own prediction into the next step.

        Returns:
            Tensor of shape [target_len, batch_size, vocab_size] with the
            scores of every step. Row 0 is never written and stays zero.
        """
        target_len = target.shape[0]
        batch_size = source.shape[1]
        target_vocab_size = len(token2id)

        # Allocate next to the inputs instead of relying on a global device.
        outputs = torch.zeros(target_len, batch_size, target_vocab_size, device=source.device)
        h, c = self.encoder(source)
        x = target[0]

        for t in range(1, target_len):
            output, h, c = self.decoder(x, h, c)
            outputs[t] = output
            pred = output.argmax(1)
            x = target[t] if random.random() < teacher_forcing_ratio else pred

        return outputs

    def _shared_step(self, batch, teacher_forcing_ratio):
        """Compute the cross-entropy loss of one collated batch."""
        # collate_fn yields [batch_size, seq_len]; forward needs [seq_len, batch_size].
        source = batch['inputs'].transpose(0, 1).contiguous()
        target = batch['outputs'].transpose(0, 1).contiguous()

        logits = self(source, target, teacher_forcing_ratio)
        # Step 0 is the start token and has no prediction, so it is skipped.
        logits = logits[1:].reshape(-1, logits.shape[-1])
        return self.criterion(logits, target[1:].reshape(-1))

    def training_step(self, batch, batch_idx):
        """Return the training loss, computed with the default teacher forcing."""
        loss = self._shared_step(batch, teacher_forcing_ratio=0.5)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log 'val_loss' with teacher forcing disabled, as at inference time."""
        loss = self._shared_step(batch, teacher_forcing_ratio=0.0)
        self.log('val_loss', loss)

    def test_step(self, batch, batch_idx):
        """Log 'test_loss' with teacher forcing disabled, as at inference time."""
        loss = self._shared_step(batch, teacher_forcing_ratio=0.0)
        self.log('test_loss', loss)

    def configure_optimizers(self):
        """Optimise all parameters with Adam at learning rate LR."""
        return torch.optim.Adam(self.parameters(), lr=LR)


class HexDataModule(pl.LightningDataModule):
    """Serves the train, validation and test datasets as batched DataLoaders."""

    def __init__(self, train_dataset, val_dataset, test_dataset):
        super().__init__()
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.test_dataset = test_dataset

    @staticmethod
    def _make_loader(dataset, **extra):
        # All loaders share the batch size and the padding collate function.
        return DataLoader(dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn, **extra)

    def train_dataloader(self):
        # Only the training data is shuffled.
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        return self._make_loader(self.val_dataset)

    def test_dataloader(self):
        return self._make_loader(self.test_dataset)

# Initialize the data module and model
data_module = HexDataModule(ds_splits['train'], ds_splits['valid'], ds_splits['test'])
model = EncoderDecoder()

# Train the model
# Training stops after at most EPOCHS epochs; every other Trainer option keeps its default.
trainer = pl.Trainer(max_epochs=EPOCHS)
trainer.fit(model, datamodule=data_module)


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params
-----------------------------------------------
0 | encoder   | Encoder          | 30.4 M
1 | decoder   | Decoder          | 30.5 M
2 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
60.9 M    Trainable params
0         Non-trainable params
60.9 M    Total params
243.649   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [None]:
# Greedy decoding of the first test example with the trained model.
MAX_DECODE_STEPS = 60  # stop after this many tokens if 'E' is never predicted

# Create the test dataloader
test_dataloader = data_module.test_dataloader()

# Switch dropout off for inference; torch.no_grad() alone does not do that.
model.eval()
# Run on whichever device currently holds the weights instead of assuming the CPU.
model_device = next(model.parameters()).device

# Now proceed with the test
with torch.no_grad():
    # Get a batch from the test dataloader
    batch = next(iter(test_dataloader))
    # The batch is [batch_size, seq_len]; take sample 0 and add a batch axis of
    # size 1 to obtain the [seq_len, 1] layout the encoder expects.
    source = batch['inputs'][0].unsqueeze(1).to(model_device)
    h, c = model.encoder(source)

    outputs = [token2id["S"]]
    for step in range(MAX_DECODE_STEPS):
        prev = torch.tensor([outputs[-1]], dtype=torch.long, device=model_device)
        out, h, c = model.decoder(prev, h, c)  # Handle both hidden and cell states
        pred = out.argmax(1).item()

        outputs.append(pred)
        if pred == token2id['E']:
            break

    print(f"Outputs = {outputs}")
    # Also show the prediction as characters for easier reading.
    print("Decoded = " + "".join(id2token[token_id] for token_id in outputs))