# Imports and installation


In [2]:
%%capture
!pip install lightning datasets

In [3]:
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader
import torch.nn as nn
import lightning as L
import random

SEED = 999
BATCH_SIZE = 32
torch.manual_seed(SEED)
L.seed_everything(SEED)

INFO: Seed set to 999


999

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

# Data Preparation

In [5]:
df = pd.read_csv('/kaggle/input/shortnew/shorthex2hex.csv')

In [6]:
df.head()

Unnamed: 0,text,text_hex,deflate_hex
0,One of the other,4f6e65206f6620746865206f74686572,789cf3cf4b55c84f5328c9005240a208002eb405bb
1,A wonderful little production.,4120776f6e64657266756c206c6974746c652070726f64...,789c735428cfcf4b492d4a2bcd51c8c92c29c949552828...
2,I thought this was,492074686f75676874207468697320776173,789cf35428c9c82f4dcf2801d299c50ae589c5003dea06b0
3,Basically there's a family,4261736963616c6c79207468657265277320612066616d...,789c734a2cce4c4eccc9a95428c9482d4a552f56485448...
4,"Petter Mattei's ""Love in",506574746572204d6174746569277320224c6f766520696e,789c0b482d29492d52f04d045299eac50a4a3ef965a90a...


In [7]:
df = df[:10000]

Instead of using the standard \<EOS> and \<SOS> tags we're using the letter S and E since they are not present in the vocabulary

In [8]:
df['text_hex'] = 'S' + df['text_hex'] + 'E'
df['deflate_hex'] = 'S' + df['deflate_hex'] + 'E'

In [9]:
ds = Dataset.from_pandas(df)
ds_train_test = ds.train_test_split(test_size=0.2, seed=SEED)
ds_test_dev = ds_train_test['test'].train_test_split(test_size=0.5, seed=SEED)
ds_splits = DatasetDict({
    'train': ds_train_test['train'],
    'valid': ds_test_dev['train'],
    'test': ds_test_dev['test']
})

ds_splits

DatasetDict({
    train: Dataset({
        features: ['text', 'text_hex', 'deflate_hex'],
        num_rows: 8000
    })
    valid: Dataset({
        features: ['text', 'text_hex', 'deflate_hex'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['text', 'text_hex', 'deflate_hex'],
        num_rows: 1000
    })
})

# Tokenizzare in caratteri singoli o in sequenze di caratteri?

In [10]:
ds_splits['train'][0]

{'text': 'I really enjoyed this',
 'text_hex': 'S49207265616c6c7920656e6a6f7965642074686973E',
 'deflate_hex': 'S789cf354284a4dccc9a95448cdcbcaaf4c4d5128c9c82c0600533607d9E'}

## Data tokenization

In [11]:
token2id = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, "a": 10, "b": 11, "c": 12, "d": 13, "e": 14, "f": 15, "P":16, "S": 17, "E":18 }

In [12]:
def create_id2token_vocab(token_to_id):
    id2token = {}
    for token, id in token_to_id.items():
        id2token[id] = token

    return id2token

id2token = create_id2token_vocab(token2id)
id2token

{0: '0',
 1: '1',
 2: '2',
 3: '3',
 4: '4',
 5: '5',
 6: '6',
 7: '7',
 8: '8',
 9: '9',
 10: 'a',
 11: 'b',
 12: 'c',
 13: 'd',
 14: 'e',
 15: 'f',
 16: 'P',
 17: 'S',
 18: 'E'}

In [13]:
def collate_fn(batch):

  def pad_sequences(sequences, maxlen, value=token2id['P']):
    padded_sequences = []
    for sequence in sequences:
        padded_sequence = sequence[:maxlen]
        padded_sequence.extend([value] * (maxlen - len(padded_sequence)))

        padded_sequence = sequence +  [value] * (maxlen - len(sequence))
        padded_sequences.append(padded_sequence)

    return padded_sequences


  texts = [elem['text_hex'] for elem in batch]
  encoded_hex = [[token2id[x] for x in hex] for hex in texts]


  outputs = [elem['deflate_hex'] for elem in batch]
  encoded_outputs = [[token2id[x] for x in hex] for hex in outputs]


  maxlen = 0
  for seq in encoded_hex:
    if len(seq) > maxlen:
      maxlen = len(seq)
  for seq in encoded_outputs:
    if len(seq) > maxlen:
      maxlen = len(seq)

  padded_encoded_hex = pad_sequences(encoded_hex, maxlen)
  padded_encoded_outputs = pad_sequences(encoded_outputs, maxlen)


  return {
      'inputs': torch.tensor(padded_encoded_hex),
      "outputs": torch.tensor(padded_encoded_outputs)
  }


# Model

# Training

In [34]:
import nltk
from nltk.metrics.distance import edit_distance

def decode_output(output):
    return ''.join([id2token[int(id)] for id in output])

def decode_input(input):
    return ''.join([id2token[int(id)] for id in input])

def evaluate(_device, _print, _cycle):
    model.eval()
    total_distance = 0
    total = 0

    for batch in test_dataloader:
        x = batch["inputs"].transpose(0,1).to(_device)
        y = batch["outputs"].transpose(0,1).to(_device)

        y_hat = model(x, y)
        y_hat = torch.argmax(y_hat, dim=-1)

        output = decode_output(y.transpose(0,1)[0])
        output_hat = decode_output(y_hat.transpose(0,1)[0])

        output = [x for x in output if x != "P"]
        output_hat = [x for x in output_hat if x != "P"]
        
        distance = edit_distance(output, output_hat)
        
        if _print:
            print(f"output = {output}")
            print(f"output_hat = {output_hat}")

        total_distance += distance
        total += 1

        if distance == 0:
            print(f"DISTANCE = 0!")
            print(f"output = {output}")
            print(f"output_hat = {output_hat}")

        if _cycle == False:
            return distance

    return total_distance/total

In [35]:
import pytorch_lightning as pl
EPOCHS = 20
LR = 1e-3
EMBEDDING_DIM = 256
HIDDEN_DIM = 1024
NUM_LAYERS = 4
DROPOUT = 0.3
BIDIRECTIONAL = False

class Seq2Seq(pl.LightningModule):
    def __init__(self, vocab_len, embedding_dim, hidden_dim, output_dim, num_layers, bidirectional, dropout):
        super(Seq2Seq, self).__init__()
        # Encoder
        self.embedding = nn.Embedding(vocab_len, embedding_dim, padding_idx=token2id['P'])
        self.encoder_lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=bidirectional, dropout=dropout)
        # Decoder
        self.decoder_lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=bidirectional, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.output_dim = output_dim
        self.linear = nn.Linear(hidden_dim, output_dim)
        # Loss
        self.criterion = nn.CrossEntropyLoss()
        
        self.apply(init_weights)  # Apply the weight initialization

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        target_len = target.shape[0]
        batch_size = target.shape[1]
        target_vocab_size = self.output_dim

        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(self.device)

        x = self.dropout(self.embedding(source))
        _, (h, c) = self.encoder_lstm(x)

        x = target[0]
        for t in range(1, target_len):
            x = self.dropout(self.embedding(x.unsqueeze(0)))
            out, (h, c) = self.decoder_lstm(x, (h, c))
            output = self.linear(out.squeeze(0))
            outputs[t] = output
            pred = output.argmax(1)
            x = target[t] if random.random() < teacher_forcing_ratio else pred

        return outputs

    def step(self, batch):
        inputs, targets = batch['inputs'], batch['outputs']
        inputs = inputs.transpose(0, 1)
        targets = targets.transpose(0, 1)
        output = self(inputs, targets)
        output_dim = output.shape[-1]

        output = output.reshape(-1, output_dim)
        targets = targets.reshape(-1)
        
        return (output, targets)
    
    def training_step(self, batch):
        loss = self.criterion(*self.step(batch))
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch):
        loss = self.criterion(*self.step(batch))
        self.log('val_loss', loss, prog_bar=True)
        self.log("edit_distance", evaluate(_device = self.device, _print = False, _cycle = False), prog_bar = True)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=LR)

# Assuming token2id, ds_splits, and collate_fn are defined
model = Seq2Seq(len(token2id), EMBEDDING_DIM, HIDDEN_DIM, len(token2id), NUM_LAYERS, BIDIRECTIONAL, DROPOUT)

train_dataloader = DataLoader(ds_splits['train'], batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn, num_workers = 3)
val_dataloader = DataLoader(ds_splits['valid'], batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn, num_workers = 3)
test_dataloader = DataLoader(ds_splits['test'], batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn, num_workers = 3)

# Train the model
trainer = pl.Trainer(max_epochs=EPOCHS)
trainer.fit(model, train_dataloader, val_dataloader)


INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

In [33]:
print(evaluate(_device = device, _print = True, _cycle = True))

output = ['S', '7', '8', '9', 'c', '0', 'b', 'c', '9', 'c', '8', '2', 'c', '5', '6', 'c', '8', 'c', 'd', '2', 'f', 'c', 'b', '4', 'c', '5', '5', '0', '0', '3', '2', '3', '2', 'f', '2', '8', 'b', '8', 'a', '3', '2', '9', '3', '7', '2', '5', '2', 'f', '5', '0', '0', '6', '6', '1', 'a', '0', '8', '7', 'a', 'E']
output_hat = ['0', '7', '8', '9', 'c', 'f', '3', 'c', 'c', '2', '8', '2', '8', '2', 'c', '2', 'c', '2', 'c', '2', '8', '2', '8', '2', '8', '2', '8', '2', '8', '2', '8', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']
output = ['S', '7', '8', '9', 'c', '0', 'b', '4', 'f', 'c', 'd', 'c', '9', 'd', '1', 'c', '9', '5', '4', 'c', 'f', '5', '5', 'c', '8', 'c', 'b', '2', 'f', '5', '1', '4', '8', '5', '4', 'c', '8', 'c', 'd', '2', 'f', 'c', 'b', '4', 'c', '0', '5', '0', '0', '4', '7', '3', '6', '0', '6', 'f', '0', 'E']
output_hat = ['0', '7', '8', '9', 'c', 'f', '3', 'c', 'c', '2', '8', '2', '8', '2