# Imports and installation


In [1]:
%%capture
!pip install lightning datasets

In [2]:
import pandas as pd
import torch
from datasets import load_dataset, Dataset, DatasetDict
from torch.utils.data import DataLoader
from typing import Dict, List, Tuple
from dataclasses import dataclass
from pprint import pprint
import torch.nn as nn
import torch.optim as optim
import numpy as np
import lightning as L
import random

SEED = 999
BATCH_SIZE = 32
torch.manual_seed(SEED)
L.seed_everything(SEED)

INFO: Seed set to 999
INFO:lightning.fabric.utilities.seed:Seed set to 999


999

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Preparation

In [5]:
df = pd.read_csv('/content/drive/MyDrive/kaggle/shorthex.csv')

In [6]:
df.head()

Unnamed: 0,text,text_hex,deflate_hex
0,One of the other,4f6e65206f6620746865206f74686572,789cf3cf4b55c84f5328c9005240a208002eb405bb
1,A wonderful little production.,4120776f6e64657266756c206c6974746c652070726f64...,789c735428cfcf4b492d4a2bcd51c8c92c29c949552828...
2,I thought this was,492074686f75676874207468697320776173,789cf35428c9c82f4dcf2801d299c50ae589c5003dea06b0
3,Basically there's a family,4261736963616c6c79207468657265277320612066616d...,789c734a2cce4c4eccc9a95428c9482d4a552f56485448...
4,"Petter Mattei's ""Love in",506574746572204d6174746569277320224c6f766520696e,789c0b482d29492d52f04d045299eac50a4a3ef965a90a...


In [7]:
df = df[:10000]

Instead of using the standard \<EOS> and \<SOS> tags we're using the letter S and E since they are not present in the vocabulary

In [8]:
df['text_hex'] = 'S' + df['text_hex'] + 'E'
df['deflate_hex'] = 'S' + df['deflate_hex'] + 'E'

In [9]:
ds = Dataset.from_pandas(df)
ds_train_test = ds.train_test_split(test_size=0.2, seed=SEED)
ds_test_dev = ds_train_test['test'].train_test_split(test_size=0.5, seed=SEED)
ds_splits = DatasetDict({
    'train': ds_train_test['train'],
    'valid': ds_test_dev['train'],
    'test': ds_test_dev['test']
})

ds_splits

DatasetDict({
    train: Dataset({
        features: ['text', 'text_hex', 'deflate_hex'],
        num_rows: 8000
    })
    valid: Dataset({
        features: ['text', 'text_hex', 'deflate_hex'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['text', 'text_hex', 'deflate_hex'],
        num_rows: 1000
    })
})

# Tokenizzare in caratteri singoli o in sequenze di caratteri?

In [10]:
ds_splits['train'][0]

{'text': 'I really enjoyed this',
 'text_hex': 'S49207265616c6c7920656e6a6f7965642074686973E',
 'deflate_hex': 'S789cf354284a4dccc9a95448cdcbcaaf4c4d5128c9c82c0600533607d9E'}

## Data tokenization

In [11]:
token2id = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, "a": 10, "b": 11, "c": 12, "d": 13, "e": 14, "f": 15, "P":16, "S": 17, "E":18 }

In [12]:
def create_id2token_vocab(token_to_id):
    id2token = {}
    for token, id in token_to_id.items():
        id2token[id] = token

    return id2token

id2token = create_id2token_vocab(token2id)
id2token

{0: '0',
 1: '1',
 2: '2',
 3: '3',
 4: '4',
 5: '5',
 6: '6',
 7: '7',
 8: '8',
 9: '9',
 10: 'a',
 11: 'b',
 12: 'c',
 13: 'd',
 14: 'e',
 15: 'f',
 16: 'P',
 17: 'S',
 18: 'E'}

In [13]:
def collate_fn(batch):

  def pad_sequences(sequences, maxlen, value=token2id['P']):
    padded_sequences = []
    for sequence in sequences:
        padded_sequence = sequence[:maxlen]
        padded_sequence.extend([value] * (maxlen - len(padded_sequence)))

        padded_sequence = sequence +  [value] * (maxlen - len(sequence))
        padded_sequences.append(padded_sequence)

    return padded_sequences


  texts = [elem['text_hex'] for elem in batch]
  encoded_hex = [[token2id[x] for x in hex] for hex in texts]


  outputs = [elem['deflate_hex'] for elem in batch]
  encoded_outputs = [[token2id[x] for x in hex] for hex in outputs]


  maxlen = 0
  for seq in encoded_hex:
    if len(seq) > maxlen:
      maxlen = len(seq)
  for seq in encoded_outputs:
    if len(seq) > maxlen:
      maxlen = len(seq)

  padded_encoded_hex = pad_sequences(encoded_hex, maxlen)
  padded_encoded_outputs = pad_sequences(encoded_outputs, maxlen)


  return {
      'inputs': torch.tensor(padded_encoded_hex),
      "outputs": torch.tensor(padded_encoded_outputs)
  }


BATCH_SIZE = 32
train_dataloader = DataLoader(ds_splits['train'], batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=True, num_workers=2)
validation_dataloader = DataLoader(ds_splits['valid'], batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=False, num_workers=2)
test_dataloader = DataLoader(ds_splits['test'], batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=False, num_workers=2)


In [14]:
next(iter(train_dataloader))

{'inputs': tensor([[17,  5,  9,  ..., 16, 16, 16],
         [17,  5,  7,  ..., 16, 16, 16],
         [17,  4, 15,  ..., 16, 16, 16],
         ...,
         [17,  4,  2,  ..., 16, 16, 16],
         [17,  4,  5,  ..., 16, 16, 16],
         [17,  5,  4,  ..., 16, 16, 16]]),
 'outputs': tensor([[17,  7,  8,  ..., 16, 16, 16],
         [17,  7,  8,  ..., 16, 16, 16],
         [17,  7,  8,  ..., 16, 16, 16],
         ...,
         [17,  7,  8,  ..., 16, 16, 16],
         [17,  7,  8,  ..., 16, 16, 16],
         [17,  7,  8,  ..., 16, 16, 16]])}

# Model

In [15]:
class Encoder(nn.Module):
  def __init__(self, vocab_len, embedding_dim, hidden_dim, num_layers, bidirectional, dropout):
    super().__init__()

    self.vocab_len = vocab_len
    self.embedding_dim = embedding_dim
    self.hidden_dim = hidden_dim

    self.embedding = nn.Embedding(vocab_len, embedding_dim, padding_idx=token2id['P'])
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=bidirectional, dropout=dropout)
    self.dropout = nn.Dropout(dropout)

  def forward(self, batch):
    # batch has dimension [sentence_length, batch_size]
    x = self.embedding(batch)
    x = self.dropout(x)
    logits, (h, c) = self.lstm(x)
    # we don't need to return the outputs as we just need to pass the
    # context vector to the decoder block
    return (h, c)


In [16]:
class Decoder(nn.Module):
  def __init__(self, vocab_len, embedding_dim, hidden_dim, output_dim, num_layers, bidirectional, dropout):
    super().__init__()
    self.embedding_dim = embedding_dim
    self.hidden_dim = hidden_dim
    self.output_dim = output_dim

    self.embedding = nn.Embedding(vocab_len, embedding_dim, padding_idx=token2id['P'])
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=bidirectional, dropout=dropout)
    self.dropout = nn.Dropout(dropout)
    self.linear = nn.Linear(hidden_dim, output_dim)

  def forward(self, x, h, c):
    # x shape: (N) where N is for batch size, we want it to be (1, N), seq_length
    # is 1 here because we are sending in a single word and not a sentence
    x = x.unsqueeze(0)

    embedding = self.dropout(self.embedding(x))
    out, (h, c) = self.lstm(embedding, (h, c))
    logits = self.linear(out)
    logits = logits.squeeze(0)
    return logits, h, c

In [17]:
class EncoderDecoder(nn.Module):
  def __init__(self, encoder, decoder):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder

  def forward(self, source, target, teacher_forcing_ratio = 0.5):
    target_len = source.shape[0]
    batch_size = source.shape[1]
    target_vocab_size = len(token2id)

    outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)

    h, c = self.encoder(source)
    x = target[0]

    for t in range(1, target_len):
      output, h, c = self.decoder(x, h, c)
      outputs[t] = output
      pred = output.argmax(1)

      x = target[t] if random.random() < teacher_forcing_ratio else pred

    return outputs

# Training

In [33]:
EPOCHS = 20
LR = 1e-3
EMBEDDING_DIM = 256
HIDDEN_DIM = 1024
NUM_LAYERS = 4
DROPOUT = 0.3
BIDIRECTIONAL = False

encoder = Encoder(len(token2id), EMBEDDING_DIM, HIDDEN_DIM, NUM_LAYERS, BIDIRECTIONAL, DROPOUT).to(device)
decoder = Decoder(len(token2id), EMBEDDING_DIM, HIDDEN_DIM, len(token2id), NUM_LAYERS, BIDIRECTIONAL, DROPOUT).to(device)
model = EncoderDecoder(encoder, decoder).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss(ignore_index = token2id['P'])

In [35]:
for epoch in range(EPOCHS):
  model.train()
  epoch_loss = 0
  for i, batch in enumerate(train_dataloader):
    inputs = batch['inputs'].to(device)
    targets = batch['outputs'].to(device)

    output = model(inputs, targets)

    output = output[1:].reshape(-1, output.shape[2])
    targets = targets[1:].reshape(-1)

    optimizer.zero_grad()
    loss = criterion(output, targets)

    loss.backward()
    #torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
    optimizer.step()

    epoch_loss += loss.item()

  print(f"Epoch {epoch+1}: Loss = {epoch_loss/len(train_dataloader)}")

Epoch 1: Loss = 2.163138861656189
Epoch 2: Loss = 2.0930136556625367
Epoch 3: Loss = 2.086097363471985
Epoch 4: Loss = 2.0803390345573427
Epoch 5: Loss = 2.07758845615387
Epoch 6: Loss = 2.0748239793777468
Epoch 7: Loss = 2.072451006412506
Epoch 8: Loss = 2.0696512937545775
Epoch 9: Loss = 2.0667436790466307
Epoch 10: Loss = 2.0645857133865357
Epoch 11: Loss = 2.071227045536041
Epoch 12: Loss = 2.069625972270966
Epoch 13: Loss = 2.0679206938743593
Epoch 14: Loss = 2.062342148780823
Epoch 15: Loss = 2.0605491228103636
Epoch 16: Loss = 2.0595159821510314
Epoch 17: Loss = 2.0591312804222106
Epoch 18: Loss = 2.0580504875183108
Epoch 19: Loss = 2.0576289305686952
Epoch 20: Loss = 2.056765923976898


In [31]:
with torch.no_grad():
  batch = next(iter(test_dataloader))
  h, c = model.encoder(batch['inputs'][0].unsqueeze(1).to(device))

  outputs = [token2id["S"]]
  for i in range(60):
    prev = torch.LongTensor([outputs[-1]]).to(device)
    out, h, c = model.decoder(prev, h, c)
    pred = out.argmax(1).item()

    outputs.append(pred)
    if pred == token2id['E']:
      break

  print(f"Outputs = {outputs}")

Outputs = [17, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
