# Imports and installation


In [27]:
%%capture
!pip install lightning datasets

In [28]:
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader
import torch.nn as nn
import lightning as L
import random

# Global seed, reused for PyTorch, Lightning and the dataset splits further down.
SEED = 999
# Number of examples per mini-batch produced by the DataLoaders.
BATCH_SIZE = 64
# Seed PyTorch directly, then let Lightning apply the same seed via seed_everything
# (its return value, the seed itself, is what this cell displays).
torch.manual_seed(SEED)
L.seed_everything(SEED)

INFO: Seed set to 999


999

In [29]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

# Data Preparation

In [30]:
# Load the text / hex / deflate-hex table. The absolute path points at a Kaggle
# input directory, so this cell only resolves inside that environment.
df = pd.read_csv('/kaggle/input/shorthex2hex/shorthex2hex.csv')

In [31]:
df.head()

Unnamed: 0,text,text_hex,deflate_hex
0,One of the other,4f6e65206f6620746865206f74686572,789cf3cf4b55c84f5328c9005240a208002eb405bb
1,A wonderful little production.,4120776f6e64657266756c206c6974746c652070726f64...,789c735428cfcf4b492d4a2bcd51c8c92c29c949552828...
2,I thought this was,492074686f75676874207468697320776173,789cf35428c9c82f4dcf2801d299c50ae589c5003dea06b0
3,Basically there's a family,4261736963616c6c79207468657265277320612066616d...,789c734a2cce4c4eccc9a95428c9482d4a552f56485448...
4,"Petter Mattei's ""Love in",506574746572204d6174746569277320224c6f766520696e,789c0b482d29492d52f04d045299eac50a4a3ef965a90a...


Instead of the standard \<SOS> and \<EOS> tags, we use the letters S and E as start-of-sequence and end-of-sequence markers, since neither letter appears in the hexadecimal vocabulary (0–9, a–f).

In [32]:
# Wrap every hex string in the start (S) and end (E) markers.
# The guard makes this cell idempotent: the original unconditional
# concatenation stacked a second pair of markers ("SS...EE") each time the
# cell was re-run against the already-modified `df`. Neither 'S' nor 'E' can
# occur in the lowercase hex payload, so their presence at both ends reliably
# identifies an already-wrapped value.
for column in ('text_hex', 'deflate_hex'):
    already_wrapped = (
        df[column].str.startswith('S', na=False)
        & df[column].str.endswith('E', na=False)
    )
    # Keep rows that are already wrapped; wrap all the others.
    df[column] = df[column].where(already_wrapped, 'S' + df[column] + 'E')

In [33]:
# Wrap the DataFrame in a Hugging Face Dataset and split it 80/10/10:
# first hold out 20% of the rows, then split that hold-out in half.
# Both splits use SEED so the partition is reproducible.
ds = Dataset.from_pandas(df)
ds_train_test = ds.train_test_split(test_size=0.2, seed=SEED)
ds_test_dev = ds_train_test['test'].train_test_split(test_size=0.5, seed=SEED)
ds_splits = DatasetDict({
    'train': ds_train_test['train'],
    # The 'train' side of the hold-out split serves as the validation set.
    'valid': ds_test_dev['train'],
    'test': ds_test_dev['test']
})

ds_splits

DatasetDict({
    train: Dataset({
        features: ['text', 'text_hex', 'deflate_hex'],
        num_rows: 40000
    })
    valid: Dataset({
        features: ['text', 'text_hex', 'deflate_hex'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text', 'text_hex', 'deflate_hex'],
        num_rows: 5000
    })
})

# Tokenize into single characters or into sequences of characters?

In [34]:
ds_splits['train'][0]

{'text': 'First of all, this',
 'text_hex': 'S4669727374206f6620616c6c2c2074686973E',
 'deflate_hex': 'S789c73cb2c2a2e51c84f5348ccc9d15128c9c82c06003c54065bE'}

## Data tokenization

In [35]:
# Vocabulary: the sixteen lowercase hex digits take ids 0-15, followed by the
# special tokens P (padding, 16), S (start of sequence, 17) and E (end of
# sequence, 18).
token2id = {symbol: index for index, symbol in enumerate("0123456789abcdef" + "PSE")}

In [36]:
def create_id2token_vocab(token_to_id):
    """Invert a token -> id mapping into an id -> token mapping.

    Args:
        token_to_id: dict mapping each token to its integer id.

    Returns:
        A new dict mapping each id back to its token. If several tokens share
        an id, the token that appears last in ``token_to_id`` is kept.
    """
    return {index: symbol for symbol, index in token_to_id.items()}

id2token = create_id2token_vocab(token2id)
id2token

{0: '0',
 1: '1',
 2: '2',
 3: '3',
 4: '4',
 5: '5',
 6: '6',
 7: '7',
 8: '8',
 9: '9',
 10: 'a',
 11: 'b',
 12: 'c',
 13: 'd',
 14: 'e',
 15: 'f',
 16: 'P',
 17: 'S',
 18: 'E'}

In [37]:
# Corrected collate_fn function
def collate_fn(batch):
    """Turn a list of dataset rows into padded tensors of token ids.

    Each row's 'text_hex' and 'deflate_hex' strings are encoded character by
    character through ``token2id``; the sequences of each column are then
    right-padded with the 'P' token id to the longest sequence in the batch.

    Args:
        batch: list of rows, each a mapping with 'text_hex' and 'deflate_hex'
            string entries.

    Returns:
        dict with 'inputs' (encoded 'text_hex') and 'outputs' (encoded
        'deflate_hex'), each a batch-first padded tensor.
    """
    pad_id = token2id['P']

    def encode_and_pad(column):
        # One id tensor per row, then pad them all to a common length.
        sequences = [
            torch.tensor([token2id[symbol] for symbol in row[column]])
            for row in batch
        ]
        return nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=pad_id)

    return {
        'inputs': encode_and_pad('text_hex'),
        'outputs': encode_and_pad('deflate_hex'),
    }

# Upper bound on the number of tokens myRNN generates when decoding a sequence.
MAX_SEQ_LEN = 256

In [38]:
# Batches are assembled by collate_fn; only the training loader shuffles.
train_dataloader = DataLoader(ds_splits['train'], batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn, num_workers = 3)
val_dataloader = DataLoader(ds_splits['valid'], batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn, num_workers = 3)

# Print tensors in full instead of PyTorch's abbreviated form.
torch.set_printoptions(profile="full")

# Sanity check: show the shape of the first training batch's inputs, then stop.
for batch in train_dataloader:
    print(batch['inputs'].shape)
    break

torch.Size([64, 78])


In [None]:
import pytorch_lightning as pl

class myRNN(pl.LightningModule):
    """Character-level RNN encoder-decoder translating hex text to deflate hex.

    A single embedding layer and a single ``nn.RNN`` are shared by the encoder
    and the decoder: the source sequence is read first, and the resulting
    hidden state initialises decoding of the target sequence.

    Training and validation use teacher forcing with a cross-entropy loss over
    the predicted token logits. The previous implementation compared
    greedily-decoded (argmax) token ids against the targets with MSE on tensors
    rebuilt from Python lists; that loss was detached from the model, so no
    parameter ever received a gradient and nothing was learned.

    Relies on the notebook-level ``token2id`` vocabulary (tokens 'P', 'S', 'E')
    and on ``MAX_SEQ_LEN``.
    """

    def __init__(self, input_dim, emb_dim, hidden_dim, output_dim):
        """Build the layers.

        Args:
            input_dim: number of distinct input token ids (embedding rows).
            emb_dim: size of each token embedding.
            hidden_dim: size of the RNN hidden state.
            output_dim: number of output classes (token ids) to predict.
        """
        super().__init__()
        self.emb = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.RNN(emb_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        # Cross-entropy on logits is differentiable w.r.t. the parameters;
        # padded target positions are excluded from the loss.
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=token2id['P'])

    def forward(self, input):
        """Greedily decode the output sequence for one input sequence.

        Args:
            input: 1-D tensor of token ids for a single source sequence;
                padding tokens are ignored.

        Returns:
            list[int]: generated token ids, without the leading 'S'. Generation
            stops right after 'E' is produced or once ``MAX_SEQ_LEN`` tokens
            have been generated.
        """
        device = self.device
        tokens = input.to(device)
        # Drop padding so it does not distort the encoder's final state.
        tokens = tokens[tokens != token2id['P']]

        # Size the initial state from the layer instead of a hard-coded 64,
        # so any hidden_dim passed to __init__ works.
        hidden_state = torch.zeros(1, self.rnn.hidden_size, device=device)
        generated_sequence = []

        # Argmax decoding is not differentiable, so skip graph construction.
        with torch.no_grad():
            if tokens.numel() > 0:
                _, hidden_state = self.rnn(self.emb(tokens), hidden_state)

            next_id = token2id['S']
            while len(generated_sequence) < MAX_SEQ_LEN:
                step_input = self.emb(torch.tensor([next_id], device=device))
                output, hidden_state = self.rnn(step_input, hidden_state)
                next_id = int(self.fc(output).argmax(dim=-1).item())
                generated_sequence.append(next_id)
                if next_id == token2id['E']:
                    break

        return generated_sequence

    def _teacher_forced_loss(self, batch):
        """Cross-entropy of next-token predictions under teacher forcing.

        Args:
            batch: dict with 'inputs' and 'outputs', each a (batch, seq) tensor
                of token ids right-padded with the 'P' id, as produced by
                ``collate_fn``.

        Returns:
            Scalar loss tensor attached to the model's computation graph.
        """
        pad_id = token2id['P']
        inputs = batch['inputs'].to(self.device)
        targets = batch['outputs'].to(self.device)

        # Encoder: pack the padded batch so each sequence's final hidden state
        # is taken at its true last token rather than after the padding.
        lengths = (inputs != pad_id).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            self.emb(inputs), lengths, batch_first=True, enforce_sorted=False
        )
        _, hidden_state = self.rnn(packed)

        # Decoder: feed the gold tokens shifted right by one and predict the
        # next token at every position.
        decoder_inputs = targets[:, :-1]
        decoder_targets = targets[:, 1:]
        decoder_out, _ = self.rnn(self.emb(decoder_inputs), hidden_state)
        logits = self.fc(decoder_out)

        return self.loss_fn(
            logits.reshape(-1, logits.size(-1)),
            decoder_targets.reshape(-1),
        )

    def training_step(self, batch, batch_idx=None):
        """Compute and log the teacher-forced training loss for one batch."""
        loss = self._teacher_forced_loss(batch)
        self.log("train_loss", loss, prog_bar=True, batch_size=batch['inputs'].size(0))
        return loss

    def validation_step(self, batch, batch_idx=None):
        """Compute and log the teacher-forced validation loss for one batch."""
        loss = self._teacher_forced_loss(batch)
        self.log("val_loss", loss, prog_bar=True, batch_size=batch['inputs'].size(0))
        return loss

    def configure_optimizers(self):
        """Optimise all parameters with Adam at a learning rate of 1e-3."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)

# The token vocabulary size is used for both the embedding input and the output layer.
model = myRNN(input_dim=len(token2id), emb_dim=32, hidden_dim=64, output_dim=len(token2id))

# Train for at most 10 epochs, validating on val_dataloader.
trainer = pl.Trainer(max_epochs=10)
trainer.fit(model, train_dataloader, val_dataloader)



Sanity Checking: |          | 0/? [00:00<?, ?it/s]

# Model

# Training