In [1]:
%%capture
%pip install torch pandas lightning trl

import torch
from torch import nn
import pytorch_lightning as pl
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd

import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from transformers import BartForConditionalGeneration, BartTokenizer

In [2]:
SEED = 999
torch.manual_seed(SEED)

if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

#set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')
print("Device:", device)


Device: cuda


In [3]:
import pandas as pd
from torch.utils.data import DataLoader, Dataset

# Custom Dataset
class TextHexDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = self.dataframe.iloc[idx]['text']
        hex_data = self.dataframe.iloc[idx]['deflate_hex']
        return text, hex_data

# Load the dataset
df = pd.read_csv('../Datasets/new_dataset_deflate.csv')

# Create datasets
dataset = TextHexDataset(df)

# Split the dataset
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# DataLoaders with batch_size = 1
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1)

In [4]:
import pytorch_lightning as pl
from transformers import BartTokenizer, BartModel
import torch

class TransformerModel(pl.LightningModule):
    def __init__(self):
        super(TransformerModel, self).__init__()
        self.tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
        self.transformer = BartModel.from_pretrained('facebook/bart-base')
        self.to(device)

    def forward(self, text):

        # Tokenize the text
        input_ids = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)
        attention_mask = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True).attention_mask.to(device)

        print(f"input_ids = {input_ids}")
        print(f"attention_mask = {attention_mask}")

        # Pass tokenized text through the transformer
        transformer_output = self.transformer(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state.to(device)
        print(f"transformer_output = {transformer_output}")

        transformer_output_squeezed = transformer_output.squeeze(0)
        print(f"decoded_transformer_output = {transformer_output_squeezed}")

        return transformer_output

    def training_step(self, batch):
        # Training logic here
        text, hex_data = batch

        output = self.forward(text)

        loss = torch.nn.MSELoss(output, hex_data)
        return loss
    
    def validation_step(self, batch):
        # Validation logic here
        text, hex_data = batch

        output = self.forward(text)

        loss = torch.nn.MSELoss(output, hex_data)
        return loss
    
    def test_step(self, batch):
        # Testing logic here
        text, hex_data = batch
        
        output = self.forward(text)
    
        loss = torch.nn.MSELoss(output, hex_data)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.001)



model = TransformerModel()
trainer = pl.Trainer(max_epochs=10)
trainer.fit(model, train_loader, val_loader)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]input_ids = tensor([[    0,  1121,   127,   393,    12,  4345,  9794,     7,   192,    25,
           171,  1318,  4133,    25,   678,    11,   127,  7370,     6,   939,
         21320,  2115,    42,   822,     2]], device='cuda:0')
attention_mask = tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]], device='cuda:0')


C:\Users\tomma\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


transformer_output = tensor([[[ 2.4690,  2.7761,  1.4309,  ...,  2.1089,  0.0401, -0.9130],
         [ 1.6525,  0.9595, -3.7883,  ...,  1.4576, -1.7831, -1.3209],
         [ 1.2243,  1.7641, -0.5684,  ..., -0.7462, -0.0230, -0.0991],
         ...,
         [ 0.8521,  0.1347, -1.4984,  ...,  2.3607,  1.2293, -1.0776],
         [ 2.2744,  0.4862, -0.0816,  ...,  1.0944,  0.0569, -0.5945],
         [-0.2819, -0.3464, -0.0053,  ...,  0.7679, -1.9116,  0.0454]]],
       device='cuda:0')
decoded_transformer_output = tensor([[ 2.4690,  2.7761,  1.4309,  ...,  2.1089,  0.0401, -0.9130],
        [ 1.6525,  0.9595, -3.7883,  ...,  1.4576, -1.7831, -1.3209],
        [ 1.2243,  1.7641, -0.5684,  ..., -0.7462, -0.0230, -0.0991],
        ...,
        [ 0.8521,  0.1347, -1.4984,  ...,  2.3607,  1.2293, -1.0776],
        [ 2.2744,  0.4862, -0.0816,  ...,  1.0944,  0.0569, -0.5945],
        [-0.2819, -0.3464, -0.0053,  ...,  0.7679, -1.9116,  0.0454]],
       device='cuda:0')


RuntimeError: Boolean value of Tensor with more than one value is ambiguous

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

C:\Users\tomma\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


RuntimeError: Overflow when unpacking long