# Fine-tuning GPT-2 with PyTorch Lightning

In [15]:
from torch.utils.data import Dataset

In [None]:
class GPT2Dataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors="pt", truncation=True, max_length=...)``.
            It is expected to return a mapping with ``"input_ids"`` and
            ``"attention_mask"`` tensors that carry a leading batch dimension
            of size 1.
        texts: Indexable, sized collection of strings.
        max_length: Truncation limit forwarded to the tokenizer.

    Note:
        No padding is requested from the tokenizer, so items may differ in
        length. Batching items of unequal length needs a padding ``collate_fn``.
    """

    def __init__(self, tokenizer, texts, max_length=512):
        self.tokenizer = tokenizer
        self.texts = texts
        self.max_length = max_length

    def __getitem__(self, idx):
        """Tokenize ``texts[idx]`` and return ``(input_ids, attention_mask)`` as 1-D tensors."""
        encoding = self.tokenizer(
            self.texts[idx],
            return_tensors="pt",
            truncation=True,
            max_length=self.max_length,
        )
        # Drop only the leading batch dimension. A bare squeeze() would also
        # collapse a one-token sequence into a 0-d scalar.
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        return input_ids, attention_mask

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

Import the necessary modules

In [17]:
import pytorch_lightning as pl
import torch.nn.functional as F

from torch.nn.utils.rnn import pad_sequence
from torch.optim.adamw import AdamW
from torch.utils.data import DataLoader
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer

Create a PyTorch Lightning module that wraps the GPT-2 model

In [18]:
class GPT2FineTuner(pl.LightningModule):
    """Lightning module that fine-tunes a GPT-2 model on next-token prediction.

    Args:
        model: Transformer called as ``model(input_ids=..., attention_mask=...)``.
            Either a causal-LM model whose output exposes ``logits``, or a bare
            GPT-2 transformer whose first output is the final hidden states.
        tokenizer: Tokenizer paired with ``model``. Stored on the module only.
        config: Model configuration. Stored on the module only.
        learning_rate: Learning rate handed to AdamW.
    """

    def __init__(self, model, tokenizer, config, learning_rate=1e-5):
        super().__init__()

        self.model = model
        self.tokenizer = tokenizer
        self.config = config
        self.learning_rate = learning_rate

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return its raw output object."""
        return self.model(input_ids=input_ids, attention_mask=attention_mask)

    def _lm_logits(self, outputs):
        """Return per-position vocabulary logits for a model output."""
        logits = getattr(outputs, "logits", None)
        if logits is not None:
            return logits
        # Headless transformer: outputs[0] holds the final hidden states.
        # GPT-2's language-modeling head is a bias-free linear layer whose
        # weights are tied to the input embeddings, so projecting through the
        # embedding matrix yields the same logits.
        embedding = self.model.get_input_embeddings()
        return F.linear(outputs[0], embedding.weight)

    def training_step(self, batch, batch_idx):
        """Compute and log the causal language-modeling loss for one batch.

        Args:
            batch: ``(input_ids, attention_mask)`` pair; both are expected to be
                ``(batch, seq_len)`` integer tensors.
            batch_idx: Index of the batch (unused).

        Returns:
            Scalar loss tensor: mean cross-entropy over non-padded target tokens.
        """
        input_ids, attention_mask = batch

        logits = self._lm_logits(self(input_ids, attention_mask))

        # The logit at position t predicts the token at t + 1, so drop the
        # last logit and the first label. Padded targets are set to -100 so
        # they contribute nothing to the loss.
        shift_logits = logits[:, :-1, :]
        shift_labels = input_ids[:, 1:].masked_fill(attention_mask[:, 1:] == 0, -100)

        # Sum, then divide by the target count clamped to 1, so that a batch
        # with no valid targets yields 0 instead of NaN.
        n_targets = (shift_labels != -100).sum().clamp(min=1)
        loss = F.cross_entropy(
            shift_logits.reshape(-1, shift_logits.size(-1)),
            shift_labels.reshape(-1),
            ignore_index=-100,
            reduction="sum",
        ) / n_targets

        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return loss

    def configure_optimizers(self):
        """Return an AdamW optimizer over all module parameters."""
        return AdamW(self.parameters(), lr=self.learning_rate)


Load the pre-trained GPT-2 model and tokenizer

In [19]:
# Single checkpoint name shared by the config, tokenizer and model so the
# three cannot drift apart.
MODEL_NAME = "sshleifer/tiny-gpt2"

gpt_config = AutoConfig.from_pretrained(MODEL_NAME, output_hidden_states=True)
gpt_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, config=gpt_config)
# AutoModel would load the bare transformer without a language-modeling head.
# The causal-LM class returns the vocabulary logits that next-token training needs.
gpt_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, config=gpt_config)

Create an instance of the GPT2FineTuner and a PyTorch Lightning Trainer

In [20]:
# Wrap the loaded model, tokenizer and config in the Lightning module.
fine_tuner = GPT2FineTuner(
    model=gpt_model,
    tokenizer=gpt_tokenizer,
    config=gpt_config,
)

# Trainer capped at 200 epochs, with the progress bar shown.
trainer = pl.Trainer(
    enable_progress_bar=True,
    max_epochs=200,
)

In [21]:
# Placeholder corpus: swap in your own list of strings.
texts = [
    "Text 1",
    "Text 2",
    "Text 3",
]

# Build the dataset. Each text is tokenized when its item is requested.
dataset = GPT2Dataset(tokenizer=gpt_tokenizer, texts=texts)

Prepare your dataset using Hugging Face's Datasets library or a custom data loader, and then start the fine-tuning process.

In [22]:
def pad_collate(batch):
    """Pad ``(input_ids, attention_mask)`` items to the longest one in the batch.

    Args:
        batch: List of ``(input_ids, attention_mask)`` tensor pairs as yielded
            by the dataset.

    Returns:
        Tuple ``(input_ids, attention_mask)`` of ``(batch, max_len)`` tensors.
        Padded positions carry attention mask 0.
    """
    pad_id = gpt_tokenizer.pad_token_id
    if pad_id is None:
        # Tokenizers without a pad token (GPT-2's default) fall back to EOS.
        # The value is irrelevant to attention because the mask is 0 there.
        pad_id = gpt_tokenizer.eos_token_id

    # reshape(-1) leaves 1-D items untouched and lifts a 0-d item to length 1,
    # since pad_sequence needs at least one dimension.
    input_ids = [ids.reshape(-1) for ids, _ in batch]
    attention_masks = [mask.reshape(-1) for _, mask in batch]

    return (
        pad_sequence(input_ids, batch_first=True, padding_value=pad_id),
        pad_sequence(attention_masks, batch_first=True, padding_value=0),
    )


# The dataset yields unpadded sequences, so texts of different lengths cannot
# be stacked by the default collate function. Pad per batch instead.
train_dataloader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=pad_collate)
trainer.fit(fine_tuner, train_dataloader)