In [1]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import pandas as pd
import numpy as np

# Load the train_split dataset
train_split = pd.read_csv('train_split.csv')

# Define a custom dataset class
class TextDataset(Dataset):
    def __init__(self, dataframe, tokenizer, source_max_length=512, target_max_length=64):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.source_max_length = source_max_length
        self.target_max_length = target_max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        source_text = str(self.data.iloc[idx]['cleaned_article'])
        target_text = str(self.data.iloc[idx]['cleaned_highlights'])

        source = self.tokenizer.encode_plus(
            source_text,
            max_length=self.source_max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        target = self.tokenizer.encode_plus(
            target_text,
            max_length=self.target_max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': source['input_ids'].flatten(),
            'attention_mask': source['attention_mask'].flatten(),
            'labels': target['input_ids'].flatten(),
            'decoder_attention_mask': target['attention_mask'].flatten()
        }

# Initialize the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Initialize the dataset and data loader
train_dataset = TextDataset(train_split, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Define training parameters
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training loop
epochs = 1
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}', leave=False):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        decoder_attention_mask = batch['decoder_attention_mask'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels, decoder_attention_mask=decoder_attention_mask)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1} - Average Loss: {avg_loss:.4f}')

# Save the fine-tuned model
model_path = 'fine_tuned_t5_model'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
                                                                                                                                    

Epoch 1 - Average Loss: 2.4627


('fine_tuned_t5_model\\tokenizer_config.json',
 'fine_tuned_t5_model\\special_tokens_map.json',
 'fine_tuned_t5_model\\spiece.model',
 'fine_tuned_t5_model\\added_tokens.json')