In [2]:
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import time

device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cpu


In [3]:
class CustomDataset(Dataset):
    def __init__(self, articles, summaries, tokenizer, max_length):
        self.articles = articles
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.articles)
    
    def __getitem__(self, idx):
        article = self.articles[idx]
        summary = self.summaries[idx]
        
        encoding = self.tokenizer.encode_plus(
            article,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        labels = self.tokenizer.encode(
            summary,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': labels.flatten()
        }

In [4]:
def load_data(file_path):
    df = pd.read_csv(file_path)
    tokenized_articles = df['Tokenized_Articles'].tolist()
    tokenized_summaries = df['Tokenized_Summaries'].tolist()
    return tokenized_articles, tokenized_summaries

In [5]:
def fine_tune_t5(data_file, model_name='t5-small', batch_size=1, epochs=1):
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    device = torch.device('cpu')
    
    tokenized_articles, tokenized_summaries = load_data(data_file)
    
    train_articles, val_articles, train_summaries, val_summaries = train_test_split(
        tokenized_articles, tokenized_summaries, test_size=0.1, random_state=42
    )
    
    train_dataset = CustomDataset(train_articles, train_summaries, tokenizer, max_length=512)
    val_dataset = CustomDataset(val_articles, val_summaries, tokenizer, max_length=512)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    
    for epoch in range(epochs):
        model.train()
        start_time = time.time()
        
        for step, batch in enumerate(train_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            optimizer.zero_grad()
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            
            if step % 10 == 0:
                elapsed_time = time.time() - start_time
                print(f"Epoch {epoch + 1}/{epochs}, Step {step}/{len(train_loader)}, Loss: {loss.item():.4f}, Time: {elapsed_time:.2f}s")
                start_time = time.time()
        
        model.eval()
        total_val_loss = 0
        for batch in val_loader:
            with torch.no_grad():
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                
                val_loss = outputs.loss
                total_val_loss += val_loss.item()
        
        avg_val_loss = total_val_loss / len(val_loader)
        print(f'Epoch {epoch + 1}/{epochs}, Average Validation Loss: {avg_val_loss:.4f}')
    
    model.save_pretrained('fine_tuned_t5_small_model')
    tokenizer.save_pretrained('fine_tuned_t5_small_model')

In [6]:
if __name__ == '__main__':
    data_file = r'C:\Users\DEll\Downloads\Text Summarization\summarization_data.csv'
    
    fine_tune_t5(data_file)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Epoch 1/1, Step 0/2001, Loss: 13.8252, Time: 3.97s
Epoch 1/1, Step 10/2001, Loss: 3.0118, Time: 35.66s
Epoch 1/1, Step 20/2001, Loss: 2.4073, Time: 33.52s
Epoch 1/1, Step 30/2001, Loss: 2.7016, Time: 33.64s
Epoch 1/1, Step 40/2001, Loss: 2.4711, Time: 32.92s
Epoch 1/1, Step 50/2001, Loss: 1.6185, Time: 33.38s
Epoch 1/1, Step 60/2001, Loss: 2.0785, Time: 32.47s
Epoch 1/1, Step 70/2001, Loss: 2.3879, Time: 32.99s
Epoch 1/1, Step 80/2001, Loss: 2.2997, Time: 32.20s
Epoch 1/1, Step 90/2001, Loss: 2.3197, Time: 32.52s
Epoch 1/1, Step 100/2001, Loss: 2.4183, Time: 32.72s
Epoch 1/1, Step 110/2001, Loss: 2.3385, Time: 32.80s
Epoch 1/1, Step 120/2001, Loss: 2.3841, Time: 32.20s
Epoch 1/1, Step 130/2001, Loss: 2.2369, Time: 32.62s
Epoch 1/1, Step 140/2001, Loss: 2.2808, Time: 32.32s
Epoch 1/1, Step 150/2001, Loss: 1.6601, Time: 32.32s
Epoch 1/1, Step 160/2001, Loss: 1.2604, Time: 32.24s
Epoch 1/1, Step 170/2001, Loss: 1.9437, Time: 32.59s
Epoch 1/1, Step 180/2001, Loss: 2.2415, Time: 32.59s
Epoc

In [15]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

model_name = 'fine_tuned_t5_small_model'
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

article = "In the heart of the forest, where sunlight filtered through the dense canopy, lived a solitary fox named Jasper. He roamed the undergrowth with silent grace, his amber eyes keen and observant. Jasper had always preferred solitude, finding solace in the quiet rustle of leaves and the distant calls of birds. Yet one spring morning, he encountered a wounded rabbit. Despite his instinct to retreat, Jasper stayed, offering a gentle nudge of reassurance. In that fleeting moment, an unexpected bond formed between predator and prey, revealing the fragile beauty of compassion in the wild."

inputs = tokenizer.encode("summarize: " + article, return_tensors="pt", max_length=1024, truncation=True)
summary_ids = model.generate(inputs, max_length=300, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("Generated Summary:", generated_summary)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Generated Summary: a fox named jasper roamed the undergrowth with silent grace, his amber eyes keen and observant. despite his instinct to retreat, he stayed, offering a gentle nudge of reassurance.
