In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm
import numpy as np
import pandas as pd
from rouge_score import rouge_scorer


In [2]:
class TextDataset(Dataset):
    def __init__(self, dataframe, tokenizer, source_max_length=512, target_max_length=64):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.source_max_length = source_max_length
        self.target_max_length = target_max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        source_text = str(self.data.iloc[idx]['cleaned_article'])
        target_text = str(self.data.iloc[idx]['cleaned_highlights'])

        source = self.tokenizer.encode_plus(
            source_text,
            max_length=self.source_max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        target = self.tokenizer.encode_plus(
            target_text,
            max_length=self.target_max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': source['input_ids'].flatten(),
            'attention_mask': source['attention_mask'].flatten(),
            'labels': target['input_ids'].flatten(),
            'decoder_attention_mask': target['attention_mask'].flatten()
        }


In [3]:
def calculate_rouge(predictions, references):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = [scorer.score(pred, ref) for pred, ref in zip(predictions, references)]
    return scores


In [4]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [5]:
validation_split = pd.read_csv('validate_split.csv')
validation_dataset = TextDataset(validation_split, tokenizer)  # Assuming TextDataset is defined
validation_loader = DataLoader(validation_dataset, batch_size=8, shuffle=False)


In [7]:
model.eval()
predictions, references = [], []
rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
with torch.no_grad():
    for batch in tqdm(validation_loader, desc='Evaluating'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=64)
        decoded_preds = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outputs]
        decoded_labels = [tokenizer.decode(ids, skip_special_tokens=True) for ids in labels]

        predictions.extend(decoded_preds)
        references.extend(decoded_labels)

    rouge_scores = [rouge_scorer.score(pred, ref) for pred, ref in zip(predictions, references)]
    avg_rouge1 = np.mean([score['rouge1'].fmeasure for score in rouge_scores])
    avg_rouge2 = np.mean([score['rouge2'].fmeasure for score in rouge_scores])
    avg_rougeL = np.mean([score['rougeL'].fmeasure for score in rouge_scores])

    print(f'Average ROUGE-1 F1: {avg_rouge1:.4f}')
    print(f'Average ROUGE-2 F1: {avg_rouge2:.4f}')
    print(f'Average ROUGE-L F1: {avg_rougeL:.4f}')


Evaluating: 100%|█████████████████████████████████████████████████████████████████████████████████| 288/288 [30:16<00:00,  6.31s/it]


Average ROUGE-1 F1: 0.2964
Average ROUGE-2 F1: 0.1179
Average ROUGE-L F1: 0.2149


In [8]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import pandas as pd

# Load the fine-tuned model and tokenizer
model_path = 'fine_tuned_t5_model'
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

# Define the article to summarize
article = """The rise of artificial intelligence (AI) has brought both excitement and concern to various industries. In healthcare, AI is revolutionizing how diseases are diagnosed and treated. AI-powered algorithms can analyze medical images, such as X-rays and MRIs, with incredible accuracy, helping doctors detect diseases like cancer earlier than ever before. This early detection can significantly improve patient outcomes and save lives.

In addition to diagnostics, AI is also being used to personalize treatment plans. By analyzing vast amounts of patient data, AI can recommend the most effective treatments based on individual factors like genetics and lifestyle. This personalized approach not only improves patient outcomes but also reduces healthcare costs by minimizing trial-and-error treatments.

However, the widespread adoption of AI in healthcare also raises ethical and privacy concerns. Issues such as data security, bias in algorithms, and the impact on jobs are hotly debated topics. Despite these challenges, the potential of AI to revolutionize healthcare is undeniable. As technology continues to advance, AI is poised to play an increasingly vital role in improving patient care and advancing medical research."""

# Function to generate summary
def generate_summary(model, tokenizer, text, max_length=150):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(inputs, max_length=max_length, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Generate and display summary
generated_summary = generate_summary(model, tokenizer, article)

print("Original Article:")
print(article)
print("\nGenerated Summary:")
print(generated_summary)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Original Article:
The rise of artificial intelligence (AI) has brought both excitement and concern to various industries. In healthcare, AI is revolutionizing how diseases are diagnosed and treated. AI-powered algorithms can analyze medical images, such as X-rays and MRIs, with incredible accuracy, helping doctors detect diseases like cancer earlier than ever before. This early detection can significantly improve patient outcomes and save lives.

In addition to diagnostics, AI is also being used to personalize treatment plans. By analyzing vast amounts of patient data, AI can recommend the most effective treatments based on individual factors like genetics and lifestyle. This personalized approach not only improves patient outcomes but also reduces healthcare costs by minimizing trial-and-error treatments.

However, the widespread adoption of AI in healthcare also raises ethical and privacy concerns. Issues such as data security, bias in algorithms, and the impact on jobs are hotly deb