In [1]:
import torch
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.utils.serialization as xser
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.utils.data import Dataset, DataLoader, DistributedSampler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import nltk
from tqdm import tqdm

# Initialize TPU device
device = xm.xla_device()
print(f"Device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Device: xla:0


I0000 00:00:1722740079.023391     920 pjrt_api.cc:100] GetPjrtApi was found for tpu at /usr/local/lib/python3.10/site-packages/torch_xla/lib/libtpu.so
I0000 00:00:1722740079.023475     920 pjrt_api.cc:79] PJRT_Api is set for device type tpu
I0000 00:00:1722740079.023486     920 pjrt_api.cc:146] The PJRT plugin has PJRT API version 0.46. The framework PJRT API version is 0.46.
E0000 00:00:1722740079.023668     920 common_lib.cc:798] Could not set metric server port: INVALID_ARGUMENT: Could not find SliceBuilder port 8471 in any of the 0 ports provided in `tpu_process_addresses`="local"
=== Source Location Trace: === 
learning/45eac/tfrc/runtime/common_lib.cc:479
E0804 02:54:39.060750645    1259 oauth2_credentials.cc:238]            oauth_fetch: UNKNOWN:C-ares status is not ARES_SUCCESS qtype=A name=metadata.google.internal. is_balancer=0: Domain name not found {grpc_status:2, created_time:"2024-08-04T02:54:39.060729989+00:00"}


In [2]:
# Model and Tokenizer
model_name = "Michau/t5-base-en-generate-headline"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
# Load dataset
dataset = pd.read_csv("/kaggle/input/news-headline/news_summary.csv", encoding='latin-1')

In [25]:
train_dataset, test_dataset = train_test_split(dataset, shuffle=True, test_size=3500, random_state=42)
#train_dataset, val_dataset = train_test_split(train_dataset, shuffle=True, test_size=0.1, random_state=42)

"""print(f"Train set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")
print(f"Test set size: {len(test_dataset)}")"""

'print(f"Train set size: {len(train_dataset)}")\nprint(f"Validation set size: {len(val_dataset)}")\nprint(f"Test set size: {len(test_dataset)}")'

In [None]:
model = T5ForConditionalGeneration.from_pretrained("Michau/t5-base-en-generate-headline")
tokenizer = T5Tokenizer.from_pretrained("Michau/t5-base-en-generate-headline")
model = model.to(device)

In [13]:
# Function to generate headlines
def generate_headline(text):
    encoding = tokenizer.encode_plus("headline: " + text, return_tensors = "pt")
    input_ids = encoding["input_ids"].to(device)
    attention_masks = encoding["attention_mask"].to(device)
    outputs = model.generate(input_ids = input_ids, attention_mask = attention_masks,
                             max_length=100, min_length=20)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [44]:
# Function to generate headlines
def generate_headline(text, max_length=100, min_length=20, num_beams=5, repetition_penalty=2.5, length_penalty=1.0, early_stopping=True):
    encoding = tokenizer.encode_plus("headline: " + text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    input_ids = encoding["input_ids"].to(device)
    attention_masks = encoding["attention_mask"].to(device)
    
    outputs = model.generate(
        input_ids=input_ids, 
        attention_mask=attention_masks,
        max_length=max_length, 
        min_length=min_length,
        num_beams=num_beams,
        repetition_penalty=repetition_penalty,
        length_penalty=length_penalty,
        early_stopping=early_stopping
    )
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [4]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import DataCollatorForSeq2Seq, T5Tokenizer, T5ForConditionalGeneration, AdamW
import torch

class NewsHeadlineDataset(Dataset):
    def __init__(self, articles, headlines, tokenizer, max_length=512):
        self.articles = articles
        self.headlines = headlines
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.articles)

    def __getitem__(self, idx):
        article = self.articles[idx]
        headline = self.headlines[idx]

        input_encoding = self.tokenizer.encode_plus(
            "headline: " + article,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        target_encoding = self.tokenizer.encode_plus(
            headline,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            'input_ids': input_encoding['input_ids'].squeeze(),
            'attention_mask': input_encoding['attention_mask'].squeeze(),
            'labels': target_encoding['input_ids'].squeeze()
        }

# Extract columns
headlines = dataset['headlines'].tolist()
articles = dataset['text'].tolist()

# Split the dataset into training, validation, and test sets
train_articles, test_articles, train_headlines, test_headlines = train_test_split(
    articles, headlines, test_size=3500, shuffle=True, random_state=42
)
train_articles, val_articles, train_headlines, val_headlines = train_test_split(
    train_articles, train_headlines, train_size=20000, test_size=1500, shuffle=True, random_state=42
)

# Dataloader adaptation for TPUs
def get_dataloader(articles, headlines, tokenizer, batch_size=6):
    dataset = NewsHeadlineDataset(articles, headlines, tokenizer)
    sampler = DistributedSampler(dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=True)
    dataloader = DataLoader(dataset, batch_size=batch_size, sampler=sampler)
    return dataloader

train_dataloader = get_dataloader(train_articles, train_headlines, tokenizer)
val_dataloader = get_dataloader(val_articles, val_headlines, tokenizer)
test_dataloader = get_dataloader(test_articles, test_headlines, tokenizer)

# Use DataCollatorForSeq2Seq for dynamic padding
#data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

print(f"Train set size: {len(train_articles)}")
print(f"Validation set size: {len(val_articles)}")
print(f"Test set size: {len(test_articles)}")


E0000 00:00:1722740136.890878     920 common_lib.cc:798] Could not set metric server port: INVALID_ARGUMENT: Could not find SliceBuilder port 8471 in any of the 0 ports provided in `tpu_process_addresses`="local"
=== Source Location Trace: === 
learning/45eac/tfrc/runtime/common_lib.cc:479
E0804 02:55:36.921754492    2063 oauth2_credentials.cc:238]            oauth_fetch: UNKNOWN:C-ares status is not ARES_SUCCESS qtype=A name=metadata.google.internal. is_balancer=0: Domain name not found {grpc_status:2, created_time:"2024-08-04T02:55:36.921737638+00:00"}


Train set size: 20000
Validation set size: 1500
Test set size: 3500


In [5]:
import torch
from transformers import AdamW
from tqdm import tqdm
from torch.nn.utils import clip_grad_norm_

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Define the training loop
def train_epoch(dataloader, model, optimizer, device):
    model.train()
    dataloader = pl.MpDeviceLoader(dataloader, device)
    total_loss = 0
    for batch in tqdm(dataloader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        
        # Gradient clipping
        clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    return avg_loss

# Define the evaluation loop
def evaluate_model(dataloader, model, device):
    model.eval()
    dataloader = pl.MpDeviceLoader(dataloader, device)
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    return avg_loss

# Train the model using xmp.spawn
def _mp_fn(rank, flags):
    device = xm.xla_device()
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=1e-5)
    
    num_epochs = flags['num_epochs']
    early_stopping_patience = flags['early_stopping_patience']
    best_val_loss = float('inf')
    epochs_no_improve = 0

    for epoch in range(num_epochs):
        train_loss = train_epoch(train_dataloader, model, optimizer, device)
        val_loss = evaluate_model(val_dataloader, model, device)
        xm.master_print(f'Epoch {epoch+1}, Train loss: {train_loss}, Val loss: {val_loss}')
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
            if xm.is_master_ordinal():
                xm.save(model.state_dict(), 'best_t5_model.pth')
                xm.master_print("Saved Best Model")
        else:
            epochs_no_improve += 1
        
        if epochs_no_improve >= early_stopping_patience:
            xm.master_print("Early stopping triggered")
            break

# Call to start the process, with appropriate flags
FLAGS = {
    'num_epochs': 10,
    'early_stopping_patience': 1
}
xmp.spawn(_mp_fn, args=(FLAGS,), nprocs=1, start_method='fork')


Training: 100%|██████████| 3334/3334 [39:45<00:00,  1.40it/s]
Evaluating: 100%|██████████| 250/250 [00:53<00:00,  4.69it/s]


Epoch 1, Train loss: 0.4325519307037725, Val loss: 0.04356083687394857
Saved Best Model


Training: 100%|██████████| 3334/3334 [37:28<00:00,  1.48it/s]
Evaluating: 100%|██████████| 250/250 [00:38<00:00,  6.50it/s]


Epoch 2, Train loss: 0.048148489444352904, Val loss: 0.04025787775963545
Saved Best Model


Training: 100%|██████████| 3334/3334 [37:51<00:00,  1.47it/s]
Evaluating: 100%|██████████| 250/250 [00:39<00:00,  6.40it/s]


Epoch 3, Train loss: 0.04432570244547094, Val loss: 0.03889596602320671
Saved Best Model


Training: 100%|██████████| 3334/3334 [38:15<00:00,  1.45it/s]
Evaluating: 100%|██████████| 250/250 [00:38<00:00,  6.47it/s]


Epoch 4, Train loss: 0.04207328416828354, Val loss: 0.03807511404156685
Saved Best Model


Training: 100%|██████████| 3334/3334 [38:00<00:00,  1.46it/s]
Evaluating: 100%|██████████| 250/250 [00:38<00:00,  6.45it/s]


Epoch 5, Train loss: 0.04022952817502221, Val loss: 0.037378009773790834
Saved Best Model


Training: 100%|██████████| 3334/3334 [37:42<00:00,  1.47it/s]
Evaluating: 100%|██████████| 250/250 [00:38<00:00,  6.52it/s]


Epoch 6, Train loss: 0.03876923154631711, Val loss: 0.03701328165829182
Saved Best Model


Training: 100%|██████████| 3334/3334 [38:11<00:00,  1.45it/s]
Evaluating: 100%|██████████| 250/250 [00:38<00:00,  6.43it/s]


Epoch 7, Train loss: 0.03736640420006549, Val loss: 0.03675418548285961
Saved Best Model


Training: 100%|██████████| 3334/3334 [38:20<00:00,  1.45it/s]
Evaluating: 100%|██████████| 250/250 [00:38<00:00,  6.45it/s]


Epoch 8, Train loss: 0.03624113619184523, Val loss: 0.03655692219734192
Saved Best Model


Training: 100%|██████████| 3334/3334 [38:15<00:00,  1.45it/s]
Evaluating: 100%|██████████| 250/250 [00:38<00:00,  6.50it/s]


Epoch 9, Train loss: 0.035189949142806404, Val loss: 0.03640651270747185
Saved Best Model


Training: 100%|██████████| 3334/3334 [37:45<00:00,  1.47it/s]
Evaluating: 100%|██████████| 250/250 [00:38<00:00,  6.45it/s]


Epoch 10, Train loss: 0.03413055155534648, Val loss: 0.03635417181253433
Saved Best Model


In [None]:
# Function to calculate evaluation metrics on the test set
def evaluate_on_test_set(test_dataloader, model, tokenizer, device):
    model.eval()
    predictions = []
    references = []

    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Testing"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=100,
                min_length=20,
                num_beams=5,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True
            )

            for i in range(len(outputs)):
                predicted_headline = tokenizer.decode(outputs[i], skip_special_tokens=True)
                reference_headline = tokenizer.decode(labels[i], skip_special_tokens=True)
                predictions.append(predicted_headline)
                references.append(reference_headline)

    # Calculate metrics (e.g., BLEU, ROUGE, METEOR)
    # You can use libraries such as nltk or rouge_score to calculate these metrics

    return predictions, references

# Evaluate the model on the test set
predictions, references = evaluate_on_test_set(test_dataloader, model, tokenizer, device)

# Print some example predictions
for i in range(10):
    print(f"Article: {test_articles[i]}")
    print(f"Generated Headline: {predictions[i]}")
    print(f"Actual Headline: {references[i]}")
    print()


In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
nltk.download('wordnet')

from pycocoevalcap.meteor.meteor import Meteor
# Initialize METEOR scorer
meteor_scorer = Meteor()

def evaluateRandomly_test(metric, n=10):
    for i in range(n):
        print(i)
        eval_sample = test_dataset.iloc[i:i+1, :]
        print('news_article > ', eval_sample['text'].iloc[0])
        headline = eval_sample['headlines'].iloc[0]
        print('original_headline = ', headline)
        output_sentence = generate_headline(eval_sample['text'].iloc[0])
        print('predicted_headline < ', output_sentence)

        # Prepare data in the expected format
        gts = {0: [headline]}  # Ground truth as a dictionary
        res = {0: [output_sentence]}  # Hypothesis as a dictionary
        
        if metric == 'meteor':
            # Calculate METEOR score
            score, _ = meteor_scorer.compute_score(gts, res)
            print(f"METEOR score: {score}")
            
        elif metric == 'bleu':
            # Calculate BLEU score
            bleu_score = sentence_bleu([headline.split()], output_sentence.split())
            print(f"BLEU score: {bleu_score}")
            
        else:            
            # Calculate ROUGE score
            rouge_scorer_inst = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
            rouge_scores = rouge_scorer_inst.score(headline, output_sentence)
            print(f"ROUGE-1 score: {rouge_scores['rouge1'].fmeasure}")
            print(f"ROUGE-L score: {rouge_scores['rougeL'].fmeasure}")

In [7]:
# Load the saved model
model = T5ForConditionalGeneration.from_pretrained("Michau/t5-base-en-generate-headline")
model.load_state_dict(torch.load('/kaggle/working/best_t5_model.pth'))
model = model.to(device)

In [33]:
# Load the fine-tuned model and tokenizer
model_name = "Michau/t5-base-en-generate-headline"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Load the saved fine-tuned model
model.load_state_dict(torch.load("/kaggle/working/best_t5_model.pth"))

<All keys matched successfully>

In [36]:
# Function to evaluate the model on the test set
def evaluate_test_set(test_articles, test_headlines):
    model.eval()
    meteor_scores = []

    for article, true_headline in tqdm(zip(test_articles, test_headlines), total=len(test_articles), desc="Evaluating"):
        generated_headline = generate_headline(article)
        meteor_score = nltk.translate.meteor_score.single_meteor_score(true_headline.split(), generated_headline.split())
        meteor_scores.append(meteor_score)
    
    avg_meteor_score = sum(meteor_scores) / len(meteor_scores)
    return avg_meteor_score

# Evaluate the model on the test set
avg_meteor_score = evaluate_test_set(test_articles, test_headlines)
print(f"Average METEOR Score on the Test Set: {avg_meteor_score}")

Evaluating: 100%|██████████| 3500/3500 [1:08:58<00:00,  1.18s/it]

Average METEOR Score on the Test Set: 0.46160107794777444





In [37]:
def print_sample_predictions(test_articles, test_headlines, num_samples=10):
    model.eval()
    for i in range(num_samples):
        article = test_articles[i]
        true_headline = test_headlines[i]
        generated_headline = generate_headline(article)
        
        print(f"Article {i+1}: {article[:150]}...")  # Print the first 150 characters of the article for brevity
        print(f"True Headline: {true_headline}")
        print(f"Generated Headline: {generated_headline}")
        print(f"METEOR Score: {nltk.translate.meteor_score.single_meteor_score(true_headline.split(), generated_headline.split())}")
        print('-'*100)

# Print sample predictions for analysis
print_sample_predictions(test_articles, test_headlines)


Article 1: Students in Karnataka will get extra marks if their parents cast votes in the upcoming assembly elections, the Associated Management of Primary and Se...
True Headline: K'taka students to get extra marks if parents vote in polls
Generated Headline: K'taka students to get extra marks if parents cast votes in polls: Association
METEOR Score: 0.8892857142857141
----------------------------------------------------------------------------------------------------
Article 2: Syrian anti-aircraft defences on Monday shot down missiles over two air bases, Syria's state media said. The missiles targeted Shayrat air base in the...
True Headline: Syria shoots down missiles fired at two air bases
Generated Headline: Syrian air defences shoot down missiles over two bases.
METEOR Score: 0.4955555555555556
----------------------------------------------------------------------------------------------------
Article 3: A Dinosaur-like creature's fossil was found during an excavation on Sunday i