# Text summarizer
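This notebook experiments with PEGASUS (Pre-training with Extracted Gap-sentences for Abstractive Summarization): it runs the `google/pegasus-cnn_dailymail` checkpoint on the WikiHow test split and scores the generated summaries with ROUGE.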


In [1]:
import torch
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset, DatasetDict
!pip install -q rouge_score
!pip install -q tqdm
!pip install .q sentencepiece

from rouge_score import rouge_scorer
import numpy as np
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm




### Global variables

In [2]:
# Read the train and test splits of the "sep" WikiHow configuration from the
# local data directory; passing a list to `split` returns one dataset per entry.
train, test = load_dataset(
    path="wikihow",
    name="sep",
    data_dir="../Dataset/",
    split=["train", "test"],
    trust_remote_code=True,
)

# Bundle both splits and keep only the two columns used below:
# "text" (model input) and "headline" (reference summary).
dataset = DatasetDict({"train": train, "test": test}).select_columns(["text", "headline"])

In [3]:
# Tokenization limits shared by preprocessing and generation:
# inputs are truncated to max_input_length tokens, targets/outputs to max_target_length.
max_input_length, max_target_length = 512, 64

# String prepended to every source text before tokenization.
# NOTE(review): a "summarize: " task prefix is a T5 convention; confirm that it
# actually helps with a Pegasus checkpoint.
prefix = "summarize: "

# Checkpoint name; the model weights and the tokenizer are both loaded from it.
model_name = "google/pegasus-cnn_dailymail"
model = PegasusForConditionalGeneration.from_pretrained(model_name)
tokenizer = PegasusTokenizer.from_pretrained(model_name)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Preprocessing

In [4]:
def preprocess_function(dataset):
    """Tokenize a batch of source texts and their reference headlines.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        dataset: Batch mapping with a "text" and a "headline" entry, each a
            list of strings.

    Returns:
        The tokenizer output for the prefixed texts, with an extra "labels"
        entry holding the token ids of the headlines.
    """
    inputs = [prefix + text for text in dataset["text"]]
    # No padding at this stage: inside a batched map, padding=True only pads to
    # the longest sequence of the current map batch, so rows still differ in
    # length across batches while padding is stored in the dataset. It would
    # also write pad ids into the labels, where a training loss would count
    # them. Padding is left to batch time (e.g. DataCollatorForSeq2Seq, which
    # pads labels with -100).
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    targets = dataset["headline"]
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split; batched=True passes preprocess_function a dict of column lists
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Hold out 10% of the training data for validation. The fixed seed makes the
# shuffle (and therefore the split) identical after every kernel restart.
train_test_split = tokenized_datasets["train"].train_test_split(test_size=0.1, seed=42)
tokenized_datasets["train"] = train_test_split["train"]
tokenized_datasets["validation"] = train_test_split["test"]



Map: 100%|██████████| 1060732/1060732 [05:12<00:00, 3399.47 examples/s]
Map: 100%|██████████| 37800/37800 [00:11<00:00, 3308.84 examples/s]


### Evaluation

In [7]:
# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Mixed precision is only enabled on CUDA. No GradScaler is created: it is a
# training-time utility for scaling losses and has no role in generation.
use_amp = device.type == "cuda"

# Read both columns once, up front. Indexing the dataset by column name inside
# the loop would fetch the whole column again on every iteration.
test_texts = tokenized_datasets["test"]["text"]
reference_summaries = tokenized_datasets["test"]["headline"]
generated_summaries = []

# Define batch size
batch_size = 32  # Adjust the batch size based on your GPU memory

# Generate summaries for the test set batch by batch, with a tqdm progress bar
for i in tqdm(range(0, len(tokenized_datasets["test"]), batch_size), desc="Generating summaries"):
    batch_texts = test_texts[i:i+batch_size]
    batch_inputs = tokenizer(
        [prefix + text for text in batch_texts],
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
        padding=True,
    ).to(device)

    with torch.cuda.amp.autocast(enabled=use_amp):
        # Pass the attention mask explicitly so padded positions in the batch
        # are ignored by the encoder instead of relying on generate() to infer it.
        summary_ids = model.generate(
            input_ids=batch_inputs["input_ids"],
            attention_mask=batch_inputs["attention_mask"],
            max_length=max_target_length,
            min_length=40,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True,
        )

    batch_summaries = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
    generated_summaries.extend(batch_summaries)

Generating summaries:   0%|          | 0/1182 [00:00<?, ?it/s]

Generating summaries: 100%|██████████| 1182/1182 [40:17<00:00,  2.05s/it]


In [9]:
# Write source text, reference headline and generated summary side by side to CSV
import pandas as pd

input_texts = tokenized_datasets["test"]["text"]
summary_data = pd.DataFrame(
    {
        "input_text": input_texts,
        "reference_summary": reference_summaries,
        "generated_summary": generated_summaries,
    }
)
summary_data.to_csv("generated_summaries.csv", index=False)


# ROUGE evaluation: only attempted when at least one summary exists
if not generated_summaries:
    print("No summaries were generated.")
else:
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # Per-example F-measures, one list per ROUGE variant
    rouge1_scores, rouge2_scores, rougeL_scores = [], [], []

    for ref_summary, gen_summary in zip(reference_summaries, generated_summaries):
        # A pair with a blank reference or blank generation is reported and
        # left out of the averages.
        if not (ref_summary.strip() and gen_summary.strip()):
            print(f"Empty summary detected: ref='{ref_summary}', gen='{gen_summary}'")
            continue

        scores = scorer.score(ref_summary, gen_summary)
        rouge1_scores.append(scores['rouge1'].fmeasure)
        rouge2_scores.append(scores['rouge2'].fmeasure)
        rougeL_scores.append(scores['rougeL'].fmeasure)

    # Report the mean F-measure per variant, provided something was scored
    if not (rouge1_scores and rouge2_scores and rougeL_scores):
        print("No valid scores to calculate averages.")
    else:
        avg_rouge1 = np.mean(rouge1_scores)
        avg_rouge2 = np.mean(rouge2_scores)
        avg_rougeL = np.mean(rougeL_scores)
        print(f'Average ROUGE-1 Score: {avg_rouge1:.4f}')
        print(f'Average ROUGE-2 Score: {avg_rouge2:.4f}')
        print(f'Average ROUGE-L Score: {avg_rougeL:.4f}')

Average ROUGE-1 Score: 0.1249
Average ROUGE-2 Score: 0.0373
Average ROUGE-L Score: 0.1063
