In [None]:
import evaluate
import nltk
import numpy as np
import pandas as pd
import torch

from datasets import load_dataset
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, TrainingArguments, Trainer, pipeline, set_seed

# Fetch the NLTK resources named "punkt" and "punkt_tab"
# (presumably the data `sent_tokenize` relies on - confirm against the installed NLTK version).
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Load configuration "3.0.0" of the CNN/DailyMail dataset; the bare name below displays it.
dataset = load_dataset(path="cnn_dailymail", name="3.0.0")
dataset

In [None]:
def textgen_summary_pipeline(text: str, model_name: str = "gpt2-xl") -> str:
    """Summarise ``text`` by prompting a text-generation model with a "TL;DR" suffix.

    Args:
        text: Document to summarise.
        model_name: Checkpoint passed to the ``text-generation`` pipeline.

    Returns:
        The text the model produced after the prompt. For checkpoints whose name
        contains "pegasus" the " .<n>" markers are turned into line breaks;
        otherwise the text is split into sentences, one per line.
    """
    set_seed(42)  # fixed seed so repeated calls start from the same random state
    pipe = pipeline("text-generation", model=model_name)
    # Appending "TL;DR" (too long; didn't read) nudges the model into writing a summary.
    gpt2_query = text + "\nTL;DR:\n"
    # NOTE(review): max_length appears to count the prompt tokens as well, so prompts
    # longer than 512 tokens leave no room for new text - consider truncating `text`.
    pipe_out = pipe(gpt2_query, max_length=512, clean_up_tokenization_spaces=True)
    # A text-generation pipeline reports its result under "generated_text" (it has no
    # "summary_text" entry); the result starts with the prompt, which is sliced off here.
    generated = pipe_out[0]["generated_text"][len(gpt2_query) :]
    if "pegasus" in model_name:
        return generated.replace(" .<n>", ".\n")
    return "\n".join(sent_tokenize(generated))

def summary_pipeline(text: str, model_name: str = "t5-large") -> str:
    """Summarise ``text`` with a ``summarization`` pipeline built from ``model_name``.

    Args:
        text: Document to summarise.
        model_name: Checkpoint passed to the ``summarization`` pipeline.

    Returns:
        The summary with one sentence per line. For checkpoints whose name contains
        "pegasus" the "<n>" sentence markers are converted to line breaks; for all
        other checkpoints the summary is split with ``sent_tokenize``.
    """
    pipe = pipeline("summarization", model=model_name)
    # Full articles can be longer than the model's input limit; ask the pipeline to
    # truncate the input instead of forwarding an over-long sequence to the model.
    pipe_out = pipe(text, truncation=True)
    summary = pipe_out[0]["summary_text"]
    if "pegasus" in model_name:
        # Same " .<n>" -> ".\n" convention used elsewhere in this notebook, followed by
        # a catch-all for markers that appear with different spacing.
        return summary.replace(" .<n>", ".\n").replace("<n>", "\n")
    return "\n".join(sent_tokenize(summary))

In [None]:
# Use training example 12 as the running example and display it.
train_articles = dataset["train"]
sample = train_articles[12]
sample

In [None]:
# One summary per approach for the sample article, keyed by approach name.
article = sample["article"]
lead_three = "\n".join(sent_tokenize(article)[:3])  # first three sentences of the article

summaries = dict(
    _gt=sample["highlights"],  # the highlights stored alongside the article
    baseline=lead_three,
    model_gpt2_xl=textgen_summary_pipeline(text=article),
    model_bart=summary_pipeline(text=article, model_name="facebook/bart-large-cnn"),
    model_pegasus=summary_pipeline(text=article, model_name="google/pegasus-cnn_dailymail"),
    model_t5=summary_pipeline(text=article, model_name="t5-large"),
)

In [None]:
# Print each summary under its upper-cased name, with a blank line after each part.
for key, summary in summaries.items():
    print(key.upper() + "\n")
    print(summary + "\n")

# Model Evaluation

In [None]:
# Score every entry of `summaries` against the highlights of training example 12
# and show one row of ROUGE values per entry.
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_metric = evaluate.load("rouge")
reference = dataset["train"][12]["highlights"]

records = []
for model_name, prediction in summaries.items():
    rouge_metric.add(prediction=prediction, reference=reference)
    score = rouge_metric.compute()
    rouge_dict = {rn: score[rn] for rn in rouge_names}
    records.append(rouge_dict)

pd.DataFrame.from_records(records, index=summaries.keys())

In [None]:
def evaluate_baseline(
    dataset,
    metric: "evaluate.EvaluationModule",
    column_text: str = "article",
    column_summary: str = "highlights",
    num_sentences: int = 3,
):
    """Score a "first N sentences" baseline summary against the reference summaries.

    Args:
        dataset: Object indexable by column name; ``dataset[column_text]`` must be
            iterable over document strings.
        metric: Metric object exposing ``add_batch(predictions=..., references=...)``
            and ``compute()`` (for example the result of ``evaluate.load("rouge")``).
        column_text: Name of the column holding the documents.
        column_summary: Name of the column holding the reference summaries.
        num_sentences: How many leading sentences form the baseline summary.

    Returns:
        Whatever ``metric.compute()`` returns.
    """
    summaries = ["\n".join(sent_tokenize(text)[:num_sentences]) for text in dataset[column_text]]
    metric.add_batch(predictions=summaries, references=dataset[column_summary])
    return metric.compute()

In [None]:
# Shuffle the test split with a fixed seed and keep 1000 examples.
test_samples = (
    dataset["test"]
    .shuffle(seed=42)
    .select(range(1000))
)

# Score the baseline on that subset and show the ROUGE values as a single row.
score = evaluate_baseline(dataset=test_samples, metric=evaluate.load("rouge"))
rouge_dict = {rn: score[rn] for rn in rouge_names}
pd.DataFrame.from_dict(rouge_dict, orient="index", columns=["baseline"]).T

In [None]:
def chunks(list_of_elements, batch_size):
    """Yield consecutive slices of ``list_of_elements`` of length ``batch_size``.

    The last slice is shorter when the length is not a multiple of ``batch_size``.
    """
    offsets = range(0, len(list_of_elements), batch_size)
    yield from (list_of_elements[start : start + batch_size] for start in offsets)

def evaluate_pegasus(dataset, metric, model, tokenizer, batch_size=16, device=None, column_text="article", column_summary="highlights"):
    """Generate summaries batch by batch and score them against the references.

    Args:
        dataset: Object indexable by column name.
        metric: Metric object exposing ``add_batch`` and ``compute``.
        model: Sequence-to-sequence model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        batch_size: Number of documents tokenised and generated per step.
        device: Device the input tensors are moved to. When ``None`` the model's own
            ``device`` attribute is used (falling back to "cpu"), so the inputs land
            on the same device as the model.
        column_text: Name of the column holding the documents.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        Whatever ``metric.compute()`` returns after all batches were added.
    """
    if device is None:
        device = getattr(model, "device", "cpu")

    article_batches = list(chunks(list_of_elements=dataset[column_text], batch_size=batch_size))
    target_batches = list(chunks(list_of_elements=dataset[column_summary], batch_size=batch_size))

    for article_batch, target_batch in tqdm(zip(article_batches, target_batches), total=len(article_batches)):
        # Pad only up to the longest sequence of the batch; padding every batch to 1024
        # tokens spends compute on padding that the attention mask hides anyway.
        inputs = tokenizer(article_batch, max_length=1024, truncation=True, padding="longest", return_tensors="pt")
        summaries = model.generate(
            input_ids=inputs["input_ids"].to(device),
            attention_mask=inputs["attention_mask"].to(device),
            num_beams=8,
            max_length=128,
        )
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, clean_up_tokenization_spaces=True) for s in summaries]
        # Turn every "<n>" sentence marker into a line break. The earlier pattern
        # ". <n>" also removed the full stop and only matched that exact spacing.
        decoded_summaries = [d.replace("<n>", "\n") for d in decoded_summaries]
        metric.add_batch(predictions=decoded_summaries, references=target_batch)
    return metric.compute()

In [None]:
# Load the PEGASUS checkpoint fine-tuned on CNN/DailyMail and place the model on `device`.
model_name = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# Pass `device` explicitly: evaluate_pegasus defaults to "cpu", which would leave the
# input tensors on the CPU while the model sits on `device`.
score = evaluate_pegasus(
    dataset=test_samples,
    metric=evaluate.load("rouge"),
    model=model,
    tokenizer=tokenizer,
    batch_size=8,
    device=device,
)
rouge_dict = dict((rn, score[rn]) for rn in rouge_names)
pd.DataFrame(rouge_dict, index=["pegasus"])

## Training a Text Generation Model

In [None]:
# Load the SAMSum dialogue-summarisation dataset and look at its size, columns and first example.
dataset_samsum = load_dataset("knkarthick/samsum")
split_lengths = [len(dataset_samsum[split]) for split in dataset_samsum]

print("Split lengths", split_lengths)
print("Features", dataset_samsum["train"].column_names)

print("\nDialogue:\n", dataset_samsum["train"][0]["dialogue"])
# "\nSummary" rather than "\Summary": the latter is an invalid escape sequence and
# printed a literal backslash instead of a blank line.
print("\nSummary:\n", dataset_samsum["train"][0]["summary"])

In [None]:
# Summarise the first SAMSum training dialogue with the CNN/DailyMail PEGASUS checkpoint.
first_dialogue = dataset_samsum["train"][0]["dialogue"]
summary_pegasus = summary_pipeline(
    text=first_dialogue,
    model_name="google/pegasus-cnn_dailymail",
)

print("Summary:")
summary_pegasus

In [None]:
# Score the current `model` on the SAMSum test split.
score = evaluate_pegasus(
    dataset=dataset_samsum["test"],
    metric=evaluate.load("rouge"),
    model=model,
    tokenizer=tokenizer,
    column_text="dialogue",
    column_summary="summary",
    batch_size=8,
    device=device,  # keep the inputs on the same device as the model
)
# Store the scores in `rouge_dict`, the name the table below reads. The earlier
# misspelling `rogue_dict` made the table show values left over from a previous cell.
rouge_dict = dict((rn, score[rn]) for rn in rouge_names)
pd.DataFrame(rouge_dict, index=["pegasus"])

In [None]:
def convert_examples_to_features(batch):
    """Tokenise the "dialogue" and "summary" fields of ``batch`` for seq2seq training.

    Uses the module-level ``tokenizer``. Dialogues are truncated to 1024 tokens and
    summaries to 128 tokens.

    Args:
        batch: Mapping with "dialogue" and "summary" entries, each either a single
            string or a list of strings (as produced by ``map(..., batched=True)``).

    Returns:
        A dict with "input_ids" and "attention_mask" from the dialogues and
        "labels" holding the token ids of the summaries.
    """

    def _as_text(value):
        # Entries that are not strings (e.g. None) are replaced by "" so that one bad
        # row does not make the tokenizer reject the whole batch.
        if isinstance(value, str):
            return value
        if value is None:
            return ""
        return [item if isinstance(item, str) else "" for item in value]

    # Previously a tokenizer error was printed and the function returned None; that
    # leaves the caller without features for every row of the batch, so the input is
    # sanitised up front instead.
    input_encodings = tokenizer(_as_text(batch["dialogue"]), max_length=1024, truncation=True)
    # `text_target=` tokenises the summaries as targets; it replaces the deprecated
    # `with tokenizer.as_target_tokenizer():` context manager.
    target_encodings = tokenizer(text_target=_as_text(batch["summary"]), max_length=128, truncation=True)
    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": target_encodings["input_ids"],
    }

In [None]:
# Tokenise the SAMSum data in batches, then expose only the listed columns as torch tensors.
columns = ["input_ids", "labels", "attention_mask"]
dataset_samsum_pt = dataset_samsum.map(
    function=convert_examples_to_features,
    batched=True,
)
dataset_samsum_pt.set_format(type="torch", columns=columns)
dataset_samsum_pt

In [None]:
# Collator that batches the tokenised examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = TrainingArguments(
    output_dir="pegasus-samsum",
    num_train_epochs=1,
    warmup_steps=500,
    per_device_train_batch_size=1,
    # `per_device_eval_batch_size` is the current name of the deprecated
    # `per_gpu_eval_batch_size` argument.
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    logging_steps=10,
    push_to_hub=False,
    eval_strategy="steps",
    eval_steps=500,
    save_steps=1_000_000,  # same value as 1e6, written as an integer step count
    # Batch size 1 with 16 accumulation steps: gradients from 16 examples per update.
    gradient_accumulation_steps=16,
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=dataset_samsum_pt["train"],
    # Evaluate on the tokenised SAMSum data the model is trained on, not on the raw
    # CNN/DailyMail validation split (which has no "input_ids"/"labels" columns).
    # Assumes the SAMSum dataset provides a "validation" split - TODO confirm.
    eval_dataset=dataset_samsum_pt["validation"],
)

trainer.train()

In [None]:
# Score the model held by the trainer on the SAMSum test split.
score = evaluate_pegasus(
    dataset=dataset_samsum["test"],
    metric=evaluate.load("rouge"),
    model=trainer.model,
    tokenizer=tokenizer,
    batch_size=2,
    device=device,  # keep the inputs on the same device as the model
    column_text="dialogue",
    column_summary="summary",
)

# Store the scores in `rouge_dict`, the name the table below reads. The earlier
# misspelling `rogue_dict` made the table show values left over from a previous cell.
rouge_dict = dict((rn, score[rn]) for rn in rouge_names)
pd.DataFrame(rouge_dict, index=["pegasus"])

In [None]:
# Generate a summary for the first SAMSum test dialogue with the trainer's model and
# print it next to the reference summary.
gen_kwargs = {"num_beams": 8, "max_length": 128}
sample_text = dataset_samsum["test"][0]["dialogue"]
sample_summary = dataset_samsum["test"][0]["summary"]
pipe = pipeline("summarization", model=trainer.model, tokenizer=tokenizer)

print("\nDialogue:\n", sample_text)
print("\nTarget Summary:\n", sample_summary)
# Colon placed before the line break, matching the two labels above
# (the label previously read "Model Summary\n:").
print("\nModel Summary:\n", pipe(sample_text, **gen_kwargs)[0]["summary_text"])