In [None]:
!pip install datasets

In [None]:
!pip install sacrebleu

In [None]:
import pandas as pd
import numpy as np
from datasets import load_metric
# Toy BLEU example: load the "sacrebleu" metric and score a single
# prediction against a single reference.
bleu_metric = load_metric("sacrebleu")
bleu_metric.add(
 prediction="the the the the the the", reference=["the cat is on the mat"])
# Compute with the "floor" smoothing method and a smoothing value of 0.
results = bleu_metric.compute(smooth_method="floor", smooth_value=0)
# Round each entry of results["precisions"] to two decimals for display.
results["precisions"] = [np.round(p,2) for p in results["precisions"]]
# Show the result dict as a one-column table (keys become the index).
pd.DataFrame.from_dict(results, orient="index",columns=["Value"])

In [None]:
from datasets import load_dataset

# Load the "cnn_dailymail" dataset; '3.0.0' is presumably the
# configuration/version name - TODO confirm against the dataset card.
data = load_dataset("cnn_dailymail",'3.0.0')

In [None]:
data["train"].column_names

### Printing an example article's length, an excerpt, and its highlights

In [None]:
# Take the training example at index 1: its article text and its highlights.
article = data["train"][1]['article']
highlight = data["train"][1]['highlights']

In [None]:
# Print the article's total length, its first 500 characters, and the
# full highlights text for the example selected above.
print(f"Article (excerpt of 500 characters, total length: {len(article)})\n")
print("Article", article[:500])
print("highlight", highlight)

### Output of different models

In [None]:
import nltk
from nltk.tokenize import sent_tokenize

# Fetch NLTK's 'punkt' resource, then try sent_tokenize on a string whose
# abbreviations ("U.S.", "U.N.") contain periods that are not sentence ends.
nltk.download('punkt')
string = "The U.S. are a country. The U.N. is an organization."
sent_tokenize(string)

In [None]:
def baseline(text):
  """Return the first three sentences of ``text``, one per line.

  Sentences are found with ``sent_tokenize``; texts with fewer than three
  sentences are returned in full.
  """
  leading_sentences = sent_tokenize(text)[:3]
  return "\n".join(leading_sentences)

In [None]:
# Shared model input: the first 2000 characters of the first training article.
sample = data["train"][0]["article"][:2000]
# Display the baseline summary for that sample.
baseline(sample)

In [None]:
summaries = {}

In [None]:
from transformers import pipeline, set_seed

# Fix the seed before generating.
set_seed(42)
# NOTE(review): device=0 presumably selects the first GPU, so this cell
# likely fails on a CPU-only machine - confirm before running without CUDA.
pipe = pipeline("text-generation","gpt2-xl",device=0)
# Prompt the model with the sample followed by a "TL;DR:" cue.
gpt_query = sample + "\nTL;DR:\n"
pipe_out = pipe(gpt_query, max_length=512, clean_up_tokenization_spaces=True)


In [None]:
# Keep only the text that follows the prompt in the generated output.
summaries["gpt2"] = pipe_out[0]['generated_text'][len(gpt_query):]

In [None]:
import torch
torch.cuda.empty_cache()

In [None]:
# Release the GPT-2 pipeline and its output before the next model is loaded.
import gc

import torch

# Drop both references first: while `pipe` is still bound, neither the garbage
# collector nor the CUDA cache can give back the memory held by its model.
del pipe, pipe_out
gc.collect()
# Emptying the cache is only useful after the objects above are gone.
torch.cuda.empty_cache()

In [None]:
# Summarize the sample with the "t5-large" summarization pipeline and store
# the result with one sentence per line.
pipe = pipeline("summarization","t5-large")
pipe_out = pipe(sample)
summaries["T5"] =  "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

In [None]:
summaries

In [None]:
del pipe

In [None]:
# NOTE(review): this cell repeats the earlier "t5-large" cell verbatim, so the
# pipeline is built and run a second time and summaries["T5"] is overwritten
# - confirm whether the repeat is intentional.
pipe = pipeline("summarization","t5-large")
pipe_out = pipe(sample)
summaries["T5"] =  "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

In [None]:
# Summarize the sample with the "facebook/bart-large-cnn" summarization
# pipeline and store the result with one sentence per line.
pipe = pipeline("summarization","facebook/bart-large-cnn")
pipe_out = pipe(sample)
summaries["Bart"] =  "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

In [None]:
# Run the "google/pegasus-cnn_dailymail" summarization pipeline on the sample;
# the raw output is post-processed in the following cell.
pipe = pipeline("summarization",model="google/pegasus-cnn_dailymail")
pipe_out = pipe(sample)


In [None]:
# Turn every ".<n>" in the Pegasus output into a period plus a line break
# (presumably "<n>" is the model's newline marker - confirm).
summaries["Pegasus"] =  pipe_out[0]["summary_text"].replace(".<n>",".\n")

In [None]:
summaries

In [None]:
# Reference summary: the highlights of training example 0, the same example
# the sample text was cut from.
reference = data["train"][0]["highlights"]

In [None]:
!pip install rouge_score

In [None]:
import locale
def getpreferredencoding(do_setlocale=True):
    """Stand-in for ``locale.getpreferredencoding`` that always answers "UTF-8".

    ``do_setlocale`` is accepted only so the signature matches the function
    being replaced; its value is ignored.
    """
    forced_encoding = "UTF-8"
    return forced_encoding


# Monkey-patch the locale module so later lookups get the fixed answer
# (presumably a workaround for a runtime reporting a non-UTF-8 locale).
setattr(locale, "getpreferredencoding", getpreferredencoding)

In [None]:
from datasets import load_metric
rouge_metric = load_metric("rouge")

In [None]:
# Score each model's summary of the sample against the single reference.
for model_name in summaries:
  # `add` registers ONE example and takes the singular keywords
  # `prediction` / `reference`; the plural names belong to `add_batch`.
  rouge_metric.add(prediction=summaries[model_name], reference=reference)
  # Compute inside the loop so every model gets its own score printed.
  score = rouge_metric.compute()
  print(score)

In [None]:
# Evaluation subset: shuffle the test split with a fixed seed and keep the
# first 100 examples of the shuffled order.
test = data["test"].shuffle(seed=42).select(range(100))

In [None]:
from tqdm import tqdm
import torch

# Use "cuda" when torch reports it is available, otherwise fall back to "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"

def chunks(list_of_elem, batch_size):
  """Lazily yield consecutive slices of ``list_of_elem``.

  Each slice holds up to ``batch_size`` items; the last one is shorter when
  the length is not a multiple of ``batch_size``. An empty input yields
  nothing.
  """
  starts = range(0, len(list_of_elem), batch_size)
  yield from (list_of_elem[lo : lo + batch_size] for lo in starts)


def evaluate(model, dataset, tokenizer, metric, column_text="article", reference_text="highlights", batch_size=16):
  """Generate summaries for ``dataset`` batch by batch and score them.

  Args:
    model: Object exposing ``generate(input_ids=..., attention_mask=..., ...)``;
      it is expected to live on the notebook-level ``device`` already.
    dataset: Indexable by column name; ``dataset[column_text]`` and
      ``dataset[reference_text]`` must be sliceable sequences of equal length.
    tokenizer: Callable that encodes a batch of texts and offers ``decode``.
    metric: Object with ``add_batch(predictions=..., references=...)`` and
      ``compute()``.
    column_text: Name of the column holding the texts to summarize.
    reference_text: Name of the column holding the reference summaries.
    batch_size: Number of examples per generation batch.

  Returns:
    Whatever ``metric.compute()`` returns once every batch has been added.
  """
  # Read from the dataset and columns the caller passed in, never from a
  # notebook-level global, so the function works for any dataset.
  article_batches = list(chunks(dataset[column_text], batch_size))
  reference_batches = list(chunks(dataset[reference_text], batch_size))

  batch_pairs = zip(article_batches, reference_batches)
  for article_batch, reference_batch in tqdm(batch_pairs, total=len(article_batches)):
    # Truncate/pad every article to 1024 tokens and return "pt" tensors.
    inputs = tokenizer(article_batch, max_length=1024, padding="max_length",
                       truncation=True, return_tensors="pt")

    # Beam search with 8 beams, capped at 128 tokens per generated summary.
    generated_ids = model.generate(
        input_ids=inputs["input_ids"].to(device),
        attention_mask=inputs["attention_mask"].to(device),
        length_penalty=0.8, num_beams=8, max_length=128)

    decoded_summaries = [
        tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for ids in generated_ids
    ]
    # Replace the literal "<n>" marker with a space before scoring.
    decoded_summaries = [text.replace("<n>", " ") for text in decoded_summaries]

    metric.add_batch(predictions=decoded_summaries, references=reference_batch)

  return metric.compute()



In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the Pegasus checkpoint's tokenizer and seq2seq model, move the model
# to the selected device, and score it on the test subset with ROUGE.
model_ckpt = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)
score = evaluate(model, test, tokenizer , rouge_metric, column_text="article", reference_text="highlights",batch_size=8)

# Keep the `.mid.fmeasure` value of each ROUGE variant and show them as a
# one-row table labelled "pegasus".
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = dict((rn,score[rn].mid.fmeasure) for rn in rouge_names)
pd.DataFrame(rouge_dict, index=["pegasus"])

In [None]:
import gc
gc.collect()

In [None]:
del model
del tokenizer