In [1]:
!pip install \
transformers \
datasets \
transformers[torch] \
transformers[sentencepiece]

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m69.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Download

In [2]:
from datasets import load_dataset

In [3]:
dataset = load_dataset("cnn_dailymail", version="3.0.0")
print(f"Features: {dataset['train'].column_names}")

Downloading builder script:   0%|          | 0.00/8.33k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/9.88k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Features: ['article', 'highlights', 'id']


In [4]:
sample = dataset['train'][1]
print(f"""
Article (excerpt of 500 characters, total length: {len(sample["article"])}):
      """)
print(sample["article"][:500])
print(f"\nSummary (length: {len(sample['highlights'])}):")
print(sample["highlights"])


Article (excerpt of 500 characters, total length: 4051):
      
Editor's note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events. Here, Soledad O'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial. MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor." Here, inmates with the most s

Summary (length: 281):
Mentally ill inmates in Miami are housed on the "forgotten floor"
Judge Steven Leifman says most are there as a result of "avoidable felonies"
While CNN tours facility, patient shouts: "I am the son of the president"
Leifman says the system is unjust and he's fighting for change .


# Text Summarization Pipeline

In [5]:
import nltk
from nltk.tokenize import sent_tokenize

In [6]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
string = "Mr. Ronit is paying $10. The shopkeeper is selling that item."
sent_tokenize(string)

['Mr. Ronit is paying $10.', 'The shopkeeper is selling that item.']

In [8]:
sample_text = dataset["train"][1]["article"][:2000]

In [9]:
summaries = {}

In [10]:
def three_sentence_summary(text):
  return "\n".join(sent_tokenize(text)[:3])

summaries["baseline"] = three_sentence_summary(sample_text)
summaries

{'baseline': 'Editor\'s note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events.\nHere, Soledad O\'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial.\nMIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor."'}

# GPT 2

In [11]:
from transformers import pipeline, set_seed
set_seed(42)
pipe = pipeline("text-generation", model="gpt2-xl")
gpt2_query = sample_text + "\nTL:DR:\n"
pipe_out = pipe(gpt2_query, max_length=512, clean_up_tokenization_spaces=True)
summaries["gpt2"] = "\n".join(
    sent_tokenize(pipe_out[0]["generated_text"][len(gpt2_query):])
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [12]:
summaries

{'baseline': 'Editor\'s note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events.\nHere, Soledad O\'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial.\nMIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor."',
 'gpt2': "I'm not an expert on mental illness and would be happy to learn of an expert who can vouch for this place.\nA mental illness is not like a physical illness because, if you have a mental illness, you usually don't know you have a mental illness.\nThere is no cure for a mental illness.\nThere is also no cure for becoming a criminal.\nFor the majority"}

# T5

In [14]:
pipe = pipeline("summarization", model="t5-large")
pipe_out = pipe(sample_text)
summaries["t5"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [15]:
print(summaries["t5"])

mentally ill inmates are housed on the ninth floor of a florida jail .
most face drug charges or charges of assaulting an officer .
judge says arrests often result from confrontations with police .
one-third of all people in Miami-dade county jails are mental ill .


# BART

In [16]:
pipe = pipeline("summarization", model="facebook/bart-large-cnn")
pipe_out = pipe(sample_text)
summaries["bart"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [17]:
print(summaries["bart"])

Mentally ill inmates are housed on the "forgotten floor" of Miami-Dade jail.
Most often, they face drug charges or charges of assaulting an officer.
Judge Steven Leifman says the arrests often result from confrontations with police.
He says about one-third of all people in the county jails are mentally ill.


# PEGASUS

In [18]:
pipe = pipeline("summarization", model="google/pegasus-cnn_dailymail")
pipe_out = pipe(sample_text)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)neration_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

In [19]:
summaries["pegasus"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"].replace(" .<n>", ".\n").replace("<n>", ".\n")))

In [20]:
print(summaries["pegasus"])

Mentally ill inmates in Miami are housed on the "forgotten floor".
The ninth floor is where they're held until they're ready to appear in court.
Most often, they face drug charges or charges of assaulting an officer.
They end up on the ninth floor severely mentally disturbed .


# Comparing different summaries

In [21]:
print("GROUND TRUTH")
print(dataset["train"][1]["highlights"])
print("")
for model_name in summaries:
  print(model_name.upper())
  print(summaries[model_name])
  print("")

GROUND TRUTH
Mentally ill inmates in Miami are housed on the "forgotten floor"
Judge Steven Leifman says most are there as a result of "avoidable felonies"
While CNN tours facility, patient shouts: "I am the son of the president"
Leifman says the system is unjust and he's fighting for change .

BASELINE
Editor's note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events.
Here, Soledad O'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial.
MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor."

GPT2
I'm not an expert on mental illness and would be happy to learn of an expert who can vouch for this place.
A mental illness is not like a physical illness because, if you have a mental illness, you usually don't know y

# Evaluation

In [22]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/118.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m112.6/118.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.7.0 sacrebleu-2.3.1


In [23]:
from datasets import load_metric
bleu_metric = load_metric("sacrebleu")

  bleu_metric = load_metric("sacrebleu")


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [24]:
import pandas as pd
import numpy as np
bleu_metric.add(
    predictions="hello hello hello hello hello hello", reference=["hello I am Ronit, and hello again"]
)

results = bleu_metric.compute(smooth_method="floor", smooth_value=0)
results["precisions"] = [np.round(p, 2) for p in results["precisions"]]
pd.DataFrame.from_dict(results, orient="index", columns=["Values"])

Unnamed: 0,Values
score,0.0
counts,"[2, 0, 0, 0]"
totals,"[6, 5, 4, 3]"
precisions,"[33.33, 0.0, 0.0, 0.0]"
bp,0.716531
sys_len,6
ref_len,8


In [25]:
bleu_metric.add(
    predictions="hello I Ronit, and hello", reference=["hello I am Ronit, and hello again"]
)

results = bleu_metric.compute(smooth_method="floor", smooth_value=0)
results["precisions"] = [np.round(p, 2) for p in results["precisions"]]
pd.DataFrame.from_dict(results, orient="index", columns=["Values"])

Unnamed: 0,Values
score,43.298201
counts,"[6, 4, 2, 1]"
totals,"[6, 5, 4, 3]"
precisions,"[100.0, 80.0, 50.0, 33.33]"
bp,0.716531
sys_len,6
ref_len,8


In [26]:
!pip install rouge_score
rouge_metric = load_metric("rouge")

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=9a1d5f833e6e49bad9e071b6d997924b0c699ed0f76952b94e34ac675a322a62
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [27]:
reference = dataset["train"][1]["highlights"]
records = []
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

In [28]:
for model_name in summaries:
  rouge_metric.add(prediction=summaries[model_name], reference=reference)
  score = rouge_metric.compute()
  rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
  records.append(rouge_dict)


pd.DataFrame.from_records(records, index=summaries.keys())

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
baseline,0.365079,0.145161,0.206349,0.285714
gpt2,0.152542,0.0,0.084746,0.152542
t5,0.382979,0.130435,0.255319,0.382979
bart,0.475248,0.222222,0.316832,0.415842
pegasus,0.326531,0.208333,0.285714,0.326531


# Evaluating pegasus on the CNN/DailyMail Dataset

In [29]:
def evaluate_summaries_baseline(dataset, metric,
                                column_text="article", column_summary="highlights"):
  summaries = [three_sentence_summary(text) for text in dataset[column_text]]
  metric.add_batch(predictions=summaries,
                   references=dataset[column_summary])

  score = metric.compute()
  return score

In [30]:
test_sampled = dataset["test"].shuffle(seed=42).select(range(1000))

In [31]:
score = evaluate_summaries_baseline(test_sampled, rouge_metric)
rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
pd.DataFrame.from_dict(rouge_dict, orient="index", columns=["baseline"]).T

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
baseline,0.389276,0.171296,0.245061,0.354239


In [32]:
from tqdm import tqdm
import torch

In [33]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [34]:
def chunks(list_of_elements, batch_size):
  for i in range(0, len(list_of_elements), batch_size):
    yield list_of_elements[i: i+batch_size]

In [35]:
def evaluate_summaries_pegasus(dataset, metric, model, tokenizer,
                               batch_size=16, device=device, column_text="article", column_summary="highlights"):
  article_batches=list(chunks(dataset[column_text], batch_size))
  target_batches = list(chunks(dataset[column_summary], batch_size))

  for article_batches, target_batches in tqdm(
      zip(article_batches, target_batches), total=len(article_batches)
  ):
    inputs = tokenizer(article_batches, max_length=1024, truncation=True, padding="max_length", return_tensors="pt")
    summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                   attention_mask=inputs["attention_mask"].to(device),
                   length_penalty=0.8,
                   num_beams=8,
                   max_length=128)

    decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                          clean_up_tokenization_spaces=True)
    for s in summaries]

    decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
    metric.add_batch(predictions=decoded_summaries, references=target_batches)

    score = metric.compute()
    return score

In [36]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [37]:
model_ckpt = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
score = evaluate_summaries_pegasus(test_sampled, rouge_metric, model, tokenizer, batch_size=8)
score

  0%|          | 0/125 [00:07<?, ?it/s]


{'rouge1': AggregateScore(low=Score(precision=0.2514869879850981, recall=0.33116296993427513, fmeasure=0.28624853254090543), mid=Score(precision=0.3437029872900844, recall=0.4737559764990129, fmeasure=0.389936401488888), high=Score(precision=0.4096958605869597, recall=0.5893576269297783, fmeasure=0.4648925136790579)),
 'rouge2': AggregateScore(low=Score(precision=0.0699415672065928, recall=0.09125814646479324, fmeasure=0.07927820020543086), mid=Score(precision=0.13225348110258595, recall=0.18464811794553174, fmeasure=0.14982892254522812), high=Score(precision=0.187601758915215, recall=0.2836408659299481, fmeasure=0.22076496480762817)),
 'rougeL': AggregateScore(low=Score(precision=0.16414539170788528, recall=0.22133919674150382, fmeasure=0.1842511509412843), mid=Score(precision=0.23173288160301697, recall=0.3242664500382681, fmeasure=0.2632261968176638), high=Score(precision=0.2949539511157585, recall=0.4276349041388396, fmeasure=0.33580665785799013)),
 'rougeLsum': AggregateScore(low=

In [39]:
rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
pd.DataFrame(rouge_dict, index=["pegasus"])

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.389936,0.149829,0.263226,0.321016


# Training on SAMSum Dataset

In [40]:
!pip install py7zr

Collecting py7zr
  Downloading py7zr-0.20.6-py3-none-any.whl (66 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/66.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.7/66.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting texttable (from py7zr)
  Downloading texttable-1.6.7-py2.py3-none-any.whl (10 kB)
Collecting pycryptodomex>=3.6.6 (from py7zr)
  Downloading pycryptodomex-3.18.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyzstd>=0.14.4 (from py7zr)
  Downloading pyzstd-0.15.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (412 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m412.3/412.3 kB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyppmd<1.1.0,>=0.18.1 (from py7zr)
  Downloading pyppm

In [41]:
samsum_dataset = load_dataset("samsum")

Downloading builder script:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

In [42]:
samsum_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [43]:
split_length = [len(samsum_dataset[split]) for split in samsum_dataset]
split_length

[14732, 819, 818]

In [44]:
print(f"Split lengths: {split_length}")
print(f"Features: {samsum_dataset['train'].column_names}")
print("\nDialogue:")
print(samsum_dataset['test'][0]['dialogue'])
print("\nSummary")
print(samsum_dataset['test'][0]['summary'])

Split lengths: [14732, 819, 818]
Features: ['id', 'dialogue', 'summary']

Dialogue:
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Summary
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


## Evaluating Pegasus on samsum

In [45]:
pipe_out = pipe(samsum_dataset['test'][0]["dialogue"])
print("Summary:")
pipe_out

Your max_length is set to 128, but your input_length is only 122. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)


Summary:


[{'summary_text': "Amanda: Ask Larry Amanda: He called her last time we were at the park together .<n>Hannah: I'd rather you texted him .<n>Amanda: Just text him ."}]

In [46]:
print(pipe_out[0]["summary_text"].replace(" .<n>", ".\n"))

Amanda: Ask Larry Amanda: He called her last time we were at the park together.
Hannah: I'd rather you texted him.
Amanda: Just text him .


In [47]:
score = evaluate_summaries_pegasus(samsum_dataset['test'], rouge_metric, model,
                                   tokenizer, column_text="dialogue",
                                   column_summary="summary", batch_size=8)
print(score)

  0%|          | 0/103 [00:03<?, ?it/s]

{'rouge1': AggregateScore(low=Score(precision=0.14484363696648184, recall=0.286075810913497, fmeasure=0.1948646737227217), mid=Score(precision=0.24922655850247316, recall=0.4002186257015611, fmeasure=0.26711692217605854), high=Score(precision=0.3788046410919295, recall=0.5070377564755585, fmeasure=0.356215031553085)), 'rouge2': AggregateScore(low=Score(precision=0.015306122448979591, recall=0.03362068965517241, fmeasure=0.018748548873926164), mid=Score(precision=0.0647020456928291, recall=0.08213259441707718, fmeasure=0.06050691911363271), high=Score(precision=0.14443517527978933, recall=0.14226934523809523, fmeasure=0.11255367346879358)), 'rougeL': AggregateScore(low=Score(precision=0.10722085744715056, recall=0.21200678228625902, fmeasure=0.1450660721296641), mid=Score(precision=0.17752725179922407, recall=0.30068378668367324, fmeasure=0.19658113507089814), high=Score(precision=0.2727156132739212, recall=0.39525403324745434, fmeasure=0.263831578217267)), 'rougeLsum': AggregateScore(l




In [48]:
rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
pd.DataFrame(rouge_dict, index=["pegasus"])

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.267117,0.060507,0.196581,0.196891


# Fine-Tuning Pegasus

In [49]:
def convert_examples_to_features(example_batch):
  input_encodings = tokenizer(example_batch["dialogue"], max_length=1024, truncation=True)
  with tokenizer.as_target_tokenizer():
    target_encodings=tokenizer(example_batch["summary"], max_length=128, truncation=True)

  return {"input_ids": input_encodings["input_ids"],
          "attention_mask": input_encodings["attention_mask"],
          "labels": target_encodings["input_ids"]}

In [50]:
dataset_samsum_pt = samsum_dataset.map(convert_examples_to_features, batched=True)

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]



Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

In [51]:
columns = ["input_ids", "labels", "attention_mask"]
dataset_samsum_pt.set_format(type='torch', columns=columns)
dataset_samsum_pt

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
})

In [52]:
from transformers import DataCollatorForSeq2Seq

In [53]:
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [54]:
from transformers import Trainer, TrainingArguments

In [55]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [56]:
training_args = TrainingArguments(
    output_dir='fine-tuned-pegasus-samsum',
    num_train_epochs=1,
    warmup_steps=500,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.1, logging_steps=10, push_to_hub=True,
    evaluation_strategy='steps', eval_steps=500, save_steps=1e6,
    gradient_accumulation_steps=16
)

In [57]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [59]:
trainer = Trainer(model=model, args=training_args,
                  tokenizer=tokenizer,
                  data_collator=seq2seq_data_collator,
                  train_dataset=dataset_samsum_pt["train"],
                  eval_dataset=dataset_samsum_pt["validation"])

Cloning https://huggingface.co/ronit33/fine-tuned-pegasus-samsum into local empty directory.


In [60]:
trainer.train()

You're using a PegasusTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
500,1.6647,1.482927


TrainOutput(global_step=920, training_loss=1.8359185861504597, metrics={'train_runtime': 1794.6638, 'train_samples_per_second': 8.209, 'train_steps_per_second': 0.513, 'total_flos': 5526698901602304.0, 'train_loss': 1.8359185861504597, 'epoch': 1.0})

In [62]:
score = evaluate_summaries_pegasus(
    samsum_dataset["test"], rouge_metric, trainer.model, tokenizer,
    batch_size=2, column_text="dialogue", column_summary="summary"
)
print(score)

  0%|          | 0/410 [00:01<?, ?it/s]

{'rouge1': AggregateScore(low=Score(precision=0.32142857142857145, recall=0.5625, fmeasure=0.40909090909090917), mid=Score(precision=0.3807142857142857, recall=0.7395833333333333, fmeasure=0.5018427518427518), high=Score(precision=0.44, recall=0.9166666666666666, fmeasure=0.5945945945945945)), 'rouge2': AggregateScore(low=Score(precision=0.07407407407407407, recall=0.13333333333333333, fmeasure=0.09523809523809523), mid=Score(precision=0.2037037037037037, recall=0.4303030303030303, fmeasure=0.2761904761904762), high=Score(precision=0.3333333333333333, recall=0.7272727272727273, fmeasure=0.4571428571428572)), 'rougeL': AggregateScore(low=Score(precision=0.21428571428571427, recall=0.375, fmeasure=0.2727272727272727), mid=Score(precision=0.30714285714285716, recall=0.6041666666666667, fmeasure=0.40663390663390664), high=Score(precision=0.4, recall=0.8333333333333334, fmeasure=0.5405405405405406)), 'rougeLsum': AggregateScore(low=Score(precision=0.21428571428571427, recall=0.375, fmeasure




In [63]:
rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
pd.DataFrame(rouge_dict, index=["pegasus"])

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.501843,0.27619,0.406634,0.406634
