# Bart-Base-CNN Summarization Model

In [1]:
!pip install datasets
!pip install py7zr
!pip install transformers
!pip install evaluate
!pip install sentencepiece
!pip install rouge_score
!pip install accelerate -U

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
Collec

In [2]:
from datasets import load_dataset


data = load_dataset("JeremyAlain/SLF5K")
split_lengths = [len(data[split]) for split in data]

Downloading builder script:   0%|          | 0.00/11.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

Downloading and preparing dataset slf5_k/SLF5K to /root/.cache/huggingface/datasets/JeremyAlain___slf5_k/SLF5K/1.0.0/6b37f332eea04ffa072f2c66e87393132bd68a310796894ee18fb105544d3294...


Downloading data files:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/20.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/836k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.99M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/4 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating development split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset slf5_k downloaded and prepared to /root/.cache/huggingface/datasets/JeremyAlain___slf5_k/SLF5K/1.0.0/6b37f332eea04ffa072f2c66e87393132bd68a310796894ee18fb105544d3294. Subsequent calls will reuse this data.


  0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
from transformers import pipeline
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt')
pipe = pipeline("summarization", model="ainize/bart-base-cnn", framework='pt')
pipe_out = pipe(data['train'][0]['post'][:1000])
print("GROUND TRUTH")
print(data["train"][0]["ideal_human_summary"])
print("")
print("BART")
print(" ".join(sent_tokenize(pipe_out[0]["summary_text"])))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

GROUND TRUTH
The poster is frustrated with a new person at the dog park who is upset with him because their young dogs are playing roughly. The poster will step in if it gets aggressive and wants the new person to understand this. 

BART
A new dog (8m shephad/retriever) has been coming with his owner the last few days . The first day they came buzz took a liking to this new dog, max, and was all over him . Max was playing back, not submissive or anything but the owner stepped in and hauled buzz off and whacked him .


In [4]:
from tqdm import tqdm
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

def chunks(list_of_elements, batch_size):
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]

def evaluate_summaries(dataset, metric, model, tokenizer,
                       batch_size=16, device=device,
                       column_text="post", column_summary="ideal_human_summary"):
    article_batches = list(chunks(dataset[column_text], batch_size))
    target_batches = list(chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):

        inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                        padding="max_length", return_tensors="pt")

        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                   attention_mask=inputs["attention_mask"].to(device),
                                   length_penalty=0.8, num_beams=8, max_length=128)

        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                              clean_up_tokenization_spaces=True)
                             for s in summaries]

        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
        
    return metric.compute(predictions=decoded_summaries, references=target_batch)

In [5]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_ckpt = "ainize/bart-base-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

In [6]:
import evaluate
import pandas as pd

rouge_metric = evaluate.load("rouge")
score = evaluate_summaries(data["test"], rouge_metric, model,
                           tokenizer, column_text="post",
                           column_summary="ideal_human_summary", batch_size=8)

pd.DataFrame(score, index=["bart"])

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

100%|██████████| 88/88 [06:55<00:00,  4.72s/it]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
bart,0.224662,0.013889,0.144595,0.144595


In [7]:
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch["post"], truncation=True,
                                max_length=1024)

    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch["ideal_human_summary"], max_length=128,
                                     truncation=True)

    return {"input_ids": input_encodings["input_ids"],
            "attention_mask": input_encodings["attention_mask"],
            "labels": target_encodings["input_ids"]}

data_features = data.map(convert_examples_to_features,
                                       batched=True)

data_features.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]



Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/698 [00:00<?, ? examples/s]

In [8]:
from transformers import DataCollatorForSeq2Seq

seq2seq_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [9]:
from transformers import TrainingArguments, Trainer

# Gradient accumulation saves memory by updating the model only every X batches
training_args = TrainingArguments(
    output_dir="bart-dia", num_train_epochs=10, warmup_steps=500,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10, push_to_hub=False,
    evaluation_strategy="steps", eval_steps=100, save_steps=1e6,
    gradient_accumulation_steps=10)

In [10]:
trainer = Trainer(model=model, args=training_args,
                  tokenizer=tokenizer, data_collator=seq2seq_collator,
                  train_dataset=data_features["train"],
                  eval_dataset=data_features["validation"])

trainer.train()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
100,2.4562,2.188439
200,2.2904,2.083102
300,2.2999,2.012454
400,2.3472,1.995897
500,2.4038,1.98619
600,1.8885,1.99774
700,1.9389,1.988586
800,2.177,1.961249
900,2.0799,1.962036
1000,1.971,1.952134


TrainOutput(global_step=5000, training_loss=1.4132083475112915, metrics={'train_runtime': 4648.4139, 'train_samples_per_second': 10.756, 'train_steps_per_second': 1.076, 'total_flos': 8910893434060800.0, 'train_loss': 1.4132083475112915, 'epoch': 10.0})

In [11]:
# Evaluate after finetuning
score = evaluate_summaries(
    data_features["test"], rouge_metric, trainer.model, tokenizer,
    batch_size=2, column_text="post", column_summary="ideal_human_summary")

pd.DataFrame(score, index=[f"bart_finetuned"])

100%|██████████| 349/349 [04:51<00:00,  1.20it/s]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
bart_finetuned,0.418182,0.162745,0.350649,0.350649


In [21]:
sample_text = data["test"][0]["post"]
reference = data["test"][0]["ideal_human_summary"]

inputs = tokenizer(sample_text, max_length=1024, truncation=True,
                   padding="max_length", return_tensors="pt")

summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                           attention_mask=inputs["attention_mask"].to(
    device),
    length_penalty=0.8, num_beams=8, max_length=128)

decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                      clean_up_tokenization_spaces=True)
                     for s in summaries]

decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]


In [22]:
print(decoded_summaries)

["The poster took sleeping pills and didn't remember much after that. They woke up in the morning and realized their mistake and got their work done six minutes before their shift was supposed to start."]
