In [27]:
from transformers import pipeline
from text_rank.evaluation import *
import torch

# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Summarization pipeline backed by the "KamilAin/bart-base-booksum" checkpoint.
summarizer = pipeline(
    task="summarization",
    model="KamilAin/bart-base-booksum",
    device=device,
)

# Evaluation records; `read_data` is not defined in this notebook — presumably
# it comes from the `text_rank.evaluation` star import (TODO confirm).
data = read_data("small_datasets/CNNML_tiny.csv")

config.json: 100%|██████████| 1.74k/1.74k [00:00<00:00, 874kB/s]
model.safetensors: 100%|██████████| 558M/558M [00:16<00:00, 34.6MB/s] 
tokenizer_config.json: 100%|██████████| 331/331 [00:00<00:00, 110kB/s]
vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 28.0MB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 28.5MB/s]
tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 40.0MB/s]
special_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 120kB/s]


In [28]:
# Summarize the first record with greedy decoding (do_sample=False), keeping
# the generated summary between 30 and 70 tokens.
# truncation=True clips inputs that exceed the model's maximum input length;
# without it an over-long article is passed through unclipped and can fail
# inside the model.
summary = summarizer(
    str(data[0]),
    max_length=70,
    min_length=30,
    do_sample=False,
    truncation=True,
)
# Print the summarized text
print(summary[0]['summary_text'])

LONDON, England (Reuters) - Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. He says he has no plans to spend his money on fast cars, drink and celebrity parties


In [21]:
from transformers import BartTokenizerFast, BartForConditionalGeneration
from datasets import load_dataset

# Text prepended to every article before tokenization (also reused at
# inference time further down the notebook).
prefix = "summarize: "

def preprocess_function(examples):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        examples: batch mapping with an "article" list (source texts) and a
            "highlights" list (target summaries), as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the prefixed articles (truncated/padded to
        1024 tokens) with an added "labels" entry holding the target token
        ids (truncated/padded to 128 tokens). Padding positions in the labels
        are replaced by -100.

    Relies on the module-level ``tokenizer`` and ``prefix``.
    """
    inputs = [prefix + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

    # `text_target=` is the supported replacement for the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["highlights"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    # The targets are padded to a fixed length; replace pad ids with -100 so
    # the loss ignores padding positions instead of training on them.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Fetch the "cnn_dailymail" dataset (config '3.0.0') and pick out the two
# splits used for fine-tuning.
dataset = load_dataset("cnn_dailymail", '3.0.0')
train_dataset, val_dataset = dataset["train"], dataset["validation"]

# Fast BART tokenizer; preprocess_function reads this module-level name.
tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base")

# Run the batched preprocessing over the train split, then the validation split.
tokenized_train, tokenized_val = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset)
)

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]



Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

In [22]:
from transformers import Trainer, TrainingArguments
# Use the GPU when torch reports one, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Start from the pretrained "facebook/bart-base" weights and place the model
# on the selected device.
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base").to(device)

training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=5,
    # Batch sizes
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Optimisation schedule
    warmup_steps=500,
    weight_decay=0.01,
    max_steps=2000,
    # Evaluation
    do_eval=True,
    evaluation_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# Fine-tune, then write the resulting weights to disk.
trainer.train()
model.save_pretrained("./bart_cnn_dailymail_finetuned")

Epoch,Training Loss,Validation Loss


In [23]:
from transformers import BartTokenizerFast, BartForConditionalGeneration
from text_rank.evaluation import *
def generate_summary(text, model, tokenizer, max_length=30, num_beams=4):
    """Generate a beam-search summary of ``text``.

    Args:
        text: the source text to summarize.
        model: a generation-capable model exposing ``generate`` and ``device``.
        tokenizer: tokenizer used both to encode ``text`` and to decode the
            generated ids.
        max_length: upper bound on the generated sequence length, forwarded
            to ``model.generate`` (default 30, the previously hard-coded value).
        num_beams: beam width forwarded to ``model.generate`` (default 4, the
            previously hard-coded value).

    Returns:
        The decoded summary string with special tokens stripped.
    """
    inputs = tokenizer([text], max_length=1024, return_tensors='pt', truncation=True)
    # Move the encoded inputs to wherever the model lives so that a
    # GPU-resident model does not receive CPU tensors.
    device = model.device
    summary_ids = model.generate(
        input_ids=inputs['input_ids'].to(device),
        # Forward the mask explicitly rather than leaving generate() to infer it.
        attention_mask=inputs['attention_mask'].to(device),
        num_beams=num_beams,
        max_length=max_length,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Fine-tuned weights are read from this directory; the tokenizer is taken from
# the base "facebook/bart-base" checkpoint.
model_path = "./bart_cnn_dailymail_finetuned"
model = BartForConditionalGeneration.from_pretrained(model_path)
tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base")

# Build the model input from the first record, prepending the module-level
# `prefix` (defined in an earlier cell).
data = read_data("small_datasets/CNNML_tiny.csv")
example_text = f"{prefix}{data[0]!s}"

summary = generate_summary(text=example_text, model=model, tokenizer=tokenizer)
print(summary)

Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday
