<a href="https://colab.research.google.com/github/PranjaliJain/dialogsum_trained/blob/main/DialogSum_TweetSumm_Pegasus_Batch4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
# ----- Check if GPU is connected ----- # 
gpu_info = !nvidia-smi -L
gpu_info = "\n".join(gpu_info)
if gpu_info.find("failed") >= 0:
    print("Not connected to a GPU")
else:
    print(gpu_info)


GPU 0: Tesla T4 (UUID: GPU-9d50ace9-76b6-8aa6-d759-b4757c47e4c8)


In [22]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **DialogSumm**

In [23]:
!git clone https://github.com/PranjaliJain/dialogsum_trained.git

fatal: destination path 'dialogsum_trained' already exists and is not an empty directory.


In [24]:
%cd /content/dialogsum_trained
# !git stash
#!git pull 
#!git status

/content/dialogsum_trained


In [32]:
!pip install datasets
!pip install transformers
!pip install rouge_score
!pip install --user -U nltk
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 4.9 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96


In [26]:
import json
from datasets import load_metric,Dataset,DatasetDict,load_dataset
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers.optimization import Adafactor, AdafactorSchedule
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
from transformers import AutoTokenizer
# import os
from torch import nn 
import torch

import nltk
nltk.download('punkt')

import sys
from google.colab import drive
import pandas as pd
import numpy as np
import huggingface_hub
import matplotlib.pyplot as plt
import gc


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [27]:
# ----- Check if GPU is connected ----- # 
gpu_info = !nvidia-smi -L
gpu_info = "\n".join(gpu_info)
if gpu_info.find("failed") >= 0:
    print("Not connected to a GPU")
else:
    print(gpu_info)


GPU 0: Tesla T4 (UUID: GPU-9d50ace9-76b6-8aa6-d759-b4757c47e4c8)


In [28]:
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [29]:
# model_checkpoint = "facebook/bart-large"
model_name = 'google/pegasus-large'

metric = load_metric("rouge")

TEST_SUMMARY_ID = 1


def transform_single_dialogsumm_file(file):
    data = open(file,"r").readlines()
    result = {"fname":[],"summary":[],"dialogue":[]}
    for i in data:
        d = json.loads(i)
        for j in d.keys():
            if j in result.keys():
                result[j].append(d[j])
    return Dataset.from_dict(result)

def transform_test_file(file):
    data = open(file,"r").readlines()
    result = {"fname":[],"summary%d"%TEST_SUMMARY_ID:[],"dialogue":[]}
    for i in data:
        d = json.loads(i)
        for j in d.keys():
            if j in result.keys():
                result[j].append(d[j])
    
    result["summary"] = result["summary%d"%TEST_SUMMARY_ID]
    return Dataset.from_dict(result)

def transform_dialogsumm_to_huggingface_dataset(train,validation,test):
    train = transform_single_dialogsumm_file(train)
    validation = transform_single_dialogsumm_file(validation)
    test = transform_test_file(test)
    return DatasetDict({"train":train,"validation":validation,"test":test})


def freeze_params(model: nn.Module):
    """Set requires_grad=False for each of model.parameters()"""
    for par in model.parameters():
        par.requires_grad = False

def freeze_embeds(model):
    """Freeze token embeddings and positional embeddings for BART and PEGASUS, just token embeddings for t5."""
    model_type = model.config.model_type
    if model_type == "t5":
        freeze_params(model.shared)
        for d in [model.encoder, model.decoder]:
            freeze_params(d.embed_tokens)
    else:
        freeze_params(model.model.shared)
        for d in [model.model.encoder, model.model.decoder]:
            freeze_params(d.embed_positions)
            freeze_params(d.embed_tokens)

def preprocess_function(examples):
    inputs = [doc for doc in examples["dialogue"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [31]:
raw_datasets = transform_dialogsumm_to_huggingface_dataset("/content/dialogsum_trained/DialogSum_Data/dialogsum.train.jsonl","/content/dialogsum_trained/DialogSum_Data/dialogsum.dev.jsonl","/content/dialogsum_trained/DialogSum_Data/dialogsum.test.jsonl")

torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'


# model = BartForConditionalGeneration.from_pretrained(model_checkpoint)
# tokenizer = BartTokenizer.from_pretrained(model_checkpoint)
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)



freeze_embeds(model)

max_input_length = 256
max_target_length = 128

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

ImportError: ignored

In [None]:
## use batch_size = 1 to get time around 8 hrs with GPU
## use batch_size = 16 to get time around 4 hrs with GPU - but it goes out of memory
batch_size = 4
args = Seq2SeqTrainingArguments(
    "BART-LARGE-DIALOGSUM",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    # save_strategy="epoch",
    # metric_for_best_model="eval_rouge1",
    # greater_is_better=True,
    # seed=42,
    # generation_max_length=max_target_length,
    push_to_hub=False,
    #output_dir = '/content/drive/MyDrive/project/results/dialogsumm-bart'
    #logging_dir = '/content/drive/MyDrive/project/results/dialogsumm-bart'
)


In [None]:

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
lr_scheduler = AdafactorSchedule(optimizer)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    optimizers=(optimizer, lr_scheduler),
    compute_metrics=compute_metrics
)

trainer.train()


Using amp half precision backend
The following columns in the training set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, fname, dialogue. If summary, fname, dialogue are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 12460
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 9345


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.2278,1.132623,41.544,21.0477,36.5022,38.4384,19.938
2,1.0159,1.107988,43.1135,22.9444,38.5014,40.1118,19.876
3,0.8875,1.096485,42.2639,21.7634,37.5836,39.2259,19.908


Saving model checkpoint to BART-LARGE-DIALOGSUM/checkpoint-500
Configuration saved in BART-LARGE-DIALOGSUM/checkpoint-500/config.json
Model weights saved in BART-LARGE-DIALOGSUM/checkpoint-500/pytorch_model.bin
tokenizer config file saved in BART-LARGE-DIALOGSUM/checkpoint-500/tokenizer_config.json
Special tokens file saved in BART-LARGE-DIALOGSUM/checkpoint-500/special_tokens_map.json
Saving model checkpoint to BART-LARGE-DIALOGSUM/checkpoint-1000
Configuration saved in BART-LARGE-DIALOGSUM/checkpoint-1000/config.json
Model weights saved in BART-LARGE-DIALOGSUM/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in BART-LARGE-DIALOGSUM/checkpoint-1000/tokenizer_config.json
Special tokens file saved in BART-LARGE-DIALOGSUM/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to BART-LARGE-DIALOGSUM/checkpoint-1500
Configuration saved in BART-LARGE-DIALOGSUM/checkpoint-1500/config.json
Model weights saved in BART-LARGE-DIALOGSUM/checkpoint-1500/pytorch_model.bin
tok

TrainOutput(global_step=9345, training_loss=1.0795598094893242, metrics={'train_runtime': 3744.2886, 'train_samples_per_second': 9.983, 'train_steps_per_second': 2.496, 'total_flos': 1.9283523178266624e+16, 'train_loss': 1.0795598094893242, 'epoch': 3.0})

In [None]:

out = trainer.predict(tokenized_datasets["test"],num_beams=5)
predictions, labels ,metric= out
print(metric)

The following columns in the test set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, summary1, fname, dialogue. If summary, summary1, fname, dialogue are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 500
  Batch size = 4


{'test_loss': 1.3360599279403687, 'test_rouge1': 41.5981, 'test_rouge2': 18.7302, 'test_rougeL': 35.8956, 'test_rougeLsum': 37.8259, 'test_gen_len': 19.866, 'test_runtime': 73.2878, 'test_samples_per_second': 6.822, 'test_steps_per_second': 1.706}


In [None]:
decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after e ach sentence
decoded_preds = [" ".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
decoded_labels = [" ".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

In [None]:
# output summaries on test set
with open("test_output.txt","w") as f: 
    for i in decoded_preds:
        print(i)
        f.write(i.replace("\n","")+"\n")

#Person1# asks Ms. Dawson to take a dictation for him
#Person2# got stuck in traffic again. #Person1# suggests #Person
#Person1# tells Kate Masha and Hero are getting divorced. Kate is surprised
#Person1# wishes Brian a happy birthday and dances with him at the party
#Person1# and #Person2# are in the Olympic stadium. #Person
#Person1# tells #Person2# #Person3# is quitting
#Person2# feels itchy and #Person1#Person3#
#Person2# checks out and finds the bill has been added to someone else's
#Person1# asks Steven for help because his wife is going to divorce him.
#Person2# thinks of Abraham Lincoln as a man or woman of sound character.
#Person1# tells #Person2# the north of China are experiencing severe sand
Francis gets a remote car as a birthday gift from #Person2#. Francis
Tony tells Steven he got caught cheating and feels ashamed. Steven advises him to study hard
#Person1# is in a hurry to catch the nine-thirty train.
#Person1# asks #Person2# for advice on adjusting life. #Person
#Pers

In [None]:
model.save_pretrained('./saved_model/')
tokenizer.save_pretrained('./saved_model/')

Configuration saved in ./saved_model/config.json
Model weights saved in ./saved_model/pytorch_model.bin
tokenizer config file saved in ./saved_model/tokenizer_config.json
Special tokens file saved in ./saved_model/special_tokens_map.json


('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.json',
 './saved_model/merges.txt',
 './saved_model/added_tokens.json')

# **TweetSumm**

In [None]:
%cd /content/TweetSumm_trained


/content/TweetSumm_trained


In [None]:
!git clone https://github.com/PranjaliJain/TweetSumm_trained.git

In [None]:
from tweet_sum_processor import TweetSumProcessor

In [None]:
model_name = './../dialogsum_trained/saved_model/'
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

Didn't find file ./../dialogsum_trained/saved_model/added_tokens.json. We won't load it.
loading file ./../dialogsum_trained/saved_model/vocab.json
loading file ./../dialogsum_trained/saved_model/merges.txt
loading file None
loading file ./../dialogsum_trained/saved_model/special_tokens_map.json
loading file ./../dialogsum_trained/saved_model/tokenizer_config.json
loading configuration file ./../dialogsum_trained/saved_model/config.json
Model config BartConfig {
  "_name_or_path": "./saved_model/",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  

In [None]:
# ----- Metric
metric = load_metric("rouge")

# ---- Freeze parameters

def freeze_params(model: nn.Module):
    """Set requires_grad=False for each of model.parameters()"""
    for par in model.parameters():
        par.requires_grad = False

def freeze_embeds(model):
    """Freeze token embeddings and positional embeddings for BART and PEGASUS, just token embeddings for t5."""
    model_type = model.config.model_type
    if model_type == "t5":
        freeze_params(model.shared)
        for d in [model.encoder, model.decoder]:
            freeze_params(d.embed_tokens)
    else:
        freeze_params(model.model.shared)
        for d in [model.model.encoder, model.model.decoder]:
            freeze_params(d.embed_positions)
            freeze_params(d.embed_tokens)

freeze_embeds(model)

# ----- Reading in the Dataset
raw_datasets = load_dataset('csv', data_files={'train': '/content/TweetSumm_trained/data/tweetsum_train.csv',
                                          'valid': '/content/TweetSumm_trained/data/tweetsum_valid.csv',
                                          'test': '/content/TweetSumm_trained/data/tweetsum_test.csv'})

Using custom data configuration default-3760b9b9fac89e08


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-3760b9b9fac89e08/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-3760b9b9fac89e08/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
max_input_length = 256
max_target_length = 128

def preprocess_function(examples):
    model_inputs = tokenizer(examples["inputs"], max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summaries"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
batch_size = 1
args = Seq2SeqTrainingArguments(
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    output_dir = '/content/drive/MyDrive/CIS6930_final/results/bart', 
    logging_dir = '/content/drive/MyDrive/CIS6930_final/logs/bart'
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

# **NO fine-tuning on TweetSumm**

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
# trainer.train()

Using amp half precision backend


In [None]:
# --------------------- # 
#    TEST EVALUATION    #
# --------------------- #

out = trainer.predict(tokenized_datasets["test"])
generated_summaries = []
for i in range(0, 110): 
  generated_summaries.append(tokenizer.decode(out[0][i], skip_special_tokens =  True))
ground_truth = tokenized_datasets["test"]["summaries"]
conversation = tokenized_datasets["test"]["inputs"]

The following columns in the test set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: inputs, summaries. If inputs, summaries are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 110
  Batch size = 1


In [None]:
out.metrics

{'test_gen_len': 20.0,
 'test_loss': 3.859863519668579,
 'test_rouge1': 20.3368,
 'test_rouge2': 6.4265,
 'test_rougeL': 16.5027,
 'test_rougeLsum': 18.4148,
 'test_runtime': 40.5731,
 'test_samples_per_second': 2.711,
 'test_steps_per_second': 2.711}

In [None]:
P, R, F1 = score(generated_summaries, ground_truth, lang="en", verbose=True)

In [None]:
print(f"System level F1 score: {F1.mean():.3f}")
print(f"System level precision score: {P.mean():.3f}")
print(f"System level recall score: {R.mean():.3f}")

# **Fine-tuning on TweetSumm**

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()

Using amp half precision backend
The following columns in the training set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: inputs, summaries. If inputs, summaries are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 869
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 2607


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.3793,2.10841,32.4224,14.2497,27.8786,30.3257,20.0
2,1.7324,2.064653,34.1352,15.3762,29.3475,32.1417,20.0
3,1.4169,2.076144,36.0761,16.9238,31.5337,34.2176,20.0


Saving model checkpoint to /content/drive/MyDrive/CIS6930_final/results/bart/checkpoint-500
Configuration saved in /content/drive/MyDrive/CIS6930_final/results/bart/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/CIS6930_final/results/bart/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/CIS6930_final/results/bart/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/CIS6930_final/results/bart/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: inputs, summaries. If inputs, summaries are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 108
  Batch size = 1
Saving model checkpoint to /content/drive/MyDrive/CIS6930_final/results/bart/checkpoint-1000
Configuration saved i

TrainOutput(global_step=2607, training_loss=1.798732568808886, metrics={'train_runtime': 1026.9638, 'train_samples_per_second': 2.539, 'train_steps_per_second': 2.539, 'total_flos': 1277508162945024.0, 'train_loss': 1.798732568808886, 'epoch': 3.0})

In [None]:
# --------------------- # 
#    TEST EVALUATION    #
# --------------------- #

out = trainer.predict(tokenized_datasets["test"])
generated_summaries = []
for i in range(0, 110): 
  generated_summaries.append(tokenizer.decode(out[0][i], skip_special_tokens =  True))
ground_truth = tokenized_datasets["test"]["summaries"]
conversation = tokenized_datasets["test"]["inputs"]

The following columns in the test set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: inputs, summaries. If inputs, summaries are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 110
  Batch size = 1


In [None]:
out.metrics

{'test_gen_len': 20.0,
 'test_loss': 2.0977628231048584,
 'test_rouge1': 34.1796,
 'test_rouge2': 14.9658,
 'test_rougeL': 30.0137,
 'test_rougeLsum': 32.0041,
 'test_runtime': 40.3824,
 'test_samples_per_second': 2.724,
 'test_steps_per_second': 2.724}

In [None]:
P, R, F1 = score(generated_summaries, ground_truth, lang="en", verbose=True)

In [None]:
print(f"System level F1 score: {F1.mean():.3f}")
print(f"System level precision score: {P.mean():.3f}")
print(f"System level recall score: {R.mean():.3f}")