
# Fine-tuning BigBird-Pegasus for Summarization (BillSum)

---

In [None]:
# Show the GPU (model, driver, memory usage) visible to this runtime.
!nvidia-smi

Wed Apr 13 20:29:39 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Setup

---

In [None]:
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
Paths used for configuration of notebook: 
    	/root/.jupyter/nbconfig/notebook.json
      - Validating: [32mOK[0m
Paths used for configuration of notebook: 
    	/root/.jupyter/nbconfig/notebook.json


In [None]:
! pip install transformers
! pip install datasets
! pip install sentencepiece
! pip install rouge_score
! pip install wandb



In [None]:
import torch
import numpy as np
import datasets

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

from tabulate import tabulate
import nltk
from datetime import datetime

In [None]:
# Switch guarding the explicit Weights & Biases calls made by this notebook
# (login here, run creation and run teardown in later cells).
WANDB_INTEGRATION = True
if WANDB_INTEGRATION:
    # Imported only when the switch is on.
    import wandb

    # Authenticate this session with W&B.
    wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mbigbam[0m (use `wandb login --relogin` to force relogin)


## Model and tokenizer

---

In [None]:
# Print whether the installed TensorFlow build was compiled with CUDA support.
# NOTE(review): the model below is loaded and trained through PyTorch, so this
# says nothing about whether torch can see the GPU -- confirm whether
# torch.cuda.is_available() was the intended check.
import tensorflow as tf
print(tf.test.is_built_with_cuda())

True


In [None]:
from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModelForSeq2SeqLM, AutoModelForMaskedLM
import torch
# Load the pretrained checkpoint

# Model and tokenizer come from the same checkpoint
checkpoint_name = "google/bigbird-pegasus-large-arxiv"
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_name)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

# Alternative checkpoints that were tried:
# tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base")
# model = AutoModelForPreTraining.from_pretrained("google/bigbird-roberta-base")

# tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-large")
# model = AutoModelForMaskedLM.from_pretrained("google/bigbird-roberta-large")

# Set the generation length bounds on the model config, then show the config
model.config.max_length, model.config.min_length = 500, 350
print(model.config)

# Tokenization lengths (source documents / target summaries)
encoder_max_length, decoder_max_length = 256, 128

BigBirdPegasusConfig {
  "_name_or_path": "google/bigbird-pegasus-large-arxiv",
  "activation_dropout": 0.0,
  "activation_function": "gelu_new",
  "architectures": [
    "BigBirdPegasusForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "attention_type": "block_sparse",
  "block_size": 64,
  "bos_token_id": 2,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 16,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 16,
  "eos_token_id": 1,
  "gradient_checkpointing": false,
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "length_penalty": 0.8,
  "max_length": 500,
  "max_position_embeddings": 4096,
  "min_length": 350,
  "model_type": "bigbird_pegasus",
  "num_beams": 5,
  "num_hidden_layers": 16,
  "num_random_blocks": 3,
  "pad_token_id": 0,
  "scale_embedding": t

In [None]:
# Show the tokenizer's settings (vocabulary size, max length, special tokens).
print(tokenizer)

PreTrainedTokenizerFast(name_or_path='google/bigbird-pegasus-large-arxiv', vocab_size=96103, model_max_len=4096, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("[MASK]", rstrip=False, lstrip=True, single_word=False, normalized=True)})


## Data

---

### Descarga y Preparación de los Datos

### Cargado de Dataset

In [None]:
# Load small slices of BillSum: the first 1000 examples of the "train" split
# for training and the first 100 examples of the "test" split, which this
# notebook uses as its validation set.
train_data_txt = datasets.load_dataset("billsum", '3.0.0', split="train[:1000]")
validation_data_txt = datasets.load_dataset("billsum", '3.0.0', split="test[:100]")

Using custom data configuration 3.0.0
Reusing dataset billsum (/root/.cache/huggingface/datasets/billsum/3.0.0/3.0.0/d1e95173aed3acb71327864be74ead49b578522e4c7206048b2f2e5351b57959)
Using custom data configuration 3.0.0
Reusing dataset billsum (/root/.cache/huggingface/datasets/billsum/3.0.0/3.0.0/d1e95173aed3acb71327864be74ead49b578522e4c7206048b2f2e5351b57959)


**Preprocess and tokenize**

In [None]:
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    """Tokenize a batch of documents and their summaries for seq2seq training.

    Args:
        batch: Mapping with a "text" entry (source documents) and a "summary"
            entry (reference summaries).
        tokenizer: Callable tokenizer exposing ``pad_token_id``. It is invoked
            with ``padding="max_length"`` and ``truncation=True``.
        max_source_length: ``max_length`` passed when encoding the documents.
        max_target_length: ``max_length`` passed when encoding the summaries.

    Returns:
        dict holding every field the tokenizer produced for the documents plus
        a "labels" entry with the summary token ids, where each id equal to
        ``tokenizer.pad_token_id`` is replaced by -100.
    """
    documents, summaries = batch["text"], batch["summary"]
    shared_kwargs = {"padding": "max_length", "truncation": True}

    encoded_documents = tokenizer(documents, max_length=max_source_length, **shared_kwargs)
    encoded_summaries = tokenizer(summaries, max_length=max_target_length, **shared_kwargs)

    pad_id = tokenizer.pad_token_id

    def mask_padding(token_ids):
        # Padding positions become -100 so they are ignored in the loss
        return [-100 if tok == pad_id else tok for tok in token_ids]

    features = dict(encoded_documents.items())
    features["labels"] = list(map(mask_padding, encoded_summaries["input_ids"]))
    return features


def _tokenize_split(dataset):
    """Tokenize one text split and drop its original text columns.

    Documents are encoded to ``encoder_max_length`` tokens and summaries to
    ``decoder_max_length`` tokens (the notebook-level constants). These used
    to be hard-coded to 10, which truncated every document and summary to 10
    tokens and did not match the length used when generating summaries.
    """
    return dataset.map(
        lambda batch: batch_tokenize_preprocess(
            batch, tokenizer, encoder_max_length, decoder_max_length
        ),
        batched=True,
        remove_columns=dataset.column_names,
    )


# Both splits go through exactly the same preprocessing.
train_data = _tokenize_split(train_data_txt)
validation_data = _tokenize_split(validation_data_txt)

Loading cached processed dataset at /root/.cache/huggingface/datasets/billsum/3.0.0/3.0.0/d1e95173aed3acb71327864be74ead49b578522e4c7206048b2f2e5351b57959/cache-3f636f53c3c3fd4b.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/billsum/3.0.0/3.0.0/d1e95173aed3acb71327864be74ead49b578522e4c7206048b2f2e5351b57959/cache-126858f242e16f21.arrow


## Training

---

### Metrics

In [None]:
# Borrowed from https://github.com/huggingface/transformers/blob/master/examples/seq2seq/run_summarization.py

# Fetch the "punkt" resource without console output; nltk.sent_tokenize is
# used by postprocess_text below.
nltk.download("punkt", quiet=True)

# ROUGE metric object used by compute_metrics below.
# NOTE(review): `datasets.load_metric` is believed to be deprecated in newer
# `datasets` releases -- confirm against the installed version.
metric = datasets.load_metric("rouge")


def postprocess_text(preds, labels):
    """Reformat decoded predictions and references for ROUGE scoring.

    Each text is stripped of surrounding whitespace, split into sentences with
    ``nltk.sent_tokenize`` and re-joined with one sentence per line.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        Tuple ``(preds, labels)`` of lists of reformatted strings.
    """

    def one_sentence_per_line(text):
        # rougeLSum expects newline after each sentence
        return "\n".join(nltk.sent_tokenize(text.strip()))

    return (
        [one_sentence_per_line(prediction) for prediction in preds],
        [one_sentence_per_line(reference) for reference in labels],
    )


def compute_metrics(eval_preds):
    """Compute ROUGE scores and the mean generated length for an evaluation.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            be a tuple, in which case its first element is used.

    Returns:
        dict mapping every key returned by the ROUGE metric to its mid
        F-measure scaled to 0-100, plus "gen_len" (mean number of non-padding
        tokens per prediction). All values are rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 cannot be decoded. Labels use it for ignored positions, and
    # predictions can contain it as well (e.g. when generated batches are
    # padded to a common length), so both are mapped back to the pad id.
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Keep the mid F-measure of each ROUGE variant, as a percentage
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Generated length = number of non-padding tokens in each prediction
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

### Training arguments

In [None]:
# Hyper-parameters and bookkeeping options for the fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    output_dir="results",  # checkpoints are written under this directory
    num_train_epochs=10,  # demo
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=2,  # demo
    per_device_eval_batch_size=2,
    learning_rate=3e-04,
    warmup_steps=500,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    predict_with_generate=True,  # needed for the ROUGE metrics
    logging_dir="logs",
    logging_steps=50,  # training loss is logged every 50 steps
    save_total_limit=3,  # older checkpoints are deleted beyond this limit
)

# Collator that assembles seq2seq batches for the trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Trainer wiring together the model, the tokenized splits and the ROUGE-based
# metric function.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

### Train

Wandb integration

In [None]:
if WANDB_INTEGRATION:
    # Hyper-parameters recorded alongside the run
    run_config = {
        "per_device_train_batch_size": training_args.per_device_train_batch_size,
        "learning_rate": training_args.learning_rate,
        "dataset": "BillSum",
    }
    wandb_run = wandb.init(project="BigBird_Billsum_FT", config=run_config)

    # Name the run after the current wall-clock time (HHMMSS)
    wandb_run.name = "run_" + datetime.now().strftime("%H%M%S")

Evaluate before fine-tuning

In [None]:
# Baseline: score the pretrained checkpoint on the validation set before any
# fine-tuning has happened.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
Attention type 'block_sparse' is not possible if sequence_length: 10 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


{'eval_gen_len': 498.57,
 'eval_loss': 11.246406555175781,
 'eval_rouge1': 0.7086,
 'eval_rouge2': 0.0083,
 'eval_rougeL': 0.7118,
 'eval_rougeLsum': 0.7046,
 'eval_runtime': 523.3183,
 'eval_samples_per_second': 0.191,
 'eval_steps_per_second': 0.096}

Train the model

In [None]:
# The TensorFlow import is kept for any later cell that relies on `tf`, but it
# now comes first: only a cell's last expression is displayed, so the CUDA
# check below used to be evaluated and silently discarded.
import tensorflow as tf

# Whether PyTorch (the framework used for training here) can see a CUDA device.
torch.cuda.is_available()

In [None]:
# Report the GPU that PyTorch will train on. This replaces the TensorFlow
# device query: the training below runs through PyTorch, and initializing
# TensorFlow on the GPU is understood to reserve GPU memory by default --
# NOTE(review): confirm against the TensorFlow GPU guide.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "no CUDA device visible to PyTorch"

'/device:GPU:0'

In [None]:
# %%wandb
# uncomment to display Wandb charts

# Run the fine-tuning loop configured in `training_args`.
trainer.train()

***** Running training *****
  Num examples = 1000
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 5000


Step,Training Loss
50,10.9412
100,9.1534
150,8.6169
200,8.2971
250,7.8905
300,7.2746
350,6.7655
400,6.2447
450,6.3015
500,6.3043


Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in results/checkpoint-500/tokenizer_config.json
Special tokens file saved in results/checkpoint-500/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-4000] due to args.save_total_limit
Saving model checkpoint to results/checkpoint-1000
Configuration saved in results/checkpoint-1000/config.json
Model weights saved in results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in results/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-4500] due to args.save_total_limit
Saving model checkpoint to results/checkpoint-1500
Configuration saved in results/checkpoint-1500/config.json
Model weights saved in results/checkpoint-1500/pytorch_model.bin
tokenizer 

TrainOutput(global_step=5000, training_loss=4.5079508392333985, metrics={'train_runtime': 1447.9641, 'train_samples_per_second': 6.906, 'train_steps_per_second': 3.453, 'total_flos': 282056294400000.0, 'train_loss': 4.5079508392333985, 'epoch': 10.0})

Evaluate after fine-tuning

In [None]:
# Score the fine-tuned model on the same validation set as the baseline.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 100
  Batch size = 2


{'epoch': 10.0,
 'eval_gen_len': 351.79,
 'eval_loss': 6.039236068725586,
 'eval_rouge1': 2.7213,
 'eval_rouge2': 0.5139,
 'eval_rougeL': 2.2297,
 'eval_rougeLsum': 2.6067,
 'eval_runtime': 372.1941,
 'eval_samples_per_second': 0.269,
 'eval_steps_per_second': 0.134}

In [None]:
# Close the W&B run that was opened before training.
if WANDB_INTEGRATION:
    wandb_run.finish()




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/gen_len,█▁
eval/loss,█▁
eval/rouge1,▁█
eval/rouge2,▁█
eval/rougeL,▁█
eval/rougeLsum,▁█
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁█
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████

0,1
eval/gen_len,351.79
eval/loss,6.03924
eval/rouge1,2.7213
eval/rouge2,0.5139
eval/rougeL,2.2297
eval/rougeLsum,2.6067
eval/runtime,372.1941
eval/samples_per_second,0.269
eval/steps_per_second,0.134
train/epoch,10.0


## Evaluation

---

**Generate summaries from the fine-tuned model and compare them with those generated from the original, pre-trained one.**

In [None]:
from transformers import BigBirdTokenizer, BigBirdModel
import torch
def generate_summary(test_samples, model):
    """Generate summaries for a batch of documents with ``model``.

    Uses the notebook-level ``tokenizer`` and ``encoder_max_length``.

    Args:
        test_samples: Mapping whose "text" entry holds the source documents.
        model: Model exposing ``device`` and ``generate``.

    Returns:
        Tuple ``(outputs, output_str)``: the value returned by
        ``model.generate`` and the corresponding strings decoded with special
        tokens skipped.
    """
    encoded = tokenizer(
        test_samples["text"],
        return_tensors="pt",
        max_length=encoder_max_length,
        truncation=True,
        padding="max_length",
    )
    # Inputs must live on the same device as the model
    device = model.device
    outputs = model.generate(
        encoded.input_ids.to(device),
        attention_mask=encoded.attention_mask.to(device),
    )
    return outputs, tokenizer.batch_decode(outputs, skip_special_tokens=True)


# Take the first 16 validation examples for a qualitative comparison.
test_samples = validation_data_txt.select(range(16))

# generate_summary returns (generated ids, decoded strings); keep the strings.
summaries_after_tuning = generate_summary(test_samples, model)[1]



RuntimeError: ignored

In [None]:
# model_before_tuning =  AutoModelForSeq2SeqLM.from_pretrained("google/bigbird-pegasus-large-arxiv")
# summaries_before_tuning = generate_summary(test_samples, model_before_tuning)[1]

model_before_tuning = ['in a recent letter to the editor of _ physica a _, a. s. aquilar, a. a. de oliveira, j. a. de oliveira, m. a. de oliveira, j. a. de oliveira, s. a. de oliveira, e. a. de oliveira, s. a. de oliveira, c. m. de oliveira, f. a. de oliveira, s. a. de oliveira, s. a. de oliveira, j. a. de oliveira, s. a. de oliveira, e. a. de oliveira, s. a. de oliveira, s. a. de oliveira, s. a. de oliveira, s. a. de oliveira, s. a. de oliveira, s. a. de oliveira, s. a. de oliveira, s. a. de oliveira, s. a. de oliveira, s. a. de olive',
 'this may be cited as : 1.. <n> ( a). : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : ',
 'in this letter, we express our deepest gratitude to the people of the united states of america for their decades of dedicated service to our country and to the world.<n> in addition, we express our deepest gratitude to the people of the united states of america for their decades of dedicated service to our country and to the world.',
 'we address here the question of whether or not the city of manchester has the right to require that any candidate for mayor be a citizen of manchester.<n> we also address the question of whether or not the city has the right to demand that any candidate for mayor be a citizen of manchester.',
 'we address the following question : 1. for a finite period of time ( 180 days ).<n> 2. for a finite period of time ( 45 days ).<n> 3. for a finite period of time ( 180 days ).<n> 4. for a finite period of time ( 45 days ).<n> 5. for a finite period of time ( 180 days ).<n> 6. for a finite period of time ( 45 days ).<n> 7. for a finite period of time ( 180 days ).<n> 8. for a finite period of time ( 45 days ).<n> 9. for a finite period of time ( 180 days ).<n> 10. for a finite period of time ( 45 days ).<n> 11. for a finite period of time ( 180 days ).<n> 12. for a finite period of time ( 45 days ).<n> 14. for a finite period of time ( 180 days ).<n> 15. for a finite period of time ( 45 days ).<n> 16. for a finite period of time (',
 'we address the following question : under what conditions can a controlled substance program be established and maintained?.<n> the answer is : under what conditions can a controlled substance program be established and maintained?.<n> the program is defined by : ( i ) the number of patients for which a controlled substance has been prescribed, ( ii ) the number of patients for which a controlled substance has been administered, and ( iii ) the total number of patients for which a controlled substance has been administered.<n> the program is defined by : ( i ) the number of patients for which a controlled substance has been prescribed, ( ii ) the number of patients for which a controlled substance has been administered, and ( iii ) the total number of patients for which a controlled substance has been administered.<n> the program is defined by : ( i ) the number of patients for which a controlled substance has been prescribed, ( ii ) the number of patients for which a controlled substance has been administered, and ( iii ) the total number of patients for which a controlled substance has been administered.<n> the program is defined by : ( i ) the number of patients for which',
 'in this brief report, we present the results of our study of the water cycle in the earth s mantle.<n> we show that the water cycle in the mantle of the earth can be divided into two parts.<n> the first part is adiabatic.<n> the second part is non - adiabatic.<n> we show that the water cycle in the mantle of the earth can be divided into two parts.<n> the first part is adiabatic.<n> the second part is non - adiabatic. in this report, we present the results of our study of the water cycle in the earth s mantle.<n> we show that the water cycle in the mantle of the earth can be divided into two parts.<n> the first part is adiabatic.<n> the second part is non - adiabatic.<n> we show that the water cycle in the mantle of the earth can be divided into two parts.<n> the first part is adiabatic.<n> the second part is non - adiabatic. in this report, we present the results of our study of the water cycle in the earth s mantle.<n> we',
 "in this brief note, we point out a flaw in the definition of  supercritical '' in eq.2(a ). in eq.2(b ), the definition of  supercritical '' in eq.2(a ) is as follows : in the definition of  supercritical '',  a supercritical point '' is defined as the point where the difference between the critical point and the value of the critical point is less than the critical point itself.  in eq.2(a ), the supercritical point is defined as the point at which the value of the critical point is equal to the critical point itself.  in eq.2(b ), the supercritical point is defined as the point at which the value of the critical point is equal to the critical point itself.  in eq.2(c ), the supercritical point is defined as the point at which the value of the critical point is equal to the critical point itself.  in eq.(d ), the supercritical point is defined as the point at which the value of the critical point is equal to",
 'the following report was submitted to the editor of _ american journal of physics _ : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : :',
 'the tribe refers to a consumer who order(s ) smokeless product by means of voice or other method, or to a consumer who order(s ) smokeless product by means of mail or other method, or to a consumer who order(s ) smokeless product by use of a common carrier, or to a consumer who order(s ) smokeless product by use of a common carrier, or to a consumer who order(s ) smokeless product by use of a common carrier.<n> the tribe is divided into two parts.<n> the first part refers to the tribe which is located at the state of virginia.<n> the second part refers to the tribe located at the state of new york.<n> the state of virginia is divided into two parts. in the state of new york, the state of virginia is divided into two parts. in the state of new york, the state of virginia is divided into two parts. in the state of new york, the state of virginia is divided into two parts. in the state of new york, the state of virginia is divided into two parts. in the state of new york ',
 'in this brief report, we address some of the questions raised in @xcite and @xcite, as well as in @xcite and @xcite.',
 'in this brief report, we summarize the results of our analysis of the dna assembly at the sloan digital sky survey ( sdss ) facility.<n> we find that : _ ( i ) _ dna assembly at the sdss facility is successful ; _ ( ii ) _ dna assembly at the sdss facility is successful ; _ ( iii ) _ dna assembly at the sdss facility is successful ; _ ( iv ) _ dna assembly at the sdss facility is successful ; _ ( v ) _ dna assembly at the sdss facility is successful ; _ ( vi ) _ dna assembly at the sdss facility is successful ; _ ( vi ) _ dna assembly at the sdss facility is successful ; _ ( v ) _ dna assembly at the sdss facility is successful ; _ ( vi ) _ dna assembly at the sdss facility is successful ; _ ( vi ',
 'a higher education accrediting agency ( haa ) has recently determined that there is a conflict of interest in the accrediting process of a single institution.<n> this is because the haa has determined that there is a conflict between the accrediting requirements of the institution and those of an independent accrediting agency.<n> the conflict of interest arises because the haa has determined that there is a conflict of interest between the accrediting requirements of the institution and those of an independent accrediting agency.<n> the conflict of interest arises because the haa has determined that there is a conflict of interest between the accrediting requirements of the institution and those of an independent accrediting agency.<n> the conflict of interest arises because the haa has determined that there is a conflict of interest between the accrediting requirements of the institution and those of an independent accrediting agency.<n> the conflict of interest arises because the haa has determined that there is a conflict of interest between the accrediting requirements of the institution and those of an independent accrediting agency.<n> the conflict of interest arises because the haa has determined that there is a conflict of interest between the accrediting requirements of the institution and those of an independent accrediting agency.<n> the conflict of interest arises',
 'hydrogen is one of the most abundant elements in the universe. yet it is also one of the least abundant.<n> hydrogen is the most abundant element in the universe.<n> the reason for this is twofold.<n> first, hydrogen is the most abundant element in the universe.<n> second, hydrogen is also one of the least abundant.<n> there are many reasons for hydrogen being the least abundant element in the universe.<n> the reason is twofold.<n> first, hydrogen is the most abundant element in the universe.<n> there are many reasons for hydrogen being the least abundant element in the universe.<n> there are many reasons for hydrogen being the least abundant element in the universe.<n> there are many reasons for hydrogen being the least abundant element in the universe.<n> there are many reasons for hydrogen being the least abundant element in the universe.<n> there are many reasons for hydrogen being the least abundant element in the universe.<n> there are many reasons for hydrogen being the least abundant element in the universe.<n> there are many reasons for hydrogen being the least abundant element in the universe.<n> there are many reasons for hydrogen being the least abundant element in the universe.<n> there are many reasons for',
 'based on a recent report of a high level of elevated levels of mercury in the atmosphere of the kingfish, a request has been made to the government of the commonwealth of hawaii for the establishment of a laboratory for the assessment of the levels of mercury present in the atmosphere of the kingfish. the laboratory for the assessment of the levels of mercury present in the atmosphere of the kingfish, based on a recent report of a high level of elevated levels of mercury in the atmosphere of the kingfish, a request has been made to the government of the commonwealth of hawaii for the establishment of a laboratory for the assessment of the levels of mercury present in the atmosphere of the kingfish. _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ ',
 'we report the results of a study of the effects of heavy elements on the stability of a heavy - ion collision ( hic ) site.<n> the study was performed as part of the hic legacy research program for the national institute of standards and technology ( nimt ) in the context of the heavy - ion collision ( hic ) modeling and prediction.<n> heavy - ion collision ( hic ) is one of the most important problems in the field of heavy - ion collision ( hic ) physics.<n> the hic is characterized by a large number of phenomena, including : formation and propagation of : ( i ) fragments ; ( ii ) nuclei ; ( iii ) secondary nuclei ; ( iv ) secondary nuclei ; ( v ) secondary nuclei ; ( vi ) secondary nuclei ; ( vii ) secondary nuclei ; ( vi ) secondary nuclei ; ( v ) secondary nuclei ; ( vi ) secondary nuclei ; ( vii ) secondary nuclei ; ( vi ) secondary nuclei ; ( viii ) secondary nuclei ; ( vi ) secondary nuclei ; ( vii )']

In [None]:
# The cached pre-tuning summaries in the previous cell are assigned to
# `model_before_tuning`, so `summaries_before_tuning` was never defined and
# this cell raised NameError on a fresh kernel. Bind the name before showing it.
summaries_before_tuning = model_before_tuning
summaries_before_tuning

In [None]:
# Each entry: (heading printed before the table, column header, texts to list).
# The first table has no heading.
sections = [
    (None, "Summary after", summaries_after_tuning),
    ("\nTarget summaries:\n", "Target summary", test_samples["summary"]),
    ("\nSource documents:\n", "Document", test_samples["text"]),
]

for heading, column, texts in sections:
    if heading is not None:
        print(heading)
    # Rows are (index, text) pairs
    print(tabulate(list(enumerate(texts)), headers=["Id", column]))