
# Fine-tuning BART Summarization

---

In [2]:
!nvidia-smi

Fri Nov 26 17:49:12 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 496.13       Driver Version: 496.13       CUDA Version: 11.5     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0  On |                  N/A |
| N/A   68C    P8     8W /  N/A |    355MiB /  8192MiB |      2%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Setup

---

In [3]:
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension



Config option `kernel_spec_manager_class` not recognized by `EnableNBExtensionApp`.
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: ok


In [4]:
! pip install transformers
! pip install datasets
! pip install sentencepiece
! pip install rouge_score
! pip install wandb



In [5]:
import torch
import numpy as np
import datasets

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

from tabulate import tabulate
import nltk
from datetime import datetime

In [6]:
WANDB_INTEGRATION = True
if WANDB_INTEGRATION:
    import wandb

    wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mzeroranger[0m (use `wandb login --relogin` to force relogin)


## Model and tokenizer

---

Hiperparámetros: 

[HF Bart configuration](https://huggingface.co/transformers/_modules/transformers/configuration_bart.html)

[Fairseq Bart](https://github.com/pytorch/fairseq/tree/master/examples/bart)

In [7]:
#Llamado del modelo
model_name = "sshleifer/distilbart-xsum-12-3"

#Definición de modelo y tokenizador
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Se fijan los parámetros del modelo
model.config.activation_dropout = 0.0
print(model.config)

# tokenización
encoder_max_length = 256 
decoder_max_length = 64

BartConfig {
  "_name_or_path": "sshleifer/distilbart-xsum-12-3",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 3,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "eos_token_ids": [
    2
  ],
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": false,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decode

## Data

---

In [8]:
#Lista de datasets compatibles con la biblioteca
datasets.list_datasets()

['0n1xus/codexglue',
 'AConsApart/anime_subtitles_DialoGPT',
 'ARTeLab/fanpage',
 'Abdo1Kamr/Arabic_Hadith',
 'AdWeeb/DravidianMT',
 'Adnan/Urdu_News_Headlines',
 'Akshith/aa',
 'Akshith/g_rock',
 'Akshith/test',
 'AlekseyDorkin/extended_tweet_emojis',
 'Annielytics/DoctorsNotes',
 'AryanLala/autonlp-data-Scientific_Title_Generator',
 'Avishekavi/Avi',
 'BSC-TeMU/SQAC',
 'BSC-TeMU/ancora-ca-ner',
 'BSC-TeMU/sts-ca',
 'BSC-TeMU/tecla',
 'BSC-TeMU/viquiquad',
 'BSC-TeMU/xquad-ca',
 'Babelscape/rebel-dataset',
 'Binbin/my_dataset',
 'BlakesOrb6/Fred-Flintstone',
 'Bosio/pacman',
 'Bosio/pacman_descriptions',
 'CAGER/rick',
 'CShorten/KerasBERT',
 'CShorten/ZillowPrize',
 'ChadxxxxHall/Inter-vision',
 'Champion/vpc2020_clear_anon_speech',
 'Check/a_re_gi',
 'Check/region_1',
 'Check/region_2',
 'Check/region_3',
 'Check/region_4',
 'Check/region_5',
 'Check/region_6',
 'Check/region_7',
 'Check/region_8',
 'Check/region_9',
 'Check/regions',
 'Check/vverify',
 'Chun/dataset',
 'Chuu/Vhh',


### Descarga y Preparación de los Datos

### Cargado de Dataset

In [9]:
train_data_txt = datasets.load_dataset("cnn_dailymail", '3.0.0', split="train[:500]")
validation_data_txt = datasets.load_dataset("cnn_dailymail", '3.0.0', split="validation[:500]")

Reusing dataset cnn_dailymail (C:\Users\JumpNShootMan\.cache\huggingface\datasets\cnn_dailymail\3.0.0\3.0.0\3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)
Reusing dataset cnn_dailymail (C:\Users\JumpNShootMan\.cache\huggingface\datasets\cnn_dailymail\3.0.0\3.0.0\3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)


**Preprocess and tokenize**

In [10]:
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    source, target = batch["article"], batch["highlights"]
    source_tokenized = tokenizer(
        source, padding="max_length", truncation=True, max_length=max_source_length
    )
    target_tokenized = tokenizer(
        target, padding="max_length", truncation=True, max_length=max_target_length
    )

    batch = {k: v for k, v in source_tokenized.items()}
    # Ignore padding in the loss
    batch["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in l]
        for l in target_tokenized["input_ids"]
    ]
    return batch


train_data = train_data_txt.map(
    lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    ),
    batched=True,
    remove_columns=train_data_txt.column_names,
)

validation_data = validation_data_txt.map(
    lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    ),
    batched=True,
    remove_columns=validation_data_txt.column_names,
)

Loading cached processed dataset at C:\Users\JumpNShootMan\.cache\huggingface\datasets\cnn_dailymail\3.0.0\3.0.0\3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234\cache-05d0fa29618d7a48.arrow
Loading cached processed dataset at C:\Users\JumpNShootMan\.cache\huggingface\datasets\cnn_dailymail\3.0.0\3.0.0\3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234\cache-6c84c9cc64e9d805.arrow


## Training

---

### Metrics

In [11]:
# Borrowed from https://github.com/huggingface/transformers/blob/master/examples/seq2seq/run_summarization.py

nltk.download("punkt", quiet=True)

metric = datasets.load_metric("rouge")


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract a few results from ROUGE
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

### Training arguments

In [12]:
training_args = Seq2SeqTrainingArguments(
    output_dir="results",
    num_train_epochs=5,  # demo
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=3,  # demo
    per_device_eval_batch_size=3,
    learning_rate=3e-04, #comentar?
    warmup_steps=500,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    predict_with_generate=True, #Para métricas ROUGE
    logging_dir="logs",
    logging_steps=50,
    save_total_limit=3,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

### Train

Wandb integration

In [13]:
wandb

<module 'wandb' from 'D:\\Anaconda\\envs\\ZREnv\\lib\\site-packages\\wandb\\__init__.py'>

In [14]:
if WANDB_INTEGRATION:
    wandb_run = wandb.init(
        project="BART_FT",
        config={
            "per_device_train_batch_size": training_args.per_device_train_batch_size,
            "learning_rate": training_args.learning_rate,
            "dataset": "cnn_dailymail",
        },
    )

    now = datetime.now()
    current_time = now.strftime("%H%M%S")
    wandb_run.name = "run_" + current_time

[34m[1mwandb[0m: wandb version 0.12.7 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Evaluate before fine-tuning

In [15]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 500
  Batch size = 3


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


{'eval_loss': 6.269456386566162,
 'eval_rouge1': 23.8903,
 'eval_rouge2': 6.5581,
 'eval_rougeL': 17.4639,
 'eval_rougeLsum': 20.3212,
 'eval_gen_len': 24.872,
 'eval_runtime': 125.6547,
 'eval_samples_per_second': 3.979,
 'eval_steps_per_second': 1.329}

Train the model

In [16]:
torch.cuda.is_available()
import tensorflow as tf

In [17]:
tf.test.gpu_device_name()

'/device:GPU:0'

In [18]:
#%%wandb
# uncomment to display Wandb charts

trainer.train()

***** Running training *****
  Num examples = 500
  Num Epochs = 5
  Instantaneous batch size per device = 3
  Total train batch size (w. parallel, distributed & accumulation) = 3
  Gradient Accumulation steps = 1
  Total optimization steps = 835


Step,Training Loss
50,6.0582
100,4.981
150,4.6248
200,4.2347
250,3.9153
300,4.0136
350,3.854
400,3.2877
450,3.5294
500,3.6187


Saving model checkpoint to results\checkpoint-500
Configuration saved in results\checkpoint-500\config.json
Model weights saved in results\checkpoint-500\pytorch_model.bin
tokenizer config file saved in results\checkpoint-500\tokenizer_config.json
Special tokens file saved in results\checkpoint-500\special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=835, training_loss=3.526165789758374, metrics={'train_runtime': 445.6977, 'train_samples_per_second': 5.609, 'train_steps_per_second': 1.873, 'total_flos': 967434240000000.0, 'train_loss': 3.526165789758374, 'epoch': 5.0})

Evaluate after fine-tuning

In [19]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 500
  Batch size = 3


{'eval_loss': 4.777433395385742,
 'eval_rouge1': 30.7197,
 'eval_rouge2': 10.8069,
 'eval_rougeL': 21.5491,
 'eval_rougeLsum': 28.07,
 'eval_gen_len': 51.326,
 'eval_runtime': 258.8836,
 'eval_samples_per_second': 1.931,
 'eval_steps_per_second': 0.645,
 'epoch': 5.0}

In [20]:
if WANDB_INTEGRATION:
    wandb_run.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
eval/gen_len,▁█
eval/loss,█▁
eval/rouge1,▁█
eval/rouge2,▁█
eval/rougeL,▁█
eval/rougeLsum,▁█
eval/runtime,▁█
eval/samples_per_second,█▁
eval/steps_per_second,█▁
train/epoch,▁▁▂▂▃▃▄▄▅▅▅▆▆▇▇███

0,1
eval/gen_len,51.326
eval/loss,4.77743
eval/rouge1,30.7197
eval/rouge2,10.8069
eval/rougeL,21.5491
eval/rougeLsum,28.07
eval/runtime,258.8836
eval/samples_per_second,1.931
eval/steps_per_second,0.645
train/epoch,5.0


## Evaluation

---

**Generate summaries from the fine-tuned model and compare them with those generated from the original, pre-trained one.**

In [21]:
def generate_summary(test_samples, model):
    inputs = tokenizer(
        test_samples["article"],
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
        return_tensors="pt",
    )
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)
    outputs = model.generate(input_ids, attention_mask=attention_mask)
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return outputs, output_str


model_before_tuning = AutoModelForSeq2SeqLM.from_pretrained(model_name)

test_samples = validation_data_txt.select(range(16))

summaries_before_tuning = generate_summary(test_samples, model_before_tuning)[1]
summaries_after_tuning = generate_summary(test_samples, model)[1]

loading configuration file https://huggingface.co/sshleifer/distilbart-xsum-12-3/resolve/main/config.json from cache at C:\Users\JumpNShootMan/.cache\huggingface\transformers\4a0f7fb35f1504b6e865136124e3781fb488792aa105a84a991a3145a027791f.10ebe969457e130b9da526e7994b6191d3765d1d01ac6abc2eb20bb8adcbd4e0
Model config BartConfig {
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 3,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "eos_token_ids": [
  

In [22]:
print(
    tabulate(
        zip(
            range(len(summaries_after_tuning)),
            summaries_after_tuning,
            summaries_before_tuning,
        ),
        headers=["Id", "Summary after", "Summary before"],
    )
)
print("\nTarget summaries:\n")
print(
    tabulate(list(enumerate(test_samples["highlights"])), headers=["Id", "Target summary"])
)
print("\nSource documents:\n")
print(tabulate(list(enumerate(test_samples["article"])), headers=["Id", "Document"]))

  Id  Summary after                                                                                                                         Summary before
----  ------------------------------------------------------------------------------------------------------------------------------------  ----------------------------------------------------------------------------------------------------------------------------------------------
   0  David Crosby is known for weaving multilayered harmonies over sweet melodies.                                                         One of the world's best-known musicians has been injured in a hit-and-run crash.
      The jogger was struck by his car while he was on the road.
      He survived on raw fish and rainwater, he says.
   1  This is the first time SAE's national chapter suspended the students.                                                                 The University of Oklahoma has suspended three students over a racist chant.
     