In [1]:
!pip install transformers datasets accelerate evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m930.9 kB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2


# Import library

In [2]:
from transformers import BartForConditionalGeneration, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
import torch
import random

import os
os.environ["WANDB_DISABLED"] = "true"


# a seed for reproducibility
SEED = 42
# set seed
np.random.seed(SEED)
torch.manual_seed(SEED)
random.seed(SEED)

# check for GPU device
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Device available:', device) 

2024-06-17 05:00:17.994402: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-17 05:00:17.994535: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-17 05:00:18.151255: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Device available: cuda:0


# Load model

In [3]:
model_name = "facebook/bart-base"
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [4]:
text = "My mother is a person I admire most. She devoted a lot of time and energy to the upbringing of my two brothers and 1. Despite working hard, she always made time to teach us many useful things which are necessary and important in our later lives. Moreover, she is a good role model for me to follow. She always tries to get on well with people who live next door and help everyone when they are in difficulties, so most of them respect and love her. I admire and look up to my mother because she not only brings me up well but also stands by me and gives some help if necessary. For example, when I encounter some difficulties, she will give me some precious advice to help me solve those problems. She has a major influence on me and 1 hope that I will inherit some of her traits."


inputs = tokenizer(text, max_length=1024, return_tensors="pt", truncation=True)
summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=40, max_length=160)
tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

'My mother is a person I admire most. She devoted a lot of time and energy to the upbringing of my two brothers and 1. Despite working hard, she always made time to teach us many useful things which are necessary and important in our later lives. Moreover, she is a good role model for me to follow. She always tries to get on well with people who live next door and help everyone when they are in difficulties, so most of them respect and love her. I admire and look up to my mother because she not only brings me up well but also stands by me and gives some help if necessary. For example, when I encounter some difficulties, she will give me some precious advice to help me solve those problems. She has a major influence on me and I hope that'

# Load dataset

In [5]:
from datasets import load_dataset

dataset_name = "ccdv/pubmed-summarization"
dataset = load_dataset(dataset_name, trust_remote_code=True) #, split="train[:1%]")

Downloading builder script:   0%|          | 0.00/5.13k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/2.66k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/779M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/43.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/43.8M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 119924
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6633
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6658
    })
})

In [7]:
# small_train_dataset = dataset['train'].shuffle(seed=42).select(range(1000))
# small_test_dataset = dataset['test'].shuffle(seed=42).select(range(1000))

In [8]:
def preprocess_function(examples):
  inputs = [doc for doc in examples["article"]]
  model_inputs = tokenizer(inputs, max_length=1024, truncation=True,padding='max_length')
  with tokenizer.as_target_tokenizer():
    labels = tokenizer(examples["abstract"], max_length=128, truncation=True, padding='max_length')
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [9]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, batch_size = 256)

# tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
# tokenized_test = small_test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/119924 [00:00<?, ? examples/s]



Map:   0%|          | 0/6633 [00:00<?, ? examples/s]

Map:   0%|          | 0/6658 [00:00<?, ? examples/s]

In [10]:
tokenized_dataset


#tokenized_train

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 119924
    })
    validation: Dataset({
        features: ['article', 'abstract', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 6633
    })
    test: Dataset({
        features: ['article', 'abstract', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 6658
    })
})

# Fine tune model

In [11]:
!pip install rouge_score

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=12177664bf8f9ffb27d68578b361cb1ea8aca3c5f8140320860e2d4e72381b29
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [12]:
import numpy as np
from nltk.tokenize import sent_tokenize
import evaluate

rouge_score = evaluate.load("rouge")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract the median scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [13]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [14]:
# import torch

# # Enable CUDA_LAUNCH_BLOCKING
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [15]:
# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# model = torch.nn.DataParallel(model, device_ids = [0,1]).to(device)

In [22]:
training_args = TrainingArguments(
    'bert-finetuning-cola',
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.005,
    save_total_limit=3,
    num_train_epochs=1,
    fp16=True,
    gradient_accumulation_steps=4,
    remove_unused_columns=False,
    report_to = 'none'  # Equivalent to UPDATE_FREQ in other frameworks
)

# use the pre-built metrics 
def compute_metrics(eval_preds):
    metric = load_metric("glue", "cola")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Initialize the Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset,
#     eval_dataset=tokenized_dataset
# )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=small_train_dataset,
#     eval_dataset=small_test_dataset,
# )



In [23]:
trainer.train()



Epoch,Training Loss,Validation Loss
0,2.2107,1.998531


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams

TrainOutput(global_step=3747, training_loss=2.3035198829636627, metrics={'train_runtime': 11936.2297, 'train_samples_per_second': 10.047, 'train_steps_per_second': 0.314, 'total_flos': 7.310984028880896e+16, 'train_loss': 2.3035198829636627, 'epoch': 0.9997998799279568})

# Push to hub

In [26]:
kwargs = {
    "dataset_tags": dataset_name,
    "dataset": dataset_name,
    "model_name": f"{model_name}-pubmed",
    "finetuned_from": model_name,
    "tasks": "text-sumarization",
}

In [27]:
trainer.push_to_hub(**kwargs)

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


CommitInfo(commit_url='https://huggingface.co/QuanHcmus/bert-finetuning-cola/commit/a1186e35c122cbbfa470bf6947be351a58090b08', commit_message='End of training', commit_description='', oid='a1186e35c122cbbfa470bf6947be351a58090b08', pr_url=None, pr_revision=None, pr_num=None)