## Install Required Libraries

In [None]:
!pip install transformers datasets wandb -q

## Import Required Libraries

In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
import wandb
import os

## Define Hyperparameters

In [None]:
model_name = "GanjinZero/biobart-v2-base"
max_seq_length = 512
learning_rate = 2e-5
weight_decay = 0.01
max_steps = 500
warmup_steps = 100
batch_size = 4
gradient_accumulation_steps = 4
lr_scheduler_type = "linear"
optimizer = "adamw_hf"
random_state = 3407
output_dir = "./biobart-finetuned"

## Load BioBart Model and Tokenizer

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [26]:
import inspect
print(inspect.signature(model.forward))

(input_ids: torch.LongTensor = None, attention_mask: Optional[torch.Tensor] = None, decoder_input_ids: Optional[torch.LongTensor] = None, decoder_attention_mask: Optional[torch.LongTensor] = None, head_mask: Optional[torch.Tensor] = None, decoder_head_mask: Optional[torch.Tensor] = None, cross_attn_head_mask: Optional[torch.Tensor] = None, encoder_outputs: Optional[List[torch.FloatTensor]] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, decoder_inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None) -> Union[Tuple, transformers.modeling_outputs.Seq2SeqLMOutput]


## Load Dataset (Here we are using Wiki Medical Terms as an example)

In [None]:
# Load dataset from Hugging Face Hub
dataset = load_dataset("gamino/wiki_medical_terms", split="train")

## Data Preprocessing for Seq2Seq Models

In [30]:
def preprocess_function(examples):
    max_length = 128  # Set your preferred maximum length here
    
    # Tokenize the page_text as the input (prompt)
    inputs = tokenizer(examples['page_text'], padding="max_length", truncation=True, max_length=max_length)

    # Tokenize the page_title as the labels (for supervised learning tasks)
    labels = tokenizer(examples['page_title'], padding="max_length", truncation=True, max_length=max_length)

    # Return the required fields for the model's forward method
    return {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': labels['input_ids']  # Labels should be the tokenized page_title
    }

# Apply preprocessing to the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/6861 [00:00<?, ? examples/s]

In [29]:
# Print the first few examples of tokenized data
for i in range(1):  # Change the range to print more examples if needed
    print(tokenized_dataset[0])


{'page_title': 'Paracetamol poisoning', 'page_text': 'Paracetamol poisoning, also known as acetaminophen poisoning, is caused by excessive use of the medication paracetamol (acetaminophen). Most people have few or non-specific symptoms in the first 24 hours following overdose. These include feeling tired, abdominal pain, or nausea. This is typically followed by a couple of days without any symptoms, after which yellowish skin, blood clotting problems, and confusion occurs as a result of liver failure. Additional complications may include kidney failure, pancreatitis, low blood sugar, and lactic acidosis. If death does not occur, people tend to recover fully over a couple of weeks. Without treatment, death from toxicity occurs 4 to 18 days later.Paracetamol poisoning can occur accidentally or as an attempt to die by suicide. Risk factors for toxicity include alcoholism, malnutrition, and the taking of certain other hepatotoxic medications. Liver damage results not from paracetamol itsel

## Define Training Arguments

In [31]:
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    warmup_steps=warmup_steps,
    max_steps=max_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    eval_strategy="no",
    report_to="wandb",  # Log training metrics to Weights and Biases
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=2,
    optim=optimizer,
    lr_scheduler_type=lr_scheduler_type,
    seed=random_state
)

## Initialize Trainer

In [32]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer, padding=True)

In [33]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


## Initialize Weights & Biases for Tracking (Optional)

In [34]:
wandb.init(project="biobart-finetuning", name="BioBart-Fine-Tuning")

VBox(children=(Label(value='0.021 MB of 0.021 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

## Start Training

In [35]:
trainer.train()

  self.pid = os.fork()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss
50,0.175
100,0.008
150,0.0065
200,0.0056
250,0.0038
300,0.0041
350,0.0035
400,0.0036
450,0.0022
500,0.0021


  self.pid = os.fork()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to a

TrainOutput(global_step=1000, training_loss=0.011381900995969772, metrics={'train_runtime': 1319.0126, 'train_samples_per_second': 24.261, 'train_steps_per_second': 0.758, 'total_flos': 2438031228272640.0, 'train_loss': 0.011381900995969772, 'epoch': 4.662004662004662})

In [47]:
!rm -rf /kaggle/working/*

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Evaluation

In [37]:
# Use HuggingFace's dataset splitting method instead of train_test_split
train_test = tokenized_dataset.train_test_split(test_size=0.1)

# Access the train and validation datasets
train_dataset = train_test['train']
val_dataset = train_test['test']

# Define the trainer again with the validation dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start evaluation
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")


max_steps is given, it will override any value given in num_train_epochs
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Evaluation Results: {'eval_loss': 0.0005446386057883501, 'eval_runtime': 9.0066, 'eval_samples_per_second': 76.277, 'eval_steps_per_second': 4.774}


## Testing Model Responses

In [39]:
def test_model_response(inputs):
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        device = next(model.parameters()).device
        # Tokenize the inputs and move to the correct device
        inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        
        # Use sampling with specified temperature and top_p
        outputs = model.generate(
            inputs['input_ids'], 
            max_length=150, 
            do_sample=True,   # Enable sampling
            temperature=0.9, 
            top_p=0.95,
            num_return_sequences=1  # Number of sequences to return for each input
        )
        
        # Decode outputs for each input
        generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        
        return [text.strip() for text in generated_texts]  # Return a list of generated responses

# Sample inputs to test
test_inputs = [
    "What should I do if I have a headache?",
    "I'm feeling very tired and unwell",
    "What are the symptoms of fever?"
]

# Print responses for each input
responses = test_model_response(test_inputs)
for input_text, response in zip(test_inputs, responses):
    print(f"Input: {input_text}\nResponse: {response}\n")


Input: What should I do if I have a headache?
Response: Headache

Input: I'm feeling very tired and unwell
Response: Tired and unwell

Input: What are the symptoms of fever?
Response: Fever



## Save Fine-Tuned Model

In [49]:
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


('/kaggle/working/saved_model/tokenizer_config.json',
 '/kaggle/working/saved_model/special_tokens_map.json',
 '/kaggle/working/saved_model/vocab.json',
 '/kaggle/working/saved_model/merges.txt',
 '/kaggle/working/saved_model/added_tokens.json',
 '/kaggle/working/saved_model/tokenizer.json')

In [52]:
import shutil
import os

shutil.make_archive(output_dir, 'zip', output_dir)

zip_file_path = f"{output_dir}.zip"
destination_path = "/kaggle/working/bertV2.zip"

# Check if the destination zip file already exists, and if so, remove it
if os.path.exists(destination_path):
    os.remove(destination_path)

shutil.move(zip_file_path, destination_path)

# Print the location of the downloaded zip file
print(f"Zip file created and moved to: {destination_path}")


Zip file created and moved to: /kaggle/working/bertV2.zip


In [55]:
print("/kaggle/working/bertV2.zip")

/kaggle/working/bertV2.zip
