In [1]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)
from datasets import load_dataset

# Configuration
MODEL_NAME = "gpt2"
DATASET_NAME = "bio-nlp-umass/bioinstruct"
OUTPUT_DIR = "./gpt2-bioinstruct-finetuned"
LEARNING_RATE = 2e-4
BATCH_SIZE = 12
GRADIENT_ACCUMULATION_STEPS = 12
EPOCHS = 1

In [None]:
# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [3]:
# Load dataset
dataset = load_dataset(DATASET_NAME, split="train") # using train split.

README.md:   0%|          | 0.00/5.02k [00:00<?, ?B/s]

biomed_instruct_25k.json:   0%|          | 0.00/13.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25005 [00:00<?, ? examples/s]

In [4]:
# Preprocessing function (format for instruction following)
def preprocess_function(examples):
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]

    prompts = [f"Instruction: {instruction}\nInput: {input_text}\nOutput: {output_text}"
               for instruction, input_text, output_text in zip(instructions, inputs, outputs)]

    model_inputs = tokenizer(prompts, max_length=512, truncation=True, padding="max_length")
    model_inputs["labels"] = model_inputs["input_ids"].copy()
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/25005 [00:00<?, ? examples/s]

In [8]:
# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    logging_dir="./logs",
    logging_steps=10,
    optim="adamw_torch",
    fp16=True,
    lr_scheduler_type="linear",
    warmup_ratio=0.03,
    save_strategy="steps",
    push_to_hub=False,
    report_to = "tensorboard",
    save_total_limit = 1
)

In [9]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [10]:
# Train
trainer.train()

Step,Training Loss
10,8.5297
20,1.1946
30,0.4641
40,0.4236
50,0.3945
60,0.3969
70,0.3721
80,0.3769
90,0.3747
100,0.3805


TrainOutput(global_step=173, training_loss=0.8990579464532047, metrics={'train_runtime': 1943.8994, 'train_samples_per_second': 12.863, 'train_steps_per_second': 0.089, 'total_flos': 6509307101184000.0, 'train_loss': 0.8990579464532047, 'epoch': 0.9961612284069098})

In [14]:
# Example input (instruction and input text)
instruction = "Explain the mechanism of action of a given drug in non-medical terms."
input_text = "Metformin"

# Construct the prompt
prompt = f"Instruction: {instruction}\nInput: {input_text}\nOutput:"

# Tokenize the prompt
model_inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.7` -- this flag is only 
# used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
#  UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.9` -- this flag is only used 
# in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.

# Generate the output
model.eval()  # Set model to evaluation mode
with torch.no_grad():
    outputs = model.generate(
        **model_inputs,
        max_length=200,  # Adjust as needed
        num_return_sequences=1,  # Generate one output
        temperature=0.7,  # Adjust for creativity
        top_p=0.9,  # Adjust for sampling
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id #Set pad token id
    )

# Decode the generated output
generated_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the results
print("Instruction:", instruction)
print("Input:", input_text)
print("Generated Output:", generated_output)

Instruction: Explain the mechanism of action of a given drug in non-medical terms.
Input: Metformin
Generated Output: Instruction: Explain the mechanism of action of a given drug in non-medical terms.
Input: Metformin
Output: Metformin works by inhibiting the synthesis of beta-blockers, which are the chemicals involved in the production of prostaglandins, the chemicals responsible for the immune system's response to an injury.


In [15]:
# Example input (instruction and input text)
instruction = "Explain the concept of 'gene expression' in simple terms."
input_text = ""  # No input text is needed for this example

# Construct the prompt
prompt = f"Instruction: {instruction}\nInput: {input_text}\nOutput:"

# Tokenize the prompt
model_inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.7` -- this flag is only 
# used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
#  UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.9` -- this flag is only used 
# in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.

# Generate the output
model.eval()  # Set model to evaluation mode
with torch.no_grad():
    outputs = model.generate(
        **model_inputs,
        max_length=200,  # Adjust as needed
        num_return_sequences=1,  # Generate one output
        temperature=0.7,  # Adjust for creativity
        top_p=0.9,  # Adjust for sampling
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id #Set pad token id
    )

# Decode the generated output
generated_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the results
print("Instruction:", instruction)
print("Input:", input_text)
print("Generated Output:", generated_output)

Instruction: Explain the concept of 'gene expression' in simple terms.
Input: 
Generated Output: Instruction: Explain the concept of 'gene expression' in simple terms.
Input: 
Output: Gene expression refers to the process of producing new proteins or nucleic acids. It is a crucial part of the immune system and plays a critical role in maintaining healthy immune function and overall health.


In [16]:
# Example input (instruction and input text)
instruction = "Answer the following question."
input_text = "What are the main causes of type 2 diabetes?"

# Construct the prompt
prompt = f"Instruction: {instruction}\nInput: {input_text}\nOutput:"

# Tokenize the prompt
model_inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.7` -- this flag is only 
# used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
#  UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.9` -- this flag is only used 
# in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.

# Generate the output
model.eval()  # Set model to evaluation mode
with torch.no_grad():
    outputs = model.generate(
        **model_inputs,
        max_length=200,  # Adjust as needed
        num_return_sequences=1,  # Generate one output
        temperature=0.7,  # Adjust for creativity
        top_p=0.9,  # Adjust for sampling
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id #Set pad token id
    )

# Decode the generated output
generated_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the results
print("Instruction:", instruction)
print("Input:", input_text)
print("Generated Output:", generated_output)

Instruction: Answer the following question.
Input: What are the main causes of type 2 diabetes?
Generated Output: Instruction: Answer the following question.
Input: What are the main causes of type 2 diabetes?
Output: Main causes of type 2 diabetes include obesity, high blood pressure, and smoking.


In [17]:
!zip -r "gpt2_bio_instruct_ft.zip" "/kaggle/working"

  adding: kaggle/working/ (stored 0%)
  adding: kaggle/working/.virtual_documents/ (stored 0%)
  adding: kaggle/working/logs/ (stored 0%)
  adding: kaggle/working/logs/events.out.tfevents.1740231821.cfdb00881f52.31.0 (deflated 62%)
  adding: kaggle/working/gpt2-bioinstruct-finetuned/ (stored 0%)
  adding: kaggle/working/gpt2-bioinstruct-finetuned/checkpoint-173/ (stored 0%)
  adding: kaggle/working/gpt2-bioinstruct-finetuned/checkpoint-173/generation_config.json (deflated 24%)
  adding: kaggle/working/gpt2-bioinstruct-finetuned/checkpoint-173/merges.txt (deflated 53%)
  adding: kaggle/working/gpt2-bioinstruct-finetuned/checkpoint-173/rng_state.pth (deflated 25%)
  adding: kaggle/working/gpt2-bioinstruct-finetuned/checkpoint-173/config.json (deflated 51%)
  adding: kaggle/working/gpt2-bioinstruct-finetuned/checkpoint-173/tokenizer_config.json (deflated 54%)
  adding: kaggle/working/gpt2-bioinstruct-finetuned/checkpoint-173/training_args.bin (deflated 52%)
  adding: kaggle/working/gpt2-b