# LLaMA 3.2 3B Fine-tuning and Evaluation on MedMCQA

In [1]:

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig, DataCollatorForLanguageModeling, pipeline
import torch
from tqdm import tqdm
import os

from peft import LoraConfig, get_peft_model

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# torch.cuda.current_device()


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

dataset = load_dataset("openlifescienceai/medmcqa", split='train')

# def format_example(example):
#     options = f"A. {example['opa']} B. {example['opb']} C. {example['opc']} D. {example['opd']}"
#     prompt = f"### Instruction:\n{example['question']}\nOptions: {options}\n\n### Response:\n{example['cop']}"
#     return {"text": prompt}

ans_to_idx_map = {"A":0, "B":1, "C":2, "D":3}
idx_to_ans_map = {0:"A", 1:"B", 2:"C", 3:"D"}

def format_example(example):
    instruction = "Answer the following multiple-choice question by giving the most appropriate response. The answer should be one of [A, B, C, D]."
    options = f"A. {example['opa']} B. {example['opb']} C. {example['opc']} D. {example['opd']}"
    prompt = f"{instruction}\nQuestion: {example['question']}\n{options}\nAnswer: {idx_to_ans_map[example['cop']]}"
    
    return {"text": prompt}

dataset = dataset.map(format_example)

print(len(dataset))
print(dataset[0]['text'])


182822
Answer the following multiple-choice question by giving the most appropriate response. The answer should be one of [A, B, C, D].
Question: Chronic urethral obstruction due to benign prismatic hyperplasia can lead to the following change in kidney parenchyma
A. Hyperplasia B. Hyperophy C. Atrophy D. Dyplasia
Answer: C


In [3]:
model_id = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

def tokenize_fn(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=512)

tokenized = dataset.map(tokenize_fn)


In [4]:
# add Quant_Lora
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,               # 4-bit quantization
    bnb_4bit_use_double_quant=True,  # double quantization for stability
    bnb_4bit_compute_dtype="bfloat16",  # or "float16"
    bnb_4bit_quant_type="nf4",       # best trade-off for LLaMA
)

# device_map={'':torch.cuda.current_device()}
# {'':torch.cuda.current_device()}, 
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=bnb_config)
# model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

# add lora
lora_config = LoraConfig(
    r=64,                        # LoRA rank
    lora_alpha=16,               # Alpha scaling factor
    target_modules=["q_proj", "v_proj"],  # target modules in transformer blocks
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()

Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.40s/it]


trainable params: 18,350,080 || all params: 3,231,099,904 || trainable%: 0.5679


In [None]:
args = TrainingArguments(
    # output_dir="./llama3-medmcqa-baseline",
    output_dir="./llama3-qlora-medmcqa", # LoRA
    per_device_train_batch_size=3, # 5 : 15gb, 4 - 13gb, 3 - 11gb
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    logging_steps=10,
    save_strategy="epoch",
    fp16=False,
    bf16=True,
    report_to="none",
    # max_steps=15
)

trainer = Trainer(
    # model=model, 
    model=peft_model, # LoRA
    args=args,
    train_dataset=tokenized,
    #train_batch_size = 8,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,2.4666
20,2.2621
30,1.908
40,1.6368
50,1.4103
60,1.3396
70,1.238
80,1.2293
90,1.3151
100,1.279


After this => Needs to be checked

Evaluation

In [None]:

val_data = load_dataset("openlifescienceai/medmcqa", split="validation[:100]")
model = AutoModelForCausalLM.from_pretrained("./llama3-medmcqa-baseline", torch_dtype=torch.float16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("./llama3-medmcqa-baseline", use_fast=True)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

correct = 0
total = 0

for sample in tqdm(val_data):
    options = f"A. {sample['opa']} B. {sample['opb']} C. {sample['opc']} D. {sample['opd']}"
    prompt = f"### Instruction:\n{sample['question']}\nOptions: {options}\n\n### Response:\n"
    output = pipe(prompt, max_new_tokens=10, do_sample=False)[0]['generated_text']
    answer = output.split("### Response:\n")[-1].strip()[:1].upper()
    if answer == sample["answer"].upper():
        correct += 1
    total += 1

accuracy = correct / total
print(f"Accuracy on MedMCQA (validation[:100]): {accuracy:.4f}")


In [None]:

print("\n🔍 Qualitative Evaluation:\n")
for i in range(5):
    q = val_data[i]
    options = f"A. {q['opa']} B. {q['opb']} C. {q['opc']} D. {q['opd']}"
    prompt = f"### Instruction:\n{q['question']}\nOptions: {options}\n\n### Response:\n"
    generated = pipe(prompt, max_new_tokens=20, do_sample=False)[0]['generated_text']
    print(f"Q: {q['question']}")
    print(f"Model: {generated.split('### Response:')[-1].strip()}")
    print(f"Answer: {q['answer']}")
    print("-" * 50)
