# **Parameter-Efficient Fine-Tuning of LLaMA 3.2 for Medical Reasoning**

## **1.0 Environment Setup**

In [None]:
%%capture

!pip install unsloth
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
!pip install rouge_score

In [None]:
# Import Libraries
from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset, DatasetDict
from huggingface_hub import login
import wandb
import numpy as np
from rouge_score import rouge_scorer

## **2.0 Logging into Hugging Face and Weights & Biases**

In [None]:
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from unsloth import is_bfloat16_supported
!pip install trl datasets wandb huggingface_hub

from huggingface_hub import login
from transformers import TrainingArguments
from datasets import load_dataset

import wandb


In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never commit an access token to the notebook: take it from the HF_TOKEN
# environment variable, and fall back to an interactive prompt (input is not echoed).
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)


In [None]:
import os
from getpass import getpass

# Never commit the API key to the notebook: reuse WANDB_API_KEY when it is already
# set, otherwise prompt for it (input is not echoed) and export it for this process.
if not os.environ.get("WANDB_API_KEY"):
    os.environ["WANDB_API_KEY"] = getpass("Weights & Biases API key: ")

wandb.login(key=os.environ["WANDB_API_KEY"])

## **3.0 Loading Model and Tokenizer**

In [None]:
# Sequence length handed to the loader here and to the trainer later in the notebook.
max_seq_length = 2048
# None is forwarded as-is; presumably Unsloth then picks the dtype itself — confirm in its docs.
dtype = None
# Ask the loader for 4-bit weights.
load_in_4bit = True

# Load the Llama 3.2 3B Instruct checkpoint together with its tokenizer through Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

## **4.0 Dataset Preparation**

In [None]:
# Load the English configuration of the medical reasoning SFT dataset.
dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en", split="train")

# Hold out a fixed number of examples for validation; the rest is used for training.
validation_size = 100
if len(dataset) <= validation_size:
    raise ValueError(
        f"Dataset has {len(dataset)} rows; need more than {validation_size} to build a train/validation split."
    )
train_size = len(dataset) - validation_size

# Shuffle with a seeded generator so the same rows land in the validation set on
# every Restart & Run All; otherwise the pre-/post-fine-tuning ROUGE scores of
# different runs would be computed on different examples. 3407 matches the seed
# used for the LoRA adapters and the trainer.
split_rng = np.random.default_rng(3407)
indices = split_rng.permutation(len(dataset))
train_indices, val_indices = indices[:train_size], indices[train_size:]

train_dataset = dataset.select(train_indices)
val_dataset = dataset.select(val_indices)

dataset_dict = DatasetDict({"train": train_dataset, "validation": val_dataset})

def formatting_prompts_func(examples):
    """Render a batch of dataset rows into chat-formatted training strings.

    Args:
        examples: Batched mapping with the columns "Question", "Complex_CoT"
            and "Response", each holding one entry per row.

    Returns:
        dict with a single key "text": one string per row, produced by the
        module-level tokenizer's chat template from a user turn (the question)
        and an assistant turn that wraps the chain of thought in <think> tags
        and the final answer in <response> tags.
    """
    columns = zip(examples["Question"], examples["Complex_CoT"], examples["Response"])
    rendered = [
        tokenizer.apply_chat_template(
            [
                {"role": "user", "content": question},
                {"role": "assistant", "content": f"<think>{reasoning}</think>\n<response>{answer}</response>"},
            ],
            tokenize=False,
            add_generation_prompt=False,
        )
        for question, reasoning, answer in columns
    ]
    return {"text": rendered}

# Add the rendered "text" column to both splits, processing rows in batches
# across two worker processes.
dataset_dict = dataset_dict.map(
    formatting_prompts_func,
    batched=True,
    num_proc=2,
)

## **5.0 Fine-Tuning Setup with LoRA**

In [None]:
# Wrap the loaded base model with LoRA adapters through Unsloth's PEFT helper.
# NOTE(review): the per-argument notes below follow the usual LoRA/PEFT meaning of
# these names — confirm against the installed Unsloth version.
model_lora = FastLanguageModel.get_peft_model(
    model,
    r=16,  # adapter rank
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],  # layers that receive adapters
    lora_alpha=16,  # adapter scaling factor
    lora_dropout=0,  # no dropout on the adapter path
    bias="none",  # bias terms are left untouched
    use_gradient_checkpointing="unsloth",
    random_state=3407,  # same value as the trainer's `seed`
    use_rslora=False,
    loftq_config=None,
)

## **6.0 Compute ROUGE-L Scores Before Fine-Tuning**

In [None]:
def compute_rouge_l(dataset, model, tokenizer, num_samples=10):
    """Average ROUGE-L F-measure of generated answers over the first rows of a dataset.

    For each selected row the "Question" field is sent to the model as a single
    user turn, and the newly generated text is scored against the row's
    "Response" field.

    Args:
        dataset: Dataset exposing "Question" and "Response" columns and
            supporting ``len()`` and ``.select()``.
        model: Causal language model with ``eval()`` and ``generate()``.
        tokenizer: Tokenizer providing ``apply_chat_template`` and ``decode``.
        num_samples: Upper bound on the number of rows scored, taken from the
            start of ``dataset``.

    Returns:
        Mean ROUGE-L F-measure over the scored rows, or NaN when no rows are
        selected.
    """
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    sample_count = min(num_samples, len(dataset))
    if sample_count <= 0:
        # Nothing to score; avoid np.mean([]) and its RuntimeWarning.
        return float("nan")

    scores = []
    model.eval()
    for example in dataset.select(range(sample_count)):
        question = example["Question"]
        ground_truth = example["Response"]
        convo = [{"role": "user", "content": question}]
        # add_generation_prompt=True appends the assistant header so the model
        # answers the question instead of continuing the user turn.
        prompt = tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
        # The rendered template already carries its special tokens; do not let the
        # tokenizer prepend a second BOS.
        inputs = tokenizer([prompt], return_tensors="pt", add_special_tokens=False).to("cuda")
        outputs = model.generate(**inputs, max_new_tokens=1200, use_cache=True)
        # generate() returns prompt + continuation; score only the continuation so
        # the echoed question does not leak into the ROUGE computation.
        prompt_length = inputs["input_ids"].shape[1]
        prediction = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        # NOTE(review): after fine-tuning the prediction also contains the <think>
        # section while the reference is only the "Response" field — consider
        # scoring just the <response> span if that is the intended comparison.
        score = scorer.score(ground_truth, prediction)['rougeL'].fmeasure
        scores.append(score)
    return np.mean(scores)

# Baseline: score the LoRA-wrapped model on validation rows before any training
# (compute_rouge_l scores at most num_samples=10 rows by default).
pre_finetune_rouge = compute_rouge_l(dataset_dict["validation"], model_lora, tokenizer)
# Open the W&B run that the baseline metric is logged to.
run = wandb.init(
    project='Fine-tune-Llama-3.2-3B-Instruct-on-Medical-COT-Dataset',
    job_type="training",
    anonymous="allow"
)
wandb.log({"pre_finetune_rouge_l": pre_finetune_rouge})


## **7.0 Initialize Fine-Tuning Trainer**

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Supervised fine-tuning trainer over the rendered "text" column of both splits.
trainer = SFTTrainer(
    model=model_lora,
    tokenizer=tokenizer,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["validation"],
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,

    args=TrainingArguments(
        # 4 sequences per device x 4 accumulation steps per optimizer update.
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        # Training length is defined by max_steps alone. The previous
        # num_train_epochs=3 was removed because a positive max_steps overrides
        # it, so it had no effect and misrepresented the schedule.
        max_steps=1000,
        learning_rate=2e-4,
        # Use bf16 where the GPU supports it, fp16 otherwise.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",

        report_to="wandb",
        run_name="llama3-3B-lr2e-4-steps1000",
        # Checkpoint and evaluate every 50 steps; keep only the latest checkpoint.
        save_strategy="steps",
        save_steps=50,
        save_total_limit=1,
        eval_strategy="steps",
        eval_steps=50,

        # NOTE(review): hub_model_id is set but push_to_hub is not enabled here, so
        # the trainer presumably uploads nothing; the explicit push_to_hub calls
        # later in the notebook use different repo ids — confirm which is intended.
        hub_model_id="AshProbably/LLaMA3.2B-MedCOT",
        logging_first_step=True,
    ),
)


## **9.0 Model Training**

In [None]:
# Run fine-tuning; the value returned by train() is kept for later inspection.
trainer_stats = trainer.train()

## **10.0 Compute ROUGE-L Scores After Fine-Tuning**

In [None]:
# Re-score the same validation split with the same helper after training and log
# the result next to the pre-fine-tuning baseline.
post_finetune_rouge = compute_rouge_l(dataset_dict["validation"], model_lora, tokenizer)
wandb.log({"post_finetune_rouge_l": post_finetune_rouge})

## **11.0 Save Fine-Tuned Model and Tokenizer**

In [None]:
# Write the LoRA-wrapped model and the tokenizer to the local "lora_adapters" directory
# (for a PEFT-wrapped model this presumably stores only the adapter weights — confirm).
model_lora.save_pretrained("lora_adapters")
tokenizer.save_pretrained("lora_adapters")

## **12.0 Upload to Hugging Face**

In [None]:
import os

# SECURITY: an access token was previously hard-coded in this cell. Treat it as
# leaked and revoke it in the Hugging Face account settings. The token is now read
# from the HF_TOKEN environment variable; when it is unset, None is passed and the
# credentials stored by the earlier login() call are used instead.
hub_token = os.environ.get("HF_TOKEN")

model_lora.push_to_hub("AshProbably/medcot-llama3.2-3b-model", token=hub_token)
tokenizer.push_to_hub("AshProbably/medcot-llama3.2-3b-tokenizer", token=hub_token)

## **13.0 Run Model Inference**

In [None]:
def generate_response(question, model, tokenizer):
    """Generate an answer to a single question and return the final response text.

    Args:
        question: User question sent as the only turn of the conversation.
        model: Model passed to ``FastLanguageModel.for_inference`` and then used
            for generation.
        tokenizer: Tokenizer providing ``apply_chat_template`` and ``decode``.

    Returns:
        The text between the first "<response>" tag and the following
        "</response>" tag of the generated continuation, or the whole
        continuation when no "<response>" tag is present.
    """
    FastLanguageModel.for_inference(model)
    convo = [{"role": "user", "content": question}]
    # add_generation_prompt=True appends the assistant header so the model answers
    # the question instead of continuing the user turn.
    prompt = tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
    # The rendered template already carries its special tokens; do not let the
    # tokenizer prepend a second BOS.
    inputs = tokenizer([prompt], return_tensors="pt", add_special_tokens=False).to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=1200, use_cache=True)
    # Decode only the newly generated tokens so the echoed prompt is never returned
    # (previously the fallback branch returned the question together with the answer).
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    if "<response>" not in response:
        return response
    return response.split("<response>", 1)[1].split("</response>", 1)[0]

# Sample clinical question used to try out the fine-tuned model.
question = """A 61-year-old woman with a long history of involuntary urine loss during activities like coughing or sneezing but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings, what would cystometry most likely reveal about her residual volume and detrusor contractions?"""
# Generate with the LoRA-wrapped model and print the extracted response text.
response = generate_response(question, model_lora, tokenizer)
print(f"Response: {response}")

## **14.0 Instructions for Loading and Using the Fine-Tuned Model**

In [None]:
# Close the W&B run opened earlier in the notebook.
wandb.finish()