# Supervised Fine-tuning Hermes-2-Pro-Llama-3-8B using Low-Rank Adaptation (LoRA)

## Libraries Import

In [1]:
!pip install --upgrade huggingface_hub



In [2]:
!pip install "transformers>=4.36" "datasets>=2.16" "accelerate>=0.26" "peft>=0.17" "huggingface_hub>=0.19" "trl>=0.7"



In [3]:
import trl, transformers, peft, accelerate

# Report the installed version of each core library, one per line, in the
# order: trl, transformers, peft, accelerate.
for _lib in (trl, transformers, peft, accelerate):
    print(_lib.__version__)

0.21.0
4.55.0
0.17.0
1.10.0


In [4]:
!pip uninstall transformers -y

Found existing installation: transformers 4.55.0
Uninstalling transformers-4.55.0:
  Successfully uninstalled transformers-4.55.0


In [5]:
!pip uninstall -y trl
!pip install "trl==0.4.7"

Found existing installation: trl 0.21.0
Uninstalling trl-0.21.0:
  Successfully uninstalled trl-0.21.0
Collecting trl==0.4.7
  Using cached trl-0.4.7-py3-none-any.whl.metadata (10 kB)
Collecting transformers>=4.18.0 (from trl==0.4.7)
  Using cached transformers-4.55.0-py3-none-any.whl.metadata (39 kB)
Using cached trl-0.4.7-py3-none-any.whl (77 kB)
Using cached transformers-4.55.0-py3-none-any.whl (11.3 MB)
Installing collected packages: transformers, trl
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [trl][32m1/2[0m [trl]
Successfully installed transformers-4.55.0 trl-0.4.7


In [6]:
!pip install torch torchvision torchaudio
!pip install --upgrade transformers datasets accelerate peft trl

Collecting trl
  Using cached trl-0.21.0-py3-none-any.whl.metadata (11 kB)
Using cached trl-0.21.0-py3-none-any.whl (511 kB)
Installing collected packages: trl
  Attempting uninstall: trl
    Found existing installation: trl 0.4.7
    Uninstalling trl-0.4.7:
      Successfully uninstalled trl-0.4.7
Successfully installed trl-0.21.0


In [7]:
!pip install --upgrade torch torchvision torchaudio

# installing latest stable 
!pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1

Collecting torch
  Using cached torch-2.8.0-cp310-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting torchvision
  Using cached torchvision-0.23.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.1 kB)
Collecting torchaudio
  Using cached torchaudio-2.8.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (7.2 kB)
Using cached torch-2.8.0-cp310-none-macosx_11_0_arm64.whl (73.6 MB)
Using cached torchvision-0.23.0-cp310-cp310-macosx_11_0_arm64.whl (1.9 MB)
Using cached torchaudio-2.8.0-cp310-cp310-macosx_11_0_arm64.whl (1.9 MB)
Installing collected packages: torch, torchvision, torchaudio
[2K  Attempting uninstall: torch
[2K    Found existing installation: torch 2.3.1
[2K    Uninstalling torch-2.3.1:
[2K      Successfully uninstalled torch-2.3.1━━━━━━[0m [32m0/3[0m [torch]
[2K  Attempting uninstall: torchvision━━━━━━━━━━━━━[0m [32m0/3[0m [torch]
[2K    Found existing installation: torchvision 0.18.132m0/3[0m [torch]
[2K    Uninstalling torchvision-0.18.1:━━━━━━━━[0m [32m0/3[0m 

In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset
import os

# Pick the best available accelerator: Apple Metal Performance Shaders (MPS)
# first, then CUDA, otherwise fall back to the CPU. This is the same preference
# order the model-reloading cell of this notebook uses.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

Using device: mps


## Loading Dataset

In [9]:
from datasets import load_dataset

# Load the instruction dataset by its repository id.
# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("sonamtenzey/instruction_dataset-edu-ai")
# Print the returned object so its splits, columns and row counts show up in the cell output.
print("Dataset Loaded", ds)

Dataset Loaded DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 89
    })
})


In [10]:
# ds = ds["train"].train_test_split(test_size=0.1)
# ds['validation'] = ds.pop('test')

## Formatting Dataset for Instruction Tuning

In [11]:
def format_instruction(example, eos_token="</s>"):
    """Flatten one chat-style example into a single instruction-tuning prompt.

    The first ``user`` message becomes the instruction. Any later ``user``
    message overwrites the optional input section (the last one wins), and the
    last ``assistant`` message becomes the response. Messages with any other
    role (e.g. ``system``) are ignored.

    Args:
        example: Mapping with a ``"messages"`` list of ``{"role", "content"}`` dicts.
        eos_token: Terminator appended after the response. Defaults to ``"</s>"``
            for backward compatibility; pass ``tokenizer.eos_token`` so the
            terminator matches the end-of-sequence token of the model in use.

    Returns:
        ``{"text": <formatted prompt>}``, suitable for ``Dataset.map``.
    """
    instruction = ""
    input_text = ""
    response = ""

    for msg in example["messages"]:
        role = msg["role"]
        # A missing (None) content is treated as empty text so that the
        # ``.strip()`` check below cannot fail and "None" is never rendered.
        content = msg["content"] or ""
        if role == "user":
            if not instruction:
                instruction = content
            else:
                input_text = content
        elif role == "assistant":
            response = content

    sections = [f"### Instruction:\n{instruction}\n\n"]
    # The input section is emitted only when it carries non-whitespace text.
    if input_text.strip():
        sections.append(f"### Input:\n{input_text}\n\n")
    sections.append(f"### Response:\n{response}{eos_token}")

    return {"text": "".join(sections)}

# Apply formatting: add a flattened "text" column to every row of the train split.
# NOTE(review): `ds_formatted` is not referenced by the trainer cell further down,
# which is built from ds["train"] directly — confirm whether this formatted copy
# is still needed.
ds_formatted = ds["train"].map(format_instruction)

## Load Model & Tokenizer

In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "NousResearch/Hermes-2-Pro-Llama-3-8B"

# Tokenizer: pad on the right and reuse the EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Base model: load the weights in bfloat16 without a device map, then move the
# whole model onto the selected `device` in a separate step.
base_load_options = dict(
    torch_dtype=torch.bfloat16,
    device_map=None,
    low_cpu_mem_usage=False,
)
model = AutoModelForCausalLM.from_pretrained(model_name, **base_load_options)
model = model.to(device)

# Ask the model to enable gradients on its inputs before adapters are attached.
model.enable_input_require_grads()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

## Adding LoRA (Parameter-Efficient Fine-Tuning)

In [13]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings applied to the q/k/v/o projection modules of the model.
lora_config = LoraConfig(
    r=64,  # rank of the low-rank update matrices
    lora_alpha=16,  # NOTE(review): alpha is smaller than r here; confirm the resulting scaling is intended
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the base model with the adapter; `model` now refers to the wrapped model.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# The recorded run reports 54,525,952 trainable out of 8,085,049,344 total parameters (~0.67%).

The 8-bit optimizer is not available on your device, only available on CUDA for now.


trainable params: 54,525,952 || all params: 8,085,049,344 || trainable%: 0.6744


## Setting up trainer

In [14]:
from transformers import EarlyStoppingCallback
from trl import SFTConfig, SFTTrainer

# Training configuration for the LoRA supervised fine-tuning run.
sft_config = SFTConfig(
    output_dir="./lora-finetuned-hermes",
    logging_dir="./lora-finetuned-hermes/logs",

    # Optimisation schedule
    num_train_epochs=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # 1 sample x 8 accumulation steps per optimizer update
    learning_rate=2e-4,
    optim="adamw_torch",
    max_grad_norm=1.0,
    warmup_steps=2,
    lr_scheduler_type="cosine",
    fp16=False,
    bf16=False,  # Enable if supported (e.g., CUDA/Ampere+)
    seed=42,

    # Logging and checkpointing
    logging_steps=1,
    save_steps=5,
    save_total_limit=3,
    report_to="none",
    disable_tqdm=False,

    # Evaluation and best-checkpoint selection.
    # `load_best_model_at_end` is a boolean switch; the metric that defines "best"
    # is named separately via `metric_for_best_model`. (The original passed the
    # metric name to the boolean flag, which only worked because a non-empty
    # string is truthy.) Lower eval loss is better.
    eval_strategy="steps",
    eval_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    remove_unused_columns=False,
    label_names=["labels"],

    # Dataset processing
    # NOTE(review): `packing` and `max_seq_length` are nested inside dataset_kwargs
    # here; recent TRL releases expose `packing` / `max_length` as top-level
    # SFTConfig fields — verify these nested values are honoured by the installed
    # TRL version.
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": True,
        "packing": False,
        "max_seq_length": 1024,
    },
    # NOTE(review): "messages" is the dataset's chat-transcript column rather than a
    # plain-text column — confirm this setting is what the installed TRL expects.
    dataset_text_field="messages",
    dataset_num_proc=2,
)



In [15]:
# Reuse an existing validation split when the dataset provides one; otherwise
# hold out 15% of the train split (fixed seed, so the split is reproducible).
if "validation" not in ds:
    print("Splitting dataset into train and validation...")
    split = ds["train"].train_test_split(test_size=0.15, seed=42)
    train_dataset = split["train"]
    eval_dataset = split["test"]  # will be used as validation
else:
    train_dataset = ds["train"]
    eval_dataset = ds["validation"]

# Initialize trainer
# The tokenizer configured earlier in this notebook (pad token, padding side) is
# handed over explicitly; without `processing_class` the trainer loads its own
# tokenizer from the model name and the notebook's configuration is ignored.
# Training stops early after 2 consecutive evaluations without improvement.
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

Splitting dataset into train and validation...


## Training the model

In [16]:
# Run the fine-tuning loop configured above; progress and the per-step
# training/validation losses are rendered in the cell output.
print("Starting training...")
trainer.train()

# Save adapter (LoRA weights only; the ~100MB size is the original author's estimate)
trainer.save_model()
print("Model saved to ./lora-finetuned-hermes")

Starting training...


Step,Training Loss,Validation Loss
5,1.6862,1.847971
10,1.0506,1.060169
15,0.7054,0.682192
20,0.6038,0.641366




Model saved to ./lora-finetuned-hermes


In [17]:
# Walk the trainer's log history and report only the entries produced by an
# evaluation pass (those carrying an "eval_loss" value).
eval_entries = (entry for entry in trainer.state.log_history if "eval_loss" in entry)
for entry in eval_entries:
    step = entry.get("step", "??")
    print(f"Step {step}: Eval Loss = {entry['eval_loss']:.4f}")

Step 5: Eval Loss = 1.8480
Step 10: Eval Loss = 1.0602
Step 15: Eval Loss = 0.6822
Step 20: Eval Loss = 0.6414


## Testing the Model

In [18]:
import json


def _extract_assistant_reply(decoded_text, start_token="<|im_start|>assistant", end_token="<|im_end|>"):
    """Return the text between the first assistant header and the next end marker, or None if no header exists."""
    _, header, tail = decoded_text.partition(start_token)
    if not header:
        return None
    # Everything after the header, cut at the first end marker when one is present.
    return tail.strip().split(end_token)[0].strip()


def generate_response(instruction, input_data=None):
    """
    Generate the assistant's reply for an instruction plus optional data payload.

    The prompt is built with the tokenizer's chat template from a fixed system
    message and a user message (presumably mirroring the format used in
    training — verify against the dataset). Relies on the notebook-level
    `tokenizer`, `model` and `device` variables.

    Args:
        instruction: Natural-language task description.
        input_data: Optional JSON-serialisable payload embedded in the user
            message. Only ``None`` means "no data"; empty containers are still
            serialised (e.g. ``[]``).

    Returns:
        The assistant's reply text, or a fixed error string when the decoded
        output contains no assistant header.
    """
    # Format input data as a JSON string if provided. `is not None` (rather than
    # truthiness) keeps empty-but-present payloads such as [] or {} in the prompt.
    data_str = json.dumps(input_data, indent=2) if input_data is not None else ""

    messages = [
        {"role": "system", "content": "You are a specialized AI assistant for Bhutanese schools, designed to generate factual reports from educational data. You must follow these rules:\n1. Your response MUST be a valid JSON object with two keys: 'analysis_text' (string) and 'chart' (object or null).\n2. 'chart' MUST be null if no data visualization is requested, relevant, or possible.\n3. Do NOT make up data, scores, or facts. Only use provided information.\n4. If the question is about you (e.g., 'Who are you?'), briefly describe your role in 'analysis_text' and set 'chart' to null.\n5. Never generate markdown, code blocks, or explanations outside JSON."},
        {"role": "user", "content": f"Instruction: {instruction}\n\nData:\n{data_str}"},
    ]

    # Render the conversation to text and open the assistant turn.
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # The rendered template already carries whatever special tokens it defines;
    # add_special_tokens=False stops the tokenizer from adding them a second time
    # (e.g. a duplicate BOS), keeping inference tokenization in line with
    # template-based tokenization.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=2048,
        add_special_tokens=False,
    ).to(device)

    # Make sure dropout (including LoRA dropout) is disabled: greedy decoding is
    # only deterministic in eval mode.
    model.eval()
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        temperature=None,
        top_p=None,
    )

    # Special tokens are kept so the assistant header/end markers can be located.
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # Extract only assistant's response
    reply = _extract_assistant_reply(full_response)
    if reply is None:
        return "Model did not generate a valid assistant response."
    return reply

# Test case: key performance indicators for two schools, passed as a list of records.
instruction = "Create a table to compare the key performance indicators of Druk School and Ugyen Academy."
input_data = [
    dict(school="Druk School", avg_score=88, pass_rate=95, attendance=97),
    dict(school="Ugyen Academy", avg_score=91, pass_rate=98, attendance=96),
]

# Run the model once on the sample and show its reply.
print("--- Testing Model ---")
response = generate_response(instruction, input_data)
print(response)

--- Testing Model ---


  test_elements = torch.tensor(test_elements)


{"analysis_text": "Here is a table comparing the key performance indicators of Druk School and Ugyen Academy:", "chart": {"type": "table", "data": {"columns": [{"label": "School", "type": "string"}, {"label": "Average Score", "type": "number"}, {"label": "Pass Rate", "type": "number"}, {"label": "Attendance", "type": "number"}], "rows": [{"data": ["Druk School", 88, 95, 97]}, {"data": ["Ugyen Academy", 91, 98, 96]}]}}}


## Reloading Model

In [1]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import os

# Define device: prefer Apple MPS, then CUDA, otherwise CPU.
device = (
    "mps" if torch.backends.mps.is_available() else
    "cuda" if torch.cuda.is_available() else
    "cpu"
)
print(f"Using device: {device}")

# Paths
model_name = "NousResearch/Hermes-2-Pro-Llama-3-8B"
lora_path = "./lora-finetuned-hermes"
offload_folder = "./offload"
merged_model_path = "./merged-hermes-2-pro-lora"
os.makedirs(offload_folder, exist_ok=True)
os.makedirs(merged_model_path, exist_ok=True)

# Tokenizer: Load from base model, not LoRA
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"  # Important for generation

# Set dtype based on device.
# The original computed a dtype here but then hard-coded bfloat16 in the load
# call, so the computed value was never used. bfloat16 (the dtype the base model
# was actually loaded with) stays the default on every device; float16 is used
# only on CUDA devices that report no bfloat16 support.
if device == "cuda" and not torch.cuda.is_bf16_supported():
    torch_dtype = torch.float16
else:
    torch_dtype = torch.bfloat16

# Load the base model without a device map, then move it to `device` explicitly.
print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch_dtype,
    device_map=None,
    low_cpu_mem_usage=False,
).to(device)

# Apply LoRA adapter
print("Applying LoRA adapter...")
model = PeftModel.from_pretrained(
    model,
    lora_path,
    offload_folder=offload_folder,
)
# Reported here, once loading is actually complete (previously printed after saving).
print("Model and tokenizer loaded successfully!")

# Merge LoRA weights into the base weights and drop the adapter wrappers.
print("Merging LoRA weights into base model...")
model = model.merge_and_unload()

# Save the merged model and tokenizer side by side so the folder is self-contained.
print(f"Saving merged model and tokenizer to {merged_model_path}...")
model.save_pretrained(merged_model_path)
tokenizer.save_pretrained(merged_model_path)

print(f"Model and tokenizer saved successfully to {merged_model_path}!")

Using device: mps
Loading base model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The 8-bit optimizer is not available on your device, only available on CUDA for now.


Applying LoRA adapter...
Merging LoRA weights into base model...
Saving merged model and tokenizer to ./merged-hermes-2-pro-lora...
Model and tokenizer loaded successfully!
Model and tokenizer saved successfully to ./merged-hermes-2-pro-lora!


In [3]:
import json


def _extract_assistant_reply(decoded_text, start_token="<|im_start|>assistant", end_token="<|im_end|>"):
    """Return the text between the first assistant header and the next end marker, or None if no header exists."""
    _, header, tail = decoded_text.partition(start_token)
    if not header:
        return None
    # Everything after the header, cut at the first end marker when one is present.
    return tail.strip().split(end_token)[0].strip()


def generate_response(instruction, input_data=None):
    """
    Generate the assistant's reply for an instruction plus optional data payload.

    The prompt is built with the tokenizer's chat template from a fixed system
    message and a user message (presumably mirroring the format used in
    training — verify against the dataset). Relies on the notebook-level
    `tokenizer`, `model` and `device` variables.

    Args:
        instruction: Natural-language task description.
        input_data: Optional JSON-serialisable payload embedded in the user
            message. Only ``None`` means "no data"; empty containers are still
            serialised (e.g. ``[]``).

    Returns:
        The assistant's reply text, or a fixed error string when the decoded
        output contains no assistant header.
    """
    # Format input data as a JSON string if provided. `is not None` (rather than
    # truthiness) keeps empty-but-present payloads such as [] or {} in the prompt.
    data_str = json.dumps(input_data, indent=2) if input_data is not None else ""

    messages = [
        {"role": "system", "content": "You are a specialized AI assistant for Bhutanese schools, designed to generate factual reports from educational data. You must follow these rules:\n1. Your response MUST be a valid JSON object with two keys: 'analysis_text' (string) and 'chart' (object or null).\n2. 'chart' MUST be null if no data visualization is requested, relevant, or possible.\n3. Do NOT make up data, scores, or facts. Only use provided information.\n4. If the question is about you (e.g., 'Who are you?'), briefly describe your role in 'analysis_text' and set 'chart' to null.\n5. Never generate markdown, code blocks, or explanations outside JSON."},
        {"role": "user", "content": f"Instruction: {instruction}\n\nData:\n{data_str}"},
    ]

    # Render the conversation to text and open the assistant turn.
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # The rendered template already carries whatever special tokens it defines;
    # add_special_tokens=False stops the tokenizer from adding them a second time
    # (e.g. a duplicate BOS), keeping inference tokenization in line with
    # template-based tokenization.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=2048,
        add_special_tokens=False,
    ).to(device)

    # Make sure dropout is disabled: greedy decoding is only deterministic in eval mode.
    model.eval()
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        temperature=None,
        top_p=None,
    )

    # Special tokens are kept so the assistant header/end markers can be located.
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # Extract only assistant's response
    reply = _extract_assistant_reply(full_response)
    if reply is None:
        return "Model did not generate a valid assistant response."
    return reply

# Test case: per-student attendance and subject scores for one school term.
instruction = "Generate a detailed and diagrammatic report for Dagana Primary School based on the provided student data"

# Each row lists a student's values in the same order as `student_fields`, which
# is also the key order of the resulting student records.
student_fields = ("student_name", "present", "total_days", "math", "english", "dzongkha", "science")
student_rows = [
    ("Tashi", 76, 88, 78.5, 69.2, 82.0, 74.3),
    ("Dorji", 68, 88, 63.7, 71.5, 76.8, 60.0),
    ("Nima", 84, 88, 85.0, 80.4, 78.9, 82.1),
]
input_data = {
    "school": "Dagana Primary School",
    "academic_term": "First Term, 2025",
    "students": [dict(zip(student_fields, row)) for row in student_rows],
}

# Run the model once on the sample and show its reply.
print("--- Testing Model ---")
response = generate_response(instruction, input_data)
print(response)

--- Testing Model ---
{"analysis_text": "Dagana Primary School's student performance in First Term, 2025, is detailed below.", "chart": {"type": "bar", "title": "Subject-wise Performance of Students", "data": {"labels": ["Tashi", "Dorji", "Nima"], "datasets": [{"label": "Math", "data": [78.5, 63.7, 85.0]}, {"label": "English", "data": [69.2, 71.5, 80.4]}, {"label": "Dzongkha", "data": [82.0, 76.8, 78.9]}, {"label": "Science", "data": [74.3, 60.0, 82.1]}]}}}
