# LoRA Fine-tuning for NER Label Generation

This notebook fine-tunes the Qwen2.5-0.5B-Instruct model with LoRA (Low-Rank Adaptation) to generate NER labels, treating entity extraction as a text-generation task.


In [1]:
# Imports
import json
import os

import torch
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTConfig
from trl import SFTTrainer


In [2]:
# Load processed data
def _read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return its contents."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


train_data = _read_json("outputs/data/train_instruction_data.json")
val_data = _read_json("outputs/data/val_instruction_data.json")

print(f"Loaded {len(train_data)} training examples")
print(f"Loaded {len(val_data)} validation examples")

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)

# Show sample from new JSON format: the first 100 characters of each field
# of the first training record, always followed by "...".
print("\nSample training example:")
sample = train_dataset[0]
for label, field in (
    ("Instruction", "instruction"),
    ("Input", "input"),
    ("Output", "output"),
):
    preview = sample[field][:100]
    print(f"{label}: {preview}...")


Loaded 3 training examples
Loaded 2 validation examples

Sample training example:
Instruction: From TOKENS, return JSON: {"PER":[[...]],"LOC":[[...]],"ORG":[[...]],"MISC":[[...]]}; each mention i...
Input: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']...
Output: {"PER":[],"LOC":[],"ORG":[["EU"]],"MISC":[["German"],["British"]]}...


In [3]:
# Load model and tokenizer
model_path = "models/Qwen2.5-0.5B-Instruct"

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Fall back to EOS for padding only when the tokenizer has no pad token of its
# own. Overwriting an existing pad token with EOS makes every EOS in the
# training text indistinguishable from padding; collators that mask padding out
# of the loss (e.g. transformers' DataCollatorForLanguageModeling) would then
# also mask EOS, so the model would never be trained to stop generating.
# NOTE(review): the Qwen2.5 tokenizer is expected to ship a dedicated pad
# token - confirm by inspecting tokenizer.pad_token after loading.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Pad on the right so real tokens keep their positions during training.
tokenizer.padding_side = "right"

print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",  # let the library decide device placement
    trust_remote_code=True,
    torch_dtype=torch.float16,  # load the weights in half precision
)

print(f"Model loaded: {model.__class__.__name__}")


Loading tokenizer...
Loading model...
Model loaded: Qwen2ForCausalLM


In [4]:
# Configure LoRA (Low-Rank Adaptation)
# Adapters are attached to every linear projection listed below; the two
# groups are concatenated in the same order as a single flat list.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,  # LoRA rank
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=attention_projections + mlp_projections,
)

# Apply LoRA adapter
print("Applying LoRA configuration...")
model = get_peft_model(model, lora_config)

# Count trainable parameters in a single pass over the wrapped model
trainable_params = 0
all_params = 0
for param in model.parameters():
    param_count = param.numel()
    all_params += param_count
    if param.requires_grad:
        trainable_params += param_count

trainable_pct = 100 * trainable_params / all_params
print(f"Trainable params: {trainable_params:,} || All params: {all_params:,} || Trainable%: {trainable_pct:.2f}")


Applying LoRA configuration...
Trainable params: 8,798,208 || All params: 502,830,976 || Trainable%: 1.75


In [5]:
# Training configuration
# SFTConfig is TRL's TrainingArguments subclass that also carries the
# SFT-specific options. Setting max_seq_length here instead of passing it to
# SFTTrainer avoids the "Deprecated positional argument(s) used in SFTTrainer,
# please use the SFTConfig" warning that this notebook's TRL release emits.
# NOTE(review): newer TRL releases rename some of these options (e.g. the
# sequence-length and tokenizer arguments) - re-check when upgrading TRL.
training_args = SFTConfig(
    output_dir="outputs/checkpoints",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,  # effective batch of 16 sequences per device
    optim="adamw_torch",
    save_steps=500,
    logging_steps=50,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    max_grad_norm=1.0,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    eval_strategy="steps",
    eval_steps=500,  # kept equal to save_steps so evaluations line up with checkpoints
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    report_to="none",
    max_seq_length=512,  # previously passed directly to SFTTrainer
)

# Define how to format data for training
def formatting_func(example):
    """Return the text field for training.

    Args:
        example: Mapping-like dataset record exposing a ``"text"`` entry.
            Assumes every record carries a pre-rendered ``"text"`` field in
            addition to ``instruction``/``input``/``output`` - TODO confirm
            against the data preparation step.
            NOTE(review): depending on the TRL version this may be called
            with a batch of records, in which case the value is a list of
            strings - confirm.

    Returns:
        The value stored under ``"text"``, unchanged.

    Raises:
        KeyError: If ``example`` has no ``"text"`` entry.
    """
    return example["text"]

# Initialize trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    formatting_func=formatting_func,
)

print("Trainer initialized successfully!")



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Trainer initialized successfully!


In [6]:
# Train the model
print("Starting training...")
trainer.train()

# Save the final fine-tuned model
# The model and the tokenizer are written to the same directory.
final_model_dir = "outputs/final_model"
print("\nSaving the final model...")
os.makedirs(final_model_dir, exist_ok=True)
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

print("Training completed and model saved!")


Starting training...


  0%|          | 0/3 [00:00<?, ?it/s]

{'train_runtime': 1.0856, 'train_samples_per_second': 8.29, 'train_steps_per_second': 2.763, 'train_loss': 0.6317812999089559, 'epoch': 3.0}

Saving the final model...
Training completed and model saved!


In [7]:
# Save training history and show final metrics
history = trainer.state.log_history

# Save training history
os.makedirs("outputs/results", exist_ok=True)
# Explicit UTF-8 so the file is written with the same encoding the input JSON
# files are read with, independent of the platform default.
with open("outputs/results/training_history.json", "w", encoding="utf-8") as f:
    json.dump(history, f, indent=2)

# Display final metrics
# Only the most recent log entry is shown; non-numeric values are skipped.
if history:
    final_metrics = history[-1]
    print("\nFinal training metrics:")
    for key, value in final_metrics.items():
        if isinstance(value, int):
            # Counters such as the step number are shown without decimals.
            print(f"  {key}: {value}")
        elif isinstance(value, float):
            print(f"  {key}: {value:.4f}")



Final training metrics:
  train_runtime: 1.0856
  train_samples_per_second: 8.2900
  train_steps_per_second: 2.7630
  total_flos: 1999961729280.0000
  train_loss: 0.6318
  epoch: 3.0000
  step: 3.0000
