

# Complete QLoRA Fine-tuning Script for Qwen2.5-7B
Uses 4-bit quantization with PEFT/LoRA adapters
Optimized for Google Colab with single GPU


In [None]:
# Install/upgrade (quietly) the libraries this notebook imports below.
!pip install -q -U transformers datasets peft bitsandbytes accelerate trl
# Additional packages: sentencepiece and protobuf.
!pip install -q -U sentencepiece protobuf

In [None]:
import torch
import json
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    PeftModel
)

from trl import SFTTrainer
import warnings
warnings.filterwarnings('ignore')

In [None]:
# --- Base model and output locations ---
MODEL_NAME = "Qwen/Qwen2.5-7B"
OUTPUT_DIR = "./qwen2.5-7b-qlora-finetuned"
ADAPTER_DIR = "./qwen2.5-7b-qlora-adapter"

# --- LoRA adapter settings (rank, scaling, dropout, adapted projections) ---
LORA_R, LORA_ALPHA, LORA_DROPOUT = 64, 16, 0.05
TARGET_MODULES = ["q_proj", "v_proj"]

# --- Training hyperparameters ---
MAX_LENGTH = 512
BATCH_SIZE, GRADIENT_ACCUMULATION_STEPS = 4, 4
LEARNING_RATE = 2e-4
NUM_EPOCHS = 3
WARMUP_STEPS = 100
LOGGING_STEPS = 10
SAVE_STEPS = 100

# --- Dataset selection ---
# OpenOrca-Slim or OpenOrca-Platypus2 can be substituted here.
DATASET_NAME = "Open-Orca/OpenOrca"
# Cap on the number of rows used, to keep Colab runs short.
MAX_SAMPLES = 5000

# Report which accelerator (if any) this session is running on.
_cuda_ready = torch.cuda.is_available()
_device_label = torch.cuda.get_device_name(0) if _cuda_ready else 'CPU'
print(f"Using device: {_device_label}")
print(f"CUDA available: {_cuda_ready}")

In [None]:
# Quantization settings used when the base model is loaded:
# 4-bit NF4 weights, bfloat16 compute, and nested (double) quantization.
_quant_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
    "bnb_4bit_use_double_quant": True,
}
bnb_config = BitsAndBytesConfig(**_quant_settings)

print("✓ 4-bit quantization config created")

In [None]:
print(f"Loading model: {MODEL_NAME}...")

# Tokenizer for the base checkpoint, configured to pad on the right.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, trust_remote_code=True, padding_side="right"
)

# Batching needs a pad token; fall back to EOS when none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Base model, loaded with the 4-bit quantization config and automatic
# device placement.
_model_load_kwargs = {
    "quantization_config": bnb_config,
    "device_map": "auto",
    "trust_remote_code": True,
    "torch_dtype": torch.bfloat16,
}
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **_model_load_kwargs)


In [None]:
# Turn the generation cache off for training, which helps save memory.
_model_cfg = model.config
_model_cfg.use_cache = False
_model_cfg.pretraining_tp = 1  # Tensor parallelism setting

print("✓ Model and tokenizer loaded successfully")
_where, _precision = model.device, model.dtype
print(f"Model device: {_where}")
print(f"Model dtype: {_precision}")

In [None]:
# Let PEFT ready the quantized model for k-bit training before any
# adapters are attached.
model = prepare_model_for_kbit_training(model)

# LoRA adapter definition, driven by the constants from the config cell.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",          # causal language modelling
    r=LORA_R,                       # rank of the low-rank update matrices
    lora_alpha=LORA_ALPHA,          # scaling factor
    lora_dropout=LORA_DROPOUT,      # dropout probability
    target_modules=TARGET_MODULES,  # modules that receive adapters
    bias="none",                    # bias parameters stay untouched
)

In [None]:
# Wrap the base model so that the LoRA adapters are attached.
model = get_peft_model(model, lora_config)

# Tally total vs. trainable parameter counts for the summary line below.
_param_sizes = [
    (tensor.numel(), tensor.requires_grad)
    for _name, tensor in model.named_parameters()
]
all_params = sum(size for size, _ in _param_sizes)
trainable_params = sum(size for size, needs_grad in _param_sizes if needs_grad)

print("✓ LoRA adapters applied")
print(f"Trainable params: {trainable_params:,} || All params: {all_params:,} || Trainable%: {100 * trainable_params / all_params:.2f}%")


In [None]:
# Local import: only this cell needs the in-memory Dataset class.
from datasets import Dataset

print(f"\nLoading dataset: {DATASET_NAME}...")

# Load OpenOrca dataset from Hugging Face
# Options: "Open-Orca/OpenOrca", "Open-Orca/OpenOrca-Slim", or custom
#
# Stream the split and keep only the first MAX_SAMPLES rows instead of
# downloading and materialising the whole dataset just to discard most of
# it. `take` stops early, and also copes with splits shorter than
# MAX_SAMPLES (it simply yields every row).
_streamed_rows = load_dataset(DATASET_NAME, split="train", streaming=True).take(MAX_SAMPLES)

# Materialise the subset as a regular Dataset so that len(), map() and
# train_test_split() keep working in the following cells.
dataset = Dataset.from_list(list(_streamed_rows))

print(f"✓ Dataset loaded: {len(dataset)} samples")
print(f"Dataset columns: {dataset.column_names}")

In [None]:
def format_chat_template(example):
    """
    Format one dataset example using the specified chat template:
    <|system|>SYSTEM_MESSAGE</|system|>
    <|user|>USER_MESSAGE</|user|>
    <|assistant|>ASSISTANT_RESPONSE</|assistant|>

    Args:
        example: Mapping with the OpenOrca-style keys 'system_prompt',
            'question' and 'response'. Any of them may be missing, None
            or empty.

    Returns:
        A dict with a single "text" key holding the formatted conversation.
    """
    # OpenOrca format: 'system_prompt', 'question', 'response'.
    # `or` (rather than a .get default) also covers keys that are present
    # but empty/None, so the system turn is never blank and None never
    # ends up rendered as the literal string "None".
    system_msg = example.get('system_prompt') or 'You are a helpful assistant.'
    user_msg = example.get('question') or ''
    assistant_msg = example.get('response') or ''

    # Create formatted text
    formatted_text = (
        f"<|system|>{system_msg}</|system|>\n"
        f"<|user|>{user_msg}</|user|>\n"
        f"<|assistant|>{assistant_msg}</|assistant|>"
    )

    return {"text": formatted_text}

# Rewrite every row into a single "text" column via the chat template;
# the original columns are dropped in the same pass.
print("Formatting dataset with chat template...")
_source_columns = dataset.column_names
formatted_dataset = dataset.map(
    format_chat_template,
    desc="Formatting dataset",
    remove_columns=_source_columns,
)

In [None]:
# Hold out 10% of the formatted rows for evaluation (seeded split).
split_dataset = formatted_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = split_dataset["train"], split_dataset["test"]

print("✓ Dataset formatted")
print(f"Train samples: {len(train_dataset)}")
print(f"Eval samples: {len(eval_dataset)}")

# Show the first 300 characters of one training example as a sanity check.
_preview = train_dataset[0]['text'][:300]
print(f"\nExample formatted text:\n{_preview}...\n")

In [None]:
# The install cell upgrades transformers/trl to their latest releases, so
# this cell targets the current API:
#   * `eval_strategy` replaces the old `evaluation_strategy` argument.
#   * SFT data options (text field, max length, packing) live on SFTConfig,
#     TRL's TrainingArguments subclass, instead of being SFTTrainer kwargs.
#   * The tokenizer is passed to the trainer as `processing_class`.
# Local import: SFTConfig is only needed here.
from trl import SFTConfig

training_args = SFTConfig(
    output_dir=OUTPUT_DIR,

    # Training hyperparameters
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,

    # Optimizer settings
    optim="paged_adamw_8bit",  # 8-bit optimizer for memory efficiency
    warmup_steps=WARMUP_STEPS,
    weight_decay=0.01,
    max_grad_norm=1.0,

    # Precision settings
    bf16=True,  # Use bfloat16 mixed precision
    fp16=False,

    # Logging and saving
    logging_steps=LOGGING_STEPS,
    logging_dir=f"{OUTPUT_DIR}/logs",
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    save_total_limit=3,

    # Evaluation (same cadence as checkpointing, which
    # load_best_model_at_end requires)
    eval_strategy="steps",
    eval_steps=SAVE_STEPS,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",

    # Memory optimization
    gradient_checkpointing=True,

    # Supervised fine-tuning data options
    dataset_text_field="text",  # Field containing the formatted text
    max_length=MAX_LENGTH,  # Truncate tokenized examples to this length
    packing=False,  # Don't pack multiple samples together

    # Other settings
    report_to="none",  # Disable wandb/tensorboard for simplicity
    seed=42,
)

print("✓ Training arguments configured")

# Use SFTTrainer (Supervised Fine-Tuning Trainer) from TRL.
# `model` already carries the LoRA adapters, so no peft_config is passed.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
)

print("✓ Trainer initialized")

In [None]:
_rule = "=" * 60  # horizontal rule framing the banners

print("\n" + _rule)
print("STARTING TRAINING")
print(_rule + "\n")

# Run the fine-tuning loop.
trainer.train()

print("\n" + _rule)
print("TRAINING COMPLETED")
print(_rule + "\n")

In [None]:
print(f"Saving LoRA adapter to {ADAPTER_DIR}...")

# Write the adapter-wrapped model first, then the tokenizer, into the
# same directory so both can be reloaded from one path.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(ADAPTER_DIR)

print("✓ LoRA adapter saved successfully")

In [None]:
_rule = "=" * 60  # horizontal rule framing the banner

print("\n" + _rule)
print("EVALUATING MODEL")
print(_rule + "\n")

# Evaluate on the held-out split and list every reported metric.
eval_results = trainer.evaluate()

print("Evaluation Results:")
for _metric_name, _metric_value in eval_results.items():
    print(f"  {_metric_name}: {_metric_value:.4f}")

In [None]:
_rule = "=" * 60  # horizontal rule framing the banner

print("\n" + _rule)
print("INFERENCE EXAMPLE")
print(_rule + "\n")

# Switch the model to evaluation mode before generating.
model.eval()

def generate_response(system_prompt, user_prompt, max_new_tokens=256):
    """Generate a response using the fine-tuned model.

    The prompt is built with the same chat template used for training and
    left open at the assistant turn so the model completes it.

    Args:
        system_prompt: Text placed in the <|system|> turn.
        user_prompt: Text placed in the <|user|> turn.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The assistant's reply as a stripped string, cut at the first
        "</|assistant|>" closing marker if the model emitted one.
    """
    # Format input using chat template
    input_text = (
        f"<|system|>{system_prompt}</|system|>\n"
        f"<|user|>{user_prompt}</|user|>\n"
        f"<|assistant|>"
    )

    # Tokenize input
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            # The config disables the cache for training; re-enable it for
            # this call so decoding does not recompute the full prefix at
            # every step.
            use_cache=True,
        )

    # Decode only the newly generated tokens. Slicing off the prompt avoids
    # searching the echoed prompt for markers, and skip_special_tokens
    # drops the tokenizer's own EOS/pad tokens whatever they are called.
    generated_ids = outputs[0][prompt_length:]
    completion = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # The template markers are plain text, not special tokens, so they
    # survive decoding: keep everything before the assistant closing tag
    # and discard whatever the model kept writing after it.
    assistant_response, _, _ = completion.partition("</|assistant|>")

    return assistant_response.strip()

In [None]:
# Sample (system, user) prompt pairs used to smoke-test the tuned model.
test_prompts = [
    {"system": system_text, "user": user_text}
    for system_text, user_text in (
        ("You are a helpful AI assistant.", "What is machine learning?"),
        ("You are a coding expert.", "Write a Python function to calculate factorial."),
        ("You are a creative writer.", "Write a short poem about AI."),
    )
]

print("Testing inference with fine-tuned model:\n")
_separator = "-" * 60 + "\n"
for _example_no, _case in enumerate(test_prompts, 1):
    _system_text, _user_text = _case['system'], _case['user']
    print(f"Example {_example_no}:")
    print(f"System: {_system_text}")
    print(f"User: {_user_text}")

    response = generate_response(_system_text, _user_text)

    print(f"Assistant: {response}")
    print(_separator)


In [None]:
# Closing summary: where the adapter and the training output were written.
_closing_lines = (
    "\n✓ Script completed successfully!",
    f"\nModel adapter saved to: {ADAPTER_DIR}",
    f"Full training logs saved to: {OUTPUT_DIR}",
)
for _line in _closing_lines:
    print(_line)

To load and use the fine-tuned adapter in a new session:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch

# Recreate the 4-bit quantization config: a new session does not have the
# `bnb_config` object built during training.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Load base model in 4-bit
base_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-7B",
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Load LoRA adapter
model = PeftModel.from_pretrained(base_model, "./qwen2.5-7b-qlora-adapter")
tokenizer = AutoTokenizer.from_pretrained("./qwen2.5-7b-qlora-adapter")

# Use for inference
model.eval()
```