In [1]:
import numpy as np
import torch
from transformers import (
    AutoModelForCausalLM,      # Class for causal models (GPT, Llama, etc.)
    AutoTokenizer,              # Automatic tokenizer
    TrainingArguments,          # Training configuration
    Trainer,                    # HuggingFace trainer class
    DataCollatorForLanguageModeling  # Prepares batches for language modeling
)
from peft import (
    LoraConfig,                 # LoRA configuration
    get_peft_model,             # Applies LoRA to model
    TaskType                    # Task type (CAUSAL_LM, SEQ_2_SEQ, etc.)
)
from datasets import load_dataset  # Load datasets from HuggingFace Hub
import os

# Name used for the saved artifacts; an empty string when the env var is unset.
FINE_TUNED_MODEL_NAME = os.getenv("FINE_TUNED_MODEL_NAME", "")

# Probe the MPS (Apple Metal) backend of this PyTorch install.
mps_built = torch.backends.mps.is_built()
mps_available = torch.backends.mps.is_available()

# Run on MPS whenever it is usable; otherwise fall back to the CPU.
device = torch.device("mps" if mps_available else "cpu")

2. Load Model and Tokenizer
For this guide, we'll use a lightweight ~1B model: Llama-3.2-1B-Instruct. If the model is gated, log in to Hugging Face and accept the license (huggingface-cli login).

In [2]:
# Choose the model to fine-tune
model_name = "meta-llama/Llama-3.2-1B-Instruct"

# The tokenizer converts text to token IDs and vice versa.
# trust_remote_code is left off: Llama is a built-in transformers architecture,
# so there is no need to allow code shipped in the Hub repository to execute.
# Only switch it on for a model that requires custom code AND whose repo you trust.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=False,
)

# Batching needs a padding token; reuse EOS when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the pre-trained model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.float32,         # Full precision (fp32), not reduced: keeps training on MPS in fp32
    device_map={"": device},     # Place the whole model on the selected device
    trust_remote_code=False,     # See the note on the tokenizer call
)

Start with small models (0.5-1B) to validate your pipeline
For 7B models on 16GB unified memory: use batch_size=1, high gradient accumulation, LoRA with r=8-16

3. Inspect Model Structure (Recommended)
Identify linear layers where LoRA should be applied (typically attention projections).

In [3]:
# Gather the leaf name (the text after the last ".") of every nn.Linear submodule.
linear_modules = {
    qualified_name.rsplit('.', 1)[-1]
    for qualified_name, submodule in model.named_modules()
    if isinstance(submodule, torch.nn.Linear)
}

# Names kept out of the LoRA recommendation.
# NOTE(review): presumably the output head plus embedding / final-norm names of
# common architectures - confirm the rationale for each entry.
exclude_modules = {'lm_head', 'embed_tokens', 'wte', 'wpe', 'ln_f'}
recommended_modules = linear_modules.difference(exclude_modules)

print(f"\nðŸ“‹ Linear modules found: {linear_modules}")
print(f"âœ… Recommended modules for LoRA: {recommended_modules}")
print("="*50)


ðŸ“‹ Linear modules found: {'k_proj', 'up_proj', 'gate_proj', 'v_proj', 'o_proj', 'lm_head', 'q_proj', 'down_proj'}
âœ… Recommended modules for LoRA: {'q_proj', 'k_proj', 'up_proj', 'gate_proj', 'v_proj', 'o_proj', 'down_proj'}


4. Configure and Apply LoRA
We'll apply LoRA to train only the adapters while keeping the base model frozen.

LoRA parameters explained:

r (rank): Dimension of LoRA matrices. Typical values: 8-64. Higher = more capacity but more parameters
lora_alpha: Scaling factor. Typically 2×r. Controls adapter influence
target_modules: Which layers to modify (attention, projections, etc.)
lora_dropout: Dropout for regularization (0.05-0.1)
bias: Whether to train biases ("none", "all", "lora_only")

In [4]:
def find_target_modules(model, exclude_names=None):
    """Return the leaf names of the model's ``torch.nn.Linear`` layers for use as LoRA targets.

    Args:
        model: Module whose ``named_modules()`` are scanned.
        exclude_names: Leaf names to leave out. May be any iterable of strings
            or a single string. ``None`` selects the default exclusions; an
            empty collection excludes nothing.

    Returns:
        Alphabetically sorted list of unique leaf names (the text after the
        last ``.`` of each qualified module name), minus the excluded ones.
    """
    # Test for None explicitly: `exclude_names or {...}` would silently replace
    # a deliberately empty collection with the defaults.
    if exclude_names is None:
        exclude_names = {'lm_head', 'embed_tokens', 'wte', 'wpe', 'ln_f'}
    elif isinstance(exclude_names, str):
        # A bare string would otherwise be split into single characters by set().
        exclude_names = [exclude_names]
    # Set difference needs a real set, so lists/tuples are converted here.
    excluded = set(exclude_names)

    linear_leaf_names = {
        name.split('.')[-1]
        for name, module in model.named_modules()
        if isinstance(module, torch.nn.Linear)
    }
    # Sort so the result does not depend on set iteration order.
    return sorted(linear_leaf_names - excluded)

# Discover which linear layers the adapters should wrap.
target_modules = find_target_modules(model)
print(f"Target modules: {target_modules}")

# LoRA hyperparameters for causal language modeling.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=target_modules,
    r=16,                # adapter rank
    lora_alpha=32,       # scaling factor (2 x r here)
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapters described by lora_config.
model = get_peft_model(model, lora_config)

Target modules: ['q_proj', 'k_proj', 'up_proj', 'gate_proj', 'v_proj', 'o_proj', 'down_proj']


5. Dataset Preparation
We'll use IMDB (1000-sample subset) for quick testing. You can substitute any HuggingFace dataset.

What we're doing:

Load dataset from HuggingFace Hub
Tokenize the text (convert to token IDs)
Apply padding and truncation to normalize lengths
Split into train/validation sets

Other useful testing datasets:

wikitext: Wikipedia articles
openwebtext: Web text corpus

In [5]:
# Only the first 1000 IMDB training reviews are loaded to keep the run short.
dataset = load_dataset("imdb", split="train[:1000]")


def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch, truncating and padding to 512 tokens."""
    encoded = tokenizer(
        examples["text"],
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors=None,
    )
    return encoded


# Tokenize in batches; every original column is dropped from the result.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    desc="Tokenizing dataset",
    remove_columns=list(dataset.column_names),
)

# Hold out 10% for evaluation (fixed seed so the split is repeatable).
split = tokenized_dataset.train_test_split(seed=42, test_size=0.1)
train_dataset = split["train"]
eval_dataset = split["test"]
print(f"Train: {len(train_dataset)} | Eval: {len(eval_dataset)}")

Train: 900 | Eval: 100


6. Training Configuration
Now we'll configure all training parameters. Here are the key ones explained:

Batch Size and Gradient Accumulation:

per_device_train_batch_size: How many examples to process together (higher = faster but more memory)
gradient_accumulation_steps: Accumulate gradients for N steps before updating weights (simulates larger batch sizes)
Effective batch size = per_device_train_batch_size × gradient_accumulation_steps
Learning Rate:

learning_rate: How quickly the model learns (2e-4 is a solid default for LoRA)
Strategies:

eval_strategy: When to evaluate the model ("steps", "epoch", "no"); older transformers releases name this argument evaluation_strategy
save_strategy: When to save checkpoints ("steps", "epoch", "no")

In [6]:
# Batch collator for language modeling; mlm=False turns off masked-LM mode.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

training_args = TrainingArguments(
    # --- Output, logging, reproducibility ---
    output_dir="./my-finetuned-model",  # use a descriptive name
    logging_steps=10,
    report_to="none",
    seed=42,

    # --- Optimization schedule ---
    num_train_epochs=3,
    learning_rate=2e-4,
    warmup_steps=100,

    # --- Batching: effective train batch = 2 x 8 = 16 ---
    per_device_train_batch_size=2,      # kept small for MPS stability
    per_device_eval_batch_size=2,       # kept small for MPS stability
    gradient_accumulation_steps=8,      # compensates for the small per-device batch

    # --- Precision and memory (MPS: stay in fp32) ---
    fp16=False,
    bf16=False,
    gradient_checkpointing=False,       # disabled for MPS compatibility

    # --- Evaluate and checkpoint on the same 100-step cadence ---
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,
    load_best_model_at_end=True,

    # --- Data loading / compilation (MPS-specific) ---
    remove_unused_columns=False,
    dataloader_pin_memory=False,        # no pinned memory on MPS
    torch_compile=False,                # no torch.compile on MPS
)

In [7]:
# Attach causal-LM labels to tokenized examples, ignoring padded positions.
def _mask_padding(input_ids, attention_mask):
    """Copy ``input_ids``, replacing positions whose ``attention_mask`` is 0 with -100."""
    if attention_mask is None:
        return list(input_ids)
    return [tok if keep else -100 for tok, keep in zip(input_ids, attention_mask)]


def prepare_dataset(examples):
    """Add a ``labels`` field for causal language modeling.

    Labels mirror ``input_ids``, except that padded positions (where
    ``attention_mask`` is 0) become -100, the index PyTorch's cross-entropy
    loss ignores by default, so padding does not contribute to the loss.
    Without an ``attention_mask`` the labels are a plain copy of ``input_ids``.

    Works for a single example (flat list of ids) and for a batch (list of
    lists), so it can be used with ``batched=False`` or ``batched=True``.

    NOTE(review): a language-modeling data collator may rebuild ``labels``
    itself at batch time - confirm whether this column is what the loss sees.

    Args:
        examples: Mapping holding ``input_ids`` and optionally ``attention_mask``.

    Returns:
        The same mapping with a new ``labels`` entry; ``input_ids`` is untouched.
    """
    input_ids = examples["input_ids"]
    attention_mask = examples.get("attention_mask")

    is_batched = len(input_ids) > 0 and isinstance(input_ids[0], (list, tuple))
    if is_batched:
        masks = attention_mask if attention_mask is not None else [None] * len(input_ids)
        examples["labels"] = [_mask_padding(ids, mask) for ids, mask in zip(input_ids, masks)]
    else:
        examples["labels"] = _mask_padding(input_ids, attention_mask)
    return examples

# Run prepare_dataset over both splits, one example at a time (train first, then eval).
train_dataset, eval_dataset = [
    subset.map(prepare_dataset, batched=False)
    for subset in (train_dataset, eval_dataset)
]

print("Dataset prepared with labels for causal language modeling")

Dataset prepared with labels for causal language modeling


7. Start Training
Initialize the HuggingFace Trainer and launch the fine-tuning process.

In [None]:
# Assemble the trainer from the pieces built in the previous cells.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Fine-tune; the returned training summary is kept for later inspection.
train_result = trainer.train()

# Final pass over the evaluation split; perplexity is exp(eval loss).
eval_results = trainer.evaluate()
print(f"â€¢ Eval loss: {eval_results['eval_loss']:.4f}")
print(f"â€¢ Perplexity: {np.exp(eval_results['eval_loss']):.2f}")

The model is already on multiple devices. Skipping the move to device specified in `args`.


Step,Training Loss,Validation Loss


What happens during training:

The model processes data batches
Calculates loss (prediction error)
Computes gradients via backpropagation
Updates only LoRA parameters (not the entire model)
Periodically evaluates on the validation set
Saves checkpoints
Critical metrics to monitor:

loss: Training set error (should decrease)
eval_loss: Validation set error (should decrease along with loss; if it stalls or rises while loss keeps falling, the model is overfitting)
learning_rate: Changes during warmup phase

Save LoRA Adapters and Tokenizer
Training is complete. Now let's save the model to disk. LoRA adapters are extremely lightweight — you can save multiple versions with different names.

In [None]:
from pathlib import Path

# Keep the name taken from the FINE_TUNED_MODEL_NAME environment variable in the
# setup cell; fall back to a default only when that value is empty. Assigning
# the literal unconditionally would silently discard the env override.
FINE_TUNED_MODEL_NAME = FINE_TUNED_MODEL_NAME or "llama-1b-imdb-lora"
output_dir = Path(f"./{FINE_TUNED_MODEL_NAME}-finetuned")
output_dir.mkdir(parents=True, exist_ok=True)

model.save_pretrained(output_dir)    # save LoRA adapters
tokenizer.save_pretrained(output_dir)

print("âœ“ Adapters and tokenizer saved to:", output_dir)
# Sorted so the listing is the same on every run and filesystem.
for f in sorted(output_dir.iterdir()):
    print("  â€¢", f.name)

To reload later:

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Rebuild the base model in fp32 and move it to the selected device. The
# `dtype=` keyword matches the one used when the model was first loaded.
base_model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float32).to(device)
# Attach the saved LoRA adapters and reload the tokenizer saved next to them.
model = PeftModel.from_pretrained(base_model, str(output_dir))
tokenizer = AutoTokenizer.from_pretrained(str(output_dir))

If you want a standalone model that's easier to deploy, merge the adapters (the model will be larger on disk and you won't be able to modify only the adapters anymore).

In [None]:
# Destination folder for the standalone checkpoint.
merged_dir = f"./{FINE_TUNED_MODEL_NAME}-merged"

# PEFT: incorporate the LoRA adapters into the base weights.
merged_model = model.merge_and_unload()

# Write the merged weights and the tokenizer to the same folder.
merged_model.save_pretrained(merged_dir)
tokenizer.save_pretrained(merged_dir)
print("âœ“ Merged model saved to:", merged_dir)