In [None]:
# Note: fp16 mixed-precision training requires a CUDA GPU.
# On a Mac M1/M2 ('mps') or a CPU-only machine this notebook only runs with
# fp16=False in TrainingArguments.

In [None]:
# Install necessary libraries.
# transformers includes huggingface transformers
# datasets for handling data, evaluate for metrics, peft for parameter-efficient
# fine-tuning (LoRA), trl for training, and bitsandbytes for quantization.
!pip install transformers datasets evaluate peft trl bitsandbytes

In [None]:
import gc  # For manually triggering garbage collection
import os  # For interacting with the file system

import torch  # Core PyTorch library for deep learning
from datasets import load_dataset  # For loading and managing datasets
from peft import LoraConfig  # Configuration for LoRA (Low-Rank Adaptation)
from transformers import (  # Hugging Face Transformers library
    AutoModelForCausalLM,  # Auto-classes for loading pretrained models
    AutoTokenizer,  # Auto-classes for loading pretrained tokenizers
    BitsAndBytesConfig,  # For 8-bit quantization of the model
    TrainingArguments,  # Configuration for training
    pipeline,  # For easier model inference
    logging  # For controlling logging output
)
from trl import SFTTrainer  # Trainer for Supervised Fine-Tuning (SFT)


In [None]:
# --- Model and Dataset Setup ---

# Hub identifier of the pre-trained checkpoint that will be fine-tuned
base_model = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

# Hub identifier of the instruction dataset used for fine-tuning
guanaco_dataset = "mlabonne/guanaco-llama2-1k"

# Local name under which the fine-tuned weights and tokenizer are saved
new_model = "llama-1.1B-chat-guanaco"


In [None]:
# Fetch the "train" split of the instruction dataset named in `guanaco_dataset`
dataset = load_dataset(
    path=guanaco_dataset,
    split="train",
)

In [None]:
# Load the pre-trained causal LM; device_map="auto" leaves the placement of the
# weights on the available devices to the loader.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="auto",
)

# Tensor-parallel degree recorded from pre-training; 1 keeps the regular
# (non-sliced) computation path.
model.config.pretraining_tp = 1

# Turn off the key/value cache. The cache only speeds up autoregressive
# generation and is not needed for training; with it disabled, generation is
# slower (not faster) but uses less memory.
model.config.use_cache = False

In [None]:
# Load the tokenizer that belongs to the base checkpoint.
# trust_remote_code is deliberately left at its default (False): the model is
# loaded without it, so the stock transformers classes suffice and no Python
# code from the Hub repository needs to be executed.
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Reuse the end-of-sequence token for padding, and pad on the right-hand side
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# --- Baseline: generate with the base model before fine-tuning ---

# From here on only CRITICAL-level transformers log messages are shown
logging.set_verbosity(logging.CRITICAL)

# Question used to probe the model
prompt = "Who is Napoleon Bonaparte?"

# Text-generation pipeline around the already-loaded model and tokenizer;
# max_length=200 bounds the total length (prompt + completion) in tokens.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)

# Wrap the question in the [INST] ... [/INST] template and print the first result
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])


## --- Fine-Tuning Setup ---

In [None]:
# LoRA adapter hyper-parameters; handed to the trainer as `peft_config`
peft_params = LoraConfig(
    task_type="CAUSAL_LM",  # Adapters target a causal language model
    r=64,  # Rank of the low-rank update matrices
    lora_alpha=16,  # LoRA scaling parameter (alpha)
    lora_dropout=0.1,  # Dropout probability applied inside the LoRA layers
    bias="none",  # Bias parameters are not trained
)

In [None]:
# Training configuration consumed by the SFTTrainer.
# Effective batch size per device = 2 (batch) x 16 (accumulation) = 32 sequences.
training_params = TrainingArguments(
    output_dir='./results',  # Output directory for checkpoints and results
    num_train_epochs=2,  # Number of training epochs
    per_device_train_batch_size=2,  # Batch size per device
    gradient_accumulation_steps=16,  # Accumulate gradients over 16 steps
    optim="adamw_torch",  # Optimizer (PyTorch AdamW)
    save_steps=25,  # Save a checkpoint every 25 steps
    logging_steps=1,  # Log every step
    learning_rate=2e-4,  # Peak learning rate
    weight_decay=0.001,  # Weight decay for regularization
    # fp16 mixed precision needs a CUDA GPU; enabling it unconditionally makes
    # TrainingArguments raise on CPU-only or Apple-Silicon ('mps') machines, so
    # it is switched on only when CUDA is actually present.
    fp16=torch.cuda.is_available(),
    bf16=False,  # Don't use bfloat16 (not supported on all hardware)
    max_grad_norm=0.3,  # Gradient clipping threshold
    max_steps=-1,  # -1: run length is governed by num_train_epochs
    warmup_ratio=0.03,  # Fraction of the steps used for learning-rate warmup
    group_by_length=True,  # Batch together sequences of similar length
    lr_scheduler_type="cosine"  # Cosine learning rate scheduler
)

## --- Fine-Tuning and Saving ---

In [None]:
# Build the supervised fine-tuning trainer around the base model and dataset.
# NOTE(review): passing dataset_text_field / max_seq_length / tokenizer directly
# to SFTTrainer matches older TRL releases; newer ones appear to expect them on
# SFTConfig instead - confirm against the installed trl version.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_params,
    peft_config=peft_params,  # Wrap the model with the LoRA adapters
    train_dataset=dataset,
    dataset_text_field="text",  # Dataset column holding the training text
    max_seq_length=None,  # None defers to TRL's default limit (not "unlimited") - TODO confirm value
    packing=False,  # Keep one example per sequence; don't concatenate examples
)

In [None]:
# Free as much memory as possible before training starts: collect unreachable
# Python objects first, then release PyTorch's cached (unused) CUDA memory.
# `gc` is imported in the notebook's import cell.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Run the fine-tuning loop
trainer.train()

# Persist the trained weights (the LoRA adapter when the trainer wrapped the model with PEFT)
trainer.model.save_pretrained(new_model)

# Save the tokenizer next to the weights. The notebook's own `tokenizer` object
# (the one handed to the trainer) is used directly because the
# `trainer.tokenizer` attribute is deprecated in recent transformers releases.
tokenizer.save_pretrained(new_model)

In [None]:
# --- Run Inference After Fine-Tuning ---
# Ask the same question again to compare against the pre-fine-tuning answer.

# trainer.train() leaves the model in training mode, which keeps dropout
# (including the LoRA dropout) active; switch to evaluation mode so that
# generation is not perturbed by it.
model.eval()

prompt = "Who is Napoleon Bonaparte?"
pipe = pipeline(task='text-generation', model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f'<s>[INST] {prompt} [/INST]')
print(result[0]['generated_text'])