# Training Phi-3-mini-128k-instruct with AQLM 2-bit Quantization for Swift Programming

This notebook sets up fine-tuning of Microsoft's Phi-3-mini-128k-instruct model on Swift code: it loads the model with AQLM 2-bit quantization, attaches LoRA adapters, and outlines the remaining training steps.

In [None]:
# Install required libraries
!pip install transformers datasets evaluate torch scikit-learn tqdm dropbox requests accelerate peft aqlm
# Set PyTorch memory management environment variables to avoid fragmentation
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # Explicitly set to use 2 GPUs

In [None]:
# Import required libraries
# Standard library
import collections
import gc
import json
import os
import random
import time

# Third-party
import numpy as np
import psutil
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
# AqlmConfig is exported by transformers; the `aqlm` pip package does not provide it
# (it supplies the kernels transformers uses for AQLM layers), so importing it from
# `aqlm` raises ImportError.
from transformers import (
    AqlmConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

In [None]:
# Dataset configuration
DATASET_ID = "mvasiliniuc/iva-swift-codeint"

# Model configuration - using Phi-3-mini-128k-instruct
MODEL_NAME = "microsoft/Phi-3-mini-128k-instruct"
MAX_LENGTH = 4096  # Phi-3 can handle long sequences natively
BATCH_SIZE = 2  # Per-device batch size
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01
NUM_EPOCHS = 3
WARMUP_RATIO = 0.03
GRADIENT_ACCUMULATION_STEPS = 4  # Reduced since we're using 2 GPUs

# LoRA configuration
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05

# Derive the device multiplier from what is actually visible instead of hard-coding 2.
# Falls back to 1 when no GPU is visible so the reported value stays meaningful.
# NOTE(review): this multiplier assumes each GPU processes its own batch; if the model
# is sharded across GPUs with device_map="auto" the multiplier may not apply — confirm
# against the Trainer setup.
NUM_DEVICES = max(1, torch.cuda.device_count())
EFFECTIVE_BATCH_SIZE = BATCH_SIZE * NUM_DEVICES * GRADIENT_ACCUMULATION_STEPS

print(f"Using model: {MODEL_NAME}")
print(f"Max sequence length: {MAX_LENGTH}")
print(f"Batch size: {BATCH_SIZE} per device")
print(f"Effective batch size: {EFFECTIVE_BATCH_SIZE}")
print(f"LoRA rank: {LORA_R}")

In [None]:
# Helper used to release memory between heavy steps
def cleanup_memory():
    """Run a garbage-collection pass and, if CUDA is available, release cached GPU memory.

    Prints a short status line, collects unreachable Python objects, then (only
    when a CUDA device is available) empties PyTorch's CUDA cache and waits for
    outstanding GPU work to finish. Returns None.
    """
    print("Cleaning up memory...")
    gc.collect()

    # Nothing GPU-related to do on CPU-only machines.
    if not torch.cuda.is_available():
        return

    torch.cuda.empty_cache()
    torch.cuda.synchronize()

In [None]:
# Configure 2-bit AQLM quantization (replacing the previous 4-bit BitsAndBytes)
print("Setting up 2-bit AQLM quantization...")
# AqlmConfig describes the codebook layout rather than taking a `bits` value:
# one 16-bit code per group of 8 input weights works out to 2 bits per weight.
# Loading options such as device placement and dtype are not part of this config;
# they are passed to `from_pretrained` when the model is loaded.
aqlm_config = AqlmConfig(
    in_group_size=8,        # Number of input weights sharing one code
    out_group_size=1,       # Number of output weights per group
    num_codebooks=1,        # Codebooks used per group
    nbits_per_codebook=16,  # Bits per code -> 1 * 16 / 8 = 2 bits per weight
)

In [None]:
# Load the tokenizer and the AQLM-quantized model, then attach LoRA adapters.
# NOTE(review): transformers loads AQLM checkpoints that were quantized ahead of time;
# it does not quantize a full-precision checkpoint while loading. If MODEL_NAME is not
# an AQLM-quantized checkpoint this cell is expected to fail — confirm the checkpoint.
try:
    # First load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, model_max_length=MAX_LENGTH)
    # Add padding token if it doesn't exist
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    print(f"Tokenizer vocabulary size: {len(tokenizer)}")

    print(f"\nLoading {MODEL_NAME} with AQLM 2-bit quantization...")

    # Load the model with AQLM 2-bit quantization
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=aqlm_config,
        device_map="auto",
        torch_dtype=torch.float16,
        use_cache=False,  # Disable KV cache for training
    )

    # Prepare the model for training
    model = prepare_model_for_kbit_training(model)

    # Apply LoRA for parameter-efficient fine-tuning
    lora_config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        bias="none",
        task_type="CAUSAL_LM",
        # Phi-3 fuses the query/key/value projections into `qkv_proj` and the MLP
        # gate/up projections into `gate_up_proj`; the separate q/k/v/gate/up names
        # do not exist in this architecture and would leave those layers unadapted.
        target_modules=["qkv_proj", "o_proj", "gate_up_proj", "down_proj"],
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    print("Model loaded successfully with AQLM 2-bit quantization!")
except Exception as e:
    print(f"Error loading model: {e}")
    # Re-raise so the cell fails visibly (with its traceback) instead of letting the
    # notebook continue with `model` / `tokenizer` undefined.
    raise

In [None]:
# Comparison of memory usage between 4-bit and 2-bit quantization.
# The figures below are fixed text, not values measured from the loaded model.
separator = "------------------------------"
comparison_lines = [
    "Memory usage comparison:",
    separator,
    "4-bit quantization (original): ~5.5 GB for the base model",
    "2-bit quantization (AQLM): ~2.8 GB for the base model",
    "Memory reduction: ~50%",
    separator,
    "This significant memory reduction allows training on hardware with more limited resources",
    "or enables loading larger context sizes for the same hardware configuration.",
]
print("\n".join(comparison_lines))

In [None]:
# Outline of the steps that remain after the model has been set up
remaining_steps = (
    "To complete the full training process:",
    "1. Load and preprocess the Swift code dataset",
    "2. Tokenize the data and prepare train/validation splits",
    "3. Configure training arguments with gradient checkpointing and mixed precision",
    "4. Set up Trainer with early stopping",
    "5. Run training with appropriate monitoring",
    "6. Save and evaluate the model",
)
for step_text in remaining_steps:
    print(step_text)