# T4-OPT: Train LLM with QLoRA (Optimized)

This notebook demonstrates optimized QLoRA fine-tuning on a T4 GPU with automatic GPU/memory optimization.

## Features:
- ✅ Automatic batch size optimization
- ✅ Maximum GPU memory utilization
- ✅ Adaptive configuration based on available resources
- ✅ Memory-efficient training

## Steps:
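1. Configure QLoRA training (with auto-optimization)
2. Load and prepare the dataset
3. Load the model and tokenize the dataset
4. Train the model with maximum GPU utilization (checkpoints are saved during training)
5. Verify the saved checkpoints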


In [None]:
import sys
sys.path.append('/content/t4opt')

# Use OptimizedQLoRATrainer for automatic GPU/memory optimization
from training.optimized_trainer import OptimizedQLoRATrainer
from training.qlora import QLoRAConfig
from training.dataset import DatasetManager
from utils.memory import MemoryManager
from utils.config import Config
from utils.checkpoint_utils import print_checkpoint_info, check_drive_checkpoints

# Optional: mount Google Drive so checkpoints survive the end of the session.
# Uncomment the next 2 lines to save to Drive (recommended!), then point
# output_dir in the configuration cell at a path under /content/drive.
# from google.colab import drive
# drive.mount('/content/drive')

# Report the GPU/memory state before anything is loaded, as a baseline to
# compare against the same summary printed after training.
print("Initial GPU/Memory Status:")
MemoryManager.print_memory_summary()


In [None]:
# QLoRA settings sized for a single T4 GPU.
# NOTE: "./checkpoints/..." does not persist after the session ends. To keep
# checkpoints, point output_dir at Google Drive instead, e.g.
#   output_dir="/content/drive/MyDrive/t4opt_checkpoints/phi-2-qlora"

config = QLoRAConfig(
    model_name="microsoft/phi-2",  # or "google/gemma-2b-it"
    output_dir="./checkpoints/phi-2-qlora",  # ⚠️ temporary; use a Drive path for persistence
    max_seq_length=1024,
    micro_batch_size=1,
    gradient_accumulation_steps=16,
    num_epochs=3,
    learning_rate=2e-4,
    lora_r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    use_gradient_checkpointing=True,
    fp16=True,
    save_steps=500,  # save a checkpoint every 500 steps
)

# Echo every attribute stored on the config object so the settings used for
# this run are visible in the notebook output.
print("Training Configuration:")
for key, value in vars(config).items():
    print(f"  {key}: {value}")


In [None]:
# Fetch the "alpaca" dataset, capped at 1000 samples to stay within T4 limits.
dataset_manager = DatasetManager()
dataset_info = dataset_manager.load_dataset(dataset_name="alpaca", max_samples=1000)

# Report how many samples the loader says it returned.
print(f"Dataset loaded: {dataset_info['num_samples']} samples")


In [None]:
# Build the optimized trainer from the config above. auto_optimize=True asks
# the trainer to pick GPU-appropriate settings on its own.
trainer = OptimizedQLoRATrainer(
    config=config,
    auto_optimize=True,
)

# Load the model and its tokenizer through the trainer, so that the trainer's
# optimizations are applied at load time.
model, tokenizer = trainer.load_model()


In [None]:
# Tokenize the loaded dataset with the model's tokenizer, using the sequence
# length configured for training.
tokenized_dataset = dataset_manager.tokenize_dataset(
    dataset_info["dataset"], tokenizer, max_length=config.max_seq_length
)

# Report the size of the tokenized dataset.
print(f"Tokenized dataset: {len(tokenized_dataset)} samples")


In [None]:
# Launch the optimized training run. According to the trainer's intent, this
# automatically:
#   - finds the optimal batch size (requested via find_best_batch_size=True)
#   - optimizes memory usage
#   - maximizes GPU utilization
print("Starting optimized training...")
training_result = trainer.train_optimized(
    tokenized_dataset,
    find_best_batch_size=True,
)

# Print the metrics returned by the trainer, framed by 60-character rules.
separator = "=" * 60
print("\n" + separator)
print("Training Results:")
print(separator)
print(f"  Final Loss: {training_result['train_loss']:.4f}")
print(f"  Training Time: {training_result['train_runtime']:.2f} seconds")
print(f"  Samples/sec: {training_result['train_samples_per_second']:.2f}")
print(f"  Output Directory: {training_result['output_dir']}")
print(separator)


In [None]:
# Print the GPU/memory summary again now that training has finished.
MemoryManager.print_memory_summary()

# Inspect the configured output directory for checkpoints written by training.
print("\n" + "=" * 60)
print("Checking for saved checkpoints...")
print_checkpoint_info(config.output_dir)

# Also look in Google Drive. The "checkpoints" entry is only read when Drive
# is reported as mounted; otherwise nothing is listed.
drive_info = check_drive_checkpoints()
drive_checkpoints = drive_info["checkpoints"] if drive_info["drive_mounted"] else None
if drive_checkpoints:
    print("\n✅ Found checkpoints in Google Drive!")
    for name, info in drive_checkpoints.items():
        print(f"  - {name}: {len(info['checkpoints'])} checkpoint(s)")
