# T4-OPT: Train LLM with QLoRA (Maximum GPU Utilization!)

This notebook demonstrates optimized QLoRA fine-tuning with **maximum GPU utilization** for fastest training.

## 🚀 GPU Optimization Features:
- **Automatic batch size optimization** - Finds largest batch that fits (up to 16+)
- **Maximum GPU memory utilization** - Uses 98% of GPU memory
- **bf16 support** - Faster than fp16 on newer GPUs (auto-detected)
- **Flash attention** - Significantly faster training
- **Parallel data loading** - 4 workers with prefetching
- **TF32 enabled** - Faster on Ampere+ GPUs
- **CuDNN optimizations** - Benchmark mode for speed
- **Group by length** - Efficient sequence batching

In [None]:
# Environment setup: extend the import path, import the project helpers,
# and run the pre-flight checks.
import sys
# Add the project directory to the module search path. The `training` and
# `utils` imports below presumably resolve from here -- confirm the checkout
# location matches.
sys.path.append('/content/t4opt')

from training.optimized_trainer import OptimizedQLoRATrainer
from training.qlora import QLoRAConfig
from training.dataset import DatasetManager
from utils.memory import MemoryManager
# NOTE(review): Config is imported but not referenced in the cells visible here.
from utils.config import Config
from utils.checkpoint_utils import print_checkpoint_info, check_drive_checkpoints
from utils.colab_tools import ColabTools

# Pre-flight check from the project helpers. Judging by the name it verifies
# the runtime is compatible with a T4 GPU -- confirm in utils.colab_tools.
ColabTools.verify_t4_compatibility()

# Print a memory summary before the model is loaded, so it can be compared
# with the summary printed after training.
MemoryManager.print_memory_summary()


In [None]:
# Initial QLoRA hyperparameters for fine-tuning microsoft/phi-2.
# These are starting values only: the note printed at the end of this cell
# states that they will be optimized automatically.
# NOTE(review): micro_batch_size=1 with gradient_accumulation_steps=16
# presumably gives an effective batch of 16 per optimizer step -- confirm
# against the trainer implementation.
config = QLoRAConfig(
    model_name="microsoft/phi-2",
    output_dir="./checkpoints/phi-2-qlora",
    max_seq_length=1024,
    micro_batch_size=1,
    gradient_accumulation_steps=16,
    num_epochs=3,
    learning_rate=2e-4,
    lora_r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    use_gradient_checkpointing=True,
    fp16=True,
    save_steps=500,
)

# Dump every attribute of the config object, one "name: value" pair per line.
print("Initial Training Configuration (will be optimized automatically):")
for key, value in vars(config).items():
    print(f"  {key}: {value}")
# The light-bulb emoji (U+1F4A1) is written as a real Unicode character so
# the message does not print as mis-decoded bytes.
print("\n💡 These settings will be automatically optimized for maximum GPU utilization!")


In [None]:
# Load the training data through the project's DatasetManager.
# max_samples is the cap passed to the loader; the branch below treats None
# as "use the full dataset".
max_samples = 5000
dataset_manager = DatasetManager()
dataset_info = dataset_manager.load_dataset(dataset_name="alpaca", max_samples=max_samples)

# Report how many samples came back and whether a cap was applied.
total_samples = dataset_info['num_samples']
print(f"Dataset loaded: {total_samples:,} samples")
print(
    "   Using FULL dataset (best for training quality!)"
    if max_samples is None
    else f"   Limited to {max_samples:,} samples"
)

# Rough runtime estimate: 2.5 seconds per sample, converted to minutes,
# then scaled by the configured number of epochs.
estimated_time_per_epoch = total_samples * 2.5 / 60
total_time = estimated_time_per_epoch * config.num_epochs
total_hours = total_time / 60
print("\nEstimated training time:")
print(f"   Per epoch: ~{estimated_time_per_epoch:.1f} minutes")
print(
    f"   Total ({config.num_epochs} epochs): "
    f"~{total_time:.1f} minutes ({total_hours:.1f} hours)"
)


In [None]:
# Build the trainer from the configuration with auto_optimize enabled.
# NOTE(review): what auto_optimize actually adjusts is defined in
# OptimizedQLoRATrainer, which is not visible here.
trainer = OptimizedQLoRATrainer(config=config, auto_optimize=True)

# load_model() returns a (model, tokenizer) pair.
model, tokenizer = trainer.load_model()

In [None]:
# Tokenize the loaded dataset with the model's tokenizer, passing the
# configured max_seq_length as max_length.
# NOTE(review): presumably longer sequences are truncated to max_length --
# confirm in DatasetManager.tokenize_dataset.
tokenized_dataset = dataset_manager.tokenize_dataset(
    dataset_info['dataset'],
    tokenizer,
    max_length=config.max_seq_length
)

# Report how many examples the tokenized dataset contains.
print(f"Tokenized dataset: {len(tokenized_dataset)} samples")


In [None]:
# Run training on the tokenized dataset; find_best_batch_size=True is passed
# straight through to the trainer.
training_result = trainer.train_optimized(tokenized_dataset, find_best_batch_size=True)

# Print the headline metrics in a fixed order.
# Each row is (label, key in training_result, format spec, trailing text).
for _label, _key, _spec, _suffix in (
    ("Final Loss", "train_loss", ".4f", ""),
    ("Training Time", "train_runtime", ".2f", " seconds"),
    ("Samples/sec", "train_samples_per_second", ".2f", ""),
    ("Output Directory", "output_dir", "", ""),
):
    print(f"  {_label}: {training_result[_key]:{_spec}}{_suffix}")



In [None]:
# Post-training report: memory usage first, then the checkpoints written
# under the configured output directory.
MemoryManager.print_memory_summary()
print_checkpoint_info(config.output_dir)

# List Google Drive checkpoints only when Drive is mounted and at least one
# entry was found; the "checkpoints" key is not read otherwise.
drive_info = check_drive_checkpoints()
if drive_info["drive_mounted"]:
    if drive_info["checkpoints"]:
        print("\nFound checkpoints in Google Drive!")
        for name, info in drive_info["checkpoints"].items():
            checkpoint_count = len(info['checkpoints'])
            print(f"  - {name}: {checkpoint_count} checkpoint(s)")
