# T4-OPT: Train LLM with QLoRA

This notebook demonstrates QLoRA fine-tuning of an LLM (`microsoft/phi-2` by default) on a T4 GPU in Google Colab.

## Steps:
1. Load and prepare dataset
2. Configure QLoRA training
3. Train the model
4. Save checkpoint


In [None]:
import sys

# Make the t4opt checkout importable. The membership check keeps this cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
if '/content/t4opt' not in sys.path:
    sys.path.append('/content/t4opt')

from training.qlora import QLoRATrainer, QLoRAConfig
from training.dataset import DatasetManager
from utils.memory import MemoryManager
from utils.config import Config
from utils.checkpoint_utils import print_checkpoint_info, check_drive_checkpoints

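# Optional: Mount Google Drive so checkpoints can be saved persistently (recommended!)
# Uncomment the next 2 lines, then set output_dir in the configuration cell to a
# path under /content/drive/MyDrive -- mounting alone does not redirect checkpoints.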
# from google.colab import drive
# drive.mount('/content/drive')

# Print a memory summary before the model is loaded; the same summary is
# printed again after training so the two can be compared.
MemoryManager.print_memory_summary()


In [None]:
# Configuration for T4
# IMPORTANT: If you want checkpoints to persist after the session ends,
# save to Google Drive instead of ./checkpoints
# Example: output_dir="/content/drive/MyDrive/t4opt_checkpoints/phi-2-qlora"

config = QLoRAConfig(
    model_name="microsoft/phi-2",  # or "google/gemma-2b-it"
    output_dir="./checkpoints/phi-2-qlora",  # ⚠️ This is temporary! Use Drive path for persistence
    max_seq_length=1024,
    micro_batch_size=1,
    gradient_accumulation_steps=16,
    num_epochs=3,
    learning_rate=2e-4,
    lora_r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    use_gradient_checkpointing=True,
    fp16=True,
    # NOTE(review): assuming save_steps counts optimizer steps (Hugging Face
    # Trainer convention -- confirm in training/qlora.py), this notebook's
    # 1000-sample dataset with micro_batch_size=1 and
    # gradient_accumulation_steps=16 gives roughly 1000 / 16 ~= 63 steps per
    # epoch, i.e. about 190 steps over 3 epochs. The previous value of 500 was
    # therefore never reached, so no intermediate checkpoint was written.
    save_steps=50,  # Save checkpoint every 50 steps
)

# Echo every field of the config object so the run settings are visible in the output.
print("Training Configuration:")
for key, value in vars(config).items():
    print(f"  {key}: {value}")


In [None]:
# Fetch the "alpaca" dataset through the project's DatasetManager,
# capped at 1000 samples to keep the run manageable on a T4.
dataset_manager = DatasetManager()
dataset_info = dataset_manager.load_dataset(dataset_name="alpaca", max_samples=1000)

# Report the sample count returned by the manager.
print(f"Dataset loaded: {dataset_info['num_samples']} samples")


In [None]:
# Initialize trainer
# Build the trainer from the QLoRAConfig created in the configuration cell.
trainer = QLoRATrainer(config=config)

# Load model
# Returns the model together with its tokenizer; `tokenizer` is used by the
# tokenization cell and `trainer` by the training cell below.
# NOTE(review): presumably this loads the quantized base model and attaches the
# LoRA adapters -- confirm in training/qlora.py.
model, tokenizer = trainer.load_model()


In [None]:
# Tokenize the loaded samples with the model's tokenizer, passing the
# configured max_seq_length as the length limit.
raw_dataset = dataset_info['dataset']
tokenized_dataset = dataset_manager.tokenize_dataset(
    raw_dataset, tokenizer, max_length=config.max_seq_length
)

# Report how many samples the tokenized dataset contains.
print(f"Tokenized dataset: {len(tokenized_dataset)} samples")


In [None]:
# Run fine-tuning on the tokenized dataset and keep the returned metrics.
print("Starting training...")
training_result = trainer.train(tokenized_dataset)

# Emit the summary in a single call; sep="\n" puts each entry on its own line,
# producing the same output as one print() per entry.
print(
    "\nTraining Results:",
    f"  Final Loss: {training_result['train_loss']:.4f}",
    f"  Training Time: {training_result['train_runtime']:.2f} seconds",
    f"  Samples/sec: {training_result['train_samples_per_second']:.2f}",
    f"  Output Directory: {training_result['output_dir']}",
    sep="\n",
)


In [None]:
# Post-training memory summary, for comparison with the one printed at the start.
MemoryManager.print_memory_summary()

# Report what was written to the configured output directory.
print("\n" + "=" * 60)
print("Checking for saved checkpoints...")
print_checkpoint_info(config.output_dir)

# Also look in Google Drive. The "checkpoints" entry is only consulted when
# Drive is reported as mounted.
drive_info = check_drive_checkpoints()
if drive_info["drive_mounted"]:
    if drive_info["checkpoints"]:
        print("\n✅ Found checkpoints in Google Drive!")
        for name, info in drive_info["checkpoints"].items():
            print(f"  - {name}: {len(info['checkpoints'])} checkpoint(s)")
