# Vishwamai T4 Training on Google Colab

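This notebook demonstrates how to train Vishwamai models on a Google Colab T4 GPU using FP16 mixed precision, flash attention, and fused transformer kernels. Before running, select a GPU runtime (Runtime → Change runtime type → T4 GPU).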

## Setup Environment

First, let's install Vishwamai and its dependencies:

In [None]:
!pip install torch==2.1.0 flash-attn==2.3.0 triton==2.1.0
!git clone https://github.com/yourusername/VishwamAI.git
!cd VishwamAI && pip install -e .

## Check GPU and Import Libraries

In [None]:
import torch
from vishwamai.model import (
    create_model,
    ModelArgs,
    UnifiedConfig,
    VISHWAMAI_TINY
)
from vishwamai.utils.t4_utils import (
    enable_t4_optimizations,
    get_device_capabilities,
    get_memory_stats
)

# Check GPU availability and fail fast with an actionable message: without a GPU
# runtime, torch.cuda.get_device_name(0) would otherwise raise an opaque error.
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if not cuda_available:
    raise RuntimeError(
        "No CUDA device found. In Colab, select Runtime > Change runtime type > T4 GPU "
        "and re-run the notebook."
    )
print(f"GPU device: {torch.cuda.get_device_name(0)}")

# Dump whatever key/value pairs the project helper reports for this device
print("\nDevice capabilities:")
capabilities = get_device_capabilities()
for k, v in capabilities.items():
    print(f"{k}: {v}")

## Enable T4 Optimizations

In [None]:
# Call the project's T4 optimization hook
enable_t4_optimizations()

# Transformer-level switches handed to UnifiedConfig
transformer_options = {
    "fused_qkv": True,
    "fused_mlp": True,
    "use_memory_efficient_attention": True,
}

# Derive the training config from the TINY preset.
# NOTE(review): assumes `update` returns the updated config rather than mutating
# in place and returning None — confirm against vishwamai.model.
config = VISHWAMAI_TINY.update(
    dtype="fp16",
    use_mixed_precision=True,
    use_flash_attention=True,
    unified=UnifiedConfig(transformer=transformer_options),
)

## Prepare Training Data

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Training split of the raw WikiText-2 corpus
dataset = load_dataset(path="wikitext", name="wikitext-2-raw-v1", split="train")

# GPT-2 tokenizer; the EOS token is reused as the padding token
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch, truncated and padded to 128 tokens."""
    return tokenizer(
        examples['text'],
        max_length=128,
        padding='max_length',
        truncation=True,
    )

# Tokenize the dataset and drop the original columns so only tokenizer outputs remain.
# Rows are returned as torch tensors: with plain Python lists the default DataLoader
# collate builds a per-position list of tensors, and `batch['input_ids'].to('cuda')`
# in the training loop would fail because a list has no `.to()`.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names
).with_format("torch")

## Create Model and Optimizer

In [None]:
import torch.optim as optim
from torch.cuda.amp import GradScaler

# Build the model from the config and move it onto the GPU
model = create_model(config=config)
model.to('cuda')

# AdamW over every model parameter, plus a gradient scaler for the AMP training loop.
# NOTE(review): if dtype="fp16" makes the parameters themselves half precision,
# GradScaler cannot unscale FP16 gradients — confirm the parameters stay FP32.
optimizer = optim.AdamW(params=model.parameters(), lr=1e-4)
scaler = GradScaler()

num_parameters = sum(param.numel() for param in model.parameters())
print(f"Model parameters: {num_parameters:,}")

## Training Loop

In [None]:
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
import time

# Run configuration
batch_size = 32
num_epochs = 3

# Shuffled mini-batches drawn from the tokenized dataset
train_dataloader = DataLoader(
    dataset=tokenized_dataset,
    batch_size=batch_size,
    shuffle=True,
)

# Switch to training mode once, then iterate over the epochs
model.train()
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")

    total_loss = 0
    start_time = time.time()

    for batch in tqdm(train_dataloader):
        # Transfer the two fields the model consumes onto the GPU
        input_ids, attention_mask = (
            batch[field].to('cuda') for field in ('input_ids', 'attention_mask')
        )

        # Forward pass inside the autocast region.
        # NOTE(review): no labels are passed, yet a 'loss' entry is read from the
        # outputs — presumably the model derives targets from input_ids; confirm.
        with torch.cuda.amp.autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = outputs['loss']

        # Scaled backward pass, optimizer step via the scaler, then clear gradients
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

        total_loss += loss.item()

    # Epoch summary: mean per-batch loss and wall-clock duration
    avg_loss = total_loss / len(train_dataloader)
    elapsed_time = time.time() - start_time
    print(f"Average loss: {avg_loss:.4f}")
    print(f"Time taken: {elapsed_time:.2f} seconds")

    # Memory figures as reported by the project helper
    memory_stats = get_memory_stats()
    print(f"Memory allocated: {memory_stats['allocated_gb']:.2f} GB")
    print(f"Max memory allocated: {memory_stats['max_allocated_gb']:.2f} GB")

## Save Model

In [None]:
# Save model and config
torch.save({
    'model_state_dict': model.state_dict(),
    'config': config.to_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, 'vishwamai_model.pt')

# Save to Google Drive (optional)
from google.colab import drive
drive.mount('/content/drive')
!cp vishwamai_model.pt /content/drive/MyDrive/

## Test Model Generation

In [None]:
# Smoke-test generation from a short prompt
model.eval()
prompt = "The quick brown fox"
encoded_prompt = tokenizer(prompt, return_tensors='pt')
input_ids = encoded_prompt.input_ids.to('cuda')

# Generation settings forwarded to model.generate
generation_kwargs = dict(
    max_length=50,
    num_return_sequences=1,
    temperature=0.7,
)

with torch.no_grad():
    outputs = model.generate(input_ids, **generation_kwargs)

# Decode the first returned sequence back to text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated text:\n", generated_text)

## Performance Analysis

In [None]:
# Measure inference speed
import time
import numpy as np

def measure_inference_time(model, input_ids, num_runs=100, warmup_runs=3):
    """Measure the latency of repeated forward passes of ``model`` on ``input_ids``.

    CUDA kernels are launched asynchronously, so when the input lives on a CUDA
    device the device is synchronized before each clock read; otherwise the timer
    would capture kernel-launch overhead instead of execution time. A few untimed
    warm-up passes run first so one-off initialization costs are excluded.

    Args:
        model: Callable module invoked as ``model(input_ids=input_ids)``.
        input_ids: Input tensor passed to the model unchanged on every run.
        num_runs: Number of timed forward passes.
        warmup_runs: Number of untimed forward passes executed before timing.

    Returns:
        Tuple ``(mean, std)`` of the per-run latency in seconds.
    """
    # Synchronize only when the work actually runs on a CUDA device
    on_cuda = bool(getattr(input_ids, "is_cuda", False))

    times = []
    model.eval()
    with torch.no_grad():
        for _ in range(warmup_runs):
            _ = model(input_ids=input_ids)
        for _ in range(num_runs):
            if on_cuda:
                torch.cuda.synchronize()
            # perf_counter is monotonic and higher resolution than time.time()
            start = time.perf_counter()
            _ = model(input_ids=input_ids)
            if on_cuda:
                torch.cuda.synchronize()
            times.append(time.perf_counter() - start)
    return np.mean(times), np.std(times)

# Sweep over input lengths and report latency and throughput for each
seq_lengths = [32, 64, 128, 256, 512]
for length in seq_lengths:
    print(f"\nSequence length: {length}")

    # Random token ids for a single sequence of the current length, moved to the GPU
    dummy_input = torch.randint(low=0, high=config.vocab_size, size=(1, length)).to('cuda')
    mean_time, std_time = measure_inference_time(model, dummy_input)

    mean_ms = mean_time * 1000
    std_ms = std_time * 1000
    tokens_per_second = length / mean_time
    print(f"Average inference time: {mean_ms:.2f} ms ± {std_ms:.2f} ms")
    print(f"Tokens per second: {tokens_per_second:.0f}")