# VishwamAI Fine-tuning on Google Colab

This notebook provides a simplified interface for fine-tuning VishwamAI on your specific task.

In [None]:
# Verify GPU
!nvidia-smi

# Install dependencies
!pip install torch==2.4.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install transformers==4.34.0 datasets accelerate

# Clone repository
!git clone https://github.com/VishwamAI/VishwamAI.git
%cd VishwamAI
!pip install -r requirements.txt

In [None]:
import torch
import json
from transformers import TrainingArguments
from datasets import load_dataset
from vishwamai.model_utils import load_model, get_gpu_memory

# Check GPU setup
# Fail fast with an actionable message: without this check a CPU-only runtime
# surfaces as an opaque CUDA error from get_device_name().
if not torch.cuda.is_available():
    raise RuntimeError(
        "No CUDA GPU detected. In Colab choose Runtime > Change runtime type > GPU, "
        "then re-run the notebook from the top."
    )

gpu_name = torch.cuda.get_device_name(0)
gpu_memory = get_gpu_memory()
print(f"Using GPU: {gpu_name} ({gpu_memory:.1f} GB)")

# Determine optimal configuration
# Substring of the (lower-cased) device name -> profile name. The profile name
# is used later as a key into config['colab_specific'].
gpu_profiles = {
    'a100': 'A100_optimized',
    'v100': 'V100_optimized',
}
gpu_name_lower = gpu_name.lower()
gpu_type = next(
    (profile for marker, profile in gpu_profiles.items() if marker in gpu_name_lower),
    'T4_optimized',  # fallback for T4 and for any GPU not listed above
)

print(f"Using configuration: {gpu_type}")

In [None]:
# Load configuration
config_path = "configs/config_optimized.json"
with open(config_path, encoding="utf-8") as f:
    config = json.load(f)

# Update with GPU-specific settings
colab_profiles = config['colab_specific']
if gpu_type not in colab_profiles:
    # Same exception type as a plain lookup, but say which profiles do exist.
    raise KeyError(
        f"No '{gpu_type}' profile in {config_path}; available: {sorted(colab_profiles)}"
    )
gpu_config = colab_profiles[gpu_type]

# Copy the per-GPU overrides into the in-memory model configuration.
config['model_config'].update(
    {key: gpu_config[key] for key in ('dim', 'batch_size', 'max_seq_len')}
)
# NOTE(review): these overrides live only in the `config` dict. The model cells
# call load_model(config_path=...), i.e. they pass the path of the unmodified
# file. Unless load_model applies 'colab_specific' itself, the model is built
# from the on-disk 'model_config' and the values printed below are not the ones
# in effect -- confirm against vishwamai.model_utils.load_model.

print("Model configuration:")
print(json.dumps(config['model_config'], indent=2))

In [None]:
# Load base model
# use_cache=False: better memory efficiency for training.
model = load_model(config_path=config_path, device="cuda", use_cache=False)

# Total number of elements across all parameter tensors, reported in millions.
param_count = 0
for param in model.parameters():
    param_count += param.numel()
print(f"Model loaded with {param_count/1e6:.1f}M parameters")

In [None]:
# Load datasets - modify these for your specific task
# Both datasets ship several configurations and define no default, so the
# configuration name is mandatory: load_dataset raises ValueError without it.
#   gsm8k:     "main" or "socratic"
#   cais/mmlu: one name per subject, or "all" for every subject combined
train_dataset = load_dataset("gsm8k", "main", split="train")
eval_dataset = load_dataset("cais/mmlu", "all", split="validation")
# NOTE(review): the two datasets come from different tasks and do not share a
# column schema; when adapting this notebook, prefer an evaluation split of the
# same dataset you train on.

print(f"Training examples: {len(train_dataset)}")
print(f"Evaluation examples: {len(eval_dataset)}")

In [None]:
# Configure training
output_dir = "./finetune_output"
!mkdir -p $output_dir

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=gpu_config['batch_size'],
    per_device_eval_batch_size=gpu_config['batch_size'],
    gradient_accumulation_steps=gpu_config['gradient_accumulation'],
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=100,
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,
    evaluation_strategy="steps",
    eval_steps=100,
    fp16=True,
    gradient_checkpointing=True
)

In [None]:
# Initialize trainer
from transformers import Trainer

# NOTE(review): no tokenizer, preprocessing step or data collator is passed
# here, and none is visible where the datasets are loaded -- confirm that the
# model can consume the datasets in the form they are handed over.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": eval_dataset,
}
trainer = Trainer(**trainer_kwargs)

# Start training
# Left as a bare expression so the notebook displays the returned training summary.
trainer.train()

In [None]:
# Save model
model_save_path = "final_model"
trainer.save_model(model_save_path)
print(f"Model saved to {model_save_path}")

# Test loading
# NOTE(review): trainer.save_model() writes into a directory; presumably
# load_model accepts that directory as pretrained_path -- confirm.
test_model = load_model(
    config_path=config_path,
    device="cuda",
    pretrained_path=model_save_path
)
print("Model loaded successfully")

# Quick test
# Exercise the *reloaded* model only: take the vocabulary size from test_model
# (not from the in-memory training model), switch to eval mode so that layers
# such as dropout behave deterministically, and create the input on the GPU
# directly instead of building it on the CPU and copying it over.
test_model.eval()
vocab_size = test_model.args.vocab_size
test_input = torch.randint(0, vocab_size, (1, 128), device="cuda")
with torch.inference_mode():
    output = test_model(test_input)
print(f"Test output shape: {output.shape}")