# VishwamAI Pretraining on Google Colab

This notebook provides pretraining setup for VishwamAI model with GPU optimizations.

In [None]:
# Check GPU availability
!nvidia-smi

In [None]:
# Install core dependencies
!pip install torch==2.4.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install transformers==4.34.0 datasets accelerate

# Clone repository
!git clone https://github.com/kasinadhsarma/VishwamAI.git
%cd VishwamAI
!pip install -e .

In [None]:
import torch
import json
from datasets import load_dataset
from vishwamai.model_utils import load_model, get_gpu_memory

# Check GPU setup
gpu_name = torch.cuda.get_device_name(0)
gpu_memory = get_gpu_memory()
print(f"Using GPU: {gpu_name} ({gpu_memory:.1f} GB)")

# Determine optimal configuration
if 'a100' in gpu_name.lower():
    gpu_type = 'A100_optimized'
elif 'v100' in gpu_name.lower():
    gpu_type = 'V100_optimized'
else:
    gpu_type = 'T4_optimized'

print(f"Using configuration: {gpu_type}")

In [None]:
# Load and customize configuration
config_path = "configs/config_optimized.json"
with open(config_path) as f:
    config = json.load(f)

# Update configuration based on GPU type
gpu_config = config['colab_specific'][gpu_type]
config['model_config'].update({
    'dim': gpu_config['dim'],
    'batch_size': gpu_config['batch_size'],
    'max_seq_len': gpu_config['max_seq_len']
})

print("Model configuration:")
print(json.dumps(config['model_config'], indent=2))

In [None]:
# Load pretrain datasets
datasets = {
    "math": load_dataset("gsm8k", split="train"),
    "reasoning": load_dataset("cais/mmlu", split="train")
}

print("Loaded datasets:")
for name, dataset in datasets.items():
    print(f"{name}: {len(dataset)} examples")

In [None]:
# Initialize model
model = load_model(
    config_path=config_path,
    device="cuda",
    use_cache=False  # Disable cache for pretraining
)

print(f"Model parameters: {sum(p.numel() for p in model.parameters())/1e6:.1f}M")

In [None]:
# Configure training
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./pretrain_output",
    num_train_epochs=3,
    per_device_train_batch_size=gpu_config['batch_size'],
    gradient_accumulation_steps=gpu_config['gradient_accumulation'],
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=100,
    logging_steps=10,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    fp16=True,
    gradient_checkpointing=True
)

In [None]:
# Start pretraining
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets["math"],
    eval_dataset=datasets["reasoning"]
)

trainer.train()

In [None]:
# Save pretrained model
trainer.save_model("final_model")
print("Model saved to 'final_model' directory")

# Test loading saved model
test_model = load_model(
    config_path=config_path,
    pretrained_path="final_model",
    device="cuda"
)
print("Successfully loaded pretrained model")

In [None]:
# Verify model performance
test_input = torch.randint(0, model.args.vocab_size, (1, 128)).cuda()
with torch.inference_mode():
    output = test_model(test_input)
print(f"Test output shape: {output.shape}")