# Vishwamai Model Pretraining on Google Colab

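This notebook installs the dependencies, tokenizes the GSM8K, MMLU, MMLU-Pro and MMMLU datasets, pretrains the VishwamAI model on a Google Colab GPU runtime, and uploads the resulting weights and tokenizer to the Hugging Face Hub.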

In [None]:
# Check if we're using a GPU
# Prints the NVIDIA driver / GPU status table for this runtime. The model cell
# further down loads with device="cuda", so a GPU runtime must be attached
# before continuing.
!nvidia-smi

In [None]:
# Install required packages
!pip install torch transformers datasets huggingface_hub

In [None]:
# Clone the repository
!git clone https://github.com/VishwamAI/VishwamAI.git
%cd VishwamAI
!pip install -r requirements.txt

In [None]:
# Login to HuggingFace
# Authenticates this runtime with the Hugging Face Hub. The final cell creates a
# private model repo and uploads to it through HfApi, so run this before it.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
import os
import shutil
from pathlib import Path

import torch
from datasets import load_dataset
from huggingface_hub import HfApi, upload_folder
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

from vishwamai.model_utils import load_model
from vishwamai.trainer import Trainer, TrainingArgs

In [None]:
def setup_tokenizer():
    """Build the tokenizer used for pretraining.

    Loads the ``bert-large-uncased`` tokenizer through ``AutoTokenizer`` with a
    2048-token ``model_max_length``, lower-casing enabled, right-side
    truncation and padding, and the fast implementation requested. The marker
    tokens ``[MEMORY]``, ``[REASONING]``, ``[CACHE]`` and ``[STEP]`` are then
    registered as additional special tokens.

    Returns:
        The configured tokenizer object.
    """
    load_options = dict(
        model_max_length=2048,  # Increased for GPU training
        do_lower_case=True,
        truncation_side="right",
        padding_side="right",
        use_fast=True,
    )
    bert_tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased", **load_options)

    # Marker tokens that the dataset preparation step writes into every example.
    marker_tokens = ["[MEMORY]", "[REASONING]", "[CACHE]", "[STEP]"]
    bert_tokenizer.add_special_tokens({"additional_special_tokens": marker_tokens})
    return bert_tokenizer

In [None]:
def prepare_datasets(tokenizer, max_length=2048):
    """Load the four source datasets and tokenize them for pretraining.

    Every dataset is mapped with ``batched=True``, which hands the mapped
    function a mapping of column name -> list of values. The batch is split
    back into per-example rows before the text is rendered, so each example
    produces its own tokenized sequence instead of the whole batch being
    interpolated into one string.

    Args:
        tokenizer: Callable tokenizer; it is invoked with a list of strings
            plus padding/truncation keyword arguments.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        dict mapping dataset name ("gsm8k", "mmlu", "mmlu_pro", "mmmlu") to
        the result of ``dataset.map`` with the original columns removed.
    """
    print("Loading datasets...")
    # NOTE(review): some of these Hub repos appear to require an explicit config
    # name and/or may not publish a "train" split -- confirm each call against
    # its dataset card before a full run.
    datasets = {
        "gsm8k": load_dataset("openai/gsm8k", split="train"),
        "mmlu": load_dataset("cais/mmlu", split="train"),
        "mmlu_pro": load_dataset("TIGER-Lab/MMLU-Pro", split="train"),
        "mmmlu": load_dataset("openai/MMMLU", split="train")
    }

    def prepare_text(example):
        """Render ONE example (column name -> single value) as training text."""
        # NOTE(review): only a "solution" column is appended to the question;
        # confirm the column names each dataset actually exposes.
        if "question" in example:
            text = example["question"]
            if "solution" in example:
                text = f"[STEP] Question: {text} [STEP] Solution: {example['solution']}"
        else:
            text = example["text"] if "text" in example else str(example)
        text = f"[MEMORY] [CACHE] [REASONING] {text}"
        return text

    def tokenize_function(examples):
        """Tokenize one batch (column name -> list of values), row by row."""
        columns = list(examples.keys())
        num_rows = len(examples[columns[0]]) if columns else 0
        # Rebuild each row as its own dict so prepare_text sees scalar values.
        texts = [
            prepare_text({column: examples[column][row] for column in columns})
            for row in range(num_rows)
        ]
        return tokenizer(
            texts,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
            return_special_tokens_mask=True,
            return_token_type_ids=True
        )

    tokenized_datasets = {}
    for name, dataset in datasets.items():
        print(f"Processing {name} dataset...")
        tokenized_datasets[name] = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=dataset.column_names
        )

    return tokenized_datasets

In [None]:
def create_dataloaders(tokenized_datasets, batch_size=16):  # Increased for GPU
    """Create one shuffled training DataLoader per dataset.

    Args:
        tokenized_datasets: Mapping of name -> dataset whose items are mappings
            with ``input_ids`` and ``attention_mask`` entries. The entries may
            be tensors or plain sequences of ints.
        batch_size: Number of examples per batch.

    Returns:
        dict mapping each dataset name to a ``DataLoader`` that yields dicts
        with ``input_ids``, ``attention_mask`` and ``labels`` tensors, where
        ``labels`` is a copy of ``input_ids``.
    """
    def collate_fn(examples):
        # torch.as_tensor accepts both tensors and Python lists, so items that
        # come back from a dataset as plain lists can still be stacked.
        input_ids = torch.stack(
            [torch.as_tensor(example['input_ids']) for example in examples]
        )
        attention_mask = torch.stack(
            [torch.as_tensor(example['attention_mask']) for example in examples]
        )
        # Labels are an independent copy of the inputs.
        labels = input_ids.clone()
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

    dataloaders = {}
    for name, dataset in tokenized_datasets.items():
        dataloaders[name] = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=True,
            collate_fn=collate_fn,
            pin_memory=True
        )

    return dataloaders

In [None]:
# Initialize training
tokenizer = setup_tokenizer()
print("Preparing datasets...")
tokenized_datasets = prepare_datasets(tokenizer)
dataloaders = create_dataloaders(tokenized_datasets)

# Create combined dataset
# The concatenated dataset is routed through create_dataloaders so that the
# combined loader uses the same collate function as the per-dataset loaders:
# its batches are stacked tensors and carry a 'labels' entry. A bare DataLoader
# would fall back to the default collation and produce neither.
combined_dataset = torch.utils.data.ConcatDataset(list(tokenized_datasets.values()))
combined_dataloader = create_dataloaders(
    {"combined": combined_dataset},
    batch_size=16,
)["combined"]

In [None]:
# Load model
print("Loading model...")
config_path = "vishwamai/configs/config_optimized.json"

# Fixed size overrides passed to load_model alongside the config file. Nothing
# here inspects the GPU; edit these values by hand if the model does not fit.
model_overrides = {
    "hidden_size": 2048,  # Reduced from original
    "num_hidden_layers": 24,
    "num_attention_heads": 32,
    "intermediate_size": 8192,
}
model = load_model(config_path, device="cuda", **model_overrides)  # Use GPU

In [None]:
# Configure training arguments
# All tunable hyperparameters live in one mapping so they can be reviewed at a
# glance before being handed to TrainingArgs.
training_options = {
    "output_dir": "pretrain_checkpoints",
    "num_epochs": 3,
    "batch_size": 16,
    "gradient_accumulation_steps": 4,
    "learning_rate": 1e-4,
    "warmup_steps": 1000,
    "weight_decay": 0.1,
    "max_grad_norm": 1.0,
    "save_steps": 500,
    "logging_steps": 100,
    "use_fsdp": True,
    "mixed_precision": True,
    "gradient_checkpointing": True,
}
training_args = TrainingArgs(**training_options)

# Train on the combined loader; evaluation reuses the gsm8k loader.
# NOTE(review): that loader is built from the gsm8k "train" split, so the
# evaluation data is also part of the training data -- confirm this is intended.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataloader=combined_dataloader,
    eval_dataloader=dataloaders['gsm8k'],
)

print("Starting pretraining...")
trainer.train()

In [None]:
# Save model and upload to HuggingFace Hub
output_dir = "vishwamai_pretrained"
os.makedirs(output_dir, exist_ok=True)

print("Saving model...")
state_dict = model.state_dict()
torch.save(state_dict, os.path.join(output_dir, "pytorch_model.bin"))
tokenizer.save_pretrained(output_dir)

# Copy config and model card
# Both paths are relative to the current working directory.
# NOTE(review): config_path is copied as-is; the size overrides passed to
# load_model are not written into it -- confirm that config.json still matches
# the saved weights.
shutil.copy("MODEL_CARD.md", os.path.join(output_dir, "README.md"))
shutil.copy(config_path, os.path.join(output_dir, "config.json"))

# Upload to HuggingFace Hub
repo_name = "kasinadhsarma/vishwamai-model"
api = HfApi()

# exist_ok=True tolerates an already-existing repo while still surfacing real
# failures (authentication, permissions, network) instead of printing them and
# carrying on to an upload that would fail anyway.
api.create_repo(repo_name, private=True, repo_type="model", exist_ok=True)

api.upload_folder(
    folder_path=output_dir,
    repo_id=repo_name,
    repo_type="model"
)

print(f"Model uploaded successfully to: https://huggingface.co/{repo_name}")