# VishwamAI Training on Google Colab (T4)

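This notebook fine-tunes the `t5-base` checkpoint on GSM8K using a Google Colab T4 GPU, evaluates it on a subset of the MMLU test set, and pushes the resulting VishwamAI model to the Hugging Face Hub.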

**Requirements:**
- Google Colab with T4 GPU
- Hugging Face account and an access token with write permission (the notebook pushes checkpoints to the Hub)
- ~16GB GPU memory

In [None]:
# Show the GPU attached to this runtime before doing any work.
!nvidia-smi

In [None]:
!pip install -q transformers datasets torch accelerate
!pip install -q sentencepiece protobuf

In [None]:
import inspect
import os
from datetime import datetime

import torch
from datasets import concatenate_datasets, load_dataset
from huggingface_hub import notebook_login
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator
)

In [None]:
# Login to Hugging Face
# The training arguments further down set push_to_hub=True, so authenticate
# before any training cell runs.
notebook_login()

## Configuration

In [None]:
# Training config: every tunable value used by the cells below lives here.
CONFIG = dict(
    model_name='t5-base',  # Base model to start from
    repo_id='kasinadhsarma/vishwamai-model',  # Your HF repo
    max_length=512,  # Sequence length
    batch_size=4,  # Per device batch size
    grad_accum=4,  # Gradient accumulation steps
    epochs=3,
    lr=2e-5,
    warmup_steps=100,
    save_steps=200,
    eval_steps=200,
)

## Data Processing

In [None]:
def process_gsm8k(example):
    """Convert one GSM8K record into an input/target text pair.

    Args:
        example: Mapping with a 'question' string and an 'answer' string. The
            answer is expected to contain '#### <final answer>' after the
            worked solution.

    Returns:
        dict with 'input_text' ("solve: <question>") and 'target_text' (the
        text following the '####' marker, stripped). When the marker is
        missing, the whole answer (stripped) is used instead of raising
        IndexError.
    """
    parts = example['answer'].split('####')
    # parts[1] matches the historical behaviour for well-formed answers;
    # parts[0] is the fallback for records without the marker.
    final_answer = parts[1] if len(parts) > 1 else parts[0]
    return {
        'input_text': f"solve: {example['question']}",
        'target_text': final_answer.strip()
    }

def process_mmlu(example):
    """Convert one MMLU record into an input/target text pair.

    Args:
        example: Mapping with a 'question' string, an integer 'answer' index
            (0-3), and the four option texts either as a 'choices' sequence
            (the layout used by 'cais/mmlu') or as separate 'A', 'B', 'C',
            'D' entries.

    Returns:
        dict with 'input_text' (the question followed by the lettered
        options) and 'target_text' ("The answer is <letter>").
    """
    options = ['A', 'B', 'C', 'D']
    # cais/mmlu rows carry the option texts in a 'choices' list; indexing the
    # row by letter raises KeyError there. Letter columns remain supported as
    # a fallback for datasets that use that layout.
    if 'choices' in example:
        choices = list(example['choices'])
    else:
        choices = [example[opt] for opt in options]
    formatted_options = '\n'.join(
        f"{opt}) {choice}" for opt, choice in zip(options, choices)
    )
    return {
        'input_text': f"answer: {example['question']}\n\nOptions:\n{formatted_options}",
        'target_text': f"The answer is {options[example['answer']]}"
    }

# Load datasets
print("Loading datasets...")
gsm8k_train = load_dataset('gsm8k', 'main', split='train')
gsm8k_test = load_dataset('gsm8k', 'main', split='test')

# 'cais/mmlu' has no 'mathematics' / 'computer_science' / 'physics' configs;
# subjects are published per level, so the college-level ones are used here.
mmlu_subjects = ['college_mathematics', 'college_computer_science', 'college_physics']
mmlu_datasets = []

for subject in mmlu_subjects:
    ds = load_dataset('cais/mmlu', subject)
    mmlu_datasets.append(ds)

# Process datasets
print("Processing datasets...")
gsm8k_train = gsm8k_train.map(process_gsm8k)
gsm8k_test = gsm8k_test.map(process_gsm8k)

# Per-subject MMLU configs ship 'dev', 'validation' and 'test' splits but no
# 'train' split, so 'validation' stands in for the training portion.
# Dataset objects have no .concatenate() method; concatenate_datasets() is the
# library function that merges them.
mmlu_train = concatenate_datasets([ds['validation'] for ds in mmlu_datasets])
mmlu_test = concatenate_datasets([ds['test'] for ds in mmlu_datasets])

mmlu_train = mmlu_train.map(process_mmlu)
mmlu_test = mmlu_test.map(process_mmlu)

## Model Setup

In [None]:
# Load model and tokenizer
print("Loading model and tokenizer...")
# CONFIG['model_name'] is a T5 checkpoint, i.e. an encoder-decoder model.
# AutoModelForCausalLM has no mapping for T5 configurations and refuses to
# load it, so the seq2seq auto class is used instead.
model = AutoModelForSeq2SeqLM.from_pretrained(CONFIG['model_name'])
tokenizer = AutoTokenizer.from_pretrained(CONFIG['model_name'])

# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of 'input_text' / 'target_text' pairs for seq2seq training.

    Args:
        examples: Batch mapping (as passed by ``Dataset.map(batched=True)``)
            with 'input_text' and 'target_text' lists of strings.

    Returns:
        The tokenizer output for the inputs, padded/truncated to
        CONFIG['max_length'], with an added 'labels' entry holding the target
        token ids. Padding positions in 'labels' are set to -100 so the loss
        ignores them.
    """
    # Plain Python lists are returned (no return_tensors): Dataset.map stores
    # the values in Arrow format anyway, so building tensors here is wasted work.
    model_inputs = tokenizer(
        examples['input_text'],
        max_length=CONFIG['max_length'],
        padding='max_length',
        truncation=True,
    )

    # text_target replaces the deprecated as_target_tokenizer() context manager.
    labels = tokenizer(
        text_target=examples['target_text'],
        max_length=CONFIG['max_length'],
        padding='max_length',
        truncation=True,
    )

    # Without this masking the model is also trained to predict the pad token
    # for every padded position, which dominates the loss for short targets.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels['input_ids']
    ]
    return model_inputs

# Tokenize datasets
# Both GSM8K splits go through the same batched tokenization; the original
# text columns are dropped so only the tokenizer outputs remain.
print("Tokenizing datasets...")
train_dataset, eval_dataset = (
    split.map(
        tokenize_function,
        batched=True,
        remove_columns=split.column_names
    )
    for split in (gsm8k_train, gsm8k_test)
)

## Training Setup

In [None]:
# Training arguments
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`,
# and the install cell does not pin a version, so pick whichever keyword the
# installed TrainingArguments actually accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_arg = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    # Timestamped so repeated runs do not overwrite earlier checkpoints.
    output_dir=f"./results_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
    per_device_train_batch_size=CONFIG['batch_size'],
    per_device_eval_batch_size=CONFIG['batch_size'],
    gradient_accumulation_steps=CONFIG['grad_accum'],
    num_train_epochs=CONFIG['epochs'],
    learning_rate=CONFIG['lr'],
    # NOTE(review): T5 checkpoints have a reputation for fp16 overflow; if the
    # logged loss turns into NaN, try disabling this - TODO confirm on a run.
    fp16=True,  # Mixed precision training
    warmup_steps=CONFIG['warmup_steps'],
    logging_steps=10,
    save_steps=CONFIG['save_steps'],
    eval_steps=CONFIG['eval_steps'],
    load_best_model_at_end=True,
    push_to_hub=True,
    hub_model_id=CONFIG['repo_id'],
    hub_strategy="every_save",
    **{_eval_strategy_arg: "steps"}
)

# Initialize trainer
# Collect the constructor arguments first so they can be inspected in one place.
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': eval_dataset,
    'data_collator': default_data_collator,
}
trainer = Trainer(**trainer_kwargs)

## Start Training

In [None]:
# Announce the destination repo, then run the full training loop.
destination_repo = CONFIG['repo_id']
print(f"Starting training... Model will be saved to: {destination_repo}")
trainer.train()

## Evaluation on MMLU

In [None]:
# Prepare MMLU evaluation dataset
# Same tokenization as the training data; the raw columns are removed so only
# model inputs and labels are handed to the trainer.
mmlu_raw_columns = mmlu_test.column_names
mmlu_eval = mmlu_test.map(tokenize_function, batched=True, remove_columns=mmlu_raw_columns)

# Run evaluation
print("Evaluating on MMLU test set...")
mmlu_metrics = trainer.evaluate(eval_dataset=mmlu_eval)
print("MMLU Evaluation Results:", mmlu_metrics, sep="\n")

## Save Final Model

In [None]:
# Push final model to hub
print("Pushing final model to Hugging Face Hub...")
# The commit message carries the current timestamp so uploads are distinguishable.
final_commit_message = f"Final training checkpoint - {datetime.now()}"
trainer.push_to_hub(commit_message=final_commit_message)
print("Training completed!")