# VishwamAI Math Integration - GSM8k Testing

Initial testing notebook for mathematical reasoning with minimal configuration.

In [1]:
import os
import torch
import gc
from typing import Dict, List
from datasets import load_dataset
from torch.utils.data import DataLoader, Subset
from functools import partial
from pathlib import Path

from vishwamai.model import VishwamaiConfig, VishwamaiModel
from vishwamai.training import VishwamaiTrainer
from vishwamai.conceptual_tokenizer import ConceptualTokenizer, ConceptualTokenizerConfig
from vishwamai.generate import VishwamaiGenerator, GenerationConfig

In [2]:
# Clear any existing PyTorch memory
if torch.cuda.is_available():
    torch.cuda.empty_cache()
gc.collect()

# Force using CPU for initial testing
device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


  return torch._C._cuda_getDeviceCount() > 0


In [3]:
# Load small subset of data for testing
def load_test_data(num_samples=100):
    train_full = load_dataset('parquet', data_files='gsm8k/train-00000-of-00001.parquet', split='train')
    test_full = load_dataset('parquet', data_files='gsm8k/test-00000-of-00001.parquet', split='train')
    
    # Take small subsets
    train_subset = Subset(train_full, range(min(num_samples, len(train_full))))
    test_subset = Subset(test_full, range(min(num_samples//10, len(test_full))))
    
    return train_subset, test_subset, train_full

train_dataset, test_dataset, full_dataset = load_test_data()
print(f"Train samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")

Train samples: 100
Test samples: 10


In [4]:
# Initialize and train tokenizer
tokenizer_config = ConceptualTokenizerConfig(
    vocab_size=32000,
    max_length=128  # Reduced for testing
)
tokenizer = ConceptualTokenizer(tokenizer_config)

# Get sample texts for tokenizer training
train_texts = []
for i in range(min(1000, len(full_dataset))):
    item = full_dataset[i]
    train_texts.append(f"Question: {item['question']}\nAnswer: {item['answer']}")

print("Training tokenizer...")
tokenizer.train_tokenizer(train_texts)
print("Tokenizer trained")

# Initialize tiny model
model_config = VishwamaiConfig(
    vocab_size=32000,
    hidden_size=128,  # Tiny size for testing
    num_hidden_layers=2,  # Minimum layers
    num_attention_heads=4,  # Reduced heads
    max_seq_len=128,  # Reduced sequence length
    intermediate_size=256  # Small FFN size
)

model = VishwamaiModel(model_config).to(device)
print("Model initialized")

Training tokenizer...
Tokenizer trained
Model initialized


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: training_data.txt
  input_format: 
  model_prefix: conceptual
  model_type: UNIGRAM
  vocab_size: 196
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 8
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 1
  bos_id: 2
  eos_id: 3
  pad_id: 0
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differ

In [5]:
class QuietVishwamaiTrainer(VishwamaiTrainer):
    def compute_loss(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
        """Compute loss without debug prints"""
        labels = batch['labels']
        model_inputs = {
            'input_ids': batch['input_ids'],
            'attention_mask': batch['attention_mask']
        }
        
        if 'concept_ids' in batch:
            model_inputs['concept_ids'] = batch['concept_ids']
        
        outputs = self.model(**model_inputs)
        
        # Get sequence lengths and use minimum
        batch_size, seq_length_output, vocab_size = outputs.size()
        batch_size_labels, seq_length_labels = labels.size()
        min_seq_length = min(seq_length_output, seq_length_labels)
        
        # Truncate and reshape
        outputs = outputs[:, :min_seq_length, :].reshape(-1, vocab_size)
        labels = labels[:, :min_seq_length].reshape(-1)
        
        return torch.nn.functional.cross_entropy(outputs, labels)

def math_collate_fn(batch, tokenizer):
    questions = [b['question'] for b in batch]
    answers = [b['answer'] for b in batch]
    
    inputs = [f"Question: {q}\nAnswer: {a}" for q, a in zip(questions, answers)]
    encoded_inputs = [tokenizer.encode(text) for text in inputs]
    
    max_len = max(len(x) for x in encoded_inputs)
    padded_inputs = [x + [tokenizer.pad_token_id] * (max_len - len(x)) for x in encoded_inputs]
    attention_masks = [[1] * len(x) + [0] * (max_len - len(x)) for x in encoded_inputs]
    
    return {
        'input_ids': torch.tensor(padded_inputs),
        'attention_mask': torch.tensor(attention_masks),
        'labels': torch.tensor(padded_inputs).clone()
    }

train_loader = DataLoader(
    train_dataset,
    batch_size=4,
    collate_fn=partial(math_collate_fn, tokenizer=tokenizer),
    shuffle=True
)

test_loader = DataLoader(
    test_dataset,
    batch_size=4,
    collate_fn=partial(math_collate_fn, tokenizer=tokenizer)
)

print("Data loaders prepared")

Data loaders prepared


In [6]:
# Initialize trainer with quiet version
trainer = QuietVishwamaiTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_loader,
    eval_dataset=test_loader,
    device=device,
    optimizer_class=lambda params: torch.optim.AdamW(params, lr=1e-4),
    use_wandb=False
)

print("Trainer initialized")

Trainer initialized


In [7]:
# Setup save directory
save_dir = Path("gsm8k_test_model")
save_dir.mkdir(exist_ok=True)

# Train for few steps
try:
    print("Starting training...")
    trainer.train(
        num_epochs=2,
        save_dir=save_dir,
        evaluation_steps=10,
        save_steps=50,
        logging_steps=5,
        fp16=False  # Disable mixed precision since we're on CPU
    )
    print("Training completed")
except Exception as e:
    print(f"Training error: {e}")

Starting training...


Epoch 1: 100%|██████████| 25/25 [00:10<00:00,  2.38it/s, loss=9.8] 


Epoch 1 average loss: 10.2132


Epoch 2: 100%|██████████| 25/25 [00:07<00:00,  3.14it/s, loss=8.87]


Epoch 2 average loss: 9.3633
Training completed


In [8]:
# Test generation
try:
    generator = VishwamaiGenerator(
        model=model,
        tokenizer=tokenizer,
        config=GenerationConfig(
            max_length=128,
            temperature=0.7,
            top_p=0.9
        )
    )

    test_question = "If John has 5 apples and gives 2 to Mary, how many apples does John have left?"
    print("Generating answer...")
    generated = generator.generate(test_question)
    print(f"\nQuestion: {test_question}")
    print(f"Answer: {generated[0]}")
except Exception as e:
    print(f"Generation error: {e}")

Generating answer...

Question: If John has 5 apples and gives 2 to Mary, how many apples does John have left?
Answer: 
