# VishwamAI Math Integration Tutorial

This notebook demonstrates how to integrate mathematical reasoning capabilities with VishwamAI using the GSM8K dataset. We'll cover:

1. Dataset preparation and loading
2. Model and tokenizer configuration
3. Training pipeline setup
4. Evaluation and visualization
5. Example inference

## Setup and Dependencies

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import Dataset, DataLoader
from vishwamai.model import VishwamaiModel, VishwamaiConfig
from vishwamai.conceptual_tokenizer import ConceptualTokenizer, ConceptualTokenizerConfig
from vishwamai.training import VishwamaiTrainer

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

## 1. Dataset Preparation

We'll use the GSM8K (Grade School Math 8K) dataset, which contains math word problems with step-by-step solutions.

In [2]:
# Load GSM8K dataset
train_df = pd.read_parquet('gsm8k/train-00000-of-00001.parquet')
test_df = pd.read_parquet('gsm8k/test-00000-of-00001.parquet')

print(f"Training examples: {len(train_df)}")
print(f"Test examples: {len(test_df)}")

# Display sample
print("\nSample problem:")
sample_idx = np.random.randint(len(train_df))
print(f"Question: {train_df.iloc[sample_idx]['question']}")
print(f"Answer: {train_df.iloc[sample_idx]['answer']}")

Training examples: 7473
Test examples: 1319

Sample problem:
Question: Bob is tilling a plot of his garden. The plot is 110 feet wide by 120 feet long. His tiller digs a swath two feet wide, and he can till 1 foot of ground in about 2 seconds. How long will it take him to till this plot of land, in minutes?
Answer: If Bob goes along the side that's 120 feet long, he will till 110 / 2 = 55 rows.
Each of these rows are 120 feet long, so he will push the tiller a total of 120 * 55 = <<120*55=6600>>6,600 feet.
He tills 1 linear foot of ground every 2 seconds, so it will take him 2 * 6,600 = 13,200 seconds to till this plot
13,200 seconds is 13,2000 / 60 = <<13200/60=220>>220 minutes
#### 220


### Custom Dataset Class

In [3]:
class MathDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        question = row['question']
        answer = row['answer']
        
        # Tokenize inputs with padding and truncation
        inputs = self.tokenizer.encode(
            question,
            padding='max_length',
            max_length=self.max_length,
            truncation=True
        )
        
        labels = self.tokenizer.encode(
            answer,
            padding='max_length',
            max_length=self.max_length,
            truncation=True
        )
        
        return {
            'input_ids': torch.tensor(inputs, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long),
            'attention_mask': torch.ones(len(inputs), dtype=torch.long)
        }

## 2. Model Configuration

Configure the model and tokenizer with appropriate parameters for math reasoning tasks.

In [4]:
# Model configuration
model_config = VishwamaiConfig(
    hidden_size=3072,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_dropout_prob=0.1,
    attention_dropout_prob=0.1,
    max_position_embeddings=512,
    vocab_size=32000,
    type_vocab_size=2,
    initializer_range=0.02,
    layer_norm_eps=1e-12,
    pad_token_id=0,
    use_cache=True,
    num_decoder_layers=12
)

# Tokenizer configuration
tokenizer_config = ConceptualTokenizerConfig(
    vocab_size=32000,
    max_length=512,
    concept_tokens=["math", "algebra", "arithmetic", "geometry"],
    reasoning_tokens=["equals", "therefore", "because", "solve", "calculate"],
    special_tokens={
        "pad_token": "[PAD]",
        "unk_token": "[UNK]",
        "bos_token": "[BOS]",
        "eos_token": "[EOS]",
        "sep_token": "[SEP]"
    }
)

# Initialize model and tokenizer
model = VishwamaiModel(model_config)
tokenizer = ConceptualTokenizer(tokenizer_config)

print("Model and tokenizer initialized successfully.")

TypeError: VishwamaiConfig.__init__() got an unexpected keyword argument 'hidden_dropout_prob'

## 3. Training Setup

Configure the training pipeline with appropriate hyperparameters and optimization settings.

In [None]:
# Prepare datasets
train_dataset = MathDataset(train_df, tokenizer)
val_dataset = MathDataset(test_df, tokenizer)

# Create data loaders
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=4,
    pin_memory=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=16,
    shuffle=False,
    num_workers=4,
    pin_memory=True
)

# Training configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

trainer = VishwamaiTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_loader,
    eval_dataset=val_loader,
    device=device,
    optimizer_kwargs={
        'lr': 5e-5,
        'weight_decay': 0.01,
        'beta1': 0.9,
        'beta2': 0.999
    }
)

## 4. Training and Monitoring

In [None]:
# Training metrics collector
metrics_history = []

def metric_callback(metrics):
    metrics_history.append(metrics)
    
# Train the model
trainer.train(
    num_epochs=10,
    save_dir="./checkpoints",
    evaluation_steps=100,
    save_steps=1000,
    logging_steps=10,
    callback=metric_callback,
    fp16=True  # Enable mixed precision training
)

print("Training completed successfully.")

## 5. Evaluation and Visualization

In [None]:
# Prepare metrics for visualization
metrics_df = pd.DataFrame(metrics_history)

# Set plot style
plt.style.use('seaborn')

# Create a figure with multiple subplots
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))

# Plot training and validation loss
ax1.plot(metrics_df['step'], metrics_df['train_loss'], label='Training Loss')
if 'eval_loss' in metrics_df.columns:
    ax1.plot(metrics_df['step'], metrics_df['eval_loss'], label='Validation Loss')
ax1.set_xlabel('Training Steps')
ax1.set_ylabel('Loss')
ax1.set_title('Training and Validation Loss')
ax1.legend()
ax1.grid(True)

# Plot accuracy
if 'accuracy' in metrics_df.columns:
    ax2.plot(metrics_df['step'], metrics_df['accuracy'], label='Accuracy', color='green')
    ax2.set_xlabel('Training Steps')
    ax2.set_ylabel('Accuracy')
    ax2.set_title('Model Accuracy over Training')
    ax2.legend()
    ax2.grid(True)

plt.tight_layout()
plt.show()

## 6. Example Inference

Test the model on some example math problems.

In [None]:
def solve_math_problem(problem, max_length=100):
    # Prepare input
    inputs = tokenizer.encode(
        problem,
        return_tensors="pt",
        max_length=512,
        truncation=True
    ).to(device)

    # Generate output
    outputs = model.generate(
        inputs,
        max_length=max_length,
        num_beams=5,
        early_stopping=True,
        eos_token_id=tokenizer.eos_token_id
    )
    
    # Decode output
    solution = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return solution

# Test examples
test_problems = [
    "If John has 5 apples and gives 2 to Mary, how many apples does he have left?",
    "A train travels at 60 miles per hour. How far will it travel in 2.5 hours?",
    "What is 15% of 200?"
]

for problem in test_problems:
    print(f"\nProblem: {problem}")
    solution = solve_math_problem(problem)
    print(f"Solution: {solution}")