# Memory-Optimized VishwamAI Math Integration

This notebook demonstrates VishwamAI's mathematical capabilities, configured for systems with limited memory, including:
- Problem generation with varying difficulty levels
- Step-by-step problem solving
- Socratic method for mathematical reasoning
- Memory-efficient training and evaluation

## Setup

First, we'll import required libraries and set up memory-efficient configurations.

In [None]:
import sys
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import torch.cuda.amp as amp
from accelerate import Accelerator
from bitsandbytes.optim import AdamW8bit

from vishwamai.architecture import VishwamaiModel, VishwamaiConfig, init_model
from vishwamai.toknizer import ConceptualTokenizer, ConceptualTokenizerConfig
from vishwamai.generate import generate

# Initialize accelerator for memory-efficient operations
# `device` is the device chosen by Accelerate; later cells read it as a
# module-level global (the training/evaluation loops move batches onto it).
accelerator = Accelerator()
device = accelerator.device

# Set up automatic mixed precision
# Module-level gradient scaler; train_epoch uses it to scale the loss before
# backward() and to step the optimizer.
scaler = amp.GradScaler()

print(f"Using device: {device}")
if torch.cuda.is_available():
    # Reports the total memory of CUDA device index 0, converted from bytes to GiB.
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f}GB")

## Model Initialization

Initialize the VishwamAI model with memory-optimized configuration.

In [None]:
# Initialize model configuration optimized for 4GB VRAM
# The inline "Reduced from" notes record the larger value each setting replaced.
# NOTE(review): whether this configuration actually fits in 4GB depends on the
# parameter dtype chosen by init_model — confirm.
config = VishwamaiConfig(
    vocab_size=32000,
    max_seq_length=2048,  # Reduced from 8192
    dim=2048,  # Reduced from 4096
    depth=24,  # Reduced from 32
    num_heads=16,  # Reduced from 32
    mlp_ratio=2.67,  # Reduced from 4.0
    dropout=0.1,
    pad_token_id=0,
    bos_token_id=1,
    eos_token_id=2,
    mask_token_id=3
)

# Initialize model using memory-efficient initialization
# The model is created for `device`, the device selected by Accelerate during setup.
model = init_model(config, device, memory_efficient=True)

# Initialize tokenizer with mathematical concepts
# Vocabulary size and maximum length are taken from the model config so the
# tokenizer and the model agree on both.
# NOTE(review): MemoryEfficientMathAI.generate_text reads
# tokenizer.config.pad_token_id / eos_token_id / bos_token_id, but only the
# token strings are passed here — confirm ConceptualTokenizerConfig provides
# those id attributes.
tokenizer_config = ConceptualTokenizerConfig(
    vocab_size=config.vocab_size,
    max_length=config.max_seq_length,
    model_type="unigram",
    pad_token="<pad>",
    bos_token="<s>",
    eos_token="</s>",
    mask_token="<mask>",
    # Extra tokens naming mathematical concepts.
    concept_tokens=[
        "MATH", "PROBLEM", "SOLUTION", "STEP",
        "EQUATION", "VARIABLE", "FUNCTION", "PROOF"
    ],
    # Extra tokens naming logical connectives used in step-by-step reasoning.
    reasoning_tokens=[
        "BECAUSE", "THEREFORE", "IF", "THEN",
        "GIVEN", "IMPLIES", "SUPPOSE", "CONCLUDE"
    ]
)
tokenizer = ConceptualTokenizer(tokenizer_config)

## Dataset Loading

Load the GSM8K train and test splits, reading only the `question` and `answer` columns to limit memory use.

In [None]:
# Memory-efficient dataset loading
def load_datasets(chunk_size=1000, data_dir='gsm8k'):
    """Load the GSM8K train and test splits from local parquet files.

    Only the ``question`` and ``answer`` columns are read, which keeps the
    resulting DataFrames smaller than loading every column.

    Args:
        chunk_size: Currently unused. Each parquet file is read with a single
            ``pd.read_parquet`` call, not in chunks; the parameter is kept so
            existing callers that pass it keep working.
        data_dir: Directory containing ``train-00000-of-00001.parquet`` and
            ``test-00000-of-00001.parquet``. Defaults to ``'gsm8k'`` relative
            to the current working directory.

    Returns:
        A 4-tuple ``(train_data, test_data, socratic_train, socratic_test)``.
        ``socratic_train`` and ``socratic_test`` are the *same* DataFrame
        objects as ``train_data`` and ``test_data`` (aliases, not copies), so
        in-place changes to one are visible through the other.
    """
    # Local import so this cell does not depend on an extra top-of-file import.
    from pathlib import Path

    columns = ['question', 'answer']
    root = Path(data_dir)

    train_data = pd.read_parquet(root / 'train-00000-of-00001.parquet', columns=columns)
    test_data = pd.read_parquet(root / 'test-00000-of-00001.parquet', columns=columns)

    # Alias rather than copy: avoids holding a second copy of each split in memory.
    socratic_train = train_data
    socratic_test = test_data

    return train_data, test_data, socratic_train, socratic_test

# Load both splits once for the rest of the notebook. socratic_train and
# socratic_test refer to the same DataFrame objects as train_data and
# test_data (load_datasets returns aliases, not copies).
train_data, test_data, socratic_train, socratic_test = load_datasets()

# Report the number of rows in each split.
print(f"Training examples: {len(train_data)}")
print(f"Testing examples: {len(test_data)}")

## Memory-Efficient MathAI Class

Define the main class with optimized memory usage.

In [None]:
class MemoryEfficientMathAI:
    """Prompt-based math problem generator and solver around a language model.

    Every public method builds a text prompt and delegates to
    :meth:`generate_text`, which runs the module-level ``generate`` helper
    under mixed precision with gradient tracking disabled.

    Relies on the module-level ``accelerator`` for device selection and model
    preparation.
    """

    def __init__(self, model, tokenizer):
        """Prepare ``model`` with Accelerate and store the tokenizer.

        Args:
            model: The model to generate with; it is passed through
                ``accelerator.prepare`` before being stored.
            tokenizer: Tokenizer exposing ``encode``, ``decode`` and a
                ``config`` with ``bos_token`` and the pad/eos/bos token ids.
        """
        self.tokenizer = tokenizer
        self.device = accelerator.device
        self.model = accelerator.prepare(model)
        # Not used by any method of this class; kept as a public attribute in
        # case external code reads it.
        self.scaler = amp.GradScaler()

    def generate_text(self, prompt, max_length=200, temperature=0.7):
        """Generate a continuation of ``prompt`` and return it as text.

        Args:
            prompt: Prompt text. The tokenizer's BOS token is prepended when
                the prompt does not already start with it.
            max_length: Forwarded to ``generate`` as ``max_length``.
            temperature: Sampling temperature forwarded to ``generate``.

        Returns:
            The first generated sequence, decoded with special tokens skipped.
        """
        bos_token = self.tokenizer.config.bos_token
        if not prompt.startswith(bos_token):
            prompt = bos_token + prompt

        inputs = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)

        # Inference only: no_grad keeps autograd from recording the forward
        # passes, so activations are not retained for a backward pass that
        # never happens. autocast runs the forward pass in mixed precision.
        with torch.no_grad(), torch.cuda.amp.autocast():
            outputs = generate(
                self.model,
                inputs,
                max_length=max_length,
                temperature=temperature,
                do_sample=True,
                top_p=0.9,
                pad_token_id=self.tokenizer.config.pad_token_id,
                eos_token_id=self.tokenizer.config.eos_token_id,
                bos_token_id=self.tokenizer.config.bos_token_id,
                use_cache=True
            )

        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def generate_problem(self, topic, difficulty):
        """Generate a math problem about ``topic`` at the given ``difficulty``.

        Both arguments are interpolated verbatim into the prompt. Uses the
        default ``max_length`` and ``temperature`` of :meth:`generate_text`.
        """
        prompt = f"Generate a {difficulty} math problem about {topic}:\n"
        torch.cuda.empty_cache()  # Clear cache before generation
        return self.generate_text(prompt)

    def solve_problem(self, problem):
        """Generate a step-by-step solution for ``problem``.

        Uses ``max_length=500`` and ``temperature=0.3``.
        """
        prompt = f"Solve this math problem step by step:\n{problem}\n\nSolution:"
        torch.cuda.empty_cache()  # Clear cache before generation
        return self.generate_text(prompt, max_length=500, temperature=0.3)

    def socratic_solve(self, problem):
        """Generate a Socratic-method breakdown and solution for ``problem``.

        Uses ``max_length=1000`` and ``temperature=0.3``.
        """
        prompt = f"Break down and solve this problem using the Socratic method:\n{problem}"
        torch.cuda.empty_cache()  # Clear cache before generation
        return self.generate_text(prompt, max_length=1000, temperature=0.3)

# Initialize optimized MathAI
# Constructing the wrapper passes `model` through accelerator.prepare (done in
# MemoryEfficientMathAI.__init__); `math_ai` is used by the example cell below.
math_ai = MemoryEfficientMathAI(model, tokenizer)

## Memory-Efficient Training

Define optimized training functions with gradient accumulation and mixed precision.

In [None]:
def train_epoch(model, optimizer, train_dataloader, accumulation_steps=4):
    """Train for one epoch with gradient accumulation and mixed precision.

    Relies on the module-level ``scaler`` (gradient scaler) and ``device``.

    Args:
        model: Model to train; called as ``model(inputs)``.
            NOTE(review): the result is flattened with
            ``view(-1, size(-1))`` and fed to cross-entropy, so it is assumed
            to be a logits tensor whose last dimension is the class/vocab
            dimension — confirm.
        optimizer: Optimizer stepped through ``scaler.step``.
        train_dataloader: Iterable of batches, each a mapping with
            ``'input_ids'`` and ``'labels'`` tensors. It does not need to
            support ``len()``.
        accumulation_steps: Number of batches whose gradients are accumulated
            before each optimizer step.

    Returns:
        The mean per-batch loss (not divided by ``accumulation_steps``), or
        ``0.0`` when the dataloader yields no batches.
    """
    def _optimizer_step():
        # Apply the accumulated gradients, then reset for the next group.
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        torch.cuda.empty_cache()

    model.train()
    total_loss = 0.0
    num_batches = 0
    optimizer.zero_grad()

    for num_batches, batch in enumerate(train_dataloader, start=1):
        with torch.cuda.amp.autocast():
            inputs = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(inputs)
            loss = torch.nn.functional.cross_entropy(
                outputs.view(-1, outputs.size(-1)), labels.view(-1)
            )

        # Track the true per-batch loss; only the value sent to backward() is
        # divided, so the reported mean is not shrunk by accumulation_steps.
        total_loss += loss.item()
        scaler.scale(loss / accumulation_steps).backward()

        if num_batches % accumulation_steps == 0:
            _optimizer_step()

    # Flush a trailing partial group so its gradients are applied instead of
    # being discarded. These gradients are still divided by accumulation_steps,
    # so this last step is proportionally smaller than a full one.
    if num_batches % accumulation_steps != 0:
        _optimizer_step()

    if num_batches == 0:
        return 0.0
    return total_loss / num_batches

def evaluate(model, test_dataloader):
    """Compute the mean cross-entropy loss of ``model`` over a dataloader.

    Puts the model in eval mode and runs with gradient tracking disabled and
    mixed precision enabled. Relies on the module-level ``device``. The model
    is left in eval mode on return.

    Args:
        model: Model to evaluate; called as ``model(inputs)``.
            NOTE(review): the result is assumed to be a logits tensor whose
            last dimension is the class/vocab dimension — confirm.
        test_dataloader: Iterable of batches, each a mapping with
            ``'input_ids'`` and ``'labels'`` tensors. It does not need to
            support ``len()``.

    Returns:
        The mean per-batch loss, or ``0.0`` when the dataloader yields no
        batches.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad(), torch.cuda.amp.autocast():
        # Count batches while iterating instead of calling len(), which also
        # avoids dividing by zero on an empty dataloader.
        for num_batches, batch in enumerate(test_dataloader, start=1):
            inputs = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(inputs)
            loss = torch.nn.functional.cross_entropy(
                outputs.view(-1, outputs.size(-1)), labels.view(-1)
            )
            total_loss += loss.item()
            # Release cached blocks after every batch to keep peak memory low.
            torch.cuda.empty_cache()

    if num_batches == 0:
        return 0.0
    return total_loss / num_batches

## Example Usage with Memory Optimization

In [None]:
# Generate problems with memory management
# Each entry is a (topic, difficulty) pair; both strings are interpolated
# verbatim into the generation prompt by math_ai.generate_problem.
problems = [
    ("algebra", "intermediate"),
    ("geometry", "advanced"),
    ("calculus", "beginner")
]

# For every pair: generate a problem, then solve that same generated text
# twice — once step by step and once with the Socratic-method prompt.
for topic, difficulty in problems:
    print(f"\n{'-'*50}")
    print(f"Generating {difficulty} {topic} problem:")
    
    # Clear cache before each generation
    # (each math_ai method also clears the cache itself before generating)
    torch.cuda.empty_cache()
    
    problem = math_ai.generate_problem(topic, difficulty)
    print("\nProblem:")
    print(problem)
    
    print("\nStandard Solution:")
    solution = math_ai.solve_problem(problem)
    print(solution)
    
    print("\nSocratic Method Solution:")
    socratic = math_ai.socratic_solve(problem)
    print(socratic)