# VishwamAI GPU Analysis

This notebook performs comprehensive GPU analysis for VishwamAI models including:
- Memory utilization
- Processing speed
- Attention mechanism comparison
- Model scaling analysis
- Expert routing efficiency

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import time
from tqdm.notebook import tqdm
import seaborn as sns

from vishwamai.models.gpu.transformer import TransformerComputeLayer, TransformerMemoryLayer, HybridThoughtAwareAttention
from vishwamai.models.tot_model import ToTModel
from vishwamai.models.cot_model import CoTModel
from vishwamai.models.kernel_layers import OptimizedLinear, FusedLayerNorm

import torch.cuda.amp as amp
import torch.cuda.profiler as profiler
import torch.cuda as cuda

# Global plotting defaults for every figure produced later in the notebook.
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

# Reaching this line means every import statement above executed without error.
print("All imports successful (models and GPU-specific corrected)!")

In [None]:
# GPU Setup and Verification
def setup_gpu():
    """Check that CUDA is usable, print a hardware summary, return the device.

    The summary lists the current device name, the CUDA version PyTorch was
    built against, the total memory of device index 0, the number of visible
    devices, and per-device compute capability / SM details.

    Returns:
        torch.device: the ``'cuda'`` device.

    Raises:
        RuntimeError: if ``torch.cuda.is_available()`` is false.
    """
    cuda_ready = torch.cuda.is_available()
    if not cuda_ready:
        raise RuntimeError('GPU is required for this analysis')

    gpu = torch.device('cuda')

    # Headline figures (memory is reported for device index 0 only).
    print('GPU Device:', torch.cuda.get_device_name())
    print(f'CUDA Version: {torch.version.cuda}')
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f'GPU Memory: {total_gb:.2f} GB')
    n_devices = torch.cuda.device_count()
    print(f'GPU Count: {n_devices}')

    # Per-device architecture details.
    print('\nCUDA Architecture:')
    for index in range(n_devices):
        info = torch.cuda.get_device_properties(index)
        print(f'  Device {index}: {info.name}')
        print(f'    Compute Capability: {info.major}.{info.minor}')
        print(f'    SM Count: {info.multi_processor_count}')
        print(f'    Max Threads per SM: {info.max_threads_per_multi_processor}')

    return gpu

# Module-level CUDA device; create_model, analyze_model_performance and the
# training/evaluation helpers below read this global when moving data/models.
device = setup_gpu()

In [None]:
# Enhanced GPU Monitoring
class GPUMonitor:
    """Record time-stamped samples of CUDA memory usage and GPU utilization.

    Every ``sample()`` call appends one entry to four parallel lists:
    wall-clock time, allocated memory (GB), reserved memory (GB) and the
    utilization percentage reported by NVML through ``pynvml`` (0 whenever
    NVML cannot be used).
    """

    # NVML is process-wide state, so it is set up at most once and cached on
    # the class instead of being re-imported / re-initialised on every sample.
    _nvml = None               # the imported pynvml module, once initialised
    _nvml_handle = None        # NVML handle for device index 0
    _nvml_init_failed = False  # True after a failed import/init; never retried

    def __init__(self):
        self.reset()

    def reset(self):
        """Discard all recorded samples and reset CUDA's peak-memory counters."""
        self.timestamps = []
        self.memory_allocated = []
        self.memory_cached = []
        self.utilization = []
        torch.cuda.reset_peak_memory_stats()

    @classmethod
    def _query_utilization(cls):
        """Return NVML's GPU utilization for device 0, or 0 if unavailable."""
        if cls._nvml_handle is None:
            if cls._nvml_init_failed:
                return 0
            try:
                import pynvml
                pynvml.nvmlInit()
                cls._nvml = pynvml
                cls._nvml_handle = pynvml.nvmlDeviceGetHandleByIndex(0)
            except Exception:
                # Best effort: pynvml missing or no usable driver. Unlike a
                # bare ``except``, KeyboardInterrupt/SystemExit still propagate.
                cls._nvml_init_failed = True
                return 0
        try:
            return cls._nvml.nvmlDeviceGetUtilizationRates(cls._nvml_handle).gpu
        except Exception:
            # A failed query is recorded as 0 so the lists stay aligned.
            return 0

    def sample(self):
        """Append one measurement to each of the recorded series."""
        self.timestamps.append(time.time())
        self.memory_allocated.append(torch.cuda.memory_allocated() / 1e9)
        self.memory_cached.append(torch.cuda.memory_reserved() / 1e9)
        self.utilization.append(self._query_utilization())

    def get_stats(self):
        """Return the recorded series as NumPy arrays plus peak memory.

        Returns:
            dict with keys ``'timestamps'``, ``'memory_allocated'``,
            ``'memory_cached'``, ``'utilization'`` (arrays with one element
            per sample) and ``'peak_memory'`` (``max_memory_allocated`` in GB
            at the time of the call).
        """
        return {
            'timestamps': np.array(self.timestamps),
            'memory_allocated': np.array(self.memory_allocated),
            'memory_cached': np.array(self.memory_cached),
            'utilization': np.array(self.utilization),
            'peak_memory': torch.cuda.max_memory_allocated() / 1e9
        }

# Shared monitor instance; analyze_model_performance and
# train_and_analyze_models reset and sample this global.
gpu_monitor = GPUMonitor()

In [None]:
# Model Initialization with Error Handling
def create_model(model_type='transformer', attention_type='flash_mla', **kwargs):
    """Build a model of the requested type and move it to the global ``device``.

    Args:
        model_type: ``'transformer'``, ``'tot'`` or ``'cot'``.
        attention_type: ``'flash_mla'`` selects ``FlashMLAAttention``; any
            other value selects ``OptimizedMoEAttention``. The attention
            settings are only passed on when ``model_type='transformer'``.
        **kwargs: optional overrides for ``vocab_size``, ``embed_dim``,
            ``num_layers``, ``num_heads``, ``ff_dim``, ``max_seq_len`` and
            ``num_experts``; other keys are ignored.

    Returns:
        The constructed model after ``.to(device)``.

    Raises:
        ValueError: for an unknown ``model_type``.
        Exception: any construction error is printed and re-raised unchanged.

    NOTE(review): FlashMLAAttention, OptimizedMoEAttention and
    VishwamAITransformer are not imported by the import cell visible in this
    notebook -- confirm they are defined before this cell runs.
    """
    size_defaults = (
        ('vocab_size', 50000),
        ('embed_dim', 512),
        ('num_layers', 12),
        ('num_heads', 8),
        ('ff_dim', 2048),
        ('max_seq_len', 512),
    )

    try:
        base_config = {key: kwargs.get(key, fallback) for key, fallback in size_defaults}
        expert_count = kwargs.get('num_experts', 4)

        # Attention settings are resolved first, whatever the model type is.
        if attention_type != 'flash_mla':
            attention_class = OptimizedMoEAttention
            attention_kwargs = {'num_experts': expert_count, 'use_amp': True}
        else:
            attention_class = FlashMLAAttention
            attention_kwargs = {'use_amp': True}

        if model_type == 'transformer':
            built = VishwamAITransformer(
                **base_config,
                attention_class=attention_class,
                attention_kwargs=attention_kwargs
            )
        elif model_type in ('tot', 'cot'):
            reasoning_cls = ToTModel if model_type == 'tot' else CoTModel
            built = reasoning_cls(**base_config, num_experts=expert_count)
        else:
            raise ValueError(f'Unknown model type: {model_type}')

        return built.to(device)
    except Exception as err:
        print(f'Error creating model: {str(err)}')
        raise

In [None]:
# Load training and test datasets
def load_datasets():
    """Load the train and test datasets through a fresh ``DatasetLoader``.

    Returns:
        tuple: ``(train_dataset, test_dataset)``; the train split is loaded
        first.

    NOTE(review): DatasetLoader is not imported by the import cell visible in
    this notebook -- confirm where it is defined.
    """
    loader = DatasetLoader()
    return loader.load_train_dataset(), loader.load_test_dataset()

In [None]:
# Comprehensive Performance Analysis
def analyze_model_performance(model, dataset, batch_size=32, num_batches=None):
    """Time inference over ``dataset`` and collect throughput/memory metrics.

    For every batch the model is run once as a warm-up and once under a
    ``time.perf_counter`` timer, both inside ``torch.no_grad()``. The global
    ``gpu_monitor`` is reset at the start and sampled immediately before and
    after each timed run; the samples are taken outside the timed window so
    monitoring overhead does not inflate the batch time.

    Args:
        model: callable module; invoked as ``model(input_ids)``.
        dataset: dataset accepted by ``torch.utils.data.DataLoader``. If a
            batch is a tuple or list, its first element is used as input.
        batch_size: DataLoader batch size.
        num_batches: if truthy, only the first ``num_batches`` batches are
            analyzed; otherwise the whole dataset is.

    Returns:
        dict with ``'avg_time_per_batch'``, ``'std_time_per_batch'``,
        ``'avg_throughput'`` (tokens/sec), ``'peak_memory'`` (GB),
        ``'gpu_stats'`` (from ``gpu_monitor.get_stats()``) and
        ``'raw_metrics'`` (the per-batch lists).

    Raises:
        ValueError: if no batch was produced (empty dataset).
    """
    from itertools import islice  # function-scope import: only needed here

    model.eval()
    gpu_monitor.reset()

    metrics = {
        'batch_times': [],
        'throughput': [],
        'memory_peaks': []
    }

    dataloader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=False
    )

    # Progress-bar total; datasets without a length leave it unknown.
    try:
        total = len(dataloader)
    except TypeError:
        total = None

    batches = dataloader
    if num_batches:
        # Stop after num_batches instead of materialising every batch first.
        batches = islice(dataloader, num_batches)
        total = num_batches if total is None else min(total, num_batches)

    with torch.no_grad():
        for batch in tqdm(batches, total=total, desc='Analyzing'):
            # Reset CUDA cache and peak counters so each batch is measured alone
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()

            # The default collate function returns a *list* for tuple samples
            # (e.g. TensorDataset), so both sequence types must be accepted.
            if isinstance(batch, (tuple, list)):
                input_ids = batch[0].to(device)
            else:
                input_ids = batch.to(device)

            # Warmup
            _ = model(input_ids)
            torch.cuda.synchronize()

            # Sample before starting the clock so NVML/monitoring cost is not
            # counted as model time.
            gpu_monitor.sample()
            start_time = time.perf_counter()

            _ = model(input_ids)
            torch.cuda.synchronize()

            end_time = time.perf_counter()
            gpu_monitor.sample()

            # Use the real number of rows: the final batch can be shorter
            # than batch_size.
            batch_time = end_time - start_time
            tokens_per_sec = (input_ids.size(0) * input_ids.size(1)) / batch_time
            peak_memory = torch.cuda.max_memory_allocated() / 1e9

            metrics['batch_times'].append(batch_time)
            metrics['throughput'].append(tokens_per_sec)
            metrics['memory_peaks'].append(peak_memory)

    if not metrics['batch_times']:
        raise ValueError('No batches were analyzed: the dataset produced no data')

    return {
        'avg_time_per_batch': np.mean(metrics['batch_times']),
        'std_time_per_batch': np.std(metrics['batch_times']),
        'avg_throughput': np.mean(metrics['throughput']),
        'peak_memory': max(metrics['memory_peaks']),
        'gpu_stats': gpu_monitor.get_stats(),
        'raw_metrics': metrics
    }

In [None]:
# Main Analysis Pipeline
def run_analysis(model_configs, dataset_sizes=(1000, 5000, 10000)):
    """Benchmark every model configuration on datasets of several sizes.

    Args:
        model_configs: iterable of dicts forwarded as keyword arguments to
            ``create_model``; each must contain ``'model_type'`` and
            ``'attention_type'`` because both are used in the result key.
        dataset_sizes: iterable of ``max_size`` values passed to the dataset
            loader. The default is an immutable tuple so it cannot be
            modified between calls.

    Returns:
        dict keyed by ``"{model_type}_{attention_type}_{size}"``; each value
        holds the ``analyze_model_performance`` result for the ``'train'``
        and ``'test'`` datasets.

    Raises:
        Exception: any failure is printed and re-raised unchanged.

    NOTE(review): DatasetLoader is not imported by the import cell visible in
    this notebook -- confirm where it is defined.
    """
    results = {}

    try:
        # Load datasets
        print("Loading datasets...")
        dataset_loader = DatasetLoader()

        for size in dataset_sizes:
            print(f"\nAnalyzing with dataset size: {size}")
            train_data = dataset_loader.load_train_dataset(max_size=size)
            test_data = dataset_loader.load_test_dataset(max_size=size)

            for config in model_configs:
                model_name = f"{config['model_type']}_{config['attention_type']}"
                print(f"\nTesting {model_name}...")

                # A fresh model per (config, size) pair
                model = create_model(**config)

                results[f"{model_name}_{size}"] = {
                    'train': analyze_model_performance(model, train_data),
                    'test': analyze_model_performance(model, test_data)
                }

                # Drop the model and release cached GPU memory before the
                # next configuration is built.
                del model
                torch.cuda.empty_cache()

        return results
    except Exception as e:
        print(f"Error during analysis: {str(e)}")
        raise

# Dataset Preparation - TinyShakespeare

First, we'll download and prepare the TinyShakespeare dataset for training our models.

In [None]:
# Download TinyShakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

# Read the dataset
with open('input.txt', 'r') as f:
    text = f.read()

print(f'Length of dataset in characters: {len(text)}')
print('\nFirst 1000 characters:\n')
print(text[:1000])

In [None]:
# Create character-level tokenizer
class CharacterTokenizer:
    """Character-level tokenizer built from the distinct characters of a text.

    Ids are assigned in sorted character order, so the same corpus always
    produces the same mapping.
    """

    def __init__(self, text):
        """Build both lookup tables from ``text`` and print the vocabulary size."""
        alphabet = sorted(set(text))
        self.vocab_size = len(alphabet)
        self.char_to_idx = dict(zip(alphabet, range(self.vocab_size)))
        self.idx_to_char = dict(enumerate(alphabet))
        print(f'Vocabulary size: {self.vocab_size}')

    def encode(self, text, return_tensors='pt'):
        """Map each character of ``text`` to its id.

        Returns a ``torch.long`` tensor with a leading dimension of size 1
        when ``return_tensors == 'pt'``, otherwise a plain list of ints.
        A character outside the vocabulary raises ``KeyError``.
        """
        lookup = self.char_to_idx
        ids = [lookup[symbol] for symbol in text]
        if return_tensors != 'pt':
            return ids
        return torch.tensor(ids, dtype=torch.long).unsqueeze(0)

    def decode(self, indices, skip_special_tokens=False):
        """Join the characters for ``indices`` into a string.

        Tensors are moved to the CPU and converted to NumPy before lookup.
        ``skip_special_tokens`` is accepted but not used.
        """
        if isinstance(indices, torch.Tensor):
            indices = indices.cpu().numpy()
        table = self.idx_to_char
        return ''.join(table[token] for token in indices)

# Build the tokenizer from the full corpus. create_training_data, train_epoch,
# evaluate and train_and_analyze_models read this `tokenizer` global.
tokenizer = CharacterTokenizer(text)

In [None]:
# Prepare training data
def create_training_data(text, seq_length=256, batch_size=32):
    """Turn ``text`` into next-character prediction DataLoaders.

    The text is encoded with the global ``tokenizer`` and cut into
    non-overlapping windows of ``seq_length`` ids; each target window is its
    input window shifted one position to the right. The first 90% of the
    windows form the training set, the rest the test set.

    Returns:
        tuple: ``(train_loader, test_loader)``; only the training loader
        shuffles.
    """
    token_ids = torch.tensor(tokenizer.encode(text, return_tensors=None), dtype=torch.long)

    # Window start offsets, shared by inputs and targets.
    usable = len(token_ids) - seq_length
    starts = range(0, usable - 1, seq_length)
    inputs = torch.stack([token_ids[s:s + seq_length] for s in starts])
    targets = torch.stack([token_ids[s + 1:s + seq_length + 1] for s in starts])

    # 90/10 split in corpus order.
    cut = int(0.9 * len(inputs))
    train_set = torch.utils.data.TensorDataset(inputs[:cut], targets[:cut])
    test_set = torch.utils.data.TensorDataset(inputs[cut:], targets[cut:])

    return (
        torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True),
        torch.utils.data.DataLoader(test_set, batch_size=batch_size),
    )

# Create training and test dataloaders with the default seq_length=256 and
# batch_size=32; train_and_analyze_models reads these two globals.
train_loader, test_loader = create_training_data(text)

In [None]:
# Model configurations for the Shakespeare dataset. Each dict is expanded into
# create_model(**config): vocab_size comes from the character tokenizer and
# max_seq_len matches the default seq_length (256) of create_training_data.
shakespeare_model_configs = [
    {
        # Transformer with the 'flash_mla' attention branch of create_model
        'model_type': 'transformer',
        'attention_type': 'flash_mla',
        'vocab_size': tokenizer.vocab_size,
        'max_seq_len': 256
    },
    {
        # Transformer with the non-'flash_mla' (MoE) attention branch
        'model_type': 'transformer',
        'attention_type': 'moe',
        'num_experts': 4,
        'vocab_size': tokenizer.vocab_size,
        'max_seq_len': 256
    }
]

In [None]:
# Training function with GPU monitoring
def train_epoch(model, train_loader, optimizer, gpu_monitor):
    """Run one optimisation pass over ``train_loader``.

    ``gpu_monitor.sample()`` is called three times per batch: before the
    forward pass, after the loss is computed, and after the optimizer step.
    The loss is cross-entropy over logits flattened to
    ``(-1, tokenizer.vocab_size)``; every 100th batch prints its loss.

    Returns:
        float: sum of the batch losses divided by ``len(train_loader)``.
    """
    model.train()
    running_loss = 0

    progress = tqdm(train_loader, desc='Training')
    for step, (inputs, labels) in enumerate(progress):
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Sample 1: before the forward pass
        gpu_monitor.sample()

        optimizer.zero_grad()
        logits = model(inputs)
        flat_logits = logits.view(-1, tokenizer.vocab_size)
        loss = F.cross_entropy(flat_logits, labels.view(-1))

        # Sample 2: after the forward pass
        gpu_monitor.sample()

        loss.backward()
        optimizer.step()

        # Sample 3: after the backward pass and parameter update
        gpu_monitor.sample()

        batch_loss = loss.item()
        running_loss += batch_loss

        if step % 100 == 0:
            print(f'Batch {step}, Loss: {batch_loss:.4f}')

    return running_loss / len(train_loader)

# Evaluation function
def evaluate(model, test_loader):
    """Return the mean per-batch cross-entropy of ``model`` on ``test_loader``.

    The model is put in eval mode and gradients are disabled for the loop.
    """
    model.eval()
    loss_sum = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            logits = model(inputs)
            batch_loss = F.cross_entropy(
                logits.view(-1, tokenizer.vocab_size),
                labels.view(-1),
            )
            loss_sum += batch_loss.item()

    return loss_sum / len(test_loader)

In [None]:
# Train and analyze models on Shakespeare dataset
def train_and_analyze_models(model_configs, num_epochs=3):
    """Train each configured model on the Shakespeare loaders and record stats.

    Uses the module-level ``train_loader``, ``test_loader``, ``gpu_monitor``,
    ``tokenizer`` and ``device``. For every configuration a model is created,
    trained with Adam (lr=0.001) for ``num_epochs`` epochs, evaluated after
    each epoch, and finally asked to generate a text sample from the prompt
    ``"ROMEO: "``.

    Args:
        model_configs: iterable of dicts forwarded to ``create_model``; each
            must contain ``'model_type'`` and ``'attention_type'``.
        num_epochs: number of training epochs per model.

    Returns:
        dict keyed by ``"{model_type}_{attention_type}"`` with
        ``'training_stats'`` (one dict per epoch), ``'final_train_loss'`` and
        ``'final_test_loss'``. Both final losses are ``None`` when
        ``num_epochs`` is 0.
    """
    results = {}

    for config in model_configs:
        model_name = f"{config['model_type']}_{config['attention_type']}"
        print(f"\nTraining {model_name}...")

        # Create model
        model = create_model(**config)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

        # The monitor is reset once per model, so the 'gpu_stats' stored for
        # an epoch contains every sample taken since this model started.
        gpu_monitor.reset()
        training_stats = []

        # Defined up front so num_epochs=0 does not hit unbound names below.
        train_loss = test_loss = None

        for epoch in range(num_epochs):
            print(f"\nEpoch {epoch+1}/{num_epochs}")

            # Train
            train_loss = train_epoch(model, train_loader, optimizer, gpu_monitor)

            # Evaluate
            test_loss = evaluate(model, test_loader)

            print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')

            # Save stats
            training_stats.append({
                'epoch': epoch + 1,
                'train_loss': train_loss,
                'test_loss': test_loss,
                'gpu_stats': gpu_monitor.get_stats()
            })

        results[model_name] = {
            'training_stats': training_stats,
            'final_train_loss': train_loss,
            'final_test_loss': test_loss
        }

        # Generate sample text
        # NOTE(review): assumes the model exposes generate(input_ids,
        # max_length, temperature, do_sample) -- confirm against the model class.
        print("\nGenerating sample text...")
        model.eval()
        with torch.no_grad():
            context = "ROMEO: "
            input_ids = tokenizer.encode(context, return_tensors='pt').to(device)
            generated = model.generate(
                input_ids,
                max_length=200,
                temperature=0.7,
                do_sample=True
            )
            generated_text = tokenizer.decode(generated[0])
            print(f"Generated text:\n{generated_text}")

        # Clean up before the next configuration is built
        del model
        torch.cuda.empty_cache()

    return results

# Run training and analysis for both Shakespeare configurations
# (default num_epochs=3).
shakespeare_results = train_and_analyze_models(shakespeare_model_configs)

# Plot training results: one figure with two side-by-side panels.
plt.figure(figsize=(15, 5))

# Left panel: per-epoch train (solid) and test (dashed) loss for each model.
plt.subplot(1, 2, 1)
for model_name, stats in shakespeare_results.items():
    train_losses = [stat['train_loss'] for stat in stats['training_stats']]
    test_losses = [stat['test_loss'] for stat in stats['training_stats']]
    epochs = range(1, len(train_losses) + 1)
    plt.plot(epochs, train_losses, '-o', label=f'{model_name}_train')
    plt.plot(epochs, test_losses, '--o', label=f'{model_name}_test')
plt.title('Training and Test Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

# Right panel: allocated GPU memory against time since the first sample.
# NOTE(review): index [0] selects the stats snapshot stored after the first
# epoch, so later epochs are not shown -- confirm this is intended.
plt.subplot(1, 2, 2)
for model_name, stats in shakespeare_results.items():
    gpu_stats = stats['training_stats'][0]['gpu_stats']
    plt.plot(
        gpu_stats['timestamps'] - gpu_stats['timestamps'][0],
        gpu_stats['memory_allocated'],
        label=model_name
    )
plt.title('GPU Memory Usage During Training')
plt.xlabel('Time (seconds)')
plt.ylabel('Memory (GB)')
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# Visualization Functions
def plot_performance_comparison(results):
    """Draw a 2x2 figure comparing the runs stored in ``results``.

    Panels: (1) average throughput and (2) peak memory as bar charts with one
    bar per run and split (train/test); (3) GPU utilization and (4) allocated
    memory over time, both taken from each run's ``'train'`` GPU stats.

    Args:
        results: mapping of run name to a dict with ``'train'`` and ``'test'``
            entries shaped like the return value of
            ``analyze_model_performance``.
    """
    def per_split(metric):
        # Flatten to [run1_train, run1_test, run2_train, ...] to match the labels.
        values = []
        for run in results:
            values.extend([
                results[run]['train'][metric],
                results[run]['test'][metric]
            ])
        return values

    def draw_train_series(field):
        # One line per run; x is seconds since that run's first sample.
        for run in results:
            trace = results[run]['train']['gpu_stats']
            plt.plot(
                trace['timestamps'] - trace['timestamps'][0],
                trace[field],
                label=run
            )

    plt.figure(figsize=(15, 10))
    bar_labels = [f"{run}_{split}" for run in results for split in ('train', 'test')]

    # Throughput comparison
    plt.subplot(2, 2, 1)
    sns.barplot(x=bar_labels, y=per_split('avg_throughput'))
    plt.title('Model Throughput (tokens/sec)')
    plt.xticks(rotation=45)

    # Memory usage
    plt.subplot(2, 2, 2)
    sns.barplot(x=bar_labels, y=per_split('peak_memory'))
    plt.title('Peak GPU Memory Usage (GB)')
    plt.xticks(rotation=45)

    # GPU utilization over time
    plt.subplot(2, 2, 3)
    draw_train_series('utilization')
    plt.title('GPU Utilization Over Time')
    plt.xlabel('Time (seconds)')
    plt.ylabel('GPU Utilization %')
    plt.legend()

    # Memory allocation over time
    plt.subplot(2, 2, 4)
    draw_train_series('memory_allocated')
    plt.title('GPU Memory Allocation Over Time')
    plt.xlabel('Time (seconds)')
    plt.ylabel('Memory (GB)')
    plt.legend()

    plt.tight_layout()
    plt.show()

In [None]:
# Run the Analysis
# Four configurations, each expanded into create_model(**config); keys that
# are omitted (vocab_size, embed_dim, ...) fall back to create_model's defaults.
model_configs = [
    {
        'model_type': 'transformer',
        'attention_type': 'flash_mla'
    },
    {
        'model_type': 'transformer',
        'attention_type': 'moe',
        'num_experts': 4
    },
    {
        # For 'tot' and 'cot', create_model uses attention_type only in the
        # result name; it is not passed to the model constructor.
        'model_type': 'tot',
        'attention_type': 'flash_mla'
    },
    {
        'model_type': 'cot',
        'attention_type': 'moe',
        'num_experts': 4
    }
]

# Benchmark every configuration at the default dataset sizes, then plot.
results = run_analysis(model_configs)
plot_performance_comparison(results)