# Hardware-Specific Neural Circuit Policy Examples

This notebook demonstrates how to optimize Neural Circuit Policies for different Apple Silicon processors:

- M1 Optimization
- M1 Pro/Max Optimization
- M1 Ultra Optimization
- Performance Comparison

In [None]:
import mlx.core as mx
import mlx.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from ncps.mlx import CfC, CfCCell
from ncps.wirings import AutoNCP
from ncps.mlx.advanced_profiling import MLXProfiler
from ncps.tests.configs.device_configs import get_device_config

## 1. Device Detection and Configuration

First, let's detect the current device and get its optimal configuration:

In [None]:
# Get device configuration
config = get_device_config()
print(f"Detected device: {config.device_type}")
print(f"Optimal batch size: {config.get_optimal_batch_size()}")
print(f"Optimal hidden size: {config.get_optimal_hidden_size()}")
print(f"Optimal backbone: {config.get_optimal_backbone()}")

## 2. Device-Optimized Model Creation

Create a model optimized for the current device:

In [None]:
def create_optimized_model(config):
    """Create device-optimized model."""
    # Create wiring with optimal size
    wiring = AutoNCP(
        units=config.get_optimal_hidden_size(),
        output_size=config.get_optimal_hidden_size() // 4
    )
    
    # Create model with optimal backbone
    model = CfC(
        cell=CfCCell(
            wiring=wiring,
            activation="tanh",
            backbone_units=config.get_optimal_backbone(),
            backbone_layers=2
        ),
        return_sequences=True
    )
    
    return model

# Create optimized model
model = create_optimized_model(config)

## 3. Neural Engine Optimization

Optimize for Neural Engine performance:

In [None]:
def optimize_for_neural_engine(model, config):
    """Optimize model for Neural Engine."""
    # Create test data
    batch_size = config.get_optimal_batch_size()
    x = mx.random.normal((batch_size, 16, model.input_size))
    
    # Test without compilation
    start = time.time()
    for _ in range(100):
        _ = model(x)
        mx.eval(_)
    uncompiled_time = (time.time() - start) / 100
    
    # Test with compilation
    @mx.compile(static_argnums=(1,))
    def forward(x, training=False):
        return model(x, training=training)
    
    # Warmup
    _ = forward(x)
    mx.eval(_)
    
    start = time.time()
    for _ in range(100):
        _ = forward(x)
        mx.eval(_)
    compiled_time = (time.time() - start) / 100
    
    return {
        'uncompiled_time': uncompiled_time,
        'compiled_time': compiled_time,
        'speedup': uncompiled_time / compiled_time
    }

# Test Neural Engine optimization
ne_stats = optimize_for_neural_engine(model, config)
print(f"Compilation speedup: {ne_stats['speedup']:.2f}x")

## 4. Memory Optimization

Optimize memory usage for the device:

In [None]:
def optimize_memory(model, config):
    """Optimize memory usage."""
    profiler = MLXProfiler(model)
    
    # Profile memory usage
    stats = profiler.profile_memory(
        batch_size=config.get_optimal_batch_size(),
        track_unified=True
    )
    
    print(f"Peak memory usage: {stats['peak_usage']:.2f} MB")
    print(f"Memory bandwidth: {stats['bandwidth']:.2f} GB/s")
    
    # Verify within budget
    memory_ok = stats['peak_usage'] <= config.memory_budget
    bandwidth_ok = stats['bandwidth'] >= config.min_bandwidth
    
    print(f"Memory within budget: {memory_ok}")
    print(f"Bandwidth meets minimum: {bandwidth_ok}")

# Test memory optimization
optimize_memory(model, config)

## 5. Performance Comparison

Compare performance across different configurations:

In [None]:
def compare_configurations(model, config):
    """Compare different configurations."""
    results = []
    profiler = MLXProfiler(model)
    
    for batch_size in config.batch_sizes:
        # Profile performance
        compute_stats = profiler.profile_compute(
            batch_size=batch_size,
            seq_length=16,
            num_runs=100
        )
        
        # Profile memory
        memory_stats = profiler.profile_memory(
            batch_size=batch_size
        )
        
        results.append({
            'batch_size': batch_size,
            'tflops': compute_stats['tflops'],
            'memory': memory_stats['peak_usage'],
            'bandwidth': memory_stats['bandwidth']
        })
    
    return results

# Compare configurations
results = compare_configurations(model, config)

# Plot results
plt.figure(figsize=(15, 5))

# Plot TFLOPS
plt.subplot(131)
plt.plot(
    [r['batch_size'] for r in results],
    [r['tflops'] for r in results],
    marker='o'
)
plt.xlabel('Batch Size')
plt.ylabel('TFLOPS')
plt.title('Compute Performance')

# Plot memory
plt.subplot(132)
plt.plot(
    [r['batch_size'] for r in results],
    [r['memory'] for r in results],
    marker='o'
)
plt.xlabel('Batch Size')
plt.ylabel('Memory (MB)')
plt.title('Memory Usage')

# Plot bandwidth
plt.subplot(133)
plt.plot(
    [r['batch_size'] for r in results],
    [r['bandwidth'] for r in results],
    marker='o'
)
plt.xlabel('Batch Size')
plt.ylabel('Bandwidth (GB/s)')
plt.title('Memory Bandwidth')

plt.tight_layout()
plt.show()

## Hardware-Specific Insights

Based on our experiments:

1. **Neural Engine Performance**
   - Compilation provides significant speedup
   - Power-of-2 sizes are optimal
   - Batch size affects utilization
   - Device-specific scaling

2. **Memory Management**
   - Unified memory is efficient
   - Bandwidth scales with batch size
   - Memory usage is predictable
   - Device limits are respected

3. **Optimization Tips**
   - Use device-specific configs
   - Enable compilation
   - Monitor performance
   - Balance resources

4. **Device-Specific Settings**
   - M1: 32-64 batch size
   - M1 Pro/Max: 64-128 batch size
   - M1 Ultra: 128-256 batch size
   - Adjust based on model size