# Mamba Installation and Functionality Test

**Purpose**: Verify that Mamba-SSM can be installed and works correctly on Windows with CUDA.

**Expected Outcome**: 
- Mamba packages installed successfully
- Basic forward pass works on CUDA
- No errors or compatibility issues

---

## Step 1: Install Mamba Packages

This will install:
- `mamba-ssm`: The core Mamba state-space model library
- `causal-conv1d`: Required dependency for efficient causal convolutions

In [None]:
# Install mamba-ssm and dependencies
!pip install mamba-ssm causal-conv1d

## Step 2: Verify Installation

Check that packages can be imported successfully.

In [None]:
# Import check for the two packages this notebook installs.
# Each import is attempted independently and a failure is reported instead of
# raised, so both packages are always checked.
try:
    import mamba_ssm
except ImportError as err:
    print(f"✗ Failed to import mamba_ssm: {err}")
else:
    print("✓ mamba_ssm imported successfully")
    # Fall back to 'Unknown' when the package does not expose __version__.
    print(f"  Version: {getattr(mamba_ssm, '__version__', 'Unknown')}")

try:
    import causal_conv1d
except ImportError as err:
    print(f"✗ Failed to import causal_conv1d: {err}")
else:
    print("✓ causal_conv1d imported successfully")

## Step 3: Check PyTorch and CUDA Availability

Verify that PyTorch is installed and CUDA is available.

In [None]:
import torch

# Report the installed PyTorch build and whether it can see a CUDA device.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if not torch.cuda.is_available():
    # No usable GPU: warn, but let the notebook continue.
    print("⚠️  CUDA not available - Mamba may not work optimally")
    print("   Consider using a machine with CUDA support for best performance")
else:
    # CUDA version reported by PyTorch, plus the visible devices.
    # The device name is queried for device index 0.
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Current device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name(0)}")
    print(f"Device count: {torch.cuda.device_count()}")

## Step 4: Test Basic Mamba Forward Pass

Create a simple Mamba layer and test a forward pass with random data.

**Test Configuration**:
- Input: (batch=2, sequence_length=64, dimension=128)
- Model: Mamba with d_model=128, d_state=16, d_conv=4, expand=2

In [None]:
from mamba_ssm import Mamba

# Test configuration: a small batch of short sequences keeps this smoke test fast.
batch_size = 2
sequence_length = 64
dim = 128

# Prefer the GPU when one is visible, otherwise fall back to CPU.
# NOTE(review): mamba-ssm's fused kernels are presumably CUDA-only, so the CPU
# fallback may fail at the forward pass - confirm before relying on it.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Create the random input tensor directly on the target device instead of
# allocating it on the CPU and copying it over.
x = torch.randn(batch_size, sequence_length, dim, device=device)

print(f"Input shape: {x.shape}")
print(f"Device: {device}")
# CUDA memory statistics are only meaningful on a GPU; skip the line entirely
# on CPU rather than printing an empty one.
if device == 'cuda':
    print(f"Memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")

In [None]:
# Build a single Mamba layer with the test configuration, then move it to the
# device chosen for the input tensor.
model = Mamba(
    d_model=dim,  # Model dimension
    d_state=16,   # SSM state dimension
    d_conv=4,     # Local convolution width
    expand=2,     # Expansion factor
)
model = model.to(device)

print("✓ Mamba model created successfully")

# Total number of scalar weights across all parameter tensors.
param_count = sum(p.numel() for p in model.parameters())
print(f"  Model parameters: {param_count:,}")

# The device of the first parameter is reported as the model's device.
first_param = next(model.parameters())
print(f"  Model device: {first_param.device}")

In [None]:
# Run one inference-mode forward pass through `model` and report on the result.
# Any failure is caught and printed with its traceback so the notebook can
# continue to the remaining steps.
try:
    # Inference only: gradient tracking is disabled for this pass.
    with torch.no_grad():
        y = model(x)

    print("✓ Forward pass successful!")
    print(f"  Input shape:  {x.shape}")
    print(f"  Output shape: {y.shape}")
    print(f"  Output dtype: {y.dtype}")
    print(f"  Output device: {y.device}")

    # Numerical sanity check. When the output is not entirely finite, NaN is
    # reported in preference to Inf.
    if torch.isfinite(y).all():
        print("✓ Output is numerically stable (no NaN/Inf)")
    elif torch.isnan(y).any():
        print("⚠️  Warning: Output contains NaN values")
    else:
        print("⚠️  Warning: Output contains Inf values")

    # Summary statistics over every element of the output.
    print(f"\nOutput statistics:")
    print(f"  Mean: {y.mean().item():.4f}")
    print(f"  Std:  {y.std().item():.4f}")
    print(f"  Min:  {y.min().item():.4f}")
    print(f"  Max:  {y.max().item():.4f}")

except Exception as err:
    print(f"✗ Forward pass failed: {err}")
    import traceback
    traceback.print_exc()

## Step 5: Test Gradient Flow (Training Readiness)

Verify that gradients can be computed (important for training).

In [None]:
# Test gradient computation: run a forward pass with autograd enabled,
# backpropagate a dummy scalar loss, and confirm that gradients reach both the
# input tensor and the model parameters. Any failure is caught and printed with
# its traceback so the notebook can continue.
try:
    # Start from clean parameter gradients so that re-running this cell does
    # not accumulate into (or count) gradients left over from an earlier run.
    model.zero_grad()

    # Create the input directly on the target device. Building it on the CPU
    # with requires_grad=True and then calling .to(device) returns a non-leaf
    # copy when the device is CUDA; autograd does not populate .grad on
    # non-leaf tensors, so x_grad.grad would be None below.
    x_grad = torch.randn(
        batch_size, sequence_length, dim, device=device, requires_grad=True
    )
    y_grad = model(x_grad)

    # Compute a dummy loss and backpropagate
    loss = y_grad.mean()
    loss.backward()

    print("✓ Gradient computation successful!")
    print(f"  Loss value: {loss.item():.4f}")
    print(f"  Input gradient shape: {x_grad.grad.shape}")
    print(f"  Input gradient mean: {x_grad.grad.mean().item():.6f}")

    # Check model parameter gradients (materialise the parameter list once).
    all_params = list(model.parameters())
    grad_params = [p for p in all_params if p.grad is not None]
    print(f"  Parameters with gradients: {len(grad_params)}/{len(all_params)}")

    if grad_params:
        # Mean of the per-parameter mean absolute gradients.
        avg_grad = torch.stack([p.grad.abs().mean() for p in grad_params]).mean()
        print(f"  Average gradient magnitude: {avg_grad.item():.6f}")

    print("✓ Model is ready for training!")

except Exception as e:
    print(f"✗ Gradient computation failed: {e}")
    import traceback
    traceback.print_exc()

## Step 6: Memory Usage Test

Check GPU memory usage for different sequence lengths (important for understanding limitations).

In [None]:
# Measure CUDA memory around an inference-mode forward pass of `model` for a
# range of sequence lengths (batch size 1). Skipped entirely when CUDA is not
# available. The loop stops at the first sequence length that runs out of
# memory; any other RuntimeError is re-raised.
if torch.cuda.is_available():
    print("Testing memory usage with different sequence lengths...\n")

    sequence_lengths = [64, 128, 256, 512, 1024]

    for seq_len in sequence_lengths:
        # Drop the references to the previous iteration's tensors *before*
        # clearing the cache and resetting the peak counter; while they are
        # still referenced their memory stays allocated and would be counted
        # in this iteration's readings.
        x_test = y_test = None
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

        try:
            # Create the input directly on the GPU (no CPU staging copy).
            x_test = torch.randn(1, seq_len, dim, device=device)

            # Forward pass
            with torch.no_grad():
                y_test = model(x_test)

            # Memory stats in MB. These are totals for the current device, so
            # they include the model weights and any other live tensors, not
            # just this iteration's input and output.
            mem_allocated = torch.cuda.memory_allocated() / 1024**2
            mem_reserved = torch.cuda.memory_reserved() / 1024**2
            mem_peak = torch.cuda.max_memory_allocated() / 1024**2

            print(f"Sequence length {seq_len:4d}: "
                  f"Allocated: {mem_allocated:6.2f} MB, "
                  f"Reserved: {mem_reserved:6.2f} MB, "
                  f"Peak: {mem_peak:6.2f} MB")

        except RuntimeError as e:
            # Out-of-memory is detected by its message text; it ends the sweep
            # because longer sequences are skipped after the first failure.
            if "out of memory" in str(e):
                print(f"Sequence length {seq_len:4d}: ✗ Out of memory")
                break
            else:
                raise

    # Final cleanup
    torch.cuda.empty_cache()
else:
    print("Skipping memory test (CUDA not available)")