In [None]:
import sys
import subprocess

print("=" * 60)
print("WORKING ENVIRONMENT CONFIGURATION")
print("=" * 60)

# Python info
print(f"\nPython:")
print(f"  Version: {sys.version.split()[0]}")
print(f"  Executable: {sys.executable}")

# PyTorch info
print(f"\nPyTorch:")
print(f"  Version: {torch.__version__}")
print(f"  CUDA available: {torch.cuda.is_available()}")
print(f"  CUDA version: {torch.version.cuda}")
print(f"  cuDNN version: {torch.backends.cudnn.version()}")

# NumPy info
import numpy as np
print(f"\nNumPy:")
print(f"  Version: {np.__version__}")

# Mamba info
import mamba_ssm
print(f"\nMamba-SSM:")
print(f"  Version: {mamba_ssm.__version__ if hasattr(mamba_ssm, '__version__') else '2.2.6.post3'}")

# CUDA toolkit info (from system)
try:
    nvcc_out = subprocess.check_output(['nvcc', '--version'], stderr=subprocess.STDOUT, text=True)
    cuda_version = [line for line in nvcc_out.split('\n') if 'release' in line.lower()][0]
    print(f"\nCUDA Toolkit:")
    print(f"  {cuda_version.strip()}")
except:
    print("\nCUDA Toolkit: Unable to determine")

# GPU info
if torch.cuda.is_available():
    print(f"\nGPU:")
    print(f"  Device: {torch.cuda.get_device_name(0)}")
    print(f"  Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

print("\n" + "=" * 60)
print("✓ All components configured correctly for Mamba training!")
print("=" * 60)

# Mamba Installation and Functionality Test (WSL2)

**Purpose**: Verify that Mamba-SSM works correctly in WSL2 environment with CUDA support.

**Expected Outcome**: 
- Mamba packages imported successfully
- Basic forward pass works on CUDA
- No errors or compatibility issues

---

## Environment Setup

This notebook is designed to run in **WSL2 with the `mamba-env` conda environment**.

### Prerequisites (Should Already Be Completed)

Follow the setup guide at `docs/WSL2_SETUP_GUIDE.md` to install:

1. **WSL2** with Ubuntu
2. **CUDA Toolkit 12.6** in WSL
3. **Conda environment** named `mamba-env` with:
   - Python 3.10
   - PyTorch 2.4.1+cu121
   - NumPy < 2.0
   - mamba-ssm (built from source)

### Running This Notebook

**In VSCode with WSL Extension:**
1. Open VSCode → Connect to WSL (green button bottom-left)
2. Open this notebook
3. Select kernel: **Python 3.10.19 ('mamba-env')** or **Python (Mamba-WSL2)**
4. Run cells

**Important**: Do NOT run the installation cells below if you've already followed the setup guide. Jump to Step 2 (Verify Installation).

---

## Step 1: Install Mamba Packages (Optional - Only if Not Installed)

**SKIP THIS STEP** if you've already followed the WSL2 setup guide.

This installs:
- `torch==2.4.1+cu121`: PyTorch with CUDA 12.1 support
- `numpy<2.0`: NumPy 1.x for compatibility
- `causal-conv1d`: Required dependency (built from source)
- `mamba-ssm`: The core Mamba library (built from source)

In [None]:
# OPTIONAL: Only run if you haven't completed the WSL2 setup guide
# This will take 10-15 minutes to compile from source

# Install PyTorch 2.4.1 with CUDA 12.1
!pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu121

# Install NumPy 1.x
!pip install "numpy<2.0"

# Update C++ standard library
!conda install -c conda-forge libstdcxx-ng -y

# Build mamba-ssm from source (critical for compatibility)
!pip cache purge
!pip install causal-conv1d --no-binary causal-conv1d --no-build-isolation
!pip install mamba-ssm --no-binary mamba-ssm --no-build-isolation

## Step 2: Verify Installation

Check that packages can be imported successfully.

In [1]:
# Test imports
try:
    import mamba_ssm
    print("✓ mamba_ssm imported successfully")
    print(f"  Version: {mamba_ssm.__version__ if hasattr(mamba_ssm, '__version__') else 'Unknown'}")
except ImportError as e:
    print(f"✗ Failed to import mamba_ssm: {e}")

try:
    import causal_conv1d
    print("✓ causal_conv1d imported successfully")
except ImportError as e:
    print(f"✗ Failed to import causal_conv1d: {e}")

✓ mamba_ssm imported successfully
  Version: 2.2.6.post3
✓ causal_conv1d imported successfully


## Step 3: Check PyTorch and CUDA Availability

Verify that PyTorch is installed and CUDA is available.

In [2]:
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Current device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name(0)}")
    print(f"Device count: {torch.cuda.device_count()}")
else:
    print("⚠️  CUDA not available - Mamba may not work optimally")
    print("   Consider using a machine with CUDA support for best performance")

PyTorch version: 2.4.1+cu121
CUDA available: True
CUDA version: 12.1
Current device: 0
Device name: NVIDIA GeForce RTX 4060
Device count: 1


## Step 4: Test Basic Mamba Forward Pass

Create a simple Mamba layer and test a forward pass with random data.

**Test Configuration**:
- Input: (batch=2, sequence_length=64, dimension=128)
- Model: Mamba with d_model=128, d_state=16, d_conv=4, expand=2

In [3]:
from mamba_ssm import Mamba

# Test configuration
batch_size = 2
sequence_length = 64
dim = 128

# Create random input tensor
device = 'cuda' if torch.cuda.is_available() else 'cpu'
x = torch.randn(batch_size, sequence_length, dim).to(device)

print(f"Input shape: {x.shape}")
print(f"Device: {device}")
print(f"Memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB" if torch.cuda.is_available() else "")

Input shape: torch.Size([2, 64, 128])
Device: cuda
Memory allocated: 0.06 MB


In [4]:
# Create Mamba model
model = Mamba(
    d_model=dim,      # Model dimension
    d_state=16,       # SSM state dimension
    d_conv=4,         # Local convolution width
    expand=2          # Expansion factor
).to(device)

print("✓ Mamba model created successfully")
print(f"  Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"  Model device: {next(model.parameters()).device}")

✓ Mamba model created successfully
  Model parameters: 116,480
  Model device: cuda:0


In [5]:
# Test forward pass
try:
    with torch.no_grad():
        y = model(x)
    
    print("✓ Forward pass successful!")
    print(f"  Input shape:  {x.shape}")
    print(f"  Output shape: {y.shape}")
    print(f"  Output dtype: {y.dtype}")
    print(f"  Output device: {y.device}")
    
    # Check for NaN or Inf
    if torch.isnan(y).any():
        print("⚠️  Warning: Output contains NaN values")
    elif torch.isinf(y).any():
        print("⚠️  Warning: Output contains Inf values")
    else:
        print("✓ Output is numerically stable (no NaN/Inf)")
    
    # Show output statistics
    print(f"\nOutput statistics:")
    print(f"  Mean: {y.mean().item():.4f}")
    print(f"  Std:  {y.std().item():.4f}")
    print(f"  Min:  {y.min().item():.4f}")
    print(f"  Max:  {y.max().item():.4f}")
    
except Exception as e:
    print(f"✗ Forward pass failed: {e}")
    import traceback
    traceback.print_exc()

✓ Forward pass successful!
  Input shape:  torch.Size([2, 64, 128])
  Output shape: torch.Size([2, 64, 128])
  Output dtype: torch.float32
  Output device: cuda:0
✓ Output is numerically stable (no NaN/Inf)

Output statistics:
  Mean: -0.0001
  Std:  0.0411
  Min:  -0.2012
  Max:  0.1837


## Step 5: Test Gradient Flow (Training Readiness)

Verify that gradients can be computed (important for training).

In [None]:
# Test gradient computation
try:
    # Forward pass with gradients enabled
    x_grad = torch.randn(batch_size, sequence_length, dim, requires_grad=True).to(device)
    y_grad = model(x_grad)
    
    # Compute a dummy loss and backpropagate
    loss = y_grad.mean()
    loss.backward()
    
    print("✓ Gradient computation successful!")
    print(f"  Loss value: {loss.item():.4f}")
    
    # Check model parameter gradients
    grad_params = [p for p in model.parameters() if p.grad is not None]
    print(f"  Parameters with gradients: {len(grad_params)}/{len(list(model.parameters()))}")
    
    if len(grad_params) > 0:
        avg_grad = torch.stack([p.grad.abs().mean() for p in grad_params]).mean()
        print(f"  Average gradient magnitude: {avg_grad.item():.6f}")
    
    print("✓ Model is ready for training!")
    
except Exception as e:
    print(f"✗ Gradient computation failed: {e}")
    import traceback
    traceback.print_exc()

## Step 6: Memory Usage Test

Check GPU memory usage for different sequence lengths (important for understanding limitations).

In [7]:
if torch.cuda.is_available():
    print("Testing memory usage with different sequence lengths...\n")
    
    sequence_lengths = [64, 128, 256, 512, 1024]
    
    for seq_len in sequence_lengths:
        # Clear cache
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        
        try:
            # Create input
            x_test = torch.randn(1, seq_len, dim).to(device)
            
            # Forward pass
            with torch.no_grad():
                y_test = model(x_test)
            
            # Get memory stats
            mem_allocated = torch.cuda.memory_allocated() / 1024**2
            mem_reserved = torch.cuda.memory_reserved() / 1024**2
            mem_peak = torch.cuda.max_memory_allocated() / 1024**2
            
            print(f"Sequence length {seq_len:4d}: "
                  f"Allocated: {mem_allocated:6.2f} MB, "
                  f"Reserved: {mem_reserved:6.2f} MB, "
                  f"Peak: {mem_peak:6.2f} MB")
            
        except RuntimeError as e:
            if "out of memory" in str(e):
                print(f"Sequence length {seq_len:4d}: ✗ Out of memory")
                break
            else:
                raise
    
    # Final cleanup
    torch.cuda.empty_cache()
else:
    print("Skipping memory test (CUDA not available)")

Testing memory usage with different sequence lengths...

Sequence length   64: Allocated:  17.45 MB, Reserved:  24.00 MB, Peak:  17.86 MB
Sequence length  128: Allocated:  17.51 MB, Reserved:  24.00 MB, Peak:  18.31 MB
Sequence length  256: Allocated:  17.64 MB, Reserved:  24.00 MB, Peak:  19.19 MB
Sequence length  512: Allocated:  17.89 MB, Reserved:  26.00 MB, Peak:  20.95 MB
Sequence length 1024: Allocated:  18.39 MB, Reserved:  28.00 MB, Peak:  24.47 MB


## Step 7: Environment Summary

Display the complete working environment configuration.