# Versor Transformer - Quick Start Guide

This notebook provides a minimal working example to get you started with the Versor architecture.

The Versor Transformer is a geometric deep learning architecture based on Conformal Geometric Algebra (Cl(4,1)).
It uses multivector representations and geometric products for learning tasks.

## What you'll learn:
1. How to import the Versor architecture
2. How to create a simple dataset (learning x²)
3. How to train the model
4. How to test and evaluate performance

Feel free to adapt this code to your own problems!

## Cell 1: Import the Versor Architecture

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from Model import (
    VersorTransformer,
    conformal_lift,
    normalize_cl41
)

# Pick the compute device: CUDA when PyTorch reports a usable GPU, CPU otherwise
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Seed PyTorch and NumPy with one shared value so that a fresh
# "Restart & Run All" draws the same random numbers every time
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

print("✓ Versor architecture imported successfully!")

## Cell 2: Generate Random Train/Test Set (Learning x²)

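We'll create a simple regression task: learning the function f(x) = x². Each sample is a sequence of random x values, and the target is the square of the **last** value in that sequence.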

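Each scalar x is stored as the 4D point `[x, 0, 0, 0]`. Note that although `conformal_lift` is imported in Cell 1, this example does not call it: the mapping into multivector space is done by a learned linear projection followed by `normalize_cl41` inside the model wrapper in Cell 3.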

In [None]:
def generate_quadratic_dataset(n_samples=1000, seq_len=10, x_range=(-2, 2)):
    """
    Build a toy regression dataset whose target is the square of a scalar.

    Every sample is a sequence of ``seq_len`` scalars drawn uniformly from
    ``x_range`` using NumPy's global random state. Each scalar ``x`` is stored
    as the 4D point ``[x, 0, 0, 0]``, and the sample's target is the square of
    the final scalar in its sequence.

    Args:
        n_samples: Number of sequences to draw.
        seq_len: Number of scalars in each sequence.
        x_range: ``(low, high)`` bounds handed to the uniform sampler.

    Returns:
        X: float32 tensor of shape (n_samples, seq_len, 4); channel 0 holds
           the sampled values and channels 1-3 are zero.
        y: float32 tensor of shape (n_samples,) holding the squared last
           value of each sequence.
    """
    low, high = x_range[0], x_range[1]
    samples = np.random.uniform(low, high, size=(n_samples, seq_len))

    # Only the last element of each sequence determines the label
    targets = np.square(samples[:, -1])

    # Put the scalars in channel 0 of an otherwise all-zero 4D point
    zero_channels = np.zeros((n_samples, seq_len, 3))
    points_4d = np.concatenate([samples[..., np.newaxis], zero_channels], axis=-1)

    # NumPy produces float64; the model works in float32
    X = torch.from_numpy(points_4d).float()
    y = torch.from_numpy(targets).float()
    return X, y

# Split sizes and sequence length used throughout the notebook
n_train = 800
n_test = 200
seq_len = 10

# Draw the train split first, then the test split (both consume NumPy's
# global random state), and place every tensor on the selected device
X_train, y_train = (t.to(device) for t in generate_quadratic_dataset(n_train, seq_len))
X_test, y_test = (t.to(device) for t in generate_quadratic_dataset(n_test, seq_len))

# Quick sanity report: shapes and value ranges of the training split
train_x_values = X_train[:, :, 0]  # channel 0 carries the sampled x values
print("✓ Dataset generated!")
print(f"  Train: {X_train.shape}, Test: {X_test.shape}")
print(f"  Sample input range: [{train_x_values.min():.2f}, {train_x_values.max():.2f}]")
print(f"  Sample output range: [{y_train.min():.2f}, {y_train.max():.2f}]")

# Scatter the x values of the first five training sequences against x²
fig, ax = plt.subplots(figsize=(10, 4))
for sample_idx in range(5):
    x_vals = train_x_values[sample_idx].cpu().numpy()
    ax.scatter(x_vals, x_vals**2, alpha=0.6, label=f'Sample {sample_idx + 1}')
ax.set_xlabel('x')
ax.set_ylabel('x²')
ax.set_title('Sample Training Data (x²)')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

## Cell 3: Train the Architecture

Now we'll create and train a Versor Transformer model.

The model will learn to predict x² from sequences of x values.

In [None]:
# Model hyperparameters (kept as individual names because later cells read them)
embed_dim = 16      # Embedding dimension (number of multivector channels)
n_heads = 4         # Number of attention heads
n_layers = 2        # Number of transformer layers
n_classes = 1       # Regression task (single output)
expansion = 2       # MLP expansion factor

# Collect the constructor arguments in one place so they are easy to inspect
model_kwargs = dict(
    embed_dim=embed_dim,
    n_heads=n_heads,
    n_layers=n_layers,
    n_classes=n_classes,
    expansion=expansion,
    use_rotor_pool=True,
)

# Build the Versor backbone and move its parameters to the selected device
model = VersorTransformer(**model_kwargs).to(device)

# Add an input embedding layer to lift data to multivector space
class VersorRegressionModel(nn.Module):
    """
    Regression wrapper that feeds sequences of 4D points into a Versor model.

    Each 4D input point is mapped by a learned linear layer to ``embed_dim``
    multivectors of 32 components each (Cl(4,1) has 2**5 = 32 basis blades),
    passed through ``normalize_cl41`` and then handed to the wrapped model.

    Args:
        versor_model: Module applied to the
            (batch, seq_len, embed_dim, 32) multivector tensor.
        seq_len: Accepted for backward compatibility; not used, because the
            sequence length is read from the input at call time.
        embed_dim: Number of multivector channels per sequence position.
    """

    # Number of components of one Cl(4,1) multivector
    N_COMPONENTS = 32

    def __init__(self, versor_model, seq_len, embed_dim):
        super().__init__()
        # Stored on the instance so forward() does not depend on a
        # notebook-level global that happens to share the same name
        self.embed_dim = embed_dim
        # Project each 4D point to embed_dim flattened multivectors
        self.input_proj = nn.Linear(4, embed_dim * self.N_COMPONENTS)
        self.versor = versor_model

    def forward(self, x):
        """
        Run the wrapped model on a batch of 4D point sequences.

        Args:
            x: Tensor of shape (batch, seq_len, 4).

        Returns:
            The wrapped model's output with a trailing size-1 dimension
            removed; (batch,) when the wrapped model returns (batch, 1).
        """
        batch_size, seq_len, _ = x.shape

        # (batch, seq_len, 4) -> (batch, seq_len, embed_dim * 32)
        x = self.input_proj(x)
        # Unflatten into separate multivector channels
        x = x.view(batch_size, seq_len, self.embed_dim, self.N_COMPONENTS)

        # Normalize in multivector space
        x = normalize_cl41(x)

        # Pass through the wrapped Versor model
        out = self.versor(x)

        return out.squeeze(-1)

# Combine the input projection with the Versor backbone and move it to the device
full_model = VersorRegressionModel(model, seq_len, embed_dim).to(device)

# Count only the parameters that require gradients
trainable_params = [p for p in full_model.parameters() if p.requires_grad]
n_params = sum(p.numel() for p in trainable_params)
print(f"✓ Model created with {n_params:,} parameters")

# Loss, optimizer and learning-rate schedule.
# The scheduler halves the learning rate after 10 epochs without improvement
# of the metric passed to scheduler.step().
criterion = nn.MSELoss()
optimizer = optim.AdamW(
    full_model.parameters(),
    lr=0.001,
    weight_decay=1e-4,
)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=10,
)

# Training loop
n_epochs = 100
batch_size = 32
train_losses = []   # sample-weighted mean training loss, one entry per epoch
test_losses = []    # full-batch test loss, one entry per epoch

print("\nStarting training...")
print("-" * 60)

n_train_samples = X_train.size(0)

for epoch in range(n_epochs):
    # Training
    full_model.train()
    # Accumulate the *sum* of per-sample losses so the epoch average stays
    # correct even when the last mini-batch is smaller than batch_size
    epoch_loss_sum = 0.0

    # Mini-batch training: reshuffle every epoch, and move the index tensor
    # to the data's device once instead of once per batch
    perm = torch.randperm(n_train_samples).to(X_train.device)
    for start in range(0, n_train_samples, batch_size):
        idx = perm[start:start + batch_size]
        batch_X = X_train[idx]
        batch_y = y_train[idx]

        optimizer.zero_grad()
        pred = full_model(batch_X)
        loss = criterion(pred, batch_y)
        loss.backward()

        # Gradient clipping for stability
        torch.nn.utils.clip_grad_norm_(full_model.parameters(), 1.0)

        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size
        epoch_loss_sum += loss.item() * idx.numel()

    avg_train_loss = epoch_loss_sum / n_train_samples
    train_losses.append(avg_train_loss)

    # Evaluation (single full-batch pass over the test split)
    full_model.eval()
    with torch.no_grad():
        test_pred = full_model(X_test)
        test_loss = criterion(test_pred, y_test).item()
        test_losses.append(test_loss)

    # NOTE(review): the test split drives both the LR schedule and early
    # stopping below, so the reported test loss is optimistically biased.
    # For anything beyond a quick start, hold out a separate validation split.

    # Learning rate scheduling
    scheduler.step(test_loss)

    # Print progress on the first epoch and every 10th epoch
    if (epoch + 1) % 10 == 0 or epoch == 0:
        lr = optimizer.param_groups[0]['lr']
        print(f"Epoch {epoch+1:3d}/{n_epochs} | Train Loss: {avg_train_loss:.6f} | Test Loss: {test_loss:.6f} | LR: {lr:.6f}")

    # Early stopping once the monitored loss is small enough
    if test_loss < 0.001:
        print(f"\n✓ Converged at epoch {epoch+1}!")
        break

print("-" * 60)
print("✓ Training complete!")

# Plot the per-epoch train and test losses on a shared logarithmic axis
fig, ax = plt.subplots(figsize=(10, 4))
for curve, curve_label in ((train_losses, 'Train Loss'), (test_losses, 'Test Loss')):
    ax.plot(curve, label=curve_label, alpha=0.7)
ax.set_xlabel('Epoch')
ax.set_ylabel('MSE Loss')
ax.set_title('Training Progress')
ax.legend()
ax.grid(True, alpha=0.3)
# The log scale is applied last, as in the pyplot version of this cell
ax.set_yscale('log')
plt.show()

## Cell 4: Test the Architecture

Let's evaluate the trained model and visualize its predictions.

In [None]:
# Final evaluation pass over the whole test split, without gradient tracking
full_model.eval()
with torch.no_grad():
    test_pred = full_model(X_test)
    test_loss = criterion(test_pred, y_test).item()

    # All error metrics derive from the same residual vector
    residuals = test_pred - y_test
    squared_residuals = residuals.pow(2)
    mae = residuals.abs().mean().item()
    rmse = squared_residuals.mean().sqrt().item()

    # R² = 1 - SS_res / SS_tot (left as a 0-dim tensor; the f-string below formats it)
    ss_res = squared_residuals.sum()
    ss_tot = (y_test - y_test.mean()).pow(2).sum()
    r2 = 1 - (ss_res / ss_tot)

separator = "=" * 60
print(separator)
print("EVALUATION RESULTS")
print(separator)
print(f"Test MSE:  {test_loss:.6f}")
print(f"Test MAE:  {mae:.6f}")
print(f"Test RMSE: {rmse:.6f}")
print(f"R² Score:  {r2:.6f}")
print(separator)

# Two side-by-side diagnostics: predicted-vs-actual scatter and error histogram
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_scatter, ax_hist = axes

# matplotlib needs host-side NumPy arrays
y_test_np = y_test.cpu().numpy()
test_pred_np = test_pred.cpu().numpy()

# Left panel: points on the dashed diagonal are perfect predictions
target_lo, target_hi = y_test_np.min(), y_test_np.max()
ax_scatter.scatter(y_test_np, test_pred_np, alpha=0.5)
ax_scatter.plot(
    [target_lo, target_hi],
    [target_lo, target_hi],
    'r--', linewidth=2, label='Perfect Prediction',
)
ax_scatter.set_xlabel('Actual x²')
ax_scatter.set_ylabel('Predicted x²')
ax_scatter.set_title('Predicted vs Actual Values')
ax_scatter.legend()
ax_scatter.grid(True, alpha=0.3)

# Right panel: distribution of signed errors (prediction minus target)
errors = test_pred_np - y_test_np
ax_hist.hist(errors, bins=30, alpha=0.7, edgecolor='black')
ax_hist.axvline(0, color='r', linestyle='--', linewidth=2, label='Zero Error')
ax_hist.set_xlabel('Prediction Error')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Error Distribution')
ax_hist.legend()
ax_hist.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Tabulate up to ten test examples: last input value, target, prediction, signed error
table_rule = "-" * 60
print("\nSample Predictions:")
print(table_rule)
print(f"{'Input x':<15} {'Actual x²':<15} {'Predicted x²':<15} {'Error':<15}")
print(table_rule)
n_shown = min(10, len(X_test))
for x_seq, target, estimate in zip(X_test[:n_shown], y_test[:n_shown], test_pred[:n_shown]):
    x_val = x_seq[-1, 0].item()  # the target depends only on the last x in the sequence
    actual = target.item()
    pred = estimate.item()
    error = pred - actual
    print(f"{x_val:<15.4f} {actual:<15.4f} {pred:<15.4f} {error:<15.4f}")
print(table_rule)

print("\n✓ Testing complete!")

# Closing pointers for the reader
banner_rule = "=" * 60
next_steps = (
    "1. Explore the tasks/ folder for more complex examples",
    "2. Try different architectures by adjusting hyperparameters",
    "3. Adapt this code to your own regression/classification problems",
    "4. Check out the paper in Paper/ for theoretical background",
)
print("\n" + banner_rule)
print("NEXT STEPS:")
print(banner_rule)
for step in next_steps:
    print(step)
print(banner_rule)