## 1. Installation and Imports

In [1]:
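# Install LiVAE into this kernel's environment if it is not already installed.
# Uncomment the next line; `%pip` installs into the running kernel, unlike `!pip`.
# %pip install livae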

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import anndata as ad

# Import LiVAE
from livae import agent

# Seed NumPy's global RNG so the synthetic data generated with np.random below
# is reproducible from a fresh kernel.
np.random.seed(42)

# Confirm the import worked and show what the imported object exposes.
# `dir(agent)` lists attributes of `agent` itself (not of the `livae` package),
# so the label names it explicitly; names starting with an underscore are
# dropped to keep the output short.
public_api = [name for name in dir(agent) if not name.startswith('_')]
print("LiVAE imported successfully!")
print(f"Public attributes of livae.agent: {public_api}")

ModuleNotFoundError: No module named 'livae'

## 2. Create Synthetic Single-Cell Data

Let's create a simple synthetic dataset with:
- 500 cells
- 100 genes
- 3 cell types with different expression patterns

In [None]:
# Create synthetic count data with 3 cell types.
#
# Every cell type shares a Poisson background over all genes and additionally
# gets its own block of highly expressed "marker" genes. Group sizes and marker
# ranges are declared once so the labels, the matrix and `n_cells` stay in sync.
n_genes = 100
BACKGROUND_RATE = 5.0   # Poisson mean for non-marker genes
MARKER_RATE = 15.0      # Poisson mean inside a type's marker block

# cell type -> (number of cells, (first marker gene, one-past-last marker gene))
cell_type_spec = {
    'Type_A': (200, (0, 30)),
    'Type_B': (150, (30, 60)),
    'Type_C': (150, (60, 90)),
}
n_cells = sum(size for size, _ in cell_type_spec.values())

# Cell type labels (one entry per row of X) and the per-type count matrices
cell_types = []
count_blocks = []
for type_name, (size, (start, stop)) in cell_type_spec.items():
    # Draw order is fixed (background, then markers, type after type) so that
    # a given global NumPy seed always produces the same matrix.
    block = np.random.poisson(BACKGROUND_RATE, (size, n_genes))
    block[:, start:stop] = np.random.poisson(MARKER_RATE, (size, stop - start))
    count_blocks.append(block)
    cell_types.extend([type_name] * size)

# Combine data; cast the integer Poisson draws to float
X = np.vstack(count_blocks).astype(float)

# Guard against the spec, the labels and the matrix getting out of step
assert X.shape == (n_cells, n_genes), f"unexpected matrix shape {X.shape}"
assert len(cell_types) == n_cells, "one label is required per cell"

# Create AnnData object; the raw counts are also kept in a dedicated layer
adata = ad.AnnData(X)
adata.obs['cell_type'] = cell_types
adata.layers['counts'] = X.copy()

# Add gene names
adata.var_names = [f'Gene_{i}' for i in range(n_genes)]

print("Created AnnData object:")
print(f"  Shape: {adata.shape}")
print(f"  Cell types: {adata.obs['cell_type'].value_counts().to_dict()}")
print(f"  Layers: {list(adata.layers.keys())}")

## 3. Visualize Input Data

In [None]:
# One bar-chart panel per cell type showing the mean count of every gene
type_names = ['Type_A', 'Type_B', 'Type_C']
gene_positions = range(n_genes)

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, cell_type in zip(axes, type_names):
    # Rows of the expression matrix belonging to this cell type
    in_type = adata.obs['cell_type'] == cell_type
    per_gene_mean = adata.X[in_type].mean(axis=0)

    ax.bar(gene_positions, per_gene_mean, alpha=0.7)
    ax.set_title(f'{cell_type} Mean Expression')
    ax.set_xlabel('Gene Index')
    ax.set_ylabel('Mean Count')
    # Same y-range on every panel so the three profiles are directly comparable
    ax.set_ylim([0, 20])

plt.tight_layout()
plt.show()

print("Each cell type has distinct expression patterns in different gene regions.")

## 4. Initialize LiVAE Model

Key parameters:
- `adata`: Your AnnData object
- `layer`: Which data layer to use ('counts', 'X', etc.)
- `latent_dim`: Dimension of primary latent space (default: 10)
- `i_dim`: Dimension of interpretable embedding (default: 2)
- `hidden_dim`: Size of hidden layers (default: 128)
- `percent`: Fraction of data per batch (default: 0.01)
- `lr`: Learning rate
- `beta`: Weight of the KL-divergence term (β-VAE)

In [None]:
# Initialize LiVAE agent with basic parameters.
#
# The hyperparameters live in a single dict so that the values handed to the
# model and the values reported below cannot drift apart.
model_config = {
    'latent_dim': 10,    # Primary latent dimension
    'i_dim': 2,          # Interpretable embedding dimension (for visualization)
    'hidden_dim': 64,    # Hidden layer size (smaller for this small dataset)
    'percent': 0.2,      # Use 20% of data per batch (100 cells)
    'lr': 1e-3,          # Learning rate
    'beta': 1.0,         # β-VAE weight (KL divergence)
}

model = agent(
    adata=adata,
    layer='counts',      # Use the counts layer
    **model_config,
)

# NOTE(review): the reported batch size is simply percent * number of cells;
# confirm against the LiVAE documentation that the agent sizes batches this way.
approx_batch_size = int(model_config['percent'] * adata.n_obs)

print("LiVAE model initialized successfully!")
print("\nModel configuration:")
print(f"  Input dimension: {adata.n_vars}")
print(f"  Hidden dimension: {model_config['hidden_dim']}")
print(f"  Latent dimension: {model_config['latent_dim']}")
print(f"  Interpretable dimension: {model_config['i_dim']}")
print(f"  Batch size: {approx_batch_size} cells")

## 5. Train the Model

The `fit()` method trains the model with a progress bar showing:
- **Loss**: Total combined loss
- **ARI**: Adjusted Rand Index (clustering quality)
- **NMI**: Normalized Mutual Information
- **ASW**: Average Silhouette Width
- **C_H**: Calinski-Harabasz Index
- **D_B**: Davies-Bouldin Index
- **P_C**: Graph connectivity score

In [None]:
# Fit the model; the epoch count is named so it is easy to spot and adjust
n_epochs = 50
model.fit(epochs=n_epochs)

print("\nTraining completed!")

## 6. Extract Latent Representations

LiVAE provides two types of embeddings:
1. **Latent embedding** (`get_latent()`): Primary high-dimensional representation
2. **Interpretable embedding** (`get_iembed()`): Compressed low-dimensional representation

In [None]:
# Pull both learned representations out of the trained model.
# Expected shapes are (n_cells, latent_dim) and (n_cells, i_dim) respectively;
# the prints below show the actual shapes.
latent = model.get_latent()
interpretable = model.get_iembed()

print(f"Latent embedding shape: {latent.shape}")
print(f"Interpretable embedding shape: {interpretable.shape}")

# Store each embedding on the AnnData object under its own obsm key
for obsm_key, embedding in (
    ('X_livae_latent', latent),
    ('X_livae_interpretable', interpretable),
):
    adata.obsm[obsm_key] = embedding

stored_keys = list(adata.obsm.keys())
print("\nEmbeddings added to adata.obsm:")
print(f"  Available: {stored_keys}")

## 7. Visualize Results

In [None]:
def _scatter_by_cell_type(ax, embedding, labels, xlabel, ylabel, title):
    """Scatter the first two columns of an embedding, one colour per label.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    embedding : array-like with at least two columns; only columns 0 and 1
        are plotted.
    labels : array-like with one label per row of `embedding`.
    xlabel, ylabel, title : text for the axes and the panel title.
    """
    points = np.asarray(embedding)
    # Plain array so the boolean mask indexes rows positionally, independent
    # of any pandas index attached to `labels`.
    label_values = np.asarray(labels)

    # Unique labels in order of first appearance (keeps legend order stable)
    for cell_type in dict.fromkeys(label_values.tolist()):
        mask = label_values == cell_type
        ax.scatter(
            points[mask, 0],
            points[mask, 1],
            label=cell_type,
            alpha=0.6,
            s=30,
        )

    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    ax.grid(alpha=0.3)


# Create visualization: interpretable embedding on the left, latent on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot interpretable embedding (2D)
_scatter_by_cell_type(
    axes[0],
    interpretable,
    adata.obs['cell_type'],
    xlabel='Interpretable Dimension 1',
    ylabel='Interpretable Dimension 2',
    title='Interpretable Embedding (2D)',
)

# Plot first 2 dimensions of latent embedding; the total number of latent
# dimensions in the title is read from the array instead of being hard-coded
_scatter_by_cell_type(
    axes[1],
    latent,
    adata.obs['cell_type'],
    xlabel='Latent Dimension 1',
    ylabel='Latent Dimension 2',
    title=f'Latent Embedding (first 2 dims of {latent.shape[1]})',
)

plt.tight_layout()
plt.show()

# No success claim here: nothing has been measured yet at this point, so the
# message only tells the reader what to look for in the figure.
print("\nDistinct groups per cell type in the plots indicate that LiVAE separated them; "
      "the clustering metrics in the next section quantify this.")

## 8. Clustering Analysis

In [None]:
# Perform clustering on latent representation and score it against the
# known cell types.
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# Convert true labels to numeric codes derived from the data itself, so an
# unexpected label can never silently turn into NaN. Categories are sorted,
# hence Type_A -> 0, Type_B -> 1, Type_C -> 2.
cell_type_labels = adata.obs['cell_type'].astype('category')
true_labels = cell_type_labels.cat.codes.to_numpy()
n_clusters = cell_type_labels.nunique()

# Cluster using latent embedding. `n_init` is set explicitly because its
# default differs between scikit-learn releases; pinning it keeps the result
# comparable across versions and avoids the deprecation warning.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
predicted_labels = kmeans.fit_predict(latent)

# Calculate metrics (both are invariant to how cluster ids are numbered)
ari = adjusted_rand_score(true_labels, predicted_labels)
nmi = normalized_mutual_info_score(true_labels, predicted_labels)

print("\nClustering Performance:")
print(f"  Adjusted Rand Index (ARI): {ari:.3f}")
print(f"  Normalized Mutual Info (NMI): {nmi:.3f}")
print("\n  ARI = 1.0 means perfect clustering")
print("  ARI = 0.0 means random clustering")

## 9. Summary Statistics

In [None]:
# Assemble the summary report line by line, then emit it
banner = "=" * 60
n_cell_types = len(adata.obs['cell_type'].unique())

report_lines = [
    banner,
    "LiVAE Basic Usage Summary",
    banner,
    "\nDataset:",
    f"  Cells: {adata.n_obs}",
    f"  Genes: {adata.n_vars}",
    f"  Cell types: {n_cell_types}",
    "\nModel:",
    f"  Latent dimension: {latent.shape[1]}",
    f"  Interpretable dimension: {interpretable.shape[1]}",
    "  Training epochs: 50",
    "\nPerformance:",
    f"  ARI: {ari:.3f}",
    f"  NMI: {nmi:.3f}",
    "\n✅ Tutorial completed successfully!",
    banner,
]

for report_line in report_lines:
    print(report_line)

## Next Steps

Continue to the next tutorials:
- **Tutorial 2**: Advanced regularization (β-VAE, DIP, TC, InfoVAE, Lorentzian)
- **Tutorial 3**: Real single-cell data analysis
- **Tutorial 4**: Batch correction and integration
- **Tutorial 5**: Hyperparameter tuning and best practices