In [None]:
import sys, os, platform, psutil, sys, tempfile
import scvi, sklearn, scipy, anndata, scanpy as sc, seaborn as sns, torch, numpy as np, muon

# --- Host / interpreter summary ---
total_memory_gb = psutil.virtual_memory().total / (1024**3)
print("Python:", sys.version.split()[0])
print("OS:", platform.system(), platform.release())
print("CPU count:", psutil.cpu_count(logical=True))
print("Memory (GB):", round(total_memory_gb, 2))
print("")

# --- Package versions, printed in a fixed order ---
versioned_packages = (
    ("scvi-tools", scvi),
    ("scikit-learn", sklearn),
    ("scipy", scipy),
    ("anndata", anndata),
    ("scanpy", sc),
    ("seaborn", sns),
    ("torch", torch),
    ("numpy", np),
    ("muon", muon),
)
for pkg_label, pkg in versioned_packages:
    print(f"{pkg_label}:", pkg.__version__)

In [None]:
sc.set_figure_params(figsize=(6, 6), frameon=False)  # default parameters for all Scanpy plots
sns.set_theme()
# "high" allows PyTorch to use reduced-precision float32 matrix multiplications
# (e.g. TF32) in exchange for speed; "highest" is the setting that keeps full
# float32 precision.
torch.set_float32_matmul_precision("high")
save_dir = tempfile.TemporaryDirectory()

# Output directory, relative to the current working directory.
# It sits under the same "Intermediate_Files" root (underscore, no space) that
# the QC outputs are read from, so all intermediates live in one tree.
# NOTE(review): the original path was 'Intermediate Files/...' (with a space);
# confirm no downstream notebook expects that spelling.
output_dir = 'Intermediate_Files/Parameter_Testing'
os.makedirs(output_dir, exist_ok=True)

# output_dir can now be used to save files
print(f"Output directory is set to: {output_dir}")

In [None]:
from anndata import read_h5ad

# Folder holding the QC-stage outputs
Int_folder = "Intermediate_Files/QC/"

# Read the concatenated gene-level AnnData written by the QC step
gene_data_path = os.path.join(Int_folder, "Concatenated_Gene_Data.h5ad")
gene_data_std = read_h5ad(gene_data_path)

In [None]:
# Check for initial NaN values.
# np.isnan cannot be applied to a SciPy sparse matrix directly, so when .X is
# sparse only its explicitly stored values (.data) are inspected; implicit
# zeros can never be NaN. A dense .X is checked as before.
X_stored_values = (
    gene_data_std.X.data if scipy.sparse.issparse(gene_data_std.X) else gene_data_std.X
)
print("Initial NaN values in the PBMC Gene data matrix:", np.isnan(X_stored_values).sum())

In [None]:
# Number of cells per value of the "batch" column in the loaded data
print(gene_data_std.obs["batch"].value_counts())

In [None]:
# Display the dtype of the expression matrix as loaded
gene_data_std.X.dtype

In [None]:
# Keep an independent copy of .X in the "counts" layer
gene_data_std.layers["counts"] = gene_data_std.X.copy()

In [None]:
## Wrap a copy of the gene-level AnnData in a MuData object with a single "rna" modality
## (NOTE(review): axis=-1 is passed through to muon.MuData unchanged; confirm it is the intended axis setting)
mdata_g_std = muon.MuData({"rna":gene_data_std.copy()}, axis=-1)
## Point .X of the "rna" modality at the values saved in the "counts" layer, then drop
## .raw and the now-redundant layer. No log-normalized modality is created in this cell.
mdata_g_std.mod["rna"].X = mdata_g_std.mod["rna"].layers["counts"]
del mdata_g_std.mod["rna"].raw
del mdata_g_std.mod["rna"].layers["counts"]

In [None]:
# Display the MuData summary
mdata_g_std

In [None]:
## Reference (not a copy) to the "rna" modality inside the MuData object
rna_adata_g_std = mdata_g_std.mod['rna']

## Re-create the "counts" layer from the current .X (the layer was deleted when the MuData was built)
rna_adata_g_std.layers["counts"] = rna_adata_g_std.X.copy()

In [None]:
# Number of cells per batch in the RNA modality
print(rna_adata_g_std.obs["batch"].value_counts())

In [None]:
# Display the dtype of the RNA modality's expression matrix
rna_adata_g_std.X.dtype

In [None]:
# Inspect which per-cell annotations are available on the RNA modality
print(mdata_g_std.mod['rna'].obs.columns)  # List all columns in .obs

In [None]:
batch_column = mdata_g_std.mod['rna'].obs['batch']

# Storage dtype of the batch annotation
print(batch_column.dtype)

# Distinct batch labels present
print(batch_column.unique())

In [None]:
# Remove a previously registered "_scvi_batch" column, if one exists
rna_obs = mdata_g_std.mod['rna'].obs
if "_scvi_batch" in rna_obs.columns:
    del rna_obs["_scvi_batch"]

In [None]:
from scvi.model import AUTOZI

# Register the full RNA modality with AUTOZI, using obs["batch"] as the batch column
rna_for_setup = mdata_g_std.mod['rna']
AUTOZI.setup_anndata(adata=rna_for_setup, batch_key="batch")

In [None]:
registered_obs = mdata_g_std.mod['rna'].obs
print(registered_obs.columns)  # Should include "_scvi_batch"
print(registered_obs["_scvi_batch"].unique())  # One value per registered batch

In [None]:
## (Optional) if _scvi_batch is missing or batches do not contain assigned integers, perform the following:

#from scvi.data import AnnDataManager
#from scvi.data.fields import CategoricalObsField

## Initialize AnnDataManager
#adata_manager = AnnDataManager(
#    fields=[
#        CategoricalObsField("batch", attr_key="_scvi_batch")  # Specify the batch key
#    ]
#)

## Register integer # to each batch within the RNA object
#adata_manager.register_fields(mdata_g_std.mod['rna'])
#
## Validate that batch was successfully found and registered
#adata_manager.validate()

## Print the unique batch IDs in _scvi_batch ([0,1] is two batches)
#print(mdata_g_std.mod['rna'].obs['_scvi_batch'].unique())

In [None]:
# Quick look at what is stored on the RNA modality
rna_modality = mdata_g_std.mod['rna']
print(rna_modality.uns.keys())
print(type(rna_modality))
print(rna_modality.obs.columns)

In [None]:
import os

# Requested thread count, capped at the number of CPUs on this machine so that
# smaller hosts are not oversubscribed.
N_THREADS = min(127, os.cpu_count() or 1)

# These variables are read when the BLAS/OpenMP runtimes initialise; numpy and
# torch are already imported at this point, so the libraries may ignore them.
# They are still exported for any child processes.
os.environ["OMP_NUM_THREADS"] = str(N_THREADS)
os.environ["MKL_NUM_THREADS"] = str(N_THREADS)

# Takes effect immediately for PyTorch CPU intra-op parallelism.
torch.set_num_threads(N_THREADS)

In [None]:
def run_autozi_test(
    adata_full,
    subset_n=1000, # number of cells to randomly sample
    n_latent=20, # number of latent space dimensions
    dropout_rate=0.3, # dropout rate passed to the AUTOZI model
    learning_rate=1e-2, # optimizer learning rate
    weight_decay=1e-3, # optimizer weight decay
    max_epochs=20, # max number of passes through training data, low for testing
    seed=42, # random seed for reproducibility
    batch_key="batch", # obs column holding the batch label
    train_size=0.9, # fraction of sampled cells used for training
    batch_size=128, # mini-batch size for training
    early_stopping_patience=10, # stagnant epochs tolerated before stopping
):
    """Train AUTOZI on a random subset of cells as a quick hyperparameter test.

    Parameters
    ----------
    adata_full
        AnnData object to sample cells from; it is not modified (the subset is
        copied before being registered with AUTOZI).
    subset_n
        Number of cells to sample without replacement; capped at
        ``adata_full.n_obs``.
    n_latent, dropout_rate
        Forwarded to the ``AUTOZI`` constructor.
    learning_rate, weight_decay
        Forwarded to training as ``plan_kwargs`` (``lr`` / ``weight_decay``).
    max_epochs, train_size, batch_size, early_stopping_patience
        Forwarded to ``model.train``; early stopping is always enabled.
    seed
        Seeds both the cell sampling and scvi-tools (via ``scvi.settings.seed``)
        so that the data split and training are repeatable.
    batch_key
        Column of ``.obs`` registered as the batch covariate.

    Returns
    -------
    AUTOZI
        The trained model. ``model.adata`` is the sampled subset.
    """
    # Seed scvi-tools globally; without this only the cell sampling below is
    # reproducible, while the train/validation split and training are not.
    scvi.settings.seed = seed

    # A dedicated legacy RandomState draws exactly the cells that
    # `np.random.seed(seed)` followed by `np.random.choice` would draw.
    rng = np.random.RandomState(seed)
    n_cells = min(subset_n, adata_full.n_obs)
    subset_indices = rng.choice(adata_full.obs_names, size=n_cells, replace=False)
    adata_subset = adata_full[subset_indices].copy()

    # Register the subset with AUTOZI
    AUTOZI.setup_anndata(adata_subset, batch_key=batch_key)

    # Initialize model
    model = AUTOZI(
        adata=adata_subset,
        n_latent=n_latent,
        dropout_rate=dropout_rate,
    )

    # Train with a held-out validation split and early stopping
    model.train(
        max_epochs=max_epochs,
        train_size=train_size,
        early_stopping=True,
        early_stopping_patience=early_stopping_patience,
        batch_size=batch_size,
        plan_kwargs={
            "lr": learning_rate,
            "weight_decay": weight_decay,
        },
    )

    # Last recorded value of each ELBO history (first column of the last row)
    train_elbo = model.history['elbo_train'].iloc[-1, 0]
    val_elbo = model.history['elbo_validation'].iloc[-1, 0]

    # Summarize results
    print(f"✅ Finished: latent={n_latent}, dropout={dropout_rate}, lr={learning_rate}, wd={weight_decay}")
    print(f"   Train ELBO: {train_elbo:.2f}, Val ELBO: {val_elbo:.2f}")
    return model

In [None]:
# === Select the RNA modality ===
# Nothing is loaded here: `adata` is a reference to the "rna" modality of the MuData object built above
adata = mdata_g_std.mod['rna']

In [None]:
# === Run test: n_latent=30, dropout_rate=0.4 ===
autozi_params = {
    "subset_n": 5000,
    "n_latent": 30,
    "dropout_rate": 0.4,
    "learning_rate": 5e-3,
    "weight_decay": 1e-2,
}
test_model = run_autozi_test(adata_full=adata, **autozi_params)

In [None]:
# Pull the per-epoch ELBO histories recorded during training
import matplotlib.pyplot as plt

history = test_model.history
train_elbo = history['elbo_train']  # ELBO on training data
val_elbo = history['elbo_validation']  # ELBO on validation data

# Show the last ten recorded values of each curve to judge convergence / early stopping
print("Training ELBO gene-level:", train_elbo.tail(10))
print("Validation ELBO gene-level:", val_elbo.tail(10))

In [None]:
# Draw both ELBO histories on one axis and look for convergence
elbo_fig, elbo_ax = plt.subplots()
elbo_ax.plot(train_elbo, label="Training ELBO")
elbo_ax.plot(val_elbo, label="Validation ELBO")
elbo_ax.set_xlabel("Epoch")
elbo_ax.set_ylabel("Negative ELBO")  # lower values indicate better model fit
elbo_ax.legend()
elbo_ax.set_title("Training vs Validation Loss (ELBO) Gene-Level")
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# The subset of cells the model was trained on
subset_adata = test_model.adata

# Latent embedding from the trained model, stored alongside the cells
latent = test_model.get_latent_representation(subset_adata)
subset_adata.obsm["X_AUTOZI_test"] = latent

# Neighbour graph and UMAP built on the AUTOZI latent space
sc.pp.neighbors(subset_adata, use_rep="X_AUTOZI_test")
sc.tl.umap(subset_adata)

# -- Batch mixing via silhouette score on batch labels --
# The score lies in [-1, 1]; values near 0 (or below) mean batches overlap in
# latent space, values near 1 mean batches form separate clusters.
batch_silhouette = silhouette_score(latent, subset_adata.obs['batch'])
print("silhouetts mixing score:", batch_silhouette)

# Batch labels on the UMAP
sc.pl.umap(subset_adata, color="batch", title="AUTOZI Latent - Batch Mixing")

In [None]:
# -- Neighbourhood batch entropy: diversity of batch labels among each cell's nearest neighbours --
from sklearn.neighbors import NearestNeighbors
from scipy.stats import entropy
import numpy as np

X = latent
labels = test_model.adata.obs['batch'].values

# Query n_neighbors + 1 points because each cell's nearest neighbour is itself
n_neighbors = 30
knn = NearestNeighbors(n_neighbors=n_neighbors + 1).fit(X)
distances, indices = knn.kneighbors(X)

# Entropy (in bits) of the batch composition of every cell's neighbourhood, self excluded
batch_entropies = [
    entropy(np.unique(labels[indices[cell][1:]], return_counts=True)[1], base=2)
    for cell in range(X.shape[0])
]

# Upper bound: neighbours spread evenly over all batches
n_batches = len(np.unique(labels))
ideal_counts = np.full(n_batches, n_neighbors / n_batches)
max_entropy = entropy(ideal_counts, base=2)

print("Max possible entropy with", n_batches, "batches and", n_neighbors, "neighbors:", max_entropy)
print("Mean entropy of batch mixing:", np.mean(batch_entropies))

In [None]:
# Latent embedding for the subset that `test_model` was trained on
latent = test_model.get_latent_representation(test_model.adata)
test_model.adata.obsm["X_AUTOZI_test"] = latent

# Model-decoded expression scaled to a library size of 10,000 counts per cell.
# `batch_key` is not a parameter of get_normalized_expression: depending on the
# scvi-tools version it is either silently ignored or raises a TypeError, so it
# is no longer passed. Use `transform_batch=` if a batch-corrected decode is wanted.
# NOTE(review): confirm that AUTOZI exposes get_normalized_expression in the
# installed scvi-tools version.
denoised_expr = test_model.get_normalized_expression(library_size=10000)

# Store the values as a layer (DataFrame -> NumPy array)
test_model.adata.layers["denoised"] = denoised_expr.values

# log1p-transformed copy for plotting
test_model.adata.layers["log_denoised"] = np.log1p(test_model.adata.layers["denoised"])

In [None]:
# T cell markers on the UMAP, coloured by log-denoised expression
cd3_marker_genes = ['CD3D:ENSG00000167286', 'CD3E:ENSG00000198851', 'CD3G:ENSG00000160654']
sc.pl.umap(test_model.adata, color=cd3_marker_genes, layer="log_denoised")

In [None]:
# Assess how well T cell markers cluster away from other clusters
# Genes of interest
genes = ['CD3D:ENSG00000167286', 'CD3E:ENSG00000198851', 'CD3G:ENSG00000160654']

# Per-cell sum of the marker values stored in .X (works for sparse or dense .X)
# NOTE(review): this sums .X, not the "log_denoised" layer — confirm that is intended.
cd3_values = test_model.adata[:, genes].X
if scipy.sparse.issparse(cd3_values):
    cd3_values = cd3_values.toarray()
test_model.adata.obs['CD3_Combined'] = np.asarray(cd3_values).sum(axis=1)

# Colour limits symmetric around zero, set by the 95th percentile
cd3_limit = np.percentile(test_model.adata.obs['CD3_Combined'], 95)

# `layer=` is omitted: it has no effect when colouring by an .obs column.
# The return value is not captured because sc.pl.umap returns None with show=True.
sc.pl.umap(
    test_model.adata,
    color='CD3_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Total TCell Marker Expression",
    vmin=-cd3_limit,
    vmax=cd3_limit,
    show=True,
)

In [None]:
# Reference to the "rna" modality of the MuData object built above (nothing is loaded here)
adata = mdata_g_std.mod['rna']

# Requested thread count, capped at the number of CPUs on this machine.
# The environment variables may be ignored by BLAS/OpenMP runtimes that were
# already initialised when numpy/torch were imported, so the PyTorch thread
# pool is also set explicitly.
n_threads = min(127, os.cpu_count() or 1)
os.environ["OPENBLAS_NUM_THREADS"] = str(n_threads)
os.environ["OMP_NUM_THREADS"] = str(n_threads)
torch.set_num_threads(n_threads)

# === Run test: n_latent=40, dropout_rate=0.4 ===
test_model = run_autozi_test(
    adata_full=adata,
    subset_n=5000,
    n_latent=40,
    dropout_rate=0.4,
    learning_rate=5e-3,
    weight_decay=1e-2
)

In [None]:
# Pull the per-epoch ELBO histories recorded during training
import matplotlib.pyplot as plt

history = test_model.history
train_elbo = history['elbo_train']  # ELBO on training data
val_elbo = history['elbo_validation']  # ELBO on validation data

# Last ten recorded values of each curve
print("Training ELBO gene-level:", train_elbo.tail(10))
print("Validation ELBO gene-level:", val_elbo.tail(10))

In [None]:
# Training vs validation ELBO curves on a single axis
elbo_fig, elbo_ax = plt.subplots()
elbo_ax.plot(train_elbo, label="Training ELBO")
elbo_ax.plot(val_elbo, label="Validation ELBO")
elbo_ax.set_xlabel("Epoch")
elbo_ax.set_ylabel("Negative ELBO")
elbo_ax.legend()
elbo_ax.set_title("Training vs Validation Loss (ELBO) Gene-Level")
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# The subset of cells the model was trained on
subset_adata = test_model.adata

# Latent embedding from the trained model, stored alongside the cells
latent = test_model.get_latent_representation(subset_adata)
subset_adata.obsm["X_AUTOZI_test"] = latent

# Neighbour graph + UMAP on the latent space, coloured by batch
sc.pp.neighbors(subset_adata, use_rep="X_AUTOZI_test")
sc.tl.umap(subset_adata)
sc.pl.umap(subset_adata, color="batch", title="AUTOZI Latent - Batch Mixing")

# Silhouette score of the batch labels in latent space
print(silhouette_score(latent, subset_adata.obs['batch']))

In [None]:
from sklearn.neighbors import NearestNeighbors
from scipy.stats import entropy
import numpy as np

X = latent
labels = test_model.adata.obs['batch'].values

# Query n_neighbors + 1 points because each cell's nearest neighbour is itself
n_neighbors = 30
knn = NearestNeighbors(n_neighbors=n_neighbors + 1).fit(X)
distances, indices = knn.kneighbors(X)

# Entropy of the batch labels among each cell's neighbours. Base 2 matches the
# first run's entropy cell so that the values are comparable across runs.
batch_entropies = []
for i in range(X.shape[0]):
    neighbor_batches = labels[indices[i][1:]]  # exclude self
    _, counts = np.unique(neighbor_batches, return_counts=True)
    batch_entropies.append(entropy(counts, base=2))

# Upper bound: neighbours spread evenly over all batches
n_batches = len(np.unique(labels))
max_entropy = entropy(np.full(n_batches, n_neighbors / n_batches), base=2)

print("Max possible entropy with", n_batches, "batches and", n_neighbors, "neighbors:", max_entropy)
print("Mean entropy of batch mixing:", np.mean(batch_entropies))

In [None]:
# Latent embedding for the subset that `test_model` was trained on
latent = test_model.get_latent_representation(test_model.adata)
test_model.adata.obsm["X_AUTOZI_test"] = latent

# Model-decoded expression scaled to a library size of 10,000 counts per cell.
# `batch_key` is not a parameter of get_normalized_expression: depending on the
# scvi-tools version it is either silently ignored or raises a TypeError, so it
# is no longer passed. Use `transform_batch=` if a batch-corrected decode is wanted.
# NOTE(review): confirm that AUTOZI exposes get_normalized_expression in the
# installed scvi-tools version.
denoised_expr = test_model.get_normalized_expression(library_size=10000)

# Store the values as a layer (DataFrame -> NumPy array)
test_model.adata.layers["denoised"] = denoised_expr.values

# log1p-transformed copy for plotting
test_model.adata.layers["log_denoised"] = np.log1p(test_model.adata.layers["denoised"])

In [None]:
# T cell markers on the UMAP, coloured by log-denoised expression
cd3_marker_genes = ['CD3D:ENSG00000167286', 'CD3E:ENSG00000198851', 'CD3G:ENSG00000160654']
sc.pl.umap(test_model.adata, color=cd3_marker_genes, layer="log_denoised")

In [None]:
# Genes of interest
genes = ['CD3D:ENSG00000167286', 'CD3E:ENSG00000198851', 'CD3G:ENSG00000160654']

# Per-cell sum of the marker values stored in .X (works for sparse or dense .X)
# NOTE(review): this sums .X, not the "log_denoised" layer — confirm that is intended.
cd3_values = test_model.adata[:, genes].X
if scipy.sparse.issparse(cd3_values):
    cd3_values = cd3_values.toarray()
test_model.adata.obs['CD3_Combined'] = np.asarray(cd3_values).sum(axis=1)

# Colour limits symmetric around zero, set by the 95th percentile
cd3_limit = np.percentile(test_model.adata.obs['CD3_Combined'], 95)

# `layer=` is omitted: it has no effect when colouring by an .obs column.
# The return value is not captured because sc.pl.umap returns None with show=True.
sc.pl.umap(
    test_model.adata,
    color='CD3_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Total TCell Marker Expression",
    vmin=-cd3_limit,
    vmax=cd3_limit,
    show=True,
)

In [None]:
# Reference to the "rna" modality of the MuData object built above (nothing is loaded here)
adata = mdata_g_std.mod['rna']

# === Run test: n_latent=50, dropout_rate=0.4 ===
autozi_params = {
    "subset_n": 5000,
    "n_latent": 50,
    "dropout_rate": 0.4,
    "learning_rate": 5e-3,
    "weight_decay": 1e-2,
}
test_model = run_autozi_test(adata_full=adata, **autozi_params)



In [None]:
# Pull the per-epoch ELBO histories recorded during training
import matplotlib.pyplot as plt

history = test_model.history
train_elbo = history['elbo_train']  # ELBO on training data
val_elbo = history['elbo_validation']  # ELBO on validation data

# Last ten recorded values of each curve
print("Training ELBO gene-level:", train_elbo.tail(10))
print("Validation ELBO gene-level:", val_elbo.tail(10))

In [None]:
# Training vs validation ELBO curves on a single axis
elbo_fig, elbo_ax = plt.subplots()
elbo_ax.plot(train_elbo, label="Training ELBO")
elbo_ax.plot(val_elbo, label="Validation ELBO")
elbo_ax.set_xlabel("Epoch")
elbo_ax.set_ylabel("Negative ELBO")
elbo_ax.legend()
elbo_ax.set_title("Training vs Validation Loss (ELBO) Gene-Level")
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# The subset of cells the model was trained on
subset_adata = test_model.adata

# Latent embedding from the trained model, stored alongside the cells
latent = test_model.get_latent_representation(subset_adata)
subset_adata.obsm["X_AUTOZI_test"] = latent

# Neighbour graph + UMAP on the latent space, coloured by batch
sc.pp.neighbors(subset_adata, use_rep="X_AUTOZI_test")
sc.tl.umap(subset_adata)
sc.pl.umap(subset_adata, color="batch", title="AUTOZI Latent - Batch Mixing")

# Silhouette score of the batch labels in latent space
print(silhouette_score(latent, subset_adata.obs['batch']))

In [None]:
from sklearn.neighbors import NearestNeighbors
from scipy.stats import entropy
import numpy as np

X = latent
labels = test_model.adata.obs['batch'].values

# Query n_neighbors + 1 points because each cell's nearest neighbour is itself
n_neighbors = 30
knn = NearestNeighbors(n_neighbors=n_neighbors + 1).fit(X)
distances, indices = knn.kneighbors(X)

# Entropy of the batch labels among each cell's neighbours. Base 2 matches the
# first run's entropy cell so that the values are comparable across runs.
batch_entropies = []
for i in range(X.shape[0]):
    neighbor_batches = labels[indices[i][1:]]  # exclude self
    _, counts = np.unique(neighbor_batches, return_counts=True)
    batch_entropies.append(entropy(counts, base=2))

# Upper bound: neighbours spread evenly over all batches
n_batches = len(np.unique(labels))
max_entropy = entropy(np.full(n_batches, n_neighbors / n_batches), base=2)

print("Max possible entropy with", n_batches, "batches and", n_neighbors, "neighbors:", max_entropy)
print("Mean entropy of batch mixing:", np.mean(batch_entropies))

In [None]:
# Latent embedding for the subset that `test_model` was trained on
latent = test_model.get_latent_representation(test_model.adata)
test_model.adata.obsm["X_AUTOZI_test"] = latent

# Model-decoded expression scaled to a library size of 10,000 counts per cell.
# `batch_key` is not a parameter of get_normalized_expression: depending on the
# scvi-tools version it is either silently ignored or raises a TypeError, so it
# is no longer passed. Use `transform_batch=` if a batch-corrected decode is wanted.
# NOTE(review): confirm that AUTOZI exposes get_normalized_expression in the
# installed scvi-tools version.
denoised_expr = test_model.get_normalized_expression(library_size=10000)

# Store the values as a layer (DataFrame -> NumPy array)
test_model.adata.layers["denoised"] = denoised_expr.values

# log1p-transformed copy for plotting
test_model.adata.layers["log_denoised"] = np.log1p(test_model.adata.layers["denoised"])

In [None]:
# Rebuild the neighbour graph and UMAP from the stored AUTOZI embedding
subset_adata = test_model.adata
sc.pp.neighbors(subset_adata, use_rep="X_AUTOZI_test")
sc.tl.umap(subset_adata)

In [None]:
# T cell markers on the UMAP, coloured by log-denoised expression
cd3_marker_genes = ['CD3D:ENSG00000167286', 'CD3E:ENSG00000198851', 'CD3G:ENSG00000160654']
sc.pl.umap(test_model.adata, color=cd3_marker_genes, layer="log_denoised")

In [None]:
# Genes of interest
genes = ['CD3D:ENSG00000167286', 'CD3E:ENSG00000198851', 'CD3G:ENSG00000160654']

# Per-cell sum of the marker values stored in .X (works for sparse or dense .X)
# NOTE(review): this sums .X, not the "log_denoised" layer — confirm that is intended.
cd3_values = test_model.adata[:, genes].X
if scipy.sparse.issparse(cd3_values):
    cd3_values = cd3_values.toarray()
test_model.adata.obs['CD3_Combined'] = np.asarray(cd3_values).sum(axis=1)

# Colour limits symmetric around zero, set by the 100th percentile (the maximum).
# NOTE(review): the other runs in this notebook use the 95th percentile here.
cd3_limit = np.percentile(test_model.adata.obs['CD3_Combined'], 100)

# `layer=` is omitted: it has no effect when colouring by an .obs column.
# The return value is not captured because sc.pl.umap returns None with show=True.
sc.pl.umap(
    test_model.adata,
    color='CD3_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Total TCell Marker Expression",
    vmin=-cd3_limit,
    vmax=cd3_limit,
    show=True,
)

In [None]:
# Reference to the "rna" modality of the MuData object built above (nothing is loaded here)
adata = mdata_g_std.mod['rna']

# === Run test: n_latent=40, dropout_rate=0.4 ===
# NOTE(review): these hyperparameters are identical to the earlier n_latent=40 run.
autozi_params = {
    "subset_n": 5000,
    "n_latent": 40,
    "dropout_rate": 0.4,
    "learning_rate": 5e-3,
    "weight_decay": 1e-2,
}
test_model = run_autozi_test(adata_full=adata, **autozi_params)

In [None]:
# Pull the per-epoch ELBO histories recorded during training
import matplotlib.pyplot as plt

history = test_model.history
train_elbo = history['elbo_train']  # ELBO on training data
val_elbo = history['elbo_validation']  # ELBO on validation data

# Last ten recorded values of each curve
print("Training ELBO gene-level:", train_elbo.tail(10))
print("Validation ELBO gene-level:", val_elbo.tail(10))

In [None]:
# Training vs validation ELBO curves on a single axis
elbo_fig, elbo_ax = plt.subplots()
elbo_ax.plot(train_elbo, label="Training ELBO")
elbo_ax.plot(val_elbo, label="Validation ELBO")
elbo_ax.set_xlabel("Epoch")
elbo_ax.set_ylabel("Negative ELBO")
elbo_ax.legend()
elbo_ax.set_title("Training vs Validation Loss (ELBO) Gene-Level")
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# The subset of cells the model was trained on
subset_adata = test_model.adata

# Latent embedding from the trained model, stored alongside the cells
latent = test_model.get_latent_representation(subset_adata)
subset_adata.obsm["X_AUTOZI_test"] = latent

# Neighbour graph + UMAP on the latent space, coloured by batch
sc.pp.neighbors(subset_adata, use_rep="X_AUTOZI_test")
sc.tl.umap(subset_adata)
sc.pl.umap(subset_adata, color="batch", title="AUTOZI Latent - Batch Mixing")

# Silhouette score of the batch labels in latent space
print(silhouette_score(latent, subset_adata.obs['batch']))

In [None]:
from sklearn.neighbors import NearestNeighbors
from scipy.stats import entropy
import numpy as np

X = latent
labels = test_model.adata.obs['batch'].values

# Query n_neighbors + 1 points because each cell's nearest neighbour is itself
n_neighbors = 30
knn = NearestNeighbors(n_neighbors=n_neighbors + 1).fit(X)
distances, indices = knn.kneighbors(X)

# Entropy of the batch labels among each cell's neighbours. Base 2 matches the
# first run's entropy cell so that the values are comparable across runs.
batch_entropies = []
for i in range(X.shape[0]):
    neighbor_batches = labels[indices[i][1:]]  # exclude self
    _, counts = np.unique(neighbor_batches, return_counts=True)
    batch_entropies.append(entropy(counts, base=2))

# Upper bound: neighbours spread evenly over all batches
n_batches = len(np.unique(labels))
max_entropy = entropy(np.full(n_batches, n_neighbors / n_batches), base=2)

print("Max possible entropy with", n_batches, "batches and", n_neighbors, "neighbors:", max_entropy)
print("Mean entropy of batch mixing:", np.mean(batch_entropies))

In [None]:
# Latent embedding for the subset that `test_model` was trained on
latent = test_model.get_latent_representation(test_model.adata)
test_model.adata.obsm["X_AUTOZI_test"] = latent

# Model-decoded expression scaled to a library size of 10,000 counts per cell.
# `batch_key` is not a parameter of get_normalized_expression: depending on the
# scvi-tools version it is either silently ignored or raises a TypeError, so it
# is no longer passed. Use `transform_batch=` if a batch-corrected decode is wanted.
# NOTE(review): confirm that AUTOZI exposes get_normalized_expression in the
# installed scvi-tools version.
denoised_expr = test_model.get_normalized_expression(library_size=10000)

# Store the values as a layer (DataFrame -> NumPy array)
test_model.adata.layers["denoised"] = denoised_expr.values

# log1p-transformed copy for plotting
test_model.adata.layers["log_denoised"] = np.log1p(test_model.adata.layers["denoised"])

In [None]:
# Rebuild the neighbour graph and UMAP from the stored AUTOZI embedding
subset_adata = test_model.adata
sc.pp.neighbors(subset_adata, use_rep="X_AUTOZI_test")
sc.tl.umap(subset_adata)

In [None]:
# T cell markers on the UMAP, coloured by log-denoised expression
cd3_marker_genes = ['CD3D:ENSG00000167286', 'CD3E:ENSG00000198851', 'CD3G:ENSG00000160654']
sc.pl.umap(test_model.adata, color=cd3_marker_genes, layer="log_denoised")

In [None]:
# Genes of interest
genes = ['CD3D:ENSG00000167286', 'CD3E:ENSG00000198851', 'CD3G:ENSG00000160654']

# Per-cell sum of the marker values stored in .X (works for sparse or dense .X)
# NOTE(review): this sums .X, not the "log_denoised" layer — confirm that is intended.
cd3_values = test_model.adata[:, genes].X
if scipy.sparse.issparse(cd3_values):
    cd3_values = cd3_values.toarray()
test_model.adata.obs['CD3_Combined'] = np.asarray(cd3_values).sum(axis=1)

# Colour limits symmetric around zero, set by the 95th percentile
cd3_limit = np.percentile(test_model.adata.obs['CD3_Combined'], 95)

# `layer=` is omitted: it has no effect when colouring by an .obs column.
# The return value is not captured because sc.pl.umap returns None with show=True.
sc.pl.umap(
    test_model.adata,
    color='CD3_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Total TCell Marker Expression",
    vmin=-cd3_limit,
    vmax=cd3_limit,
    show=True,
)

In [None]:
# Reference to the "rna" modality of the MuData object built above (nothing is loaded here)
adata = mdata_g_std.mod['rna']

# === Run test: n_latent=40, dropout_rate=0.3 ===
autozi_params = {
    "subset_n": 5000,
    "n_latent": 40,
    "dropout_rate": 0.3,
    "learning_rate": 5e-3,
    "weight_decay": 1e-2,
}
test_model = run_autozi_test(adata_full=adata, **autozi_params)

In [None]:
# Pull the per-epoch ELBO histories recorded during training
import matplotlib.pyplot as plt

history = test_model.history
train_elbo = history['elbo_train']  # ELBO on training data
val_elbo = history['elbo_validation']  # ELBO on validation data

# Last ten recorded values of each curve
print("Training ELBO gene-level:", train_elbo.tail(10))
print("Validation ELBO gene-level:", val_elbo.tail(10))

In [None]:
# Training vs validation ELBO curves on a single axis
elbo_fig, elbo_ax = plt.subplots()
elbo_ax.plot(train_elbo, label="Training ELBO")
elbo_ax.plot(val_elbo, label="Validation ELBO")
elbo_ax.set_xlabel("Epoch")
elbo_ax.set_ylabel("Negative ELBO")
elbo_ax.legend()
elbo_ax.set_title("Training vs Validation Loss (ELBO) Gene-Level")
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# The subset of cells the model was trained on
subset_adata = test_model.adata

# Latent embedding from the trained model, stored alongside the cells
latent = test_model.get_latent_representation(subset_adata)
subset_adata.obsm["X_AUTOZI_test"] = latent

# Neighbour graph + UMAP on the latent space, coloured by batch
sc.pp.neighbors(subset_adata, use_rep="X_AUTOZI_test")
sc.tl.umap(subset_adata)
sc.pl.umap(subset_adata, color="batch", title="AUTOZI Latent - Batch Mixing")

# Silhouette score of the batch labels in latent space
print(silhouette_score(latent, subset_adata.obs['batch']))

In [None]:
from sklearn.neighbors import NearestNeighbors
from scipy.stats import entropy
import numpy as np

X = latent
labels = test_model.adata.obs['batch'].values

# Query n_neighbors + 1 points because each cell's nearest neighbour is itself
n_neighbors = 30
knn = NearestNeighbors(n_neighbors=n_neighbors + 1).fit(X)
distances, indices = knn.kneighbors(X)

# Entropy of the batch labels among each cell's neighbours. Base 2 matches the
# first run's entropy cell so that the values are comparable across runs.
batch_entropies = []
for i in range(X.shape[0]):
    neighbor_batches = labels[indices[i][1:]]  # exclude self
    _, counts = np.unique(neighbor_batches, return_counts=True)
    batch_entropies.append(entropy(counts, base=2))

# Upper bound: neighbours spread evenly over all batches
n_batches = len(np.unique(labels))
max_entropy = entropy(np.full(n_batches, n_neighbors / n_batches), base=2)

print("Max possible entropy with", n_batches, "batches and", n_neighbors, "neighbors:", max_entropy)
print("Mean entropy of batch mixing:", np.mean(batch_entropies))

In [None]:
# Latent embedding for the subset that `test_model` was trained on
latent = test_model.get_latent_representation(test_model.adata)
test_model.adata.obsm["X_AUTOZI_test"] = latent

# Model-decoded expression scaled to a library size of 10,000 counts per cell.
# `batch_key` is not a parameter of get_normalized_expression: depending on the
# scvi-tools version it is either silently ignored or raises a TypeError, so it
# is no longer passed. Use `transform_batch=` if a batch-corrected decode is wanted.
# NOTE(review): confirm that AUTOZI exposes get_normalized_expression in the
# installed scvi-tools version.
denoised_expr = test_model.get_normalized_expression(library_size=10000)

# Store the values as a layer (DataFrame -> NumPy array)
test_model.adata.layers["denoised"] = denoised_expr.values

# log1p-transformed copy for plotting
test_model.adata.layers["log_denoised"] = np.log1p(test_model.adata.layers["denoised"])

In [None]:
# Rebuild the neighbour graph and UMAP from the stored AUTOZI embedding
subset_adata = test_model.adata
sc.pp.neighbors(subset_adata, use_rep="X_AUTOZI_test")
sc.tl.umap(subset_adata)

In [None]:
# T cell markers on the UMAP, coloured by log-denoised expression
cd3_marker_genes = ['CD3D:ENSG00000167286', 'CD3E:ENSG00000198851', 'CD3G:ENSG00000160654']
sc.pl.umap(test_model.adata, color=cd3_marker_genes, layer="log_denoised")

In [None]:
# Genes of interest
genes = ['CD3D:ENSG00000167286', 'CD3E:ENSG00000198851', 'CD3G:ENSG00000160654']

# Per-cell sum of the marker values stored in .X (works for sparse or dense .X)
# NOTE(review): this sums .X, not the "log_denoised" layer — confirm that is intended.
cd3_values = test_model.adata[:, genes].X
if scipy.sparse.issparse(cd3_values):
    cd3_values = cd3_values.toarray()
test_model.adata.obs['CD3_Combined'] = np.asarray(cd3_values).sum(axis=1)

# Colour limits symmetric around zero, set by the 95th percentile
cd3_limit = np.percentile(test_model.adata.obs['CD3_Combined'], 95)

# `layer=` is omitted: it has no effect when colouring by an .obs column.
# The return value is not captured because sc.pl.umap returns None with show=True.
sc.pl.umap(
    test_model.adata,
    color='CD3_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Total TCell Marker Expression",
    vmin=-cd3_limit,
    vmax=cd3_limit,
    show=True,
)