In [None]:
import os, collections, sys, platform
from collections import defaultdict
from importlib.metadata import version, PackageNotFoundError

import anndata
import igraph
import leidenalg
import matplotlib
import matplotlib.pyplot as plt
import muon as mu
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
import scvi
import seaborn as sns
import torch
from matplotlib.colors import TwoSlopeNorm
from scipy.stats import beta

def pkg_ver(name):
    """Return the installed version string of distribution `name`.

    Falls back to the literal 'Not installed' when importlib.metadata
    cannot find the distribution.
    """
    try:
        installed = version(name)
    except PackageNotFoundError:
        installed = 'Not installed'
    return installed

# Record the software environment so results can be tied to exact package versions.
version_report = (
    ('Python:', sys.version.split()[0]),
    ('scvi-tools:', scvi.__version__),
    ('scipy:', scipy.__version__),
    ('anndata:', anndata.__version__),
    ('scanpy:', sc.__version__),
    ('seaborn:', sns.__version__),
    ('torch:', torch.__version__),
    ('numpy:', np.__version__),
    ('muon:', mu.__version__),
    ('matplotlib:', matplotlib.__version__),
    ('pandas:', pd.__version__),
    ('leidenalg:', pkg_ver('leidenalg')),  # looked up via package metadata
    ('igraph:', igraph.__version__),
)
for label, installed in version_report:
    print(label, installed)


In [None]:
# Directories holding the saved AutoZI models and the MuData files used for training
iso_dir_1 = 'Modeling/PBMCs_AutoZI/Isoform/'
gene_dir_1 = 'Modeling/PBMCs_AutoZI/Gene/'

# Each MuData file lives next to its model; build the paths from the directories above
gene_mudata_path = os.path.join(gene_dir_1, "mdata_gene_with_latent_PBMCs_AutoZI.h5mu")
iso_mudata_path = os.path.join(iso_dir_1, "mdata_iso_with_latent_PBMCs_AutoZI.h5mu")

mdata_gene_pbmc = mu.read(gene_mudata_path)
mdata_iso_pbmc = mu.read(iso_mudata_path)

# The 'rna' modality is the AnnData that is handed to AutoZI below
adata_g_filtered = mdata_gene_pbmc.mod['rna']
adata_i_filtered = mdata_iso_pbmc.mod['rna']

In [None]:
## Load the trained AutoZI model (gene-level)
try:
    autozi_gene_model_pbmc = scvi.model.AUTOZI.load(gene_dir_1, adata=adata_g_filtered)
    print("AUTOZI gene-level model loaded successfully.")
except ValueError as e:
    print(f"Error loading AUTOZI gene-level model: {e}")
    autozi_gene_model_pbmc = None
    # Every later cell dereferences the model, so stop here with the real cause
    # instead of failing further down with an AttributeError on None.
    raise

In [None]:
## Load the trained AUTOZI model (isoform-level)
try:
    autozi_iso_model_pbmc = scvi.model.AUTOZI.load(iso_dir_1, adata=adata_i_filtered)
    print("AUTOZI isoform-level model loaded successfully.")
except ValueError as e:
    print(f"Error loading AUTOZI isoform-level model: {e}")
    autozi_iso_model_pbmc = None
    # Every later cell dereferences the model, so stop here with the real cause
    # instead of failing further down with an AttributeError on None.
    raise

In [None]:
# Training history of the gene-level model: ELBO across epochs for the training and validation sets
gene_history = autozi_gene_model_pbmc.history
train_elbo1 = gene_history['elbo_train']
val_elbo1 = gene_history['elbo_validation']

In [None]:
# Training history of the isoform-level model: ELBO across epochs for the training and validation sets
iso_history = autozi_iso_model_pbmc.history
train_elbo2 = iso_history['elbo_train']
val_elbo2 = iso_history['elbo_validation']

In [None]:
# Print the last ten ELBO values as a sanity check.
# The curves are plotted below as "Negative ELBO" (a loss), so the values should decrease over epochs,
# and training and validation should end close together (a large gap suggests overfitting).
print("Training ELBO gene-level:", train_elbo1[-10:])
print("Validation ELBO gene-level:", val_elbo1[-10:])

In [None]:
# Training vs validation ELBO for the gene-level model; the two curves should converge
fig, ax = plt.subplots()
ax.plot(train_elbo1, label="Training ELBO")
ax.plot(val_elbo1, label="Validation ELBO")
ax.set_xlabel("Epoch")
ax.set_ylabel("Negative ELBO")
ax.legend()
ax.set_title("Training vs Validation Loss (ELBO) Gene-Level")
plt.show()

In [None]:
# Print the last ten ELBO values as a sanity check.
# The curves are plotted below as "Negative ELBO" (a loss), so the values should decrease over epochs,
# and training and validation should end close together (a large gap suggests overfitting).
print("Training ELBO isoform-level:", train_elbo2[-10:])
print("Validation ELBO isoform-level:", val_elbo2[-10:])

In [None]:
# Training vs validation ELBO for the isoform-level model; the two curves should converge
fig, ax = plt.subplots()
ax.plot(train_elbo2, label="Training ELBO")
ax.plot(val_elbo2, label="Validation ELBO")
ax.set_xlabel("Epoch")
ax.set_ylabel("Negative ELBO")
ax.legend()
ax.set_title("Training vs Validation Loss (ELBO) Isoform-Level")
plt.show()

In [None]:
# Extract the latent representation and the Beta posterior parameters for gene data after AutoZI
latent_g_autozi = autozi_gene_model_pbmc.get_latent_representation()
outputs_g = autozi_gene_model_pbmc.get_alphas_betas()
# The ZI probability is computed below as beta.cdf(0.5, alpha, beta), i.e. the Beta posterior mass below 0.5:
alpha_posterior_g = outputs_g['alpha_posterior'] # larger alpha moves mass above 0.5 -> gene LESS likely to be called zero-inflated
beta_posterior_g = outputs_g['beta_posterior'] # larger beta moves mass below 0.5 -> gene MORE likely to be called zero-inflated

In [None]:
# Extract the latent representation and the Beta posterior parameters for isoform data after AutoZI
latent_i_autozi = autozi_iso_model_pbmc.get_latent_representation()
outputs_i = autozi_iso_model_pbmc.get_alphas_betas()
# The ZI probability is computed below as beta.cdf(0.5, alpha, beta), i.e. the Beta posterior mass below 0.5:
alpha_posterior_i = outputs_i['alpha_posterior']  # larger alpha moves mass above 0.5 -> isoform LESS likely to be called zero-inflated
beta_posterior_i = outputs_i['beta_posterior']  # larger beta moves mass below 0.5 -> isoform MORE likely to be called zero-inflated

In [None]:
# Analyze Zero-Inflated (ZI) probabilities for gene-level data
# `beta` is scipy.stats.beta, imported in the notebook's import cell.
threshold = 0.5    # Classify as ZI if posterior probability > 0.5
zi_probs_g = beta.cdf(0.5, alpha_posterior_g, beta_posterior_g)   # Per-gene mass of the Beta(alpha, beta) posterior below 0.5, used as the ZI probability
is_zi_pred_g = zi_probs_g > threshold
print('Fraction of predicted ZI genes in Gene-level Data :', is_zi_pred_g.mean())

In [None]:
# Analyze Zero-Inflated (ZI) probabilities for isoform-level data
# `beta` is scipy.stats.beta, imported in the notebook's import cell.
threshold = 0.5    # Classify as ZI if posterior probability > 0.5
zi_probs_i = beta.cdf(0.5, alpha_posterior_i, beta_posterior_i)   # Per-isoform mass of the Beta(alpha, beta) posterior below 0.5, used as the ZI probability
is_zi_pred_i = zi_probs_i > threshold

print('Fraction of predicted ZI isoforms in Isoform-Level Data :', is_zi_pred_i.mean())

In [None]:
# Named ZI predictions to summarise (one entry per latent configuration)
latent_representations_1 = [dict(name="20lat_1e2", is_zi_pred=is_zi_pred_g)]

# Per-gene boolean mask: True where the mean of .X across cells is above 1.0
mask_sufficient_expression_g_pbmc = np.array(adata_g_filtered.X.mean(axis=0)).reshape(-1) > 1.0

# Report how many genes pass the expression mask and what share of those is predicted ZI
for latent_rep in latent_representations_1:
    name, is_zi_pred = latent_rep["name"], latent_rep["is_zi_pred"]

    # Share of all genes whose average expression exceeds 1.0
    print(
        f'Fraction of genes in PBMCs with avg expression > 1.0 for {name}:',
        mask_sufficient_expression_g_pbmc.mean(),
    )

    # Share of predicted-ZI genes among the genes that pass the mask
    print(
        f'Fraction of predicted ZI genes in PBMCs with avg expression > 1.0 for {name}:',
        is_zi_pred[mask_sufficient_expression_g_pbmc].mean(),
    )
    print("")

In [None]:
# Named ZI predictions to summarise (one entry per latent configuration)
latent_representations_2 = [dict(name="20lat_1e2", is_zi_pred=is_zi_pred_i)]

# Per-isoform boolean mask: True where the mean of .X across cells is above 1.0
mask_sufficient_expression_i_pbmc = np.array(adata_i_filtered.X.mean(axis=0)).reshape(-1) > 1.0

# Report how many isoforms pass the expression mask and what share of those is predicted ZI
for latent_rep in latent_representations_2:
    name, is_zi_pred = latent_rep["name"], latent_rep["is_zi_pred"]

    # Share of all isoforms whose average expression exceeds 1.0
    print(
        f'Fraction of isoforms in PBMCs with avg expression > 1.0 for {name}:',
        mask_sufficient_expression_i_pbmc.mean(),
    )

    # Share of predicted-ZI isoforms among the isoforms that pass the mask
    print(
        f'Fraction of predicted ZI isoforms in PBMCs with avg expression > 1.0 for {name}:',
        is_zi_pred[mask_sufficient_expression_i_pbmc].mean(),
    )
    print("")

In [None]:
# Obtain denoised gene expression values from the AutoZI model, scaled to a library size of 10K reads
# NOTE(review): `batch_key` does not appear among the documented parameters of
# get_normalized_expression (batch correction there is requested via `transform_batch`) —
# confirm this call really batch-corrects before describing the output as batch-corrected.
denoised_expr_g = autozi_gene_model_pbmc.get_normalized_expression(adata_g_filtered, 
                                                                   library_size=10000, 
                                                                   batch_key = "batch"
                                                                  )

# Assign denoised expression to new layer 
adata_g_filtered.layers["denoised"] = denoised_expr_g.values  # Convert DataFrame to NumPy array to ensure format is AnnData-compatible

# Apply log1p transformation
adata_g_filtered.layers["log_denoised"] = np.log1p(adata_g_filtered.layers["denoised"])

In [None]:
# Obtain denoised isoform expression values from the AutoZI model, scaled to a library size of 10K reads
# NOTE(review): `batch_key` does not appear among the documented parameters of
# get_normalized_expression (batch correction there is requested via `transform_batch`) —
# confirm this call really batch-corrects before describing the output as batch-corrected.
denoised_expr_i = autozi_iso_model_pbmc.get_normalized_expression(adata_i_filtered, 
                                                                  library_size=10000, 
                                                                  batch_key = "batch"
                                                                 )

# Assign denoised expression to new layer
adata_i_filtered.layers["denoised"] = denoised_expr_i.values  # Convert DataFrame to NumPy array to ensure format is AnnData-compatible

# Apply log1p transformation
adata_i_filtered.layers["log_denoised"] = np.log1p(adata_i_filtered.layers["denoised"])

In [None]:
# Print the layers of both objects to confirm the two layers added above are present
print(adata_g_filtered.layers) # should list 'denoised' and 'log_denoised'
print(adata_i_filtered.layers) # should list 'denoised' and 'log_denoised'

In [None]:
# Gene-level .obsm keys; the neighbors step later in this notebook reads the 'X_AutoZI' entry
print(adata_g_filtered.obsm.keys()) # check presence of AutoZI latent representations

In [None]:
# Isoform-level .obsm keys; the neighbors step later in this notebook reads the 'X_AutoZI' entry
print(adata_i_filtered.obsm.keys())

In [None]:
# Save the gene-level AnnData object (raw .X plus the 'denoised' and 'log_denoised' layers)
output_dir = 'Intermediate_Files/Clustering'
# Writing fails if the target directory is missing, so create it first
os.makedirs(output_dir, exist_ok=True)

# NOTE: this is an AnnData written with AnnData.write and read back with read_h5ad below;
# the ".h5mu" suffix is kept only so existing file names stay valid.
adata_g_filtered.write(os.path.join(output_dir, "PBMC_gene_AutoZI_denoised.h5mu"))

In [None]:
# Save the isoform-level AnnData object (raw .X plus the 'denoised' and 'log_denoised' layers)
output_dir = 'Intermediate_Files/Clustering'
# Writing fails if the target directory is missing, so create it first
os.makedirs(output_dir, exist_ok=True)

# NOTE: this is an AnnData written with AnnData.write and read back with read_h5ad below;
# the ".h5mu" suffix is kept only so existing file names stay valid.
adata_i_filtered.write(os.path.join(output_dir, "PBMC_iso_AutoZI_denoised.h5mu"))

In [None]:
# Reload the gene-level AnnData object written above (an AnnData file despite the .h5mu suffix)
output_dir = 'Intermediate_Files/Clustering'

from scanpy import read_h5ad
adata_g_filtered = read_h5ad(os.path.join(output_dir,
                                          "PBMC_gene_AutoZI_denoised.h5mu"
                                         ))

In [None]:
# Reload the isoform-level AnnData object written above (an AnnData file despite the .h5mu suffix)
output_dir = 'Intermediate_Files/Clustering'

# Use the reader through the `sc` namespace so this cell does not depend on a
# name imported inside a different cell.
adata_i_filtered = sc.read_h5ad(os.path.join(output_dir, "PBMC_iso_AutoZI_denoised.h5mu"))

In [None]:
# Directory scanpy uses for figures saved via the `save=` argument of its plotting functions
sc.settings.figdir = "Intermediate_Files/Clustering/Figures"

In [None]:
def relabel_clusters_by_size(adata, cluster_key):
    """Renumber the clusters in `adata.obs[cluster_key]` by descending size.

    The largest cluster becomes 0, the next largest 1, and so on. The column
    is overwritten in place as a categorical and the same `adata` object is
    returned.
    """
    labels = adata.obs[cluster_key]

    # Cluster sizes, then the original labels ordered from largest to smallest
    counts = labels.value_counts()
    size_order = np.argsort(-counts.values)
    ranked_labels = counts.index[size_order]

    # original label -> rank by size
    rank_of = dict(zip(ranked_labels, range(len(ranked_labels))))

    adata.obs[cluster_key] = labels.map(rank_of).astype('category')
    return adata

In [None]:
# UMAP plotting helpers: one shared implementation plus gene- and isoform-level entry points
def _plot_umap_with_labels(adata, resolutions, use_rep_key, level):
    """Draw one UMAP per resolution, colored by the matching cluster column.

    For each `res` the obs column named f'{res}_{use_rep_key}' is used for
    coloring; `level` only appears in the plot title.
    """
    print(f"Using representation: {use_rep_key}")  # Print assigned representation

    vibrant_palette = plt.get_cmap('tab20').colors  # Set color palette

    for res in resolutions:
        sc.pl.umap(
            adata,
            color=f'{res}_{use_rep_key}',
            title=f'UMAP with Clusters ({level}, {use_rep_key}, Res={res})',
            frameon=True,
            palette=vibrant_palette,
            legend_loc='on data',  # labels plotted on the embedding
            legend_fontsize=10,
            legend_fontoutline=2,
        )


def plot_umap_with_labels_g(adata, resolutions, use_rep_key=None):
    """
    Plots gene-level UMAPs with cluster labels at multiple resolutions. Use this to determine
    an appropriate resolution via visual separation of clusters, stable cluster separation across
    resolutions, and reasonable spread of cluster labels (groups that are visually distinct are
    not determined as one cluster)

    Parameters:
    - adata: AnnData object
    - resolutions: List of resolution values to plot
    - use_rep_key: Suffix of the obs columns to color by; the column read for each resolution is
      f'{res}_{use_rep_key}'. No fallback is applied, so it should always be passed explicitly
      (with the default None the column looked up is '<res>_None').
    """
    _plot_umap_with_labels(adata, resolutions, use_rep_key, level='Gene-Level')


def plot_umap_with_labels_i(adata, resolutions, use_rep_key=None):
    """
    Same as plot_umap_with_labels_g, for isoform-level AnnData.

    Parameters:
    - adata: AnnData object
    - resolutions: List of resolution values to plot
    - use_rep_key: Suffix of the obs columns to color by; the column read for each resolution is
      f'{res}_{use_rep_key}'. No fallback is applied, so it should always be passed explicitly
      (with the default None the column looked up is '<res>_None').
    """
    _plot_umap_with_labels(adata, resolutions, use_rep_key, level='Isoform-Level')

In [None]:
# Use the log-transformed denoised values as .X for downstream scaling/UMAP.
# Assign a copy: sc.pp.scale below works on .X in place, and without the copy .X can share
# memory with the layer, which would silently rescale layers["log_denoised"] as well.
adata_g_filtered.X = adata_g_filtered.layers["log_denoised"].copy()

In [None]:
# Use the log-transformed denoised values as .X for downstream scaling/UMAP.
# Assign a copy: sc.pp.scale below works on .X in place, and without the copy .X can share
# memory with the layer, which would silently rescale layers["log_denoised"] as well.
adata_i_filtered.X = adata_i_filtered.layers["log_denoised"].copy()

In [None]:
# Scale each gene in .X, clipping values at 10 to limit the influence of outliers
sc.pp.scale(adata_g_filtered, max_value=10)

In [None]:
# Scale each isoform in .X, clipping values at 10 to limit the influence of outliers
sc.pp.scale(adata_i_filtered, max_value=10)

In [None]:
# If desired, PCA can be used on gene-level data to compare against AutoZI latent space
#sc.pp.pca(adata_g_filtered)

# Build the neighbor graph (20 neighbors) on the 'X_AutoZI' entry of .obsm, then compute the UMAP embedding
sc.pp.neighbors(adata_g_filtered, n_neighbors = 20, use_rep= 'X_AutoZI')
sc.tl.umap(adata_g_filtered, min_dist=0.3)

In [None]:
# If desired, PCA can be used on isoform-level data to compare against AutoZI latent space
#sc.pp.pca(adata_i_filtered)

# Build the neighbor graph (20 neighbors) on the 'X_AutoZI' entry of .obsm, then compute the UMAP embedding
sc.pp.neighbors(adata_i_filtered, n_neighbors = 20, use_rep= 'X_AutoZI')
sc.tl.umap(adata_i_filtered, min_dist=0.3)

In [None]:
# Leiden clustering at several resolutions; results are stored in obs as '<res>_log_AutoZI'
resolutions = [0.04, 0.06, 0.1, 0.14, 0.16, 0.2, 0.24, 0.26, 0.3]
for res in resolutions:
    leiden_key = f'{res}_log_AutoZI'
    # Same settings for the gene-level object first, then the isoform-level object
    for adata_level in (adata_g_filtered, adata_i_filtered):
        sc.tl.leiden(
            adata_level,
            resolution=res,
            key_added=leiden_key,
            flavor="igraph",
            n_iterations=2,
        )

In [None]:
# Renumber every clustering so that label 0 is the largest cluster, 1 the next largest, etc.
# From here on `resolutions` holds the obs column names, not the numeric resolutions.
resolutions = [
    f'{res_value}_log_AutoZI'
    for res_value in (0.04, 0.06, 0.1, 0.14, 0.16, 0.2, 0.24, 0.26, 0.3)
]
for cluster_key in resolutions:
    # relabel_clusters_by_size edits obs in place and returns the same object,
    # so the *_pbmc names are aliases of the *_filtered objects
    adata_g_filtered_pbmc = relabel_clusters_by_size(adata_g_filtered, cluster_key)
    adata_i_filtered_pbmc = relabel_clusters_by_size(adata_i_filtered, cluster_key)

In [None]:
# Visualize batch mixing on UMAP (gene-level data): color cells by the 'batch' column of obs
# and save the figure under sc.settings.figdir
sc.pl.umap(adata_g_filtered_pbmc, color=["batch"], title="UMAP Colored by Batch after AutoZI (gene-level)", 
           save = "_by_batch_gene.pdf"
          )

In [None]:
# Visualize batch mixing on UMAP (isoform-level data): color cells by the 'batch' column of obs
# and save the figure under sc.settings.figdir
sc.pl.umap(adata_i_filtered_pbmc, color=["batch"], title="UMAP Colored by Batch after AutoZI (isoform-level)",
           save = "_by_batch_isoform.pdf"
          )

In [None]:
# Plot the gene-level UMAP once per resolution; the values here must match the resolutions
# used for Leiden clustering, because the obs columns are looked up as '<res>_log_AutoZI'
plot_umap_with_labels_g(adata_g_filtered_pbmc, resolutions=[
    0.04, 0.06, 0.1, 0.14, 0.16, 0.2, 0.24, 0.26, 0.3,
], use_rep_key= 'log_AutoZI')

In [None]:
sc.settings.figdir = "Intermediate_Files/Clustering/Figures/UMAP"

# Only use the desired resolution
resolutions = ['0.06_log_AutoZI']

# Relabel for that resolution
cluster_key = resolutions[0]
adata_g_filtered_pbmc = relabel_clusters_by_size(adata_g_filtered, cluster_key)

# Plot UMAP and save to PDF
sc.pl.umap(
    adata_g_filtered_pbmc,
    color='0.06_log_AutoZI',
    title=f'UMAP with Clusters (Gene-Level, Res=0.06)',
    frameon=True,
    palette=plt.get_cmap('tab20').colors,
    legend_loc='on data',
    legend_fontsize=10,
    legend_fontoutline=2,
    save="_0.06_log_autoZI_gene.pdf"
)

In [None]:
# Call the functions
# Compare resolutions with cluster separation and marker expression to determine resolution
plot_umap_with_labels_i(adata_i_filtered, resolutions=[
    0.04, 0.06, 0.1, 0.14, 0.16, 0.2, 0.24, 0.26, 0.3, 
], use_rep_key= 'log_AutoZI')

In [None]:
sc.settings.figdir = "Intermediate_Files/Clustering/Figures/UMAP"

# Only use the desired resolution
resolutions = ['0.06_log_AutoZI']

# Relabel for that resolution
cluster_key = resolutions[0]
adata_i_filtered_pbmc = relabel_clusters_by_size(adata_i_filtered, cluster_key)

# Plot UMAP and save to PDF
sc.pl.umap(
    adata_i_filtered_pbmc,
    color='0.06_log_AutoZI',
    title=f'UMAP with Clusters (Isoform-Level, log_AutoZI, Res=0.06)',
    frameon=True,
    palette=plt.get_cmap('tab20').colors,
    legend_loc='on data',
    legend_fontsize=10,
    legend_fontoutline=2,
    save="_0.06_log_autoZI_isoform.pdf"
)

In [None]:
## Cell-type assignment

In [None]:
# Function to find matching genes in var_names (combined_ID format)
def find_matching_genes(prefixes, gene_list):
    """Return the entries of `gene_list` that start with any of `prefixes`.

    Parameters:
    - prefixes: iterable of string prefixes (e.g. "CD4:"); it is materialized once,
      so a one-shot iterable such as a generator works too
    - gene_list: iterable of feature-name strings

    Returns a list that preserves the order of `gene_list`; empty when nothing
    matches or when `prefixes` is empty.
    """
    # str.startswith accepts a tuple and checks every prefix in a single call
    prefix_tuple = tuple(prefixes)
    return [gene for gene in gene_list if gene.startswith(prefix_tuple)]

In [None]:
# T cell marker prefixes. Each prefix ends with ":" so that only the exact gene symbol matches
# (e.g. "CD4:" does not match "CD40:...").
# NOTE(review): "CD3Z" is presumably listed under the symbol CD247 in current annotations —
# confirm against var_names and add "CD247:" if the zeta chain should be plotted.
TCell_Markers = ["CD4:", "CD3D:", "CD3E:", "CD3G:", "CD3Z:", "CD8A:", "CD8B:", "PTPRCAP:"]

## Though not reflected in this code for sake of simplicity, we also assessed the presence of these markers:
#Naive_TCell = ["PTPRC:", "TCF7:", "FOXP1", "LEF1:", "PECAM1:"]
#Memory_TCell = ["CCR5:", "HLA-DRB1:", "HLA-DRA:", "ITGAE:", "SELL:", "TCF7:", "IL7R:", "CCR7:"]
#CD8_TCell = ["CD8A:", "CD8B:", "CXCR3:", "KLRB1:", "PTGDR2:", "GATA3:", "IRF4:", "RORC:", "CCL5:"]
#Reg_TCell = ["FOXP3:", "IL2RA:", "CTLA4:", "STAT5A:"]
#CD4_Effector = ["CXCR3:", "TNF:", "STAT4:", "IL17A:", "IL13:", "IL25:", "AHR:", "FOXO4:", "GATA3:", "IL2RA:"]
#Central_Memory_TCell = ["CCR5:", "IL7RA:", "EOMES:", "PRDM1:", "IL7R:", "SELL:", "CCR7:"]
#Effector_Memory_TCell = ["CCR5:", "HLA-DRB1:", "HLA-DRA:", "ITGAL:", "GZMA:", "PRDM1:", "SELL:"]
#Th1_TCell = ["CXCR3:", "IFNG:", "TNF:", "STAT4:"]

In [None]:
# Gene-level feature IDs whose name starts with one of the T cell marker prefixes
TCell_genes_1 = find_matching_genes(TCell_Markers, adata_g_filtered.var_names)
print(f"Matched {len(TCell_genes_1)} gene-level IDs for T-Cells: {TCell_genes_1}")

In [None]:
# Isoform-level feature IDs whose name starts with one of the T cell marker prefixes
TCell_iso_1 = find_matching_genes(TCell_Markers, adata_i_filtered.var_names)
print(f"Matched {len(TCell_iso_1)} isoform-level IDs for T-Cells: {TCell_iso_1}")

In [None]:
## Gene-level marker expression

# Define output directory and point scanpy's figure output at it
marker_root = "Intermediate_Files/Clustering/Figures/Markers/Cell_type_Gene"
group_name = "TCell"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

print("T cell Markers")

for gene in TCell_genes_1:
    safe_gene_name = gene.replace(":", "_")  # ":" is replaced for the output file name

    # Get expression vector for gene of interest
    expr = adata_g_filtered[:, gene].X

    # Convert sparse to dense if needed, and flatten to 1-D
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # vmax = 99th percentile of the positive values. When the cells are run in order, .X holds
    # the scaled log-denoised values, so "positive" means above the gene's mean.
    # Falls back to 1.0 when no value is positive.
    expr_nonzero = expr[expr > 0]
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed vmax (these saturate the color scale)
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric color range centered on zero. vmax_val is reused as computed above; recomputing
    # the percentile here without the guard would fail for a feature with no positive values.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_g_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centered by the norm
        save=f"{safe_gene_name}.pdf",
        show=True,
        norm=norm             # Handles vmin, vmax, and vcenter
    )

In [None]:
## Isoform-level marker expression

# --- Define output directory ---
marker_root = "Intermediate_Files/Clustering/Figures/Markers/Cell_type_Iso"
group_name = "TCell"

# Create output directory and point scanpy's figure output at it.
# figdir is the scanpy setting that controls where `save=` writes; the former
# `sc.settings.savefigs_path` assignment is not a scanpy setting and had no effect.
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

print("T cell Markers")

# Group isoforms by their gene symbol prefix
isoforms_by_gene = defaultdict(list)
for isoform in TCell_iso_1:
    gene = isoform.split(":")[0]  # Get gene name before first colon
    isoforms_by_gene[gene].append(isoform)

# --- Iterate by gene, but save in same directory ---
for gene, isoforms in isoforms_by_gene.items():
    print(f"\n--- {gene} ---")

    for isoform in isoforms:
        safe_iso_name = isoform.replace(":", "_")  # ":" is replaced for the output file name

        # Get expression vector
        expr = adata_i_filtered[:, isoform].X

        # Convert sparse to dense if needed, and flatten to 1-D
        if hasattr(expr, "toarray"):
            expr = expr.toarray().flatten()
        else:
            expr = np.ravel(expr)

        # vmax = 99th percentile of the positive values. When the cells are run in order, .X
        # holds the scaled log-denoised values, so "positive" means above the isoform's mean.
        # Falls back to 1.0 when no value is positive.
        expr_nonzero = expr[expr > 0]
        vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

        # Count how many cells exceed vmax (these saturate the color scale)
        num_above_vmax = np.sum(expr > vmax_val)
        print(f"{isoform}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

        # Symmetric color range centered on zero
        norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

        # Plot and save to main output_dir
        sc.pl.umap(
            adata_i_filtered,
            color=isoform,
            use_raw=False,
            title=isoform,
            cmap="coolwarm",
            save=f"{safe_iso_name}.pdf",
            show=True,
            norm=norm
        )

In [None]:
# B cell marker prefixes; the trailing ":" restricts each match to the exact gene symbol
BCell_Markers = ["MS4A1:","CD19:", "CD79A:", "CD22:", "CD1D:", "CD24:", "CD80:", "CD27:", "SPN:"]

# Gene-level feature IDs whose name starts with one of the B cell marker prefixes
BCell_genes = find_matching_genes(BCell_Markers, adata_g_filtered.var_names)
print(f"Matched {len(BCell_genes)} gene-level IDs: {BCell_genes}")

In [None]:
# Isoform-level feature IDs whose name starts with one of the B cell marker prefixes
BCell_iso = find_matching_genes(BCell_Markers, adata_i_filtered.var_names)
print(f"Matched {len(BCell_iso)} isoform-level IDs: {BCell_iso}")

In [None]:
## B cell Markers on the gene level

print("B cell Markers")

# Define output directory and point scanpy's figure output at it
marker_root = "Intermediate_Files/Clustering/Figures/Markers/Cell_type_Gene"
group_name = "BCell"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir


# Loop through matched genes and plot them
for gene in BCell_genes:
    safe_gene_name = gene.replace(":", "_")  # ":" is replaced for the output file name

    # Get expression vector
    expr = adata_g_filtered[:, gene].X

    # Convert sparse to dense if needed, and flatten to 1-D
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # vmax = 99th percentile of the positive values. When the cells are run in order, .X holds
    # the scaled log-denoised values, so "positive" means above the gene's mean.
    # Falls back to 1.0 when no value is positive.
    expr_nonzero = expr[expr > 0]
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed vmax (these saturate the color scale)
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric color range centered on zero. vmax_val is reused as computed above; recomputing
    # the percentile here without the guard would fail for a feature with no positive values.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_g_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centered by the norm
        save=f"{safe_gene_name}.pdf",
        show=True,
        norm=norm             # Handles vmin, vmax, and vcenter
    )

In [None]:
## Isoform-level B cell marker gene expression

marker_root = "Intermediate_Files/Clustering/Figures/Markers/Cell_type_Iso"
group_name = "BCell"

# Create output directory and point scanpy's figure output at it.
# figdir is the scanpy setting that controls where `save=` writes; the former
# `sc.settings.savefigs_path` assignment is not a scanpy setting and had no effect.
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

print("B cell Markers")

# --- Group isoforms by gene ---
isoforms_by_gene = defaultdict(list)
for isoform in BCell_iso:
    gene = isoform.split(":")[0]  # Get gene name before first colon
    isoforms_by_gene[gene].append(isoform)

# --- Iterate by gene, but save in same directory ---
for gene, isoforms in isoforms_by_gene.items():
    print(f"\n--- {gene} ---")

    for isoform in isoforms:
        safe_iso_name = isoform.replace(":", "_")  # ":" is replaced for the output file name

        # Get expression vector
        expr = adata_i_filtered[:, isoform].X

        # Convert sparse to dense if needed, and flatten to 1-D
        if hasattr(expr, "toarray"):
            expr = expr.toarray().flatten()
        else:
            expr = np.ravel(expr)

        # vmax = 99th percentile of the positive values. When the cells are run in order, .X
        # holds the scaled log-denoised values, so "positive" means above the isoform's mean.
        # Falls back to 1.0 when no value is positive.
        expr_nonzero = expr[expr > 0]
        vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

        # Count how many cells exceed vmax (these saturate the color scale)
        num_above_vmax = np.sum(expr > vmax_val)
        print(f"{isoform}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

        # Symmetric color range centered on zero
        norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

        # Plot and save to main output_dir
        sc.pl.umap(
            adata_i_filtered,
            color=isoform,
            use_raw=False,
            title=isoform,
            cmap="coolwarm",
            save=f"{safe_iso_name}.pdf",
            show=True,
            norm=norm
        )

In [None]:
## Natural Killer (NK) cell Markers
# Each prefix ends with ":" so that only the exact gene symbol matches
NKCells = ["NCAM1:", "FCGR3A:", "KLRD1:", "KLRF1:", "GZMB:", "CD226:", "IL2RB:"]
NKActiv = ["IFNG:", "CCL5:", "IL2RB:", "ITGAM:"]  # only referenced by the commented-out lines below

# Get gene matches for gene-level data
NK_genes_1 = find_matching_genes(NKCells, adata_g_filtered.var_names)
print(f"Matched {len(NK_genes_1)} gene-level IDs for NK Cells: {NK_genes_1}")
#NK_genes_2 = find_matching_genes(NKActiv, adata_g_filtered.var_names)
#print(f"Matched {len(NK_genes_2)} gene-level IDs for Activated NK Cells: {NK_genes_2}")

In [None]:
# Get marker matches for isoform-level data
NK_iso_1 = find_matching_genes(NKCells, adata_i_filtered.var_names)
# These matches come from the isoform-level object, so the message says so
print(f"Matched {len(NK_iso_1)} isoform-level IDs for NK Cells: {NK_iso_1}")
#NK_iso_2 = find_matching_genes(NKActiv, adata_i_filtered.var_names)
#print(f"Matched {len(NK_iso_2)} isoform-level IDs for Activated NK Cells: {NK_iso_2}")

In [None]:
# Gene-level UMAPs for the NK-cell marker panel, one PDF per matched gene.
marker_root = "Intermediate_Files/Clustering/Figures/Markers/Cell_type_Gene"
group_name = "NKCell"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # directory scanpy uses for `save=` figures

print("NK cell Markers")

# Loop through matched genes and plot them
for gene in NK_genes_1:
    safe_gene_name = gene.replace(":", "_")

    # Get expression vector (one value per cell)
    expr = adata_g_filtered[:, gene].X

    # Convert sparse to dense if needed
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only: vmax is the 99th percentile of the
    # non-zero values, with a 1.0 fallback when no cell expresses the gene.
    # This guarded value is reused for the norm below; recomputing the
    # percentile on an empty array would bypass the fallback.
    expr_nonzero = expr[expr > 0]
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed vmax (they saturate at the top colour)
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero so that 0 sits at the colormap midpoint
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_g_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # carries vmin, vcenter and vmax
    )

In [None]:
# Isoform-level UMAPs for the NK-cell marker panel, one PDF per isoform.
group_name = "NKCell"
marker_root = "Intermediate_Files/Clustering/Figures/Markers/Cell_type_Iso"

# Every figure of this panel goes into one folder that scanpy is pointed at.
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir
# NOTE(review): `savefigs_path` does not appear among scanpy's documented
# settings — confirm this assignment has any effect.
sc.settings.savefigs_path = output_dir

print("NK cell Markers")

# Bucket isoform IDs under their parent gene (text before the first ':').
isoforms_by_gene = defaultdict(list)
for iso_id in NK_iso_1:
    isoforms_by_gene[iso_id.split(":")[0]].append(iso_id)

# Walk the panel gene by gene; all PDFs still share `output_dir`.
for parent_gene, iso_ids in isoforms_by_gene.items():
    print(f"\n--- {parent_gene} ---")

    for iso_id in iso_ids:
        # Passed as `save=`; scanpy appends it to its default file name.
        pdf_suffix = iso_id.replace(":", "_") + ".pdf"

        # One expression value per cell, densified when stored sparse.
        values = adata_i_filtered[:, iso_id].X
        values = values.toarray().ravel() if hasattr(values, "toarray") else np.ravel(values)

        # Colour ceiling: 99th percentile among expressing cells,
        # or 1.0 when no cell expresses this isoform.
        positive = values[values > 0]
        if len(positive) > 0:
            vmax_val = np.percentile(positive, 99)
        else:
            vmax_val = 1.0

        # Cells above the ceiling saturate at the top colour.
        n_clipped = np.count_nonzero(values > vmax_val)
        print(f"{iso_id}: {n_clipped} cells above 99th percentile (vmax = {vmax_val:.2f})")

        # Symmetric limits keep zero at the midpoint of the colormap.
        norm = TwoSlopeNorm(vcenter=0, vmin=-vmax_val, vmax=vmax_val)

        sc.pl.umap(
            adata_i_filtered,
            color=iso_id,
            title=iso_id,
            use_raw=False,
            cmap="coolwarm",
            norm=norm,
            save=pdf_suffix,
            show=True,
        )

In [None]:
# Monocyte-derived Markers
# Three "SYMBOL:" pattern panels, then their matches in the gene-level data.

Classical_Monocytes = [
    "CD14:", "FCGR2A:", "IL1B:", "SELL:", "CLEC7A:", "TNF:",
]
NonClassical_Monocytes = [
    "FCGR3A:", "CX3CR1:",
]
Myeloid = [
    "ITGAM:", "ITGAX:", "CD33:", "CD14:",
    "CD1C:", "HLA-DRA:", "HLA-DRB1:", "LILRB4:",
]

# Feature names of the gene-level AnnData, looked up once for all panels.
gene_var_names = adata_g_filtered.var_names

Mono_genes_1 = find_matching_genes(Classical_Monocytes, gene_var_names)
n_classical = len(Mono_genes_1)
print(f"Matched {n_classical} gene-level IDs for Classical Monocytes: {Mono_genes_1}")

Mono_genes_2 = find_matching_genes(NonClassical_Monocytes, gene_var_names)
n_nonclassical = len(Mono_genes_2)
print(f"Matched {n_nonclassical} gene-level IDs for Non-Classical Monocytes: {Mono_genes_2}")

Myeloid_genes = find_matching_genes(Myeloid, gene_var_names)
n_myeloid = len(Myeloid_genes)
print(f"Matched {n_myeloid} gene-level IDs for Myeloid Cells: {Myeloid_genes}")

In [None]:
# Same three monocyte/myeloid panels, matched against the isoform-level names.
iso_var_names = adata_i_filtered.var_names

Mono_iso_1 = find_matching_genes(Classical_Monocytes, iso_var_names)
n_classical_iso = len(Mono_iso_1)
print(f"Matched {n_classical_iso} iso-level IDs for Classical Monocytes: {Mono_iso_1}")

Mono_iso_2 = find_matching_genes(NonClassical_Monocytes, iso_var_names)
n_nonclassical_iso = len(Mono_iso_2)
print(f"Matched {n_nonclassical_iso} iso-level IDs for Non-classical Monocytes: {Mono_iso_2}")

Myeloid_iso = find_matching_genes(Myeloid, iso_var_names)
n_myeloid_iso = len(Myeloid_iso)
print(f"Matched {n_myeloid_iso} iso-level IDs for Myeloid Cells: {Myeloid_iso}")

In [None]:
# Gene-level UMAPs for the myeloid marker panel, one PDF per matched gene.
marker_root = "Intermediate_Files/Clustering/Figures/Markers/Cell_type_Gene"
group_name = "Myeloid Cell"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # directory scanpy uses for `save=` figures

print("Myeloid Markers")

# Loop through matched genes and plot them
for gene in Myeloid_genes:
    safe_gene_name = gene.replace(":", "_")

    # Get expression vector (one value per cell)
    expr = adata_g_filtered[:, gene].X

    # Convert sparse to dense if needed
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only: vmax is the 99th percentile of the
    # non-zero values, with a 1.0 fallback when no cell expresses the gene.
    # This guarded value is reused for the norm below; recomputing the
    # percentile on an empty array would bypass the fallback.
    expr_nonzero = expr[expr > 0]
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed vmax (they saturate at the top colour)
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero so that 0 sits at the colormap midpoint
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_g_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # carries vmin, vcenter and vmax
    )

In [None]:
# Gene-level UMAPs for the classical-monocyte marker panel, one PDF per gene.
marker_root = "Intermediate_Files/Clustering/Figures/Markers/Cell_type_Gene"
group_name = "Classical Monocytes"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # directory scanpy uses for `save=` figures

print("Classical Monocyte Markers")

# Loop through matched genes and plot them
for gene in Mono_genes_1:
    safe_gene_name = gene.replace(":", "_")

    # Get expression vector (one value per cell)
    expr = adata_g_filtered[:, gene].X

    # Convert sparse to dense if needed
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only: vmax is the 99th percentile of the
    # non-zero values, with a 1.0 fallback when no cell expresses the gene.
    # This guarded value is reused for the norm below; recomputing the
    # percentile on an empty array would bypass the fallback.
    expr_nonzero = expr[expr > 0]
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed vmax (they saturate at the top colour)
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero so that 0 sits at the colormap midpoint
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_g_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # carries vmin, vcenter and vmax
    )

In [None]:
# Gene-level UMAPs for the non-classical-monocyte marker panel, one PDF per gene.
marker_root = "Intermediate_Files/Clustering/Figures/Markers/Cell_type_Gene"
group_name = "NonClassical Monocytes"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # directory scanpy uses for `save=` figures

print("NonClassical Monocyte Markers")

# Loop through matched genes and plot them
for gene in Mono_genes_2:
    safe_gene_name = gene.replace(":", "_")

    # Get expression vector (one value per cell)
    expr = adata_g_filtered[:, gene].X

    # Convert sparse to dense if needed
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only: vmax is the 99th percentile of the
    # non-zero values, with a 1.0 fallback when no cell expresses the gene.
    # This guarded value is reused for the norm below; recomputing the
    # percentile on an empty array would bypass the fallback.
    expr_nonzero = expr[expr > 0]
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed vmax (they saturate at the top colour)
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero so that 0 sits at the colormap midpoint
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_g_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # carries vmin, vcenter and vmax
    )

In [None]:
# Isoform-level UMAPs for the myeloid marker panel, one PDF per isoform.
group_name = "Myeloid Cell"
marker_root = "Intermediate_Files/Clustering/Figures/Markers/Cell_type_Iso"

# Every figure of this panel goes into one folder that scanpy is pointed at.
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir
# NOTE(review): `savefigs_path` does not appear among scanpy's documented
# settings — confirm this assignment has any effect.
sc.settings.savefigs_path = output_dir

print("Myeloid Markers")

# Bucket isoform IDs under their parent gene (text before the first ':').
isoforms_by_gene = defaultdict(list)
for iso_id in Myeloid_iso:
    isoforms_by_gene[iso_id.split(":")[0]].append(iso_id)

# Walk the panel gene by gene; all PDFs still share `output_dir`.
for parent_gene, iso_ids in isoforms_by_gene.items():
    print(f"\n--- {parent_gene} ---")

    for iso_id in iso_ids:
        # Passed as `save=`; scanpy appends it to its default file name.
        pdf_suffix = iso_id.replace(":", "_") + ".pdf"

        # One expression value per cell, densified when stored sparse.
        values = adata_i_filtered[:, iso_id].X
        values = values.toarray().ravel() if hasattr(values, "toarray") else np.ravel(values)

        # Colour ceiling: 99th percentile among expressing cells,
        # or 1.0 when no cell expresses this isoform.
        positive = values[values > 0]
        if len(positive) > 0:
            vmax_val = np.percentile(positive, 99)
        else:
            vmax_val = 1.0

        # Cells above the ceiling saturate at the top colour.
        n_clipped = np.count_nonzero(values > vmax_val)
        print(f"{iso_id}: {n_clipped} cells above 99th percentile (vmax = {vmax_val:.2f})")

        # Symmetric limits keep zero at the midpoint of the colormap.
        norm = TwoSlopeNorm(vcenter=0, vmin=-vmax_val, vmax=vmax_val)

        sc.pl.umap(
            adata_i_filtered,
            color=iso_id,
            title=iso_id,
            use_raw=False,
            cmap="coolwarm",
            norm=norm,
            save=pdf_suffix,
            show=True,
        )

In [None]:
# Isoform-level UMAPs for the classical-monocyte marker panel, one PDF per isoform.
group_name = "Classical Monocytes"
marker_root = "Intermediate_Files/Clustering/Figures/Markers/Cell_type_Iso"

# Every figure of this panel goes into one folder that scanpy is pointed at.
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir
# NOTE(review): `savefigs_path` does not appear among scanpy's documented
# settings — confirm this assignment has any effect.
sc.settings.savefigs_path = output_dir

print("Classical Monocyte Markers")

# Bucket isoform IDs under their parent gene (text before the first ':').
isoforms_by_gene = defaultdict(list)
for iso_id in Mono_iso_1:
    isoforms_by_gene[iso_id.split(":")[0]].append(iso_id)

# Walk the panel gene by gene; all PDFs still share `output_dir`.
for parent_gene, iso_ids in isoforms_by_gene.items():
    print(f"\n--- {parent_gene} ---")

    for iso_id in iso_ids:
        # Passed as `save=`; scanpy appends it to its default file name.
        pdf_suffix = iso_id.replace(":", "_") + ".pdf"

        # One expression value per cell, densified when stored sparse.
        values = adata_i_filtered[:, iso_id].X
        values = values.toarray().ravel() if hasattr(values, "toarray") else np.ravel(values)

        # Colour ceiling: 99th percentile among expressing cells,
        # or 1.0 when no cell expresses this isoform.
        positive = values[values > 0]
        if len(positive) > 0:
            vmax_val = np.percentile(positive, 99)
        else:
            vmax_val = 1.0

        # Cells above the ceiling saturate at the top colour.
        n_clipped = np.count_nonzero(values > vmax_val)
        print(f"{iso_id}: {n_clipped} cells above 99th percentile (vmax = {vmax_val:.2f})")

        # Symmetric limits keep zero at the midpoint of the colormap.
        norm = TwoSlopeNorm(vcenter=0, vmin=-vmax_val, vmax=vmax_val)

        sc.pl.umap(
            adata_i_filtered,
            color=iso_id,
            title=iso_id,
            use_raw=False,
            cmap="coolwarm",
            norm=norm,
            save=pdf_suffix,
            show=True,
        )

In [None]:
# Isoform-level UMAPs for the non-classical-monocyte marker panel, one PDF per isoform.
group_name = "NonClassical Monocytes"
marker_root = "Intermediate_Files/Clustering/Figures/Markers/Cell_type_Iso"

# Every figure of this panel goes into one folder that scanpy is pointed at.
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir
# NOTE(review): `savefigs_path` does not appear among scanpy's documented
# settings — confirm this assignment has any effect.
sc.settings.savefigs_path = output_dir

print("NonClassical Monocyte Markers")

# Bucket isoform IDs under their parent gene (text before the first ':').
isoforms_by_gene = defaultdict(list)
for iso_id in Mono_iso_2:
    isoforms_by_gene[iso_id.split(":")[0]].append(iso_id)

# Walk the panel gene by gene; all PDFs still share `output_dir`.
for parent_gene, iso_ids in isoforms_by_gene.items():
    print(f"\n--- {parent_gene} ---")

    for iso_id in iso_ids:
        # Passed as `save=`; scanpy appends it to its default file name.
        pdf_suffix = iso_id.replace(":", "_") + ".pdf"

        # One expression value per cell, densified when stored sparse.
        values = adata_i_filtered[:, iso_id].X
        values = values.toarray().ravel() if hasattr(values, "toarray") else np.ravel(values)

        # Colour ceiling: 99th percentile among expressing cells,
        # or 1.0 when no cell expresses this isoform.
        positive = values[values > 0]
        if len(positive) > 0:
            vmax_val = np.percentile(positive, 99)
        else:
            vmax_val = 1.0

        # Cells above the ceiling saturate at the top colour.
        n_clipped = np.count_nonzero(values > vmax_val)
        print(f"{iso_id}: {n_clipped} cells above 99th percentile (vmax = {vmax_val:.2f})")

        # Symmetric limits keep zero at the midpoint of the colormap.
        norm = TwoSlopeNorm(vcenter=0, vmin=-vmax_val, vmax=vmax_val)

        sc.pl.umap(
            adata_i_filtered,
            color=iso_id,
            title=iso_id,
            use_raw=False,
            cmap="coolwarm",
            norm=norm,
            save=pdf_suffix,
            show=True,
        )

In [None]:
## Dendritic cell Markers
# Patterns are written as "SYMBOL:" so they line up with the "SYMBOL:ID"
# style of var_names. "LILRB4" was the only entry without the trailing ':'.
# NOTE(review): assumes find_matching_genes matches on this prefix — confirm.

DC = ["CD1C:", "CD1A:", "ZBTB46:", "ITGAE:", "HLA-DRA:", "HLA-DRB1:", "THBD:", "SIRPA:", "LILRB4:", "IRF8:", "IRF4:", "CD14:"]
pDC = ["CLEC4C:", "TNF:", "TCF4:", "TLR7:", "TLR9:"]

In [None]:
# Dendritic-cell and plasmacytoid-DC panels matched against gene-level names.
gene_var_names = adata_g_filtered.var_names

DC_genes = find_matching_genes(DC, gene_var_names)
n_dc_genes = len(DC_genes)
print(f"Matched {n_dc_genes} gene-level IDs for Dendritic Cells: {DC_genes}")

pDC_genes = find_matching_genes(pDC, gene_var_names)
n_pdc_genes = len(pDC_genes)
print(f"Matched {n_pdc_genes} gene-level IDs for Plasmacytoid Dendritic Cells: {pDC_genes}")

In [None]:
# Dendritic-cell and plasmacytoid-DC panels matched against isoform-level names.
iso_var_names = adata_i_filtered.var_names

DC_iso = find_matching_genes(DC, iso_var_names)
n_dc_iso = len(DC_iso)
print(f"Matched {n_dc_iso} isoform-level IDs for Dendritic Cells: {DC_iso}")

pDC_iso = find_matching_genes(pDC, iso_var_names)
n_pdc_iso = len(pDC_iso)
print(f"Matched {n_pdc_iso} isoform-level IDs for Plasmacytoid Dendritic Cells: {pDC_iso}")

In [None]:
# Gene-level UMAPs for the dendritic-cell marker panel, one PDF per matched gene.
marker_root = "Intermediate_Files/Clustering/Figures/Markers/Cell_type_Gene"
group_name = "Dendritic Cells"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # directory scanpy uses for `save=` figures

print("Dendritic Cell Markers")

# Loop through matched genes and plot them
for gene in DC_genes:
    safe_gene_name = gene.replace(":", "_")

    # Get expression vector (one value per cell)
    expr = adata_g_filtered[:, gene].X

    # Convert sparse to dense if needed
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only: vmax is the 99th percentile of the
    # non-zero values, with a 1.0 fallback when no cell expresses the gene.
    # This guarded value is reused for the norm below; recomputing the
    # percentile on an empty array would bypass the fallback.
    expr_nonzero = expr[expr > 0]
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed vmax (they saturate at the top colour)
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero so that 0 sits at the colormap midpoint
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_g_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # carries vmin, vcenter and vmax
    )

In [None]:
# Gene-level UMAPs for the plasmacytoid-DC marker panel, one PDF per matched gene.
marker_root = "Intermediate_Files/Clustering/Figures/Markers/Cell_type_Gene"
group_name = "Plasmacytoid Dendritic Cells"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # directory scanpy uses for `save=` figures

print("Plasmacytoid Dendritic Cell Markers")

# Loop through matched genes and plot them
for gene in pDC_genes:
    safe_gene_name = gene.replace(":", "_")

    # Get expression vector (one value per cell)
    expr = adata_g_filtered[:, gene].X

    # Convert sparse to dense if needed
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only: vmax is the 99th percentile of the
    # non-zero values, with a 1.0 fallback when no cell expresses the gene.
    # This guarded value is reused for the norm below; recomputing the
    # percentile on an empty array would bypass the fallback.
    expr_nonzero = expr[expr > 0]
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed vmax (they saturate at the top colour)
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero so that 0 sits at the colormap midpoint
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_g_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # carries vmin, vcenter and vmax
    )

In [None]:
# Isoform-level UMAPs for the dendritic-cell marker panel, one PDF per isoform.
group_name = "Dendritic Cells"
marker_root = "Intermediate_Files/Clustering/Figures/Markers/Cell_type_Iso"

# Every figure of this panel goes into one folder that scanpy is pointed at.
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir
# NOTE(review): `savefigs_path` does not appear among scanpy's documented
# settings — confirm this assignment has any effect.
sc.settings.savefigs_path = output_dir

print("Dendritic Cell Markers")

# Bucket isoform IDs under their parent gene (text before the first ':').
isoforms_by_gene = defaultdict(list)
for iso_id in DC_iso:
    isoforms_by_gene[iso_id.split(":")[0]].append(iso_id)

# Walk the panel gene by gene; all PDFs still share `output_dir`.
for parent_gene, iso_ids in isoforms_by_gene.items():
    print(f"\n--- {parent_gene} ---")

    for iso_id in iso_ids:
        # Passed as `save=`; scanpy appends it to its default file name.
        pdf_suffix = iso_id.replace(":", "_") + ".pdf"

        # One expression value per cell, densified when stored sparse.
        values = adata_i_filtered[:, iso_id].X
        values = values.toarray().ravel() if hasattr(values, "toarray") else np.ravel(values)

        # Colour ceiling: 99th percentile among expressing cells,
        # or 1.0 when no cell expresses this isoform.
        positive = values[values > 0]
        if len(positive) > 0:
            vmax_val = np.percentile(positive, 99)
        else:
            vmax_val = 1.0

        # Cells above the ceiling saturate at the top colour.
        n_clipped = np.count_nonzero(values > vmax_val)
        print(f"{iso_id}: {n_clipped} cells above 99th percentile (vmax = {vmax_val:.2f})")

        # Symmetric limits keep zero at the midpoint of the colormap.
        norm = TwoSlopeNorm(vcenter=0, vmin=-vmax_val, vmax=vmax_val)

        sc.pl.umap(
            adata_i_filtered,
            color=iso_id,
            title=iso_id,
            use_raw=False,
            cmap="coolwarm",
            norm=norm,
            save=pdf_suffix,
            show=True,
        )

In [None]:
# Isoform-level UMAPs for the plasmacytoid-DC marker panel, one PDF per isoform.
group_name = "Plasmacytoid Dendritic Cells"
marker_root = "Intermediate_Files/Clustering/Figures/Markers/Cell_type_Iso"

# Every figure of this panel goes into one folder that scanpy is pointed at.
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir
# NOTE(review): `savefigs_path` does not appear among scanpy's documented
# settings — confirm this assignment has any effect.
sc.settings.savefigs_path = output_dir

print("Plasmacytoid Dendritic Cell Markers")

# Bucket isoform IDs under their parent gene (text before the first ':').
isoforms_by_gene = defaultdict(list)
for iso_id in pDC_iso:
    isoforms_by_gene[iso_id.split(":")[0]].append(iso_id)

# Walk the panel gene by gene; all PDFs still share `output_dir`.
for parent_gene, iso_ids in isoforms_by_gene.items():
    print(f"\n--- {parent_gene} ---")

    for iso_id in iso_ids:
        # Passed as `save=`; scanpy appends it to its default file name.
        pdf_suffix = iso_id.replace(":", "_") + ".pdf"

        # One expression value per cell, densified when stored sparse.
        values = adata_i_filtered[:, iso_id].X
        values = values.toarray().ravel() if hasattr(values, "toarray") else np.ravel(values)

        # Colour ceiling: 99th percentile among expressing cells,
        # or 1.0 when no cell expresses this isoform.
        positive = values[values > 0]
        if len(positive) > 0:
            vmax_val = np.percentile(positive, 99)
        else:
            vmax_val = 1.0

        # Cells above the ceiling saturate at the top colour.
        n_clipped = np.count_nonzero(values > vmax_val)
        print(f"{iso_id}: {n_clipped} cells above 99th percentile (vmax = {vmax_val:.2f})")

        # Symmetric limits keep zero at the midpoint of the colormap.
        norm = TwoSlopeNorm(vcenter=0, vmin=-vmax_val, vmax=vmax_val)

        sc.pl.umap(
            adata_i_filtered,
            color=iso_id,
            title=iso_id,
            use_raw=False,
            cmap="coolwarm",
            norm=norm,
            save=pdf_suffix,
            show=True,
        )

In [None]:
## Megakaryocyte Markers
# Patterns are written as "SYMBOL:" so they line up with the "SYMBOL:ID"
# style of var_names. "CXCR2" was the only entry without the trailing ':'.
# NOTE(review): assumes find_matching_genes matches on this prefix — confirm.
# NOTE(review): "CD42D" looks like a CD alias rather than a gene symbol
# (GP5 is listed separately) — confirm it matches anything in var_names.

MK = ["CD42D:", "CXCR1:", "CXCR2:", "ITGA2B:", "GP1BA:", "CXCR4:", "SLAMF1:", "MPL:",  "GP5:"]

# Get gene matches for gene-level data
MK_genes = find_matching_genes(MK, adata_g_filtered.var_names)
# Message previously read "Megakaryotes" (typo).
print(f"Matched {len(MK_genes)} gene-level IDs for Megakaryocytes: {MK_genes}")

In [None]:
# Match the megakaryocyte marker patterns against the isoform-level names.
MK_iso = find_matching_genes(MK, adata_i_filtered.var_names)
# Message previously read "Megakaryotes" (typo).
print(f"Matched {len(MK_iso)} isoform-level IDs for Megakaryocytes: {MK_iso}")

In [None]:
# Gene-level UMAPs for the megakaryocyte marker panel, one PDF per matched gene.
marker_root = "Intermediate_Files/Clustering/Figures/Markers/Cell_type_Gene"
group_name = "Megakaryocyte"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # directory scanpy uses for `save=` figures

print("Megakaryocyte Markers")

# Loop through matched genes and plot them
for gene in MK_genes:
    safe_gene_name = gene.replace(":", "_")

    # Get expression vector (one value per cell)
    expr = adata_g_filtered[:, gene].X

    # Convert sparse to dense if needed
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only: vmax is the 99th percentile of the
    # non-zero values, with a 1.0 fallback when no cell expresses the gene.
    # This guarded value is reused for the norm below; recomputing the
    # percentile on an empty array would bypass the fallback.
    expr_nonzero = expr[expr > 0]
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed vmax (they saturate at the top colour)
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero so that 0 sits at the colormap midpoint
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_g_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # carries vmin, vcenter and vmax
    )

In [None]:
# Isoform-level UMAPs for the megakaryocyte marker panel, one PDF per isoform.
group_name = "Megakaryocyte"
marker_root = "Intermediate_Files/Clustering/Figures/Markers/Cell_type_Iso"

# Every figure of this panel goes into one folder that scanpy is pointed at.
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir
# NOTE(review): `savefigs_path` does not appear among scanpy's documented
# settings — confirm this assignment has any effect.
sc.settings.savefigs_path = output_dir

print("Megakaryocyte Markers")

# Bucket isoform IDs under their parent gene (text before the first ':').
isoforms_by_gene = defaultdict(list)
for iso_id in MK_iso:
    isoforms_by_gene[iso_id.split(":")[0]].append(iso_id)

# Walk the panel gene by gene; all PDFs still share `output_dir`.
for parent_gene, iso_ids in isoforms_by_gene.items():
    print(f"\n--- {parent_gene} ---")

    for iso_id in iso_ids:
        # Passed as `save=`; scanpy appends it to its default file name.
        pdf_suffix = iso_id.replace(":", "_") + ".pdf"

        # One expression value per cell, densified when stored sparse.
        values = adata_i_filtered[:, iso_id].X
        values = values.toarray().ravel() if hasattr(values, "toarray") else np.ravel(values)

        # Colour ceiling: 99th percentile among expressing cells,
        # or 1.0 when no cell expresses this isoform.
        positive = values[values > 0]
        if len(positive) > 0:
            vmax_val = np.percentile(positive, 99)
        else:
            vmax_val = 1.0

        # Cells above the ceiling saturate at the top colour.
        n_clipped = np.count_nonzero(values > vmax_val)
        print(f"{iso_id}: {n_clipped} cells above 99th percentile (vmax = {vmax_val:.2f})")

        # Symmetric limits keep zero at the midpoint of the colormap.
        norm = TwoSlopeNorm(vcenter=0, vmin=-vmax_val, vmax=vmax_val)

        sc.pl.umap(
            adata_i_filtered,
            color=iso_id,
            title=iso_id,
            use_raw=False,
            cmap="coolwarm",
            norm=norm,
            save=pdf_suffix,
            show=True,
        )

In [None]:
## Transition cell markers
# "SYMBOL:" patterns for the transition-cell panel and their gene-level matches.

Transition = [
    "STAT4:", "IFNG:", "FOXO4:",
    "CD4:", "TCF7:", "IL7R:",
]

# Look the patterns up among the gene-level feature names.
gene_var_names = adata_g_filtered.var_names
Transition_genes = find_matching_genes(Transition, gene_var_names)
n_transition_genes = len(Transition_genes)
print(f"Matched {n_transition_genes} gene-level IDs for Transition Cells: {Transition_genes}")

In [None]:
# Isoform-level IDs that match the transition-cell marker panel.
iso_var_names = adata_i_filtered.var_names
Transition_iso = find_matching_genes(Transition, iso_var_names)
n_transition_iso = len(Transition_iso)
print(f"Matched {n_transition_iso} isoform-level IDs for Transition Cells: {Transition_iso}")

In [None]:
# Gene-level UMAPs for the transition-cell marker panel, one PDF per matched gene.
marker_root = "Intermediate_Files/Clustering/Figures/Markers/Cell_type_Gene"
group_name = "TransitionCells"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # directory scanpy uses for `save=` figures

print("Transition Cell Markers")

# Loop through matched genes and plot them
for gene in Transition_genes:
    safe_gene_name = gene.replace(":", "_")

    # Get expression vector (one value per cell)
    expr = adata_g_filtered[:, gene].X

    # Convert sparse to dense if needed
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only: vmax is the 99th percentile of the
    # non-zero values, with a 1.0 fallback when no cell expresses the gene.
    # This guarded value is reused for the norm below; recomputing the
    # percentile on an empty array would bypass the fallback.
    expr_nonzero = expr[expr > 0]
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed vmax (they saturate at the top colour)
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero so that 0 sits at the colormap midpoint
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_g_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # carries vmin, vcenter and vmax
    )

In [None]:
# Isoform-level UMAPs for the transition-cell marker panel, one PDF per isoform.
group_name = "TransitionCells"
marker_root = "Intermediate_Files/Clustering/Figures/Markers/Cell_type_Iso"

# Every figure of this panel goes into one folder that scanpy is pointed at.
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir
# NOTE(review): `savefigs_path` does not appear among scanpy's documented
# settings — confirm this assignment has any effect.
sc.settings.savefigs_path = output_dir

print("Transition Markers")

# Bucket isoform IDs under their parent gene (text before the first ':').
isoforms_by_gene = defaultdict(list)
for iso_id in Transition_iso:
    isoforms_by_gene[iso_id.split(":")[0]].append(iso_id)

# Walk the panel gene by gene; all PDFs still share `output_dir`.
for parent_gene, iso_ids in isoforms_by_gene.items():
    print(f"\n--- {parent_gene} ---")

    for iso_id in iso_ids:
        # Passed as `save=`; scanpy appends it to its default file name.
        pdf_suffix = iso_id.replace(":", "_") + ".pdf"

        # One expression value per cell, densified when stored sparse.
        values = adata_i_filtered[:, iso_id].X
        values = values.toarray().ravel() if hasattr(values, "toarray") else np.ravel(values)

        # Colour ceiling: 99th percentile among expressing cells,
        # or 1.0 when no cell expresses this isoform.
        positive = values[values > 0]
        if len(positive) > 0:
            vmax_val = np.percentile(positive, 99)
        else:
            vmax_val = 1.0

        # Cells above the ceiling saturate at the top colour.
        n_clipped = np.count_nonzero(values > vmax_val)
        print(f"{iso_id}: {n_clipped} cells above 99th percentile (vmax = {vmax_val:.2f})")

        # Symmetric limits keep zero at the midpoint of the colormap.
        norm = TwoSlopeNorm(vcenter=0, vmin=-vmax_val, vmax=vmax_val)

        sc.pl.umap(
            adata_i_filtered,
            color=iso_id,
            title=iso_id,
            use_raw=False,
            cmap="coolwarm",
            norm=norm,
            save=pdf_suffix,
            show=True,
        )

In [None]:
### Create Aggregate Plots from Markers

In [None]:
## T cell aggregate marker plot (gene-level)
# Sums the three CD3 genes per cell, colours the UMAP by that score and
# writes the figure to a fixed PDF path.

# Kept in this cell as in the original; np/sc/plt are already imported at the
# top of the notebook. NOTE(review): `cm` and `ListedColormap` are not used in
# this cell — confirm whether a later cell relies on them.
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import ListedColormap

# NOTE(review): this figdir differs from the "Intermediate_Files/Clustering/..."
# paths used elsewhere and is not used by the explicit savefig below — confirm.
sc.settings.figdir = "Intermediate_Files/Clustering_07232025/Figures/UMAP"

# Genes of interest
genes = ['CD3D:ENSG00000167286', 'CD3E:ENSG00000198851', 'CD3G:ENSG00000160654']

# Calculate summed expression per cell to create combined cell-type marker score.
# Summing on the matrix itself works for sparse and dense .X alike; calling
# .toarray() unconditionally fails when .X is a plain ndarray.
cd3_sum = adata_g_filtered[:, genes].X.sum(axis=1)
adata_g_filtered.obs['CD3_Combined'] = np.asarray(cd3_sum).ravel()

# Symmetric colour limits at +/- the maximum score (the 100th percentile)
score_max = float(adata_g_filtered.obs['CD3_Combined'].max())

# Plot UMAP; with show=False scanpy returns the Axes it drew on
ax = sc.pl.umap(
    adata_g_filtered,
    color='CD3_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Total TCell Marker Expression",
    vmin=-score_max,
    vmax=score_max,
    show=False
)

# Adjust figure aesthetics on the returned Axes rather than on whatever
# pyplot currently considers the active figure/axes
fig = ax.figure
fig.set_size_inches(5, 3)
ax.set_title("TCell Aggregate Gene Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Save high-resolution PDF version (create the target folder if it is missing)
out_pdf = "Intermediate_Files/Clustering/Figures/Markers/Cell_type_Gene/TCell_combined_expression_gene_aggregate.pdf"
os.makedirs(os.path.dirname(out_pdf), exist_ok=True)
fig.tight_layout()
fig.savefig(out_pdf, dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
## B cell aggregate marker plot (gene-level)
# Sums four B cell marker genes per cell into obs['BCell_Markers_Combined'], colours the
# gene-level UMAP by that score and saves the figure as a PDF.

# Genes of interest (entries of adata_g_filtered.var_names)
genes = ['CD22:ENSG00000012124', 'CD79A:ENSG00000105369', 'MS4A1:ENSG00000156738', 'CD19:ENSG00000177455']

# Calculate summed expression per cell. Summing on the matrix itself works for sparse
# and dense .X alike and avoids densifying the slice first.
adata_g_filtered.obs['BCell_Markers_Combined'] = np.asarray(
    adata_g_filtered[:, genes].X.sum(axis=1)
).ravel()

# Symmetric limits (+/- the maximum score) put zero at the middle of the colour map.
bcell_score_max = float(adata_g_filtered.obs['BCell_Markers_Combined'].max())

# Plot UMAP; with show=False scanpy returns the Axes it drew on.
ax = sc.pl.umap(
    adata_g_filtered,
    color='BCell_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="BCell Marker Combined Expression",
    vmin=-bcell_score_max,
    vmax=bcell_score_max,
    show=False
)

# Adjust figure size
fig = ax.figure
fig.set_size_inches(5, 3)

# Clean up axis (this title replaces the one passed to scanpy above)
ax.set_title("BCell Aggregate Gene Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Save final figure; create the folder first so savefig cannot fail on a fresh checkout.
bcell_gene_pdf = "Intermediate_Files/Clustering/Figures/Markers/Cell_type_Gene/umap_BCell_combined_expression_gene_aggregate.pdf"
os.makedirs(os.path.dirname(bcell_gene_pdf), exist_ok=True)
fig.tight_layout()
fig.savefig(bcell_gene_pdf, dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
## NK cell aggregate marker plot (gene-level)
# Sums five NK marker genes per cell into obs['NKCell_Markers_Combined'], colours the
# gene-level UMAP by that score and saves the figure as a PDF.

# Genes of interest (entries of adata_g_filtered.var_names)
genes = [
    'GZMB:ENSG00000100453',
    'NCAM1:ENSG00000149294',
    'KLRF1:ENSG00000150045',
    'IL2RB:ENSG00000100385',
    'ITGAM:ENSG00000169896',
]

# Calculate summed expression per cell. Summing on the matrix itself works for sparse
# and dense .X alike and avoids densifying the slice first.
adata_g_filtered.obs['NKCell_Markers_Combined'] = np.asarray(
    adata_g_filtered[:, genes].X.sum(axis=1)
).ravel()

# Symmetric limits (+/- the maximum score) put zero at the middle of the colour map.
nkcell_score_max = float(adata_g_filtered.obs['NKCell_Markers_Combined'].max())

# Plot UMAP; with show=False scanpy returns the Axes it drew on.
ax = sc.pl.umap(
    adata_g_filtered,
    color='NKCell_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="NKCell Marker Combined Expression",
    vmin=-nkcell_score_max,
    vmax=nkcell_score_max,
    show=False
)

# Adjust figure size
fig = ax.figure
fig.set_size_inches(5, 3)

# Clean up axis (this title replaces the one passed to scanpy above)
ax.set_title("Natural Killer (NK) Cell Aggregate Gene Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Save final figure.
# NOTE(review): this is the only gene-level aggregate plot written under
# "Clustering_07232025/" instead of "Clustering/" — confirm which folder is intended.
# The folder is created here so that savefig cannot fail when it does not exist yet.
nkcell_gene_pdf = "Intermediate_Files/Clustering_07232025/Figures/Markers/Cell_type_Gene/umap_NKCell_combined_expression_gene_aggregate.pdf"
os.makedirs(os.path.dirname(nkcell_gene_pdf), exist_ok=True)
fig.tight_layout()
fig.savefig(nkcell_gene_pdf, dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
## Monocyte-derived cell aggregate marker plot (gene-level)
# Sums five marker genes per cell into obs['Mono_Markers_Combined'], colours the
# gene-level UMAP by that score and saves the figure as a PDF.

# Genes of interest (entries of adata_g_filtered.var_names)
genes = [
    'FCGR2A:ENSG00000143226',
    'FCGR3A:ENSG00000203747',
    'CLEC7A:ENSG00000172243',
    'CD33:ENSG00000105383',
    'LILRB4:ENSG00000186818',
]

# Calculate summed expression per cell. Summing on the matrix itself works for sparse
# and dense .X alike and avoids densifying the slice first.
adata_g_filtered.obs['Mono_Markers_Combined'] = np.asarray(
    adata_g_filtered[:, genes].X.sum(axis=1)
).ravel()

# Symmetric limits (+/- the maximum score) put zero at the middle of the colour map.
mono_score_max = float(adata_g_filtered.obs['Mono_Markers_Combined'].max())

# Plot UMAP; with show=False scanpy returns the Axes it drew on.
ax = sc.pl.umap(
    adata_g_filtered,
    color='Mono_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Monocyte-derived Marker Combined Expression",
    vmin=-mono_score_max,
    vmax=mono_score_max,
    show=False
)

# Adjust figure size
fig = ax.figure
fig.set_size_inches(5, 3)

# Clean up axis (this title replaces the one passed to scanpy above)
ax.set_title("Monocyte-derived Aggregate Gene Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Save final figure; create the folder first so savefig cannot fail on a fresh checkout.
mono_gene_pdf = "Intermediate_Files/Clustering/Figures/Markers/Cell_type_Gene/umap_MonocyteDerived_combined_expression_gene_aggregate.pdf"
os.makedirs(os.path.dirname(mono_gene_pdf), exist_ok=True)
fig.tight_layout()
fig.savefig(mono_gene_pdf, dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
## Megakaryocyte aggregate marker plot (gene-level)
# Sums three marker genes per cell into obs['MK_Markers_Combined'], colours the
# gene-level UMAP by that score and saves the figure as a PDF.

# Genes of interest (entries of adata_g_filtered.var_names)
genes = [
    'MPL:ENSG00000117400',
    'ITGA2B:ENSG00000005961',
    'GP1BA:ENSG00000185245',
]

# Calculate summed expression per cell. Summing on the matrix itself works for sparse
# and dense .X alike and avoids densifying the slice first.
adata_g_filtered.obs['MK_Markers_Combined'] = np.asarray(
    adata_g_filtered[:, genes].X.sum(axis=1)
).ravel()

# Symmetric limits (+/- the maximum score) put zero at the middle of the colour map.
mk_score_max = float(adata_g_filtered.obs['MK_Markers_Combined'].max())

# Plot UMAP; with show=False scanpy returns the Axes it drew on.
ax = sc.pl.umap(
    adata_g_filtered,
    color='MK_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Megakaryocyte Marker Combined Expression",
    vmin=-mk_score_max,
    vmax=mk_score_max,
    show=False
)

# Adjust figure size
fig = ax.figure
fig.set_size_inches(5, 3)

# Clean up axis (this title replaces the one passed to scanpy above)
ax.set_title("Megakaryocyte Aggregate Gene Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Save final figure; create the folder first so savefig cannot fail on a fresh checkout.
mk_gene_pdf = "Intermediate_Files/Clustering/Figures/Markers/Cell_type_Gene/umap_Megakaryocyte_combined_expression_gene_aggregate.pdf"
os.makedirs(os.path.dirname(mk_gene_pdf), exist_ok=True)
fig.tight_layout()
fig.savefig(mk_gene_pdf, dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
# Map cluster labels (the values of obs['0.06_log_AutoZI'], compared as strings) to broad
# immune cell types. Presumably these are Leiden clusters — confirm against the clustering cell.
celltype_names = {
    "0": "TCells",
    "1": "NK Cells",
    "2": "BCells",
    "3": "Monocyte-derived",
    "4": "Megakaryocytes"
}

# Add the cell-type annotation to the AnnData object as a column of string labels.
# Cluster ids that are not keys of `celltype_names` are left unchanged by .replace().
# NOTE(review): the labels are read from `adata_g_filtered_pbmc` but written to
# `adata_g_filtered`. `adata_g_filtered_pbmc` is not defined in the visible part of the
# notebook — confirm it exists on a fresh kernel and shares obs_names with
# `adata_g_filtered` (the assignment aligns on the obs index).
adata_g_filtered.obs['gen_cell_type'] = adata_g_filtered_pbmc.obs['0.06_log_AutoZI'].astype(str).replace(celltype_names)

# Display number of cells per assigned type
print(adata_g_filtered.obs['gen_cell_type'].value_counts())

In [None]:
## Final Gene-level UMAP figure
# Colours the gene-level UMAP by obs['gen_cell_type'], replaces scanpy's legend with one
# in a fixed order placed below the plot, and saves the figure as a PDF.

import matplotlib.pyplot as plt
import scanpy as sc

sc.settings.figdir = "Intermediate_Files/Clustering/Figures/UMAP"

# Only use the desired resolution (not referenced again in this cell)
resolutions = ['0.06_log_AutoZI']

# Define desired order for legend; labels missing from this list are left out of the legend
legend_order = ["TCells", "NK Cells", "BCells", "Monocyte-derived", "Megakaryocytes"]

# UMAP colored by assigned cell type.
# 'right margin' is requested only so that scanpy creates labelled legend handles; that
# legend is removed and rebuilt below, so its position never shows in the final figure.
# NOTE(review): whether other `legend_loc` values (e.g. 'lower center') produce a legend
# for embedding plots appears to depend on the scanpy version — confirm before reverting.
ax = sc.pl.umap(
    adata_g_filtered,
    color='gen_cell_type',
    title='',  # No title for publication version
    frameon=True,
    palette=plt.get_cmap('tab20').colors,
    legend_loc='right margin',
    legend_fontsize=10,
    legend_fontoutline=1,
    show=False
)

# Adjust figure size and collect the legend entries scanpy created
fig = ax.figure
fig.set_size_inches(6, 4)
handles, labels = ax.get_legend_handles_labels()

# Reorder legend entries with desired order
order = [labels.index(l) for l in legend_order if l in labels]
ordered_handles = [handles[i] for i in order]
ordered_labels = [labels[i] for i in order]

# Remove old legend (guarded: get_legend() returns None when no legend was drawn)
old_legend = ax.get_legend()
if old_legend is not None:
    old_legend.remove()

# Add re-ordered legend below plot
ax.legend(ordered_handles, ordered_labels,
          loc='lower center',
          bbox_to_anchor=(0.5, -0.25),  # Centered below, negative y pushes down
          fontsize=10,
          frameon=True,
          ncol=3)  # entries laid out in three columns

fig.tight_layout()

# Save final UMAP figure; create the folder first so savefig cannot fail on a fresh checkout.
gene_umap_pdf = "Intermediate_Files/Clustering/Figures/UMAP/UMAP_0.06_log_autoZI_gene_celltypes.pdf"
os.makedirs(os.path.dirname(gene_umap_pdf), exist_ok=True)
fig.savefig(gene_umap_pdf, dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:


# T cell aggregate marker plot (isoform-level): sums every isoform of the listed genes
# per cell into obs['TCell_Combined'], colours the isoform-level UMAP by that score and
# saves the figure as a PDF.
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
import os

# --- Input: gene names only ---
genes_of_interest = ["CD3G", "CD3D", "CD3E"]
cluster_order = ["TCells", "NK Cells", "BCells", "Monocyte-derived", "Megakaryocytes"]  # not used in this cell

# --- Output settings ---
sc.settings.figdir = "Intermediate_Files/Clustering_07232025/Figures/UMAP"
output_file = "Intermediate_Files/Clustering_07232025/Figures/Markers/Cell_type_Iso/umap_TCell_combined_expression_isoform_aggregate.pdf"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# --- Automatically find all matching isoforms ---
# A variable matches when its name starts with "<gene>:"; str.startswith accepts a tuple,
# so a single call checks every prefix.
isoform_prefixes = tuple(f"{gene}:" for gene in genes_of_interest)
isoform_vars = [v for v in adata_i_filtered.var_names if v.startswith(isoform_prefixes)]

if not isoform_vars:
    raise ValueError("No isoforms found for specified genes.")

# Report genes that matched nothing, so a partial match is not silently plotted.
unmatched_genes = [
    gene for gene, prefix in zip(genes_of_interest, isoform_prefixes)
    if not any(v.startswith(prefix) for v in isoform_vars)
]
if unmatched_genes:
    print(f"Warning: no isoforms found for {unmatched_genes}")

# --- Compute summed expression across those isoforms ---
# Summing on the matrix itself works for sparse and dense .X alike and avoids
# densifying the slice first.
adata_i_filtered.obs['TCell_Combined'] = np.asarray(
    adata_i_filtered[:, isoform_vars].X.sum(axis=1)
).ravel()

# Symmetric limits (+/- the maximum score) put zero at the middle of the colour map.
tcell_iso_score_max = float(adata_i_filtered.obs['TCell_Combined'].max())

# --- Plot UMAP (with show=False scanpy returns the Axes it drew on) ---
ax = sc.pl.umap(
    adata_i_filtered,
    color='TCell_Combined',
    cmap="coolwarm",
    frameon=True,
    title=None,  # We'll customize manually
    vmin=-tcell_iso_score_max,
    vmax=tcell_iso_score_max,
    show=False
)

# --- Adjust figure size and labels ---
fig = ax.figure
fig.set_size_inches(5, 3)
ax.set_title("TCell Aggregate Isoform Marker Expression", fontsize=12)
ax.set_xticks([])
ax.set_yticks([])

# --- Save figure ---
fig.tight_layout()
fig.savefig(output_file, dpi=600, transparent=True, bbox_inches="tight")
plt.show()

In [None]:
# NK cell aggregate marker plot (isoform-level): sums every isoform of the listed genes
# per cell into obs['NKCell_Combined'], colours the isoform-level UMAP by that score and
# saves the figure as a PDF.
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
import os

# --- Input: gene names only ---
# Bare gene symbols, no trailing colon: the ":" separator is appended when the prefixes
# are built below. (Entries written as "GZMB:" would become "GZMB::" and match nothing.)
genes_of_interest = ["GZMB", "KLRF1", "NCAM1", "ITGAM", "IL2RB"]
cluster_order = ["TCells", "NK Cells", "BCells", "Monocyte-derived", "Megakaryocytes"]  # not used in this cell

# --- Output settings ---
sc.settings.figdir = "Intermediate_Files/Clustering_07232025/Figures/UMAP"
output_file = "Intermediate_Files/Clustering_07232025/Figures/Markers/Cell_type_Iso/umap_NKCell_combined_expression_isoform_aggregate.pdf"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# --- Automatically find all matching isoforms ---
# A variable matches when its name starts with "<gene>:"; str.startswith accepts a tuple,
# so a single call checks every prefix.
isoform_prefixes = tuple(f"{gene}:" for gene in genes_of_interest)
isoform_vars = [v for v in adata_i_filtered.var_names if v.startswith(isoform_prefixes)]

if not isoform_vars:
    raise ValueError("No isoforms found for specified genes.")

# Report genes that matched nothing, so a partial match is not silently plotted.
unmatched_genes = [
    gene for gene, prefix in zip(genes_of_interest, isoform_prefixes)
    if not any(v.startswith(prefix) for v in isoform_vars)
]
if unmatched_genes:
    print(f"Warning: no isoforms found for {unmatched_genes}")

# --- Compute summed expression across those isoforms ---
# Summing on the matrix itself works for sparse and dense .X alike and avoids
# densifying the slice first.
adata_i_filtered.obs['NKCell_Combined'] = np.asarray(
    adata_i_filtered[:, isoform_vars].X.sum(axis=1)
).ravel()

# Symmetric limits (+/- the maximum score) put zero at the middle of the colour map.
nkcell_iso_score_max = float(adata_i_filtered.obs['NKCell_Combined'].max())

# --- Plot UMAP (with show=False scanpy returns the Axes it drew on) ---
ax = sc.pl.umap(
    adata_i_filtered,
    color='NKCell_Combined',
    cmap="coolwarm",
    frameon=True,
    title=None,  # We'll customize manually
    vmin=-nkcell_iso_score_max,
    vmax=nkcell_iso_score_max,
    show=False
)

# --- Adjust figure size and labels ---
fig = ax.figure
fig.set_size_inches(5, 3)
ax.set_title("Natural Killer Cell Aggregate Isoform Marker Expression", fontsize=12)
ax.set_xticks([])
ax.set_yticks([])

# --- Save figure ---
fig.tight_layout()
fig.savefig(output_file, dpi=600, transparent=True, bbox_inches="tight")
plt.show()

In [None]:

# Monocyte-derived aggregate marker plot (isoform-level): sums every isoform of the listed
# genes per cell into obs['Monocyte_Combined'], colours the isoform-level UMAP by that
# score and saves the figure as a PDF.
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
import os

# --- Input: gene names only ---
# NOTE(review): the gene-level monocyte cell also includes FCGR3A; it is absent here —
# confirm whether that omission is intentional.
genes_of_interest = ["FCGR2A", "CLEC7A", "CD33", "LILRB4"]
cluster_order = ["TCells", "NK Cells", "BCells", "Monocyte-derived", "Megakaryocytes"]  # not used in this cell

# --- Output settings ---
sc.settings.figdir = "Intermediate_Files/Clustering_07232025/Figures/UMAP"
output_file = "Intermediate_Files/Clustering_07232025/Figures/Markers/Cell_type_Iso/umap_MonocyteDerived_combined_expression_isoform_aggregate.pdf"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# --- Automatically find all matching isoforms ---
# A variable matches when its name starts with "<gene>:"; str.startswith accepts a tuple,
# so a single call checks every prefix.
isoform_prefixes = tuple(f"{gene}:" for gene in genes_of_interest)
isoform_vars = [v for v in adata_i_filtered.var_names if v.startswith(isoform_prefixes)]

if not isoform_vars:
    raise ValueError("No isoforms found for specified genes.")

# Report genes that matched nothing, so a partial match is not silently plotted.
unmatched_genes = [
    gene for gene, prefix in zip(genes_of_interest, isoform_prefixes)
    if not any(v.startswith(prefix) for v in isoform_vars)
]
if unmatched_genes:
    print(f"Warning: no isoforms found for {unmatched_genes}")

# --- Compute summed expression across those isoforms ---
# Summing on the matrix itself works for sparse and dense .X alike and avoids
# densifying the slice first.
adata_i_filtered.obs['Monocyte_Combined'] = np.asarray(
    adata_i_filtered[:, isoform_vars].X.sum(axis=1)
).ravel()

# Symmetric limits (+/- the maximum score) put zero at the middle of the colour map.
mono_iso_score_max = float(adata_i_filtered.obs['Monocyte_Combined'].max())

# --- Plot UMAP (with show=False scanpy returns the Axes it drew on) ---
ax = sc.pl.umap(
    adata_i_filtered,
    color='Monocyte_Combined',
    cmap="coolwarm",
    frameon=True,
    title=None,  # We'll customize manually
    vmin=-mono_iso_score_max,
    vmax=mono_iso_score_max,
    show=False
)

# --- Adjust figure size and labels ---
fig = ax.figure
fig.set_size_inches(5, 3)
ax.set_title("Monocyte-derived Cell Aggregate Isoform Marker Expression", fontsize=12)
ax.set_xticks([])
ax.set_yticks([])

# --- Save figure ---
fig.tight_layout()
fig.savefig(output_file, dpi=600, transparent=True, bbox_inches="tight")
plt.show()

In [None]:
# B cell aggregate marker plot (isoform-level): sums every isoform of the listed genes
# per cell into obs['BCell_Combined'], colours the isoform-level UMAP by that score and
# saves the figure as a PDF.
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
import os

# --- Input: gene names only ---
genes_of_interest = ["CD22", "CD79A", "MS4A1", "CD19"]
cluster_order = ["TCells", "NK Cells", "BCells", "Monocyte-derived", "Megakaryocytes"]  # not used in this cell

# --- Output settings ---
sc.settings.figdir = "Intermediate_Files/Clustering_07232025/Figures/UMAP"
output_file = "Intermediate_Files/Clustering_07232025/Figures/Markers/Cell_type_Iso/umap_BCell_combined_expression_isoform_aggregate.pdf"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# --- Automatically find all matching isoforms ---
# A variable matches when its name starts with "<gene>:"; str.startswith accepts a tuple,
# so a single call checks every prefix.
isoform_prefixes = tuple(f"{gene}:" for gene in genes_of_interest)
isoform_vars = [v for v in adata_i_filtered.var_names if v.startswith(isoform_prefixes)]

if not isoform_vars:
    raise ValueError("No isoforms found for specified genes.")

# Report genes that matched nothing, so a partial match is not silently plotted.
unmatched_genes = [
    gene for gene, prefix in zip(genes_of_interest, isoform_prefixes)
    if not any(v.startswith(prefix) for v in isoform_vars)
]
if unmatched_genes:
    print(f"Warning: no isoforms found for {unmatched_genes}")

# --- Compute summed expression across those isoforms ---
# Summing on the matrix itself works for sparse and dense .X alike and avoids
# densifying the slice first.
adata_i_filtered.obs['BCell_Combined'] = np.asarray(
    adata_i_filtered[:, isoform_vars].X.sum(axis=1)
).ravel()

# Symmetric limits (+/- the maximum score) put zero at the middle of the colour map.
bcell_iso_score_max = float(adata_i_filtered.obs['BCell_Combined'].max())

# --- Plot UMAP (with show=False scanpy returns the Axes it drew on) ---
ax = sc.pl.umap(
    adata_i_filtered,
    color='BCell_Combined',
    cmap="coolwarm",
    frameon=True,
    title=None,  # We'll customize manually
    vmin=-bcell_iso_score_max,
    vmax=bcell_iso_score_max,
    show=False
)

# --- Adjust figure size and labels ---
fig = ax.figure
fig.set_size_inches(5, 3)
ax.set_title("B Cell Aggregate Isoform Marker Expression", fontsize=12)
ax.set_xticks([])
ax.set_yticks([])

# --- Save figure ---
fig.tight_layout()
fig.savefig(output_file, dpi=600, transparent=True, bbox_inches="tight")
plt.show()

In [None]:
# Megakaryocyte aggregate marker plot (isoform-level): sums every isoform of the listed
# genes per cell into obs['MK_Combined'], colours the isoform-level UMAP by that score
# and saves the figure as a PDF.
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
import os

# --- Input: gene names only ---
genes_of_interest = ["GP1BA", "MPL", "ITGA2B"]
cluster_order = ["TCells", "NK Cells", "BCells", "Monocyte-derived", "Megakaryocytes"]  # not used in this cell

# --- Output settings ---
sc.settings.figdir = "Intermediate_Files/Clustering_07232025/Figures/UMAP"
output_file = "Intermediate_Files/Clustering_07232025/Figures/Markers/Cell_type_Iso/umap_Megakaryocyte_combined_expression_isoform_aggregate.pdf"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# --- Automatically find all matching isoforms ---
# A variable matches when its name starts with "<gene>:"; str.startswith accepts a tuple,
# so a single call checks every prefix.
isoform_prefixes = tuple(f"{gene}:" for gene in genes_of_interest)
isoform_vars = [v for v in adata_i_filtered.var_names if v.startswith(isoform_prefixes)]

if not isoform_vars:
    raise ValueError("No isoforms found for specified genes.")

# Report genes that matched nothing, so a partial match is not silently plotted.
unmatched_genes = [
    gene for gene, prefix in zip(genes_of_interest, isoform_prefixes)
    if not any(v.startswith(prefix) for v in isoform_vars)
]
if unmatched_genes:
    print(f"Warning: no isoforms found for {unmatched_genes}")

# --- Compute summed expression across those isoforms ---
# Summing on the matrix itself works for sparse and dense .X alike and avoids
# densifying the slice first.
adata_i_filtered.obs['MK_Combined'] = np.asarray(
    adata_i_filtered[:, isoform_vars].X.sum(axis=1)
).ravel()

# Symmetric limits (+/- the maximum score) put zero at the middle of the colour map.
mk_iso_score_max = float(adata_i_filtered.obs['MK_Combined'].max())

# --- Plot UMAP (with show=False scanpy returns the Axes it drew on) ---
ax = sc.pl.umap(
    adata_i_filtered,
    color='MK_Combined',
    cmap="coolwarm",
    frameon=True,
    title=None,  # We'll customize manually
    vmin=-mk_iso_score_max,
    vmax=mk_iso_score_max,
    show=False
)

# --- Adjust figure size and labels ---
fig = ax.figure
fig.set_size_inches(5, 3)
ax.set_title("Megakaryocyte Aggregate Isoform Marker Expression", fontsize=12)
ax.set_xticks([])
ax.set_yticks([])

# --- Save figure ---
fig.tight_layout()
fig.savefig(output_file, dpi=600, transparent=True, bbox_inches="tight")
plt.show()

In [None]:
# Map cluster labels (the values of obs['0.06_log_AutoZI'], compared as strings) to broad
# immune cell types. Each cluster id maps to its own label; no clusters are merged here.
celltype_names = {
    "0": "TCells",
    "1": "NK Cells",
    "2": "BCells",
    "3": "Monocyte-derived",
    "4": "Megakaryocytes"
}

# Create a new observation column holding the cell-type label (as strings) for each cell.
# Cluster ids that are not keys of `celltype_names` are left unchanged by .replace().
# NOTE(review): the labels are read from `adata_i_filtered_pbmc` but written to
# `adata_i_filtered`. `adata_i_filtered_pbmc` is not defined in the visible part of the
# notebook — confirm it exists on a fresh kernel and shares obs_names with
# `adata_i_filtered` (the assignment aligns on the obs index).
adata_i_filtered.obs['gen_cell_type'] = adata_i_filtered_pbmc.obs['0.06_log_AutoZI'].astype(str).replace(celltype_names)

# Verify the result: number of cells per assigned type
print(adata_i_filtered.obs['gen_cell_type'].value_counts())

In [None]:
## Final Isoform-level UMAP figure
# Colours the isoform-level UMAP by obs['gen_cell_type'], replaces scanpy's legend with
# one in a fixed order placed below the plot, and saves the figure as a PDF.

import matplotlib.pyplot as plt
import scanpy as sc

sc.settings.figdir = "Intermediate_Files/Clustering/Figures/UMAP"

# Only use the desired resolution (not referenced again in this cell)
resolutions = ['0.06_log_AutoZI']

# Define desired order for legend; labels missing from this list are left out of the legend
legend_order = ["TCells", "NK Cells", "BCells", "Monocyte-derived", "Megakaryocytes"]

# UMAP colored by assigned cell type.
# 'right margin' is requested only so that scanpy creates labelled legend handles; that
# legend is removed and rebuilt below, so its position never shows in the final figure.
# NOTE(review): whether other `legend_loc` values (e.g. 'lower center') produce a legend
# for embedding plots appears to depend on the scanpy version — confirm before reverting.
ax = sc.pl.umap(
    adata_i_filtered,
    color='gen_cell_type',
    title='',  # No title for publication version
    frameon=True,
    palette=plt.get_cmap('tab20').colors,
    legend_loc='right margin',
    legend_fontsize=10,
    legend_fontoutline=1,
    show=False
)

# Adjust figure size and collect the legend entries scanpy created
fig = ax.figure
fig.set_size_inches(6, 4)
handles, labels = ax.get_legend_handles_labels()

# Reorder legend entries with desired order
order = [labels.index(l) for l in legend_order if l in labels]
ordered_handles = [handles[i] for i in order]
ordered_labels = [labels[i] for i in order]

# Remove old legend (guarded: get_legend() returns None when no legend was drawn)
old_legend = ax.get_legend()
if old_legend is not None:
    old_legend.remove()

# Add re-ordered legend below plot
ax.legend(ordered_handles, ordered_labels,
          loc='lower center',
          bbox_to_anchor=(0.5, -0.25),  # Centered below, negative y pushes down
          fontsize=10,
          frameon=True,
          ncol=3)  # entries laid out in three columns

fig.tight_layout()

# Save final UMAP figure under an isoform-specific name. The previous "..._gene_celltypes.pdf"
# name was the gene-level figure's file, so this cell silently overwrote that figure.
iso_umap_pdf = "Intermediate_Files/Clustering/Figures/UMAP/UMAP_0.06_log_autoZI_iso_celltypes.pdf"
os.makedirs(os.path.dirname(iso_umap_pdf), exist_ok=True)
fig.savefig(iso_umap_pdf, dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
# Persist the annotated gene- and isoform-level objects into the clustering folder.
# NOTE(review): both objects come from `mdata.mod['rna']`, so they are presumably AnnData;
# their `.write` would then emit h5ad content even though the file names end in ".h5mu" —
# confirm how downstream code reads these files before changing the extension.
output_dir = 'Intermediate_Files/Clustering/'

for clustered_adata, clustered_filename in (
    (adata_g_filtered, "PBMC_gene_AutoZI_clustered_celltypes_AutoZILatent.h5mu"),
    (adata_i_filtered, "PBMC_iso_AutoZI_clustered_celltypes_AutoZILatent.h5mu"),
):
    clustered_adata.write(os.path.join(output_dir, clustered_filename))