In [None]:
import io
import tempfile
from anndata import AnnData
import muon as mu
import numpy as np
import requests
import os
import scanpy as sc
import scvi
import seaborn as sns
import torch
import pandas as pd
import sys
import scrublet as scr
import skimage
import pybiomart
from bioservices import BioMart
import rdata
import matplotlib.pyplot as plt
from adjustText import adjust_text
from scipy.stats import beta
import leidenalg
import igraph
import tqdm
import time
import gc
import polars as pl
import pyarrow

In [None]:
# Directory the two clustered input files are read from
output_dir = 'Intermediate_Files/Clustering'

from scanpy import read_h5ad

# Build both paths first, then read the gene-level and isoform-level objects.
# NOTE(review): the files carry an .h5mu extension but are read with read_h5ad —
# confirm they were written as plain AnnData objects.
gene_level_path = os.path.join(output_dir, "PBMC_gene_AutoZI_clustered_celltypes_AutoZILatent.h5mu")
iso_level_path = os.path.join(output_dir, "PBMC_iso_AutoZI_clustered_celltypes_AutoZILatent.h5mu")
adata_g_filtered = read_h5ad(gene_level_path)
adata_i_filtered = read_h5ad(iso_level_path)

In [None]:
# Select the cells whose gen_cell_type is "TCells" and materialise the subset as a copy
is_tcell_gene = adata_g_filtered.obs["gen_cell_type"] == "TCells"
adata_TCell = adata_g_filtered[is_tcell_gene].copy()

In [None]:
# Same selection on the isoform-level object: keep gen_cell_type == "TCells" as a copy
is_tcell_iso = adata_i_filtered.obs["gen_cell_type"] == "TCells"
adata_TCell_i = adata_i_filtered[is_tcell_iso].copy()

In [None]:
# Select the var_names (combined_ID format) that begin with any of the given prefixes
def find_matching_genes(prefixes, gene_list):
    """Return the entries of ``gene_list`` that start with at least one prefix.

    Parameters:
    - prefixes: collection of string prefixes to look for
    - gene_list: iterable of identifiers to scan; its order is preserved

    Returns a list; it is empty when nothing matches or ``prefixes`` is empty.
    """
    wanted = tuple(prefixes)  # str.startswith accepts a tuple of alternatives
    matched = []
    for gene in gene_list:
        if gene.startswith(wanted):
            matched.append(gene)
    return matched

# Marker panels, written as var_name prefixes. Identifiers used elsewhere in this
# notebook have the form "SYMBOL:ENSEMBL_ID" (e.g. 'CD27:ENSG00000139193'), so every
# prefix ends with ":" to match the symbol exactly; without it "GATA3" would also
# match symbols such as "GATA3-AS1".
# NOTE(review): "CD3Z:" may match nothing if the annotation lists this gene under a
# different symbol (e.g. CD247) — confirm against var_names.
TCell_Markers = ["CD4:", "CD3D:", "CD3E:", "CD3G:", "CD3Z:", "CD8A:", "CD8B:", "PTPRC:", "NCAM1:"]
Naive_TCell = ["PTPRC:", "TCF7:", "FOXP1:", "LEF1:", "IL2RA:", "CD27:", "IL7R:", "ITGAE:"]
Memory_TCell = ["CCR5:", "HLA-DRB1:", "HLA-DRA:", "SELL:", "TCF7:", "IL7R:", "CCR7:"]
CD8_TCell = ["CD8A:", "CD8B:", "CXCR3:", "KLRB1:", "PTGDR2:", "GATA3:", "IRF4:", "RORC:", "CCL5:"]
#Central_Memory_TCell = ["CCR5:", "IL7RA:", "EOMES:", "PRDM1:", "IL7R:", "SELL", "CCR7:"]
#Effector_Memory_TCell = ["CCR5:", "HLA-DRB1:", "HLA-DRA:", "ITGAL:", "GZMA:", "PRDM1:", "SELL"]
Reg_TCell = ["FOXP3:", "IL2RA:", "CTLA4:", "STAT5A:"]
#Th1_TCell = ["CXCR3:", "IFNG:", "TNF:", "STAT4:"]
CD4_Effector = ["CD4:", "CXCR3:", "TNF:", "STAT4:", "IL17A:", "IL13:", "IL25:", "AHR:", "FOXO4:", "GATA3:", "IL2RA:"]
# Duplicate entries (IL2RA, STAT5A) removed; duplicates never changed the matches.
Transition = ["CD4:", "GATA3:", "AHR:", "IL2RA:", "CCR7:", "IL7R:", "TCF7:", "CX3CR1:", "PDCD1:", "TOX:", "STAT5A:", 'CD27:', 'LEF1:',
              "FOXP3:", "CTLA4:"]

# (label used in the printed summary, prefix list) in the order of the numbered
# TCell_genes_N / TCell_iso_N variables that the plotting cells rely on.
marker_panels = [
    ("T-Cells", TCell_Markers),
    ("Naive T-Cells", Naive_TCell),
    ("Memory T-Cells", Memory_TCell),
    ("CD8+ T-Cells", CD8_TCell),
    ("Regulatory T-Cells", Reg_TCell),
    ("Helper T-Cells", CD4_Effector),
    ("Effector-Memory Transition T-Cells", Transition),
]

# Get gene matches for gene-level data
gene_level_matches = []
for panel_label, panel_prefixes in marker_panels:
    panel_matches = find_matching_genes(panel_prefixes, adata_g_filtered.var_names)
    print(f"Matched {len(panel_matches)} gene-level IDs for {panel_label}: {panel_matches}")
    gene_level_matches.append(panel_matches)

(TCell_genes_1, TCell_genes_2, TCell_genes_3, TCell_genes_4,
 TCell_genes_5, TCell_genes_6, TCell_genes_7) = gene_level_matches


# Get gene matches for isoform-level data
iso_level_matches = []
for panel_label, panel_prefixes in marker_panels:
    panel_matches = find_matching_genes(panel_prefixes, adata_i_filtered.var_names)
    print(f"Matched {len(panel_matches)} isoform-level IDs for {panel_label}: {panel_matches}")
    iso_level_matches.append(panel_matches)

(TCell_iso_1, TCell_iso_2, TCell_iso_3, TCell_iso_4,
 TCell_iso_5, TCell_iso_6, TCell_iso_7) = iso_level_matches

In [None]:
# PCA with scanpy's default settings.
# NOTE(review): the neighbour graph below is built on 'X_AutoZI', so this PCA does
# not appear to feed the UMAP — confirm it is still needed.
sc.pp.pca(adata_TCell)

# Neighbour graph on the 'X_AutoZI' representation, followed by the UMAP embedding
sc.pp.neighbors(
    adata_TCell,
    n_neighbors=20,
    use_rep='X_AutoZI',
)
sc.tl.umap(adata_TCell, min_dist=0.3)

# Gene-level embedding coloured by batch
sc.pl.umap(
    adata_TCell,
    color="batch",
    title="Gene UMAP by batch",
)

In [None]:
# PCA with scanpy's default settings.
# NOTE(review): the neighbour graph below is built on 'X_AutoZI', so this PCA does
# not appear to feed the UMAP — confirm it is still needed.
sc.pp.pca(adata_TCell_i)

# Neighbour graph on the 'X_AutoZI' representation, followed by the UMAP embedding
sc.pp.neighbors(
    adata_TCell_i,
    n_neighbors=20,
    use_rep='X_AutoZI',
)
sc.tl.umap(adata_TCell_i, min_dist=0.3)

# Isoform-level embedding coloured by batch
sc.pl.umap(
    adata_TCell_i,
    color="batch",
    title="Isoform UMAP by batch",
)

In [None]:
# Leiden clustering of both objects over a grid of resolutions. Each result is
# stored in .obs under '<resolution>_log_AutoZI'.
resolutions = [
    0.04, 0.06, 0.1, 0.14, 0.16,
    0.2, 0.24, 0.26, 0.3, 0.34,
    0.36, 0.4, 0.44, 0.46, 0.5,
]
for res in resolutions:
    leiden_key = f'{res}_log_AutoZI'
    for adata_obj in (adata_TCell, adata_TCell_i):
        sc.tl.leiden(
            adata_obj,
            resolution=res,
            key_added=leiden_key,
            flavor="igraph",
            n_iterations=2,
        )

In [None]:
def relabel_clusters_by_size(adata, cluster_key):
    """Renumber the clusters in ``adata.obs[cluster_key]`` by decreasing size.

    The largest cluster becomes 0, the next largest 1, and so on. The column is
    overwritten in place as a categorical and the same ``adata`` object is returned.
    """
    sizes = adata.obs[cluster_key].value_counts()

    # Old labels ordered from the most to the least populated cluster
    by_size_desc = np.argsort(-sizes.values)
    ranked_labels = sizes.index[by_size_desc]

    # old label -> rank (0 = largest)
    rank_of = dict(zip(ranked_labels, range(len(ranked_labels))))

    adata.obs[cluster_key] = adata.obs[cluster_key].map(rank_of).astype('category')
    return adata

In [None]:
# Renumber clusters by size for the resolutions 0.04-0.3.
# Note: this rebinds `resolutions` from floats to the .obs column names.
resolutions = [
    f'{value}_log_AutoZI'
    for value in (0.04, 0.06, 0.1, 0.14, 0.16, 0.2, 0.24, 0.26, 0.3)
]
for cluster_key in resolutions:
    # relabel_clusters_by_size returns the object it was given, so the *_pbmc
    # names refer to the same objects as adata_TCell / adata_TCell_i.
    adata_TCell_pbmc = relabel_clusters_by_size(adata_TCell, cluster_key)
    adata_TCell_i_pbmc = relabel_clusters_by_size(adata_TCell_i, cluster_key)

In [None]:
# Shared implementation behind the gene-level and isoform-level UMAP plotters
def _plot_umap_with_labels(adata, resolutions, use_rep_key, level):
    """
    Plots one UMAP per resolution, coloured by the cluster column
    '<resolution>_<use_rep_key>' with the cluster labels drawn on the data.

    Parameters:
    - adata: AnnData object holding the UMAP embedding and the cluster columns
    - resolutions: List of resolution values to plot
    - use_rep_key: Suffix of the cluster column names (e.g. 'log_AutoZI')
    - level: Text inserted in the plot title (e.g. 'Gene-Level')

    Raises:
    - ValueError: if use_rep_key is None, because no column name can be built
    """
    if use_rep_key is None:
        # Previously this silently produced the column name '<res>_None'.
        raise ValueError("use_rep_key must be provided (e.g. 'log_AutoZI'); there is no default.")

    print(f"Using representation: {use_rep_key}")  # Print assigned representation

    vibrant_palette = plt.get_cmap('tab20').colors  # Set color palette

    for res in resolutions:
        # Plot UMAP with cluster labels
        sc.pl.umap(
            adata,
            color=f'{res}_{use_rep_key}',
            title=f'UMAP with Clusters ({level}, {use_rep_key}, Res={res})',
            frameon=True,
            palette=vibrant_palette,
            legend_loc='on data',
            legend_fontsize=10,
            legend_fontoutline=2,
        )


# Function to plot gene-level UMAPs with on-data cluster labels
def plot_umap_with_labels_g(adata, resolutions, use_rep_key=None):
    """
    Plots gene-level UMAPs with cluster labels and prints the value of use_rep_key.

    Parameters:
    - adata: AnnData object
    - resolutions: List of resolution values to plot
    - use_rep_key: Suffix of the cluster columns to colour by; required in
      practice (a ValueError is raised when it is left as None)
    """
    _plot_umap_with_labels(adata, resolutions, use_rep_key, level="Gene-Level")


# Function to plot isoform-level UMAPs with on-data cluster labels
def plot_umap_with_labels_i(adata, resolutions, use_rep_key=None):
    """
    Plots isoform-level UMAPs with cluster labels and prints the value of use_rep_key.

    Parameters:
    - adata: AnnData object
    - resolutions: List of resolution values to plot
    - use_rep_key: Suffix of the cluster columns to colour by; required in
      practice (a ValueError is raised when it is left as None)
    """
    _plot_umap_with_labels(adata, resolutions, use_rep_key, level="Isoform-Level")

In [None]:
# Gene-level cluster UMAPs for the resolutions that were relabelled by size
gene_plot_resolutions = [0.04, 0.06, 0.1, 0.14, 0.16, 0.2, 0.24, 0.26, 0.3]
plot_umap_with_labels_g(
    adata_TCell_pbmc,
    resolutions=gene_plot_resolutions,
    use_rep_key='log_AutoZI',
)

In [None]:
# TwoSlopeNorm is first needed here; scanpy, matplotlib.pyplot, numpy and os are
# already imported in the notebook's import cell.
from matplotlib.colors import TwoSlopeNorm

# Define output directory
marker_root = "Intermediate_Files/Clustering/Figures/Markers"
group_name = "General T cell (T cell Alone)"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

print("General T cell Markers")

# Loop through matched genes and plot them
for gene in TCell_genes_1:
    safe_gene_name = gene.replace(":", "_")

    # Expression vector for this gene, flattened to 1-D (densified if sparse)
    expr = adata_TCell[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only
    expr_nonzero = expr[expr > 0]

    # 99th percentile of the non-zero values; fall back to 1.0 when no cell
    # expresses the gene so the colour norm below stays valid
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero. The guarded vmax_val is reused as-is:
    # recomputing the percentile here would discard the empty-array fallback.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centred by the norm
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # << This handles vmin, vmax, and vcenter
    )

In [None]:
# Define output directory
marker_root = "Intermediate_Files/Clustering/Figures/Markers"
group_name = "Naive TCell (TCell Alone)"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

print("Naive T-Cell Markers")

# Loop through matched genes and plot them
for gene in TCell_genes_2:
    safe_gene_name = gene.replace(":", "_")

    # Expression vector for this gene, flattened to 1-D (densified if sparse)
    expr = adata_TCell[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only
    expr_nonzero = expr[expr > 0]

    # 99th percentile of the non-zero values; fall back to 1.0 when no cell
    # expresses the gene so the colour norm below stays valid
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero. The guarded vmax_val is reused as-is:
    # recomputing the percentile here would discard the empty-array fallback.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centred by the norm
        # Leading underscore added so the file name matches the other marker
        # loops in this notebook (the save string is appended to the plot name).
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # << This handles vmin, vmax, and vcenter
    )

In [None]:
### Gene-level Aggregate Naive T cell marker score by cell (UMAP) (Figure 5h)

# Genes of interest
genes = ['CD27:ENSG00000139193', "ITGAE:ENSG00000083457", "LEF1:ENSG00000138795"]

# Sum the expression of the marker genes per cell (densify first if X is sparse)
marker_expr = adata_TCell[:, genes].X
if hasattr(marker_expr, "toarray"):
    marker_expr = marker_expr.toarray()
adata_TCell.obs['Naive_Markers_Combined'] = np.asarray(marker_expr).sum(axis=1)

# Colour limits symmetric around zero, spanning the maximum combined score
# (the 100th percentile is the maximum)
score_max = np.percentile(adata_TCell.obs['Naive_Markers_Combined'], 100)

# Plot UMAP; show=False keeps the figure open for the adjustments below
sc.pl.umap(
    adata_TCell,
    color='Naive_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Naive TCell Marker Combined Expression",
    vmin=-score_max,
    vmax=score_max,
    show=False
)

# Adjust figure size
fig = plt.gcf()
fig.set_size_inches(4, 3)

# Clean up axis
ax = plt.gca()
ax.set_title("Naive TCell Aggregated Gene Marker Expression")  # Replaces the title passed to sc.pl.umap
ax.set_xticks([])
ax.set_yticks([])

# Save final figure (create the target directory first; savefig does not)
out_path = "Intermediate_Files/Clustering/Figures/UMAP/umap_Naive_TCellOnly_combined_expression_gene.pdf"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
plt.tight_layout()
plt.savefig(out_path, dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
print("Memory T cell Markers")

# Define output directory
marker_root = "Intermediate_Files/Clustering/Figures/Markers"
group_name = "Memory T cell (T cells Alone)"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

# Loop through matched genes and plot them
for gene in TCell_genes_3:
    safe_gene_name = gene.replace(":", "_")

    # Expression vector for this gene, flattened to 1-D (densified if sparse)
    expr = adata_TCell[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only
    expr_nonzero = expr[expr > 0]

    # 99th percentile of the non-zero values; fall back to 1.0 when no cell
    # expresses the gene so the colour norm below stays valid
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero. The guarded vmax_val is reused as-is:
    # recomputing the percentile here would discard the empty-array fallback.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centred by the norm
        # Leading underscore added so the file name matches the other marker
        # loops in this notebook (the save string is appended to the plot name).
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # << This handles vmin, vmax, and vcenter
    )

In [None]:
### Gene-level Aggregate Memory T cell marker score by cell (UMAP) (Figure 5h)

print("Aggregate Score for Memory T cell Markers")

# Genes of interest
genes = ['TCF7:ENSG00000081059', 'CCR7:ENSG00000126353', 'SELL:ENSG00000188404']

# Sum the expression of the marker genes per cell (densify first if X is sparse)
marker_expr = adata_TCell[:, genes].X
if hasattr(marker_expr, "toarray"):
    marker_expr = marker_expr.toarray()
adata_TCell.obs['TMem_Markers_Combined'] = np.asarray(marker_expr).sum(axis=1)

# Colour limits symmetric around zero, spanning the maximum combined score
# (the 100th percentile is the maximum)
score_max = np.percentile(adata_TCell.obs['TMem_Markers_Combined'], 100)

# Plot UMAP; show=False keeps the figure open for the adjustments below
sc.pl.umap(
    adata_TCell,
    color='TMem_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Memory T cell Marker Combined Expression",
    vmin=-score_max,
    vmax=score_max,
    show=False
)

# Adjust figure size
fig = plt.gcf()
fig.set_size_inches(4, 3)

# Clean up axis
ax = plt.gca()
ax.set_title("Memory T cell Aggregated Gene Marker Expression")  # Replaces the title passed to sc.pl.umap
ax.set_xticks([])
ax.set_yticks([])

# Save final figure (create the target directory first; savefig does not)
out_path = "Intermediate_Files/Clustering/Figures/UMAP/umap_TMem_TCellOnly_combined_expression_gene.pdf"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
plt.tight_layout()
plt.savefig(out_path, dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
print("CD8+ T cell Markers")

# Define output directory
marker_root = "Intermediate_Files/Clustering/Figures/Markers"
group_name = "CD8 TCell (TCell Alone)"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir


# Loop through matched genes and plot them
for gene in TCell_genes_4:
    safe_gene_name = gene.replace(":", "_")

    # Expression vector for this gene, flattened to 1-D (densified if sparse)
    expr = adata_TCell[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only
    expr_nonzero = expr[expr > 0]

    # 99th percentile of the non-zero values; fall back to 1.0 when no cell
    # expresses the gene so the colour norm below stays valid
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero. The guarded vmax_val is reused as-is:
    # recomputing the percentile here would discard the empty-array fallback.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centred by the norm
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # << This handles vmin, vmax, and vcenter
    )

In [None]:
### Gene-level Aggregate Effector CD8+ T cell marker score by cell (UMAP) (Figure 5g)

# Genes of interest
genes = ['GATA3:ENSG00000107485', 'KLRB1:ENSG00000111796', 'CD8A:ENSG00000153563',
         'CD8B:ENSG00000172116', 'CCL5:ENSG00000271503']

# Sum the expression of the marker genes per cell (densify first if X is sparse)
marker_expr = adata_TCell[:, genes].X
if hasattr(marker_expr, "toarray"):
    marker_expr = marker_expr.toarray()
adata_TCell.obs['CD8_TCell_Markers_Combined'] = np.asarray(marker_expr).sum(axis=1)

# Colour limits symmetric around zero, spanning the maximum combined score
# (the 100th percentile is the maximum)
score_max = np.percentile(adata_TCell.obs['CD8_TCell_Markers_Combined'], 100)

# Plot UMAP; show=False keeps the figure open for the adjustments below
sc.pl.umap(
    adata_TCell,
    color='CD8_TCell_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Cytotoxic TCell Marker Combined Expression",
    vmin=-score_max,
    vmax=score_max,
    show=False
)

# Adjust figure size
fig = plt.gcf()
fig.set_size_inches(4, 3)

# Clean up axis
ax = plt.gca()
ax.set_title("Cytotoxic TCell Aggregate Gene Marker Expression")  # Replaces the title passed to sc.pl.umap
ax.set_xticks([])
ax.set_yticks([])

# Save final figure (create the target directory first; savefig does not)
out_path = "Intermediate_Files/Clustering/Figures/UMAP/umap_CytotoxicTCell_TCellsOnly_combined_expression_gene_aggregate.pdf"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
plt.tight_layout()
plt.savefig(out_path, dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
print("Regulatory T-Cell Markers")

# Define output directory
# NOTE(review): this is the only marker loop that writes under the dated
# "Clustering_07282025" directory instead of "Clustering" — confirm this is intended.
marker_root = "Intermediate_Files/Clustering_07282025/Figures/Markers"
group_name = "Regulatory TCell"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir


# Loop through matched genes and plot them
for gene in TCell_genes_5:
    safe_gene_name = gene.replace(":", "_")

    # Expression vector for this gene, flattened to 1-D (densified if sparse)
    expr = adata_TCell[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only
    expr_nonzero = expr[expr > 0]

    # 99th percentile of the non-zero values; fall back to 1.0 when no cell
    # expresses the gene so the colour norm below stays valid
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero. The guarded vmax_val is reused as-is:
    # recomputing the percentile here would discard the empty-array fallback.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centred by the norm
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # << This handles vmin, vmax, and vcenter
    )

In [None]:
### Gene-level Aggregate Regulatory T cell marker score by cell (UMAP)

# Keep the matched IDs whose name contains one of the regulatory marker symbols
Reg_filtered = [i for i in TCell_genes_5 if any(gene in i for gene in ["FOXP3", "STAT5A", "IL2RA", "CTLA4"])]

# Safety: keep only the markers that are present in adata_TCell.var_names
genes = [g for g in Reg_filtered if g in adata_TCell.var_names]

# Calculate summed expression per cell (densify first if X is sparse)
X = adata_TCell[:, genes].X
if hasattr(X, "toarray"):
    X = X.toarray()

adata_TCell.obs['Reg_TCell_Gene_Markers_Combined'] = X.sum(axis=1)

# Colour limits symmetric around zero, spanning the maximum combined score
# (the 100th percentile is the maximum)
score_max = np.percentile(adata_TCell.obs['Reg_TCell_Gene_Markers_Combined'], 100)

# Plot UMAP; show=False keeps the figure open for the adjustments below
sc.pl.umap(
    adata_TCell,
    color='Reg_TCell_Gene_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Regulatory TCell Gene Marker Expression",
    vmin=-score_max,
    vmax=score_max,
    show=False
)

# Adjust figure size
fig = plt.gcf()
fig.set_size_inches(4, 3)

# Clean up axis
ax = plt.gca()
ax.set_title("Total Regulatory TCell Aggregate Gene Marker Expression")  # Replaces the title passed to sc.pl.umap
ax.set_xticks([])
ax.set_yticks([])

# Save figure (create the target directory first; savefig does not).
# NOTE(review): this path uses the dated "Clustering_07282025" directory while the
# other aggregate figures go under "Clustering" — confirm this is intended.
out_path = "Intermediate_Files/Clustering_07282025/Figures/UMAP/umap_RegulatoryTCell_TCellsOnly_combined_expression_gene_aggregate.pdf"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
plt.tight_layout()
plt.savefig(out_path, dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
print("Helper T cell Markers")
# Define output directory
marker_root = "Intermediate_Files/Clustering/Figures/Markers"
group_name = "Helper (CD4) T cell(T cell Alone)"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir


# Loop through matched genes and plot them
for gene in TCell_genes_6:
    safe_gene_name = gene.replace(":", "_")

    # Expression vector for this gene, flattened to 1-D (densified if sparse)
    expr = adata_TCell[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only
    expr_nonzero = expr[expr > 0]

    # 99th percentile of the non-zero values; fall back to 1.0 when no cell
    # expresses the gene so the colour norm below stays valid
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero. The guarded vmax_val is reused as-is:
    # recomputing the percentile here would discard the empty-array fallback.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centred by the norm
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # << This handles vmin, vmax, and vcenter
    )

In [None]:
### Gene-level Aggregate CD4+ Effector T cell marker score by cell (UMAP) (Figure 5f)

# Genes of interest
genes = ['GATA3:ENSG00000107485', 'CD4:ENSG00000010610', 'AHR:ENSG00000106546',
         'IL2RA:ENSG00000134460', 'TNF:ENSG00000232810']

# Sum the expression of the marker genes per cell (densify first if X is sparse)
marker_expr = adata_TCell[:, genes].X
if hasattr(marker_expr, "toarray"):
    marker_expr = marker_expr.toarray()
adata_TCell.obs['CD4_Effector_TCell_Markers_Combined'] = np.asarray(marker_expr).sum(axis=1)

# Colour limits symmetric around zero, spanning the maximum combined score
# (the 100th percentile is the maximum)
score_max = np.percentile(adata_TCell.obs['CD4_Effector_TCell_Markers_Combined'], 100)

# Plot UMAP; show=False keeps the figure open for the adjustments below
sc.pl.umap(
    adata_TCell,
    color='CD4_Effector_TCell_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="CD4+ Effector T cell Marker Combined Expression",
    vmin=-score_max,
    vmax=score_max,
    show=False
)

# Adjust figure size
fig = plt.gcf()
fig.set_size_inches(4, 3)

# Clean up axis
ax = plt.gca()
ax.set_title("CD4+ Effector T cell Aggregate Gene Marker Expression")  # Replaces the title passed to sc.pl.umap
ax.set_xticks([])
ax.set_yticks([])

# Save final figure (create the target directory first; savefig does not)
out_path = "Intermediate_Files/Clustering/Figures/UMAP/umap_CD4EffectorTCell_TCellsOnlycombined_expression_gene_aggregate.pdf"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
plt.tight_layout()
plt.savefig(out_path, dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
print("Effector-Memory Transition T cell Markers")
# Define output directory
marker_root = "Intermediate_Files/Clustering/Figures/Markers"
group_name = "Effector-Memory Transition T cell (T cell Alone)"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir


# Loop through matched genes and plot them
for gene in TCell_genes_7:
    safe_gene_name = gene.replace(":", "_")

    # Expression vector for this gene, flattened to 1-D (densified if sparse)
    expr = adata_TCell[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only
    expr_nonzero = expr[expr > 0]

    # 99th percentile of the non-zero values; fall back to 1.0 when no cell
    # expresses the gene so the colour norm below stays valid
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero. The guarded vmax_val is reused as-is:
    # recomputing the percentile here would discard the empty-array fallback.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centred by the norm
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # << This handles vmin, vmax, and vcenter
    )

In [None]:
### Gene-level Aggregate Transition T cell marker score by cell (UMAP) (Figure 5i)

# Genes of interest
genes = ['GATA3:ENSG00000107485',
    'CCR7:ENSG00000126353',#
    'TCF7:ENSG00000081059', #
    'IL2RA:ENSG00000134460',
    'CTLA4:ENSG00000163599',
    'CD27:ENSG00000139193', #
    'ITGAE:ENSG00000083457',
    'LEF1:ENSG00000138795'#
]

# Sum the expression of the marker genes per cell (densify first if X is sparse)
marker_expr = adata_TCell[:, genes].X
if hasattr(marker_expr, "toarray"):
    marker_expr = marker_expr.toarray()
adata_TCell.obs['Transition_TCell_Markers_Combined'] = np.asarray(marker_expr).sum(axis=1)

# Colour limits symmetric around zero, spanning the maximum combined score
# (the 100th percentile is the maximum)
score_max = np.percentile(adata_TCell.obs['Transition_TCell_Markers_Combined'], 100)

# Plot UMAP; show=False keeps the figure open for the adjustments below
sc.pl.umap(
    adata_TCell,
    color='Transition_TCell_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Effector-Memory Transition TCell Marker Combined Expression",
    vmin=-score_max,
    vmax=score_max,
    show=False
)

# Adjust figure size
fig = plt.gcf()
fig.set_size_inches(4, 2.4)

# Clean up axis
ax = plt.gca()
ax.set_title("Effector-Memory Transition T cell Aggregate Gene Marker Expression")  # Replaces the title passed to sc.pl.umap
ax.set_xticks([])
ax.set_yticks([])

# Save final figure (create the target directory first; savefig does not).
# NOTE(review): the other aggregate figures are written to ".../Figures/UMAP/umap_*.pdf";
# this one goes to ".../Figures/UMAP_*.pdf" — possibly a missing "/", confirm.
out_path = "Intermediate_Files/Clustering/Figures/UMAP_TransitionTCell_combined_expression_gene_aggregate.pdf"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
plt.tight_layout()
plt.savefig(out_path, dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
# Additional transition-related marker prefixes, matched against the gene-level var_names
MoreTransition = [
    "STAT4:", "IFNG:", "FOXO4:", "CD4:",
    "TCF7:", "IL7R:", "CTLA4:",
]

Transition_genes = find_matching_genes(MoreTransition, adata_g_filtered.var_names)
n_transition_matches = len(Transition_genes)
print(f"Matched {n_transition_matches} gene-level IDs for Transition Cells: {Transition_genes}")

In [None]:
print("Transition Cell Markers")

marker_root = "Intermediate_Files/Clustering/Figures/Markers/Cell_type_Gene"
group_name = "TransitionCells"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

# Percentile of the non-zero expression used as the colour-scale limit.
# 100 means the maximum, so no cell can exceed it.
vmax_percentile = 100

# Loop through matched genes and plot them
for gene in Transition_genes:
    safe_gene_name = gene.replace(":", "_")

    # Expression vector for this gene, flattened to 1-D (densified if sparse).
    # NOTE(review): the scale is taken from adata_g_filtered (all cells) while the
    # plot below shows adata_TCell — confirm whether adata_TCell was intended here.
    expr = adata_g_filtered[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only
    expr_nonzero = expr[expr > 0]

    # Fall back to 1.0 when no cell expresses the gene so the colour norm below stays valid
    vmax_val = np.percentile(expr_nonzero, vmax_percentile) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the chosen percentile (message now reports the
    # percentile actually used instead of a hard-coded "99th")
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above {vmax_percentile}th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero. The guarded vmax_val is reused as-is:
    # recomputing the percentile here would discard the empty-array fallback.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centred by the norm
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # << This handles vmin, vmax, and vcenter
    )

In [None]:
# Manually identify the clusters as follows.
# The label spellings must match the `legend_order` lists used by the plotting
# cells below; "Effector-Memory Transition T cells" was previously spelled
# "...Transition TCells", which dropped it from the legend and made the category
# reordering in the publication plot fail.

tcell_cluster_mapping = {
    "0": "Memory T cells",
    "1": "Effector CD4 T cells",
    "2": "Effector CD8 T cells",
    "3": "Effector-Memory Transition T cells",
}

# Create new cell type annotations from the size-ranked clusters at resolution 0.26
cluster_ids = adata_TCell_pbmc.obs["0.26_log_AutoZI"].astype(str)
adata_TCell_pbmc.obs["TCell_subtype"] = cluster_ids.map(tcell_cluster_mapping)

# Clusters missing from the mapping become NaN; report them instead of failing silently
unmapped_clusters = sorted(set(cluster_ids) - set(tcell_cluster_mapping))
if unmapped_clusters:
    print(f"Warning: clusters without a subtype label (left as NaN): {unmapped_clusters}")

# Check assignments
adata_TCell_pbmc.obs["TCell_subtype"].value_counts()

In [None]:
# Define desired legend order
legend_order = ["Memory T cells", "Effector CD4 T cells", "Effector CD8 T cells", "Effector-Memory Transition T cells"]

# Plot UMAP using subtype annotation
sc.pl.umap(
    adata_TCell_pbmc,
    color='TCell_subtype',
    title='',  # No title for publication version
    frameon=True,
    palette=plt.get_cmap('tab20').colors,
    legend_loc='lower right',
    legend_fontsize=10,
    legend_fontoutline=1,
    show=False  # Allow editing
)

# Adjust figure
fig = plt.gcf()
fig.set_size_inches(5, 4)

# Get current legend handles and labels
ax = plt.gca()
handles, labels = ax.get_legend_handles_labels()

# Build a dictionary of current labels and handles
label_handle_dict = dict(zip(labels, handles))

# Labels from legend_order first, in that order ...
ordered_labels = [label for label in legend_order if label in label_handle_dict]
# ... then any plotted label that legend_order does not list, so that no
# category is silently dropped from the legend.
ordered_labels += [label for label in labels if label not in ordered_labels]
ordered_handles = [label_handle_dict[label] for label in ordered_labels]

# Remove old legend (if one was drawn)
old_legend = ax.get_legend()
if old_legend is not None:
    old_legend.remove()

# Add new ordered legend
ax.legend(ordered_handles, ordered_labels,
          loc='lower center',
          bbox_to_anchor=(0.48, -0.3),  # Centered below
          fontsize=10,
          frameon=True,
          ncol=2)

plt.tight_layout()

# Save figure (create the target directory first; savefig does not)
out_path = "Intermediate_Files/Clustering/Figures/UMAP/UMAP_TCell_subtypes.pdf"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
plt.savefig(out_path, dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
### Plot UMAP with sub-cell-type labels

import os

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import scanpy as sc

# Desired legend order
legend_order = [
    "Memory T cells",
    "Effector CD4 T cells",
    "Effector CD8 T cells",
    "Effector-Memory Transition T cells"
]

OBS_KEY = "TCell_subtype"

# NOTE(review): "TCell_subtype" is assigned to `adata_TCell_pbmc` earlier in this
# notebook, while this cell reads it from `adata_TCell` -- confirm both names
# refer to the same object.

# Subtype labels as a categorical with unused levels dropped
subtype_values = adata_TCell.obs[OBS_KEY].astype("category").cat.remove_unused_categories()
observed_categories = list(subtype_values.cat.categories)

# Keep only categories present, in the preferred order
present = [c for c in legend_order if c in observed_categories]
if not present:
    raise ValueError(f"No categories from legend_order found in adata.obs['{OBS_KEY}'].")

# reorder_categories() requires *every* existing category to be listed, so any
# observed label missing from legend_order is appended instead of raising.
present += [c for c in observed_categories if c not in present]

adata_TCell.obs[OBS_KEY] = subtype_values.cat.reorder_categories(present, ordered=True)

# Use full tab20 palette
palette = plt.get_cmap("tab20").colors

# Plot UMAP without Scanpy legend
sc.pl.umap(
    adata_TCell,
    color=OBS_KEY,
    title="",
    frameon=True,
    palette=palette,
    legend_loc=None,   # disable Scanpy's legend
    show=False,
    size=6
)

# Match publication dimensions
fig = plt.gcf()
fig.set_size_inches(6, 4)
ax = plt.gca()

# Axis labels
ax.set_xlabel("UMAP1", fontsize=9)
ax.set_ylabel("UMAP2", fontsize=9)
ax.set_xticks([]); ax.set_yticks([])
for spine in ["top", "right"]:
    ax.spines[spine].set_visible(False)

# Manual legend patches: the i-th category gets the i-th palette colour, which is
# the order in which the palette is handed to Scanpy above
handles = [
    mpatches.Patch(facecolor=palette[i], edgecolor="none", label=present[i])
    for i in range(len(present))
]

# Adjust layout to leave space at the bottom for legend
fig.subplots_adjust(bottom=0.3)

legend = fig.legend(
    handles, present,
    loc="lower center",
    bbox_to_anchor=(0.5, 0.1),   # centered below the axes
    fontsize=9,
    frameon=True,                # box around legend
    fancybox=False,              # square corners
    ncol=2,
    handlelength=1.0,
    handletextpad=0.5,
    columnspacing=1.1,
    borderpad=0.4
)
legend.get_frame().set_edgecolor("black")
legend.get_frame().set_linewidth(0.8)

# Save (create the target folder first so savefig cannot fail on a fresh checkout)
out_base = "Intermediate_Files/Paper_Figs/UMAP/UMAP_TCell_subtypes"
os.makedirs(os.path.dirname(out_base), exist_ok=True)
fig.savefig(f"{out_base}.pdf", dpi=600, transparent=True, bbox_inches="tight")
fig.savefig(f"{out_base}.png", dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
from pandas.api.types import CategoricalDtype

# First run only: seed the re-annotated column from the broad cell-type labels
if "gen_cell_type_reannotated" not in adata_g_filtered.obs.columns:
    adata_g_filtered.obs["gen_cell_type_reannotated"] = adata_g_filtered.obs["gen_cell_type"]

# The category bookkeeping below needs a categorical column
if not isinstance(adata_g_filtered.obs["gen_cell_type_reannotated"].dtype, CategoricalDtype):
    adata_g_filtered.obs["gen_cell_type_reannotated"] = (
        adata_g_filtered.obs["gen_cell_type_reannotated"].astype("category")
    )

# Subtype labels that occur in the T cell subset
new_categories = adata_TCell_pbmc.obs["TCell_subtype"].astype("category").cat.categories

# Register only labels the column does not know yet
existing_cats = set(adata_g_filtered.obs["gen_cell_type_reannotated"].cat.categories)
cats_to_add = list(set(new_categories).difference(existing_cats))
if len(cats_to_add) > 0:
    adata_g_filtered.obs["gen_cell_type_reannotated"] = (
        adata_g_filtered.obs["gen_cell_type_reannotated"].cat.add_categories(cats_to_add)
    )

# Overwrite the T cell rows with their subtype labels (plain strings, positional)
adata_g_filtered.obs.loc[adata_TCell_pbmc.obs_names, "gen_cell_type_reannotated"] = (
    adata_TCell_pbmc.obs["TCell_subtype"].astype(str).to_numpy()
)

# Check output
print(adata_g_filtered.obs["gen_cell_type_reannotated"].value_counts())

In [None]:
# Drop category levels that no cell carries any more (e.g. "TCells" once every
# T cell has received a subtype label)
adata_g_filtered.obs["gen_cell_type_reannotated"] = (
    adata_g_filtered.obs["gen_cell_type_reannotated"]
    .cat.remove_unused_categories()
)

# Check result
print(adata_g_filtered.obs["gen_cell_type_reannotated"].value_counts())

In [None]:
# UMAP of the full gene-level object coloured by the re-annotated cell types.
# Order in which the cell types should be listed in the legend
legend_order = [
    "NK cells",
    "B cells",
    "Megakaryocytes",
    "Monocyte-derived",
    "Memory T cells",
    "Effector CD4 T cells",
    "Effector CD8 T cells",
    "Effector-Memory Transition T cells",
]

# show=False leaves the figure open for the legend edits below
sc.pl.umap(
    adata_g_filtered,
    color='gen_cell_type_reannotated',
    title='',  # publication version carries no title
    frameon=True,
    palette=plt.get_cmap('tab20').colors,
    legend_loc='lower right',
    legend_fontsize=10,
    legend_fontoutline=1,
    show=False,
)

# Resize the figure Scanpy just drew
fig = plt.gcf()
fig.set_size_inches(5, 5)

# Look up each legend handle by the label Scanpy gave it
ax = plt.gca()
handles, labels = ax.get_legend_handles_labels()
label_handle_dict = {lab: hnd for lab, hnd in zip(labels, handles)}

# Requested labels that were actually plotted, in the requested order
ordered_labels = [name for name in legend_order if name in label_handle_dict]
ordered_handles = [label_handle_dict[name] for name in ordered_labels]

# Replace Scanpy's legend with the re-ordered one, centred below the axes
ax.get_legend().remove()
ax.legend(
    ordered_handles,
    ordered_labels,
    loc='lower center',
    bbox_to_anchor=(0.5, -0.35),
    fontsize=10,
    frameon=True,
    ncol=2,
)

plt.tight_layout()

# Write the figure to disk before displaying it
plt.savefig(
    "Intermediate_Files/Clustering/Figures/UMAP/UMAP_subtypes.pdf",
    dpi=600,
    transparent=True,
    bbox_inches="tight",
)

plt.show()

In [None]:
# Persist the gene-level object, now carrying the re-annotated cell types
output_dir = 'Intermediate_Files/Clustering/'

adata_g_filtered.write(
    os.path.join(
        output_dir,
        "PBMC_gene_AutoZI_clustered_celltypes_reannotated_AutoZILatent.h5mu",
    )
)

In [None]:
### Now let's do the same on the isoform level

In [None]:
# Run the isoform-level UMAP helper over the candidate clustering resolutions
# (helper is defined elsewhere in this notebook)
plot_umap_with_labels_i(
    adata_TCell_i_pbmc,
    resolutions=[0.04, 0.06, 0.1, 0.14, 0.16, 0.2, 0.24, 0.26, 0.3],
    use_rep_key='log_AutoZI',
)

In [None]:
### General T cell markers alone on isoform-level

import os
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import scanpy as sc
from matplotlib.colors import TwoSlopeNorm

# Scanpy writes `save=` figures into this per-marker-group folder
marker_root = "Intermediate_Files/Clustering/Figures/Markers"
group_name = "T cell (T cell alone)"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

print("T cell Markers")

# Bucket the marker isoforms by gene symbol (text before the first colon)
gene_to_isoforms = defaultdict(list)
for isoform in TCell_iso_1:
    gene = isoform.partition(":")[0]
    gene_to_isoforms[gene].append(isoform)

# One UMAP per isoform, reported gene by gene
for gene, isoforms in gene_to_isoforms.items():
    print(f"\n▶ Gene: {gene} ({len(isoforms)} isoforms)")
    for isoform in isoforms:
        # Colons are not wanted in file names
        safe_isoform_name = "_".join(isoform.split(":"))

        # Per-cell expression of this isoform as a flat dense vector
        expr = adata_TCell_i[:, isoform].X
        expr = expr.toarray().flatten() if hasattr(expr, "toarray") else np.ravel(expr)

        # Scale on expressing cells only
        expr_nonzero = expr[expr > 0]

        # Upper colour limit: 99th percentile of non-zero values (1.0 if none)
        if len(expr_nonzero) > 0:
            vmax_val = np.percentile(expr_nonzero, 99)
        else:
            vmax_val = 1.0

        # Report how many cells saturate the colour scale
        num_above_vmax = np.sum(expr > vmax_val)
        print(f"   └─ {isoform}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

        # Diverging scale, symmetric around zero
        norm = TwoSlopeNorm(vcenter=0, vmin=-vmax_val, vmax=vmax_val)

        sc.pl.umap(
            adata_TCell_i,
            color=isoform,
            use_raw=False,
            title=f"{isoform}",
            cmap="coolwarm",
            norm=norm,
            save=f"_{safe_isoform_name}.pdf",
            show=True,
        )

In [None]:
print("Effector-Memory Transition T cell Markers")

import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import numpy as np
import os
from collections import defaultdict

# Define output directory (Scanpy writes `save=` figures here)
marker_root = "Intermediate_Files/Clustering/Figures/Markers"
group_name = "Effector-Memory Transition T cell (T cell Alone)"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

# Percentile of the non-zero expression values used as the colour-scale limit.
# 100 means the maximum, so no cell can exceed it; lower it (e.g. 99) to clip outliers.
vmax_percentile = 100

# Group isoforms by gene
gene_to_isoforms = defaultdict(list)
for isoform in TCell_iso_7:
    gene = isoform.split(":")[0]  # Use the prefix before first colon
    gene_to_isoforms[gene].append(isoform)

# Loop through each gene and its isoforms
for gene, isoforms in gene_to_isoforms.items():
    print(f"\n▶ Gene: {gene} ({len(isoforms)} isoforms)")
    for isoform in isoforms:
        safe_isoform_name = isoform.replace(":", "_")

        # Get expression vector
        expr = adata_TCell_i[:, isoform].X

        # Convert sparse to dense if needed
        if hasattr(expr, "toarray"):
            expr = expr.toarray().flatten()
        else:
            expr = np.ravel(expr)

        # Scale on expressing cells only
        expr_nonzero = expr[expr > 0]

        # Colour-scale limit; falls back to 1.0 when no cell expresses the isoform
        vmax_val = np.percentile(expr_nonzero, vmax_percentile) if len(expr_nonzero) > 0 else 1.0

        # Count how many cells exceed the limit (i.e. saturate the colour scale)
        num_above_vmax = np.sum(expr > vmax_val)
        print(f"   └─ {isoform}: {num_above_vmax} cells above vmax "
              f"(percentile {vmax_percentile} of non-zero values = {vmax_val:.2f})")

        # Diverging scale, symmetric around zero
        norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

        # Plot UMAP
        sc.pl.umap(
            adata_TCell_i,
            color=isoform,
            use_raw=False,
            title=f"{isoform}",
            cmap="coolwarm",
            save=f"_{safe_isoform_name}.pdf",
            show=True,
            norm=norm
        )

In [None]:
### Isoform-level Aggregate Transition T cell marker score by cell (UMAP) (Figure 5m)

import matplotlib.pyplot as plt
import numpy as np
import scanpy as sc

# Marker genes whose isoforms are pooled (symbol = text before the colon)
gene_symbols = ['ITGAE', 'LEF1', 'CD27', 'GATA3', 'IL2RA', 'CTLA4', 'IL7R']

# Every isoform ID that begins with "<symbol>:" for one of the marker genes
matching_isoforms = [
    g for g in adata_TCell_i.var_names
    if g.startswith(tuple(sym + ":" for sym in gene_symbols))
]

print(f"Found {len(matching_isoforms)} matching isoforms for genes: {gene_symbols}")

# Per-cell sum over the selected isoforms
adata_TCell_i.obs['Transition_Markers_Combined'] = (
    adata_TCell_i[:, matching_isoforms].X.toarray().sum(axis=1)
)

# Colour limits are symmetric around zero and bounded by the largest per-cell sum
fig = sc.pl.umap(
    adata_TCell_i,
    color='Transition_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Effector-Memory Transition T cell Marker Combined Expression",
    vmin=-np.percentile(adata_TCell_i.obs['Transition_Markers_Combined'], 100),
    vmax=np.percentile(adata_TCell_i.obs['Transition_Markers_Combined'], 100),
    show=False,
)

# Resize the figure Scanpy just drew
fig = plt.gcf()
fig.set_size_inches(4, 3)

# Final title, no tick marks
ax = plt.gca()
ax.set_title("Effector-Memory Transition TCell Aggregate Gene Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Write the figure to disk before displaying it
plt.tight_layout()
plt.savefig(
    "Intermediate_Files/Clustering/Figures/UMAP/umap_EffectorMemoryTransition_TCellsOnlycombined_expression_isoform_aggregate.pdf",
    dpi=600,
    transparent=True,
    bbox_inches="tight",
)

plt.show()

In [None]:
print("Helper CD4 T cell Markers")

import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import numpy as np
import os
from collections import defaultdict

# Define output directory (Scanpy writes `save=` figures here)
marker_root = "Intermediate_Files/Clustering/Figures/Markers"
group_name = "Helper (CD4) T cell (T cell Alone)"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

# Percentile of the non-zero expression values used as the colour-scale limit.
# 100 means the maximum, so no cell can exceed it; lower it (e.g. 99) to clip outliers.
vmax_percentile = 100

# Group isoforms by gene
gene_to_isoforms = defaultdict(list)
for isoform in TCell_iso_6:
    gene = isoform.split(":")[0]  # Use the prefix before first colon
    gene_to_isoforms[gene].append(isoform)

# Loop through each gene and its isoforms
for gene, isoforms in gene_to_isoforms.items():
    print(f"\n▶ Gene: {gene} ({len(isoforms)} isoforms)")
    for isoform in isoforms:
        safe_isoform_name = isoform.replace(":", "_")

        # Get expression vector
        expr = adata_TCell_i[:, isoform].X

        # Convert sparse to dense if needed
        if hasattr(expr, "toarray"):
            expr = expr.toarray().flatten()
        else:
            expr = np.ravel(expr)

        # Scale on expressing cells only
        expr_nonzero = expr[expr > 0]

        # Colour-scale limit; falls back to 1.0 when no cell expresses the isoform
        vmax_val = np.percentile(expr_nonzero, vmax_percentile) if len(expr_nonzero) > 0 else 1.0

        # Count how many cells exceed the limit (i.e. saturate the colour scale)
        num_above_vmax = np.sum(expr > vmax_val)
        print(f"   └─ {isoform}: {num_above_vmax} cells above vmax "
              f"(percentile {vmax_percentile} of non-zero values = {vmax_val:.2f})")

        # Diverging scale, symmetric around zero
        norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

        # Plot UMAP
        sc.pl.umap(
            adata_TCell_i,
            color=isoform,
            use_raw=False,
            title=f"{isoform}",
            cmap="coolwarm",
            save=f"_{safe_isoform_name}.pdf",
            show=True,
            norm=norm
        )

In [None]:
### Isoform-level Aggregate Effector CD4+ T cell marker score by cell (UMAP) (Figure 5j)

import matplotlib.pyplot as plt
import numpy as np
import scanpy as sc

# Marker genes whose isoforms are pooled (symbol = text before the colon)
gene_symbols = ['GATA3', 'CD4', 'AHR', 'IL2RA']

# Every isoform ID that begins with "<symbol>:" for one of the marker genes
matching_isoforms = [
    g for g in adata_TCell_i.var_names
    if g.startswith(tuple(sym + ":" for sym in gene_symbols))
]

print(f"Found {len(matching_isoforms)} matching isoforms for genes: {gene_symbols}")

# Per-cell sum over the selected isoforms
adata_TCell_i.obs['CD4_Effector_TCell_Markers_Combined'] = (
    adata_TCell_i[:, matching_isoforms].X.toarray().sum(axis=1)
)

# Colour limits are symmetric around zero and bounded by the largest per-cell sum
fig = sc.pl.umap(
    adata_TCell_i,
    color='CD4_Effector_TCell_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="CD4+ Effector T cell Marker Combined Expression",
    vmin=-np.percentile(adata_TCell_i.obs['CD4_Effector_TCell_Markers_Combined'], 100),
    vmax=np.percentile(adata_TCell_i.obs['CD4_Effector_TCell_Markers_Combined'], 100),
    show=False,
)

# Resize the figure Scanpy just drew
fig = plt.gcf()
fig.set_size_inches(4, 3)

# Final title, no tick marks
ax = plt.gca()
ax.set_title("CD4+ Effector T cell Aggregate Gene Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Write the figure to disk before displaying it
plt.tight_layout()
plt.savefig(
    "Intermediate_Files/Clustering/Figures/UMAP/umap_CD4EffectorTCell_TCellsOnlycombined_expression_isoform_aggregate.pdf",
    dpi=600,
    transparent=True,
    bbox_inches="tight",
)

plt.show()

In [None]:
print("Memory TCell Markers")

import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import numpy as np
import os
from collections import defaultdict

# Define output directory (Scanpy writes `save=` figures here)
marker_root = "Intermediate_Files/Clustering/Figures/Markers"
group_name = "Memory T cell (T cell Alone)"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

# Percentile of the non-zero expression values used as the colour-scale limit.
# 100 means the maximum, so no cell can exceed it; lower it (e.g. 99) to clip outliers.
vmax_percentile = 100

# Group isoforms by gene
gene_to_isoforms = defaultdict(list)
for isoform in TCell_iso_3:
    gene = isoform.split(":")[0]  # Use the prefix before first colon
    gene_to_isoforms[gene].append(isoform)

# Loop through each gene and its isoforms
for gene, isoforms in gene_to_isoforms.items():
    print(f"\n▶ Gene: {gene} ({len(isoforms)} isoforms)")
    for isoform in isoforms:
        safe_isoform_name = isoform.replace(":", "_")

        # Get expression vector
        expr = adata_TCell_i[:, isoform].X

        # Convert sparse to dense if needed
        if hasattr(expr, "toarray"):
            expr = expr.toarray().flatten()
        else:
            expr = np.ravel(expr)

        # Scale on expressing cells only
        expr_nonzero = expr[expr > 0]

        # Colour-scale limit; falls back to 1.0 when no cell expresses the isoform
        vmax_val = np.percentile(expr_nonzero, vmax_percentile) if len(expr_nonzero) > 0 else 1.0

        # Count how many cells exceed the limit (i.e. saturate the colour scale)
        num_above_vmax = np.sum(expr > vmax_val)
        print(f"   └─ {isoform}: {num_above_vmax} cells above vmax "
              f"(percentile {vmax_percentile} of non-zero values = {vmax_val:.2f})")

        # Diverging scale, symmetric around zero
        norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

        # Plot UMAP
        sc.pl.umap(
            adata_TCell_i,
            color=isoform,
            use_raw=False,
            title=f"{isoform}",
            cmap="coolwarm",
            save=f"_{safe_isoform_name}.pdf",
            show=True,
            norm=norm
        )

In [None]:
### Isoform-level Aggregate Memory T cell marker score by cell (UMAP) (Figure 5l)

import matplotlib.pyplot as plt
import numpy as np
import scanpy as sc

# Marker genes whose isoforms are pooled (symbol = text before the colon)
gene_symbols = ['CCR7', 'SELL', 'TCF7']

# Every isoform ID that begins with "<symbol>:" for one of the marker genes
matching_isoforms = [
    g for g in adata_TCell_i.var_names
    if g.startswith(tuple(sym + ":" for sym in gene_symbols))
]

print(f"Found {len(matching_isoforms)} matching isoforms for genes: {gene_symbols}")

# Per-cell sum over the selected isoforms
adata_TCell_i.obs['Mem_TCell_Markers_Combined'] = (
    adata_TCell_i[:, matching_isoforms].X.toarray().sum(axis=1)
)

# Colour limits are symmetric around zero and bounded by the largest per-cell sum
fig = sc.pl.umap(
    adata_TCell_i,
    color='Mem_TCell_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Memory T cell Marker Combined Expression",
    vmin=-np.percentile(adata_TCell_i.obs['Mem_TCell_Markers_Combined'], 100),
    vmax=np.percentile(adata_TCell_i.obs['Mem_TCell_Markers_Combined'], 100),
    show=False,
)

# Resize the figure Scanpy just drew
fig = plt.gcf()
fig.set_size_inches(4, 3)

# Final title, no tick marks
ax = plt.gca()
ax.set_title("Memory T cell Aggregate Gene Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Write the figure to disk before displaying it
plt.tight_layout()
plt.savefig(
    "Intermediate_Files/Clustering/Figures/UMAP/umap_MemTCell_TCellsOnlycombined_expression_isoform_aggregate.pdf",
    dpi=600,
    transparent=True,
    bbox_inches="tight",
)

plt.show()

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import numpy as np
import os
from collections import defaultdict

# Define output directory (Scanpy writes `save=` figures here)
marker_root = "Intermediate_Files/Clustering/Figures/Markers"
group_name = "CD8+ Effector TCell (TCell Alone)"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

print("CD8+ T-Cell Markers")

# Percentile of the non-zero expression values used as the colour-scale limit.
# 100 means the maximum, so no cell can exceed it; lower it (e.g. 99) to clip outliers.
vmax_percentile = 100

# Group isoforms by gene
gene_to_isoforms = defaultdict(list)
for isoform in TCell_iso_4:
    gene = isoform.split(":")[0]  # Use the prefix before first colon
    gene_to_isoforms[gene].append(isoform)

# Loop through each gene and its isoforms
for gene, isoforms in gene_to_isoforms.items():
    print(f"\n▶ Gene: {gene} ({len(isoforms)} isoforms)")
    for isoform in isoforms:
        safe_isoform_name = isoform.replace(":", "_")

        # Get expression vector
        expr = adata_TCell_i[:, isoform].X

        # Convert sparse to dense if needed
        if hasattr(expr, "toarray"):
            expr = expr.toarray().flatten()
        else:
            expr = np.ravel(expr)

        # Scale on expressing cells only
        expr_nonzero = expr[expr > 0]

        # Colour-scale limit; falls back to 1.0 when no cell expresses the isoform
        vmax_val = np.percentile(expr_nonzero, vmax_percentile) if len(expr_nonzero) > 0 else 1.0

        # Count how many cells exceed the limit (i.e. saturate the colour scale)
        num_above_vmax = np.sum(expr > vmax_val)
        print(f"   └─ {isoform}: {num_above_vmax} cells above vmax "
              f"(percentile {vmax_percentile} of non-zero values = {vmax_val:.2f})")

        # Diverging scale, symmetric around zero
        norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

        # Plot UMAP
        sc.pl.umap(
            adata_TCell_i,
            color=isoform,
            use_raw=False,
            title=f"{isoform}",
            cmap="coolwarm",
            save=f"_{safe_isoform_name}.pdf",
            show=True,
            norm=norm
        )

In [None]:
### Isoform-level Aggregate Effector CD8+ T cell marker score by cell (UMAP) (Figure 5j)

import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc

# Gene symbols of interest (before colons)
gene_symbols = ['CD8A', 'CD8B', 'GATA3', 'KLRB1', 'CCL5']

# Find all isoforms that start with each gene symbol
matching_isoforms = [g for g in adata_TCell_i.var_names if any(g.startswith(sym + ":") for sym in gene_symbols)]

print(f"Found {len(matching_isoforms)} matching isoforms for genes: {gene_symbols}")

# Calculate summed expression per cell
adata_TCell_i.obs['EffectorCD8_TCell_Markers_Combined'] = adata_TCell_i[:, matching_isoforms].X.toarray().sum(axis=1)

# Plot UMAP. `color` must name the obs column written just above; the colour
# limits are symmetric around zero and bounded by the largest per-cell sum.
fig = sc.pl.umap(
    adata_TCell_i,
    color='EffectorCD8_TCell_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Effector CD8 T cell Marker Combined Expression",
    vmin=(-(np.percentile(adata_TCell_i.obs['EffectorCD8_TCell_Markers_Combined'], 100))),
    vmax=np.percentile(adata_TCell_i.obs['EffectorCD8_TCell_Markers_Combined'], 100),
    show=False
)

# Adjust figure size
fig = plt.gcf()
fig.set_size_inches(4, 3)

# Clean up axis
ax = plt.gca()
ax.set_title("Effector CD8 T cell Aggregate Gene Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Save final figure
plt.tight_layout()
plt.savefig("Intermediate_Files/Clustering/Figures/UMAP/umap_EffectorCD8TCell_TCellsOnlycombined_expression_isoform_aggregate.pdf",
            dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
print("Regulatory T cell Markers")

import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import numpy as np
import os
from collections import defaultdict

# Define output directory (Scanpy writes `save=` figures here)
marker_root = "Intermediate_Files/Clustering/Figures/Markers"
group_name = "Regulatory T cell (T cell Alone)"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

# Percentile of the non-zero expression values used as the colour-scale limit.
# 100 means the maximum, so no cell can exceed it; lower it (e.g. 99) to clip outliers.
vmax_percentile = 100

# Group isoforms by gene
gene_to_isoforms = defaultdict(list)
for isoform in TCell_iso_5:
    gene = isoform.split(":")[0]  # Use the prefix before first colon
    gene_to_isoforms[gene].append(isoform)

# Loop through each gene and its isoforms
for gene, isoforms in gene_to_isoforms.items():
    print(f"\n▶ Gene: {gene} ({len(isoforms)} isoforms)")
    for isoform in isoforms:
        safe_isoform_name = isoform.replace(":", "_")

        # Get expression vector
        expr = adata_TCell_i[:, isoform].X

        # Convert sparse to dense if needed
        if hasattr(expr, "toarray"):
            expr = expr.toarray().flatten()
        else:
            expr = np.ravel(expr)

        # Scale on expressing cells only
        expr_nonzero = expr[expr > 0]

        # Colour-scale limit; falls back to 1.0 when no cell expresses the isoform
        vmax_val = np.percentile(expr_nonzero, vmax_percentile) if len(expr_nonzero) > 0 else 1.0

        # Count how many cells exceed the limit (i.e. saturate the colour scale)
        num_above_vmax = np.sum(expr > vmax_val)
        print(f"   └─ {isoform}: {num_above_vmax} cells above vmax "
              f"(percentile {vmax_percentile} of non-zero values = {vmax_val:.2f})")

        # Diverging scale, symmetric around zero
        norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

        # Plot UMAP
        sc.pl.umap(
            adata_TCell_i,
            color=isoform,
            use_raw=False,
            title=f"{isoform}",
            cmap="coolwarm",
            save=f"_{safe_isoform_name}.pdf",
            show=True,
            norm=norm
        )

In [None]:
# Isoform-level T cell UMAP coloured by the clusters stored in "0.26_log_AutoZI"
sc.pl.umap(
    adata_TCell_i,
    color='0.26_log_AutoZI',
    title='UMAP with Clusters (Isoform-Level) ',
    frameon=True,
    legend_fontsize=10,
    legend_fontoutline=2,
    size=10,
)

In [None]:
# Manually identified the isoform-level clusters stored in "0.26_log_AutoZI" as follows.
# Label strings use the same "... T cells" spelling as the gene-level annotation
# and the `legend_order` lists in this notebook, so the two levels stay comparable.

tcell_cluster_mapping = {
    "0": "Memory T cells",
    "1": "Effector CD4 T cells",
    "2": "Effector CD8 T cells",
    "3": "Effector-Memory Transition T cells"
}

# Create new cell type annotations (cluster IDs absent from the mapping become NaN)
adata_TCell_i_pbmc.obs["TCell_subtype"] = adata_TCell_i_pbmc.obs["0.26_log_AutoZI"].astype(str).map(tcell_cluster_mapping)

# Check assignments
adata_TCell_i_pbmc.obs["TCell_subtype"].value_counts()

In [None]:
import os

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import scanpy as sc

# Desired legend order (subset of what's present is OK)
legend_order = [
    "Memory T cells",
    "Effector CD4 T cells",
    "Effector CD8 T cells",
    "Effector-Memory Transition T cells"
]

OBS_KEY = "TCell_subtype"

# NOTE(review): this cell sits in the isoform-level section, yet it plots the
# gene-level `adata_TCell` and writes to the same `out_base` as the gene-level
# publication figure -- confirm whether `adata_TCell_i_pbmc` and a distinct file
# name were intended.

# Subtype labels as a categorical with unused levels dropped
subtype_values = adata_TCell.obs[OBS_KEY].astype("category").cat.remove_unused_categories()
observed_categories = list(subtype_values.cat.categories)

# Preferred order, restricted to labels that are actually present
present = [c for c in legend_order if c in observed_categories]
if not present:
    raise ValueError(f"No categories from legend_order found in adata.obs['{OBS_KEY}'].")

# reorder_categories() requires *every* existing category to be listed, so any
# observed label missing from legend_order is appended instead of raising.
present += [c for c in observed_categories if c not in present]

adata_TCell.obs[OBS_KEY] = subtype_values.cat.reorder_categories(present, ordered=True)

# Build a stable palette matching the (reordered) categories
tab20 = plt.get_cmap("tab20").colors
base_idx = [0, 2, 4, 6, 8, 10, 12, 14]  # spaced picks from tab20
# Cycle through base_idx so the palette always has one colour per category
palette = [tab20[base_idx[i % len(base_idx)]] for i in range(len(present))]

# Plot UMAP without legend (we'll add our own)
sc.pl.umap(
    adata_TCell,
    color=OBS_KEY,
    title="",
    frameon=True,
    palette=palette,     # order aligns to 'present'
    legend_loc=None,     # disable Scanpy legend
    show=False,
    size=6               # tweak point size as needed
)

# Figure/axes setup
fig = plt.gcf()
fig.set_size_inches(5, 4)
ax = plt.gca()

# Axis labels (publication-friendly)
ax.set_xlabel("UMAP1", fontsize=9)
ax.set_ylabel("UMAP2", fontsize=9)

# Optional: tidy ticks, keep labels visible
ax.set_xticks([]); ax.set_yticks([])
for spine in ["top", "right"]:
    ax.spines[spine].set_visible(False)

# ---- Manual legend built from categories + palette ----
handles = [mpatches.Patch(facecolor=palette[i], edgecolor="none", label=present[i])
           for i in range(len(present))]

leg = ax.legend(
    handles=handles,
    loc="lower center",
    bbox_to_anchor=(0.5, -0.28),  # under the axis
    fontsize=8,
    frameon=False,
    ncol=2,
    handlelength=1.0,
    columnspacing=1.2,
)

# Save (create the target folder first so savefig cannot fail on a fresh checkout)
out_base = "Intermediate_Files/Paper_Figs/UMAP/UMAP_TCell_subtypes"
os.makedirs(os.path.dirname(out_base), exist_ok=True)
fig.savefig(f"{out_base}.pdf", dpi=600, transparent=True, bbox_inches="tight")
fig.savefig(f"{out_base}.png", dpi=600, transparent=True, bbox_inches="tight")
plt.show()

In [None]:
# Create the column if it doesn't exist
if "gen_cell_type_reannotated" not in adata_i_filtered.obs.columns:
    adata_i_filtered.obs["gen_cell_type_reannotated"] = adata_i_filtered.obs["gen_cell_type"]

# Temporarily convert to plain string for flexible assignment
adata_i_filtered.obs["gen_cell_type_reannotated"] = (
    adata_i_filtered.obs["gen_cell_type_reannotated"].astype(str)
)

# Assign new subtype labels. Row index and values both come from
# `adata_TCell_i_pbmc`, the object the "TCell_subtype" column was written to;
# the Series is aligned on cell barcodes by pandas.
adata_i_filtered.obs.loc[adata_TCell_i_pbmc.obs_names, "gen_cell_type_reannotated"] = (
    adata_TCell_i_pbmc.obs["TCell_subtype"].astype(str)
)

# (Optional) Re-cast to categorical for plotting, ordering, etc.
adata_i_filtered.obs["gen_cell_type_reannotated"] = (
    adata_i_filtered.obs["gen_cell_type_reannotated"].astype("category")
)

# Check output
print(adata_i_filtered.obs["gen_cell_type_reannotated"].value_counts())

In [None]:
# Isoform-level UMAP of all cells coloured by the re-annotated cell types
sc.pl.umap(
    adata_i_filtered,
    color='gen_cell_type_reannotated',
    title='UMAP with Clusters (Isoform-Level) ',
    frameon=True,
    legend_fontsize=10,
    legend_fontoutline=2,
)

In [None]:
# Persist the isoform-level object, now carrying the re-annotated cell types
output_dir = 'Intermediate_Files/Clustering/'

adata_i_filtered.write(
    os.path.join(
        output_dir,
        "PBMC_iso_AutoZI_clustered_celltypes_reannotated_AutoZILatent.h5mu",
    )
)

In [None]:
# Reload the re-annotated gene- and isoform-level objects from disk
import os

from scanpy import read_h5ad

output_dir = 'Intermediate_Files/Clustering'

adata_g_filtered = read_h5ad(
    os.path.join(output_dir, "PBMC_gene_AutoZI_clustered_celltypes_reannotated_AutoZILatent.h5mu")
)
adata_i_filtered = read_h5ad(
    os.path.join(output_dir, "PBMC_iso_AutoZI_clustered_celltypes_reannotated_AutoZILatent.h5mu")
)

In [None]:
# Independent copies of the cells whose broad label is "TCells" (gene and isoform level)
adata_TCell = adata_g_filtered[
    adata_g_filtered.obs["gen_cell_type"] == "TCells"
].copy()
adata_TCell_i = adata_i_filtered[
    adata_i_filtered.obs["gen_cell_type"] == "TCells"
].copy()

In [None]:
### Create violin plot with mitochondrial percentages by cell-type (gene-level)

import os

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Config ---
celltype_col = "gen_cell_type_reannotated"   # Column in .obs with cell type labels
mito_col = "pct_counts_mt"                   # Column in .obs with mitochondrial percentage
output_file = "Intermediate_Files/Paper_Figs/mito_percentage_by_celltype_gene.png"

# Desired custom order
custom_order = [
    "BCells",
    "NK Cells",
    "Monocyte-derived",
    "Megakaryocytes",
    "Effector CD4 TCells",
    "Cytotoxic TCells", # These are the same as CD8+ Effector T cells
    "Memory TCells",
    "Effector-Memory Transition TCells"
]

# Labels that actually occur in the data. A label in custom_order that does not
# occur would yield an all-NaN summary row and an empty violin, and an observed
# label missing from custom_order would be silently left out of the figure.
observed_labels = adata_g_filtered.obs[celltype_col].dropna().astype(str).unique().tolist()
missing_labels = [c for c in custom_order if c not in observed_labels]
if missing_labels:
    print(f"Labels in custom_order not found in '{celltype_col}': {missing_labels}")

# Requested order first, then any remaining observed labels (sorted for determinism)
plot_order = [c for c in custom_order if c in observed_labels]
plot_order += sorted(set(observed_labels) - set(custom_order))

## Group mitochondrial percentage by cell type
mito_by_celltype = (
    adata_g_filtered.obs[[celltype_col, mito_col]]
    .groupby(celltype_col, observed=True)   # skip unused categorical levels
    .agg(
        mean_mito_pct=(mito_col, "mean"),
        median_mito_pct=(mito_col, "median"),
        n_cells=(mito_col, "count")
    )
    .reindex(plot_order)  # match grouping output order to plot order
)

print(mito_by_celltype)

## Create Violin plot
plt.figure(figsize=(10, 5))
sns.violinplot(
    data=adata_g_filtered.obs,
    x=celltype_col,
    y=mito_col,
    order=plot_order,
    palette="Set2",
    inner="box"
)
plt.xticks(rotation=45, ha="right")
plt.ylabel("Mitochondrial %")
plt.title("Mitochondrial Percentage by Cell Type")
plt.tight_layout()

# --- Save figure (create the target folder first) ---
os.makedirs(os.path.dirname(output_file), exist_ok=True)
plt.savefig(output_file, dpi=300, bbox_inches="tight")

plt.show()

In [None]:
### Create violin plot with mitochondrial percentages by cell-type (isoform-level) (Figure S5a)

import os

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Config ---
celltype_col = "gen_cell_type_reannotated"   # Column in .obs with cell type labels
mito_col = "pct_counts_mt"                   # Column in .obs with mitochondrial percentage
output_file = "Intermediate_Files/Paper_Figs/mito_percentage_by_celltype_iso.png"

# Desired custom order
custom_order = [
    "BCells",
    "NK Cells",
    "Monocyte-derived",
    "Megakaryocytes",
    "Effector CD4 TCells",
    "Cytotoxic TCells",
    "Memory TCells",
    "Effector-Memory Transition TCells"
]

# Labels that actually occur in the data. A label in custom_order that does not
# occur would yield an all-NaN summary row and an empty violin, and an observed
# label missing from custom_order would be silently left out of the figure.
observed_labels = adata_i_filtered.obs[celltype_col].dropna().astype(str).unique().tolist()
missing_labels = [c for c in custom_order if c not in observed_labels]
if missing_labels:
    print(f"Labels in custom_order not found in '{celltype_col}': {missing_labels}")

# Requested order first, then any remaining observed labels (sorted for determinism)
plot_order = [c for c in custom_order if c in observed_labels]
plot_order += sorted(set(observed_labels) - set(custom_order))

## Group mitochondrial percentage by cell type
mito_by_celltype = (
    adata_i_filtered.obs[[celltype_col, mito_col]]
    .groupby(celltype_col, observed=True)   # skip unused categorical levels
    .agg(
        mean_mito_pct=(mito_col, "mean"),
        median_mito_pct=(mito_col, "median"),
        n_cells=(mito_col, "count")
    )
    .reindex(plot_order)  # match grouping output order to plot order
)

print(mito_by_celltype)

# --- Violin plot ---
plt.figure(figsize=(10, 5))
sns.violinplot(
    data=adata_i_filtered.obs,
    x=celltype_col,
    y=mito_col,
    order=plot_order,
    palette="Set2",
    inner="box"
)
plt.xticks(rotation=45, ha="right")
plt.ylabel("Mitochondrial %")
plt.title("Mitochondrial Percentage by Cell Type")
plt.tight_layout()

## Save figure (create the target folder first)
os.makedirs(os.path.dirname(output_file), exist_ok=True)
plt.savefig(output_file, dpi=300, bbox_inches="tight")

plt.show()

In [None]:
### Create violin plot with doublet score by cell-type (gene-level)

import os

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Config
celltype_col = "gen_cell_type_reannotated"   # Column in .obs with cell type labels
doublet_col = "doublet_score"                # Column in .obs with doublet score
output_file = "Intermediate_Files/Paper_Figs/doubletscore_by_celltype_gene.png"

# Desired custom order
custom_order = [
    "BCells",
    "NK Cells",
    "Monocyte-derived",
    "Megakaryocytes",
    "Effector CD4 TCells",
    "Cytotoxic TCells",
    "Memory TCells",
    "Effector-Memory Transition TCells"
]

# Labels that actually occur in the data. A label in custom_order that does not
# occur would yield an all-NaN summary row and an empty violin, and an observed
# label missing from custom_order would be silently left out of the figure.
observed_labels = adata_g_filtered.obs[celltype_col].dropna().astype(str).unique().tolist()
missing_labels = [c for c in custom_order if c not in observed_labels]
if missing_labels:
    print(f"Labels in custom_order not found in '{celltype_col}': {missing_labels}")

# Requested order first, then any remaining observed labels (sorted for determinism)
plot_order = [c for c in custom_order if c in observed_labels]
plot_order += sorted(set(observed_labels) - set(custom_order))

## Group doublet score by cell type
doublet_by_celltype = (
    adata_g_filtered.obs[[celltype_col, doublet_col]]
    .groupby(celltype_col, observed=True)   # skip unused categorical levels
    .agg(
        mean_doublet_score=(doublet_col, "mean"),
        median_doublet_score=(doublet_col, "median"),
        n_cells=(doublet_col, "count")
    )
    .reindex(plot_order)  # match the grouping output order to the plot order
)

print(doublet_by_celltype)

## Violin plot
plt.figure(figsize=(10, 5))
sns.violinplot(
    data=adata_g_filtered.obs,
    x=celltype_col,
    y=doublet_col,
    order=plot_order,
    palette="Set2",
    inner="box"
)
plt.xticks(rotation=45, ha="right")
plt.ylabel("Doublet Score")
plt.title("Doublet Score by Cell Type")
plt.tight_layout()

## Save figure (create the target folder first)
os.makedirs(os.path.dirname(output_file), exist_ok=True)
plt.savefig(output_file, dpi=300, bbox_inches="tight")

plt.show()

In [None]:
### Create violin plot with doublet score by cell-type (isoform-level) (Figure S5b)

import os

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Config ---
celltype_col = "gen_cell_type_reannotated"   # Column in .obs with cell type labels
doublet_col = "doublet_score"                # Column in .obs with doublet score
output_file = "Intermediate_Files/Paper_Figs/doubletscore_by_celltype_iso.png"

# Cell types in the order used for both the summary table and the x-axis
custom_order = [
    "BCells",
    "NK Cells",
    "Monocyte-derived",
    "Megakaryocytes",
    "Effector CD4 TCells",
    "Cytotoxic TCells",
    "Memory TCells",
    "Effector-Memory Transition TCells"
]

# --- Group doublet score by cell type ---
# observed=False is spelled out: pandas deprecated the implicit default for categorical
# group keys, so pinning it keeps the table stable and avoids the FutureWarning.
doublet_by_celltype = (
    adata_i_filtered.obs[[celltype_col, doublet_col]]
    .groupby(celltype_col, observed=False)
    .agg(
        mean_doublet_score=(doublet_col, "mean"),
        median_doublet_score=(doublet_col, "median"),
        n_cells=(doublet_col, "count")
    )
    .reindex(custom_order)  # rows follow the plot order; labels absent from the groups become NaN rows
)

print(doublet_by_celltype)

# --- Violin plot (explicit figure/axes instead of the pyplot state machine) ---
fig, ax = plt.subplots(figsize=(10, 5))
sns.violinplot(
    data=adata_i_filtered.obs,
    x=celltype_col,
    y=doublet_col,
    order=custom_order,
    palette="Set2",
    inner="box",
    ax=ax,
)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
ax.set_ylabel("Doublet Score")
ax.set_title("Doublet Score by Cell Type")
fig.tight_layout()

# --- Save figure ---
# savefig does not create missing folders, so make sure the target directory exists first
os.makedirs(os.path.dirname(output_file), exist_ok=True)
fig.savefig(output_file, dpi=300, bbox_inches="tight")

plt.show()

In [None]:
### Create violin plot with total counts by cell-type (gene-level)

import os

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Config ---
celltype_col = "gen_cell_type_reannotated"   # Column in .obs with cell type labels
counts_col = "total_counts"                  # Column in .obs with total counts
output_file = "Intermediate_Files/Paper_Figs/totalcounts_by_celltype_gene.png"

# Cell types in the order used for both the summary table and the x-axis
custom_order = [
    "BCells",
    "NK Cells",
    "Monocyte-derived",
    "Megakaryocytes",
    "Effector CD4 TCells",
    "Cytotoxic TCells",
    "Memory TCells",
    "Effector-Memory Transition TCells"
]

# --- Group total counts by cell type ---
# observed=False is spelled out: pandas deprecated the implicit default for categorical
# group keys, so pinning it keeps the table stable and avoids the FutureWarning.
counts_by_celltype = (
    adata_g_filtered.obs[[celltype_col, counts_col]]
    .groupby(celltype_col, observed=False)
    .agg(
        mean_counts=(counts_col, "mean"),
        median_counts=(counts_col, "median"),
        n_cells=(counts_col, "count")
    )
    .reindex(custom_order)  # rows follow the plot order; labels absent from the groups become NaN rows
)

print(counts_by_celltype)

# --- Violin plot (explicit figure/axes instead of the pyplot state machine) ---
fig, ax = plt.subplots(figsize=(10, 5))
sns.violinplot(
    data=adata_g_filtered.obs,
    x=celltype_col,
    y=counts_col,
    order=custom_order,
    palette="Set2",
    inner="box",  # adds median/IQR markers
    ax=ax,
)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
ax.set_ylabel("Total Counts")
ax.set_title("Total Counts per Cell by Cell Type")
fig.tight_layout()

# --- Save figure ---
# savefig does not create missing folders, so make sure the target directory exists first
os.makedirs(os.path.dirname(output_file), exist_ok=True)
fig.savefig(output_file, dpi=300, bbox_inches="tight")

plt.show()

In [None]:
### Create violin plot with total counts by cell-type (isoform-level)

import os

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Config ---
celltype_col = "gen_cell_type_reannotated"   # Column in .obs with cell type labels
counts_col = "total_counts_isoforms"         # Column in .obs with total counts
output_file = "Intermediate_Files/Paper_Figs/totalcounts_by_celltype_iso.png"

# Cell types in the order used for both the summary table and the x-axis
custom_order = [
    "BCells",
    "NK Cells",
    "Monocyte-derived",
    "Megakaryocytes",
    "Effector CD4 TCells",
    "Cytotoxic TCells",
    "Memory TCells",
    "Effector-Memory Transition TCells"
]

# --- Group total counts by cell type ---
# observed=False is spelled out: pandas deprecated the implicit default for categorical
# group keys, so pinning it keeps the table stable and avoids the FutureWarning.
counts_by_celltype = (
    adata_i_filtered.obs[[celltype_col, counts_col]]
    .groupby(celltype_col, observed=False)
    .agg(
        mean_counts=(counts_col, "mean"),
        median_counts=(counts_col, "median"),
        n_cells=(counts_col, "count")
    )
    .reindex(custom_order)  # rows follow the plot order; labels absent from the groups become NaN rows
)

print(counts_by_celltype)

# --- Violin plot (explicit figure/axes instead of the pyplot state machine) ---
fig, ax = plt.subplots(figsize=(10, 5))
sns.violinplot(
    data=adata_i_filtered.obs,
    x=celltype_col,
    y=counts_col,
    order=custom_order,
    palette="Set2",
    inner="box",  # adds median/IQR markers
    ax=ax,
)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
ax.set_ylabel("Total Counts")
ax.set_title("Total Counts per Cell by Cell Type")
fig.tight_layout()

# --- Save figure ---
# savefig does not create missing folders, so make sure the target directory exists first
os.makedirs(os.path.dirname(output_file), exist_ok=True)
fig.savefig(output_file, dpi=300, bbox_inches="tight")

plt.show()

In [None]:
### Create UMAPs with GZMB and NCAM1 expression summed across isoforms (isoform-level object) (Figure S5c-d)

import os

import numpy as np
import scanpy as sc

# --- I/O: figures from this cell are written under sc.settings.figdir ---
group_name = "NK-T (TCell Alone)"
marker_root = "Intermediate_Files/Clustering/Figures/Markers"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)  # no error if the folder already exists
sc.settings.figdir = output_dir

# --- helpers ---
def isoforms_for_gene(adata, gene):
    """Collect the entries of ``adata.var_names`` that belong to ``gene``.

    An entry is kept when it equals ``gene`` exactly or starts with ``gene``
    followed by a colon. Matches are returned as a list, in ``var_names`` order.
    """
    tagged = f"{gene}:"
    hits = []
    for name in adata.var_names:
        if name == gene or name.startswith(tagged):
            hits.append(name)
    return hits

def summed_gene_expr(adata, gene):
    """Sum, row by row, the ``X`` columns that ``isoforms_for_gene`` returns for ``gene``.

    Returns the row-wise sum flattened with ``ravel()``; raises ``ValueError``
    when no feature matches ``gene``.
    """
    matched = isoforms_for_gene(adata, gene)
    if not matched:
        raise ValueError(f"No isoforms found for {gene} (looked for '{gene}' or '{gene}:*').")
    values = adata[:, matched].X
    # Containers exposing toarray() (e.g. sparse matrices) are densified before summing
    if hasattr(values, "toarray"):
        values = values.toarray()
    row_totals = values.sum(axis=1)
    return row_totals.ravel()

# Plot each gene separately, coloured by its summed isoform expression
for gene in ["GZMB", "NCAM1"]:
    expr = summed_gene_expr(adata_TCell_i, gene)
    # Colour scale tops out at the 99th percentile of the non-zero values
    # (1.0 when no cell has a positive value)
    vmax = np.percentile(expr[expr > 0], 99) if (expr > 0).any() else 1.0

    # Stash the vector in a scratch obs column so sc.pl.umap can colour by it
    key = f"__{gene}_sum"
    adata_TCell_i.obs[key] = expr

    try:
        sc.pl.umap(
            adata_TCell_i,
            color=key,
            use_raw=False,
            title=f"{gene}",
            cmap="viridis",       # sequential; expression is non-negative
            vmin=0,
            vmax=vmax,
            frameon=True,
            show=False,
            save=f"_{gene}.pdf"   # writes to sc.settings.figdir
        )
    finally:
        # Always remove the scratch column, even if plotting or saving raised
        del adata_TCell_i.obs[key]

In [None]:
### Create UMAPs with GZMB and NCAM1 expression (gene-level object, one feature per gene) (Figure S5c-d)

import os

import numpy as np
import scanpy as sc

# --- I/O: figures from this cell are written under sc.settings.figdir ---
group_name = "NK-T (TCell Alone)"
marker_root = "Intermediate_Files/Clustering/Figures/Markers"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)  # no error if the folder already exists
sc.settings.figdir = output_dir

def find_gene_var(adata, gene):
    """Resolve ``gene`` to a single feature name in ``adata`` (no summing).

    Rules are tried in this order and the first one that yields a result wins;
    every comparison is case-insensitive:

    1. a var name equal to ``gene`` (the first such name),
    2. exactly one var name of the combined form ``GENE:...``,
    3. rows whose ``adata.var["gene_name"]`` equals ``gene``: the single match,
       or, among several, the one with the highest mean of its ``X`` column,
    4. several ``GENE:...`` var names: the one with the highest mean of its
       ``X`` column.

    Parameters
    ----------
    adata
        Object exposing ``var_names``, ``var`` and ``adata[:, names].X``.
    gene : str
        Gene symbol to look up.

    Returns
    -------
    str
        The selected entry of ``adata.var_names`` / ``adata.var.index``.

    Raises
    ------
    ValueError
        If none of the rules produces a match.
    """
    def _highest_mean(candidates):
        # Among several candidate features, keep the one with the largest column mean.
        X = adata[:, candidates].X
        if hasattr(X, "toarray"):
            X = X.toarray()
        means = np.asarray(X.mean(axis=0)).ravel()
        return candidates[int(np.argmax(means))]

    gene_upper = gene.upper()
    combined_prefix = f"{gene_upper}:"

    # Single pass over var_names: the first exact hit wins outright,
    # combined IDs like 'GZMB:...' are collected for rules 2 and 4.
    prefixed = []
    for vn in adata.var_names:
        if not isinstance(vn, str):
            continue
        vn_upper = vn.upper()
        if vn_upper == gene_upper:
            return vn
        if vn_upper.startswith(combined_prefix):
            prefixed.append(vn)

    # Rule 2: an unambiguous combined ID
    if len(prefixed) == 1:
        return prefixed[0]

    # Rule 3: fall back to the gene_name annotation, when present
    if "gene_name" in adata.var.columns:
        by_name = adata.var.index[adata.var["gene_name"].astype(str).str.upper() == gene_upper].tolist()
        if len(by_name) == 1:
            return by_name[0]
        if len(by_name) > 1:
            return _highest_mean(by_name)

    # Rule 4: several combined IDs -> choose the one with the highest mean
    if len(prefixed) > 1:
        return _highest_mean(prefixed)

    raise ValueError(f"Could not map gene '{gene}' to a single feature in var_names or var['gene_name'].")

# --- plot each gene separately (no summing) ---
for gene in ["GZMB", "NCAM1"]:
    var_key = find_gene_var(adata_TCell, gene)
    expr = adata_TCell[:, var_key].X
    # Flatten the selected column to 1-D, densifying first if it exposes toarray()
    expr = expr.toarray().ravel() if hasattr(expr, "toarray") else np.ravel(expr)

    # Colour scale tops out at the 99th percentile of the non-zero values
    # (1.0 when no cell has a positive value)
    vmax = np.percentile(expr[expr > 0], 99) if (expr > 0).any() else 1.0

    # Stash the vector in a scratch obs column so sc.pl.umap can colour by it
    key = f"__{gene}"
    adata_TCell.obs[key] = expr
    try:
        sc.pl.umap(
            adata_TCell,
            color=key,
            use_raw=False,
            title=gene,
            cmap="viridis",
            vmin=0, vmax=vmax,
            frameon=True, size=6,
            legend_loc=None,
            show=False,
            save=f"_{gene}.pdf"
        )
    finally:
        # Always remove the scratch column, even if plotting or saving raised
        del adata_TCell.obs[key]

In [None]:
# Move the re-annotated cell-type labels to the 'sub_cell_type' column in both objects.
# Guarded so the cell can be re-run: once the column has been moved, a second run is a
# no-op instead of a KeyError from .pop().
for _adata in (adata_i_filtered, adata_g_filtered):
    if "gen_cell_type_reannotated" in _adata.obs.columns:
        _adata.obs["sub_cell_type"] = _adata.obs.pop("gen_cell_type_reannotated")
    elif "sub_cell_type" not in _adata.obs.columns:
        # Neither the source nor the target column exists: fail loudly, as .pop() would
        raise KeyError("Neither 'gen_cell_type_reannotated' nor 'sub_cell_type' found in .obs")

In [None]:
# Write the isoform- and gene-level objects to disk under their "reannotated" file names
output_dir = 'Intermediate_Files/Clustering/'

# NOTE(review): both objects were loaded with read_h5ad and are written with .write(),
# yet the file names end in .h5mu - names kept unchanged; confirm the extension is intended.
iso_path = os.path.join(output_dir, "PBMC_iso_AutoZI_clustered_celltypes_reannotated_AutoZILatent.h5mu")
gene_path = os.path.join(output_dir, "PBMC_gene_AutoZI_clustered_celltypes_reannotated_AutoZILatent.h5mu")

adata_i_filtered.write(iso_path)
adata_g_filtered.write(gene_path)