In [None]:
import io
import tempfile
from anndata import AnnData
import muon as mu
import numpy as np
import requests
import os
import scanpy as sc
import scvi
import seaborn as sns
import torch
import pandas as pd
import sys
import scrublet as scr
import skimage
import pybiomart
from bioservices import BioMart
import rdata
import matplotlib.pyplot as plt
from adjustText import adjust_text
from scipy.stats import beta
import leidenalg
import igraph
import tqdm
import time
import gc
import polars as pl
import pyarrow

In [None]:
# Directory holding the clustered PBMC input files.
# NOTE: the marker-plotting cells further down reassign `output_dir` to figure folders,
# so this cell must be re-run before anything else reads inputs through `output_dir`.
output_dir = 'Intermediate_Files/Clustering_07282025'

from scanpy import read_h5ad
# Load the gene-level (adata_g_filtered) and isoform-level (adata_i_filtered) objects.
# NOTE(review): the files carry an .h5mu extension but are read with read_h5ad —
# confirm they were written as plain AnnData rather than MuData.
adata_g_filtered = read_h5ad(os.path.join(output_dir, "PBMC_gene_AutoZI_clustered_celltypes_AutoZILatent_07312025.h5mu"))
adata_i_filtered = read_h5ad(os.path.join(output_dir, "PBMC_iso_AutoZI_clustered_celltypes_AutoZILatent_07312025.h5mu"))

In [None]:
# Gene-level T-cell subset: cells whose "gen_cell_type" annotation equals "TCells".
# .copy() yields an independent AnnData, so later writes do not touch adata_g_filtered.
adata_TCell = adata_g_filtered[adata_g_filtered.obs["gen_cell_type"] == "TCells"].copy()

In [None]:
# Isoform-level T-cell subset: cells whose "gen_cell_type" annotation equals "TCells".
# .copy() yields an independent AnnData, so later writes do not touch adata_i_filtered.
adata_TCell_i = adata_i_filtered[adata_i_filtered.obs["gen_cell_type"] == "TCells"].copy()

In [None]:
# Select the var_names entries (combined_ID format) that begin with a marker prefix
def find_matching_genes(prefixes, gene_list):
    """Return the entries of ``gene_list`` that start with at least one prefix.

    Parameters:
    - prefixes: iterable of string prefixes (e.g. "CD4:"); a trailing ":" restricts
      the match to that exact symbol, a bare symbol matches any longer name too.
    - gene_list: iterable of string identifiers to search.

    Returns:
    - list of matching identifiers, in ``gene_list`` order, each listed once.
    """
    matches = []
    for gene in gene_list:
        for prefix in prefixes:
            if gene.startswith(prefix):
                matches.append(gene)
                break  # one matching prefix is enough; avoid duplicate entries
    return matches

# Marker prefix lists consumed by find_matching_genes (prefix match via str.startswith).
# A trailing ":" pins the match to that exact symbol in "SYMBOL:ENSG..." IDs.
# NOTE(review): several entries have no trailing ":" ("CD3Z", "NCAM1", "FOXP1", "SELL",
# "CCL5", "STAT5A", "GATA3", "IL2RA"), so they also match any ID that merely starts
# with that text — confirm whether this is intended.
#PTPRCAP = CD45
TCell_Markers = ["CD4:", "CD3D:", "CD3E:", "CD3G:", "CD3Z", "CD8A:", "CD8B:", "PTPRC:", "NCAM1"]
Naive_TCell = ["PTPRC:", "TCF7:", "FOXP1", "LEF1:", "IL2RA:", "CD27:", "IL7R:", "ITGAE:"]
Memory_TCell = ["CCR5:", "HLA-DRB1:", "HLA-DRA:", "SELL", "TCF7:", "IL7R:", "CCR7:"]
CD8_TCell = ["CD8A:", "CD8B:", "CXCR3:", "KLRB1:", "PTGDR2:", "GATA3:", "IRF4:", "RORC:", "CCL5"]
#Central_Memory_TCell = ["CCR5:", "IL7RA:", "EOMES:", "PRDM1:", "IL7R:", "SELL", "CCR7:"]
#Effector_Memory_TCell = ["CCR5:", "HLA-DRB1:", "HLA-DRA:", "ITGAL:", "GZMA:", "PRDM1:", "SELL"]
Reg_TCell = ["FOXP3:", "IL2RA:", "CTLA4:", "STAT5A"]
#Th1_TCell = ["CXCR3:", "IFNG:", "TNF:", "STAT4:"]
CD4_Effector = ["CD4:", "CXCR3:", "TNF:", "STAT4:", "IL17A:", "IL13:", "IL25:", "AHR:", "FOXO4:", "GATA3", "IL2RA"]
# Stem_Mem repeats "IL2RA:" and lists STAT5A both with and without ":"; the repeats are
# harmless because find_matching_genes reports each matching ID once.
Stem_Mem = ["CD4:", "GATA3:", "AHR:", "IL2RA:", "CCR7:", "IL7R:", "TCF7:", "CX3CR1:", "PDCD1:", "TOX:", "STAT5A:", 'CD27:', 'LEF1:', 
            "FOXP3:", "IL2RA:", "CTLA4:", "STAT5A"]

# Marker sets in the order their matches are numbered (1-7), paired with the label
# used in the printed summary.
_marker_sets = [
    ("T-Cells", TCell_Markers),
    ("Naive T-Cells", Naive_TCell),
    ("Memory T-Cells", Memory_TCell),
    ("CD8+ T-Cells", CD8_TCell),
    ("Regulatory T-Cells", Reg_TCell),
    ("Helper T-Cells", CD4_Effector),
    ("Stem-like memory T-Cells", Stem_Mem),
]

# Gene-level matches: search the full gene-level var_names for each marker set
_gene_hits = []
for _label, _prefixes in _marker_sets:
    _hits = find_matching_genes(_prefixes, adata_g_filtered.var_names)
    print(f"Matched {len(_hits)} gene-level IDs for {_label}: {_hits}")
    _gene_hits.append(_hits)
(TCell_genes_1, TCell_genes_2, TCell_genes_3, TCell_genes_4,
 TCell_genes_5, TCell_genes_6, TCell_genes_7) = _gene_hits

# Isoform-level matches: same marker sets against the isoform-level var_names
_iso_hits = []
for _label, _prefixes in _marker_sets:
    _hits = find_matching_genes(_prefixes, adata_i_filtered.var_names)
    print(f"Matched {len(_hits)} isoform-level IDs for {_label}: {_hits}")
    _iso_hits.append(_hits)
(TCell_iso_1, TCell_iso_2, TCell_iso_3, TCell_iso_4,
 TCell_iso_5, TCell_iso_6, TCell_iso_7) = _iso_hits

In [None]:
# Run PCA on the T-cell subset (stored on the object; the neighbour graph below does not use it).
# NOTE(review): assumes .X is log-transformed at this point — confirm against the upstream notebook.
sc.pp.pca(adata_TCell)

# Build the neighbour graph on the 'X_AutoZI' representation, then embed with UMAP
sc.pp.neighbors(adata_TCell, n_neighbors = 20, use_rep= 'X_AutoZI')
sc.tl.umap(adata_TCell, min_dist=0.3)

# Colour by batch to inspect batch structure in the gene-level embedding
sc.pl.umap(adata_TCell, color="batch", title="Gene UMAP by batch")

In [None]:
# Run PCA on the isoform-level T-cell subset (stored on the object; the neighbour graph below does not use it).
# NOTE(review): assumes .X is log-transformed at this point — confirm against the upstream notebook.
sc.pp.pca(adata_TCell_i)

# Build the neighbour graph on the 'X_AutoZI' representation, then embed with UMAP
sc.pp.neighbors(adata_TCell_i, n_neighbors = 20, use_rep= 'X_AutoZI')
sc.tl.umap(adata_TCell_i, min_dist=0.3)

# Colour by batch; the title names the isoform-level object being plotted
# (it previously read "Gene UMAP by batch", copied from the gene-level cell).
sc.pl.umap(adata_TCell_i, color="batch", title="Isoform UMAP by batch")

In [None]:
# Leiden clustering over a sweep of resolutions, on both the gene- and isoform-level
# T-cell objects. Each result is stored in .obs under '<resolution>_log_AutoZI'.
resolutions = [
    0.04, 0.06, 0.1, 0.14, 0.16,
    0.2, 0.24, 0.26, 0.3, 0.34,
    0.36, 0.4, 0.44, 0.46, 0.5,
]
for res in resolutions:
    cluster_key = f'{res}_log_AutoZI'
    # Gene-level object first, then isoform-level, with identical settings
    for tcell_adata in (adata_TCell, adata_TCell_i):
        sc.tl.leiden(
            tcell_adata,
            resolution=res,
            key_added=cluster_key,
            flavor="igraph",
            n_iterations=2,
        )

In [None]:
def relabel_clusters_by_size(adata, cluster_key):
    """Renumber the clusters in ``adata.obs[cluster_key]`` by descending size.

    The largest cluster becomes 0, the next largest 1, and so on. The column is
    overwritten in place with integer labels stored as a categorical.

    Parameters:
    - adata: object exposing a pandas DataFrame as ``.obs`` (an AnnData in this notebook)
    - cluster_key: name of the ``.obs`` column holding the cluster labels

    Returns:
    - the same ``adata`` object that was passed in (modified in place, not a copy)
    """
    original = adata.obs[cluster_key]

    # Number of cells per cluster label
    counts = original.value_counts()

    # Positions that order the counts from largest to smallest
    order = np.argsort(-counts.values)
    ranked = counts.index[order]

    # old label -> rank (0 = largest cluster)
    rank_of = dict(zip(ranked, range(len(ranked))))

    adata.obs[cluster_key] = original.map(rank_of).astype('category')
    return adata

In [None]:
# Renumber every Leiden result so that cluster 0 is the largest at each resolution.
# `resolutions` now holds the .obs column names (strings) rather than the float values.
resolutions = [
    f"{value}_log_AutoZI"
    for value in (0.04, 0.06, 0.1, 0.14, 0.16, 0.2, 0.24, 0.26,
                  0.3, 0.34, 0.36, 0.4, 0.44, 0.46, 0.5)
]
for cluster_key in resolutions:
    # relabel_clusters_by_size edits its argument in place and returns it, so the
    # *_pbmc names refer to the same objects as adata_TCell / adata_TCell_i.
    adata_TCell_pbmc, adata_TCell_i_pbmc = (
        relabel_clusters_by_size(adata_TCell, cluster_key),
        relabel_clusters_by_size(adata_TCell_i, cluster_key),
    )

In [None]:
# Shared implementation behind the gene- and isoform-level cluster UMAP plotters
def _plot_cluster_umaps(adata, resolutions, use_rep_key, level_label):
    """Draw one cluster-labelled UMAP per resolution.

    Parameters:
    - adata: AnnData object with a UMAP embedding and the cluster columns in .obs
    - resolutions: iterable of resolution values; each is combined with
      ``use_rep_key`` into the .obs column name '<res>_<use_rep_key>'
    - use_rep_key: suffix of the cluster column names (e.g. 'log_AutoZI')
    - level_label: text inserted in the plot title ('Gene-Level' / 'Isoform-Level')

    Raises:
    - ValueError: if ``use_rep_key`` is None, which would otherwise build the
      non-existent column name '<res>_None'.
    """
    if use_rep_key is None:
        raise ValueError(
            "use_rep_key is required: it is the suffix of the cluster columns in "
            "adata.obs (e.g. 'log_AutoZI' for '0.26_log_AutoZI')."
        )

    print(f"Using representation: {use_rep_key}")  # Print assigned representation

    vibrant_palette = plt.get_cmap('tab20').colors  # Set color palette

    for res in resolutions:
        # Plot UMAP with cluster labels drawn on the embedding
        sc.pl.umap(
            adata,
            color=f'{res}_{use_rep_key}',
            title=f'UMAP with Clusters ({level_label}, {use_rep_key}, Res={res})',
            frameon=True,
            palette=vibrant_palette,
            legend_loc='on data',
            legend_fontsize=10,
            legend_fontoutline=2,
        )


# Function to plot gene-level UMAPs with cluster labels
def plot_umap_with_labels_g(adata, resolutions, use_rep_key=None):
    """
    Plots gene-level UMAPs with cluster labels and prints the key suffix in use.

    Parameters:
    - adata: AnnData object
    - resolutions: List of resolution values to plot
    - use_rep_key: Suffix of the cluster columns in adata.obs (column name is
      '<res>_<use_rep_key>'). Must be provided; None raises ValueError.
    """
    _plot_cluster_umaps(adata, resolutions, use_rep_key, "Gene-Level")


# Function to plot isoform-level UMAPs with cluster labels
def plot_umap_with_labels_i(adata, resolutions, use_rep_key=None):
    """
    Plots isoform-level UMAPs with cluster labels and prints the key suffix in use.

    Parameters:
    - adata: AnnData object
    - resolutions: List of resolution values to plot
    - use_rep_key: Suffix of the cluster columns in adata.obs (column name is
      '<res>_<use_rep_key>'). Must be provided; None raises ValueError.
    """
    _plot_cluster_umaps(adata, resolutions, use_rep_key, "Isoform-Level")

In [None]:
# Plot the gene-level clustering for every resolution clustered in this notebook.
# Only 0.04-0.5 get a '<res>_log_AutoZI' column from the Leiden loop above; requesting
# higher resolutions either raises a KeyError or plots labels inherited from the
# loaded file that were not computed on the T-cell subset.
plot_umap_with_labels_g(adata_TCell_pbmc, resolutions=[
    0.04, 0.06, 0.1, 0.14, 0.16, 0.2, 0.24, 0.26, 0.3, 0.34, 0.36,
    0.4, 0.44, 0.46, 0.5
], use_rep_key= 'log_AutoZI')

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import numpy as np
import os

# Output directory for the per-gene marker UMAPs; figures passed `save=` are written
# under sc.settings.figdir.
marker_root = "Intermediate_Files/Clustering_07282025/Figures/Markers"
group_name = "General TCell (TCell Alone)"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

print("General T-Cell Markers")

# One UMAP per matched marker gene
for gene in TCell_genes_1:
    safe_gene_name = gene.replace(":", "_")  # colon-free form used in the file name

    # Expression of this gene across the T-cell subset, as a dense 1-D vector
    expr = adata_TCell[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only
    expr_nonzero = expr[expr > 0]

    # 99th percentile of non-zero expression; 1.0 when no cell expresses the gene.
    # Computed once: recomputing it without the guard would hand np.percentile an
    # empty array and discard the fallback.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric limits so that zero sits at the midpoint of the diverging colormap
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # Carries vmin, vcenter and vmax
    )

In [None]:
# Output directory for the per-gene marker UMAPs; figures passed `save=` are written
# under sc.settings.figdir.
marker_root = "Intermediate_Files/Clustering_07282025/Figures/Markers"
group_name = "Naive TCell (TCell Alone)"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

print("Naive T-Cell Markers")

# One UMAP per matched marker gene
for gene in TCell_genes_2:
    safe_gene_name = gene.replace(":", "_")  # colon-free form used in the file name

    # Expression of this gene across the T-cell subset, as a dense 1-D vector
    expr = adata_TCell[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only
    expr_nonzero = expr[expr > 0]

    # 99th percentile of non-zero expression; 1.0 when no cell expresses the gene.
    # Computed once: recomputing it without the guard would hand np.percentile an
    # empty array and discard the fallback.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric limits so that zero sits at the midpoint of the diverging colormap
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centred by `norm`
        # Leading "_" matches the other marker loops, which save with the same prefix
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # Carries vmin, vcenter and vmax
    )

In [None]:
# Marker genes summed into a single per-cell score
genes = ['CD27:ENSG00000139193', "ITGAE:ENSG00000083457", "LEF1:ENSG00000138795"]

# Sum the marker expression per cell; densify only when .X is sparse
marker_expr = adata_TCell[:, genes].X
if hasattr(marker_expr, "toarray"):
    marker_expr = marker_expr.toarray()
adata_TCell.obs['Naive_Markers_Combined'] = np.asarray(marker_expr).sum(axis=1)

# Symmetric colour limits; the 100th percentile is the maximum score
score_max = np.percentile(adata_TCell.obs['Naive_Markers_Combined'], 100)

# With show=False scanpy hands back the Axes so the figure can be edited before saving
ax = sc.pl.umap(
    adata_TCell,
    color='Naive_Markers_Combined',
    cmap= "coolwarm",
    frameon=True,
    title="Naive TCell Marker Combined Expression",
    vmin=-score_max,
    vmax=score_max,
    show = False
)

# Adjust figure size
fig = ax.figure
fig.set_size_inches(4, 3)

# Final title (replaces the one passed to sc.pl.umap) and tick-free axes
ax.set_title("Naive TCell Aggregated Gene Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Save final figure; create the target folder first so savefig cannot fail on a missing path
umap_pdf = "Intermediate_Files/Clustering_07282025/Figures/UMAP/umap_Naive_TCellOnly_combined_expression_gene.pdf"
os.makedirs(os.path.dirname(umap_pdf), exist_ok=True)
fig.tight_layout()
fig.savefig(umap_pdf, dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
print("Memory T-Cell Markers")

# Output directory for the per-gene marker UMAPs; figures passed `save=` are written
# under sc.settings.figdir.
marker_root = "Intermediate_Files/Clustering_07282025/Figures/Markers"
group_name = "Memory TCell (TCell Alone)"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

# One UMAP per matched marker gene
for gene in TCell_genes_3:
    safe_gene_name = gene.replace(":", "_")  # colon-free form used in the file name

    # Expression of this gene across the T-cell subset, as a dense 1-D vector
    expr = adata_TCell[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only
    expr_nonzero = expr[expr > 0]

    # 99th percentile of non-zero expression; 1.0 when no cell expresses the gene.
    # Computed once: recomputing it without the guard would hand np.percentile an
    # empty array and discard the fallback.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric limits so that zero sits at the midpoint of the diverging colormap
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centred by `norm`
        # Leading "_" matches the other marker loops, which save with the same prefix
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # Carries vmin, vcenter and vmax
    )

In [None]:
# Marker genes summed into a single per-cell score
genes = ['TCF7:ENSG00000081059', 'CCR7:ENSG00000126353', 'SELL:ENSG00000188404']

# Sum the marker expression per cell; densify only when .X is sparse
marker_expr = adata_TCell[:, genes].X
if hasattr(marker_expr, "toarray"):
    marker_expr = marker_expr.toarray()
adata_TCell.obs['TMem_Markers_Combined'] = np.asarray(marker_expr).sum(axis=1)

# Symmetric colour limits; the 100th percentile is the maximum score
score_max = np.percentile(adata_TCell.obs['TMem_Markers_Combined'], 100)

# With show=False scanpy hands back the Axes so the figure can be edited before saving
ax = sc.pl.umap(
    adata_TCell,
    color='TMem_Markers_Combined',
    cmap= "coolwarm",
    frameon=True,
    title="Memory TCell Marker Combined Expression",
    vmin=-score_max,
    vmax=score_max,
    show = False
)

# Adjust figure size
fig = ax.figure
fig.set_size_inches(4, 3)

# Final title (replaces the one passed to sc.pl.umap) and tick-free axes
ax.set_title("Memory TCell Aggregated Gene Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Save final figure; create the target folder first so savefig cannot fail on a missing path
umap_pdf = "Intermediate_Files/Clustering_07282025/Figures/UMAP/umap_TMem_TCellOnly_combined_expression_gene.pdf"
os.makedirs(os.path.dirname(umap_pdf), exist_ok=True)
fig.tight_layout()
fig.savefig(umap_pdf, dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
print("CD8+ T-Cell Markers")

# Output directory for the per-gene marker UMAPs; figures passed `save=` are written
# under sc.settings.figdir.
marker_root = "Intermediate_Files/Clustering_07282025/Figures/Markers"
group_name = "CD8 TCell (TCell Alone)"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir


# One UMAP per matched marker gene
for gene in TCell_genes_4:
    safe_gene_name = gene.replace(":", "_")  # colon-free form used in the file name

    # Expression of this gene across the T-cell subset, as a dense 1-D vector
    expr = adata_TCell[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only
    expr_nonzero = expr[expr > 0]

    # 99th percentile of non-zero expression; 1.0 when no cell expresses the gene.
    # Computed once: recomputing it without the guard would hand np.percentile an
    # empty array and discard the fallback.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric limits so that zero sits at the midpoint of the diverging colormap
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # Carries vmin, vcenter and vmax
    )

In [None]:
# Marker genes summed into a single per-cell score
genes = ['GATA3:ENSG00000107485', 'KLRB1:ENSG00000111796', 'CD8A:ENSG00000153563',
        'CD8B:ENSG00000172116', 'CCL5:ENSG00000271503']

# Sum the marker expression per cell; densify only when .X is sparse
marker_expr = adata_TCell[:, genes].X
if hasattr(marker_expr, "toarray"):
    marker_expr = marker_expr.toarray()
adata_TCell.obs['CD8_TCell_Markers_Combined'] = np.asarray(marker_expr).sum(axis=1)

# Symmetric colour limits; the 100th percentile is the maximum score
score_max = np.percentile(adata_TCell.obs['CD8_TCell_Markers_Combined'], 100)

# With show=False scanpy hands back the Axes so the figure can be edited before saving
ax = sc.pl.umap(
    adata_TCell,
    color='CD8_TCell_Markers_Combined',
    cmap= "coolwarm",
    frameon=True,
    title="Cytotoxic TCell Marker Combined Expression",
    vmin=-score_max,
    vmax=score_max,
    show = False
)

# Adjust figure size
fig = ax.figure
fig.set_size_inches(4, 3)

# Final title (replaces the one passed to sc.pl.umap) and tick-free axes
ax.set_title("Cytotoxic TCell Aggregate Gene Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Save final figure; create the target folder first so savefig cannot fail on a missing path
umap_pdf = "Intermediate_Files/Clustering_07282025/Figures/UMAP/umap_CytotoxicTCell_TCellsOnly_combined_expression_gene_aggregate.pdf"
os.makedirs(os.path.dirname(umap_pdf), exist_ok=True)
fig.tight_layout()
fig.savefig(umap_pdf, dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
print("Regulatory T-Cell Markers")

# Output directory for the per-gene marker UMAPs; figures passed `save=` are written
# under sc.settings.figdir.
marker_root = "Intermediate_Files/Clustering_07282025/Figures/Markers"
group_name = "Regulatory TCell"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir


# One UMAP per matched marker gene
for gene in TCell_genes_5:
    safe_gene_name = gene.replace(":", "_")  # colon-free form used in the file name

    # Expression of this gene across the T-cell subset, as a dense 1-D vector
    expr = adata_TCell[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only
    expr_nonzero = expr[expr > 0]

    # 99th percentile of non-zero expression; 1.0 when no cell expresses the gene.
    # Computed once: recomputing it without the guard would hand np.percentile an
    # empty array and discard the fallback.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric limits so that zero sits at the midpoint of the diverging colormap
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # Carries vmin, vcenter and vmax
    )

In [None]:
# Keep the matched regulatory-marker IDs that contain one of the four symbols
Reg_filtered = [i for i in TCell_genes_5 if any(gene in i for gene in ["FOXP3", "STAT5A", "IL2RA", "CTLA4"])]

# Safety: keep only the markers present in adata_TCell.var_names (the object indexed below)
genes = [g for g in Reg_filtered if g in adata_TCell.var_names]

# Sum the marker expression per cell; densify only when .X is sparse
X = adata_TCell[:, genes].X
if hasattr(X, "toarray"):
    X = X.toarray()

adata_TCell.obs['Reg_TCell_Gene_Markers_Combined'] = X.sum(axis=1)

# Symmetric colour limits; the 100th percentile is the maximum score
score_max = np.percentile(adata_TCell.obs['Reg_TCell_Gene_Markers_Combined'], 100)

# With show=False scanpy hands back the Axes so the figure can be edited before saving
ax = sc.pl.umap(
    adata_TCell,
    color='Reg_TCell_Gene_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Regulatory TCell Gene Marker Expression",
    vmin=-score_max,
    vmax=score_max,
    show=False
)

# Adjust figure size
fig = ax.figure
fig.set_size_inches(4, 3)

# Final title (replaces the one passed to sc.pl.umap) and tick-free axes
ax.set_title("Total Regulatory TCell Aggregate Gene Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Save figure; create the target folder first so savefig cannot fail on a missing path
umap_pdf = "Intermediate_Files/Clustering_07282025/Figures/UMAP/umap_RegulatoryTCell_TCellsOnly_combined_expression_gene_aggregate.pdf"
os.makedirs(os.path.dirname(umap_pdf), exist_ok=True)
fig.tight_layout()
fig.savefig(umap_pdf, dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
print("Helper T-Cell Markers")
# Output directory for the per-gene marker UMAPs; figures passed `save=` are written
# under sc.settings.figdir.
marker_root = "Intermediate_Files/Clustering_07282025/Figures/Markers"
group_name = "Helper (CD4) TCell(TCell Alone)"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir


# One UMAP per matched marker gene
for gene in TCell_genes_6:
    safe_gene_name = gene.replace(":", "_")  # colon-free form used in the file name

    # Expression of this gene across the T-cell subset, as a dense 1-D vector
    expr = adata_TCell[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only
    expr_nonzero = expr[expr > 0]

    # 99th percentile of non-zero expression; 1.0 when no cell expresses the gene.
    # Computed once: recomputing it without the guard would hand np.percentile an
    # empty array and discard the fallback.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric limits so that zero sits at the midpoint of the diverging colormap
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # Carries vmin, vcenter and vmax
    )

In [None]:
# Marker genes summed into a single per-cell score
genes = ['GATA3:ENSG00000107485', 'CD4:ENSG00000010610', 'AHR:ENSG00000106546',
        'IL2RA:ENSG00000134460', 'TNF:ENSG00000232810']

# Sum the marker expression per cell; densify only when .X is sparse
marker_expr = adata_TCell[:, genes].X
if hasattr(marker_expr, "toarray"):
    marker_expr = marker_expr.toarray()
adata_TCell.obs['CD4_Effector_TCell_Markers_Combined'] = np.asarray(marker_expr).sum(axis=1)

# Symmetric colour limits; the 100th percentile is the maximum score
score_max = np.percentile(adata_TCell.obs['CD4_Effector_TCell_Markers_Combined'], 100)

# With show=False scanpy hands back the Axes so the figure can be edited before saving
ax = sc.pl.umap(
    adata_TCell,
    color='CD4_Effector_TCell_Markers_Combined',
    cmap= "coolwarm",
    frameon=True,
    title="CD4+ Effector TCell Marker Combined Expression",
    vmin=-score_max,
    vmax=score_max,
    show = False
)

# Adjust figure size
fig = ax.figure
fig.set_size_inches(4, 3)

# Final title (replaces the one passed to sc.pl.umap) and tick-free axes
ax.set_title("CD4+ Effector TCell Aggregate Gene Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Save final figure; create the target folder first so savefig cannot fail on a missing path
umap_pdf = "Intermediate_Files/Clustering_07282025/Figures/UMAP/umap_CD4EffectorTCell_TCellsOnlycombined_expression_gene_aggregate.pdf"
os.makedirs(os.path.dirname(umap_pdf), exist_ok=True)
fig.tight_layout()
fig.savefig(umap_pdf, dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
# Marker genes summed into a single per-cell score, this time on the full
# gene-level object (adata_g_filtered) rather than the T-cell subset.
genes = ['CD3D:ENSG00000167286', 'CD3E:ENSG00000198851', 'CD3G:ENSG00000160654',
         'GATA3:ENSG00000107485', 'CD4:ENSG00000010610', 'AHR:ENSG00000106546',
        'IL2RA:ENSG00000134460']

# Sum the marker expression per cell; densify only when .X is sparse
marker_expr = adata_g_filtered[:, genes].X
if hasattr(marker_expr, "toarray"):
    marker_expr = marker_expr.toarray()
adata_g_filtered.obs['CD4_Effector_TCell_Markers_Combined'] = np.asarray(marker_expr).sum(axis=1)

# Symmetric colour limits; the 100th percentile is the maximum score
score_max = np.percentile(adata_g_filtered.obs['CD4_Effector_TCell_Markers_Combined'], 100)

# NOTE(review): no UMAP is computed for adata_g_filtered in this notebook, so this
# presumably relies on an embedding stored in the loaded file — confirm.
# With show=False scanpy hands back the Axes so the figure can be edited before saving
ax = sc.pl.umap(
    adata_g_filtered,
    color='CD4_Effector_TCell_Markers_Combined',
    cmap= "coolwarm",
    frameon=True,
    title="CD4+ Effector TCell Marker Combined Expression",
    vmin=-score_max,
    vmax=score_max,
    show = False
)

# Adjust figure size
fig = ax.figure
fig.set_size_inches(4, 2.4)

# Final title (replaces the one passed to sc.pl.umap) and tick-free axes
ax.set_title("CD4+ Effector TCell Aggregate Gene Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Save final figure; create the target folder first so savefig cannot fail on a missing path
umap_pdf = "Intermediate_Files/Clustering_07282025/Figures/UMAP/umap_CD4EffectorTCell_combined_expression_gene_aggregate.pdf"
os.makedirs(os.path.dirname(umap_pdf), exist_ok=True)
fig.tight_layout()
fig.savefig(umap_pdf, dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
print("Effector-Memory Transition TCell Markers")
# Output directory for the per-gene marker UMAPs; figures passed `save=` are written
# under sc.settings.figdir.
marker_root = "Intermediate_Files/Clustering_07282025/Figures/Markers"
group_name = "Effector-Memory Transition TCell (TCell Alone)"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir


# One UMAP per matched marker gene
for gene in TCell_genes_7:
    safe_gene_name = gene.replace(":", "_")  # colon-free form used in the file name

    # Expression of this gene across the T-cell subset, as a dense 1-D vector
    expr = adata_TCell[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only
    expr_nonzero = expr[expr > 0]

    # 99th percentile of non-zero expression; 1.0 when no cell expresses the gene.
    # Computed once: recomputing it without the guard would hand np.percentile an
    # empty array and discard the fallback.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric limits so that zero sits at the midpoint of the diverging colormap
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # Carries vmin, vcenter and vmax
    )

In [None]:
# Marker genes summed into a single per-cell score
genes = ['GATA3:ENSG00000107485',
    'CCR7:ENSG00000126353',
    'TCF7:ENSG00000081059',
    'IL2RA:ENSG00000134460',
    'CTLA4:ENSG00000163599',
    'CD27:ENSG00000139193',
    'ITGAE:ENSG00000083457',
    'LEF1:ENSG00000138795'
]

# Sum the marker expression per cell; densify only when .X is sparse
marker_expr = adata_TCell[:, genes].X
if hasattr(marker_expr, "toarray"):
    marker_expr = marker_expr.toarray()
adata_TCell.obs['Transition_TCell_Markers_Combined'] = np.asarray(marker_expr).sum(axis=1)

# Symmetric colour limits; the 100th percentile is the maximum score
score_max = np.percentile(adata_TCell.obs['Transition_TCell_Markers_Combined'], 100)

# With show=False scanpy hands back the Axes so the figure can be edited before saving
ax = sc.pl.umap(
    adata_TCell,
    color='Transition_TCell_Markers_Combined',
    cmap= "coolwarm",
    frameon=True,
    title="Effector-Memory Transition TCell Marker Combined Expression",
    vmin=-score_max,
    vmax=score_max,
    show = False
)

# Adjust figure size
fig = ax.figure
fig.set_size_inches(4, 2.4)

# Final title (replaces the one passed to sc.pl.umap) and tick-free axes
ax.set_title("Effector-Memory Transition TCell Aggregate Gene Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Save final figure; create the target folder first so savefig cannot fail on a missing path.
# NOTE(review): the other aggregate figures are written to Figures/UMAP/umap_*.pdf, while
# this one lands directly in Figures/ as UMAP_*.pdf — confirm the path is intended.
umap_pdf = "Intermediate_Files/Clustering_07282025/Figures/UMAP_TransitionTCell_combined_expression_gene_aggregate.pdf"
os.makedirs(os.path.dirname(umap_pdf), exist_ok=True)
fig.tight_layout()
fig.savefig(umap_pdf, dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
# Marker prefixes for the "Transition" cell group; every entry ends with ":" so each
# one matches only that exact symbol in "SYMBOL:ENSG..." IDs.
Transition = ["STAT4:", "IFNG:", "FOXO4:", "CD4:", "TCF7:", "IL7R:", "CTLA4:"]

# Get gene matches for gene-level data (searched in the full object's var_names)
Transition_genes = find_matching_genes(Transition, adata_g_filtered.var_names)
print(f"Matched {len(Transition_genes)} gene-level IDs for Transition Cells: {Transition_genes}")

In [None]:
# Output directory for the per-gene marker UMAPs; figures passed `save=` are written
# under sc.settings.figdir.
marker_root = "Intermediate_Files/Clustering_07282025/Figures/Markers/Cell_type_Gene"
group_name = "TransitionCells"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

print("Transition Cell Markers")

# One UMAP per matched marker gene
for gene in Transition_genes:
    safe_gene_name = gene.replace(":", "_")  # colon-free form used in the file name

    # Expression vector taken from the object that is plotted (adata_TCell), so the
    # colour scale reflects the cells on the figure. It was previously read from
    # adata_g_filtered, i.e. scaled on all cells while plotting only T cells.
    expr = adata_TCell[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only
    expr_nonzero = expr[expr > 0]

    # 100th percentile (the maximum) of non-zero expression; 1.0 when no cell expresses
    # the gene. Computed once: recomputing it without the guard would hand
    # np.percentile an empty array and discard the fallback.
    vmax_val = np.percentile(expr_nonzero,100) if len(expr_nonzero) > 0 else 1.0

    # Cells above the maximum: always 0 here, kept so the log line matches the other marker loops
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 100th percentile (vmax = {vmax_val:.2f})")

    # Symmetric limits so that zero sits at the midpoint of the diverging colormap
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # Carries vmin, vcenter and vmax
    )

In [None]:
# Manual annotation of the size-ranked clusters at resolution 0.26
# (labels are the relabelled cluster numbers, "0" being the largest cluster).
# Change the mapping to whatever is appropriate for the dataset.

tcell_cluster_mapping = {
    "0": "Memory TCells",
    "1": "Effector CD4 TCells",
    "2": "Cytotoxic TCells",
    "3": "Effector-Memory Transition TCells",
}

# Create new cell type annotations; clusters missing from the mapping become NaN
adata_TCell_pbmc.obs["TCell_subtype"] = adata_TCell_pbmc.obs["0.26_log_AutoZI"].astype(str).map(tcell_cluster_mapping)

# Check assignments; dropna=False so cells from unmapped clusters show up as a NaN row
# instead of silently disappearing from the summary
adata_TCell_pbmc.obs["TCell_subtype"].value_counts(dropna=False)

In [None]:
import matplotlib.pyplot as plt
import scanpy as sc

# Legend entries, in the order they should appear in the figure
legend_order = [
    "Memory TCells",
    "Effector CD4 TCells",
    "Cytotoxic TCells",
    "Effector-Memory Transition TCells",
]

# Draw the subtype UMAP; show=False keeps the figure open for the edits below
sc.pl.umap(
    adata_TCell_pbmc,
    color="TCell_subtype",
    title="",  # no title for the publication version
    frameon=True,
    palette=plt.get_cmap("tab20").colors,
    legend_loc="lower right",
    legend_fontsize=10,
    legend_fontoutline=1,
    show=False,
)

# Grab the figure and axes scanpy just drew on, then resize
fig = plt.gcf()
ax = plt.gca()
fig.set_size_inches(5, 4)

# Look up the existing handle for each label and keep only the labels listed
# in legend_order, in that order
handles, labels = ax.get_legend_handles_labels()
label_handle_dict = dict(zip(labels, handles))
ordered_labels = [name for name in legend_order if name in label_handle_dict]
ordered_handles = [label_handle_dict[name] for name in ordered_labels]

# Swap scanpy's legend for the reordered one, centred below the axes
ax.get_legend().remove()
ax.legend(
    ordered_handles,
    ordered_labels,
    loc="lower center",
    bbox_to_anchor=(0.48, -0.3),
    fontsize=10,
    frameon=True,
    ncol=2,
)

plt.tight_layout()

# Save, then display
plt.savefig(
    "Intermediate_Files/Clustering_07282025/Figures/UMAP/UMAP_TCell_subtypes.pdf",
    dpi=600,
    transparent=True,
    bbox_inches="tight",
)

plt.show()

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import scanpy as sc

# Desired legend order
legend_order = [
    "Memory TCells",
    "Effector CD4 TCells",
    "Cytotoxic TCells",
    "Effector-Memory Transition TCells"
]

OBS_KEY = "TCell_subtype"

# Categories that actually label at least one cell (computed once)
subtype_labels = adata_TCell.obs[OBS_KEY].astype("category").cat.remove_unused_categories()
observed = list(subtype_labels.cat.categories)

# Keep only categories present and reorder
present = [c for c in legend_order if c in observed]
if not present:
    raise ValueError(f"No categories from legend_order found in adata.obs['{OBS_KEY}'].")

# reorder_categories requires exactly the existing set of categories, so any
# observed category missing from legend_order is appended after the preferred
# ones instead of making the call raise.
present += [c for c in observed if c not in legend_order]

adata_TCell.obs[OBS_KEY] = subtype_labels.cat.reorder_categories(present, ordered=True)

# Use full tab20 palette
palette = plt.get_cmap("tab20").colors

# Plot UMAP without Scanpy legend (a manual legend is built below)
sc.pl.umap(
    adata_TCell,
    color=OBS_KEY,
    title="",
    frameon=True,
    palette=palette,
    legend_loc=None,   # disable Scanpy's legend
    show=False,
    size=6
)

# Match publication dimensions
fig = plt.gcf()
fig.set_size_inches(6, 4)
ax = plt.gca()

# Axis labels, no ticks, no top/right spines
ax.set_xlabel("UMAP1", fontsize=9)
ax.set_ylabel("UMAP2", fontsize=9)
ax.set_xticks([]); ax.set_yticks([])
for spine in ["top", "right"]:
    ax.spines[spine].set_visible(False)

# Manual legend patches: the i-th category gets the i-th palette colour.
# NOTE(review): assumes scanpy assigns palette colours to categories in
# category order - confirm against the rendered figure.
handles = [
    mpatches.Patch(facecolor=palette[i], edgecolor="none", label=present[i])
    for i in range(len(present))
]

# Adjust layout to leave space at the bottom for legend
fig.subplots_adjust(bottom=0.3)

legend = fig.legend(
    handles, present,
    loc="lower center",
    bbox_to_anchor=(0.5, 0.1),   # centered below the axes
    fontsize=9,
    frameon=True,                # box around legend
    fancybox=False,              # square corners
    ncol=2,
    handlelength=1.0,
    handletextpad=0.5,
    columnspacing=1.1,
    borderpad=0.4
)
legend.get_frame().set_edgecolor("black")
legend.get_frame().set_linewidth(0.8)

# Save (create the target folder first so savefig cannot fail on a missing path)
out_base = "Intermediate_Files/Paper_Figs/UMAP/UMAP_TCell_subtypes"
os.makedirs(os.path.dirname(out_base), exist_ok=True)
fig.savefig(f"{out_base}.pdf", dpi=600, transparent=True, bbox_inches="tight")
fig.savefig(f"{out_base}.png", dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
from pandas.api.types import CategoricalDtype

# Start the reannotated column as a copy of the general cell-type labels.
# An existing column is kept as it is.
if "gen_cell_type_reannotated" not in adata_g_filtered.obs.columns:
    adata_g_filtered.obs["gen_cell_type_reannotated"] = adata_g_filtered.obs["gen_cell_type"]

# Ensure it's categorical
if not isinstance(adata_g_filtered.obs["gen_cell_type_reannotated"].dtype, CategoricalDtype):
    adata_g_filtered.obs["gen_cell_type_reannotated"] = adata_g_filtered.obs["gen_cell_type_reannotated"].astype("category")

# Subtype names that occur in TCell_subtype
new_categories = adata_TCell_pbmc.obs["TCell_subtype"].astype("category").cat.categories

# Add only truly new categories. Iterating new_categories (rather than taking a
# set difference) keeps the order of the added categories identical from run
# to run; set iteration order for strings can differ between kernel sessions.
existing_cats = set(adata_g_filtered.obs["gen_cell_type_reannotated"].cat.categories)
cats_to_add = [cat for cat in new_categories if cat not in existing_cats]
if cats_to_add:
    adata_g_filtered.obs["gen_cell_type_reannotated"] = adata_g_filtered.obs["gen_cell_type_reannotated"].cat.add_categories(cats_to_add)

# Overwrite the label of every cell in adata_TCell_pbmc with its subtype (as strings).
# NOTE(review): a missing TCell_subtype becomes the string "nan" here, which is
# not a registered category - confirm every cluster is mapped.
adata_g_filtered.obs.loc[adata_TCell_pbmc.obs_names, "gen_cell_type_reannotated"] = adata_TCell_pbmc.obs["TCell_subtype"].astype(str).values

# Check output
print(adata_g_filtered.obs["gen_cell_type_reannotated"].value_counts())

In [None]:
# Drop categories that no longer label any cell (expected to include "TCells"
# once every T cell carries a subtype label)
reannotated = adata_g_filtered.obs["gen_cell_type_reannotated"]
adata_g_filtered.obs["gen_cell_type_reannotated"] = reannotated.cat.remove_unused_categories()

# Cell count per remaining category
print(adata_g_filtered.obs["gen_cell_type_reannotated"].value_counts())

In [None]:
# UMAP of all cells coloured by the reannotated cell type, with the legend
# rebuilt in a fixed order.
legend_order = [
    "NK Cells",
    "BCells",
    "Megakaryocytes",
    "Monocyte-derived",
    "Memory TCells",
    "Effector CD4 TCells",
    "Cytotoxic TCells",
    "Effector-Memory Transition TCells",
]

# show=False keeps the figure open for the edits below
sc.pl.umap(
    adata_g_filtered,
    color="gen_cell_type_reannotated",
    title="",  # no title for the publication version
    frameon=True,
    palette=plt.get_cmap("tab20").colors,
    legend_loc="lower right",
    legend_fontsize=10,
    legend_fontoutline=1,
    show=False,
)

# Grab the figure and axes scanpy just drew on, then resize
fig = plt.gcf()
ax = plt.gca()
fig.set_size_inches(5, 5)

# Look up the existing handle for each label and keep only the labels listed
# in legend_order, in that order
handles, labels = ax.get_legend_handles_labels()
label_handle_dict = dict(zip(labels, handles))
ordered_labels = [name for name in legend_order if name in label_handle_dict]
ordered_handles = [label_handle_dict[name] for name in ordered_labels]

# Swap scanpy's legend for the reordered one, centred below the axes
ax.get_legend().remove()
ax.legend(
    ordered_handles,
    ordered_labels,
    loc="lower center",
    bbox_to_anchor=(0.5, -0.35),
    fontsize=10,
    frameon=True,
    ncol=2,
)

plt.tight_layout()

# Save, then display
plt.savefig(
    "Intermediate_Files/Clustering_07282025/Figures/UMAP/UMAP_subtypes.pdf",
    dpi=600,
    transparent=True,
    bbox_inches="tight",
)

plt.show()

In [None]:
# Save the gene-level object, including the gen_cell_type_reannotated column, to disk.
# NOTE(review): the file name ends in .h5mu but is written with .write() on the
# object that the first cells load via read_h5ad - confirm the extension is intended.
output_dir = 'Intermediate_Files/Clustering_07282025/'

adata_g_filtered.write(os.path.join(output_dir, "PBMC_gene_AutoZI_clustered_celltypes_reannotated_AutoZILatent_08132025.h5mu"))

In [None]:
## Code to check differences between clusters

In [None]:
import scanpy as sc

# The two T-cell subtypes to compare
group1 = "Effector-Memory Transition TCells"
group2 = "Memory TCells"

import numpy as np

# Store a log1p-transformed copy of the "counts" layer; the test below reads it
adata_TCell.layers["log1p_counts"] = np.log1p(adata_TCell.layers["counts"])

# Keep only the cells that belong to one of the two subtypes
in_comparison = adata_TCell.obs["TCell_subtype"].isin([group1, group2])
adata_sub = adata_TCell[in_comparison].copy()

# Short, fixed labels so the test call does not depend on the subtype names
adata_sub.obs["comparison_group"] = adata_sub.obs["TCell_subtype"].map(
    {group1: "group1", group2: "group2"}
)

# Wilcoxon test with group1 (the transition cells) as the reference, run on
# the log1p-transformed layer rather than on .X or .raw
sc.tl.rank_genes_groups(
    adata_sub,
    groupby="comparison_group",
    reference="group1",
    method="wilcoxon",
    use_raw=False,
    layer="log1p_counts",
    pts=True,
)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from adjustText import adjust_text

# Results for "group2" (Memory TCells). The test in the previous cell used
# reference="group1" (Effector-Memory Transition TCells), so a positive log
# fold change means higher expression in Memory TCells.
de_results = sc.get.rank_genes_groups_df(adata_sub, group="group2")

# Replace adjusted p-values of exactly 0 so that -log10 stays finite
de_results["-log10(pval)"] = -np.log10(de_results["pvals_adj"].replace(0, 1e-300))

# Significant = FDR < 0.05 and at least a two-fold change in either direction
de_results["significant"] = (de_results["pvals_adj"] < 0.05) & (de_results["logfoldchanges"].abs() >= 1)

# Volcano plot
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=de_results,
    x="logfoldchanges",
    y="-log10(pval)",
    hue="significant",
    palette={True: "red", False: "gray"},
    alpha=0.7,
    edgecolor=None
)

# Vertical lines at the |log2FC| = 1 threshold
plt.axvline(1, color="blue", linestyle="--", lw=1)
plt.axvline(-1, color="blue", linestyle="--", lw=1)

# Label the top N genes by adjusted p-value; text is anchored away from x = 0
top_n = 30
top_genes = de_results.nsmallest(top_n, "pvals_adj")
texts = [
    plt.text(lfc, neg_log_p, gene_label,
             fontsize=8, ha='right' if lfc < 0 else 'left')
    for lfc, neg_log_p, gene_label in zip(
        top_genes["logfoldchanges"], top_genes["-log10(pval)"], top_genes["names"]
    )
]

adjust_text(texts, arrowprops=dict(arrowstyle="-", lw=0.5))

# Aesthetics. The title and x label name the tested group first and the
# reference second, matching the sign of the plotted fold changes.
plt.title("Volcano Plot: Memory TCells vs Effector-Memory Transition TCells")
plt.xlabel("Log2 Fold Change (Memory / Effector-Memory Transition)")
plt.ylabel("-log10 Adjusted p-value")
plt.legend(title="FDR < 0.05 & |log2FC| ≥ 1", loc="upper left")
plt.tight_layout()

# Create the target folder first so savefig cannot fail on a missing path
volcano_path = "Intermediate_Files/Paper_Figs/volcano_EffectorMemoryTransition_vs_Memory_labeled.pdf"
os.makedirs(os.path.dirname(volcano_path), exist_ok=True)
plt.savefig(volcano_path, dpi=600)
plt.show()

In [None]:
# Run plot_umap_with_labels_i (defined elsewhere in this notebook) on the
# T-cell isoform object for every resolution listed below
plot_umap_with_labels_i(
    adata_TCell_i_pbmc,
    resolutions=[
        0.04, 0.06, 0.1, 0.14, 0.16, 0.2, 0.24, 0.26, 0.3, 0.34,
        0.36, 0.4, 0.44, 0.46, 0.5, 0.54, 0.56, 0.6, 0.64, 0.66,
        0.7, 0.74, 0.76, 0.8, 0.84, 0.86, 0.9, 0.94, 0.96, 1.0,
    ],
    use_rep_key="log_AutoZI",
)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
import scanpy as sc

def plot_predominant_isoform_umap(adata, gene_name, gene_id, output_dir):
    """
    Plot and save a UMAP coloured by each cell's predominant isoform of one gene.

    The isoforms of the gene are the variables whose name starts with
    "<gene_name>:<gene_id>". A cell is labelled with the isoform that has the
    highest value in that cell, but only when the cell's summed value over
    those isoforms is strictly above the median of that sum across cells.
    All other cells receive the label "Median" and are drawn in light grey.

    Parameters
    ----------
    adata : AnnData
        Object whose ``var_names`` contain the isoform identifiers and which
        ``sc.pl.umap`` can draw. It is modified in place (see below).
    gene_name : str
        Gene symbol; also used in the obs column name, title and file name.
    gene_id : str
        Gene identifier that follows the symbol in the variable names.
    output_dir : str
        Folder for the PDF; created if it does not exist.

    Returns
    -------
    None
        Prints a warning and returns early when no variable matches the prefix.

    Side effects
    ------------
    * Adds or overwrites ``adata.obs["<gene_name>_predominant_isoform"]``
      (unordered categorical, "Median" listed last).
    * Sets ``sc.settings.figdir`` to ``output_dir`` and does not restore it.
    * Saves the figure through scanpy's ``save=`` argument.
    """
    gene_prefix = f"{gene_name}:{gene_id}"
    os.makedirs(output_dir, exist_ok=True)

    # --- Subset isoforms belonging to gene ---
    isoform_vars = [v for v in adata.var_names if v.startswith(gene_prefix)]
    if not isoform_vars:
        print(f"[WARN] No isoforms found for {gene_prefix}")
        return

    adata_gene = adata[:, isoform_vars]

    # --- Dense cells x isoforms matrix (X may be sparse or dense) ---
    X_gene = adata_gene.X
    X_dense = X_gene.toarray() if hasattr(X_gene, "toarray") else np.asarray(X_gene)
    df_iso_expr = pd.DataFrame(
        X_dense,
        index=adata_gene.obs_names,
        columns=adata_gene.var_names
    )

    # --- Total gene expression per cell ---
    # Summing the dense array always gives a flat 1-D vector. Summing a scipy
    # sparse matrix directly returns an (n, 1) np.matrix, which cannot be used
    # as the boolean mask below.
    gene_expr = np.asarray(X_dense).sum(axis=1)
    gene_expr_thresh = np.median(gene_expr)

    # --- Predominant isoform per cell, masked to cells above the median ---
    predominant_isoform = df_iso_expr.idxmax(axis=1)
    predominant_isoform_masked = predominant_isoform.where(gene_expr > gene_expr_thresh, other="Median")

    # --- Add to obs as categorical (put 'Median' last in legend) ---
    colname = f"{gene_name}_predominant_isoform"
    adata.obs[colname] = predominant_isoform_masked

    unique_vals = adata.obs[colname].unique().tolist()
    other_vals = [v for v in unique_vals if v != "Median"]
    cat_order = other_vals + (["Median"] if "Median" in unique_vals else [])
    adata.obs[colname] = pd.Categorical(adata.obs[colname], categories=cat_order, ordered=False)

    # --- Build palette: light grey for 'Median', scanpy defaults for the rest ---
    base_colors = sc.pl.palettes.default_20  # colours repeat beyond len(base_colors) categories
    palette = {
        val: "#D3D3D3" if val == "Median" else base_colors[i % len(base_colors)]
        for i, val in enumerate(cat_order)
    }

    sc.settings.figdir = output_dir  # where Scanpy saves figures
    sc.pl.umap(
        adata,
        color=colname,
        title=f"Predominant Isoform of {gene_name} (Only if Expression > Median)",
        palette=palette,
        na_color="lightgray",
        show=False,                    # don't display
        save=f"_{colname}.pdf"         # Scanpy writes to output_dir
    )
    # scanpy prefixes the plot type to the save suffix, so the file on disk is
    # "umap_<colname>.pdf"; report that name instead of "<colname>.pdf".
    print(f"[DONE] Saved: {os.path.join(output_dir, 'umap_' + colname + '.pdf')}")

In [None]:
# Call the functions
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import numpy as np
import os
from collections import defaultdict

# Folder for the T-cell isoform marker UMAPs; scanpy saves into sc.settings.figdir
marker_root = "Intermediate_Files/Clustering_07282025/Figures/Markers"
group_name = "TCell (TCellAlone)"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

print("T-Cell Markers")

# Bucket the isoform IDs under their gene symbol (the text before the first ":")
gene_to_isoforms = defaultdict(list)
for isoform in TCell_iso_1:
    gene_to_isoforms[isoform.split(":")[0]].append(isoform)

# One UMAP per isoform, gene by gene
for gene, isoforms in gene_to_isoforms.items():
    print(f"\n▶ Gene: {gene} ({len(isoforms)} isoforms)")
    for isoform in isoforms:
        safe_isoform_name = isoform.replace(":", "_")

        # Per-cell values of this isoform as a flat vector (densified if sparse)
        expr = adata_TCell_i[:, isoform].X
        expr = expr.toarray().flatten() if hasattr(expr, "toarray") else np.ravel(expr)

        # Colour limit: 99th percentile of the expressing cells, or 1.0 when
        # no cell expresses the isoform
        expr_nonzero = expr[expr > 0]
        vmax_val = 1.0 if len(expr_nonzero) == 0 else np.percentile(expr_nonzero, 99)

        # Report how many cells lie above the colour limit
        num_above_vmax = np.sum(expr > vmax_val)
        print(f"   └─ {isoform}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

        # Symmetric scale so that zero sits at the centre of "coolwarm"
        norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

        sc.pl.umap(
            adata_TCell_i,
            color=isoform,
            use_raw=False,
            title=f"{isoform}",
            cmap="coolwarm",
            save=f"_{safe_isoform_name}.pdf",
            show=True,
            norm=norm,
        )

In [None]:
# Predominant CD3D isoform across all cells. The helper selects every variable
# that starts with "CD3D:ENSG00000167286" and picks the highest one per cell,
# so it is given the isoform-level object (adata_i_filtered) rather than the
# gene-level adata_g_filtered.
# NOTE(review): assumes adata_i_filtered carries a UMAP embedding - confirm.
plot_predominant_isoform_umap(
    adata=adata_i_filtered,
    gene_name="CD3D",
    gene_id="ENSG00000167286",
    output_dir="Intermediate_Files/Clustering_07282025/Figures/Markers/Predominant_Isoform_Plots_allcells"
)

In [None]:
# Predominant-isoform UMAP for CD3D on the T-cell isoform object (adata_TCell_i)
plot_predominant_isoform_umap(
    adata=adata_TCell_i,
    gene_name="CD3D",
    gene_id="ENSG00000167286",
    output_dir="Intermediate_Files/Clustering_07282025/Figures/Markers/Predominant_Isoform_Plots"
)

In [None]:
# Predominant-isoform UMAP for CD3E on the T-cell isoform object (adata_TCell_i)
plot_predominant_isoform_umap(
    adata=adata_TCell_i,
    gene_name="CD3E",
    gene_id="ENSG00000198851",
    output_dir="Intermediate_Files/Clustering_07282025/Figures/Markers/Predominant_Isoform_Plots"
)

In [None]:
# Predominant-isoform UMAP for CD3G on the T-cell isoform object (adata_TCell_i)
plot_predominant_isoform_umap(
    adata=adata_TCell_i,
    gene_name="CD3G",
    gene_id="ENSG00000160654",
    output_dir="Intermediate_Files/Clustering_07282025/Figures/Markers/Predominant_Isoform_Plots"
)

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import numpy as np
import os
from collections import defaultdict

# Folder for the transition T-cell isoform marker UMAPs; scanpy saves into sc.settings.figdir
marker_root = "Intermediate_Files/Clustering_07282025/Figures/Markers"
group_name = "Effector-Memory Transition TCell (TCell Alone)"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

print("Effector-Memory Transition T-Cell Markers")

# Percentile of the non-zero values used as the colour limit. 100 is the
# maximum, so no cell lies above the limit; the log line reports this value
# instead of a hard-coded "99th".
vmax_percentile = 100

# Group isoforms by gene
gene_to_isoforms = defaultdict(list)
for isoform in TCell_iso_7:
    gene = isoform.split(":")[0]  # Use the prefix before first colon
    gene_to_isoforms[gene].append(isoform)

# Loop through each gene and its isoforms
for gene, isoforms in gene_to_isoforms.items():
    print(f"\n▶ Gene: {gene} ({len(isoforms)} isoforms)")
    for isoform in isoforms:
        safe_isoform_name = isoform.replace(":", "_")  # used in the saved file name

        # Per-cell values of this isoform as a flat vector (densified if sparse)
        expr = adata_TCell_i[:, isoform].X
        if hasattr(expr, "toarray"):
            expr = expr.toarray().flatten()
        else:
            expr = np.ravel(expr)

        # Scale on expressing cells only; fall back to 1.0 when there are none
        expr_nonzero = expr[expr > 0]
        vmax_val = np.percentile(expr_nonzero, vmax_percentile) if len(expr_nonzero) > 0 else 1.0

        # Report how many cells lie above the colour limit
        num_above_vmax = np.sum(expr > vmax_val)
        print(f"   └─ {isoform}: {num_above_vmax} cells above {vmax_percentile}th percentile (vmax = {vmax_val:.2f})")

        # Symmetric scale so that zero sits at the centre of "coolwarm"
        norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

        # Plot UMAP
        sc.pl.umap(
            adata_TCell_i,
            color=isoform,
            use_raw=False,
            title=f"{isoform}",
            cmap="coolwarm",
            save=f"_{safe_isoform_name}.pdf",
            show=True,
            norm=norm
        )

In [None]:
# Predominant-isoform UMAP for LEF1 on the T-cell isoform object (adata_TCell_i)
plot_predominant_isoform_umap(
    adata=adata_TCell_i,
    gene_name="LEF1",
    gene_id="ENSG00000138795",
    output_dir="Intermediate_Files/Clustering_07282025/Figures/Markers/Predominant_Isoform_Plots"
)

In [None]:
# Predominant-isoform UMAP for ITGAE on the T-cell isoform object (adata_TCell_i)
plot_predominant_isoform_umap(
    adata=adata_TCell_i,
    gene_name="ITGAE",
    gene_id="ENSG00000083457",
    output_dir="Intermediate_Files/Clustering_07282025/Figures/Markers/Predominant_Isoform_Plots"
)

In [None]:
# Predominant-isoform UMAP for IL2RA on the T-cell isoform object (adata_TCell_i)
plot_predominant_isoform_umap(
    adata=adata_TCell_i,
    gene_name="IL2RA",
    gene_id="ENSG00000134460",
    output_dir="Intermediate_Files/Clustering_07282025/Figures/Markers/Predominant_Isoform_Plots"
)

In [None]:

# Predominant-isoform UMAP for GATA3 on the T-cell isoform object (adata_TCell_i)
plot_predominant_isoform_umap(
    adata=adata_TCell_i,
    gene_name="GATA3",
    gene_id="ENSG00000107485",
    output_dir="Intermediate_Files/Clustering_07282025/Figures/Markers/Predominant_Isoform_Plots"
)

In [None]:
# Predominant-isoform UMAP for TCF7 on the T-cell isoform object (adata_TCell_i)
plot_predominant_isoform_umap(
    adata=adata_TCell_i,
    gene_name="TCF7",
    gene_id="ENSG00000081059",
    output_dir="Intermediate_Files/Clustering_07282025/Figures/Markers/Predominant_Isoform_Plots"
)

In [None]:
# Predominant-isoform UMAP for IL7R on the T-cell isoform object (adata_TCell_i)
plot_predominant_isoform_umap(
    adata=adata_TCell_i,
    gene_name="IL7R",
    gene_id="ENSG00000168685",
    output_dir="Intermediate_Files/Clustering_07282025/Figures/Markers/Predominant_Isoform_Plots"
)

In [None]:
# Predominant-isoform UMAP for CD27 on the T-cell isoform object (adata_TCell_i)
plot_predominant_isoform_umap(
    adata=adata_TCell_i,
    gene_name="CD27",
    gene_id="ENSG00000139193",
    output_dir="Intermediate_Files/Clustering_07282025/Figures/Markers/Predominant_Isoform_Plots"
)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc

# Gene symbols of interest (before colons)
gene_symbols = ['ITGAE',
                'LEF1',
                'CD27',
                'GATA3',
                'IL2RA',
                'CTLA4',
                'IL7R'
               ]

# Find all isoforms that start with each gene symbol
matching_isoforms = [g for g in adata_TCell_i.var_names if any(g.startswith(sym + ":") for sym in gene_symbols)]

print(f"Found {len(matching_isoforms)} matching isoforms for genes: {gene_symbols}")

# Summed expression per cell over all matching isoforms. X is densified only
# when it is sparse, so a dense X (which has no .toarray) also works.
marker_X = adata_TCell_i[:, matching_isoforms].X
if hasattr(marker_X, "toarray"):
    marker_X = marker_X.toarray()
adata_TCell_i.obs['Transition_Markers_Combined'] = np.asarray(marker_X).sum(axis=1)

# Colour limit: the 100th percentile (maximum) of the combined score, mirrored
# around zero so that 0 sits at the centre of "coolwarm"
combined_vmax = np.percentile(adata_TCell_i.obs['Transition_Markers_Combined'], 100)

# Plot UMAP; show=False keeps the figure open, the title is set on the axes below
sc.pl.umap(
    adata_TCell_i,
    color='Transition_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    vmin=-combined_vmax,
    vmax=combined_vmax,
    show=False
)

# Adjust figure size
fig = plt.gcf()
fig.set_size_inches(4, 3)

# Clean up axis
ax = plt.gca()
ax.set_title("Effector-Memory Transition TCell Aggregate Gene Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Save final figure
plt.tight_layout()
plt.savefig("Intermediate_Files/Clustering_07282025/Figures/UMAP/umap_EffectorMemoryTransition_TCellsOnlycombined_expression_isoform_aggregate.pdf",
            dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import numpy as np
import os
from collections import defaultdict

# Folder for the helper T-cell isoform marker UMAPs (99th-percentile colour
# limit); scanpy saves into sc.settings.figdir
marker_root = "Intermediate_Files/Clustering_07282025/Figures/Markers"
group_name = "Helper (CD4) TCell(TCell Alone)"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

print("Helper T-Cell Markers")

# Bucket the isoform IDs under their gene symbol (the text before the first ":")
gene_to_isoforms = defaultdict(list)
for isoform in TCell_iso_6:
    gene_to_isoforms[isoform.split(":")[0]].append(isoform)

# One UMAP per isoform, gene by gene
for gene, isoforms in gene_to_isoforms.items():
    print(f"\n▶ Gene: {gene} ({len(isoforms)} isoforms)")
    for isoform in isoforms:
        safe_isoform_name = isoform.replace(":", "_")

        # Per-cell values of this isoform as a flat vector (densified if sparse)
        expr = adata_TCell_i[:, isoform].X
        expr = expr.toarray().flatten() if hasattr(expr, "toarray") else np.ravel(expr)

        # Colour limit: 99th percentile of the expressing cells, or 1.0 when
        # no cell expresses the isoform
        expr_nonzero = expr[expr > 0]
        vmax_val = 1.0 if len(expr_nonzero) == 0 else np.percentile(expr_nonzero, 99)

        # Report how many cells lie above the colour limit
        num_above_vmax = np.sum(expr > vmax_val)
        print(f"   └─ {isoform}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

        # Symmetric scale so that zero sits at the centre of "coolwarm"
        norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

        sc.pl.umap(
            adata_TCell_i,
            color=isoform,
            use_raw=False,
            title=f"{isoform}",
            cmap="coolwarm",
            save=f"_{safe_isoform_name}.pdf",
            show=True,
            norm=norm,
        )

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import numpy as np
import os
from collections import defaultdict

# Folder for the helper T-cell isoform marker UMAPs (maximum as colour limit);
# scanpy saves into sc.settings.figdir
marker_root = "Intermediate_Files/Clustering_07282025/Figures/Markers"
group_name = "Helper (CD4) TCell (TCell Alone)"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

print("Helper T-Cell Markers")

# Percentile of the non-zero values used as the colour limit. 100 is the
# maximum, so no cell lies above the limit; the log line reports this value
# instead of a hard-coded "99th".
vmax_percentile = 100

# Group isoforms by gene
gene_to_isoforms = defaultdict(list)
for isoform in TCell_iso_6:
    gene = isoform.split(":")[0]  # Use the prefix before first colon
    gene_to_isoforms[gene].append(isoform)

# Loop through each gene and its isoforms
for gene, isoforms in gene_to_isoforms.items():
    print(f"\n▶ Gene: {gene} ({len(isoforms)} isoforms)")
    for isoform in isoforms:
        safe_isoform_name = isoform.replace(":", "_")  # used in the saved file name

        # Per-cell values of this isoform as a flat vector (densified if sparse)
        expr = adata_TCell_i[:, isoform].X
        if hasattr(expr, "toarray"):
            expr = expr.toarray().flatten()
        else:
            expr = np.ravel(expr)

        # Scale on expressing cells only; fall back to 1.0 when there are none
        expr_nonzero = expr[expr > 0]
        vmax_val = np.percentile(expr_nonzero, vmax_percentile) if len(expr_nonzero) > 0 else 1.0

        # Report how many cells lie above the colour limit
        num_above_vmax = np.sum(expr > vmax_val)
        print(f"   └─ {isoform}: {num_above_vmax} cells above {vmax_percentile}th percentile (vmax = {vmax_val:.2f})")

        # Symmetric scale so that zero sits at the centre of "coolwarm"
        norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

        # Plot UMAP
        sc.pl.umap(
            adata_TCell_i,
            color=isoform,
            use_raw=False,
            title=f"{isoform}",
            cmap="coolwarm",
            save=f"_{safe_isoform_name}.pdf",
            show=True,
            norm=norm
        )

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc

# Gene symbols of interest (before colons)
gene_symbols = ['GATA3', 'CD4', 'AHR', 'IL2RA']

# Find all isoforms that start with each gene symbol
matching_isoforms = [g for g in adata_TCell_i.var_names if any(g.startswith(sym + ":") for sym in gene_symbols)]

print(f"Found {len(matching_isoforms)} matching isoforms for genes: {gene_symbols}")

# Summed expression per cell over all matching isoforms. X is densified only
# when it is sparse, so a dense X (which has no .toarray) also works.
marker_X = adata_TCell_i[:, matching_isoforms].X
if hasattr(marker_X, "toarray"):
    marker_X = marker_X.toarray()
adata_TCell_i.obs['CD4_Effector_TCell_Markers_Combined'] = np.asarray(marker_X).sum(axis=1)

# Colour limit: the 100th percentile (maximum) of the combined score, mirrored
# around zero so that 0 sits at the centre of "coolwarm"
combined_vmax = np.percentile(adata_TCell_i.obs['CD4_Effector_TCell_Markers_Combined'], 100)

# Plot UMAP; show=False keeps the figure open, the title is set on the axes below
sc.pl.umap(
    adata_TCell_i,
    color='CD4_Effector_TCell_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    vmin=-combined_vmax,
    vmax=combined_vmax,
    show=False
)

# Adjust figure size
fig = plt.gcf()
fig.set_size_inches(4, 3)

# Clean up axis
ax = plt.gca()
ax.set_title("CD4+ Effector TCell Aggregate Gene Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Save final figure
plt.tight_layout()
plt.savefig("Intermediate_Files/Clustering_07282025/Figures/UMAP/umap_CD4EffectorTCell_TCellsOnlycombined_expression_isoform_aggregate.pdf",
            dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import numpy as np
import os
from collections import defaultdict

# Folder for the memory T-cell isoform marker UMAPs; scanpy saves into sc.settings.figdir
marker_root = "Intermediate_Files/Clustering_07282025/Figures/Markers"
group_name = "Memory TCell (TCell Alone)"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

print("Memory TCell Markers")

# Percentile of the non-zero values used as the colour limit. 100 is the
# maximum, so no cell lies above the limit; the log line reports this value
# instead of a hard-coded "99th".
vmax_percentile = 100

# Group isoforms by gene
gene_to_isoforms = defaultdict(list)
for isoform in TCell_iso_3:
    gene = isoform.split(":")[0]  # Use the prefix before first colon
    gene_to_isoforms[gene].append(isoform)

# Loop through each gene and its isoforms
for gene, isoforms in gene_to_isoforms.items():
    print(f"\n▶ Gene: {gene} ({len(isoforms)} isoforms)")
    for isoform in isoforms:
        safe_isoform_name = isoform.replace(":", "_")  # used in the saved file name

        # Per-cell values of this isoform as a flat vector (densified if sparse)
        expr = adata_TCell_i[:, isoform].X
        if hasattr(expr, "toarray"):
            expr = expr.toarray().flatten()
        else:
            expr = np.ravel(expr)

        # Scale on expressing cells only; fall back to 1.0 when there are none
        expr_nonzero = expr[expr > 0]
        vmax_val = np.percentile(expr_nonzero, vmax_percentile) if len(expr_nonzero) > 0 else 1.0

        # Report how many cells lie above the colour limit
        num_above_vmax = np.sum(expr > vmax_val)
        print(f"   └─ {isoform}: {num_above_vmax} cells above {vmax_percentile}th percentile (vmax = {vmax_val:.2f})")

        # Symmetric scale so that zero sits at the centre of "coolwarm"
        norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

        # Plot UMAP
        sc.pl.umap(
            adata_TCell_i,
            color=isoform,
            use_raw=False,
            title=f"{isoform}",
            cmap="coolwarm",
            save=f"_{safe_isoform_name}.pdf",
            show=True,
            norm=norm
        )

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc

# Gene symbols of interest (before colons)
gene_symbols = ['CCR7', 'SELL', 'TCF7']

# Find all isoforms that start with each gene symbol
matching_isoforms = [g for g in adata_TCell_i.var_names if any(g.startswith(sym + ":") for sym in gene_symbols)]

print(f"Found {len(matching_isoforms)} matching isoforms for genes: {gene_symbols}")

# Summed expression per cell over all matching isoforms. X is densified only
# when it is sparse, so a dense X (which has no .toarray) also works.
marker_X = adata_TCell_i[:, matching_isoforms].X
if hasattr(marker_X, "toarray"):
    marker_X = marker_X.toarray()
adata_TCell_i.obs['Mem_TCell_Markers_Combined'] = np.asarray(marker_X).sum(axis=1)

# Colour limit: the 100th percentile (maximum) of the combined score, mirrored
# around zero so that 0 sits at the centre of "coolwarm"
combined_vmax = np.percentile(adata_TCell_i.obs['Mem_TCell_Markers_Combined'], 100)

# Plot UMAP; show=False keeps the figure open, the title is set on the axes below
sc.pl.umap(
    adata_TCell_i,
    color='Mem_TCell_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    vmin=-combined_vmax,
    vmax=combined_vmax,
    show=False
)

# Adjust figure size
fig = plt.gcf()
fig.set_size_inches(4, 3)

# Clean up axis
ax = plt.gca()
ax.set_title("Memory TCell Aggregate Gene Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Save final figure
plt.tight_layout()
plt.savefig("Intermediate_Files/Clustering_07282025/Figures/UMAP/umap_MemTCell_TCellsOnlycombined_expression_isoform_aggregate.pdf",
            dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import numpy as np
import os
from collections import defaultdict

# Folder for the CD8+ T-cell isoform marker UMAPs; scanpy saves into sc.settings.figdir
marker_root = "Intermediate_Files/Clustering_07282025/Figures/Markers"
group_name = "CD8+ TCell (TCell Alone)"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

print("CD8+ T-Cell Markers")

# Percentile of the non-zero values used as the colour limit. 100 is the
# maximum, so no cell lies above the limit; the log line reports this value
# instead of a hard-coded "99th".
vmax_percentile = 100

# Group isoforms by gene
gene_to_isoforms = defaultdict(list)
for isoform in TCell_iso_4:
    gene = isoform.split(":")[0]  # Use the prefix before first colon
    gene_to_isoforms[gene].append(isoform)

# Loop through each gene and its isoforms
for gene, isoforms in gene_to_isoforms.items():
    print(f"\n▶ Gene: {gene} ({len(isoforms)} isoforms)")
    for isoform in isoforms:
        safe_isoform_name = isoform.replace(":", "_")  # used in the saved file name

        # Per-cell values of this isoform as a flat vector (densified if sparse)
        expr = adata_TCell_i[:, isoform].X
        if hasattr(expr, "toarray"):
            expr = expr.toarray().flatten()
        else:
            expr = np.ravel(expr)

        # Scale on expressing cells only; fall back to 1.0 when there are none
        expr_nonzero = expr[expr > 0]
        vmax_val = np.percentile(expr_nonzero, vmax_percentile) if len(expr_nonzero) > 0 else 1.0

        # Report how many cells lie above the colour limit
        num_above_vmax = np.sum(expr > vmax_val)
        print(f"   └─ {isoform}: {num_above_vmax} cells above {vmax_percentile}th percentile (vmax = {vmax_val:.2f})")

        # Symmetric scale so that zero sits at the centre of "coolwarm"
        norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

        # Plot UMAP
        sc.pl.umap(
            adata_TCell_i,
            color=isoform,
            use_raw=False,
            title=f"{isoform}",
            cmap="coolwarm",
            save=f"_{safe_isoform_name}.pdf",
            show=True,
            norm=norm
        )

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc

# Gene symbols of interest (the part of the isoform ID before the colon)
gene_symbols = ['CD8A', 'CD8B', 'GATA3', 'KLRB1', 'CCL5']

# Match on "<symbol>:" so that a symbol never matches a longer gene name
gene_prefixes = tuple(f"{sym}:" for sym in gene_symbols)
matching_isoforms = [g for g in adata_TCell_i.var_names if g.startswith(gene_prefixes)]

print(f"Found {len(matching_isoforms)} matching isoforms for genes: {gene_symbols}")

# Summed expression per cell. .X may be sparse or dense, so only densify
# when a .toarray() method exists (same guard as the per-isoform cells).
combined_X = adata_TCell_i[:, matching_isoforms].X
if hasattr(combined_X, "toarray"):
    combined_X = combined_X.toarray()
adata_TCell_i.obs['Cytotoxic_TCell_Markers_Combined'] = np.asarray(combined_X).sum(axis=1)

# Symmetric colour limits: +/- the maximum (percentile 100) of the summed score
combined_vmax = np.percentile(adata_TCell_i.obs['Cytotoxic_TCell_Markers_Combined'], 100)

# With show=False scanpy returns the Axes it drew on; use it directly instead
# of relying on pyplot's "current axes" state.
ax = sc.pl.umap(
    adata_TCell_i,
    color='Cytotoxic_TCell_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Cytotoxic TCell Aggregate Gene Marker Expression",
    vmin=-combined_vmax,
    vmax=combined_vmax,
    show=False
)

# Adjust figure size
fig = ax.figure
fig.set_size_inches(4, 3)

# Clean up axis
ax.set_xticks([])
ax.set_yticks([])

# Save final figure (create the target directory if it is missing)
umap_dir = "Intermediate_Files/Clustering_07282025/Figures/UMAP"
os.makedirs(umap_dir, exist_ok=True)
fig.tight_layout()
fig.savefig(
    os.path.join(umap_dir, "umap_CytotoxicTCell_TCellsOnlycombined_expression_isoform_aggregate.pdf"),
    dpi=600, transparent=True, bbox_inches="tight"
)

plt.show()

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import numpy as np
import os
from collections import defaultdict

# Output directory for the per-isoform regulatory T-cell marker UMAPs; the
# `save=` argument of sc.pl.umap below writes into sc.settings.figdir.
marker_root = "Intermediate_Files/Clustering_07282025/Figures/Markers"
group_name = "Regulatory TCell (TCell Alone)"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

# Percentile of the NON-ZERO expression values used as the colour-scale limit.
# 100 means "scale to the maximum" (nothing is clipped, so the count printed
# below is 0 by construction). Lower it (e.g. 99) to clip outliers; the printed
# message always reports the percentile that was actually used.
VMAX_PERCENTILE = 100

print("Regulatory T-Cell Markers")

# Group isoforms by gene (gene = text before the first colon of the isoform ID)
gene_to_isoforms = defaultdict(list)
for isoform in TCell_iso_5:
    gene = isoform.split(":")[0]
    gene_to_isoforms[gene].append(isoform)

# One UMAP per isoform, grouped by gene in the printed log
for gene, isoforms in gene_to_isoforms.items():
    print(f"\n▶ Gene: {gene} ({len(isoforms)} isoforms)")
    for isoform in isoforms:
        # ':' is replaced so the isoform ID can be used in a file name
        safe_isoform_name = isoform.replace(":", "_")

        # Expression vector for this isoform (one value per cell)
        expr = adata_TCell_i[:, isoform].X

        # Convert sparse to dense if needed
        if hasattr(expr, "toarray"):
            expr = expr.toarray().flatten()
        else:
            expr = np.ravel(expr)

        # Scale on expressing cells only; fall back to 1.0 if nothing is expressed
        expr_nonzero = expr[expr > 0]
        vmax_val = (
            np.percentile(expr_nonzero, VMAX_PERCENTILE)
            if len(expr_nonzero) > 0
            else 1.0
        )

        # Cells that will be clipped at the top of the colour scale
        num_above_vmax = np.sum(expr > vmax_val)
        print(
            f"   └─ {isoform}: {num_above_vmax} cells above vmax "
            f"(percentile {VMAX_PERCENTILE} of non-zero cells, vmax = {vmax_val:.2f})"
        )

        # Symmetric diverging scale centred on 0
        norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

        # Plot UMAP
        sc.pl.umap(
            adata_TCell_i,
            color=isoform,
            use_raw=False,
            title=f"{isoform}",
            cmap="coolwarm",
            save=f"_{safe_isoform_name}.pdf",
            show=True,
            norm=norm
        )

In [None]:
# Isoform-level T-cell UMAP coloured by the '0.26_log_AutoZI' obs column
sc.pl.umap(
    adata_TCell_i,
    color='0.26_log_AutoZI',
    title='UMAP with Clusters (Isoform-Level) ',
    frameon=True,
    legend_fontsize=10,
    legend_fontoutline=2,
    size=10,
)

In [None]:
# Manual annotation of the isoform-level T-cell clusters.
# (Change this to whatever is appropriate for your dataset.)
tcell_cluster_mapping = {
    "0": "Memory TCells",
    "1": "Effector CD4 TCells",
    "2": "Cytotoxic TCells",
    "3": "Effector-Memory Transition TCells"
}

# Annotate adata_TCell_i: it is the object whose '0.26_log_AutoZI' clusters are
# plotted in the previous cell and whose 'TCell_subtype' column is read by the
# following cells.
cluster_ids = adata_TCell_i.obs["0.26_log_AutoZI"].astype(str)
adata_TCell_i.obs["TCell_subtype"] = cluster_ids.map(tcell_cluster_mapping)

# Clusters missing from the mapping end up as NaN; report them instead of
# letting them disappear silently from the counts below.
unmapped_clusters = sorted(set(cluster_ids) - set(tcell_cluster_mapping))
if unmapped_clusters:
    print(f"Warning: clusters without a subtype label (left as NaN): {unmapped_clusters}")

# Check assignments
adata_TCell_i.obs["TCell_subtype"].value_counts()

In [None]:
import matplotlib.pyplot as plt
import scanpy as sc

# Preferred ordering of the subtype entries in the legend
legend_order = ["Memory TCells", "Effector CD4 TCells", "Cytotoxic TCells", "Effector-Memory Transition TCells"]

# Draw the subtype UMAP; show=False keeps the figure open for editing
sc.pl.umap(
    adata_TCell_i,
    color='TCell_subtype',
    title='',
    frameon=True,
    palette=plt.get_cmap('tab20').colors,
    legend_loc='lower right',
    legend_fontsize=10,
    legend_fontoutline=1,
    show=False
)

# Resize the figure scanpy just created
fig = plt.gcf()
fig.set_size_inches(5, 4)

# Look up scanpy's legend entries by label
ax = plt.gca()
handles, labels = ax.get_legend_handles_labels()
label_handle_dict = dict(zip(labels, handles))

# Walk legend_order and keep only the labels that were actually drawn
ordered_labels = []
ordered_handles = []
for label in legend_order:
    if label in label_handle_dict:
        ordered_labels.append(label)
        ordered_handles.append(label_handle_dict[label])

# Swap scanpy's legend for the reordered one, centred below the axes
ax.get_legend().remove()
ax.legend(
    ordered_handles,
    ordered_labels,
    loc='lower center',
    bbox_to_anchor=(0.5, -0.4),
    fontsize=11,
    frameon=True,
    ncol=2,
)

plt.tight_layout()

# Save figure
plt.savefig(
    "Intermediate_Files/Clustering_07282025/Figures/UMAP/UMAP_TCell_subtypes_isoform.pdf",
    dpi=600,
    transparent=True,
    bbox_inches="tight",
)

plt.show()

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import scanpy as sc

# Desired legend order
legend_order = [
    "Memory TCells",
    "Effector CD4 TCells",
    "Cytotoxic TCells",
    "Effector-Memory Transition TCells"
]

OBS_KEY = "TCell_subtype"

# Keep only categories present in the object that is actually plotted below
# (the isoform-level adata_TCell_i, not the gene-level adata_TCell), in the
# preferred order.
present = [c for c in legend_order if c in adata_TCell_i.obs[OBS_KEY].unique().tolist()]
if not present:
    raise ValueError(f"No categories from legend_order found in adata.obs['{OBS_KEY}'].")

# Ordered categorical so that scanpy assigns colours in the order of `present`
adata_TCell_i.obs[OBS_KEY] = (
    adata_TCell_i.obs[OBS_KEY]
    .astype("category")
    .cat.remove_unused_categories()
    .cat.reorder_categories(present, ordered=True)
)

# Use full tab20 palette
palette = plt.get_cmap("tab20").colors

# Plot UMAP without Scanpy legend
sc.pl.umap(
    adata_TCell_i,
    color=OBS_KEY,
    title="",
    frameon=True,
    palette=palette,
    legend_loc=None,   # disable Scanpy's legend
    show=False,
    size=6
)

# Match publication dimensions
fig = plt.gcf()
fig.set_size_inches(6, 4)
ax = plt.gca()

# Axis labels
ax.set_xlabel("UMAP1", fontsize=9)
ax.set_ylabel("UMAP2", fontsize=9)
ax.set_xticks([]); ax.set_yticks([])
for spine in ["top", "right"]:
    ax.spines[spine].set_visible(False)

# Manual legend patches: the i-th category is paired with the i-th tab20 colour
handles = [
    mpatches.Patch(facecolor=palette[i], edgecolor="none", label=present[i])
    for i in range(len(present))
]

# Adjust layout to leave space at the bottom for legend
fig.subplots_adjust(bottom=0.3)

legend = fig.legend(
    handles, present,
    loc="lower center",
    bbox_to_anchor=(0.5, 0.1),   # centered below the axes
    fontsize=9,
    frameon=True,                # box around legend
    fancybox=False,              # square corners
    ncol=2,
    handlelength=1.0,
    handletextpad=0.5,
    columnspacing=1.1,
    borderpad=0.4
)
legend.get_frame().set_edgecolor("black")
legend.get_frame().set_linewidth(0.8)

# Save (create the target directory first so savefig cannot fail on a fresh checkout)
out_base = "Intermediate_Files/Paper_Figs/UMAP/UMAP_TCell_isoform_subtypes"
os.makedirs(os.path.dirname(out_base), exist_ok=True)
fig.savefig(f"{out_base}.pdf", dpi=600, transparent=True, bbox_inches="tight")
fig.savefig(f"{out_base}.png", dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import scanpy as sc

# Preferred legend order; entries that do not occur in the data are dropped below
legend_order = [
    "Memory TCells",
    "Effector CD4 TCells",
    "Cytotoxic TCells",
    "Effector-Memory Transition TCells"
]

OBS_KEY = "TCell_subtype"

# Labels that occur in the gene-level T-cell object, kept in legend order
observed_labels = adata_TCell.obs[OBS_KEY].unique().tolist()
present = [label for label in legend_order if label in observed_labels]
if len(present) == 0:
    raise ValueError(f"No categories from legend_order found in adata.obs['{OBS_KEY}'].")

# Store the column as an ordered categorical whose categories follow `present`
subtype_cat = adata_TCell.obs[OBS_KEY].astype("category")
subtype_cat = subtype_cat.cat.remove_unused_categories()
adata_TCell.obs[OBS_KEY] = subtype_cat.cat.reorder_categories(present, ordered=True)

# Every second tab20 entry, one colour per present category
tab20 = plt.get_cmap("tab20").colors
base_idx = [0, 2, 4, 6, 8, 10, 12, 14]
palette = [tab20[idx % len(tab20)] for idx in base_idx[:len(present)]]

# UMAP without scanpy's own legend; a manual one is added afterwards
sc.pl.umap(
    adata_TCell,
    color=OBS_KEY,
    title="",
    frameon=True,
    palette=palette,
    legend_loc=None,
    show=False,
    size=6,
)

# Figure/axes setup
fig = plt.gcf()
fig.set_size_inches(5, 4)
ax = plt.gca()

# Axis labels, no ticks, open top/right frame
ax.set_xlabel("UMAP1", fontsize=9)
ax.set_ylabel("UMAP2", fontsize=9)
ax.set_xticks([])
ax.set_yticks([])
for spine in ["top", "right"]:
    ax.spines[spine].set_visible(False)

# One legend patch per category, paired with its palette colour
handles = [
    mpatches.Patch(facecolor=color, edgecolor="none", label=label)
    for color, label in zip(palette, present)
]

leg = ax.legend(
    handles=handles,
    loc="lower center",
    bbox_to_anchor=(0.5, -0.28),  # under the axis
    fontsize=8,
    frameon=False,
    ncol=2,
    handlelength=1.0,
    columnspacing=1.2,
)

# Save as PDF, then PNG
out_base = "Intermediate_Files/Paper_Figs/UMAP/UMAP_TCell_subtypes"
for ext in ("pdf", "png"):
    fig.savefig(f"{out_base}.{ext}", dpi=600, transparent=True, bbox_inches="tight")
plt.show()

In [None]:
# Start from the general cell-type labels the first time this cell runs
if "gen_cell_type_reannotated" not in adata_i_filtered.obs.columns:
    adata_i_filtered.obs["gen_cell_type_reannotated"] = adata_i_filtered.obs["gen_cell_type"]

# Plain strings so that labels which are not existing categories can be assigned
adata_i_filtered.obs["gen_cell_type_reannotated"] = (
    adata_i_filtered.obs["gen_cell_type_reannotated"].astype(str)
)

# Take BOTH the cell names and the labels from adata_TCell_i so rows and values
# cannot come from two different objects. Cells without a subtype (NaN) are
# skipped: they keep their general label instead of becoming the string "nan".
tcell_subtypes = adata_TCell_i.obs["TCell_subtype"]
tcell_subtypes = tcell_subtypes[tcell_subtypes.notna()].astype(str)
adata_i_filtered.obs.loc[tcell_subtypes.index, "gen_cell_type_reannotated"] = tcell_subtypes

# Re-cast to categorical for plotting, ordering, etc.
adata_i_filtered.obs["gen_cell_type_reannotated"] = (
    adata_i_filtered.obs["gen_cell_type_reannotated"].astype("category")
)

# Check output
print(adata_i_filtered.obs["gen_cell_type_reannotated"].value_counts())

In [None]:
# Isoform-level UMAP of all cells, coloured by the re-annotated cell-type column
sc.pl.umap(
    adata_i_filtered,
    color='gen_cell_type_reannotated',
    title='UMAP with Clusters (Isoform-Level) ',
    frameon=True,
    legend_fontsize=10,
    legend_fontoutline=2,
)

In [None]:
# Write the isoform-level object (now carrying 'gen_cell_type_reannotated') to disk
output_dir = 'Intermediate_Files/Clustering_07282025/'

reannotated_iso_path = os.path.join(
    output_dir,
    "PBMC_iso_AutoZI_clustered_celltypes_reannotated_AutoZILatent_08132025.h5mu",
)
adata_i_filtered.write(reannotated_iso_path)

In [None]:
# Reload the gene- and isoform-level objects saved with re-annotated labels.
# Both files are read with read_h5ad despite the .h5mu extension in their names.
import os
from scanpy import read_h5ad

output_dir = 'Intermediate_Files/Clustering_07282025'

gene_reannotated_path = os.path.join(
    output_dir, "PBMC_gene_AutoZI_clustered_celltypes_reannotated_AutoZILatent_08132025.h5mu"
)
adata_g_filtered = read_h5ad(gene_reannotated_path)

iso_reannotated_path = os.path.join(
    output_dir, "PBMC_iso_AutoZI_clustered_celltypes_reannotated_AutoZILatent_08132025.h5mu"
)
adata_i_filtered = read_h5ad(iso_reannotated_path)

In [None]:
# Display the summary repr of the reloaded gene-level object
adata_g_filtered

In [None]:
# Display the summary repr of the reloaded isoform-level object
adata_i_filtered

In [None]:
# Independent copies of the cells labelled "TCells" in the general annotation,
# for the gene-level and the isoform-level object respectively
is_tcell_gene = adata_g_filtered.obs["gen_cell_type"] == "TCells"
adata_TCell = adata_g_filtered[is_tcell_gene].copy()

is_tcell_iso = adata_i_filtered.obs["gen_cell_type"] == "TCells"
adata_TCell_i = adata_i_filtered[is_tcell_iso].copy()

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import scanpy as sc

# Preferred legend order; entries that do not occur in the data are dropped below
legend_order = [
    "Memory TCells",
    "Effector CD4 TCells",
    "Cytotoxic TCells",
    "Effector-Memory Transition TCells"
]

OBS_KEY = "sub_cell_type"

# Labels that occur in the gene-level T-cell object, kept in legend order
observed_labels = adata_TCell.obs[OBS_KEY].unique().tolist()
present = [label for label in legend_order if label in observed_labels]
if len(present) == 0:
    raise ValueError(f"No categories from legend_order found in adata.obs['{OBS_KEY}'].")

# Store the column as an ordered categorical whose categories follow `present`
subtype_cat = adata_TCell.obs[OBS_KEY].astype("category")
subtype_cat = subtype_cat.cat.remove_unused_categories()
adata_TCell.obs[OBS_KEY] = subtype_cat.cat.reorder_categories(present, ordered=True)

# Every second tab20 entry, one colour per present category
tab20 = plt.get_cmap("tab20").colors
base_idx = [0, 2, 4, 6, 8, 10, 12, 14]
palette = [tab20[idx % len(tab20)] for idx in base_idx[:len(present)]]

# UMAP without scanpy's own legend; a manual one is added afterwards
sc.pl.umap(
    adata_TCell,
    color=OBS_KEY,
    title="",
    frameon=True,
    palette=palette,
    legend_loc=None,
    show=False,
    size=6,
)

# Figure/axes setup
fig = plt.gcf()
fig.set_size_inches(5, 4)
ax = plt.gca()

# Axis labels, no ticks, open top/right frame
ax.set_xlabel("UMAP1", fontsize=9)
ax.set_ylabel("UMAP2", fontsize=9)
ax.set_xticks([])
ax.set_yticks([])
for spine in ["top", "right"]:
    ax.spines[spine].set_visible(False)

# One legend patch per category, paired with its palette colour
handles = [
    mpatches.Patch(facecolor=color, edgecolor="none", label=label)
    for color, label in zip(palette, present)
]

leg = ax.legend(
    handles=handles,
    loc="lower center",
    bbox_to_anchor=(0.5, -0.28),  # under the axis
    fontsize=8,
    frameon=False,
    ncol=2,
    handlelength=1.0,
    columnspacing=1.2,
)

# Save as PDF, then PNG
out_base = "Intermediate_Files/Paper_Figs/UMAP/UMAP_TCell_subtypes"
for ext in ("pdf", "png"):
    fig.savefig(f"{out_base}.{ext}", dpi=600, transparent=True, bbox_inches="tight")
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Config ---
celltype_col = "gen_cell_type_reannotated"   # Column in .obs with cell type labels
mito_col = "pct_counts_mt"                   # Column in .obs with mitochondrial percentage
output_file = "Intermediate_Files/Paper_Figs/mito_percentage_by_celltype_gene.png"

# Cell types in the order used for both the summary table and the x-axis
custom_order = [
    "BCells",
    "NK Cells",
    "Monocyte-derived",
    "Megakaryocytes",
    "Effector CD4 TCells",
    "Cytotoxic TCells",
    "Memory TCells",
    "Effector-Memory Transition TCells"
]

# --- Mean / median / cell count of mitochondrial % per cell type (gene-level) ---
mito_by_celltype = (
    adata_g_filtered.obs
    .groupby(celltype_col)[mito_col]
    .agg(mean_mito_pct="mean", median_mito_pct="median", n_cells="count")
    .reindex(custom_order)
)

print(mito_by_celltype)

# --- Violin plot on an explicit figure/axes pair ---
fig, ax = plt.subplots(figsize=(10, 5))
sns.violinplot(
    data=adata_g_filtered.obs,
    x=celltype_col,
    y=mito_col,
    order=custom_order,
    palette="Set2",
    inner="box",
    ax=ax,
)
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
    tick_label.set_ha("right")
ax.set_ylabel("Mitochondrial %")
ax.set_title("Mitochondrial Percentage by Cell Type")
fig.tight_layout()

# --- Save figure ---
fig.savefig(output_file, dpi=300, bbox_inches="tight")

plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Config ---
celltype_col = "gen_cell_type_reannotated"   # Column in .obs with cell type labels
mito_col = "pct_counts_mt"                   # Column in .obs with mitochondrial percentage
output_file = "Intermediate_Files/Paper_Figs/mito_percentage_by_celltype_iso.png"

# Cell types in the order used for both the summary table and the x-axis
custom_order = [
    "BCells",
    "NK Cells",
    "Monocyte-derived",
    "Megakaryocytes",
    "Effector CD4 TCells",
    "Cytotoxic TCells",
    "Memory TCells",
    "Effector-Memory Transition TCells"
]

# --- Mean / median / cell count of mitochondrial % per cell type (isoform-level) ---
mito_by_celltype = (
    adata_i_filtered.obs
    .groupby(celltype_col)[mito_col]
    .agg(mean_mito_pct="mean", median_mito_pct="median", n_cells="count")
    .reindex(custom_order)
)

print(mito_by_celltype)

# --- Violin plot on an explicit figure/axes pair ---
fig, ax = plt.subplots(figsize=(10, 5))
sns.violinplot(
    data=adata_i_filtered.obs,
    x=celltype_col,
    y=mito_col,
    order=custom_order,
    palette="Set2",
    inner="box",
    ax=ax,
)
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
    tick_label.set_ha("right")
ax.set_ylabel("Mitochondrial %")
ax.set_title("Mitochondrial Percentage by Cell Type")
fig.tight_layout()

# --- Save figure ---
fig.savefig(output_file, dpi=300, bbox_inches="tight")

plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Config ---
celltype_col = "gen_cell_type_reannotated"   # Column in .obs with cell type labels
doublet_col = "doublet_score"                # Column in .obs with doublet score
output_file = "Intermediate_Files/Paper_Figs/doubletscore_by_celltype_gene.png"

# Cell types in the order used for both the summary table and the x-axis
custom_order = [
    "BCells",
    "NK Cells",
    "Monocyte-derived",
    "Megakaryocytes",
    "Effector CD4 TCells",
    "Cytotoxic TCells",
    "Memory TCells",
    "Effector-Memory Transition TCells"
]

# --- Mean / median / cell count of the doublet score per cell type (gene-level) ---
doublet_by_celltype = (
    adata_g_filtered.obs
    .groupby(celltype_col)[doublet_col]
    .agg(mean_doublet_score="mean", median_doublet_score="median", n_cells="count")
    .reindex(custom_order)
)

print(doublet_by_celltype)

# --- Violin plot on an explicit figure/axes pair ---
fig, ax = plt.subplots(figsize=(10, 5))
sns.violinplot(
    data=adata_g_filtered.obs,
    x=celltype_col,
    y=doublet_col,
    order=custom_order,
    palette="Set2",
    inner="box",
    ax=ax,
)
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
    tick_label.set_ha("right")
ax.set_ylabel("Doublet Score")
ax.set_title("Doublet Score by Cell Type")
fig.tight_layout()

# --- Save figure ---
fig.savefig(output_file, dpi=300, bbox_inches="tight")

plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Config ---
celltype_col = "gen_cell_type_reannotated"   # Column in .obs with cell type labels
doublet_col = "doublet_score"                # Column in .obs with doublet score
output_file = "Intermediate_Files/Paper_Figs/doubletscore_by_celltype_iso.png"

# Cell types in the order used for both the summary table and the x-axis
custom_order = [
    "BCells",
    "NK Cells",
    "Monocyte-derived",
    "Megakaryocytes",
    "Effector CD4 TCells",
    "Cytotoxic TCells",
    "Memory TCells",
    "Effector-Memory Transition TCells"
]

# --- Mean / median / cell count of the doublet score per cell type (isoform-level) ---
doublet_by_celltype = (
    adata_i_filtered.obs
    .groupby(celltype_col)[doublet_col]
    .agg(mean_doublet_score="mean", median_doublet_score="median", n_cells="count")
    .reindex(custom_order)
)

print(doublet_by_celltype)

# --- Violin plot on an explicit figure/axes pair ---
fig, ax = plt.subplots(figsize=(10, 5))
sns.violinplot(
    data=adata_i_filtered.obs,
    x=celltype_col,
    y=doublet_col,
    order=custom_order,
    palette="Set2",
    inner="box",
    ax=ax,
)
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
    tick_label.set_ha("right")
ax.set_ylabel("Doublet Score")
ax.set_title("Doublet Score by Cell Type")
fig.tight_layout()

# --- Save figure ---
fig.savefig(output_file, dpi=300, bbox_inches="tight")

plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Config ---
celltype_col = "gen_cell_type_reannotated"   # Column in .obs with cell type labels
counts_col = "total_counts"                  # Column in .obs with total counts
output_file = "Intermediate_Files/Paper_Figs/totalcounts_by_celltype_gene.png"

# Cell types in the order used for both the summary table and the x-axis
custom_order = [
    "BCells",
    "NK Cells",
    "Monocyte-derived",
    "Megakaryocytes",
    "Effector CD4 TCells",
    "Cytotoxic TCells",
    "Memory TCells",
    "Effector-Memory Transition TCells"
]

# --- Mean / median / cell count of total counts per cell type (gene-level) ---
counts_by_celltype = (
    adata_g_filtered.obs
    .groupby(celltype_col)[counts_col]
    .agg(mean_counts="mean", median_counts="median", n_cells="count")
    .reindex(custom_order)
)

print(counts_by_celltype)

# --- Violin plot on an explicit figure/axes pair ---
fig, ax = plt.subplots(figsize=(10, 5))
sns.violinplot(
    data=adata_g_filtered.obs,
    x=celltype_col,
    y=counts_col,
    order=custom_order,
    palette="Set2",
    inner="box",  # adds median/IQR markers
    ax=ax,
)
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
    tick_label.set_ha("right")
ax.set_ylabel("Total Counts")
ax.set_title("Total Counts per Cell by Cell Type")
fig.tight_layout()

# --- Save figure ---
fig.savefig(output_file, dpi=300, bbox_inches="tight")

plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Config ---
celltype_col = "gen_cell_type_reannotated"   # Column in .obs with cell type labels
counts_col = "total_counts_isoforms"         # Column in .obs with total counts
output_file = "Intermediate_Files/Paper_Figs/totalcounts_by_celltype_iso.png"

# Cell types in the order used for both the summary table and the x-axis
custom_order = [
    "BCells",
    "NK Cells",
    "Monocyte-derived",
    "Megakaryocytes",
    "Effector CD4 TCells",
    "Cytotoxic TCells",
    "Memory TCells",
    "Effector-Memory Transition TCells"
]

# --- Mean / median / cell count of total counts per cell type (isoform-level) ---
counts_by_celltype = (
    adata_i_filtered.obs
    .groupby(celltype_col)[counts_col]
    .agg(mean_counts="mean", median_counts="median", n_cells="count")
    .reindex(custom_order)
)

print(counts_by_celltype)

# --- Violin plot on an explicit figure/axes pair ---
fig, ax = plt.subplots(figsize=(10, 5))
sns.violinplot(
    data=adata_i_filtered.obs,
    x=celltype_col,
    y=counts_col,
    order=custom_order,
    palette="Set2",
    inner="box",  # adds median/IQR markers
    ax=ax,
)
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
    tick_label.set_ha("right")
ax.set_ylabel("Total Counts")
ax.set_title("Total Counts per Cell by Cell Type")
fig.tight_layout()

# --- Save figure ---
fig.savefig(output_file, dpi=300, bbox_inches="tight")

plt.show()

In [None]:
import scanpy as sc
import numpy as np
import os

# --- Output location for the isoform-level marker UMAPs ---
marker_root = "Intermediate_Files/Clustering_07282025/Figures/Markers"
group_name = "Memory TCell (TCell Alone)"
output_dir = os.path.join(marker_root, "Isoform_" + group_name + "_Markers")
os.makedirs(output_dir, exist_ok=True)
# scanpy's save= option writes figures into this directory
sc.settings.figdir = output_dir

# --- helpers ---
def isoforms_for_gene(adata, gene):
    """Return the entries of ``adata.var_names`` that belong to ``gene``.

    An entry matches when it equals ``gene`` exactly or starts with
    ``"<gene>:"``. Matches are returned as a list in ``var_names`` order;
    the list is empty when nothing matches.
    """
    prefix = gene + ":"
    matches = []
    for name in adata.var_names:
        if name == gene or name.startswith(prefix):
            matches.append(name)
    return matches

def summed_gene_expr(adata, gene):
    """Sum, per cell, the ``.X`` values of every isoform found for ``gene``.

    Isoform columns are looked up with ``isoforms_for_gene``. A matrix that
    exposes ``toarray`` is densified before summing. Returns the row sums
    flattened with ``ravel()``.

    Raises:
        ValueError: if no column matches ``gene``.
    """
    cols = isoforms_for_gene(adata, gene)
    if len(cols) == 0:
        raise ValueError(f"No isoforms found for {gene} (looked for '{gene}' or '{gene}:*').")

    mat = adata[:, cols].X
    dense = mat.toarray() if hasattr(mat, "toarray") else mat
    per_cell = dense.sum(axis=1)
    return per_cell.ravel()

# --- one UMAP per gene, coloured by the per-cell sum over that gene's isoforms ---
for gene in ["GZMB", "NCAM1"]:
    expr = summed_gene_expr(adata_TCell_i, gene)

    # Colour scale tops out at the 99th percentile of expressing cells (1.0 if none)
    expressed = expr > 0
    if expressed.any():
        vmax = np.percentile(expr[expressed], 99)
    else:
        vmax = 1.0

    # scanpy colours by obs column, so park the vector in a temporary column
    key = f"__{gene}_sum"
    adata_TCell_i.obs[key] = expr

    sc.pl.umap(
        adata_TCell_i,
        color=key,
        use_raw=False,
        title=f"{gene}",
        cmap="viridis",       # sequential; expression is non-negative
        vmin=0,
        vmax=vmax,
        frameon=True,
        show=False,
        save=f"_{gene}.pdf",  # writes to sc.settings.figdir
    )

    # Remove the temporary column again
    del adata_TCell_i.obs[key]

In [None]:
import scanpy as sc
import numpy as np
import os

# --- Output location for the gene-level marker UMAPs ---
marker_root = "Intermediate_Files/Clustering_07282025/Figures/Markers"
group_name = "Memory TCell (TCell Alone)"
output_dir = os.path.join(marker_root, "Gene_" + group_name + "_Markers")
os.makedirs(output_dir, exist_ok=True)
# scanpy's save= option writes figures into this directory
sc.settings.figdir = output_dir

def find_gene_var(adata, gene):
    """Resolve ``gene`` to exactly one feature name of ``adata`` (no summing).

    Candidates are tried in this order; the first rule that applies wins:

    1. a var_name equal to ``gene`` ignoring case (first such name);
    2. a single var_name starting with ``"<gene>:"`` (case-sensitive);
    3. rows whose ``var["gene_name"]`` equals ``gene`` ignoring case - the
       only one, or the one with the highest mean in ``.X`` if several;
    4. several ``"<gene>:"`` var_names - the one with the highest mean in ``.X``.

    Raises:
        ValueError: if none of the rules yields a feature.
    """
    target = gene.upper()

    def _highest_mean(candidates):
        # Among the candidate columns, pick the one with the largest column mean
        mat = adata[:, candidates].X
        if hasattr(mat, "toarray"):
            mat = mat.toarray()
        col_means = mat.mean(axis=0)
        return candidates[int(np.argmax(col_means))]

    # Rule 1: case-insensitive exact match on var_names
    for vn in adata.var_names:
        if isinstance(vn, str) and vn.upper() == target:
            return vn

    # Rule 2: unique combined ID such as 'GZMB:...'
    prefix = f"{gene}:"
    prefixed = [vn for vn in adata.var_names if isinstance(vn, str) and vn.startswith(prefix)]
    if len(prefixed) == 1:
        return prefixed[0]

    # Rule 3: lookup through var['gene_name']
    if "gene_name" in adata.var.columns:
        hits = adata.var.index[adata.var["gene_name"].astype(str).str.upper() == target].tolist()
        if len(hits) == 1:
            return hits[0]
        if len(hits) > 1:
            return _highest_mean(hits)

    # Rule 4: several 'GENE:*' candidates
    if len(prefixed) > 1:
        return _highest_mean(prefixed)

    raise ValueError(f"Could not map gene '{gene}' to a single feature in var_names or var['gene_name'].")

# --- one UMAP per gene, using the single feature chosen by find_gene_var ---
for gene in ["GZMB", "NCAM1"]:
    var_key = find_gene_var(adata_TCell, gene)

    # Flat per-cell expression vector (densified when .X exposes toarray)
    expr = adata_TCell[:, var_key].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().ravel()
    else:
        expr = np.ravel(expr)

    # Colour scale tops out at the 99th percentile of expressing cells (1.0 if none)
    expressed = expr > 0
    if expressed.any():
        vmax = np.percentile(expr[expressed], 99)
    else:
        vmax = 1.0

    # scanpy colours by obs column, so park the vector in a temporary column
    key = f"__{gene}"
    adata_TCell.obs[key] = expr
    sc.pl.umap(
        adata_TCell,
        color=key,
        use_raw=False,
        title=gene,
        cmap="viridis",
        vmin=0,
        vmax=vmax,
        frameon=True,
        size=6,
        legend_loc=None,
        show=False,
        save=f"_{gene}.pdf",
    )
    # Remove the temporary column again
    del adata_TCell.obs[key]

In [None]:
import scanpy as sc

import pandas as pd
from scipy.stats import spearmanr

group1 = "Memory TCells"
group2 = "Effector-Memory Transition TCells"

# Keep only the cells of the two subtypes being compared
in_pair = adata_TCell_i.obs["gen_cell_type_reannotated"].isin([group1, group2])
adata_sub = adata_TCell_i[in_pair].copy()

# Dense cells x features table of .X, plus the subtype label of every cell
X = adata_sub.X
if hasattr(X, "toarray"):
    X = X.toarray()
df_expr = pd.DataFrame(X, index=adata_sub.obs_names, columns=adata_sub.var_names)
df_expr["group"] = adata_sub.obs["gen_cell_type_reannotated"].values

# Features x groups table of mean expression
group_means = df_expr.groupby("group").mean()
mean_expr = group_means.T

# Spearman correlation of the two mean-expression profiles
rho, pval = spearmanr(mean_expr[group1], mean_expr[group2])
print(f"Spearman correlation between {group1} and {group2}: ρ = {rho:.3f}, p = {pval:.3g}")

In [None]:
import scanpy as sc

import numpy as np

# Subtypes to compare: group1 is the reference, group2 is tested against it
group1 = "Memory TCells"
group2 = "Effector-Memory Transition TCells"

# Store log1p of the "counts" layer as its own layer for the test below
adata_TCell_i.layers["log1p_counts"] = np.log1p(adata_TCell_i.layers["counts"])

# Cells of the two subtypes only
pair_mask = adata_TCell_i.obs["gen_cell_type_reannotated"].isin([group1, group2])
adata_sub = adata_TCell_i[pair_mask].copy()

# Short labels so the downstream cell can ask for "group2" regardless of the subtype names
comparison_labels = {group1: "group1", group2: "group2"}
adata_sub.obs["comparison_group"] = adata_sub.obs["gen_cell_type_reannotated"].map(comparison_labels)

# Wilcoxon test of group2 against group1 on the log1p-transformed layer
sc.tl.rank_genes_groups(
    adata_sub,
    groupby="comparison_group",
    reference="group1",     # baseline
    method="wilcoxon",
    use_raw=False,
    layer="log1p_counts",   # log1p-transformed counts, not the raw counts
    pts=True,
)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from adjustText import adjust_text

# Differential-expression table for group2 (tested against the reference group1)
de_results = sc.get.rank_genes_groups_df(adata_sub, group="group2")

# -log10 adjusted p-value; exact zeros are floored at 1e-300 to avoid infinities
de_results["-log10(pval)"] = -np.log10(de_results["pvals_adj"].replace(0, 1e-300))

# Significant = adjusted p < 0.05 AND |log fold change| >= 1
de_results["significant"] = (de_results["pvals_adj"] < 0.05) & (de_results["logfoldchanges"].abs() >= 1)

# Filter to significant genes only
sig_genes = de_results[de_results["significant"]].copy()

# Select top N genes by adjusted p-value
top_n = 20
top_genes = sig_genes.nsmallest(top_n, "pvals_adj")

# Start plot
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=de_results,
    x="logfoldchanges",
    y="-log10(pval)",
    hue="significant",
    palette={True: "red", False: "gray"},
    edgecolor=None,
    alpha=0.7
)

# Fold-change threshold lines
plt.axvline(1, color="blue", linestyle="--", lw=1)
plt.axvline(-1, color="blue", linestyle="--", lw=1)

# Add labels for top genes
texts = []
for _, row in top_genes.iterrows():
    texts.append(
        plt.text(
            row["logfoldchanges"],
            row["-log10(pval)"],
            row["names"],
            fontsize=8,
            ha='right' if row["logfoldchanges"] < 0 else 'left'
        )
    )

# Run the label de-overlap exactly once (a second pass would repeat the work
# and draw a second set of arrows), and only when there is something to label.
if texts:
    adjust_text(texts, arrowprops=dict(arrowstyle="-", lw=0.5))

plt.title("Volcano Plot:Memory TCells vs Effector-Memory Transition TCells")
plt.xlabel("Log2 Fold Change")
plt.ylabel("-log10 Adjusted p-value")
# The hue flag combines both thresholds, so the legend title states both
plt.legend(title="FDR < 0.05 & |log2FC| >= 1", loc="center left")
plt.tight_layout()
plt.savefig("Intermediate_Files/Paper_Figs/volcano_iso_Memory_vs_Transition.pdf", dpi=600)
plt.show()

In [None]:
import scanpy as sc

import pandas as pd
from scipy.stats import spearmanr

group1 = "Effector CD4 TCells"
group2 = "Effector-Memory Transition TCells"

# Keep only the cells of the two subtypes being compared
in_pair = adata_TCell_i.obs["gen_cell_type_reannotated"].isin([group1, group2])
adata_sub = adata_TCell_i[in_pair].copy()

# Dense cells x features table of .X, plus the subtype label of every cell
X = adata_sub.X
if hasattr(X, "toarray"):
    X = X.toarray()
df_expr = pd.DataFrame(X, index=adata_sub.obs_names, columns=adata_sub.var_names)
df_expr["group"] = adata_sub.obs["gen_cell_type_reannotated"].values

# Features x groups table of mean expression
group_means = df_expr.groupby("group").mean()
mean_expr = group_means.T

# Spearman correlation of the two mean-expression profiles
rho, pval = spearmanr(mean_expr[group1], mean_expr[group2])
print(f"Spearman correlation between {group1} and {group2}: ρ = {rho:.3f}, p = {pval:.3g}")

In [None]:
import scanpy as sc

import numpy as np

# Subtypes to compare: group1 is the reference, group2 is tested against it
group1 = "Effector CD4 TCells"
group2 = "Effector-Memory Transition TCells"

# Store log1p of the "counts" layer as its own layer for the test below
adata_TCell_i.layers["log1p_counts"] = np.log1p(adata_TCell_i.layers["counts"])

# Cells of the two subtypes only
pair_mask = adata_TCell_i.obs["gen_cell_type_reannotated"].isin([group1, group2])
adata_sub = adata_TCell_i[pair_mask].copy()

# Short labels so the downstream cell can ask for "group2" regardless of the subtype names
comparison_labels = {group1: "group1", group2: "group2"}
adata_sub.obs["comparison_group"] = adata_sub.obs["gen_cell_type_reannotated"].map(comparison_labels)

# Wilcoxon test of group2 against group1 on the log1p-transformed layer
sc.tl.rank_genes_groups(
    adata_sub,
    groupby="comparison_group",
    reference="group1",     # baseline
    method="wilcoxon",
    use_raw=False,
    layer="log1p_counts",   # log1p-transformed counts, not the raw counts
    pts=True,
)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from adjustText import adjust_text

# Differential-expression table for "group2" from the rank_genes_groups results on adata_sub.
de_results = sc.get.rank_genes_groups_df(adata_sub, group="group2")

# -log10 of the adjusted p-value; exact zeros are replaced by 1e-300 so the log stays finite.
de_results["-log10(pval)"] = -np.log10(de_results["pvals_adj"].replace(0, 1e-300))

# A hit must pass both cut-offs: adjusted p < 0.05 and |log fold change| >= 1.
de_results["significant"] = (de_results["pvals_adj"] < 0.05) & (abs(de_results["logfoldchanges"]) >= 1)

# Significant rows only, then the top_n with the smallest adjusted p-values get a label.
sig_genes = de_results[de_results["significant"]].copy()
top_n = 20
top_genes = sig_genes.nsmallest(top_n, "pvals_adj")

# Scatter of all features, coloured by the combined significance flag.
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=de_results,
    x="logfoldchanges",
    y="-log10(pval)",
    hue="significant",
    palette={True: "red", False: "gray"},
    edgecolor=None,
    alpha=0.7
)

# Vertical guides at the |log fold change| = 1 threshold.
plt.axvline(1, color="blue", linestyle="--", lw=1)
plt.axvline(-1, color="blue", linestyle="--", lw=1)

# Label the top features; anchor text away from the axis centre.
texts = []
for _, row in top_genes.iterrows():
    texts.append(
        plt.text(
            row["logfoldchanges"],
            row["-log10(pval)"],
            row["names"],
            fontsize=8,
            ha='right' if row["logfoldchanges"] < 0 else 'left'
        )
    )

# De-overlap the labels exactly once: the original called adjust_text twice on the
# same Text objects, which repeats the layout and may draw a second set of connector
# lines. Skipped when no feature passed the significance filter.
if texts:
    adjust_text(texts, arrowprops=dict(arrowstyle="-", lw=0.5))

plt.title("Volcano Plot: Effector CD4 TCells vs Effector-Memory Transition TCells")
plt.xlabel("Log2 Fold Change")
plt.ylabel("-log10 Adjusted p-value")
# NOTE(review): the hue encodes FDR < 0.05 AND |log fold change| >= 1, not FDR alone;
# consider updating the legend title to say so.
plt.legend(title="FDR < 0.05", loc="lower left")
plt.tight_layout()
plt.savefig("Intermediate_Files/Paper_Figs/volcano_iso_EffectorCD4_vs_Transition.pdf", dpi=600)
plt.show()

In [None]:
import scanpy as sc

import pandas as pd
from scipy.stats import spearmanr

# Populations compared at the gene level (labels in obs["gen_cell_type_reannotated"]).
group1, group2 = "Memory TCells", "Effector-Memory Transition TCells"

# Keep only the cells annotated with one of the two labels.
in_pair = adata_TCell.obs["gen_cell_type_reannotated"].isin([group1, group2])
adata_sub = adata_TCell[in_pair].copy()

# Wrap .X in a cells x features frame, densifying first when .X exposes ``toarray``.
X = adata_sub.X
if hasattr(X, "toarray"):
    X = X.toarray()
df_expr = pd.DataFrame(X, index=adata_sub.obs_names, columns=adata_sub.var_names)
df_expr["group"] = adata_sub.obs["gen_cell_type_reannotated"].values

# Per-group mean of every feature, transposed so that each group is a column.
mean_expr = df_expr.groupby("group").mean().T

# Rank correlation between the two mean profiles across all features.
rho, pval = spearmanr(mean_expr[group1], mean_expr[group2])
print(f"Spearman correlation between {group1} and {group2}: ρ = {rho:.3f}, p = {pval:.3g}")

In [None]:
import scanpy as sc

import numpy as np

# group1 is the reference population, group2 the population tested against it.
group1 = "Memory TCells"
group2 = "Effector-Memory Transition TCells"

# Store log1p of the "counts" layer on the parent object so the subset inherits it.
adata_TCell.layers["log1p_counts"] = np.log1p(adata_TCell.layers["counts"])

# Copy out the cells belonging to either population.
keep_cells = adata_TCell.obs["gen_cell_type_reannotated"].isin([group1, group2])
adata_sub = adata_TCell[keep_cells].copy()

# Relabel the two populations with the generic names used in the test below.
label_map = {group1: "group1", group2: "group2"}
adata_sub.obs["comparison_group"] = adata_sub.obs["gen_cell_type_reannotated"].map(label_map)

# Wilcoxon test on the log1p-transformed counts layer (not the raw counts themselves).
sc.tl.rank_genes_groups(
    adata_sub,
    groupby="comparison_group",
    reference="group1",
    method="wilcoxon",
    layer="log1p_counts",
    use_raw=False,
    pts=True,
)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from adjustText import adjust_text

# Differential-expression table for "group2" from the rank_genes_groups results on adata_sub.
de_results = sc.get.rank_genes_groups_df(adata_sub, group="group2")

# -log10 of the adjusted p-value; exact zeros are replaced by 1e-300 so the log stays finite.
de_results["-log10(pval)"] = -np.log10(de_results["pvals_adj"].replace(0, 1e-300))

# A hit must pass both cut-offs: adjusted p < 0.05 and |log fold change| >= 1.
de_results["significant"] = (de_results["pvals_adj"] < 0.05) & (abs(de_results["logfoldchanges"]) >= 1)

# Significant rows only, then the top_n with the smallest adjusted p-values get a label.
sig_genes = de_results[de_results["significant"]].copy()
top_n = 20
top_genes = sig_genes.nsmallest(top_n, "pvals_adj")

# Scatter of all features, coloured by the combined significance flag.
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=de_results,
    x="logfoldchanges",
    y="-log10(pval)",
    hue="significant",
    palette={True: "red", False: "gray"},
    edgecolor=None,
    alpha=0.7
)

# Vertical guides at the |log fold change| = 1 threshold.
plt.axvline(1, color="blue", linestyle="--", lw=1)
plt.axvline(-1, color="blue", linestyle="--", lw=1)

# Label the top features; anchor text away from the axis centre.
texts = []
for _, row in top_genes.iterrows():
    texts.append(
        plt.text(
            row["logfoldchanges"],
            row["-log10(pval)"],
            row["names"],
            fontsize=8,
            ha='right' if row["logfoldchanges"] < 0 else 'left'
        )
    )

# De-overlap the labels exactly once: the original called adjust_text twice on the
# same Text objects, which repeats the layout and may draw a second set of connector
# lines. Skipped when no feature passed the significance filter.
if texts:
    adjust_text(texts, arrowprops=dict(arrowstyle="-", lw=0.5))

plt.title("Volcano Plot: Memory TCells vs Effector-Memory Transition TCells")
plt.xlabel("Log2 Fold Change")
plt.ylabel("-log10 Adjusted p-value")
# NOTE(review): the hue encodes FDR < 0.05 AND |log fold change| >= 1, not FDR alone;
# consider updating the legend title to say so.
plt.legend(title="FDR < 0.05", loc="center left")
plt.tight_layout()
plt.savefig("Intermediate_Files/Paper_Figs/volcano_gene_Memory_vs_Transition.pdf", dpi=600)
plt.show()

In [None]:
import scanpy as sc

import pandas as pd
from scipy.stats import spearmanr

# Populations compared at the gene level (labels in obs["gen_cell_type_reannotated"]).
group1, group2 = "Effector CD4 TCells", "Effector-Memory Transition TCells"

# Keep only the cells annotated with one of the two labels.
in_pair = adata_TCell.obs["gen_cell_type_reannotated"].isin([group1, group2])
adata_sub = adata_TCell[in_pair].copy()

# Wrap .X in a cells x features frame, densifying first when .X exposes ``toarray``.
X = adata_sub.X
if hasattr(X, "toarray"):
    X = X.toarray()
df_expr = pd.DataFrame(X, index=adata_sub.obs_names, columns=adata_sub.var_names)
df_expr["group"] = adata_sub.obs["gen_cell_type_reannotated"].values

# Per-group mean of every feature, transposed so that each group is a column.
mean_expr = df_expr.groupby("group").mean().T

# Rank correlation between the two mean profiles across all features.
rho, pval = spearmanr(mean_expr[group1], mean_expr[group2])
print(f"Spearman correlation between {group1} and {group2}: ρ = {rho:.3f}, p = {pval:.3g}")

In [None]:
import scanpy as sc

import numpy as np

# group1 is the reference population, group2 the population tested against it.
group1 = "Effector CD4 TCells"
group2 = "Effector-Memory Transition TCells"

# Store log1p of the "counts" layer on the parent object so the subset inherits it.
adata_TCell.layers["log1p_counts"] = np.log1p(adata_TCell.layers["counts"])

# Copy out the cells belonging to either population.
keep_cells = adata_TCell.obs["gen_cell_type_reannotated"].isin([group1, group2])
adata_sub = adata_TCell[keep_cells].copy()

# Relabel the two populations with the generic names used in the test below.
label_map = {group1: "group1", group2: "group2"}
adata_sub.obs["comparison_group"] = adata_sub.obs["gen_cell_type_reannotated"].map(label_map)

# Wilcoxon test on the log1p-transformed counts layer (not the raw counts themselves).
sc.tl.rank_genes_groups(
    adata_sub,
    groupby="comparison_group",
    reference="group1",
    method="wilcoxon",
    layer="log1p_counts",
    use_raw=False,
    pts=True,
)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from adjustText import adjust_text

# Differential-expression table for "group2" from the rank_genes_groups results on adata_sub.
de_results = sc.get.rank_genes_groups_df(adata_sub, group="group2")

# -log10 of the adjusted p-value; exact zeros are replaced by 1e-300 so the log stays finite.
de_results["-log10(pval)"] = -np.log10(de_results["pvals_adj"].replace(0, 1e-300))

# A hit must pass both cut-offs: adjusted p < 0.05 and |log fold change| >= 1.
de_results["significant"] = (de_results["pvals_adj"] < 0.05) & (abs(de_results["logfoldchanges"]) >= 1)

# Significant rows only, then the top_n with the smallest adjusted p-values get a label.
sig_genes = de_results[de_results["significant"]].copy()
top_n = 20
top_genes = sig_genes.nsmallest(top_n, "pvals_adj")

# Scatter of all features, coloured by the combined significance flag.
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=de_results,
    x="logfoldchanges",
    y="-log10(pval)",
    hue="significant",
    palette={True: "red", False: "gray"},
    edgecolor=None,
    alpha=0.7
)

# Vertical guides at the |log fold change| = 1 threshold.
plt.axvline(1, color="blue", linestyle="--", lw=1)
plt.axvline(-1, color="blue", linestyle="--", lw=1)

# Label the top features; anchor text away from the axis centre.
texts = []
for _, row in top_genes.iterrows():
    texts.append(
        plt.text(
            row["logfoldchanges"],
            row["-log10(pval)"],
            row["names"],
            fontsize=8,
            ha='right' if row["logfoldchanges"] < 0 else 'left'
        )
    )

# De-overlap the labels exactly once: the original called adjust_text twice on the
# same Text objects, which repeats the layout and may draw a second set of connector
# lines. Skipped when no feature passed the significance filter.
if texts:
    adjust_text(texts, arrowprops=dict(arrowstyle="-", lw=0.5))

plt.title("Volcano Plot: Effector CD4 TCells vs Effector-Memory Transition TCells")
plt.xlabel("Log2 Fold Change")
plt.ylabel("-log10 Adjusted p-value")
# NOTE(review): the hue encodes FDR < 0.05 AND |log fold change| >= 1, not FDR alone;
# consider updating the legend title to say so.
plt.legend(title="FDR < 0.05", loc="center left")
plt.tight_layout()
plt.savefig("Intermediate_Files/Paper_Figs/volcano_gene_EffectorCD4_vs_Transition.pdf", dpi=600)
plt.show()

In [None]:
# Rename obs["gen_cell_type_reannotated"] to obs["sub_cell_type"] on both objects.
# Guarded so that re-running this cell is a no-op: an unconditional ``pop`` raises
# KeyError once the old column has already been moved.
for _adata in (adata_i_filtered, adata_g_filtered):
    if "gen_cell_type_reannotated" in _adata.obs.columns:
        _adata.obs["sub_cell_type"] = _adata.obs.pop("gen_cell_type_reannotated")
    elif "sub_cell_type" not in _adata.obs.columns:
        # Neither the source nor the target column exists: fail loudly here rather
        # than in a later cell that reads obs["sub_cell_type"].
        raise KeyError("neither 'gen_cell_type_reannotated' nor 'sub_cell_type' found in .obs")

In [None]:
# Write the isoform- and gene-level objects to the clustering output directory.
# NOTE(review): the files carry an .h5mu extension but are read back below with
# scanpy's read_h5ad — confirm the extension is intentional.
output_dir = 'Intermediate_Files/Clustering_07282025/'

iso_out_path = os.path.join(output_dir, "PBMC_iso_AutoZI_clustered_celltypes_reannotated_AutoZILatent_08132025.h5mu")
gene_out_path = os.path.join(output_dir, "PBMC_gene_AutoZI_clustered_celltypes_reannotated_AutoZILatent_08132025.h5mu")

adata_i_filtered.write(iso_out_path)
adata_g_filtered.write(gene_out_path)

In [None]:
from scanpy import read_h5ad
import os

output_dir = 'Intermediate_Files/Clustering_07282025/'

# Reload the gene- and isoform-level objects from disk with scanpy's read_h5ad.
gene_in_path = os.path.join(output_dir, "PBMC_gene_AutoZI_clustered_celltypes_reannotated_AutoZILatent_08132025.h5mu")
iso_in_path = os.path.join(output_dir, "PBMC_iso_AutoZI_clustered_celltypes_reannotated_AutoZILatent_08132025.h5mu")

adata_g_filtered = read_h5ad(gene_in_path)
adata_i_filtered = read_h5ad(iso_in_path)

In [None]:
def find_matching_genes(gene_name, adata):
    """Return the entries of ``adata.var_names`` that contain ``gene_name``.

    The comparison is a case-insensitive substring test, and matches are
    returned in ``var_names`` order.

    NOTE: a later cell redefines ``find_matching_genes`` with a
    ``(prefixes, gene_list)`` signature; that definition replaces this one
    once its cell has run.
    """
    matches = []
    for var_name in adata.var_names:
        if gene_name.upper() in var_name.upper():
            matches.append(var_name)
    return matches

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np

# Features whose identifier contains "Bambu" (substring match anywhere in the name).
bambu_genes = [name for name in adata_g_filtered.var_names if "Bambu" in name]
bambu_isos = [name for name in adata_i_filtered.var_names if "Bambu" in name]

# Independent copies restricted to those features.
adata_g_bambu = adata_g_filtered[:, bambu_genes].copy()
adata_i_bambu = adata_i_filtered[:, bambu_isos].copy()

# .X as DataFrames (cells as rows, features as columns).
df_expr_g = adata_g_bambu.to_df()
df_expr_i = adata_i_bambu.to_df()

# Attach both annotation levels so the frames can be grouped by cell type.
for _label in ("gen_cell_type", "sub_cell_type"):
    df_expr_g[_label] = adata_g_bambu.obs[_label].values
    df_expr_i[_label] = adata_i_bambu.obs[_label].values

# Numeric columns only: selects the expression columns for the aggregations below.
numeric_cols_g = df_expr_g.select_dtypes(include=[np.number]).columns
numeric_cols_i = df_expr_i.select_dtypes(include=[np.number]).columns

# Per-group MEDIAN of each feature (the variable names say "avg", the statistic is the median).
_by_gen_g = df_expr_g.groupby("gen_cell_type")
_by_gen_i = df_expr_i.groupby("gen_cell_type")
_by_sub_g = df_expr_g.groupby("sub_cell_type")
_by_sub_i = df_expr_i.groupby("sub_cell_type")
avg_expr_by_cluster_gen_g = _by_gen_g[numeric_cols_g].median()
avg_expr_by_cluster_gen_i = _by_gen_i[numeric_cols_i].median()
avg_expr_by_cluster_sub_g = _by_sub_g[numeric_cols_g].median()
avg_expr_by_cluster_sub_i = _by_sub_i[numeric_cols_i].median()

In [None]:
# Rebind bambu_genes to a sorted copy of the "Bambu"-containing gene identifiers and list them.
bambu_genes = sorted(bambu_genes)
print(bambu_genes)

In [None]:
# Rebind bambu_isos to a sorted copy of the "Bambu"-containing isoform identifiers and list them.
bambu_isos = sorted(bambu_isos)
print(bambu_isos)

In [None]:
# Heading and table in one call; sep="\n" keeps them on separate lines.
print("Median expression of 'Bambu' genes by cell-type (gene-level):", avg_expr_by_cluster_gen_g, sep="\n")

In [None]:
# Heading and table in one call; sep="\n" keeps them on separate lines.
print("Median expression of 'Bambu' genes by cell-type (iso-level):", avg_expr_by_cluster_gen_i, sep="\n")

In [None]:
# Heading and table in one call; sep="\n" keeps them on separate lines.
print("Median expression of 'Bambu' genes by sub-cell-type (gene-level):", avg_expr_by_cluster_sub_g, sep="\n")

In [None]:
# Heading and table in one call; sep="\n" keeps them on separate lines.
print("Median expression of 'Bambu' genes by sub-cell-type (iso-level):", avg_expr_by_cluster_sub_i, sep="\n")

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np

# Per-feature totals of the df_expr_* values within each general cell type ...
_gen_groups_g = df_expr_g.groupby("gen_cell_type")
_gen_groups_i = df_expr_i.groupby("gen_cell_type")
sum_expr_by_cluster_gen_g = _gen_groups_g[numeric_cols_g].sum()
sum_expr_by_cluster_gen_i = _gen_groups_i[numeric_cols_i].sum()

# ... and within each sub cell type.
_sub_groups_g = df_expr_g.groupby("sub_cell_type")
_sub_groups_i = df_expr_i.groupby("sub_cell_type")
sum_expr_by_cluster_sub_g = _sub_groups_g[numeric_cols_g].sum()
sum_expr_by_cluster_sub_i = _sub_groups_i[numeric_cols_i].sum()

In [None]:
# Heading and table in one call; sep="\n" keeps them on separate lines.
print("Sum counts of 'Bambu' genes by general cell type (gene-level):", sum_expr_by_cluster_gen_g, sep="\n")

In [None]:
# Heading and table in one call; sep="\n" keeps them on separate lines.
print("Sum counts of 'Bambu' genes by general cell type (iso-level):", sum_expr_by_cluster_gen_i, sep="\n")

In [None]:
# Heading (with its leading blank line) and table in one call.
print("\nSum counts of 'Bambu' genes by sub cell type (gene-level):", sum_expr_by_cluster_sub_g, sep="\n")

In [None]:
# Heading (with its leading blank line) and table in one call.
print("\nSum counts of 'Bambu' genes by sub cell type (isoform-level):", sum_expr_by_cluster_sub_i, sep="\n")

In [None]:
import numpy as np
import pandas as pd

# Gene-level data: rank the features of df_expr_g by their per-cell-type median.

# The numeric columns are the expression features.
numeric_cols_g = list(df_expr_g.select_dtypes(include=[np.number]).columns)

# Median of each feature within every general cell type (rows: cell types).
median_expr_by_cluster_g = df_expr_g.groupby("gen_cell_type")[numeric_cols_g].median()

# For every cell type keep the names of the ten features with the largest median.
top10_genes_by_cluster = {}
for cluster in median_expr_by_cluster_g.index:
    cluster_medians = median_expr_by_cluster_g.loc[cluster]
    sorted_genes = cluster_medians.sort_values(ascending=False)
    top10_genes = sorted_genes.head(10).index.tolist()
    top10_genes_by_cluster[cluster] = top10_genes

print("Top 10 genes per cluster (gene-level data) based on median expression:")
for cluster in top10_genes_by_cluster:
    genes = top10_genes_by_cluster[cluster]
    print(f"Cluster {cluster}: {genes}")

In [None]:
# Isoform-level data: rank the features of df_expr_i by their per-cell-type median.

# The numeric columns are the expression features.
numeric_cols_i = list(df_expr_i.select_dtypes(include=[np.number]).columns)

# Median of each isoform within every general cell type (rows: cell types).
median_expr_by_cluster_i = df_expr_i.groupby("gen_cell_type")[numeric_cols_i].median()

# For every cell type keep the names of the ten isoforms with the largest median.
top10_isos_by_cluster = {}
for cluster in median_expr_by_cluster_i.index:
    cluster_medians = median_expr_by_cluster_i.loc[cluster]
    sorted_isos = cluster_medians.sort_values(ascending=False)
    top10_isos = sorted_isos.head(10).index.tolist()
    top10_isos_by_cluster[cluster] = top10_isos

print("\nTop 10 isoforms per cluster (isoform-level data) based on median expression:")
for cluster in top10_isos_by_cluster:
    isos = top10_isos_by_cluster[cluster]
    print(f"Cluster {cluster}: {isos}")

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import numpy as np
import os
import re

tx_to_plot = bambu_isos  # "Bambu"-containing isoform identifiers collected in an earlier cell
outdir = "Intermediate_Files/Clustering_07282025/Figures/Bambu_Transcripts"
os.makedirs(outdir, exist_ok=True)


def _to_1d(x):
    """Return ``x`` as a flat 1-D array, densifying first when it exposes ``toarray``."""
    dense = x.toarray() if hasattr(x, "toarray") else x
    return np.ravel(dense)


def _safe(name):
    """Return ``str(name)`` with ':', '|', '/', backslash and whitespace replaced by '_'."""
    return re.sub(r"[:|/\\\s]", "_", str(name))


# One UMAP per transcript: saved as PDF, displayed, then closed.
for tx in tx_to_plot:
    if tx not in adata_i_filtered.var_names:
        print(f"{tx} not found, skipping.")
        continue

    expr = _to_1d(adata_i_filtered[:, tx].X)

    # Colour limits are symmetric around 0 at +/- the largest non-zero value
    # (100th percentile = maximum); 1.0 is used when every value is zero.
    expr_nonzero = expr[expr != 0]
    if expr_nonzero.size > 0:
        vmax_val = np.percentile(expr_nonzero, 100)
    else:
        vmax_val = 1.0
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    # show=False plus return_fig=True hands back the figure so it can be saved first.
    fig = sc.pl.embedding(
        adata_i_filtered,
        basis="umap",
        color=tx,
        title=tx,
        use_raw=False,
        cmap="coolwarm",
        norm=norm,
        frameon=False,
        show=False,
        return_fig=True,
    )

    fig_path = os.path.join(outdir, f"{_safe(tx)}_UMAP.pdf")
    fig.savefig(fig_path, dpi=300)

    # Display, then close so figures do not accumulate across iterations.
    plt.show()
    plt.close(fig)

    print(f"Saved & displayed: {fig_path}")

In [None]:
import os
import re
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm

# --- config ---
OUTDIR = "Intermediate_Files/Clustering_07282025/Figures/SUPT3H_Isoforms"
os.makedirs(OUTDIR, exist_ok=True)

# Isoforms whose identifier contains the Ensembl gene ID used here for SUPT3H.
SUPT3H_isos = [name for name in adata_i_filtered.var_names if "ENSG00000196284" in name]
if len(SUPT3H_isos) == 0:
    raise ValueError("No SUPT3H isoforms (ENSG00000196284) found in adata_i_filtered.var_names")

print(f"Found {len(SUPT3H_isos)} SUPT3H isoforms")


def _to_1d(x):
    """Return ``x`` as a flat 1-D array, densifying first when it exposes ``toarray``."""
    dense = x.toarray() if hasattr(x, "toarray") else x
    return np.ravel(dense)


def _safe(name):
    """Return ``str(name)`` with ':', '|', '/', backslash and whitespace replaced by '_'."""
    return re.sub(r"[:|/\\\s]", "_", str(name))


# One UMAP per isoform: saved as PDF, displayed, then closed.
for iso in SUPT3H_isos:
    expr = _to_1d(adata_i_filtered[:, iso].X)

    # Colour scale centred on the mean over all cells, with limits at +/- the largest
    # non-zero value (100th percentile = maximum, not the 99th); 1.0 when all values are zero.
    expr_mean = np.mean(expr)
    expr_nonzero = expr[expr != 0]
    if expr_nonzero.size > 0:
        vmax_val = np.percentile(expr_nonzero, 100)
    else:
        vmax_val = 1.0

    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=expr_mean, vmax=vmax_val)

    fig = sc.pl.umap(
        adata_i_filtered,
        color=iso,
        title=f"{iso}",
        use_raw=False,
        cmap="coolwarm",
        norm=norm,
        frameon=True,
        show=False,
        return_fig=True,
    )

    fpath = os.path.join(OUTDIR, f"{_safe(iso)}_UMAP.pdf")
    fig.savefig(fpath, dpi=300)
    plt.show()
    plt.close(fig)

    print(f"Saved & displayed: {fpath}")

In [None]:
import os
import re
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm

# --- config ---
OUTDIR = "Intermediate_Files/Clustering_07282025/Figures/ICA1_Isoforms"
os.makedirs(OUTDIR, exist_ok=True)

# Isoforms whose identifier contains the Ensembl gene ID used here for ICA1.
ICA1_isos = [name for name in adata_i_filtered.var_names if "ENSG00000003147" in name]
if len(ICA1_isos) == 0:
    raise ValueError("No ICA1 isoforms (ENSG00000003147) found in adata_i_filtered.var_names")

print(f"Found {len(ICA1_isos)} ENSG00000003147 isoforms")


def _to_1d(x):
    """Return ``x`` as a flat 1-D array, densifying first when it exposes ``toarray``."""
    dense = x.toarray() if hasattr(x, "toarray") else x
    return np.ravel(dense)


def _safe(name):
    """Return ``str(name)`` with ':', '|', '/', backslash and whitespace replaced by '_'."""
    return re.sub(r"[:|/\\\s]", "_", str(name))


# One UMAP per isoform: saved as PDF, displayed, then closed.
for iso in ICA1_isos:
    expr = _to_1d(adata_i_filtered[:, iso].X)

    # Colour scale centred on the mean over all cells, with limits at +/- the largest
    # non-zero value (100th percentile = maximum, not the 99th); 1.0 when all values are zero.
    expr_mean = np.mean(expr)
    expr_nonzero = expr[expr != 0]
    if expr_nonzero.size > 0:
        vmax_val = np.percentile(expr_nonzero, 100)
    else:
        vmax_val = 1.0

    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=expr_mean, vmax=vmax_val)

    fig = sc.pl.umap(
        adata_i_filtered,
        color=iso,
        title=f"{iso}",
        use_raw=False,
        cmap="coolwarm",
        norm=norm,
        frameon=True,
        show=False,
        return_fig=True,
    )

    fpath = os.path.join(OUTDIR, f"{_safe(iso)}_UMAP.pdf")
    fig.savefig(fpath, dpi=300)
    plt.show()
    plt.close(fig)

    print(f"Saved & displayed: {fpath}")

In [None]:
# Isoforms whose identifier contains ENSG00000187118 (labelled CMC1 by the variable name).
CMC1_isos = [gene for gene in adata_i_filtered.var_names if "ENSG00000187118" in gene]

# Fail with a clear message instead of letting plt.subplots reject nrows=0.
if not CMC1_isos:
    raise ValueError("No CMC1 isoforms (ENSG00000187118) found in adata_i_filtered.var_names")

# Grid size: fixed number of columns, rows by ceiling division.
n_genes = len(CMC1_isos)
ncols = 2
nrows = (n_genes + ncols - 1) // ncols

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 5, nrows * 4))
axes = axes.flatten()

# One UMAP panel per isoform, coloured by its expression (colour scale capped at 8).
for i, gene in enumerate(CMC1_isos):
    sc.pl.umap(
        adata_i_filtered, 
        color=gene, 
        use_raw=False, 
        title=f"{gene}", 
        cmap="viridis", 
        show=False, 
        ax=axes[i],
        vmax = 8
    )

# Hide unused panels; slicing by n_genes does not depend on the loop variable.
for unused_ax in axes[n_genes:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Isoforms whose identifier contains ENSG00000145220 (labelled LYAR by the variable name).
LYAR_isos = [gene for gene in adata_i_filtered.var_names if "ENSG00000145220" in gene]

# Fail with a clear message instead of letting plt.subplots reject nrows=0.
if not LYAR_isos:
    raise ValueError("No LYAR isoforms (ENSG00000145220) found in adata_i_filtered.var_names")

# Grid size: fixed number of columns, rows by ceiling division.
n_genes = len(LYAR_isos)
ncols = 2
nrows = (n_genes + ncols - 1) // ncols

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 5, nrows * 4))
axes = axes.flatten()

# One UMAP panel per isoform, coloured by its expression.
for i, gene in enumerate(LYAR_isos):
    sc.pl.umap(
        adata_i_filtered, 
        color=gene, 
        use_raw=False, 
        title=f"{gene}", 
        cmap="viridis", 
        show=False, 
        ax=axes[i],
        #vmax = 1
    )

# Hide unused panels; slicing by n_genes does not depend on the loop variable.
for unused_ax in axes[n_genes:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Isoforms whose identifier contains ENSG00000211689 (labelled TRGC1 by the variable name).
isos_TRGC1 = [gene for gene in adata_i_filtered.var_names if "ENSG00000211689" in gene]

# Fail with a clear message instead of letting plt.subplots reject nrows=0.
if not isos_TRGC1:
    raise ValueError("No TRGC1 isoforms (ENSG00000211689) found in adata_i_filtered.var_names")

# Grid size: fixed number of columns, rows by ceiling division.
n_genes = len(isos_TRGC1)
ncols = 3
nrows = (n_genes + ncols - 1) // ncols

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 5, nrows * 4))
axes = axes.flatten()

# One UMAP panel per isoform, coloured by its expression.
for i, gene in enumerate(isos_TRGC1):
    sc.pl.umap(
        adata_i_filtered, 
        color=gene, 
        use_raw=False, 
        title=f"{gene}", 
        cmap="viridis", 
        show=False, 
        ax=axes[i],
        #vmax = 1
    )

# Hide unused panels; slicing by n_genes does not depend on the loop variable.
for unused_ax in axes[n_genes:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Isoforms whose identifier contains ENSG00000227191 (labelled TRGC2 by the variable name).
isos_TRGC2 = [gene for gene in adata_i_filtered.var_names if "ENSG00000227191" in gene]

# Fail with a clear message instead of letting plt.subplots reject nrows=0.
if not isos_TRGC2:
    raise ValueError("No TRGC2 isoforms (ENSG00000227191) found in adata_i_filtered.var_names")

# Grid size: fixed number of columns, rows by ceiling division.
n_genes = len(isos_TRGC2)
ncols = 3
nrows = (n_genes + ncols - 1) // ncols

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 5, nrows * 4))
axes = axes.flatten()

# One UMAP panel per isoform, coloured by its expression.
for i, gene in enumerate(isos_TRGC2):
    sc.pl.umap(
        adata_i_filtered, 
        color=gene, 
        use_raw=False, 
        title=f"{gene}", 
        cmap="viridis", 
        show=False, 
        ax=axes[i],
        #vmax = 1
    )

# Hide unused panels; slicing by n_genes does not depend on the loop variable.
for unused_ax in axes[n_genes:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Isoforms whose identifier contains the Ensembl gene ID ENSG00000308813.
isos_308813 = [gene for gene in adata_i_filtered.var_names if "ENSG00000308813" in gene]

# Fail with a clear message instead of letting plt.subplots reject nrows=0.
if not isos_308813:
    raise ValueError("No isoforms for ENSG00000308813 found in adata_i_filtered.var_names")

# Grid size: fixed number of columns, rows by ceiling division.
n_genes = len(isos_308813)
ncols = 3
nrows = (n_genes + ncols - 1) // ncols

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 5, nrows * 4))
axes = axes.flatten()

# One UMAP panel per isoform, coloured by expression on a fixed [-2, 8] colour scale.
for i, gene in enumerate(isos_308813):
    sc.pl.umap(
        adata_i_filtered, 
        color=gene, 
        use_raw=False, 
        title=f"{gene}", 
        cmap="viridis", 
        show=False, 
        ax=axes[i],
        vmax=8,
        vmin=-2
    )

# Hide unused panels; slicing by n_genes does not depend on the loop variable.
for unused_ax in axes[n_genes:]:
    unused_ax.set_visible(False)

plt.tight_layout()

## ✅ Save figure
#plt.savefig("Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/ENSG00000308813_isoform_umaps.pdf",
#            dpi=600, transparent=True, bbox_inches="tight")

#plt.show()

In [None]:
# Isoforms whose identifier contains ENSG00000289740 (labelled TALAM1 by the variable name).
isos_TALAM1 = [gene for gene in adata_i_filtered.var_names if "ENSG00000289740" in gene]

# Fail with a clear message instead of letting plt.subplots reject nrows=0.
if not isos_TALAM1:
    raise ValueError("No TALAM1 isoforms (ENSG00000289740) found in adata_i_filtered.var_names")

# Grid size: fixed number of columns, rows by ceiling division.
n_genes = len(isos_TALAM1)
ncols = 3
nrows = (n_genes + ncols - 1) // ncols

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 5, nrows * 4))
axes = axes.flatten()

# One UMAP panel per isoform, coloured by expression on a fixed [-2, 8] colour scale.
for i, gene in enumerate(isos_TALAM1):
    sc.pl.umap(
        adata_i_filtered, 
        color=gene, 
        use_raw=False, 
        title=f"{gene}", 
        cmap="viridis", 
        show=False, 
        ax=axes[i],
        vmax=8,
        vmin=-2
    )

# Hide unused panels; slicing by n_genes does not depend on the loop variable.
for unused_ax in axes[n_genes:]:
    unused_ax.set_visible(False)

plt.tight_layout()

# ✅ Save figure
#plt.savefig("Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/TALAM1_isoform_umaps.pdf",
#            dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
# Isoforms whose identifier contains ENSG00000251562 (labelled MALAT1 by the variable name).
isos_MALAT1 = [gene for gene in adata_i_filtered.var_names if "ENSG00000251562" in gene]

# Fail with a clear message instead of letting plt.subplots reject nrows=0.
if not isos_MALAT1:
    raise ValueError("No MALAT1 isoforms (ENSG00000251562) found in adata_i_filtered.var_names")

# Grid size: fixed number of columns, rows by ceiling division.
n_genes = len(isos_MALAT1)
ncols = 3
nrows = (n_genes + ncols - 1) // ncols

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 5, nrows * 4))
axes = axes.flatten()

# One UMAP panel per isoform, coloured by expression on a fixed [-2, 8] colour scale.
for i, gene in enumerate(isos_MALAT1):
    sc.pl.umap(
        adata_i_filtered, 
        color=gene, 
        use_raw=False, 
        title=f"{gene}", 
        cmap="viridis", 
        show=False, 
        ax=axes[i],
        vmax=8,
        vmin=-2
    )

# Hide unused panels; slicing by n_genes does not depend on the loop variable.
for unused_ax in axes[n_genes:]:
    unused_ax.set_visible(False)

plt.tight_layout()

# ✅ Save figure
#plt.savefig("Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/MALAT1_isoform_umaps.pdf",
#            dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm

# Step 1: Isoforms whose identifier contains one of the two target Ensembl gene IDs
# (ENSG00000251562 and ENSG00000289740, used elsewhere in this notebook for MALAT1 / TALAM1).
target_ensgs = ["ENSG00000251562", "ENSG00000289740"]
isoform_matches = [gene for gene in adata_i_filtered.var_names if any(ensg in gene for ensg in target_ensgs)]
if not isoform_matches:
    # Without any column the clustermap below cannot be drawn; stop with a clear message.
    raise ValueError(f"No isoforms matching {target_ensgs} found in adata_i_filtered.var_names")

print(f"✅ Found {len(isoform_matches)} isoforms:")
for iso_name in isoform_matches:
    print("  ", iso_name)

# Step 2: Extract expression for the matched isoforms only.
# Slice first, then densify: converting the full cells x isoforms matrix to a dense
# array just to read a handful of columns is unnecessarily memory-hungry.
X = adata_i_filtered[:, isoform_matches].X
if hasattr(X, "toarray"):
    X = X.toarray()

df_expr = pd.DataFrame(np.asarray(X),
                       columns=isoform_matches,
                       index=adata_i_filtered.obs["sub_cell_type"])

# Step 3: Mean expression per sub cell type (rows: cell types, columns: isoforms)
mean_expr = df_expr.groupby(df_expr.index).mean()

# Step 4: Diverging colour scale centred on 0 with fixed limits of -2 / +2
# (the limits are hard-coded, not derived from the data).
norm = TwoSlopeNorm(vmin=-2, vcenter=0, vmax=2)

# Step 5: Plot — columns (isoforms) are clustered, rows keep their order.
g = sns.clustermap(mean_expr,
                   cmap="coolwarm", linewidths=0.5, linecolor="gray",
                   figsize=(len(mean_expr.columns) * 0.4 + 2, len(mean_expr.index) * 0.5 + 2),
                   col_cluster=True, row_cluster=False,
                   cbar_kws={"label": "Mean Expression"},
                   norm=norm,
                   cbar_pos=(0.09, 0.65, 0.03, 0.2))

g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), rotation=90, fontsize=8)
g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0)
g.ax_heatmap.set_xlabel("")
g.ax_heatmap.set_ylabel("")
g.ax_heatmap.set_title("", pad=20)

#g.savefig("Intermediate_Files/Paper_Figs/MALAT1_TALAM1_isoform_expression_clustermap.pdf",
#          dpi=600, bbox_inches="tight", transparent=True)
plt.show()

In [None]:
# Add a "log_raw" layer holding log1p of the "counts" layer to both objects.
for _adata in (adata_g_filtered, adata_i_filtered):
    _adata.layers["log_raw"] = np.log1p(_adata.layers["counts"])

In [None]:
# --- One independent AnnData copy per general cell type ---
# Labels present in obs['gen_cell_type'] for each object.
unique_cell_types_g = adata_g_filtered.obs['gen_cell_type'].unique()
unique_cell_types_i = adata_i_filtered.obs['gen_cell_type'].unique()

# Map each label to a copy of the cells carrying it (gene level, then isoform level).
celltype_datasets_g = {
    cell_type: adata_g_filtered[adata_g_filtered.obs['gen_cell_type'] == cell_type].copy()
    for cell_type in unique_cell_types_g
}
celltype_datasets_i = {
    cell_type: adata_i_filtered[adata_i_filtered.obs['gen_cell_type'] == cell_type].copy()
    for cell_type in unique_cell_types_i
}

# The subsets live in celltype_datasets_g / celltype_datasets_i; for example the
# T-cell expression matrix is celltype_datasets_g["TCells"].X.
# Each subset can optionally be written to disk with subset.write_h5ad(..., compression="gzip").

In [None]:
# Labels of the five general cell types that get their own named subset.
_CELL_TYPE_KEYS = ("TCells", "NK Cells", "Monocyte-derived", "BCells", "Megakaryocytes")

# Gene-level subsets, in the same order as _CELL_TYPE_KEYS.
TCell_g, NKCell_g, Monocyte_g, BCell_g, MK_g = (
    celltype_datasets_g[_key] for _key in _CELL_TYPE_KEYS
)

# Isoform-level subsets, in the same order as _CELL_TYPE_KEYS.
TCell_i, NKCell_i, Monocyte_i, BCell_i, MK_i = (
    celltype_datasets_i[_key] for _key in _CELL_TYPE_KEYS
)

In [None]:
# Marker prefixes matched against var_names with str.startswith (see find_matching_genes).
# Every prefix ends with ":" so that only the exact symbol matches: without it,
# "GATA3" would also match longer symbols that merely start with it (e.g. "GATA3-AS1").
# "CD3G", "KLRB1" and "GATA3" previously lacked the delimiter.
TCell_Markers = ["CD3D:","CD3E:","CD3G:","CD8A:","CD8B:","CCR5:","KLRB1:","GATA3:","IL2RA:","CD4:","AHR:","TNF:","SELL:","CCR7:","TCF7:"]
BCells = ["MS4A1:","CD19:", "CD79A:", "CD22:"]
NKCells = ["NKG7:", "GNLY:", "KLRD1:", "KLRF1:", "GZMB:", "NCAM1:", "ITGAM:", "IL2RB:"]
Myeloid = ["ITGAM:", "ITGAX:", "CD33:", "CD14:", "CD1C:", "HLA-DRA:", "HLA-DRB1:"]
Monocytes = ["CD14:", "LYZ:", "VCAN:", "FCN1:", "CST3:", "S100A8:", "S100A9:", "FCGR3A:", "CX3CR1:"]
DC = ["CD1C:", "CLEC10A:", "FCER1A:", "CST3:"]
pDC = ["CLEC4C:", "LILRA4:", "GZMB:", "TCL1A:"]
MK = ["PPBP:", "PF4:", "ITGA2B:", "GP1BA:", "VWF:"]



# Prefix-based lookup of features in var_names (combined_ID format).
def find_matching_genes(prefixes, gene_list):
    """Return the entries of ``gene_list`` that start with any of ``prefixes``.

    Matches keep the order of ``gene_list`` and each entry appears at most once
    per occurrence, even when several prefixes match it.

    NOTE: this replaces the earlier ``find_matching_genes(gene_name, adata)``
    definition in this notebook once the cell has run.
    """
    matched = []
    for gene in gene_list:
        for prefix in prefixes:
            if gene.startswith(prefix):
                matched.append(gene)
                break
    return matched

In [None]:
# Bare expression: the notebook displays the result of indexing the gene-level T-cell subset with 0.
TCell_g[0]

In [None]:
# Bare expression: the notebook displays the result of indexing the isoform-level T-cell subset with 0.
TCell_i[0]

In [None]:
# Feature names of the two full objects, looked up once.
_gene_var_names = adata_g_filtered.var_names
_iso_var_names = adata_i_filtered.var_names

# Gene-level features starting with each marker-prefix list.
TCell_genes = find_matching_genes(TCell_Markers, _gene_var_names)
NK_genes = find_matching_genes(NKCells, _gene_var_names)
Myeloid_genes = find_matching_genes(Myeloid, _gene_var_names)
Monocyte_genes = find_matching_genes(Monocytes, _gene_var_names)
BCell_genes = find_matching_genes(BCells, _gene_var_names)
DC_genes = find_matching_genes(DC, _gene_var_names)
pDC_genes = find_matching_genes(pDC, _gene_var_names)
MK_genes = find_matching_genes(MK, _gene_var_names)

# Isoform-level features starting with the same prefix lists.
TCell_isos = find_matching_genes(TCell_Markers, _iso_var_names)
NK_isos = find_matching_genes(NKCells, _iso_var_names)
Myeloid_isos = find_matching_genes(Myeloid, _iso_var_names)
Monocyte_isos = find_matching_genes(Monocytes, _iso_var_names)
BCell_isos = find_matching_genes(BCells, _iso_var_names)
DC_isos = find_matching_genes(DC, _iso_var_names)
pDC_isos = find_matching_genes(pDC, _iso_var_names)
MK_isos = find_matching_genes(MK, _iso_var_names)

In [None]:
from scipy.stats import mode

# Per-gene summary statistics for the T-cell gene-level data, restricted to
# the matched T-cell marker genes (TCell_genes).
TCell_g_subset = TCell_g[:, TCell_genes].copy()

# Expression values are read from the "log_raw" layer.
X = TCell_g_subset.layers["log_raw"]

# Sparse matrices expose .toarray(); densify so the NumPy reductions below
# work on a plain (n_cells, n_genes) array.
if hasattr(X, "toarray"):
    X = X.toarray()

# Reduce across cells (axis=0) to get one value per gene.
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)
gene_mode_result = mode(X, axis=0)
# scipy.stats.mode returns shape (1, n_genes) on older SciPy and (n_genes,) on
# SciPy >= 1.11; `.mode[0]` would then keep only the first gene's value, so
# flatten to one mode per gene regardless of version.
gene_mode = np.ravel(gene_mode_result.mode)

# One row per gene.
TCell_summary_df_g = pd.DataFrame({
    "gene": TCell_g_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(TCell_summary_df_g)

In [None]:
from scipy.stats import mode

# Per-isoform summary statistics for the T-cell isoform-level data, restricted
# to the isoforms matched by the T-cell marker prefixes (TCell_isos).
TCell_i_subset = TCell_i[:, TCell_isos].copy()

# Expression values are read from the "log_raw" layer.
X = TCell_i_subset.layers["log_raw"]

# Sparse matrices expose .toarray(); densify so the NumPy reductions below
# work on a plain (n_cells, n_isoforms) array.
if hasattr(X, "toarray"):
    X = X.toarray()

# Reduce across cells (axis=0) to get one value per isoform.
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)
gene_mode_result = mode(X, axis=0)
# scipy.stats.mode returns shape (1, n) on older SciPy and (n,) on
# SciPy >= 1.11; `.mode[0]` would then keep only the first column's value, so
# flatten to one mode per isoform regardless of version.
gene_mode = np.ravel(gene_mode_result.mode)

# One row per isoform.
TCell_summary_df_i = pd.DataFrame({
    "gene": TCell_i_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(TCell_summary_df_i)

In [None]:
# One histogram per marker gene for the T-cell gene-level subset.
# The values are taken from TCell_g_subset itself rather than from the global
# X, because X is overwritten by every summary cell (the isoform summary runs
# after the gene summary, so X no longer matches these gene labels).
TCell_g_expr = TCell_g_subset.layers["log_raw"]
if hasattr(TCell_g_expr, "toarray"):
    TCell_g_expr = TCell_g_expr.toarray()

n_genes = TCell_g_expr.shape[1]
for i, gene in enumerate(TCell_g_subset.var_names[:n_genes]):
    plt.figure(figsize=(6, 4))
    plt.hist(TCell_g_expr[:, i], bins=50, color='skyblue', edgecolor='black')
    plt.title(f'Raw Count Histogram for {gene}')
    plt.xlabel('Expression Level')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()
    # If you wish to save the figure instead of or in addition to showing it, use:
    # plt.savefig(f'histogram_{gene}.png')
    # plt.close()

In [None]:
# Draw one histogram per column of X, titled with the matching entry of
# TCell_i_subset.var_names. X is the dense matrix left behind by the most
# recently run summary cell.
n_genes = X.shape[1]
for i, gene in enumerate(TCell_i_subset.var_names[:n_genes]):
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(X[:, i], bins=50, color='skyblue', edgecolor='black')
    ax.set_title(f'Raw Count Histogram for {gene}')
    ax.set_xlabel('Expression Level')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()
    # To also write the figure to disk, save it before showing and close it after:
    # fig.savefig(f'histogram_{gene}.png')
    # plt.close(fig)

In [None]:
# Per-gene summary statistics for the NK-cell gene-level data, restricted to
# the matched NK marker genes (NK_genes).
NKCell_g_subset =NKCell_g[:, NK_genes].copy()

# Expression values are read from the "log_raw" layer.
X = NKCell_g_subset.layers["log_raw"]

# Sparse matrices expose .toarray(); densify so the NumPy reductions below
# work on a plain (n_cells, n_genes) array.
if hasattr(X, "toarray"):
    X = X.toarray()

# Reduce across cells (axis=0) to get one value per gene.
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)
gene_mode_result = mode(X, axis=0)
# scipy.stats.mode returns shape (1, n_genes) on older SciPy and (n_genes,) on
# SciPy >= 1.11; `.mode[0]` would then keep only the first gene's value, so
# flatten to one mode per gene regardless of version.
gene_mode = np.ravel(gene_mode_result.mode)

# One row per gene.
NKCell_summary_df_g = pd.DataFrame({
    "gene": NKCell_g_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(NKCell_summary_df_g)

In [None]:
# Per-isoform summary statistics for the NK-cell isoform-level data, restricted
# to the isoforms matched by the NK marker prefixes (NK_isos).
NKCell_i_subset =NKCell_i[:, NK_isos].copy()

# Expression values are read from the "log_raw" layer.
X = NKCell_i_subset.layers["log_raw"]

# Sparse matrices expose .toarray(); densify so the NumPy reductions below
# work on a plain (n_cells, n_isoforms) array.
if hasattr(X, "toarray"):
    X = X.toarray()

# Reduce across cells (axis=0) to get one value per isoform.
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)
gene_mode_result = mode(X, axis=0)
# scipy.stats.mode returns shape (1, n) on older SciPy and (n,) on
# SciPy >= 1.11; `.mode[0]` would then keep only the first column's value, so
# flatten to one mode per isoform regardless of version.
gene_mode = np.ravel(gene_mode_result.mode)

# One row per isoform.
NKCell_summary_df_i = pd.DataFrame({
    "gene": NKCell_i_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(NKCell_summary_df_i)

In [None]:
# One histogram per marker gene for the NK-cell gene-level subset.
# Densify the "log_raw" layer once up front: if the layer is sparse, slicing it
# column by column would hand plt.hist a sparse (n_cells, 1) object instead of
# the 1-D array it expects.
NKCell_g_expr = NKCell_g_subset.layers["log_raw"]
if hasattr(NKCell_g_expr, "toarray"):
    NKCell_g_expr = NKCell_g_expr.toarray()

# Sized from this subset rather than from whatever X a previous cell left behind.
n_genes = NKCell_g_expr.shape[1]
for i, gene in enumerate(NKCell_g_subset.var_names):
    plt.figure(figsize=(6, 4))
    plt.hist(NKCell_g_expr[:, i], bins=50, color='skyblue', edgecolor='black')
    plt.title(f'Raw Count Histogram for {gene}')
    plt.xlabel('Expression Level')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()
    # If you wish to save the figure instead of or in addition to showing it, use:
    # plt.savefig(f'histogram_{gene}.png')
    # plt.close()

In [None]:
# One histogram per matched isoform for the NK-cell isoform-level subset.
# Densify the "log_raw" layer once up front: if the layer is sparse, slicing it
# column by column would hand plt.hist a sparse (n_cells, 1) object instead of
# the 1-D array it expects.
NKCell_i_expr = NKCell_i_subset.layers["log_raw"]
if hasattr(NKCell_i_expr, "toarray"):
    NKCell_i_expr = NKCell_i_expr.toarray()

# Sized from this subset rather than from whatever X a previous cell left behind.
n_genes = NKCell_i_expr.shape[1]
for i, gene in enumerate(NKCell_i_subset.var_names):
    plt.figure(figsize=(6, 4))
    plt.hist(NKCell_i_expr[:, i], bins=50, color='skyblue', edgecolor='black')
    plt.title(f'Raw Count Histogram for {gene}')
    plt.xlabel('Expression Level')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()
    # If you wish to save the figure instead of or in addition to showing it, use:
    # plt.savefig(f'histogram_{gene}.png')
    # plt.close()

In [None]:
# Per-gene summary statistics for the monocyte gene-level data, restricted to
# the genes matched by the Myeloid marker prefixes (Myeloid_genes).
Monocyte_g_subset =Monocyte_g[:, Myeloid_genes].copy()

# Expression values are read from the "log_raw" layer.
X = Monocyte_g_subset.layers["log_raw"]

# Sparse matrices expose .toarray(); densify so the NumPy reductions below
# work on a plain (n_cells, n_genes) array.
if hasattr(X, "toarray"):
    X = X.toarray()

# Reduce across cells (axis=0) to get one value per gene.
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)
gene_mode_result = mode(X, axis=0)
# scipy.stats.mode returns shape (1, n_genes) on older SciPy and (n_genes,) on
# SciPy >= 1.11; `.mode[0]` would then keep only the first gene's value, so
# flatten to one mode per gene regardless of version.
gene_mode = np.ravel(gene_mode_result.mode)

# One row per gene.
Monocyte_summary_df_g = pd.DataFrame({
    "gene": Monocyte_g_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(Monocyte_summary_df_g)

In [None]:
# Per-isoform summary statistics for the monocyte isoform-level data,
# restricted to the isoforms matched by the Myeloid marker prefixes (Myeloid_isos).
Monocyte_i_subset =Monocyte_i[:, Myeloid_isos].copy()

# Expression values are read from the "log_raw" layer.
X =Monocyte_i_subset.layers["log_raw"]

# Sparse matrices expose .toarray(); densify so the NumPy reductions below
# work on a plain (n_cells, n_isoforms) array.
if hasattr(X, "toarray"):
    X = X.toarray()

# Reduce across cells (axis=0) to get one value per isoform.
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)
gene_mode_result = mode(X, axis=0)
# scipy.stats.mode returns shape (1, n) on older SciPy and (n,) on
# SciPy >= 1.11; `.mode[0]` would then keep only the first column's value, so
# flatten to one mode per isoform regardless of version.
gene_mode = np.ravel(gene_mode_result.mode)

# One row per isoform.
Monocyte_summary_df_i = pd.DataFrame({
    "gene": Monocyte_i_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(Monocyte_summary_df_i)

In [None]:
# One histogram per matched gene for the monocyte gene-level subset.
# Densify the "log_raw" layer once up front: if the layer is sparse, slicing it
# column by column would hand plt.hist a sparse (n_cells, 1) object instead of
# the 1-D array it expects.
Monocyte_g_expr = Monocyte_g_subset.layers["log_raw"]
if hasattr(Monocyte_g_expr, "toarray"):
    Monocyte_g_expr = Monocyte_g_expr.toarray()

# Sized from this subset rather than from whatever X a previous cell left behind.
n_genes = Monocyte_g_expr.shape[1]
for i, gene in enumerate(Monocyte_g_subset.var_names):
    plt.figure(figsize=(6, 4))
    plt.hist(Monocyte_g_expr[:, i], bins=50, color='skyblue', edgecolor='black')
    plt.title(f'Raw Count Histogram for {gene}')
    plt.xlabel('Expression Level')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()
    # If you wish to save the figure instead of or in addition to showing it, use:
    # plt.savefig(f'histogram_{gene}.png')
    # plt.close()

In [None]:
# One histogram per matched isoform for the monocyte isoform-level subset.
# The data must come from Monocyte_i_subset, the same object that supplies the
# labels (the previous reference to Myeloid_i_subset named an object this
# notebook section never creates).
# Densify once up front so plt.hist receives a 1-D array even when the
# "log_raw" layer is sparse.
Monocyte_i_expr = Monocyte_i_subset.layers["log_raw"]
if hasattr(Monocyte_i_expr, "toarray"):
    Monocyte_i_expr = Monocyte_i_expr.toarray()

# Sized from this subset rather than from whatever X a previous cell left behind.
n_genes = Monocyte_i_expr.shape[1]
for i, gene in enumerate(Monocyte_i_subset.var_names):
    plt.figure(figsize=(6, 4))
    plt.hist(Monocyte_i_expr[:, i], bins=50, color='skyblue', edgecolor='black')
    plt.title(f'Raw Count Histogram for {gene}')
    plt.xlabel('Expression Level')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()
    # If you wish to save the figure instead of or in addition to showing it, use:
    # plt.savefig(f'histogram_{gene}.png')
    # plt.close()

In [None]:
# Per-gene summary statistics for the B-cell gene-level data, restricted to
# the matched B-cell marker genes (BCell_genes).
BCell_g_subset =BCell_g[:, BCell_genes].copy()

# Expression values are read from the "log_raw" layer.
X = BCell_g_subset.layers["log_raw"]

# Sparse matrices expose .toarray(); densify so the NumPy reductions below
# work on a plain (n_cells, n_genes) array.
if hasattr(X, "toarray"):
    X = X.toarray()

# Reduce across cells (axis=0) to get one value per gene.
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)
gene_mode_result = mode(X, axis=0)
# scipy.stats.mode returns shape (1, n_genes) on older SciPy and (n_genes,) on
# SciPy >= 1.11; `.mode[0]` would then keep only the first gene's value, so
# flatten to one mode per gene regardless of version.
gene_mode = np.ravel(gene_mode_result.mode)

# One row per gene.
BCell_summary_df_g = pd.DataFrame({
    "gene": BCell_g_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(BCell_summary_df_g)

In [None]:
# Per-isoform summary statistics for the B-cell isoform-level data, restricted
# to the isoforms matched by the B-cell marker prefixes (BCell_isos).
BCell_i_subset =BCell_i[:, BCell_isos].copy()

# Expression values are read from the "log_raw" layer.
X = BCell_i_subset.layers["log_raw"]

# Sparse matrices expose .toarray(); densify so the NumPy reductions below
# work on a plain (n_cells, n_isoforms) array.
if hasattr(X, "toarray"):
    X = X.toarray()

# Reduce across cells (axis=0) to get one value per isoform.
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)
gene_mode_result = mode(X, axis=0)
# scipy.stats.mode returns shape (1, n) on older SciPy and (n,) on
# SciPy >= 1.11; `.mode[0]` would then keep only the first column's value, so
# flatten to one mode per isoform regardless of version.
gene_mode = np.ravel(gene_mode_result.mode)

# One row per isoform.
BCell_summary_df_i = pd.DataFrame({
    "gene": BCell_i_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(BCell_summary_df_i)

In [None]:
# One histogram per marker gene for the B-cell gene-level subset.
# Densify the "log_raw" layer once up front: if the layer is sparse, slicing it
# column by column would hand plt.hist a sparse (n_cells, 1) object instead of
# the 1-D array it expects.
BCell_g_expr = BCell_g_subset.layers["log_raw"]
if hasattr(BCell_g_expr, "toarray"):
    BCell_g_expr = BCell_g_expr.toarray()

# Sized from this subset rather than from whatever X a previous cell left behind.
n_genes = BCell_g_expr.shape[1]
for i, gene in enumerate(BCell_g_subset.var_names):
    plt.figure(figsize=(6, 4))
    plt.hist(BCell_g_expr[:, i], bins=50, color='skyblue', edgecolor='black')
    plt.title(f'Raw Count Histogram for {gene}')
    plt.xlabel('Expression Level')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()
    # If you wish to save the figure instead of or in addition to showing it, use:
    # plt.savefig(f'histogram_{gene}.png')
    # plt.close()

In [None]:
# One histogram per matched isoform for the B-cell isoform-level subset.
# Densify the "log_raw" layer once up front: if the layer is sparse, slicing it
# column by column would hand plt.hist a sparse (n_cells, 1) object instead of
# the 1-D array it expects.
BCell_i_expr = BCell_i_subset.layers["log_raw"]
if hasattr(BCell_i_expr, "toarray"):
    BCell_i_expr = BCell_i_expr.toarray()

# Sized from this subset rather than from whatever X a previous cell left behind.
n_genes = BCell_i_expr.shape[1]
for i, gene in enumerate(BCell_i_subset.var_names):
    plt.figure(figsize=(6, 4))
    plt.hist(BCell_i_expr[:, i], bins=50, color='skyblue', edgecolor='black')
    plt.title(f'Raw Count Histogram for {gene}')
    plt.xlabel('Expression Level')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()
    # If you wish to save the figure instead of or in addition to showing it, use:
    # plt.savefig(f'histogram_{gene}.png')
    # plt.close()

In [None]:
# Per-gene summary statistics for the megakaryocyte (MK) gene-level data,
# restricted to the matched MK marker genes (MK_genes).
MK_g_subset =MK_g[:, MK_genes].copy()

# Expression values are read from the "log_raw" layer.
X = MK_g_subset.layers["log_raw"]

# Sparse matrices expose .toarray(); densify so the NumPy reductions below
# work on a plain (n_cells, n_genes) array.
if hasattr(X, "toarray"):
    X = X.toarray()

# Reduce across cells (axis=0) to get one value per gene.
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)
gene_mode_result = mode(X, axis=0)
# scipy.stats.mode returns shape (1, n_genes) on older SciPy and (n_genes,) on
# SciPy >= 1.11; `.mode[0]` would then keep only the first gene's value, so
# flatten to one mode per gene regardless of version.
gene_mode = np.ravel(gene_mode_result.mode)

# One row per gene.
MK_summary_df_g = pd.DataFrame({
    "gene": MK_g_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(MK_summary_df_g)

In [None]:
# Per-isoform summary statistics for the megakaryocyte (MK) isoform-level data,
# restricted to the isoforms matched by the MK marker prefixes (MK_isos).
MK_i_subset =MK_i[:, MK_isos].copy()

# Expression values are read from the "log_raw" layer.
X = MK_i_subset.layers["log_raw"]

# Sparse matrices expose .toarray(); densify so the NumPy reductions below
# work on a plain (n_cells, n_isoforms) array.
if hasattr(X, "toarray"):
    X = X.toarray()

# Reduce across cells (axis=0) to get one value per isoform.
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)
gene_mode_result = mode(X, axis=0)
# scipy.stats.mode returns shape (1, n) on older SciPy and (n,) on
# SciPy >= 1.11; `.mode[0]` would then keep only the first column's value, so
# flatten to one mode per isoform regardless of version.
gene_mode = np.ravel(gene_mode_result.mode)

# One row per isoform.
MK_summary_df_i = pd.DataFrame({
    "gene": MK_i_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(MK_summary_df_i)

In [None]:
# One histogram per marker gene for the megakaryocyte gene-level subset.
# Densify the "log_raw" layer once up front: if the layer is sparse, slicing it
# column by column would hand plt.hist a sparse (n_cells, 1) object instead of
# the 1-D array it expects.
MK_g_expr = MK_g_subset.layers["log_raw"]
if hasattr(MK_g_expr, "toarray"):
    MK_g_expr = MK_g_expr.toarray()

# Sized from this subset rather than from whatever X a previous cell left behind.
n_genes = MK_g_expr.shape[1]
for i, gene in enumerate(MK_g_subset.var_names):
    plt.figure(figsize=(6, 4))
    plt.hist(MK_g_expr[:, i], bins=50, color='skyblue', edgecolor='black')
    plt.title(f'Raw Count Histogram for {gene}')
    plt.xlabel('Expression Level')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()
    # If you wish to save the figure instead of or in addition to showing it, use:
    # plt.savefig(f'histogram_{gene}.png')
    # plt.close()

In [None]:
# One histogram per matched isoform for the megakaryocyte isoform-level subset.
# Densify the "log_raw" layer once up front: if the layer is sparse, slicing it
# column by column would hand plt.hist a sparse (n_cells, 1) object instead of
# the 1-D array it expects.
MK_i_expr = MK_i_subset.layers["log_raw"]
if hasattr(MK_i_expr, "toarray"):
    MK_i_expr = MK_i_expr.toarray()

# Sized from this subset rather than from whatever X a previous cell left behind.
n_genes = MK_i_expr.shape[1]
for i, gene in enumerate(MK_i_subset.var_names):
    plt.figure(figsize=(6, 4))
    plt.hist(MK_i_expr[:, i], bins=50, color='skyblue', edgecolor='black')
    plt.title(f'Raw Count Histogram for {gene}')
    plt.xlabel('Expression Level')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()
    # If you wish to save the figure instead of or in addition to showing it, use:
    # plt.savefig(f'histogram_{gene}.png')
    # plt.close()

In [None]:
# --- Split the data into one AnnData per re-annotated cell type ---
# The labels present in each object are collected first; the dictionaries that
# will hold the per-label subsets are created empty for both objects.
celltype_column = 'gen_cell_type_reannotated'
unique_cell_types_g = adata_g_filtered.obs[celltype_column].unique()
unique_cell_types_i = adata_i_filtered.obs[celltype_column].unique()

subcelltype_datasets_g = dict()
subcelltype_datasets_i = dict()

# Gene-level object: copy out the cells of each label, keep the copy in the
# dictionary, and write it to a gzip-compressed .h5ad file in the working directory.
for cell_type in unique_cell_types_g:
    in_cell_type = adata_g_filtered.obs[celltype_column] == cell_type
    subset = adata_g_filtered[in_cell_type].copy()
    subcelltype_datasets_g[cell_type] = subset
    subset.write_h5ad(f"PBMC_{cell_type}_gene_subtype_raw_counts_06022024.h5ad", compression="gzip")
    print(f"Saved dataset for {cell_type} with {subset.n_obs} cells.")

In [None]:
# Isoform-level object: copy out the cells of each label, keep the copy in
# subcelltype_datasets_i, and write it to a gzip-compressed .h5ad file in the
# working directory.
for cell_type in unique_cell_types_i:
    in_cell_type = adata_i_filtered.obs['gen_cell_type_reannotated'] == cell_type
    subset = adata_i_filtered[in_cell_type].copy()
    subcelltype_datasets_i[cell_type] = subset
    subset.write_h5ad(f"PBMC_{cell_type}_iso_subtype_raw_counts_06022024.h5ad", compression="gzip")
    print(f"Saved dataset for {cell_type} with {subset.n_obs} cells.")

# The per-cell-type subsets are now available by label in
# subcelltype_datasets_g / subcelltype_datasets_i,
# e.g. subcelltype_datasets_i["BCells"].X

In [None]:
# Bind the per-cell-type subsets to individual names. A missing label raises
# KeyError at the corresponding line.
gene_sets = subcelltype_datasets_g
iso_sets = subcelltype_datasets_i

# Gene-level subsets
NKCells_g = gene_sets["NK Cells"]
CD8_TCells_g = gene_sets["Effector CD8 TCells"]
CD4_TCells_1_g = gene_sets["Effector CD4 TCells #1"]
CD4_TCells_2_g = gene_sets["Effector CD4 TCells #2"]
CD4_TCells_3_g = gene_sets["Effector CD4 TCells #3"]
Mem_TCells_g = gene_sets["Memory TCells"]
Monocytes_g = gene_sets["Monocyte-derived"]
BCells_g = gene_sets["BCells"]
Megakaryotes_g = gene_sets["Megakaryocytes"]

# Isoform-level subsets (the isoform annotation uses a different set of T-cell labels)
NKCells_i = iso_sets["NK Cells"]
CD8_TCells_i = iso_sets["Effector CD8 TCells"]
CD4_TCells_i = iso_sets["Effector CD4 TCells"]
Mem_TCells_i = iso_sets["Memory TCells"]
Unspecified_TCells_i = iso_sets["Unspecified TCells"]
Monocytes_i = iso_sets["Monocyte-derived"]
BCells_i = iso_sets["BCells"]
Megakaryotes_i = iso_sets["Megakaryocytes"]

In [None]:
# Marker prefixes per cell (sub)type. Every entry ends with ":" so that prefix
# matching with str.startswith stops at the end of the gene symbol
# (e.g. "NCAM1:" does not also pick up "NCAM1-AS1:...").
TCell_Markers = ["CD4:", "CD3D:", "CD3E:"]
TMem = ["CCR7:", "SELL:", "TCF7:"]
Cytotoxic = ["CD8A:", "CD8B:", "GATA3:", "KLRB1:", "CCL5:"]
CD4Effector = ["CD4:", "IL2RA:", "GATA3:", "AHR:"]
BCells = ["MS4A1:","CD19:", "CD79A:", "CD22:"]
NKCells = ["KLRD1:", "GZMB:", "FCGR3A:", "IL2RB:", "CD226:", "ITGAM:", "NCAM1:"]
Myeloid = ["ITGAM:", "ITGAX:", "CD33:", "CD14:", "CD1C:", "HLA-DRA:", "HLA-DRB1:", "FCGR3A:", "FCGR2A:", "CLEC7A:", "LILRB4:"]
Monocytes = ["CD14:", "LYZ:", "VCAN:", "FCN1:", "CST3:", "S100A8:", "S100A9:", "FCGR3A:", "CX3CR1:"]
DC = ["CD1C:", "CLEC10A:", "FCER1A:", "CST3:"]
pDC = ["CLEC4C:", "LILRA4:", "GZMB:", "TCL1A:"]
MK = ["PPBP:", "PF4:", "ITGA2B:", "GP1BA:", "VWF:"]

def find_matching_genes(prefixes, gene_list):
    """Select the names in ``gene_list`` that begin with at least one prefix.

    Args:
        prefixes: Iterable of string prefixes (the marker lists in this notebook
            use the form ``"SYMBOL:"``).
        gene_list: Iterable of names to filter, e.g. ``adata.var_names``.

    Returns:
        A list of matching names in their original order.
    """
    def _has_marker_prefix(name):
        # True as soon as one prefix matches the start of the name.
        return any(map(name.startswith, prefixes))

    return list(filter(_has_marker_prefix, gene_list))

In [None]:
# Resolve each marker-prefix list to the concrete var_names it matches.

# Gene-level matches
TCell_genes = find_matching_genes(TCell_Markers, adata_g_filtered.var_names)
Cytotoxic_genes = find_matching_genes(Cytotoxic, adata_g_filtered.var_names)
CD4Effector_genes = find_matching_genes(CD4Effector, adata_g_filtered.var_names)
TMem_genes = find_matching_genes(TMem, adata_g_filtered.var_names)
NK_genes = find_matching_genes(NKCells, adata_g_filtered.var_names)
Myeloid_genes = find_matching_genes(Myeloid, adata_g_filtered.var_names)
Monocyte_genes = find_matching_genes(Monocytes, adata_g_filtered.var_names)
BCell_genes = find_matching_genes(BCells, adata_g_filtered.var_names)
DC_genes = find_matching_genes(DC, adata_g_filtered.var_names)
pDC_genes = find_matching_genes(pDC, adata_g_filtered.var_names)
MK_genes = find_matching_genes(MK, adata_g_filtered.var_names)

# Isoform-level matches: one list per gene-level list above, so that the
# Cytotoxic and CD4Effector markers are available for isoform data as well.
TCell_isos = find_matching_genes(TCell_Markers, adata_i_filtered.var_names)
Cytotoxic_isos = find_matching_genes(Cytotoxic, adata_i_filtered.var_names)
CD4Effector_isos = find_matching_genes(CD4Effector, adata_i_filtered.var_names)
TMem_isos = find_matching_genes(TMem, adata_i_filtered.var_names)
NK_isos = find_matching_genes(NKCells, adata_i_filtered.var_names)
Myeloid_isos = find_matching_genes(Myeloid, adata_i_filtered.var_names)
Monocyte_isos = find_matching_genes(Monocytes, adata_i_filtered.var_names)
BCell_isos = find_matching_genes(BCells, adata_i_filtered.var_names)
DC_isos = find_matching_genes(DC, adata_i_filtered.var_names)
pDC_isos = find_matching_genes(pDC, adata_i_filtered.var_names)
MK_isos = find_matching_genes(MK, adata_i_filtered.var_names)

# NOTE(review): `Treg` is not one of the marker lists defined alongside the
# others (TCell_Markers, TMem, Cytotoxic, ...); it must come from an earlier
# cell. Kept last so that a NameError here cannot prevent the lists above from
# being created — confirm where `Treg` is defined or drop this line.
TReg_isos = find_matching_genes(Treg, adata_i_filtered.var_names)

In [None]:
# Per-gene summary statistics for the effector CD8 T-cell gene-level subset,
# restricted to the matched T-cell marker genes (TCell_genes).
CD8_TCells_g_subset =CD8_TCells_g[:, TCell_genes].copy()

# Expression values are read from the "log_raw" layer.
X = CD8_TCells_g_subset.layers["log_raw"]

# Sparse matrices expose .toarray(); densify so the NumPy reductions below
# work on a plain (n_cells, n_genes) array.
if hasattr(X, "toarray"):
    X = X.toarray()

# Reduce across cells (axis=0) to get one value per gene.
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)

# One row per gene. The gene names must come from the subset the statistics
# were computed on (CD8_TCells_g_subset), not from an unrelated subset.
CD8_TCells_summary_df_g = pd.DataFrame({
    "gene": CD8_TCells_g_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median
})
# Legacy name kept so that any later cell referring to it keeps working.
Naive_TCell_1_summary_df_g = CD8_TCells_summary_df_g

print(CD8_TCells_summary_df_g)

In [None]:
# One histogram per marker gene for the effector CD8 T-cell gene-level subset.
# Both the labels and the values are taken from CD8_TCells_g_subset, the subset
# created by the summary cell directly above (the previous labels came from
# Naive_TCell_1_g_subset, which this notebook section never creates).
CD8_TCells_g_expr = CD8_TCells_g_subset.layers["log_raw"]
if hasattr(CD8_TCells_g_expr, "toarray"):
    CD8_TCells_g_expr = CD8_TCells_g_expr.toarray()

n_genes = CD8_TCells_g_expr.shape[1]
for i, gene in enumerate(CD8_TCells_g_subset.var_names[:n_genes]):
    plt.figure(figsize=(6, 4))
    plt.hist(CD8_TCells_g_expr[:, i], bins=50, color='skyblue', edgecolor='black')
    plt.title(f'Raw Count Histogram for {gene}')
    plt.xlabel('Expression Level')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()
    # If you wish to save the figure instead of or in addition to showing it, use:
    # plt.savefig(f'histogram_{gene}.png')
    # plt.close()

In [None]:
# Summarise the matched T-cell marker genes (TCell_genes) within Naive_TCells_2_g.
Naive_TCell_2_g_subset = Naive_TCells_2_g[:, TCell_genes].copy()

# Take the "log_raw" layer and densify it when it is a sparse matrix.
X = Naive_TCell_2_g_subset.layers["log_raw"]
X = X.toarray() if hasattr(X, "toarray") else X

# X is (n_cells, n_genes); reducing along axis 0 yields one value per gene.
gene_min, gene_max = np.min(X, axis=0), np.max(X, axis=0)
gene_mean, gene_median = np.mean(X, axis=0), np.median(X, axis=0)

# Assemble the per-gene table (one row per gene).
Naive_TCell_2_summary_df_g = pd.DataFrame(
    dict(
        gene=Naive_TCell_2_g_subset.var_names,
        min_expression=gene_min,
        max_expression=gene_max,
        mean_expression=gene_mean,
        median_expression=gene_median,
    )
)

print(Naive_TCell_2_summary_df_g)

In [None]:
# Draw one histogram per column of X, titled with the matching entry of
# Naive_TCell_2_g_subset.var_names. X is the dense matrix left behind by the
# most recently run summary cell.
n_genes = X.shape[1]
for i, gene in enumerate(Naive_TCell_2_g_subset.var_names[:n_genes]):
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(X[:, i], bins=50, color='skyblue', edgecolor='black')
    ax.set_title(f'Raw Count Histogram for {gene}')
    ax.set_xlabel('Expression Level')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()
    # To also write the figure to disk, save it before showing and close it after:
    # fig.savefig(f'histogram_{gene}.png')
    # plt.close(fig)

In [None]:
# Summarise the isoforms matched by the T-cell marker prefixes (TCell_isos)
# within Naive_TCells_1_i.
Naive_TCell_1_i_subset = Naive_TCells_1_i[:, TCell_isos].copy()

# Take the "log_raw" layer and densify it when it is a sparse matrix.
X = Naive_TCell_1_i_subset.layers["log_raw"]
X = X.toarray() if hasattr(X, "toarray") else X

# X is (n_cells, n_isoforms); reducing along axis 0 yields one value per isoform.
gene_min, gene_max = np.min(X, axis=0), np.max(X, axis=0)
gene_mean, gene_median = np.mean(X, axis=0), np.median(X, axis=0)

# Assemble the per-isoform table (one row per isoform).
Naive_TCell_1_summary_df_i = pd.DataFrame(
    dict(
        gene=Naive_TCell_1_i_subset.var_names,
        min_expression=gene_min,
        max_expression=gene_max,
        mean_expression=gene_mean,
        median_expression=gene_median,
    )
)

print(Naive_TCell_1_summary_df_i)

In [None]:
# Draw one histogram per column of X, titled with the matching entry of
# Naive_TCell_1_i_subset.var_names. X is the dense matrix left behind by the
# most recently run summary cell.
n_genes = X.shape[1]
for i, gene in enumerate(Naive_TCell_1_i_subset.var_names[:n_genes]):
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(X[:, i], bins=50, color='skyblue', edgecolor='black')
    ax.set_title(f'Raw Count Histogram for {gene}')
    ax.set_xlabel('Expression Level')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()
    # To also write the figure to disk, save it before showing and close it after:
    # fig.savefig(f'histogram_{gene}.png')
    # plt.close(fig)

In [None]:
# Per-gene summary statistics for Memory_TCells_data, restricted to the
# matched T-cell marker genes (TCell_genes).
Memory_TCell_subset =Memory_TCells_data[:, TCell_genes].copy()

# Expression values are read from the "log_raw" layer.
X = Memory_TCell_subset.layers["log_raw"]

# Sparse matrices expose .toarray(); densify so the NumPy reductions below
# work on a plain (n_cells, n_genes) array.
if hasattr(X, "toarray"):
    X = X.toarray()

# Reduce across cells (axis=0) to get one value per gene.
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)
gene_mode_result = mode(X, axis=0)
# scipy.stats.mode returns shape (1, n_genes) on older SciPy and (n_genes,) on
# SciPy >= 1.11; `.mode[0]` would then keep only the first gene's value, so
# flatten to one mode per gene regardless of version.
gene_mode = np.ravel(gene_mode_result.mode)

# One row per gene.
Memory_TCell_summary_df = pd.DataFrame({
    "gene": Memory_TCell_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(Memory_TCell_summary_df)

In [None]:
# Draw one histogram per column of X, titled with the matching entry of
# Memory_TCell_subset.var_names. X is the dense matrix left behind by the most
# recently run summary cell.
n_genes = X.shape[1]
for i, gene in enumerate(Memory_TCell_subset.var_names[:n_genes]):
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(X[:, i], bins=50, color='skyblue', edgecolor='black')
    ax.set_title(f'Raw Count Histogram for {gene}')
    ax.set_xlabel('Expression Level')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()
    # To also write the figure to disk, save it before showing and close it after:
    # fig.savefig(f'histogram_{gene}.png')
    # plt.close(fig)

In [None]:
# Per-gene summary statistics for CD8_TCells_data, restricted to the matched
# T-cell marker genes (TCell_genes).
CD8_TCell_subset =CD8_TCells_data[:, TCell_genes].copy()

# Expression values are read from the "log_raw" layer.
X = CD8_TCell_subset.layers["log_raw"]

# Sparse matrices expose .toarray(); densify so the NumPy reductions below
# work on a plain (n_cells, n_genes) array.
if hasattr(X, "toarray"):
    X = X.toarray()

# Reduce across cells (axis=0) to get one value per gene.
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)
gene_mode_result = mode(X, axis=0)
# scipy.stats.mode returns shape (1, n_genes) on older SciPy and (n_genes,) on
# SciPy >= 1.11; `.mode[0]` would then keep only the first gene's value, so
# flatten to one mode per gene regardless of version.
gene_mode = np.ravel(gene_mode_result.mode)

# One row per gene. The gene names are read from CD8_TCell_subset, the subset
# the statistics were computed on, so this cell does not depend on the memory
# T-cell cell having been run.
CD8_TCell_summary_df = pd.DataFrame({
    "gene": CD8_TCell_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(CD8_TCell_summary_df)

In [None]:
# Draw one histogram per column of X, titled with the matching entry of
# CD8_TCell_subset.var_names. X is the dense matrix left behind by the most
# recently run summary cell.
n_genes = X.shape[1]
for i, gene in enumerate(CD8_TCell_subset.var_names[:n_genes]):
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(X[:, i], bins=50, color='skyblue', edgecolor='black')
    ax.set_title(f'Raw Count Histogram for {gene}')
    ax.set_xlabel('Expression Level')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()
    # To also write the figure to disk, save it before showing and close it after:
    # fig.savefig(f'histogram_{gene}.png')
    # plt.close(fig)

In [None]:
# Per-gene summary statistics for Monocyte_data, restricted to the matched
# monocyte marker genes (Monocyte_genes).
Monocyte_subset =Monocyte_data[:, Monocyte_genes].copy()

# Expression values are read from the "log_raw" layer.
X = Monocyte_subset.layers["log_raw"]

# Sparse matrices expose .toarray(); densify so the NumPy reductions below
# work on a plain (n_cells, n_genes) array.
if hasattr(X, "toarray"):
    X = X.toarray()

# Reduce across cells (axis=0) to get one value per gene.
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)
gene_mode_result = mode(X, axis=0)
# scipy.stats.mode returns shape (1, n_genes) on older SciPy and (n_genes,) on
# SciPy >= 1.11; `.mode[0]` would then keep only the first gene's value, so
# flatten to one mode per gene regardless of version.
gene_mode = np.ravel(gene_mode_result.mode)

# One row per gene.
Monocyte_summary_df = pd.DataFrame({
    "gene": Monocyte_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(Monocyte_summary_df)

In [None]:
# Draw one histogram per column of X, titled with the matching entry of
# Monocyte_subset.var_names. X is the dense matrix left behind by the most
# recently run summary cell.
n_genes = X.shape[1]
for i, gene in enumerate(Monocyte_subset.var_names[:n_genes]):
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(X[:, i], bins=50, color='skyblue', edgecolor='black')
    ax.set_title(f'Raw Count Histogram for {gene}')
    ax.set_xlabel('Expression Level')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()
    # To also write the figure to disk, save it before showing and close it after:
    # fig.savefig(f'histogram_{gene}.png')
    # plt.close(fig)

In [None]:
# Per-gene summary statistics for Mono_Derived_DCs_data over the union of the
# DC and monocyte marker genes.
# Both marker lists contain "CST3:", so plain concatenation would select the
# CST3 column(s) twice; dict.fromkeys drops repeats while keeping the order.
Mono_Derived_DC_genes = list(dict.fromkeys(DC_genes + Monocyte_genes))

Mono_Derived_DC_subset =Mono_Derived_DCs_data[:, Mono_Derived_DC_genes].copy()

# Expression values are read from the "log_raw" layer.
X = Mono_Derived_DC_subset.layers["log_raw"]

# Sparse matrices expose .toarray(); densify so the NumPy reductions below
# work on a plain (n_cells, n_genes) array.
if hasattr(X, "toarray"):
    X = X.toarray()

# Reduce across cells (axis=0) to get one value per gene.
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)
gene_mode_result = mode(X, axis=0)
# scipy.stats.mode returns shape (1, n_genes) on older SciPy and (n_genes,) on
# SciPy >= 1.11; `.mode[0]` would then keep only the first gene's value, so
# flatten to one mode per gene regardless of version.
gene_mode = np.ravel(gene_mode_result.mode)

# One row per gene.
Mono_Derived_DC_summary_df = pd.DataFrame({
    "gene": Mono_Derived_DC_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(Mono_Derived_DC_summary_df)

In [None]:
# Draw one histogram per column of X, titled with the matching entry of
# Mono_Derived_DC_subset.var_names. X is the dense matrix left behind by the
# most recently run summary cell.
n_genes = X.shape[1]
for i, gene in enumerate(Mono_Derived_DC_subset.var_names[:n_genes]):
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(X[:, i], bins=50, color='skyblue', edgecolor='black')
    ax.set_title(f'Raw Count Histogram for {gene}')
    ax.set_xlabel('Expression Level')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()
    # To also write the figure to disk, save it before showing and close it after:
    # fig.savefig(f'histogram_{gene}.png')
    # plt.close(fig)

In [None]:
# Per-gene summary statistics for Megakaryotes_data, restricted to the matched
# megakaryocyte marker genes (MK_genes).
Megakaryote_subset =Megakaryotes_data[:, MK_genes].copy()

# Expression values are read from the "log_raw" layer.
X = Megakaryote_subset.layers["log_raw"]

# Sparse matrices expose .toarray(); densify so the NumPy reductions below
# work on a plain (n_cells, n_genes) array.
if hasattr(X, "toarray"):
    X = X.toarray()

# Reduce across cells (axis=0) to get one value per gene.
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)
gene_mode_result = mode(X, axis=0)
# scipy.stats.mode returns shape (1, n_genes) on older SciPy and (n_genes,) on
# SciPy >= 1.11; `.mode[0]` would then keep only the first gene's value, so
# flatten to one mode per gene regardless of version.
gene_mode = np.ravel(gene_mode_result.mode)

# One row per gene.
Megakaryote_summary_df = pd.DataFrame({
    "gene": Megakaryote_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(Megakaryote_summary_df)

In [None]:
# Draw one histogram per column of X, titled with the matching entry of
# Megakaryote_subset.var_names. X is the dense matrix left behind by the most
# recently run summary cell.
n_genes = X.shape[1]
for i, gene in enumerate(Megakaryote_subset.var_names[:n_genes]):
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(X[:, i], bins=50, color='skyblue', edgecolor='black')
    ax.set_title(f'Raw Count Histogram for {gene}')
    ax.set_xlabel('Expression Level')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()
    # To also write the figure to disk, save it before showing and close it after:
    # fig.savefig(f'histogram_{gene}.png')
    # plt.close(fig)

In [None]:
# Store log1p(counts) as the "log_raw" layer of the gene-level object.
# NOTE(review): cells above this one already read layers["log_raw"] from
# per-cell-type subsets; presumably the layer is already present in the loaded
# files — confirm, because on a top-to-bottom run this cell executes after them.
adata_g_filtered.layers["log_raw"] = np.log1p(adata_g_filtered.layers["counts"])
# The isoform-level equivalent is currently disabled:
#adata_i_filtered.layers["log_raw"] = np.log1p(adata_i_filtered.layers["counts"])

In [None]:
# Make the "log_raw" layer the main matrix of the gene-level object.
# This overwrites the previous adata_g_filtered.X in place.
adata_g_filtered.X = adata_g_filtered.layers["log_raw"]

# Run PCA on the "log_raw" layer (scanpy defaults for everything else).
sc.pp.pca(adata_g_filtered, layer="log_raw")

# Build the 20-nearest-neighbour graph and compute the UMAP embedding from it.
# No representation is passed to sc.pp.neighbors, so scanpy picks its default
# one (presumably the PCA computed just above — confirm).
sc.pp.neighbors(adata_g_filtered, n_neighbors = 20)
sc.tl.umap(adata_g_filtered)

In [None]:
# Leiden clustering of the gene-level object at a range of resolutions.
# Each run stores its labels in .obs under "<resolution>_20lat_5e3".
resolutions = [
    0.05, 0.08, 0.1, 0.15, 0.18, 0.2,
    0.25, 0.28, 0.3, 0.35, 0.38,
]
for res in resolutions:
    leiden_key = "{}_20lat_5e3".format(res)
    sc.tl.leiden(
        adata_g_filtered,
        resolution=res,
        key_added=leiden_key,
        flavor="igraph",
        n_iterations=2,
    )

In [None]:
# Leiden clustering of the gene-level object at a range of resolutions.
# Each run stores its labels in .obs under "<resolution>_log_AutoZI".
# The resolution values live in one list; the .obs keys and the plotting call
# below are derived from it so the three can never get out of sync.
leiden_resolutions = [0.04, 0.06, 0.1, 0.14, 0.16, 0.2, 0.24, 0.26, 0.3, 0.34, 0.36, 0.4, 0.44, 0.46, 0.5]
for res in leiden_resolutions:
    sc.tl.leiden(adata_g_filtered, resolution=res, key_added=f'{res}_log_AutoZI', flavor = "igraph", n_iterations=2)

# Relabel clusters by size for every clustering key created above.
# `resolutions` ends up holding the .obs key strings, as before.
resolutions = [f'{value}_log_AutoZI' for value in leiden_resolutions]
for cluster_key in resolutions:
    # NOTE(review): the return value is bound to adata_g_filtered_pbmc, but the
    # plots below read adata_g_filtered. They only show the relabelled clusters
    # if relabel_clusters_by_size modifies its argument in place — confirm.
    adata_g_filtered_pbmc = relabel_clusters_by_size(adata_g_filtered, cluster_key)

sc.pl.umap(adata_g_filtered, color=["batch"], title="UMAP Colored by Batch")

# Labelled UMAPs, one per resolution (a copy of the list is passed so the
# callee cannot alter leiden_resolutions).
plot_umap_with_labels_g(adata_g_filtered, resolutions=list(leiden_resolutions), use_rep_key= 'log_AutoZI')