In [None]:
import io
import tempfile
from anndata import AnnData
import muon as mu
import numpy as np
import requests
import os
import scanpy as sc
import scvi
import seaborn as sns
import torch
import pandas as pd
import sys
import scrublet as scr
import skimage
import pybiomart
from bioservices import BioMart
import rdata
import matplotlib.pyplot as plt
from adjustText import adjust_text
from scipy.stats import beta
import leidenalg
import igraph
import tqdm
import time
import gc
import polars as pl
import pyarrow

In [None]:
import os
import scanpy as sc
import muon as mu
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scvi

# Directory containing the saved .h5mu inputs read below.
output_dir = 'Intermediate_Files/QC_Figs_04302025/'

# Directories of the saved AutoZI models (isoform-level and gene-level).
iso_dir_1 = 'Models/PBMCs_04302025_AutoZI/Isoform/20lat_1e2'
gene_dir_1 = 'Models/PBMCs_04302025_AutoZI/Gene/20lat_1e2'

# Load the gene-level and isoform-level MuData objects from disk
mdata_gene_pbmc = mu.read(os.path.join(output_dir, "mdata_gene_with_latent_PBMCs_04302025_AutoZI.h5mu"))
mdata_iso_pbmc = mu.read(os.path.join(output_dir, "mdata_iso_with_latent_PBMCs_04302025_AutoZI.h5mu"))

# Extract the 'rna' modality from each container; these are the objects every later cell works on
adata_g_filtered = mdata_gene_pbmc.mod['rna']
adata_i_filtered = mdata_iso_pbmc.mod['rna']


In [None]:
## Load the saved AUTOZI models (assuming they were saved using the AUTOZI class)
def _load_autozi_model(model_dir, adata, label):
    """Load a saved AUTOZI model, reporting the outcome under a distinct label.

    Parameters
    ----------
    model_dir : str
        Directory passed to ``scvi.model.AUTOZI.load``.
    adata : AnnData
        Data object handed to ``load`` via its ``adata`` argument.
    label : str
        Name used in the status messages so the gene-level and isoform-level
        outcomes can be told apart in the cell output.

    Returns
    -------
    The loaded model, or ``None`` if ``load`` raised ``ValueError``.
    """
    try:
        model = scvi.model.AUTOZI.load(model_dir, adata=adata)
    except ValueError as e:
        # Best-effort behaviour is preserved: report and continue with None.
        # Later cells dereference the model directly, so a None here will surface there.
        print(f"Error loading AUTOZI {label} model from {model_dir}: {e}")
        return None
    print(f"AUTOZI {label} model loaded successfully.")
    return model


autozi_gene_model_pbmc = _load_autozi_model(gene_dir_1, adata_g_filtered, "gene-level")
autozi_iso_model_pbmc = _load_autozi_model(iso_dir_1, adata_i_filtered, "isoform-level")

In [None]:
# Access training history
import matplotlib.pyplot as plt

train_elbo1 = autozi_gene_model_pbmc.history['elbo_train']  # gene-level model, training ELBO
val_elbo1 = autozi_gene_model_pbmc.history['elbo_validation']  # gene-level model, validation ELBO


train_elbo2 = autozi_iso_model_pbmc.history['elbo_train']  # isoform-level model, training ELBO
val_elbo2 = autozi_iso_model_pbmc.history['elbo_validation']  # isoform-level model, validation ELBO

In [None]:
# Print the last 10 recorded ELBO entries of the gene-level model
print("Training ELBO gene-level:", train_elbo1[-10:])
print("Validation ELBO gene-level:", val_elbo1[-10:])

In [None]:
# Print the last 10 recorded ELBO entries of the isoform-level model
print("Training ELBO isoform-level:", train_elbo2[-10:])
print("Validation ELBO isoform-level:", val_elbo2[-10:])

In [None]:
# Training vs. validation ELBO curves for the gene-level model
plt.plot(train_elbo1, label="Training ELBO")
plt.plot(val_elbo1, label="Validation ELBO")  # legend label typo fixed ("ELB" -> "ELBO")
plt.xlabel("Epoch")
plt.ylabel("Negative ELBO")
plt.legend()
plt.title("Training vs Validation Loss (ELBO) Gene-Level")
plt.show()

In [None]:
# Training vs. validation ELBO curves for the isoform-level model
plt.plot(train_elbo2, label="Training ELBO")
plt.plot(val_elbo2, label="Validation ELBO")  # legend label typo fixed ("ELB" -> "ELBO")
plt.xlabel("Epoch")
plt.ylabel("Negative ELBO")
plt.legend()
plt.title("Training vs Validation Loss (ELBO) Isoform-Level")
plt.show()

In [None]:
## Get the latent representation and the alpha/beta posterior parameters from the AutoZI models
# Gene-level model
latent_g_autozi = autozi_gene_model_pbmc.get_latent_representation()
outputs_g = autozi_gene_model_pbmc.get_alphas_betas()
alpha_posterior_g = outputs_g['alpha_posterior']
beta_posterior_g = outputs_g['beta_posterior']

# Isoform-level model
latent_i_autozi = autozi_iso_model_pbmc.get_latent_representation()
outputs_i = autozi_iso_model_pbmc.get_alphas_betas()
alpha_posterior_i = outputs_i['alpha_posterior']
beta_posterior_i = outputs_i['beta_posterior']


In [None]:
# Step 3: Analyze Zero-Inflated (ZI) probabilities
# The same value (0.5) is used twice: as the point where the Beta CDF is evaluated
# and as the cutoff applied to the resulting probability.
threshold = 0.5
zi_probs_g = beta.cdf(threshold, alpha_posterior_g, beta_posterior_g)
is_zi_pred_g = zi_probs_g > threshold

zi_probs_i = beta.cdf(threshold, alpha_posterior_i, beta_posterior_i)
is_zi_pred_i = zi_probs_i > threshold

# Mean of the boolean predictions = fraction of features flagged as ZI
print('Fraction of predicted ZI genes in Gene-level Data :', is_zi_pred_g.mean()) 
print('Fraction of predicted ZI isoforms in Isoform-Level Data :', is_zi_pred_i.mean())

In [None]:
# Define a list of latent representations and their corresponding ZI predictions
# (each list currently holds a single entry; the loops below iterate over whatever is listed)
latent_representations_1 = [
    {"name": "20lat_1e2", "is_zi_pred": is_zi_pred_g},
]

latent_representations_2 = [
    {"name": "20lat_1e2", "is_zi_pred": is_zi_pred_i},
]

# Compute the mask for sufficient expression: features whose mean of .X over cells exceeds 1.0
# NOTE(review): the 1.0 cutoff is applied to whatever .X holds at this point — presumably raw counts; confirm.
mask_sufficient_expression_g_pbmc = (np.array(adata_g_filtered.X.mean(axis=0)) > 1.0).reshape(-1)
mask_sufficient_expression_i_pbmc = (np.array(adata_i_filtered.X.mean(axis=0)) > 1.0).reshape(-1)

# Loop through each gene-level entry and report the metrics
for latent_rep in latent_representations_1:
    name = latent_rep["name"]
    is_zi_pred = latent_rep["is_zi_pred"]
    
    # Print the fraction of genes with avg expression > 1.0
    print(f'Fraction of genes in PBMCs with avg expression > 1.0 for {name}:',
          mask_sufficient_expression_g_pbmc.mean())
    
    # Print the fraction of predicted ZI genes among those with avg expression > 1.0
    print(f'Fraction of predicted ZI genes in PBMCs with avg expression > 1.0 for {name}:',
          is_zi_pred[mask_sufficient_expression_g_pbmc].mean())
    print("")

# Same report for the isoform-level entries
for latent_rep in latent_representations_2:
    name = latent_rep["name"]
    is_zi_pred = latent_rep["is_zi_pred"]
    
    # Print the fraction of isoforms with avg expression > 1.0
    print(f'Fraction of isoforms in PBMCs with avg expression > 1.0 for {name}:',
          mask_sufficient_expression_i_pbmc.mean())
    
    # Print the fraction of predicted ZI isoforms among those with avg expression > 1.0
    print(f'Fraction of predicted ZI isoforms in PBMCs with avg expression > 1.0 for {name}:',
          is_zi_pred[mask_sufficient_expression_i_pbmc].mean())
    print("")

In [None]:
# Get denoised (model-normalized) expression from each AutoZI model.
# The result is used via `.values` below, i.e. it is treated as a DataFrame here.
denoised_expr_g = autozi_gene_model_pbmc.get_normalized_expression(adata_g_filtered, library_size=10000)  # Normalized per 10,000 reads
denoised_expr_i = autozi_iso_model_pbmc.get_normalized_expression(adata_i_filtered, library_size=10000)  # Normalized per 10,000 reads

# Assign denoised expression as a new layer (ensure format is compatible with AnnData)
adata_g_filtered.layers["denoised"] = denoised_expr_g.values  # Convert DataFrame to NumPy array
adata_i_filtered.layers["denoised"] = denoised_expr_i.values  # Convert DataFrame to NumPy array

# Apply log1p transformation and keep the result as a separate "log_denoised" layer
adata_g_filtered.layers["log_denoised"] = np.log1p(adata_g_filtered.layers["denoised"])
adata_i_filtered.layers["log_denoised"] = np.log1p(adata_i_filtered.layers["denoised"])

In [None]:
# Print the layers of both objects to confirm "denoised" and "log_denoised" were added
print(adata_g_filtered.layers)
print(adata_i_filtered.layers)

In [None]:
# List the embeddings stored in .obsm (later cells pass use_rep='X_AutoZI' to sc.pp.neighbors)
print(adata_g_filtered.obsm.keys())

In [None]:
# Save the objects that now carry the "denoised" / "log_denoised" layers
output_dir = 'Intermediate_Files/Clustering_05012025'
# Create the target directory first so the writes do not fail when it does not exist yet
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): the ".h5mu" suffix is kept because the loading cell reads these exact names
# with read_h5ad; the objects written here are the per-modality objects, not MuData containers.
adata_g_filtered.write(os.path.join(output_dir, "PBMC_gene_AutoZI_denoised_05032025.h5mu"))
adata_i_filtered.write(os.path.join(output_dir, "PBMC_iso_AutoZI_denoised_05032025.h5mu"))

In [None]:
# Reload the denoised objects saved by the previous cell (read with read_h5ad despite the ".h5mu" suffix).
# This rebinds adata_g_filtered / adata_i_filtered to the objects read from disk.
output_dir = 'Intermediate_Files/Clustering_05012025'

from scanpy import read_h5ad
adata_g_filtered = read_h5ad(os.path.join(output_dir, "PBMC_gene_AutoZI_denoised_05032025.h5mu"))
adata_i_filtered = read_h5ad(os.path.join(output_dir, "PBMC_iso_AutoZI_denoised_05032025.h5mu"))

In [None]:
# Directory scanpy uses for figures written through the `save=` argument of its plotting calls
sc.settings.figdir = "Intermediate_Files/Clustering_05012025/Figures/UMAP"

In [None]:
def relabel_clusters_by_size(adata, cluster_key):
    """Renumber the clusters in ``adata.obs[cluster_key]`` by descending size.

    The most populous cluster becomes 0, the next one 1, and so on. The column
    is overwritten in place with the new integer labels stored as a
    categorical, and the same ``adata`` object is returned.

    Parameters:
    - adata: object exposing an ``obs`` table (e.g. an AnnData)
    - cluster_key: name of the ``obs`` column holding the cluster labels
    """
    labels = adata.obs[cluster_key]

    # Members per cluster, then the positions that order those counts largest-first
    counts = labels.value_counts()
    descending = np.argsort(-counts.values)
    ranked_labels = counts.index[descending]

    # old label -> rank (0 = largest cluster)
    rank_of = dict(zip(ranked_labels, range(len(ranked_labels))))

    adata.obs[cluster_key] = labels.map(rank_of).astype('category')
    return adata

In [None]:
# Shared implementation behind the gene-level and isoform-level UMAP plotters
def _plot_cluster_umaps(adata, resolutions, use_rep_key, level_label):
    """Draw one cluster-coloured UMAP per resolution.

    For each ``res`` the clusters are read from the obs column named
    ``f'{res}_{use_rep_key}'`` and ``level_label`` is inserted into the title.
    """
    print(f"Using representation: {use_rep_key}")  # Print assigned representation

    vibrant_palette = plt.get_cmap('tab20').colors  # Set color palette

    for res in resolutions:
        # Plot UMAP with cluster labels drawn on the data
        sc.pl.umap(
            adata,
            color=f'{res}_{use_rep_key}',
            title=f'UMAP with Clusters ({level_label}, {use_rep_key}, Res={res})',
            frameon=True,
            palette=vibrant_palette,
            legend_loc='on data',
            legend_fontsize=10,
            legend_fontoutline=2,
        )


# Function to plot gene-level UMAPs with cluster labels
def plot_umap_with_labels_g(adata, resolutions, use_rep_key=None):
    """
    Plots gene-level UMAPs with cluster labels and prints the key suffix in use.

    Parameters:
    - adata: AnnData object with a computed UMAP
    - resolutions: List of resolution values to plot
    - use_rep_key: Suffix of the cluster columns; the column looked up for each
      resolution is '<res>_<use_rep_key>'. There is no built-in fallback: leaving
      this as None makes the lookup key '<res>_None'.
    """
    _plot_cluster_umaps(adata, resolutions, use_rep_key, "Gene-Level")


# Function to plot isoform-level UMAPs with cluster labels
def plot_umap_with_labels_i(adata, resolutions, use_rep_key=None):
    """
    Plots isoform-level UMAPs with cluster labels and prints the key suffix in use.

    Parameters:
    - adata: AnnData object with a computed UMAP
    - resolutions: List of resolution values to plot
    - use_rep_key: Suffix of the cluster columns; the column looked up for each
      resolution is '<res>_<use_rep_key>'. There is no built-in fallback: leaving
      this as None makes the lookup key '<res>_None'.
    """
    _plot_cluster_umaps(adata, resolutions, use_rep_key, "Isoform-Level")

In [None]:
# Make the log1p-denoised layer the active matrix; every later cell that reads or
# modifies .X (RBC score, regress_out, scale, PCA) therefore works on these values.
adata_g_filtered.X = adata_g_filtered.layers["log_denoised"]
adata_i_filtered.X = adata_i_filtered.layers["log_denoised"]

In [None]:
import re

# Match hemoglobin subunit symbols only (HBA1/2, HBB, HBD, HBE1, HBG1/2, HBM, HBQ1, HBZ).
# The earlier pattern r'^HB[A-Z0-9]+' accepted every symbol starting with "HB", which also
# pulls in unrelated genes such as HBEGF, HBS1L and HBP1 and would fold them into the RBC score.
# The look-ahead requires the symbol to end there (end of string, ':' or another non-alphanumeric).
rbc_pattern = re.compile(r'^HB(?:A[12]|B|D|E1|G[12]|M|Q1|Z)(?![A-Za-z0-9])')

rbc_genes_all = [gene for gene in adata_g_filtered.var_names if rbc_pattern.match(gene)]
print(f"Found {len(rbc_genes_all)} RBC-related genes")
print(rbc_genes_all)

rbc_isoforms_all = [gene for gene in adata_i_filtered.var_names if rbc_pattern.match(gene)]
print(f"Found {len(rbc_isoforms_all)} RBC-related isoforms")
print(rbc_isoforms_all)

In [None]:
# Calculate RBC score before regressing out effect of ambient RBC RNA

# RBC score = per-cell sum of .X over all matched RBC features.
# Because .X was set to the "log_denoised" layer in an earlier cell, this is a sum of
# log1p-denoised values rather than of raw counts.
# Intended reading: a higher score indicates more ambient RNA contamination from red blood cells,
# and a lower score after regression suggests the RBC ambient effect was removed.
adata_g_filtered.obs['rbc_score'] = np.ravel(adata_g_filtered[:, rbc_genes_all].X.sum(axis=1))
adata_i_filtered.obs['rbc_score'] = np.ravel(adata_i_filtered[:, rbc_isoforms_all].X.sum(axis=1))

In [None]:
import matplotlib.pyplot as plt
# Histogram of the pre-regression RBC score across cells (gene-level)
plt.hist(adata_g_filtered.obs['rbc_score'], bins=50)
plt.xlabel('RBC Score (all RBC genes)')
plt.ylabel('Number of cells')
plt.title('Distribution of RBC Score (all RBC genes)')
plt.show()

# Histogram of the pre-regression RBC score across cells (isoform-level)
plt.hist(adata_i_filtered.obs['rbc_score'], bins=50)
plt.xlabel('RBC Score (all RBC isoforms)')
plt.ylabel('Number of cells')
plt.title('Distribution of RBC Score (all RBC isoforms)')
plt.show()

In [None]:
# Regress the per-cell RBC score out of the expression matrix of each object.
# NOTE(review): this modifies the objects in place, so re-running the cell regresses
# the already-corrected values again — run once per kernel session.
sc.pp.regress_out(adata_g_filtered, ['rbc_score'])
sc.pp.regress_out(adata_i_filtered, ['rbc_score'])

In [None]:
# Recompute the per-cell sum over the same RBC features on the post-regression .X
adata_g_filtered.obs['rbc_score_postregress'] = np.ravel(adata_g_filtered[:, rbc_genes_all].X.sum(axis=1))
adata_i_filtered.obs['rbc_score_postregress'] = np.ravel(adata_i_filtered[:, rbc_isoforms_all].X.sum(axis=1))

In [None]:
# Gene-level post-regression score; the histogram is restricted to [-5, 5], values outside are not drawn
plt.hist(adata_g_filtered.obs['rbc_score_postregress'], bins=50, range=(-5, 5))
plt.xlabel('RBC Score after regression (residuals)')
plt.ylabel('Number of cells')
plt.title('Distribution of RBC Score After Regression (Zoomed In)')
plt.show()

In [None]:
# Isoform-level post-regression score; the histogram is restricted to [-5, 5], values outside are not drawn
plt.hist(adata_i_filtered.obs['rbc_score_postregress'], bins=50, range=(-5, 5))
plt.xlabel('RBC Score after regression (residuals)')
plt.ylabel('Number of cells')
plt.title('Distribution of RBC Score After Regression (Zoomed In)')
plt.show()

In [None]:
# Scale each object's expression matrix with scanpy, passing max_value=10 as the cap
sc.pp.scale(adata_g_filtered, max_value = 10)
sc.pp.scale(adata_i_filtered, max_value = 10)

In [None]:
# Run PCA on the current .X (log-denoised values after the regression and scaling cells above)
# NOTE(review): the neighbor graph below is built from 'X_AutoZI', not from this PCA.
sc.pp.pca(adata_g_filtered)

# Build the neighbor graph on the 'X_AutoZI' representation, then compute the UMAP
# (assumes 'X_AutoZI' is already present in .obsm of the loaded object — it is not set in the visible cells)
sc.pp.neighbors(adata_g_filtered, n_neighbors = 20, use_rep= 'X_AutoZI')
sc.tl.umap(adata_g_filtered)

In [None]:
# Run PCA on the current .X (log-denoised values after the regression and scaling cells above)
# NOTE(review): the neighbor graph below is built from 'X_AutoZI', not from this PCA.
sc.pp.pca(adata_i_filtered)

# Build the neighbor graph on the 'X_AutoZI' representation, then compute the UMAP
# (assumes 'X_AutoZI' is already present in .obsm of the loaded object — it is not set in the visible cells)
sc.pp.neighbors(adata_i_filtered, n_neighbors = 20, use_rep= 'X_AutoZI')
sc.tl.umap(adata_i_filtered)

In [None]:
# Perform Leiden clustering at multiple resolutions
# Each result is written to .obs under the key '<resolution>_log_AutoZI' for both objects.
resolutions = [0.04, 0.06, 0.1, 0.14, 0.16, 0.2, 0.24, 0.26, 0.3, 0.34, 0.36, 0.4, 0.44, 0.46, 0.5, 
               0.54, 0.56, 0.6, 0.64, 0.66, 0.7, 0.74, 0.76, 0.8, 0.84, 0.86, 0.9, 0.94, 0.96, 1.0]
for res in resolutions:
    sc.tl.leiden(adata_g_filtered, resolution=res, key_added=f'{res}_log_AutoZI', flavor = "igraph", n_iterations=2)
    sc.tl.leiden(adata_i_filtered, resolution=res, key_added=f'{res}_log_AutoZI', flavor = "igraph", n_iterations=2)

In [None]:
# Relabel clusters by size for every clustering key '<resolution>_log_AutoZI'
# (note: `resolutions` is rebound here from floats to the obs-column names)
resolutions = [
    f'{value}_log_AutoZI'
    for value in (
        0.04, 0.06, 0.1, 0.14, 0.16, 0.2, 0.24, 0.26, 0.3, 0.34,
        0.36, 0.4, 0.44, 0.46, 0.5, 0.54, 0.56, 0.6, 0.64, 0.66,
        0.7, 0.74, 0.76, 0.8, 0.84, 0.86, 0.9, 0.94, 0.96, 1.0,
    )
]
for cluster_key in resolutions:
    # relabel_clusters_by_size edits the object in place and returns it, so the
    # *_pbmc names end up referring to the same objects as adata_*_filtered
    adata_g_filtered_pbmc = relabel_clusters_by_size(adata_g_filtered, cluster_key)
    adata_i_filtered_pbmc = relabel_clusters_by_size(adata_i_filtered, cluster_key)

In [None]:
# Gene-level UMAP colored by the 'batch' obs column (saving is currently disabled)
sc.pl.umap(adata_g_filtered_pbmc, color=["batch"], title="UMAP Colored by Batch (gene-level)", 
           #save = "_by_batch_gene.pdf"
          )

In [None]:
# Isoform-level UMAP colored by the 'batch' obs column (saving is currently disabled)
sc.pl.umap(adata_i_filtered_pbmc, color=["batch"], title="UMAP Colored by Batch (isoform-level)",
           #save = "_by_batch_isoform.pdf"
          )

In [None]:
# Plot the gene-level UMAP once per resolution; each plot is colored by the
# obs column '<res>_log_AutoZI' (the commented line holds an alternative resolution list)
plot_umap_with_labels_g(adata_g_filtered_pbmc, resolutions=[
    #0.05, 0.08, 0.1, 0.15, 0.18, 0.2, 0.25, 0.28, 0.3, 0.35, 0.38
    0.04, 0.06, 0.1, 0.14, 0.16, 0.2, 0.24, 0.26, 0.3, 0.34, 0.36, 0.4, 
    0.44, 0.46, 0.5, 0.54, 0.56, 0.6, 0.64, 0.66, 0.7, 0.74, 0.76, 0.8,
    0.84, 0.86, 0.9, 0.94, 0.96, 1.0
], use_rep_key= 'log_AutoZI')

In [None]:
sc.settings.figdir = "Intermediate_Files/Clustering_05012025/Figures/UMAP"

# Only use the desired resolution
resolutions = ['0.1_log_AutoZI']

# Relabel for that resolution
cluster_key = resolutions[0]
adata_g_filtered_pbmc = relabel_clusters_by_size(adata_g_filtered, cluster_key)

# Plot UMAP and save to PDF
sc.pl.umap(
    adata_g_filtered_pbmc,
    color='0.1_log_AutoZI',
    title=f'UMAP with Clusters (Gene-Level, log_AutoZI, Res=0.1)',
    frameon=True,
    palette=plt.get_cmap('tab20').colors,
    legend_loc='on data',
    legend_fontsize=10,
    legend_fontoutline=2,
    save="_0.1_log_autoZI_gene.pdf"
)

In [None]:
# Plot the isoform-level UMAP once per resolution; each plot is colored by the
# obs column '<res>_log_AutoZI' (the commented line holds an alternative resolution list)
plot_umap_with_labels_i(adata_i_filtered, resolutions=[
    #0.05, 0.08, 0.1, 0.15, 0.18, 0.2, 0.25, 0.28, 0.3, 0.35, 0.38
    0.04, 0.06, 0.1, 0.14, 0.16, 0.2, 0.24, 0.26, 0.3, 0.34, 0.36, 0.4, 
    0.44, 0.46, 0.5, 0.54, 0.56, 0.6, 0.64, 0.66, 0.7, 0.74, 0.76, 0.8,
    0.84, 0.86, 0.9, 0.94, 0.96, 1.0
], use_rep_key= 'log_AutoZI')

In [None]:
sc.settings.figdir = "Intermediate_Files/Clustering_05012025/Figures/UMAP"
# Only use the desired resolution
resolutions = ['0.1_log_AutoZI']

# Relabel for that resolution
cluster_key = resolutions[0]
adata_i_filtered_pbmc = relabel_clusters_by_size(adata_i_filtered, cluster_key)

# Plot UMAP and save to PDF
sc.pl.umap(
    adata_i_filtered_pbmc,
    color='0.1_log_AutoZI',
    title=f'UMAP with Clusters (Isoform-Level, log_AutoZI, Res=0.1)',
    frameon=True,
    palette=plt.get_cmap('tab20').colors,
    legend_loc='on data',
    legend_fontsize=10,
    legend_fontoutline=2,
    save="_0.1_log_autoZI_isoform.pdf"
)

In [None]:
# Only use the desired resolution
resolutions = ['0.66_log_AutoZI']

# Relabel for that resolution
cluster_key = resolutions[0]
adata_i_filtered_pbmc = relabel_clusters_by_size(adata_i_filtered, cluster_key)

# Plot UMAP and save to PDF
sc.pl.umap(
    adata_g_filtered_pbmc,
    color=cluster_key,
    title=f'UMAP with Clusters (Gene-Level, log_AutoZI, Res=0.66)',
    frameon=True,
    palette=plt.get_cmap('tab20').colors,
    legend_loc='on data',
    legend_fontsize=10,
    legend_fontoutline=2,
    save="_0.66_log_autoZI_isoform.pdf"
)

In [None]:
# List the available layers (the RBC plotting cells below read layers["counts"])
print(adata_g_filtered.layers.keys())

In [None]:
from matplotlib.colors import TwoSlopeNorm
import numpy as np

# One UMAP per RBC gene, with a color scale symmetric around zero.
for gene in rbc_genes_all:
    
    # Skip if gene is not in var_names (defensive; the list was built from var_names)
    if gene not in adata_g_filtered.var_names:
        print(f"{gene} not found in var_names")
        continue

    # Only referenced by the commented-out `save=` argument below
    safe_gene_name = gene.replace(":", "_")

    # Get expression vector from the "counts" layer
    # NOTE(review): the color range is derived from layers["counts"], while sc.pl.umap below is
    # called without a `layer=` argument — confirm that the range and the plotted values are
    # meant to come from different matrices.
    expr = adata_g_filtered[:, gene].layers["counts"]

    # Convert sparse to dense if needed
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Exclude 0s for vmax calculation (to avoid blowout)
    expr_nonzero = expr[expr != 0]

    if len(expr_nonzero) == 0:
        vmax_val = 1.0  # fallback if no expression
    else:
        vmax_val = np.percentile(expr_nonzero, 99)

    # Count cells above vmax for sanity check
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Create symmetric normalization around zero
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    # Plot UMAP
    sc.pl.umap(
        adata_g_filtered,
        color=gene,
        use_raw=False,
        title=f"RBC Gene Expression: {gene}",
        cmap="coolwarm",
        #save=f"_{safe_gene_name}_RBC_UMAP.pdf",
        show=True,
        norm=norm
    )

In [None]:
from matplotlib.colors import TwoSlopeNorm
import numpy as np

# One UMAP per RBC isoform, with a color scale symmetric around zero.
for gene in rbc_isoforms_all:
    
    # Skip if the isoform is not in var_names (defensive; the list was built from var_names)
    if gene not in adata_i_filtered.var_names:
        print(f"{gene} not found in var_names")
        continue

    # Only referenced by the commented-out `save=` argument below
    safe_gene_name = gene.replace(":", "_")

    # Get expression vector from the "counts" layer
    # NOTE(review): the color range is derived from layers["counts"], while sc.pl.umap below is
    # called without a `layer=` argument — confirm that the range and the plotted values are
    # meant to come from different matrices.
    expr = adata_i_filtered[:, gene].layers["counts"]

    # Convert sparse to dense if needed
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Exclude 0s for vmax calculation (to avoid blowout)
    expr_nonzero = expr[expr != 0]

    if len(expr_nonzero) == 0:
        vmax_val = 1.0  # fallback if no expression
    else:
        vmax_val = np.percentile(expr_nonzero, 99)

    # Count cells above vmax for sanity check
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Create symmetric normalization around zero
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    # Plot UMAP
    sc.pl.umap(
        adata_i_filtered,
        color=gene,
        use_raw=False,
        title=f"RBC Isoform Expression: {gene}",
        cmap="coolwarm",
        #save=f"_{safe_gene_name}_RBC_UMAP.pdf",
        show=True,
        norm=norm
    )

In [None]:
# Select the IDs in gene_list that begin with any of the given prefixes
def find_matching_genes(prefixes, gene_list):
    """Return the entries of ``gene_list`` that start with at least one prefix.

    The order of ``gene_list`` is preserved and each entry appears at most once
    per occurrence, even when several prefixes match it.
    """
    matches = []
    for candidate in gene_list:
        for prefix in prefixes:
            if candidate.startswith(prefix):
                matches.append(candidate)
                break  # one hit is enough; move on to the next ID
    return matches

In [None]:
# Marker prefixes for T-cell populations. Every prefix ends with ':' so that
# find_matching_genes (a startswith test) only matches the exact symbol; the entries that
# previously lacked the colon (CD3Z, FOXP1, SELL, CCL5, STAT5A, GATA3, IL2RA) could also
# match longer symbols sharing the same beginning.
#PTPRCAP = CD45
# NOTE(review): CD45 is usually given as PTPRC, with PTPRCAP being the CD45-associated protein — confirm the alias above.
# NOTE(review): "CD3Z" may not be the symbol used in var_names (CD247 is the usual symbol) — confirm it matches anything.
TCell_Markers = ["CD4:", "CD3D:", "CD3E:", "CD3G:", "CD3Z:", "CD8A:", "CD8B:", "PTPRCAP:"]
Naive_TCell = ["PTPRC:", "TCF7:", "FOXP1:", "LEF1:", "PECAM1:"]
Memory_TCell = ["CCR5:", "HLA-DRB1:", "HLA-DRA:", "ITGAE:", "SELL:", "TCF7:", "IL7R:", "CCR7:"]
CD8_TCell = ["CD8A:", "CD8B:", "CXCR3:", "KLRB1:", "PTGDR2:", "GATA3:", "IRF4:", "RORC:", "CCL5:"]
#Central_Memory_TCell = ["CCR5:", "IL7RA:", "EOMES:", "PRDM1:", "IL7R:", "SELL", "CCR7:"]
#Effector_Memory_TCell = ["CCR5:", "HLA-DRB1:", "HLA-DRA:", "ITGAL:", "GZMA:", "PRDM1:", "SELL"]
Reg_TCell = ["FOXP3:", "IL2RA:", "CTLA4:", "STAT5A:"]
#Th1_TCell = ["CXCR3:", "IFNG:", "TNF:", "STAT4:"]
CD4_Effector = ["CXCR3:", "TNF:", "STAT4:", "IL17A:", "IL13:", "IL25:", "AHR:", "FOXO4:", "GATA3:", "IL2RA:"]

In [None]:
# Get gene matches for gene-level data: one prefix search per T-cell marker list,
# each followed by a report of how many IDs matched and which ones.
TCell_genes_1 = find_matching_genes(TCell_Markers, adata_g_filtered.var_names)
print(f"Matched {len(TCell_genes_1)} gene-level IDs for T-Cells: {TCell_genes_1}")
TCell_genes_2 = find_matching_genes(Naive_TCell, adata_g_filtered.var_names)
print(f"Matched {len(TCell_genes_2)} gene-level IDs for Naive T-Cells: {TCell_genes_2}")
TCell_genes_3 = find_matching_genes(Memory_TCell, adata_g_filtered.var_names)
print(f"Matched {len(TCell_genes_3)} gene-level IDs for Memory T-Cells: {TCell_genes_3}")
TCell_genes_4 = find_matching_genes(CD8_TCell, adata_g_filtered.var_names)
print(f"Matched {len(TCell_genes_4)} gene-level IDs for CD8+ T-Cells: {TCell_genes_4}")
TCell_genes_5 = find_matching_genes(Reg_TCell, adata_g_filtered.var_names)
print(f"Matched {len(TCell_genes_5)} gene-level IDs for Regulatory T-Cells: {TCell_genes_5}")
# The CD4_Effector list is reported under the "Helper T-Cells" label
TCell_genes_6 = find_matching_genes(CD4_Effector, adata_g_filtered.var_names)
print(f"Matched {len(TCell_genes_6)} gene-level IDs for Helper T-Cells: {TCell_genes_6}")


In [None]:
# Get matches for isoform-level data: one prefix search per T-cell marker list,
# each followed by a report of how many IDs matched and which ones.
TCell_iso_1 = find_matching_genes(TCell_Markers, adata_i_filtered.var_names)
print(f"Matched {len(TCell_iso_1)} isoform-level IDs for T-Cells: {TCell_iso_1}")
TCell_iso_2 = find_matching_genes(Naive_TCell, adata_i_filtered.var_names)
print(f"Matched {len(TCell_iso_2)} isoform-level IDs for Naive T-Cells: {TCell_iso_2}")
TCell_iso_3 = find_matching_genes(Memory_TCell, adata_i_filtered.var_names)
print(f"Matched {len(TCell_iso_3)} isoform-level IDs for Memory T-Cells: {TCell_iso_3}")
TCell_iso_4 = find_matching_genes(CD8_TCell, adata_i_filtered.var_names)
print(f"Matched {len(TCell_iso_4)} isoform-level IDs for CD8+ T-Cells: {TCell_iso_4}")
TCell_iso_5 = find_matching_genes(Reg_TCell, adata_i_filtered.var_names)
print(f"Matched {len(TCell_iso_5)} isoform-level IDs for Regulatory T-Cells: {TCell_iso_5}")
# The CD4_Effector list is reported under the "Helper T-Cells" label
TCell_iso_6 = find_matching_genes(CD4_Effector, adata_i_filtered.var_names)
print(f"Matched {len(TCell_iso_6)} isoform-level IDs for Helper T-Cells: {TCell_iso_6}")

In [None]:
import scanpy as sc
from matplotlib.colors import TwoSlopeNorm
import matplotlib.pyplot as plt

# Define output directory for the gene-level T-cell marker figures
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "TCell"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

print("T-Cell Markers")

# One UMAP per matched T-cell marker, colored on a scale symmetric around zero
for gene in TCell_genes_1:
    safe_gene_name = gene.replace(":", "_")  # ':' is replaced for use in the file name

    # Get expression vector
    expr = adata_g_filtered[:, gene].X

    # Convert sparse to dense if needed
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on the positive values only
    expr_nonzero = expr[expr > 0]

    # 99th percentile of the positive values, falling back to 1.0 when there are none.
    # This is the only place vmax_val is computed: the earlier unguarded recomputation
    # discarded the fallback and passed an empty array to np.percentile.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range [-vmax, +vmax] centered on zero
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)
    
    sc.pl.umap(
        adata_g_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap paired with the zero-centered norm
        save=f"{safe_gene_name}.pdf",
        show=True,
        norm=norm             # << This handles vmin, vmax, and vcenter
    )

In [None]:
import scanpy as sc
from matplotlib.colors import TwoSlopeNorm
import matplotlib.pyplot as plt

# Define output directory for the isoform-level T-cell marker figures
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "TCell"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

print("T-Cell Markers")

# One UMAP per matched T-cell marker isoform, colored on a scale symmetric around zero
for gene in TCell_iso_1:
    safe_gene_name = gene.replace(":", "_")  # ':' is replaced for use in the file name

    # Get expression vector
    expr = adata_i_filtered[:, gene].X

    # Convert sparse to dense if needed
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on the positive values only
    expr_nonzero = expr[expr > 0]

    # 99th percentile of the positive values, falling back to 1.0 when there are none.
    # This is the only place vmax_val is computed: the earlier unguarded recomputation
    # discarded the fallback and passed an empty array to np.percentile.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range [-vmax, +vmax] centered on zero
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)
    
    sc.pl.umap(
        adata_i_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap paired with the zero-centered norm
        save=f"{safe_gene_name}.pdf",
        show=True,
        norm=norm             # << This handles vmin, vmax, and vcenter
    )

In [None]:
#MS4A1 = CD20, SPN = CD43
# Define gene prefixes of interest (each ends with ':' so only the exact symbol matches)
BCell_Markers = ["MS4A1:","CD19:", "CD79A:", "CD22:", "CD1D:", "CD24:", "CD80:", "CD27:", "SPN:"]

# Get matches for gene-level and isoform-level data
BCell_genes = find_matching_genes(BCell_Markers, adata_g_filtered.var_names)
print(f"Matched {len(BCell_genes)} gene-level IDs: {BCell_genes}")
BCell_iso = find_matching_genes(BCell_Markers, adata_i_filtered.var_names)
print(f"Matched {len(BCell_iso)} isoform-level IDs: {BCell_iso}")

In [None]:
# Plot gene-level B-cell markers on the UMAP

print("B-Cell Markers")

# Define output directory for the gene-level B-cell marker figures
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "BCell"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir


# Loop through matched genes and plot them
for gene in BCell_genes:
    safe_gene_name = gene.replace(":", "_")  # ':' is replaced for use in the file name

    # Get expression vector
    expr = adata_g_filtered[:, gene].X

    # Convert sparse to dense if needed
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on the positive values only
    expr_nonzero = expr[expr > 0]

    # 99th percentile of the positive values, falling back to 1.0 when there are none.
    # This is the only place vmax_val is computed: the earlier unguarded recomputation
    # discarded the fallback and passed an empty array to np.percentile.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range [-vmax, +vmax] centered on zero
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)
    
    sc.pl.umap(
        adata_g_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap paired with the zero-centered norm
        save=f"{safe_gene_name}.pdf",
        show=True,
        norm=norm             # << This handles vmin, vmax, and vcenter
    )

In [None]:

# Define output directory for the isoform-level B-cell marker figures
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "BCell"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir


# Loop through matched isoforms and plot them
for gene in BCell_iso:
    safe_gene_name = gene.replace(":", "_")  # ':' is replaced for use in the file name

    # Get expression vector
    expr = adata_i_filtered[:, gene].X

    # Convert sparse to dense if needed
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on the positive values only
    expr_nonzero = expr[expr > 0]

    # 99th percentile of the positive values, falling back to 1.0 when there are none.
    # This is the only place vmax_val is computed: the earlier unguarded recomputation
    # discarded the fallback and passed an empty array to np.percentile.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range [-vmax, +vmax] centered on zero
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)
    
    sc.pl.umap(
        adata_i_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap paired with the zero-centered norm
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # << This handles vmin, vmax, and vcenter
    )

In [None]:
# FCGR3A = CD16, NCAM1 = CD56, ITGAM = CD11b
# Every prefix ends with ":" like the other marker lists in this notebook.
# NOTE(review): presumably the ":" stops a symbol from matching longer symbols
# that share its prefix; find_matching_genes is defined elsewhere - confirm.
NKCells = ["NCAM1:", "FCGR3A:", "KLRD1:", "KLRF1:", "GZMB:", "CD226:"]
NKActiv = ["IFNG:", "CCL5:", "IL2RB:", "ITGAM:"]

# Get gene matches for gene-level data
NK_genes_1 = find_matching_genes(NKCells, adata_g_filtered.var_names)
print(f"Matched {len(NK_genes_1)} gene-level IDs for NK Cells: {NK_genes_1}")
NK_genes_2 = find_matching_genes(NKActiv, adata_g_filtered.var_names)
print(f"Matched {len(NK_genes_2)} gene-level IDs for Activated NK Cells: {NK_genes_2}")

# Get matches for isoform-level data (messages now say so, matching the
# wording used by the other marker cells)
NK_iso_1 = find_matching_genes(NKCells, adata_i_filtered.var_names)
print(f"Matched {len(NK_iso_1)} iso-level IDs for NK Cells: {NK_iso_1}")
NK_iso_2 = find_matching_genes(NKActiv, adata_i_filtered.var_names)
print(f"Matched {len(NK_iso_2)} iso-level IDs for Activated NK Cells: {NK_iso_2}")

In [None]:
# Save the NK-cell gene-level marker UMAPs into their own figure directory.
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "NK Cell"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # target directory for the `save=` argument below

# Header matches the isoform-level NK cell (these are the NKCells markers).
print("NK Cell Markers")

# One UMAP per matched gene, coloured by that gene's expression.
for gene in NK_genes_1:
    # Keep ":" out of the output file name.
    safe_gene_name = gene.replace(":", "_")

    # Expression of this feature across all cells, flattened to a dense 1-D vector.
    expr = adata_g_filtered[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Only expressing cells contribute to the colour limit.
    expr_nonzero = expr[expr > 0]

    # Colour limit: 99th percentile of the non-zero values, computed once.
    # Falls back to 1.0 when no cell expresses the feature, so the norm below
    # never receives the percentile of an empty array.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Report how many cells exceed the colour limit.
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric limits around zero: zero expression maps to the colormap midpoint.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_g_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",  # diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm,  # carries vmin, vcenter and vmax
    )

In [None]:
# Save the NK-cell isoform-level marker UMAPs into their own figure directory.
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "NK Cell"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # target directory for the `save=` argument below

print("NK Cell Markers")

# One UMAP per matched isoform, coloured by that isoform's expression.
for gene in NK_iso_1:
    # Keep ":" out of the output file name.
    safe_gene_name = gene.replace(":", "_")

    # Expression of this feature across all cells, flattened to a dense 1-D vector.
    expr = adata_i_filtered[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Only expressing cells contribute to the colour limit.
    expr_nonzero = expr[expr > 0]

    # Colour limit: 99th percentile of the non-zero values, computed once.
    # Falls back to 1.0 when no cell expresses the feature, so the norm below
    # never receives the percentile of an empty array.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Report how many cells exceed the colour limit.
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric limits around zero: zero expression maps to the colormap midpoint.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_i_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",  # diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm,  # carries vmin, vcenter and vmax
    )

In [None]:
# Marker prefixes for the monocyte and myeloid panels.
Classical_Monocytes = [
    "CD14:", "FCGR2A:", "IL1B:", "SELL:", "CLEC7A:", "TNF:",
]
NonClassical_Monocytes = [
    "FCGR3A:", "CX3CR1:",
]
Myeloid = [
    "ITGAM:", "ITGAX:", "CD33:", "CD14:", "CD1C:", "HLA-DRA:", "HLA-DRB1:",
]

# Gene-level matches: look each panel up, then report what was found.
Mono_genes_1 = find_matching_genes(
    Classical_Monocytes, adata_g_filtered.var_names
)
print(f"Matched {len(Mono_genes_1)} gene-level IDs for Classical Monocytes: {Mono_genes_1}")

Mono_genes_2 = find_matching_genes(
    NonClassical_Monocytes, adata_g_filtered.var_names
)
print(f"Matched {len(Mono_genes_2)} gene-level IDs for Non-Classical Monocytes: {Mono_genes_2}")

Myeloid_genes = find_matching_genes(
    Myeloid, adata_g_filtered.var_names
)
print(f"Matched {len(Myeloid_genes)} gene-level IDs for Myeloid Cells: {Myeloid_genes}")

# Isoform-level matches for the same three panels.
Mono_iso_1 = find_matching_genes(
    Classical_Monocytes, adata_i_filtered.var_names
)
print(f"Matched {len(Mono_iso_1)} iso-level IDs for Classical Monocytes: {Mono_iso_1}")

Mono_iso_2 = find_matching_genes(
    NonClassical_Monocytes, adata_i_filtered.var_names
)
print(f"Matched {len(Mono_iso_2)} iso-level IDs for Non-classical Monocytes: {Mono_iso_2}")

Myeloid_iso = find_matching_genes(
    Myeloid, adata_i_filtered.var_names
)
print(f"Matched {len(Myeloid_iso)} iso-level IDs for Myeloid Cells: {Myeloid_iso}")

In [None]:
# Save the myeloid gene-level marker UMAPs into their own figure directory.
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "Myeloid Cell"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # target directory for the `save=` argument below

print("Myeloid Markers")

# One UMAP per matched gene, coloured by that gene's expression.
for gene in Myeloid_genes:
    # Keep ":" out of the output file name.
    safe_gene_name = gene.replace(":", "_")

    # Expression of this feature across all cells, flattened to a dense 1-D vector.
    expr = adata_g_filtered[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Only expressing cells contribute to the colour limit.
    expr_nonzero = expr[expr > 0]

    # Colour limit: 99th percentile of the non-zero values, computed once.
    # Falls back to 1.0 when no cell expresses the feature, so the norm below
    # never receives the percentile of an empty array.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Report how many cells exceed the colour limit.
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric limits around zero: zero expression maps to the colormap midpoint.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_g_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",  # diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm,  # carries vmin, vcenter and vmax
    )

In [None]:
# Save the classical-monocyte gene-level marker UMAPs into their own figure directory.
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "Classical Monocytes"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # target directory for the `save=` argument below

print("Classical Monocyte Markers")

# One UMAP per matched gene, coloured by that gene's expression.
for gene in Mono_genes_1:
    # Keep ":" out of the output file name.
    safe_gene_name = gene.replace(":", "_")

    # Expression of this feature across all cells, flattened to a dense 1-D vector.
    expr = adata_g_filtered[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Only expressing cells contribute to the colour limit.
    expr_nonzero = expr[expr > 0]

    # Colour limit: 99th percentile of the non-zero values, computed once.
    # Falls back to 1.0 when no cell expresses the feature, so the norm below
    # never receives the percentile of an empty array.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Report how many cells exceed the colour limit.
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric limits around zero: zero expression maps to the colormap midpoint.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_g_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",  # diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm,  # carries vmin, vcenter and vmax
    )

In [None]:
# Save the non-classical-monocyte gene-level marker UMAPs into their own figure directory.
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "NonClassical Monocytes"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # target directory for the `save=` argument below

print("NonClassical Monocyte Markers")

# One UMAP per matched gene, coloured by that gene's expression.
for gene in Mono_genes_2:
    # Keep ":" out of the output file name.
    safe_gene_name = gene.replace(":", "_")

    # Expression of this feature across all cells, flattened to a dense 1-D vector.
    expr = adata_g_filtered[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Only expressing cells contribute to the colour limit.
    expr_nonzero = expr[expr > 0]

    # Colour limit: 99th percentile of the non-zero values, computed once.
    # Falls back to 1.0 when no cell expresses the feature, so the norm below
    # never receives the percentile of an empty array.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Report how many cells exceed the colour limit.
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric limits around zero: zero expression maps to the colormap midpoint.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_g_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",  # diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm,  # carries vmin, vcenter and vmax
    )

In [None]:
# Save the myeloid isoform-level marker UMAPs into their own figure directory.
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "Myeloid Cell"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # target directory for the `save=` argument below

print("Myeloid Markers")

# One UMAP per matched isoform, coloured by that isoform's expression.
for gene in Myeloid_iso:
    # Keep ":" out of the output file name.
    safe_gene_name = gene.replace(":", "_")

    # Expression of this feature across all cells, flattened to a dense 1-D vector.
    expr = adata_i_filtered[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Only expressing cells contribute to the colour limit.
    expr_nonzero = expr[expr > 0]

    # Colour limit: 99th percentile of the non-zero values, computed once.
    # Falls back to 1.0 when no cell expresses the feature, so the norm below
    # never receives the percentile of an empty array.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Report how many cells exceed the colour limit.
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric limits around zero: zero expression maps to the colormap midpoint.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_i_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",  # diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm,  # carries vmin, vcenter and vmax
    )

In [None]:
# Save the classical-monocyte isoform-level marker UMAPs into their own figure directory.
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "Classical Monocytes"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # target directory for the `save=` argument below

print("Classical Monocyte Markers")

# One UMAP per matched isoform, coloured by that isoform's expression.
for gene in Mono_iso_1:
    # Keep ":" out of the output file name.
    safe_gene_name = gene.replace(":", "_")

    # Expression of this feature across all cells, flattened to a dense 1-D vector.
    expr = adata_i_filtered[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Only expressing cells contribute to the colour limit.
    expr_nonzero = expr[expr > 0]

    # Colour limit: 99th percentile of the non-zero values, computed once.
    # Falls back to 1.0 when no cell expresses the feature, so the norm below
    # never receives the percentile of an empty array.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Report how many cells exceed the colour limit.
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric limits around zero: zero expression maps to the colormap midpoint.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_i_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",  # diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm,  # carries vmin, vcenter and vmax
    )

In [None]:
# Save the non-classical-monocyte isoform-level marker UMAPs into their own figure directory.
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "NonClassical Monocytes"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # target directory for the `save=` argument below

print("NonClassical Monocyte Markers")

# One UMAP per matched isoform, coloured by that isoform's expression.
for gene in Mono_iso_2:
    # Keep ":" out of the output file name.
    safe_gene_name = gene.replace(":", "_")

    # Expression of this feature across all cells, flattened to a dense 1-D vector.
    expr = adata_i_filtered[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Only expressing cells contribute to the colour limit.
    expr_nonzero = expr[expr > 0]

    # Colour limit: 99th percentile of the non-zero values, computed once.
    # Falls back to 1.0 when no cell expresses the feature, so the norm below
    # never receives the percentile of an empty array.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Report how many cells exceed the colour limit.
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric limits around zero: zero expression maps to the colormap midpoint.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_i_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",  # diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm,  # carries vmin, vcenter and vmax
    )

In [None]:
# Marker prefixes for conventional and plasmacytoid dendritic cells.
# Every prefix ends with ":" like the other marker lists in this notebook
# ("LILRB4" was the one entry missing it).
# NOTE(review): presumably the ":" stops a symbol from matching longer symbols
# that share its prefix; find_matching_genes is defined elsewhere - confirm.
DC = ["CD1C:", "CD1A:", "ZBTB46:", "ITGAE:", "HLA-DRA:", "HLA-DRB1:", "THBD:", "SIRPA:", "LILRB4:", "IRF8:", "IRF4:", "CD14:"]
pDC = ["CLEC4C:", "TNF:", "TCF4:", "TLR7:", "TLR9:"]

# Get gene matches for gene-level data
DC_genes = find_matching_genes(DC, adata_g_filtered.var_names)
print(f"Matched {len(DC_genes)} gene-level IDs for Dendritic Cells: {DC_genes}")
pDC_genes = find_matching_genes(pDC, adata_g_filtered.var_names)
print(f"Matched {len(pDC_genes)} gene-level IDs for Plasmacytoid Dendritic Cells: {pDC_genes}")

# Get gene matches for iso-level data
DC_iso = find_matching_genes(DC, adata_i_filtered.var_names)
print(f"Matched {len(DC_iso)} isoform-level IDs for Dendritic Cells: {DC_iso}")
pDC_iso = find_matching_genes(pDC, adata_i_filtered.var_names)
print(f"Matched {len(pDC_iso)} isoform-level IDs for Plasmacytoid Dendritic Cells: {pDC_iso}")

In [None]:
# Save the dendritic-cell gene-level marker UMAPs into their own figure directory.
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "Dendritic Cells"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # target directory for the `save=` argument below

print("Dendritic Cell Markers")

# One UMAP per matched gene, coloured by that gene's expression.
for gene in DC_genes:
    # Keep ":" out of the output file name.
    safe_gene_name = gene.replace(":", "_")

    # Expression of this feature across all cells, flattened to a dense 1-D vector.
    expr = adata_g_filtered[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Only expressing cells contribute to the colour limit.
    expr_nonzero = expr[expr > 0]

    # Colour limit: 99th percentile of the non-zero values, computed once.
    # Falls back to 1.0 when no cell expresses the feature, so the norm below
    # never receives the percentile of an empty array.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Report how many cells exceed the colour limit.
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric limits around zero: zero expression maps to the colormap midpoint.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_g_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",  # diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm,  # carries vmin, vcenter and vmax
    )

In [None]:
# Save the plasmacytoid-DC gene-level marker UMAPs into their own figure directory.
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "Plasmacytoid Dendritic Cells"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # target directory for the `save=` argument below

print("Plasmacytoid Dendritic Cell Markers")

# One UMAP per matched gene, coloured by that gene's expression.
for gene in pDC_genes:
    # Keep ":" out of the output file name.
    safe_gene_name = gene.replace(":", "_")

    # Expression of this feature across all cells, flattened to a dense 1-D vector.
    expr = adata_g_filtered[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Only expressing cells contribute to the colour limit.
    expr_nonzero = expr[expr > 0]

    # Colour limit: 99th percentile of the non-zero values, computed once.
    # Falls back to 1.0 when no cell expresses the feature, so the norm below
    # never receives the percentile of an empty array.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Report how many cells exceed the colour limit.
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric limits around zero: zero expression maps to the colormap midpoint.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_g_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",  # diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm,  # carries vmin, vcenter and vmax
    )

In [None]:
# Save the dendritic-cell isoform-level marker UMAPs into their own figure directory.
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "Dendritic Cells"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # target directory for the `save=` argument below

print("Dendritic Cell Markers")

# One UMAP per matched isoform, coloured by that isoform's expression.
for gene in DC_iso:
    # Keep ":" out of the output file name.
    safe_gene_name = gene.replace(":", "_")

    # Expression of this feature across all cells, flattened to a dense 1-D vector.
    expr = adata_i_filtered[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Only expressing cells contribute to the colour limit.
    expr_nonzero = expr[expr > 0]

    # Colour limit: 99th percentile of the non-zero values, computed once.
    # Falls back to 1.0 when no cell expresses the feature, so the norm below
    # never receives the percentile of an empty array.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Report how many cells exceed the colour limit.
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric limits around zero: zero expression maps to the colormap midpoint.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_i_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",  # diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm,  # carries vmin, vcenter and vmax
    )

In [None]:
# Save the plasmacytoid-DC isoform-level marker UMAPs into their own figure directory.
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "Plasmacytoid Dendritic Cells"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # target directory for the `save=` argument below

print("Plasmacytoid Dendritic Cell Markers")

# One UMAP per matched isoform, coloured by that isoform's expression.
for gene in pDC_iso:
    # Keep ":" out of the output file name.
    safe_gene_name = gene.replace(":", "_")

    # Expression of this feature across all cells, flattened to a dense 1-D vector.
    expr = adata_i_filtered[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Only expressing cells contribute to the colour limit.
    expr_nonzero = expr[expr > 0]

    # Colour limit: 99th percentile of the non-zero values, computed once.
    # Falls back to 1.0 when no cell expresses the feature, so the norm below
    # never receives the percentile of an empty array.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Report how many cells exceed the colour limit.
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric limits around zero: zero expression maps to the colormap midpoint.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_i_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",  # diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm,  # carries vmin, vcenter and vmax
    )

In [None]:
# Marker prefixes for megakaryocytes.
# Every prefix ends with ":" like the other marker lists in this notebook
# ("CXCR2" was the one entry missing it).
# NOTE(review): presumably the ":" stops a symbol from matching longer symbols
# that share its prefix; find_matching_genes is defined elsewhere - confirm.
# NOTE(review): "CD42D" looks like a CD alias rather than a gene symbol, so it
# may never match a var name - verify against the printed matches.
MK = ["CD42D:", "CXCR1:", "CXCR2:", "ITGA2B:", "GP1BA:", "CXCR4:", "SLAMF1:", "MPL:",  "GP5:"]

# Get gene matches for gene-level data
MK_genes = find_matching_genes(MK, adata_g_filtered.var_names)
print(f"Matched {len(MK_genes)} gene-level IDs for Megakaryotes: {MK_genes}")

# Get matches for isoform-level data
MK_iso = find_matching_genes(MK, adata_i_filtered.var_names)
print(f"Matched {len(MK_iso)} isoform-level IDs for Megakaryotes: {MK_iso}")

In [None]:
# Save the megakaryocyte gene-level marker UMAPs into their own figure directory.
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "Megakaryocyte"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # target directory for the `save=` argument below

print("Megakaryocyte Markers")

# One UMAP per matched gene, coloured by that gene's expression.
for gene in MK_genes:
    # Keep ":" out of the output file name.
    safe_gene_name = gene.replace(":", "_")

    # Expression of this feature across all cells, flattened to a dense 1-D vector.
    expr = adata_g_filtered[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Only expressing cells contribute to the colour limit.
    expr_nonzero = expr[expr > 0]

    # Colour limit: 99th percentile of the non-zero values, computed once.
    # Falls back to 1.0 when no cell expresses the feature, so the norm below
    # never receives the percentile of an empty array.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Report how many cells exceed the colour limit.
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric limits around zero: zero expression maps to the colormap midpoint.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_g_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",  # diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm,  # carries vmin, vcenter and vmax
    )

In [None]:
# Save the megakaryocyte isoform-level marker UMAPs into their own figure directory.
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "Megakaryocyte"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # target directory for the `save=` argument below

print("Megakaryocyte Markers")

# One UMAP per matched isoform, coloured by that isoform's expression.
for gene in MK_iso:
    # Keep ":" out of the output file name.
    safe_gene_name = gene.replace(":", "_")

    # Expression of this feature across all cells, flattened to a dense 1-D vector.
    expr = adata_i_filtered[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Only expressing cells contribute to the colour limit.
    expr_nonzero = expr[expr > 0]

    # Colour limit: 99th percentile of the non-zero values, computed once.
    # Falls back to 1.0 when no cell expresses the feature, so the norm below
    # never receives the percentile of an empty array.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Report how many cells exceed the colour limit.
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric limits around zero: zero expression maps to the colormap midpoint.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_i_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",  # diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm,  # carries vmin, vcenter and vmax
    )

In [None]:
# Marker prefixes for the transition-cell panel.
Transition = [
    "STAT4:", "IFNG:", "FOXO4:",
    "CD4:", "TCF7:", "IL7R:",
]

# Gene-level matches, reported as soon as they are found.
Transition_genes = find_matching_genes(
    Transition, adata_g_filtered.var_names
)
print(f"Matched {len(Transition_genes)} gene-level IDs for Transition Cells: {Transition_genes}")

# Isoform-level matches for the same panel.
Transition_iso = find_matching_genes(
    Transition, adata_i_filtered.var_names
)
print(f"Matched {len(Transition_iso)} isoform-level IDs for Transition Cells: {Transition_iso}")

In [None]:
# Save the transition-cell gene-level marker UMAPs into their own figure directory.
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "TransitionCells"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # target directory for the `save=` argument below

print("Transition Cell Markers")

# One UMAP per matched gene, coloured by that gene's expression.
for gene in Transition_genes:
    # Keep ":" out of the output file name.
    safe_gene_name = gene.replace(":", "_")

    # Expression of this feature across all cells, flattened to a dense 1-D vector.
    expr = adata_g_filtered[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Only expressing cells contribute to the colour limit.
    expr_nonzero = expr[expr > 0]

    # Colour limit: 99th percentile of the non-zero values, computed once.
    # Falls back to 1.0 when no cell expresses the feature, so the norm below
    # never receives the percentile of an empty array.
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Report how many cells exceed the colour limit.
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric limits around zero: zero expression maps to the colormap midpoint.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_g_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",  # diverging colormap, centred by `norm`
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm,  # carries vmin, vcenter and vmax
    )

In [None]:
# Human-readable cell-type label for each cluster ID of the
# '0.1_log_AutoZI' clustering. Cluster IDs missing from this mapping keep
# their original string value.
celltype_names = {
    "0": "TCells",
    "1": "NK Cells",
    "2": "BCells",
    "3": "Monocyte-derived",
    "4": "Megakaryocytes"
}

# Relabel the cluster IDs (as strings) and store the result as a new obs column.
# NOTE(review): the cluster IDs are read from adata_g_filtered_pbmc but the
# labels are written to adata_g_filtered - confirm both hold the same cells.
_cluster_ids_as_str = adata_g_filtered_pbmc.obs['0.1_log_AutoZI'].astype(str)
adata_g_filtered.obs['gen_cell_type'] = _cluster_ids_as_str.replace(celltype_names)

# Sanity check: number of cells per assigned label.
print(adata_g_filtered.obs['gen_cell_type'].value_counts())

In [None]:
import matplotlib.pyplot as plt
import scanpy as sc

# Figure directory for the annotated UMAP. Created up front so the manual
# savefig at the end of this cell cannot fail on a missing folder.
umap_fig_dir = "Intermediate_Files/Clustering_05012025/Figures/UMAP"
os.makedirs(umap_fig_dir, exist_ok=True)
sc.settings.figdir = umap_fig_dir

# Only use the desired resolution
resolutions = ['0.1_log_AutoZI']

# Order in which the cell types should appear in the legend
legend_order = ["TCells", "NK Cells", "BCells", "Monocyte-derived", "Megakaryocytes"]

# Draw the UMAP coloured by cell type; nothing is saved at this point.
sc.pl.umap(
    adata_g_filtered,
    color='gen_cell_type',
    title='',  # No title for publication version
    frameon=True,
    palette=plt.get_cmap('tab20').colors,
    legend_loc='lower center',
    legend_fontsize=10,
    legend_fontoutline=1,
    show=False  # keep the figure open so it can be edited below
)

# Resize the figure scanpy just created.
fig = plt.gcf()
fig.set_size_inches(6, 4)

# Collect the legend entries scanpy generated.
ax = plt.gca()
handles, labels = ax.get_legend_handles_labels()

# Reorder the entries to follow legend_order; labels not present are skipped.
order = [labels.index(l) for l in legend_order if l in labels]
ordered_handles = [handles[i] for i in order]
ordered_labels = [labels[i] for i in order]

# Drop scanpy's legend before adding the reordered one.
ax.get_legend().remove()

# Reordered legend, centred below the axes.
ax.legend(ordered_handles, ordered_labels,
          loc='lower center',
          bbox_to_anchor=(0.5, -0.25),  # negative y pushes the legend below the axes
          fontsize=10,
          frameon=True,
          ncol=3)  # entries laid out in three columns

plt.tight_layout()

# Save manually
plt.savefig(os.path.join(umap_fig_dir, "UMAP_0.1_log_autoZI_gene_celltypes.pdf"),
            dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Output folder for the violin figure; created here so savefig cannot fail on
# a missing directory.
violin_fig_dir = "Intermediate_Files/Clustering_05012025/Figures/UMAP"
os.makedirs(violin_fig_dir, exist_ok=True)

# Cell-type categories and their colours, read from adata_g_filtered: that is
# the object 'gen_cell_type' was assigned on and the one passed to sc.pl.umap
# for the cell-type UMAP.
# NOTE(review): presumably the cell-type UMAP cell must have run first, since
# scanpy is what stores the '<key>_colors' entry - confirm.
cluster_categories = adata_g_filtered.obs["gen_cell_type"].cat.categories
cluster_colors_actual = adata_g_filtered.uns["gen_cell_type_colors"]

# Map each cell type to its UMAP colour; list/tuple colours are reduced to RGB floats.
umap_colors = dict(zip(cluster_categories, cluster_colors_actual))
umap_colors = {k: tuple(float(x) for x in v[:3]) if isinstance(v, (list, tuple)) else v for k, v in umap_colors.items()}

# Genes to plot and the order of the cell types (used for hue and legend)
genes = ['CD3D:ENSG00000167286', 'CD3E:ENSG00000198851', 'CD3G:ENSG00000160654']
cluster_order = ["TCells", "NK Cells", "BCells", "Monocyte-derived", "Megakaryocytes"]

# Long-format table: one row per (cell, gene). Expression and cell-type labels
# come from the same AnnData so the rows are guaranteed to line up.
clusters = adata_g_filtered.obs['gen_cell_type'].values
plot_data = []
for gene in genes:
    expr = adata_g_filtered[:, gene].X
    # Accept sparse or dense X, like the marker UMAP cells do.
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)
    plot_data.append(pd.DataFrame({
        "Gene": [gene.split(":")[0]] * len(expr),
        "Expression": expr,
        "Cluster": clusters
    }))

df = pd.concat(plot_data)

# Plot
plt.figure(figsize=(6, 4))
sns.violinplot(data=df, x="Gene", y="Expression", hue="Cluster",
               order=["CD3D", "CD3E", "CD3G"],
               hue_order=cluster_order,
               palette=[umap_colors[c] for c in cluster_order],
               cut=0, linewidth=1)

#plt.xlabel("Gene")
plt.ylabel("Expression")

# Collect the legend entries seaborn generated.
ax = plt.gca()
handles, labels = ax.get_legend_handles_labels()

# Reorder the entries using this cell's own cluster_order, so the cell does not
# depend on a variable defined in another cell.
order = [labels.index(l) for l in cluster_order if l in labels]
ordered_handles = [handles[i] for i in order]
ordered_labels = [labels[i] for i in order]

# Drop the automatic legend, if there is one, before adding the reordered one.
if ax.get_legend() is not None:
    ax.get_legend().remove()

# Reordered legend, centred below the axes.
ax.legend(ordered_handles, ordered_labels,
          loc='lower center',
          bbox_to_anchor=(0.5, -0.4),  # negative y pushes the legend below the axes
          fontsize=10,
          frameon=True,
          ncol=3)  # entries laid out in three columns
plt.tight_layout()

plt.savefig(os.path.join(violin_fig_dir, "CD3_violin_plot.pdf"),
            dpi=600, transparent=True, bbox_inches="tight")
plt.show()

In [None]:
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import ListedColormap

# Figure directory for the aggregate-marker UMAPs. Created up front so the
# manual savefig at the end of this cell cannot fail on a missing folder.
tcell_fig_dir = "Intermediate_Files/Clustering_05012025/Figures/UMAP"
os.makedirs(tcell_fig_dir, exist_ok=True)
sc.settings.figdir = tcell_fig_dir

# Genes of interest
genes = ['CD3D:ENSG00000167286', 'CD3E:ENSG00000198851', 'CD3G:ENSG00000160654']

# Per-cell sum of the marker expression. Accept sparse or dense X, like the
# marker UMAP cells do.
cd3_matrix = adata_g_filtered[:, genes].X
if hasattr(cd3_matrix, "toarray"):
    cd3_matrix = cd3_matrix.toarray()
adata_g_filtered.obs['CD3_Combined'] = np.asarray(cd3_matrix).sum(axis=1)

# Symmetric colour limits at +/- the maximum summed expression (the 100th
# percentile is the maximum), so zero sits at the colormap midpoint.
cd3_max = np.percentile(adata_g_filtered.obs['CD3_Combined'], 100)

# Plot UMAP; with show=False scanpy hands back the Axes it drew on.
ax = sc.pl.umap(
    adata_g_filtered,
    color='CD3_Combined',
    cmap= "coolwarm",
    frameon=True,
    title="Total TCell Marker Expression",
    vmin=-cd3_max,
    vmax=cd3_max,
    show = False
)

# Adjust figure size
fig = ax.figure
fig.set_size_inches(5, 3)

# Final title (replaces the one passed to sc.pl.umap) and no axis ticks
ax.set_title("TCell Aggregate Gene Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Add LEFT SIDE annotation (as a side title)
#fig.text(-0.03, 0.53, "TCell Aggregate\nGene Marker Expression", 
#         va='center', ha='center', rotation=90, fontsize=11)


# Save final figure
plt.tight_layout()
plt.savefig(os.path.join(tcell_fig_dir, "umap_TCell_combined_expression_gene_aggregate.pdf"),
            dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
# B-cell marker set
genes = [
    'CD22:ENSG00000012124',
    'CD79A:ENSG00000105369',
    'MS4A1:ENSG00000156738',
    'CD19:ENSG00000177455',
]

# Per-cell aggregate score: sum of the marker columns of X
adata_g_filtered.obs['BCell_Markers_Combined'] = (
    adata_g_filtered[:, genes].X.toarray().sum(axis=1)
)

# The 100th percentile is the maximum; the colour range is [-max, +max],
# i.e. symmetric about zero.
expr_max = np.percentile(adata_g_filtered.obs['BCell_Markers_Combined'], 100)

# Draw the UMAP; show=False keeps the figure open for the styling below
sc.pl.umap(
    adata_g_filtered,
    color='BCell_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="BCell Marker Combined Expression",
    vmin=-expr_max,
    vmax=expr_max,
    show=False,
)

# Resize the current figure
fig = plt.gcf()
fig.set_size_inches(5, 3)

# Replace the title set by scanpy and hide the axis ticks
ax = plt.gca()
ax.set_title("BCell Aggregate Gene Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Save final figure
plt.tight_layout()
plt.savefig(
    "Intermediate_Files/Clustering_05012025/Figures/UMAP/umap_BCell_combined_expression_gene_aggregate.pdf",
    dpi=600, transparent=True, bbox_inches="tight",
)

plt.show()

In [None]:
# Recorded matches from the marker lookup (kept for provenance):
# Matched 6 gene-level IDs for NK Cells: ['GZMB:ENSG00000100453', 'KLRD1:ENSG00000134539', 'NCAM1:ENSG00000149294', 'KLRF1:ENSG00000150045', 'CD226:ENSG00000150637', 'FCGR3A:ENSG00000203747']
# Matched 4 gene-level IDs for Activated NK Cells: ['IL2RB:ENSG00000100385', 'IFNG:ENSG00000111537', 'ITGAM:ENSG00000169896', 'CCL5:ENSG00000271503']

# NK-cell marker set actually plotted (a subset of the two lists above)
genes = [
    'GZMB:ENSG00000100453',
    'NCAM1:ENSG00000149294',
    'KLRF1:ENSG00000150045',
    'IL2RB:ENSG00000100385',
    'ITGAM:ENSG00000169896',
    'CD226:ENSG00000150637',
    'FCGR3A:ENSG00000203747',
]

# Per-cell aggregate score: sum of the marker columns of X
adata_g_filtered.obs['NKCell_Markers_Combined'] = (
    adata_g_filtered[:, genes].X.toarray().sum(axis=1)
)

# The 100th percentile is the maximum; the colour range is [-max, +max],
# i.e. symmetric about zero.
expr_max = np.percentile(adata_g_filtered.obs['NKCell_Markers_Combined'], 100)

# Draw the UMAP; show=False keeps the figure open for the styling below
sc.pl.umap(
    adata_g_filtered,
    color='NKCell_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="NKCell Marker Combined Expression",
    vmin=-expr_max,
    vmax=expr_max,
    show=False,
)

# Resize the current figure
fig = plt.gcf()
fig.set_size_inches(5, 3)

# Replace the title set by scanpy and hide the axis ticks
ax = plt.gca()
ax.set_title("Natural Killer (NK) Cell Aggregate Gene Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Save final figure
plt.tight_layout()
plt.savefig(
    "Intermediate_Files/Clustering_05012025/Figures/UMAP/umap_NKCell_combined_expression_gene_aggregate.pdf",
    dpi=600, transparent=True, bbox_inches="tight",
)

plt.show()

In [None]:
# Recorded matches from the marker lookup (kept for provenance):
# Matched 6 gene-level IDs for Classical Monocytes: ['IL1B:ENSG00000125538', 'FCGR2A:ENSG00000143226', 'CD14:ENSG00000170458', 'CLEC7A:ENSG00000172243', 'SELL:ENSG00000188404', 'TNF:ENSG00000232810']
# Matched 2 gene-level IDs for Non-Classical Monocytes: ['CX3CR1:ENSG00000168329', 'FCGR3A:ENSG00000203747']

# Monocyte-derived marker set actually plotted
genes = [
    'FCGR2A:ENSG00000143226',
    'FCGR3A:ENSG00000203747',
    'CLEC7A:ENSG00000172243',
    'CD33:ENSG00000105383',
    'LILRB4:ENSG00000186818',
]

# Per-cell aggregate score: sum of the marker columns of X
adata_g_filtered.obs['Mono_Markers_Combined'] = (
    adata_g_filtered[:, genes].X.toarray().sum(axis=1)
)

# The 100th percentile is the maximum; the colour range is [-max, +max],
# i.e. symmetric about zero.
expr_max = np.percentile(adata_g_filtered.obs['Mono_Markers_Combined'], 100)

# Draw the UMAP; show=False keeps the figure open for the styling below
sc.pl.umap(
    adata_g_filtered,
    color='Mono_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Monocyte-derived Marker Combined Expression",
    vmin=-expr_max,
    vmax=expr_max,
    show=False,
)

# Resize the current figure
fig = plt.gcf()
fig.set_size_inches(5, 3)

# Replace the title set by scanpy and hide the axis ticks
ax = plt.gca()
ax.set_title("Monocyte-derived Aggregate Gene Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Save final figure
plt.tight_layout()
plt.savefig(
    "Intermediate_Files/Clustering_05012025/Figures/UMAP/umap_MonocyteDerived_combined_expression_gene_aggregate.pdf",
    dpi=600, transparent=True, bbox_inches="tight",
)

plt.show()

In [None]:
# Recorded matches from the marker lookup for megakaryocytes (the original
# note says 8 matched IDs; 7 are listed):
# ['ITGA2B:ENSG00000005961','MPL:ENSG00000117400', 'CXCR4:ENSG00000121966', 'CXCR1:ENSG00000163464', 'GP5:ENSG00000178732', 'CXCR2:ENSG00000180871', 'GP1BA:ENSG00000185245']

# Megakaryocyte marker set actually plotted
genes = [
    'MPL:ENSG00000117400',
    'ITGA2B:ENSG00000005961',
]

# Per-cell aggregate score: sum of the marker columns of X
adata_g_filtered.obs['MK_Markers_Combined'] = (
    adata_g_filtered[:, genes].X.toarray().sum(axis=1)
)

# The 100th percentile is the maximum; the colour range is [-max, +max],
# i.e. symmetric about zero.
expr_max = np.percentile(adata_g_filtered.obs['MK_Markers_Combined'], 100)

# Draw the UMAP; show=False keeps the figure open for the styling below
sc.pl.umap(
    adata_g_filtered,
    color='MK_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Megakaryocyte Marker Combined Expression",
    vmin=-expr_max,
    vmax=expr_max,
    show=False,
)

# Resize the current figure
fig = plt.gcf()
fig.set_size_inches(5, 3)

# Replace the title set by scanpy and hide the axis ticks
ax = plt.gca()
ax.set_title("Megakaryocyte Aggregate Gene Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Save final figure
plt.tight_layout()
plt.savefig(
    "Intermediate_Files/Clustering_05012025/Figures/UMAP/umap_Megakaryocyte_combined_expression_gene_aggregate.pdf",
    dpi=600, transparent=True, bbox_inches="tight",
)

plt.show()

In [None]:
# Map each cluster label (as a string) to a cell-type name. Labels that are
# not keys of this dict are left unchanged by `.replace` below.
celltype_names = {
    "0": "TCells",
    "1": "NK Cells",
    "2": "BCells",
    "3": "Monocyte-derived",
    "4": "Megakaryocytes"
}

# Store the named cell types in a new obs column on the isoform object.
# NOTE(review): the right-hand side reads from `adata_i_filtered_pbmc`, while
# the column is written to `adata_i_filtered`. `adata_i_filtered_pbmc` is not
# defined in the part of the notebook reviewed here - confirm it exists on a
# fresh kernel, or whether `adata_i_filtered` was intended.
adata_i_filtered.obs['gen_cell_type'] = adata_i_filtered_pbmc.obs['0.1_log_AutoZI'].astype(str).replace(celltype_names)

# Sanity check: number of cells per assigned cell type
print(adata_i_filtered.obs['gen_cell_type'].value_counts())

In [None]:
import matplotlib.pyplot as plt
import scanpy as sc


# Order in which the cell types should appear in the legend
legend_order = ["TCells", "NK Cells", "BCells", "Monocyte-derived", "Megakaryocytes"]

# Draw the cell-type UMAP; show=False keeps the figure open so the legend
# can be rebuilt before saving
sc.pl.umap(
    adata_i_filtered,
    color='gen_cell_type',
    title='',  # no title for the publication version
    frameon=True,
    palette=plt.get_cmap('tab20').colors,
    legend_loc='lower right',
    legend_fontsize=10,
    legend_fontoutline=1,
    show=False,
)

# Resize the current figure
fig = plt.gcf()
fig.set_size_inches(6, 4)

# Read back the legend entries scanpy registered on the axes
ax = plt.gca()
handles, labels = ax.get_legend_handles_labels()

# Index of each wanted label, in legend_order order; absent labels are skipped
order = [labels.index(name) for name in legend_order if name in labels]
ordered_handles = [handles[idx] for idx in order]
ordered_labels = [labels[idx] for idx in order]

# Drop scanpy's legend and draw the reordered one below the plot
ax.get_legend().remove()
ax.legend(
    ordered_handles,
    ordered_labels,
    loc='lower center',
    bbox_to_anchor=(0.5, -0.25),  # centred below; negative y pushes it down
    fontsize=10,
    frameon=True,
    ncol=3,  # three columns
)

plt.tight_layout()

# Save manually
plt.savefig(
    "Intermediate_Files/Clustering_05012025/Figures/UMAP/UMAP_0.1_log_autoZI_isoform_celltypes.pdf",
    dpi=600, transparent=True, bbox_inches="tight",
)

plt.show()

In [None]:
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import ListedColormap

# Isoform-level T-cell marker set: CD3G / CD3D / CD3E transcripts
genes = [
    'CD3G:ENSG00000160654:ENST00000292144', 'CD3G:ENSG00000160654:ENST00000392883',
    'CD3G:ENSG00000160654:ENST00000527777', 'CD3G:ENSG00000160654:ENST00000532917',
    'CD3D:ENSG00000167286:ENST00000300692', 'CD3D:ENSG00000167286:ENST00000392884',
    'CD3D:ENSG00000167286:ENST00000526561', 'CD3D:ENSG00000167286:ENST00000529594',
    'CD3D:ENSG00000167286:ENST00000534687', 'CD3D:ENSG00000167286:ENST00000695666',
    'CD3D:ENSG00000167286:ENST00000695667', 'CD3D:ENSG00000167286:ENST00000695668',
    'CD3E:ENSG00000198851:ENST00000361763', 'CD3E:ENSG00000198851:ENST00000526146',
    'CD3E:ENSG00000198851:ENST00000528600', 'CD3E:ENSG00000198851:ENST00000529713',
    'CD3E:ENSG00000198851:ENST00000531913',
]

cluster_order = ["TCells", "NK Cells", "BCells", "Monocyte-derived", "Megakaryocytes"]

# Directory scanpy uses for figures it saves itself
sc.settings.figdir = "Intermediate_Files/Clustering_05012025/Figures/UMAP"

# Per-cell aggregate score: sum of the marker isoform columns of X
adata_i_filtered.obs['TCell_Combined'] = (
    adata_i_filtered[:, genes].X.toarray().sum(axis=1)
)

# The 100th percentile is the maximum; the colour range is [-max, +max],
# i.e. symmetric about zero.
expr_max = np.percentile(adata_i_filtered.obs['TCell_Combined'], 100)

# Draw the UMAP; show=False keeps the figure open for the styling below
sc.pl.umap(
    adata_i_filtered,
    color='TCell_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Total TCell Isoform Marker Expression",
    vmin=-expr_max,
    vmax=expr_max,
    show=False,
)

# Resize the current figure
fig = plt.gcf()
fig.set_size_inches(5, 3)

# Replace the title set by scanpy and hide the axis ticks
ax = plt.gca()
ax.set_title("TCell Aggregate Isoform Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Save final figure
plt.tight_layout()
plt.savefig(
    "Intermediate_Files/Clustering_05012025/Figures/UMAP/umap_TCell_combined_expression_isoform_aggregate.pdf",
    dpi=600, transparent=True, bbox_inches="tight",
)

plt.show()

In [None]:
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import ListedColormap

# Isoform-level NK-cell marker set (GZMB, KLRD1, NCAM1, CD226, FCGR3A, IL2RB
# and ITGAM transcripts)
genes = ['GZMB:ENSG00000100453:ENST00000216341', 'GZMB:ENSG00000100453:ENST00000382540', 'GZMB:ENSG00000100453:ENST00000415355', 
         'GZMB:ENSG00000100453:ENST00000526004', 'GZMB:ENSG00000100453:ENST00000532263', 'GZMB:ENSG00000100453:ENST00000554242', 
         'KLRD1:ENSG00000134539:ENST00000336164', 'KLRD1:ENSG00000134539:ENST00000350274', 'KLRD1:ENSG00000134539:ENST00000538997', 
         'KLRD1:ENSG00000134539:ENST00000539792', 'KLRD1:ENSG00000134539:ENST00000540271', 'KLRD1:ENSG00000134539:ENST00000543420', 
         'NCAM1:ENSG00000149294:ENST00000316851', 'CD226:ENSG00000150637:ENST00000280200', 'CD226:ENSG00000150637:ENST00000577287', 
         'CD226:ENSG00000150637:ENST00000580335', 'CD226:ENSG00000150637:ENST00000582621', 'FCGR3A:ENSG00000203747:ENST00000367967', 
         'FCGR3A:ENSG00000203747:ENST00000426740', 'FCGR3A:ENSG00000203747:ENST00000443193', 'FCGR3A:ENSG00000203747:ENST00000699398',
         'IL2RB:ENSG00000100385:ENST00000216223', 'IL2RB:ENSG00000100385:ENST00000429622', 'IL2RB:ENSG00000100385:ENST00000445595', 
         'IL2RB:ENSG00000100385:ENST00000453962', 'IL2RB:ENSG00000100385:ENST00000698883', 'IL2RB:ENSG00000100385:ENST00000698890', 
         'IL2RB:ENSG00000100385:ENST00000698891', 'IL2RB:ENSG00000100385:ENST00000698892', 'IL2RB:ENSG00000100385:ENST00000698893', 
         'IL2RB:ENSG00000100385:ENST00000698894', 'IL2RB:ENSG00000100385:ENST00000698895', 'IL2RB:ENSG00000100385:ENST00000698896', 
         'IL2RB:ENSG00000100385:ENST00000698902', 'IL2RB:ENSG00000100385:ENST00000698903', 'IL2RB:ENSG00000100385:ENST00000698904', 
         'IL2RB:ENSG00000100385:ENST00000698905', 'IL2RB:ENSG00000100385:ENST00000703410', 'ITGAM:ENSG00000169896:ENST00000544665', 
         'ITGAM:ENSG00000169896:ENST00000561838', 'ITGAM:ENSG00000169896:ENST00000648685']

cluster_order = ["TCells", "NK Cells", "BCells", "Monocyte-derived", "Megakaryocytes"]

# Directory scanpy uses for figures it saves itself
sc.settings.figdir = "Intermediate_Files/Clustering_05012025/Figures/UMAP"

# Per-cell aggregate score: sum of the marker isoform columns of X
adata_i_filtered.obs['NKCell_Combined'] = adata_i_filtered[:, genes].X.toarray().sum(axis=1)

# The 100th percentile is the maximum; the colour range is [-max, +max],
# i.e. symmetric about zero.
expr_max = np.percentile(adata_i_filtered.obs['NKCell_Combined'], 100)

# Draw the UMAP; show=False keeps the figure open for the styling below.
# The title passed here previously said "TCell" (copied from the T-cell
# cell); it now names the NK-cell panel it belongs to.
sc.pl.umap(
    adata_i_filtered,
    color='NKCell_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Total NKCell Isoform Marker Expression",
    vmin=-expr_max,
    vmax=expr_max,
    show=False,
)

# Resize the current figure
fig = plt.gcf()
fig.set_size_inches(5, 3)

# Replace the title set by scanpy and hide the axis ticks
ax = plt.gca()
ax.set_title("Natural Killer (NK) Cell Aggregate Isoform Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Save final figure
plt.tight_layout()
plt.savefig("Intermediate_Files/Clustering_05012025/Figures/UMAP/umap_NKCell_combined_expression_isoform_aggregate.pdf",
            dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import ListedColormap

# Isoform-level monocyte-derived marker set (CLEC7A, FCGR3A, CD33, LILRB4 transcripts)
genes = [
    'CLEC7A:ENSG00000172243:ENST00000304084', 'CLEC7A:ENSG00000172243:ENST00000310002',
    'CLEC7A:ENSG00000172243:ENST00000528799', 'CLEC7A:ENSG00000172243:ENST00000534609',
    'FCGR3A:ENSG00000203747:ENST00000367967', 'FCGR3A:ENSG00000203747:ENST00000426740',
    'FCGR3A:ENSG00000203747:ENST00000443193', 'FCGR3A:ENSG00000203747:ENST00000699398',
    'CD33:ENSG00000105383:ENST00000262262', 'CD33:ENSG00000105383:ENST00000598473',
    'CD33:ENSG00000105383:ENST00000600557', 'LILRB4:ENSG00000186818:ENST00000695418',
]

cluster_order = ["TCells", "NK Cells", "BCells", "Monocyte-derived", "Megakaryocytes"]

# Directory scanpy uses for figures it saves itself
sc.settings.figdir = "Intermediate_Files/Clustering_05012025/Figures/UMAP"

# Per-cell aggregate score: sum of the marker isoform columns of X
adata_i_filtered.obs['Monocyte_Combined'] = (
    adata_i_filtered[:, genes].X.toarray().sum(axis=1)
)

# The 100th percentile is the maximum; the colour range is [-max, +max],
# i.e. symmetric about zero.
expr_max = np.percentile(adata_i_filtered.obs['Monocyte_Combined'], 100)

# Draw the UMAP; show=False keeps the figure open for the styling below
sc.pl.umap(
    adata_i_filtered,
    color='Monocyte_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Monocyte-derived Aggregate\nIsoform Marker Expression",
    vmin=-expr_max,
    vmax=expr_max,
    show=False,
)

# Resize the current figure
fig = plt.gcf()
fig.set_size_inches(5, 3)

# Replace the title set by scanpy and hide the axis ticks
ax = plt.gca()
ax.set_title("Monocyte-Derived Aggregate Isoform Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Save final figure
plt.tight_layout()
plt.savefig(
    "Intermediate_Files/Clustering_05012025/Figures/UMAP/umap_MonocyteDerived_combined_expression_isoform_aggregate.pdf",
    dpi=600, transparent=True, bbox_inches="tight",
)

plt.show()

In [None]:
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import ListedColormap

# Isoform-level B-cell marker set (CD22, CD79A, MS4A1, CD19 transcripts)
genes = [
    'CD22:ENSG00000012124:ENST00000085219', 'CD22:ENSG00000012124:ENST00000341773',
    'CD22:ENSG00000012124:ENST00000536635', 'CD22:ENSG00000012124:ENST00000596492',
    'CD79A:ENSG00000105369:ENST00000221972', 'CD79A:ENSG00000105369:ENST00000444740',
    'MS4A1:ENSG00000156738:ENST00000345732', 'MS4A1:ENSG00000156738:ENST00000389939',
    'MS4A1:ENSG00000156738:ENST00000532073', 'MS4A1:ENSG00000156738:ENST00000532418',
    'MS4A1:ENSG00000156738:ENST00000534668', 'MS4A1:ENSG00000156738:ENST00000674194',
    'CD19:ENSG00000177455:ENST00000324662', 'CD19:ENSG00000177455:ENST00000538922',
    'CD19:ENSG00000177455:ENST00000565089',
]

cluster_order = ["TCells", "NK Cells", "BCells", "Monocyte-derived", "Megakaryocytes"]

# Directory scanpy uses for figures it saves itself
sc.settings.figdir = "Intermediate_Files/Clustering_05012025/Figures/UMAP"

# Per-cell aggregate score: sum of the marker isoform columns of X
adata_i_filtered.obs['BCell_Combined'] = (
    adata_i_filtered[:, genes].X.toarray().sum(axis=1)
)

# The 100th percentile is the maximum; the colour range is [-max, +max],
# i.e. symmetric about zero.
expr_max = np.percentile(adata_i_filtered.obs['BCell_Combined'], 100)

# Draw the UMAP; show=False keeps the figure open for the styling below
sc.pl.umap(
    adata_i_filtered,
    color='BCell_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Total BCell Isoform Marker Expression",
    vmin=-expr_max,
    vmax=expr_max,
    show=False,
)

# Resize the current figure
fig = plt.gcf()
fig.set_size_inches(5, 3)

# Replace the title set by scanpy and hide the axis ticks
ax = plt.gca()
ax.set_title("BCell Aggregate Isoform Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Save final figure
plt.tight_layout()
plt.savefig(
    "Intermediate_Files/Clustering_05012025/Figures/UMAP/umap_BCell_combined_expression_isoform_aggregate.pdf",
    dpi=600, transparent=True, bbox_inches="tight",
)

plt.show()

In [None]:
# Recorded gene-level matches for megakaryocytes (the original note says 8
# matched IDs; 7 are listed):
# ['ITGA2B:ENSG00000005961','MPL:ENSG00000117400', 'CXCR4:ENSG00000121966', 'CXCR1:ENSG00000163464', 'GP5:ENSG00000178732', 'CXCR2:ENSG00000180871', 'GP1BA:ENSG00000185245']

# Isoform-level megakaryocyte marker set actually plotted
genes = [
    'ITGA2B:ENSG00000005961:ENST00000262407',
    'MPL:ENSG00000117400:ENST00000372470',
]

# Per-cell aggregate score: sum of the marker isoform columns of X
adata_i_filtered.obs['MK_Markers_Combined'] = (
    adata_i_filtered[:, genes].X.toarray().sum(axis=1)
)

# The 100th percentile is the maximum; the colour range is [-max, +max],
# i.e. symmetric about zero.
expr_max = np.percentile(adata_i_filtered.obs['MK_Markers_Combined'], 100)

# Draw the UMAP; show=False keeps the figure open for the styling below
sc.pl.umap(
    adata_i_filtered,
    color='MK_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Aggregate Megakaryocyte Marker Combined Expression",
    vmin=-expr_max,
    vmax=expr_max,
    show=False,
)

# Resize the current figure
fig = plt.gcf()
fig.set_size_inches(5, 3)

# Replace the title set by scanpy and hide the axis ticks
ax = plt.gca()
ax.set_title("Megakaryocyte Aggregate Isoform Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Save final figure
plt.tight_layout()
plt.savefig(
    "Intermediate_Files/Clustering_05012025/Figures/UMAP/umap_Megakaryocyte_combined_expression_isoform_aggregate.pdf",
    dpi=600, transparent=True, bbox_inches="tight",
)

plt.show()

In [None]:
# Persist the clustered, cell-type-annotated gene and isoform objects.
# An earlier snapshot used the same file names with the suffix 05042025.
# NOTE(review): the objects are written with `.write` on the RNA modality
# objects but the file names end in ".h5mu"; the loading cell reads files
# with this naming via `read_h5ad` - confirm the extension is intentional.
output_dir = 'Intermediate_Files/Clustering_05012025/'

gene_out_path = os.path.join(output_dir, "PBMC_gene_AutoZI_clustered_celltypes_AutoZILatent_05062025.h5mu")
iso_out_path = os.path.join(output_dir, "PBMC_iso_AutoZI_clustered_celltypes_AutoZILatent_05062025.h5mu")

adata_g_filtered.write(gene_out_path)
adata_i_filtered.write(iso_out_path)

In [None]:
# Reload the clustered gene and isoform objects from disk.
from scanpy import read_h5ad

output_dir = 'Intermediate_Files/Clustering_05012025'

# Earlier snapshots ("..._AutoZILatent_05042025.h5mu" and
# "..._AutoZILatent_05062025.h5mu") were loaded the same way; the
# "reannotated" 05112025 files are the ones currently in use.
# NOTE(review): no cell in the reviewed part of the notebook writes the
# "reannotated" files - confirm where they are produced.
gene_in_path = os.path.join(output_dir, "PBMC_gene_AutoZI_clustered_celltypes_reannotated_AutoZILatent_05112025.h5mu")
iso_in_path = os.path.join(output_dir, "PBMC_iso_AutoZI_clustered_celltypes_reannotated_AutoZILatent_05112025.h5mu")

adata_g_filtered = read_h5ad(gene_in_path)
adata_i_filtered = read_h5ad(iso_in_path)

In [None]:
import pandas as pd
from collections import Counter

gtf_file = "PBMC_FEB_28_2025_extended_annotations.gtf"  # change this

# Read the GTF as a nine-column, tab-separated table ('#' starts a comment)
gtf_cols = ["chrom", "source", "feature", "start", "end", "score", "strand", "frame", "attribute"]
gtf = pd.read_csv(gtf_file, sep="\t", comment="#", names=gtf_cols)

# All non-missing attribute strings
attribute_data = gtf["attribute"].dropna().tolist()

# One entry per attribute field: the text before the first space of every
# non-empty ';'-separated field
attr_keys = [
    field.split(" ")[0]
    for attr in attribute_data
    for field in (chunk.strip() for chunk in attr.strip().split(";"))
    if field
]

# Tally how often each key occurs
key_counts = Counter(attr_keys)

# Report the keys in first-seen order
print("Unique attributes found in GTF (with counts):")
for key, count in key_counts.items():
    print(f"{key}: {count} occurrences")

In [None]:
import pandas as pd
import re

# Isoform feature names that contain "Bambu"
bambu_isos = [name for name in adata_i_filtered.var_names if "Bambu" in name]

# Read the GTF as a nine-column, tab-separated table; chrom is read as str
gtf_file = "PBMC_FEB_28_2025_extended_annotations.gtf"
gtf_cols = ["chrom", "source", "feature", "start", "end", "score", "strand", "frame", "attribute"]
gtf = pd.read_csv(
    gtf_file,
    sep="\t",
    comment="#",
    names=gtf_cols,
    dtype={"chrom": str},
)

# Keep exon records only (as an independent copy)
exons = gtf.loc[gtf["feature"] == "exon"].copy()

# Extract attributes
def get_attr(attr_string, key):
    """Return the value of attribute `key` from a GTF attribute string.

    The attribute string is expected to hold `key value` pairs such as
    `gene_id "ENSG..."; transcript_id "ENST...";`. The value is the run of
    characters after `key`, a space and an optional `"` or `=`, up to the
    next `"` or `;`.

    Parameters
    ----------
    attr_string : str
        The raw attribute column of one GTF record.
    key : str
        Attribute name to look up (matched literally).

    Returns
    -------
    str or None
        The first matching value, or None when the key is absent or
        `attr_string` is not a string (e.g. NaN for a missing column).
    """
    # Missing attribute columns arrive as NaN; treat them as "key not found"
    if not isinstance(attr_string, str):
        return None
    # (?<!\w) stops the key from matching as the tail of a longer attribute
    # name (e.g. "gene_id" inside "ref_gene_id"); re.escape keeps the key
    # from being interpreted as a regular expression.
    pattern = rf'(?<!\w){re.escape(key)} ["=]?([^";]+)'
    match = re.search(pattern, attr_string)
    return match.group(1) if match else None

# Parse the identifiers used downstream out of the attribute column
for attr_name in ("gene_id", "transcript_id", "exon_number"):
    exons[attr_name] = exons["attribute"].apply(get_attr, args=(attr_name,))

# "gene_id:transcript_id" key, the form compared against bambu_isos
exons["combined_id"] = exons["gene_id"] + ":" + exons["transcript_id"]

# Exons of the Bambu isoforms only: exon_number as int, ordered within each
# isoform, with the exon length (inclusive coordinates) added and `chrom`
# renamed to `chromosome`
bambu_exons = (
    exons.loc[exons["combined_id"].isin(bambu_isos)]
    .copy()
    .astype({"exon_number": int})
    .sort_values(["combined_id", "exon_number"])
    .assign(exon_length=lambda df: df["end"] - df["start"] + 1)
    .rename(columns={"chrom": "chromosome"})
)

# Final result
report_cols = [
    "chromosome", "start", "end", "strand",
    "gene_id", "transcript_id", "combined_id",
    "exon_number", "exon_length",
]
print(bambu_exons[report_cols])

In [None]:
# OPTIONAL: write the Bambu exon table as a tab-separated file (no index column)
bambu_exons_tsv = "bambu_exons.tsv"
bambu_exons.to_csv(bambu_exons_tsv, sep="\t", index=False)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Number of distinct exon_number values per transcript_id, over all exon
# records in `exons`
exon_counts_all = (
    exons.groupby("transcript_id")["exon_number"]
    .nunique()
    .reset_index()
    .rename(columns={"exon_number": "n_exons"})
)

# Cap at 10 so that transcripts with 10 or more exons share one bar
exon_counts_all["n_exons_binned"] = exon_counts_all["n_exons"].clip(upper=10)

# Median of the uncapped counts
median_exons = exon_counts_all["n_exons"].median()

# Histogram with one bar per exon count 1..10 (align="left" centres each
# bar on its integer value)
plt.figure(figsize=(6, 4))
counts, bins, bars = plt.hist(
    exon_counts_all["n_exons_binned"],
    bins=range(1, 12),
    color="steelblue",
    edgecolor="black",
    align="left",
)

# Write the count above every bar
for bar, count in zip(bars, counts):
    bar_top = bar.get_height()
    bar_mid = bar.get_x() + bar.get_width() / 2
    plt.text(bar_mid, bar_top + 0.5, str(int(count)),
             ha='center', va='bottom', fontsize=10)

# Tick labels 1..9 plus "10+" for the capped bar
xtick_positions = list(range(1, 11))
xtick_labels = [str(i) for i in range(1, 10)] + ['10+']
plt.xticks(xtick_positions, labels=xtick_labels, rotation=0)

plt.xlabel("Number of Exons")
plt.ylabel("Number of Transcripts")
plt.ylim(0, 78000)  # fixed upper limit chosen by hand
plt.title("Distribution of Exon Counts per Transcript (Grouped ≥10)")
plt.tight_layout()
plt.savefig("Intermediate_Files/Paper_Figs/all_isoform_exon_count_distribution_grouped.pdf")
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Total length of every transcript in the GTF: the sum of its exon lengths
# (inclusive coordinates). It is computed here from `exons` so that this
# all-isoform figure does not depend on a `transcript_lengths` frame left in
# the kernel by another cell (the later cells build it for Bambu subsets only).
all_exon_lengths = exons["end"] - exons["start"] + 1
transcript_lengths = (
    all_exon_lengths.groupby(exons["transcript_id"])
    .sum()
    .rename("transcript_length")
    .reset_index()
)

# Collapse every length >= 5000 nt onto the single overflow value 5000
transcript_lengths["length_binned"] = transcript_lengths["transcript_length"].clip(upper=5000)

# Median of the unclipped lengths
median_length = transcript_lengths["transcript_length"].median()

# Histogram in 100-nt bins. The edges run to 5100 so that the overflow value
# (5000) gets its own bin instead of being merged into 4900-4999; with
# align="left" that bar is centred on x=5000, under the "5000+" tick.
plt.figure(figsize=(6, 4))
counts, bins, bars = plt.hist(transcript_lengths["length_binned"],
                              bins=list(range(0, 5200, 100)),
                              color="steelblue", edgecolor="black", align="left")

# Median line (drawn to a fixed height chosen by hand)
plt.plot([median_length, median_length], [0, 55000], color="#D62728", linewidth=1.5, alpha=1, linestyle='dashed')

# Median label
plt.text(x=200, y=56000, s=f"Median = {int(median_length)}", color="#D62728", fontsize=10, fontweight="bold")

# Ticks every 500 nt, with the overflow bar labelled "5000+"
xtick_positions = list(range(0, 5000, 500)) + [5000]
xtick_labels = [str(x) for x in range(0, 5000, 500)] + ['5000+']
plt.xticks(xtick_positions, labels=xtick_labels, rotation=45)

plt.ylim(0,60000)
plt.xlabel("Transcript Length (nt)")
plt.ylabel("Number of Transcripts")
plt.title("Distribution of Transcript Lengths (Grouped ≥5000 bp)")
plt.tight_layout()
plt.savefig("Intermediate_Files/Paper_Figs/all_isoform_transcript_length_distribution_grouped_5000.pdf")
plt.show()

In [None]:
import matplotlib.pyplot as plt

# New (BambuTx) transcripts that belong to known (ENSG) genes
is_known_gene = bambu_exons["gene_id"].str.startswith("ENSG")
is_new_tx = bambu_exons["transcript_id"].str.startswith("BambuTx")
filtered = bambu_exons[is_known_gene & is_new_tx].copy()

# Number of distinct exon_number values per isoform (combined_id)
exon_counts_per_isoform = (
    filtered.groupby("combined_id")["exon_number"]
    .nunique()
    .reset_index()
    .rename(columns={"exon_number": "n_exons"})
)

# Recover gene_id as the part of combined_id before the first ':'
exon_counts_per_isoform["gene_id"] = exon_counts_per_isoform["combined_id"].str.split(":").str[0]

# For each exon count, the number of unique genes having an isoform with
# that many exons.
# NOTE(review): this counts genes, while the bar plot built from it labels
# its axis as a number of transcripts - confirm which is intended.
genes_per_exon_count = exon_counts_per_isoform.groupby("n_exons")["gene_id"].nunique()

In [None]:
# Bar chart of `genes_per_exon_count`: one bar per exon count
plt.figure(figsize=(4, 4))
bars = plt.bar(
    genes_per_exon_count.index,
    genes_per_exon_count.values,
    width=0.8,
    color="steelblue",
    edgecolor="black",
)

# Write the value above every bar
for bar in bars:
    height = bar.get_height()
    bar_mid = bar.get_x() + bar.get_width()/2
    plt.text(bar_mid, height + 0.5, str(int(height)),
             ha='center', va='bottom', fontsize=10)

# Axis labels, one tick per exon count, and a hand-picked y limit that
# leaves room for the bar labels
plt.xlabel("Number of Exons")
plt.ylabel("Number of New Transcripts from Known Genes")
plt.xticks(genes_per_exon_count.index, rotation=0)
plt.ylim(0, 30)

plt.tight_layout()

# Save
plt.savefig("Intermediate_Files/Paper_Figs/bambu_new_isoforms_known_genes_exon_distribution.pdf")

plt.show()

In [None]:
import matplotlib.pyplot as plt

# New (BambuTx) transcripts that belong to new genes (gene_id starts with "Bambu")
is_new_gene = bambu_exons["gene_id"].str.startswith("Bambu")
is_new_tx = bambu_exons["transcript_id"].str.startswith("BambuTx")
filtered = bambu_exons[is_new_gene & is_new_tx].copy()

# Number of distinct exon_number values per isoform (combined_id)
exon_counts_per_isoform = (
    filtered.groupby("combined_id")["exon_number"]
    .nunique()
    .reset_index()
    .rename(columns={"exon_number": "n_exons"})
)

# Recover gene_id as the part of combined_id before the first ':'
exon_counts_per_isoform["gene_id"] = exon_counts_per_isoform["combined_id"].str.split(":").str[0]

# For each exon count, the number of unique genes having an isoform with
# that many exons.
# NOTE(review): this counts genes, while the bar plot built from it labels
# its axis as a number of transcripts - confirm which is intended.
genes_per_exon_count = exon_counts_per_isoform.groupby("n_exons")["gene_id"].nunique()

In [None]:
# Bar chart of `genes_per_exon_count`: one bar per exon count
plt.figure(figsize=(4, 4))
bars = plt.bar(
    genes_per_exon_count.index,
    genes_per_exon_count.values,
    width=0.8,
    color="steelblue",
    edgecolor="black",
)

# Write the value above every bar
for bar in bars:
    height = bar.get_height()
    bar_mid = bar.get_x() + bar.get_width()/2
    plt.text(bar_mid, height + 0.5, str(int(height)),
             ha='center', va='bottom', fontsize=10)

# Axis labels, one tick per exon count, and a hand-picked y limit that
# leaves room for the bar labels
plt.xlabel("Number of Exons")
plt.ylabel("Number of Transcripts from New Genes")
plt.xticks(genes_per_exon_count.index, rotation=0)
plt.ylim(0, 35)

plt.tight_layout()

# Save
plt.savefig("Intermediate_Files/Paper_Figs/bambu_new_isoforms_new_genes_exon_distribution.pdf")

plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# New (BambuTx) transcripts that belong to known (ENSG) genes
is_known_gene = bambu_exons["gene_id"].str.startswith("ENSG")
is_new_tx = bambu_exons["transcript_id"].str.startswith("BambuTx")
filtered = bambu_exons[is_known_gene & is_new_tx].copy()

# Transcript length per isoform = sum of its exon lengths
transcript_lengths = (
    filtered.groupby("combined_id")["exon_length"]
    .sum()
    .reset_index()
    .rename(columns={"exon_length": "transcript_length"})
)

# Report the observed range
min_len = transcript_lengths["transcript_length"].min()
max_len = transcript_lengths["transcript_length"].max()
print(f"Transcript length range: {min_len} - {max_len} bp")

# Median length, drawn as a dashed line below
median_length = transcript_lengths["transcript_length"].median()

# 30-bin histogram of transcript lengths
plt.figure(figsize=(4, 4))
sns.histplot(
    x=transcript_lengths["transcript_length"],
    color="#377eb8",
    alpha=1,
    bins=30,
)

plt.xlabel("Transcript Length (nt)")
plt.ylabel("# of New Transcripts from Known Genes")
plt.ylim(0, 7)  # hand-picked to leave room above the bars

# Median line and label (heights and label position chosen by hand)
plt.plot([median_length, median_length], [0, 6.35],
         color="#D62728", linewidth=1.5, alpha=1, linestyle='dashed')
plt.text(x=200, y=6.53, s=f"Median = {int(median_length)}",
         color="#D62728", fontsize=10, fontweight="bold")

plt.tight_layout()

# Save transparent PDF
plt.savefig(
    "Intermediate_Files/Paper_Figs/bambu_new_isoforms_known_genes_transcript_length_histogram.pdf",
    dpi=600, transparent=True, bbox_inches="tight",
)
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# New (BambuTx) transcripts that belong to new genes (gene_id starts with "Bambu")
is_new_gene = bambu_exons["gene_id"].str.startswith("Bambu")
is_new_tx = bambu_exons["transcript_id"].str.startswith("BambuTx")
filtered = bambu_exons[is_new_gene & is_new_tx].copy()

# Transcript length per isoform = sum of its exon lengths
transcript_lengths = (
    filtered.groupby("combined_id")["exon_length"]
    .sum()
    .reset_index()
    .rename(columns={"exon_length": "transcript_length"})
)

# Report the observed range
min_len = transcript_lengths["transcript_length"].min()
max_len = transcript_lengths["transcript_length"].max()
print(f"Transcript length range: {min_len} - {max_len} bp")

# Median length, drawn as a dashed line below
median_length = transcript_lengths["transcript_length"].median()

# 30-bin histogram of transcript lengths
plt.figure(figsize=(4, 4))
sns.histplot(
    x=transcript_lengths["transcript_length"],
    color="#377eb8",
    alpha=1,
    bins=30,
)

plt.xlabel("Transcript Length (nt)")
plt.ylabel("# of Transcripts from Novel Genes")
plt.ylim(0, 4.0)  # hand-picked to leave room above the bars

# Median line and label (heights and label position chosen by hand)
plt.plot([median_length, median_length], [0, 3.63],
         color="#D62728", linewidth=1.5, alpha=1, linestyle='dashed')
plt.text(x=425, y=3.73, s=f"Median = {int(median_length)}",
         color="#D62728", fontsize=10, fontweight="bold")

plt.tight_layout()

# Save transparent PDF
plt.savefig(
    "Intermediate_Files/Paper_Figs/bambu_new_isoforms_new_genes_transcript_length_histogram.pdf",
    dpi=600, transparent=True, bbox_inches="tight",
)
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# Keep only exon rows that belong to Bambu-discovered transcripts (BambuTx*).
is_bambu_tx = bambu_exons["transcript_id"].str.startswith("BambuTx")
filtered = bambu_exons[is_bambu_tx].copy()

# Collapse exon rows to one row per (transcript, gene, strand) combination.
filtered_unique = filtered[["combined_id", "gene_id", "strand"]].drop_duplicates()


def categorize(row):
    """Return the event category for one transcript row.

    Rows whose ``gene_id`` starts with "Bambu" are "New locus"; remaining rows
    on strand "-" are "Opposite strand"; everything else is
    "Same strand, no exon overlap".
    """
    if row["gene_id"].startswith("Bambu"):
        return "New locus"
    if row["strand"] == "-":
        return "Opposite strand"
    return "Same strand, no exon overlap"


filtered_unique["Category"] = filtered_unique.apply(categorize, axis=1)

# Number of distinct genes observed in each category.
counts = (
    filtered_unique.groupby("Category")["gene_id"]
    .nunique()
    .reset_index(name="Count")
)

# Fixed label order and the color assigned to each label.
new_labels = ["New locus", "Opposite strand", "Same strand, no exon overlap"]
custom_palette = ["#377eb8", "#4daf4a", "#984ea3"]
color_map = dict(zip(new_labels, custom_palette))

# Horizontal bar chart, one bar per category.
plt.figure(figsize=(5, 6))
ax = sns.barplot(data=counts, x="Count", y="Category", order=new_labels,
                 palette=color_map, saturation=1, dodge=False)

# Print the count at the end of each bar.
for bar_group in ax.containers:
    ax.bar_label(bar_group, fontsize=10)

ax.set_xlim(0, counts["Count"].max() + 5)
sns.despine(ax=ax, top=False, right=True, left=False, bottom=False)
ax.set_yticks([])  # categories are identified by the legend instead
ax.set_ylabel("")
ax.set_xlabel("Number of Genes")

# Legend placed above the axes, built from the same label/color pairs.
patches = [mpatches.Patch(color=color_map[label], label=label) for label in new_labels]
ax.legend(handles=patches, fontsize=10, loc='lower center', bbox_to_anchor=(0.5, 1.05), frameon=True)

plt.tight_layout()
plt.savefig("Intermediate_Files/Paper_Figs/bambu_combined_category_distribution.pdf", 
            dpi=600, transparent=True, bbox_inches="tight")
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# NOTE: this cell previously contained the categorisation + plotting code twice.
# The first copy created a 4x6 figure that was never saved or closed (it was
# only displayed as a stray extra figure). Only the second, saved variant
# (3x4 figure, x-limit 35) is kept.


def categorize(row):
    """Assign one ``bambu_exons`` row to a gene-body event category.

    Rows whose ``gene_id`` does not start with "ENSG" are "New locus";
    remaining rows on strand "-" are "Opposite strand"; everything else is
    "Same strand, no exon overlap".
    """
    # NOTE(review): "Opposite strand" is decided solely by strand == "-";
    # confirm this matches the intended definition.
    if not row["gene_id"].startswith("ENSG"):
        return "New locus"
    elif row["strand"] == "-":
        return "Opposite strand"
    else:
        return "Same strand, no exon overlap"


# Adds/overwrites the "Category" column on the shared bambu_exons frame.
bambu_exons["Category"] = bambu_exons.apply(categorize, axis=1)

# Count unique transcripts (combined_id) per category.
counts = bambu_exons[["combined_id", "Category"]].drop_duplicates()
category_counts = counts["Category"].value_counts().reset_index()
category_counts.columns = ["Category", "Count"]

# Label order (top to bottom) and the color paired with each label.
custom_palette = ["#377eb8", "#4daf4a", "#984ea3"]
new_labels = ["Opposite strand", "New locus", "Same strand, no exon overlap"]
color_map = dict(zip(new_labels, custom_palette))

# Horizontal bar chart, one bar per category.
plt.figure(figsize=(3, 4))
ax = sns.barplot(data=category_counts, y="Category", x="Count",
                 dodge=False, palette=color_map, saturation=1, order=new_labels)

# Print the count at the end of each bar.
for container in ax.containers:
    ax.bar_label(container, fontsize=10)

# Fixed x-axis limit, hand-tuned for this dataset.
plt.xlim(0, 35)

# Keep the top spine, drop the right one.
sns.despine(ax=ax, top=False, right=True, left=False, bottom=False)

# Categories are identified by the legend, so hide the y ticks/label.
ax.set_yticks([])
ax.set_ylabel("")

# Custom legend above the plot.
patches = [mpatches.Patch(color=color, label=label) for color, label in zip(custom_palette, new_labels)]
plt.legend(handles=patches, fontsize=10, loc='lower center', bbox_to_anchor=(0.5, 1.05),
         frameon=True)

plt.xlabel("Counts")

plt.tight_layout()

# Save as a transparent PDF, then display.
plt.savefig("Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/Summary_Plots/new_gene_body_events.pdf", 
            dpi=600, transparent=True, bbox_inches="tight")
plt.show()

In [None]:
# Display the .obs table of the isoform-level AnnData (the 'rna' modality
# extracted from mdata_iso_pbmc at the top of the notebook).
adata_i_filtered.obs

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Destination folder for the summary figures (created if missing).
output_dir = "Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/Summary_Plots"
os.makedirs(output_dir, exist_ok=True)

# var_names are split on ":" into gene_name / gene_id / transcript_id.
combined_ids = pd.Series(adata_i_filtered.var_names)
split_ids = combined_ids.str.split(":", expand=True)
split_ids.columns = ["gene_name", "gene_id", "transcript_id"]

# Where the gene_id field is missing, fall back to the first field.
split_ids["gene_id_final"] = split_ids["gene_id"].fillna(split_ids["gene_name"])

# Restrict to known genes (Ensembl ENSG identifiers).
is_known_gene = split_ids["gene_id_final"].str.startswith("ENSG")
known_genes = split_ids[is_known_gene].copy()

# Isoforms per gene, keeping only genes with at least two isoforms.
isoform_counts_per_gene = (
    known_genes.groupby("gene_id_final")
    .size()
    .reset_index(name="n_isoforms")
)
has_multiple = isoform_counts_per_gene["n_isoforms"] >= 2
isoform_counts_per_gene = isoform_counts_per_gene[has_multiple]

# How many genes have exactly k isoforms, ordered by k.
summary = (
    isoform_counts_per_gene["n_isoforms"]
    .value_counts()
    .rename_axis("n_isoforms")
    .reset_index(name="n_genes")
    .sort_values("n_isoforms")
)

# Pool every gene with 8 or more isoforms into a single "8+" bin.
summary["n_isoforms"] = summary["n_isoforms"].map(lambda k: "8+" if k >= 8 else str(k))
summary = summary.groupby("n_isoforms")["n_genes"].sum().reset_index()

# Bar chart: number of genes per isoform-count bin.
plt.figure(figsize=(4, 4))
ax = sns.barplot(data=summary, x="n_isoforms", y="n_genes",
                 color="#377eb8", saturation=1, edgecolor="black")

# Value label on top of each bar.
ax.bar_label(ax.containers[0], fontsize=9, padding=1)
ax.set_ylim(0, 2600)
ax.set_xlabel("# of transcripts expressed")
ax.set_ylabel("# of gene bodies")
#plt.title("Number of Known Genes with Multiple Isoforms")

plt.tight_layout()

# Save as a transparent PDF, then display.
plt.savefig(os.path.join(output_dir, "number_of_genes_with_multiple_transcripts_known_only.pdf"),
            dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# --------------------
# Step 1: Parse combined IDs
# --------------------

# var_names are split on ":" into gene_name / gene_id / transcript_id.
combined_ids = pd.Series(adata_i_filtered.var_names)

split_ids = combined_ids.str.split(":", expand=True)
split_ids.columns = ["gene_name", "gene_id", "transcript_id"]

transcript_df = pd.DataFrame({
    "combined_id": combined_ids,
    "gene_id": split_ids["gene_id"],
    "transcript_id": split_ids["transcript_id"]
})

# --------------------
# Step 2: Parse GTF
# --------------------

gtf_file = "Homo_sapiens.GRCh38.113.gtf"

# transcript_id -> transcript_biotype, taken from "transcript" feature rows.
transcript_biotype_dict = {}

with open(gtf_file, 'r') as f:
    for line in f:
        if line.startswith("#"):
            continue
        fields = line.rstrip("\r\n").split('\t')
        # Skip blank/short lines instead of failing with IndexError, and
        # ignore every feature type other than "transcript".
        if len(fields) < 9 or fields[2] != "transcript":
            continue
        attr_dict = {}
        for attr in fields[8].split(';'):
            attr = attr.strip()
            if not attr:
                continue
            # partition() tolerates a key without a value (yields ""),
            # where split(' ', 1) would raise on unpacking.
            key, _, value = attr.partition(' ')
            attr_dict[key] = value.strip().strip('"')
        transcript_id = attr_dict.get("transcript_id")
        biotype = attr_dict.get("transcript_biotype")
        if transcript_id and biotype:
            transcript_biotype_dict[transcript_id] = biotype

# Per-isoform biotype table (kept for inspection / downstream use).
transcript_biotype_df = pd.DataFrame({
    'transcript_id': transcript_df["transcript_id"],
    'biotype': transcript_df["transcript_id"].map(transcript_biotype_dict)
})

# --------------------
# Step 3: Count transcripts per gene
# --------------------

gene_transcript_counts = transcript_df.groupby("gene_id").size().reset_index(name="n_transcripts")

# Only keep genes with >= 2 isoforms
genes_multi = gene_transcript_counts[gene_transcript_counts["n_transcripts"] >= 2]

# Restrict transcripts to those genes and attach n_transcripts.
transcript_df = transcript_df.merge(genes_multi, on="gene_id", how="inner")

# Attach the biotype by direct lookup. A merge on transcript_id would expand
# rows many-to-many whenever transcript_id values repeat or are missing
# (pandas matches null merge keys to each other); a map keeps one row per
# isoform and gives the same (gene, biotype) pairs after de-duplication.
transcript_df["biotype"] = transcript_df["transcript_id"].map(transcript_biotype_dict)

# Unique gene + biotype combinations
gene_biotype_summary = transcript_df[["gene_id", "biotype", "n_transcripts"]].drop_duplicates()

# NOTE(review): groupby drops rows with a missing biotype by default
# (dropna=True), so transcripts absent from the GTF do not contribute to
# `summary` — confirm that this exclusion is intended.
summary = gene_biotype_summary.groupby(["n_transcripts", "biotype"]).size().reset_index(name="n_gene_bodies")

In [None]:
# --------------------
# Step 4: Group biotypes
# --------------------

# Raw GTF biotype -> display label; anything not listed becomes "Other".
_BIOTYPE_LABELS = {
    "protein_coding": "Protein coding",
    "nonsense_mediated_decay": "Nonsense-mediated decay",
    "retained_intron": "Retained intron",
    "protein_coding_CDS_not_defined": "Coding sequence not defined",
}


def simplify_biotype(biotype):
    """Map a raw transcript biotype to its display label ("Other" if unlisted)."""
    return _BIOTYPE_LABELS.get(biotype, "Other")


# Rebuild the per-(n_transcripts, biotype) gene-body counts from
# gene_biotype_summary so this cell can be re-run on its own.
summary = (
    gene_biotype_summary.groupby(["n_transcripts", "biotype"])
    .size()
    .reset_index(name="n_gene_bodies")
)

summary["biotype_category"] = summary["biotype"].map(simplify_biotype)

# Pool genes with 8 or more transcripts into a single "8+" bin.
summary["n_transcripts"] = summary["n_transcripts"].map(lambda k: "8+" if k >= 8 else str(k))

# Re-aggregate after binning and relabelling.
summary = (
    summary.groupby(["n_transcripts", "biotype_category"])["n_gene_bodies"]
    .sum()
    .reset_index()
)

# --------------------
# Step 5: Plot
# --------------------

# Color per display label (labels not present in biotype_order are unused).
biotype_colors = {
    "Protein coding": "#1f77b4",
    "Nonsense-mediated decay": "#ff7f0e",
    "Retained intron": "#2ca02c",
    "Coding sequence not defined": "#9467bd",
    "Long non-coding RNA": "#8c564b",
    #"Pseudogene": "#e377c2",
    "Other": "#7f7f7f"
}

# Order of the hue levels within each group of bars.
biotype_order = [
    "Protein coding",
    "Retained intron",
    "Coding sequence not defined",
    "Nonsense-mediated decay",
    "Other"
]

plt.figure(figsize=(12, 4))

ax = sns.barplot(data=summary, x="n_transcripts", y="n_gene_bodies",
                 hue="biotype_category", hue_order=biotype_order,
                 palette=biotype_colors, saturation=1, width = 0.9)

# Value label on top of every bar.
for bar_group in ax.containers:
    ax.bar_label(bar_group, fontsize=10, padding=1)

ax.set_xlabel("# of Transcripts Expressed")
ax.set_ylabel("# of Gene Bodies")
ax.set_ylim(0, 2400)
#plt.title("Gene Bodies with Multiple Transcripts Split by Transcript Biotype")
ax.legend(title="Transcript Biotype", fontsize=10, ncol = 2)
plt.tight_layout()

# Destination folder for the summary figures (created if missing).
output_dir = "Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/Summary_Plots"
os.makedirs(output_dir, exist_ok=True)

plt.savefig(os.path.join(output_dir, "gene_bodies_by_biotype_grouped_FINAL.pdf"),
            dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Flag new isoforms directly from var_names (names containing "BambuTx").
is_new_isoform = adata_i_filtered.var_names.str.contains("BambuTx")

# Expression matrix, densified if it exposes .toarray().
X = adata_i_filtered.X
if hasattr(X, "toarray"):
    X = X.toarray()

# Columns of the new isoforms only.
new_isoform_expr = X[:, is_new_isoform]

# Number of cells with expression > 0, per new isoform.
isoform_cell_counts = (new_isoform_expr > 0).sum(axis=0)

print(f"Detected {len(isoform_cell_counts)} new isoforms")

# Histogram of per-isoform cell counts.
plt.figure(figsize=(4, 4))
sns.histplot(isoform_cell_counts, bins=30, color="#377eb8", edgecolor="black")

plt.xlabel("Number of Cells Expressing Isoform")
plt.ylabel("Number of New Isoforms")
#plt.title("Prevalence of New Isoforms Across Cells")

plt.tight_layout()

# Build the path with os.path.join: the previous hard-coded string was missing
# the "/" between "Summary_Plots" and the file name, so the PDF was written to
# ".../Bambu_Genes/Summary_Plotsnew_isoform_..." instead of into the folder.
summary_plot_dir = "Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/Summary_Plots"
os.makedirs(summary_plot_dir, exist_ok=True)
plt.savefig(os.path.join(summary_plot_dir, "new_isoform_cellular_prevalence_histogram.pdf"),
            dpi=600, transparent=True, bbox_inches="tight")
plt.show()

In [None]:
# Cells x isoforms expression table, with each row labelled by the cell's
# "gen_cell_type_reannotated" value instead of its barcode.
if hasattr(adata_i_filtered.X, "toarray"):
    _dense_expr = adata_i_filtered.X.toarray()
else:
    _dense_expr = adata_i_filtered.X

isoform_expr = pd.DataFrame(
    _dense_expr,
    index=adata_i_filtered.obs["gen_cell_type_reannotated"],
    columns=adata_i_filtered.var_names,
)

# Keep only the isoform columns whose name appears in bambu_exons["combined_id"].
_is_bambu_column = isoform_expr.columns.isin(bambu_exons["combined_id"])
isoform_expr = isoform_expr.loc[:, _is_bambu_column]

# Long format: one row per (cell, isoform) pair.
isoform_expr_long = isoform_expr.reset_index().melt(
    id_vars="gen_cell_type_reannotated",
    var_name="combined_id",
    value_name="expression",
)

# Drop pairs where the isoform is not expressed.
_is_expressed = isoform_expr_long["expression"] > 0
isoform_expr_long = isoform_expr_long[_is_expressed]

In [None]:
from scipy.sparse import issparse
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict

# Cell types to plot and their colors. Defined here so the cell does not
# depend on a later cell having been executed first (it previously failed
# with NameError on a fresh kernel run from top to bottom).
cell_type_order = ["TCells", "NK Cells", "BCells", "Monocyte-derived", "Megakaryocytes"]
cell_type_colors = {
    "TCells": "#4daf4a",
    "NK Cells": "#fdb462",
    "BCells": "#377eb8",
    "Monocyte-derived": "#ff7f00",
    "Megakaryocytes": "#a6cee3"
}

# Expression matrix; CSR gives cheap row access when it is sparse.
X = adata_i_filtered.X
if issparse(X):
    X = X.tocsr()

# Per-cell and per-isoform metadata.
cell_types = adata_i_filtered.obs["gen_cell_type"].values
cell_ids = adata_i_filtered.obs_names.values
isoform_ids = adata_i_filtered.var_names
ensg_ids = isoform_ids.str.extract(r"(ENSG\d+)")[0].values

# isoform name -> ENSG id; isoforms without an ENSG id in their name are dropped.
isoform_to_ensg = pd.Series(ensg_ids, index=isoform_ids).dropna()

# One record per (cell, gene): how many isoforms of that gene the cell expresses.
records = []

for i in range(X.shape[0]):
    row = X[i]
    if issparse(row):
        row = row.toarray().flatten()

    expressed_isoform_indices = np.where(row > 0)[0]
    expressed_isoforms = isoform_ids[expressed_isoform_indices]
    # reindex + dropna discards expressed isoforms that have no ENSG id.
    expressed_genes = isoform_to_ensg.reindex(expressed_isoforms).dropna()

    if not expressed_genes.empty:
        gene_isoform_groups = expressed_genes.groupby(expressed_genes).size()
        for gene, n_iso in gene_isoform_groups.items():
            records.append((cell_types[i], cell_ids[i], gene, n_iso))

# Convert to DataFrame; .copy() so the column assignments below act on an
# independent frame rather than a filtered view (avoids SettingWithCopyWarning).
df_cell_gene_iso = pd.DataFrame(records, columns=["cell_type", "cell_id", "ENSG", "n_isoforms"])
df_cell_gene_iso = df_cell_gene_iso[df_cell_gene_iso["cell_type"].isin(cell_type_order)].copy()
df_cell_gene_iso["cell_type"] = pd.Categorical(df_cell_gene_iso["cell_type"], categories=cell_type_order, ordered=True)

# String label with values >= 20 pooled as "20+" (stored, not used for plotting).
df_cell_gene_iso["n_isoforms_capped"] = df_cell_gene_iso["n_isoforms"].apply(lambda x: "20+" if x >= 20 else str(x))
order = list(map(str, range(1, 21))) + ["20+"]
df_cell_gene_iso["n_isoforms_capped"] = pd.Categorical(df_cell_gene_iso["n_isoforms_capped"], categories=order, ordered=True)

# NOTE(review): the plot uses the raw "n_isoforms" values; values above the
# y-limit are cut off by plt.ylim rather than capped at 20, although the top
# tick is labelled "20+". Confirm whether the data should be clipped instead.
plt.figure(figsize=(10, 5))
sns.violinplot(
    data=df_cell_gene_iso, x="cell_type", y="n_isoforms",
    order=cell_type_order, palette=cell_type_colors,
    inner=None, linewidth=0
)
sns.boxplot(
    data=df_cell_gene_iso, x="cell_type", y="n_isoforms",
    order=cell_type_order,
    showcaps=False, showfliers=False,
    boxprops={'facecolor': 'None', 'edgecolor': 'black'},
    whiskerprops={'linewidth': 0}
)

# Overlay Q1, median, Q3 lines and their values next to each violin.
for i, ct in enumerate(cell_type_order):
    vals = df_cell_gene_iso[df_cell_gene_iso["cell_type"] == ct]["n_isoforms"].values
    if vals.size == 0:
        # No records for this cell type: nothing to annotate.
        continue
    q1, median, q3 = np.percentile(vals, [25, 50, 75])
    plt.hlines([q1, median, q3], xmin=i - 0.3, xmax=i + 0.3,
               colors=["gray", "black", "gray"], linestyles="--", linewidth=1)
    for y, color in zip([q1, median, q3], ["gray", "black", "gray"]):
        plt.text(i + 0.32, y, f"{y:.1f}", va="center", ha="left", fontsize=8, color=color)

plt.ylim(0, 21)
plt.yticks(list(range(0, 20)) + [20], labels=list(range(0, 20)) + ["20+"])
plt.ylabel("Number of Isoforms per Gene per Cell")
plt.xlabel("Cell Type")
plt.title("Isoform Diversity per Gene per Cell by Cell Type (Capped at 20)")
plt.tight_layout()
plt.savefig(
    "Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/Summary_Plots/isoforms_per_gene_per_cell_violin.pdf",
    dpi=600, transparent=True, bbox_inches="tight"
)
plt.show()

In [None]:
from collections import defaultdict

from scipy.sparse import issparse
import pandas as pd
import numpy as np

# Expression matrix; CSR gives cheap row access when it is sparse.
X = adata_i_filtered.X
if issparse(X):
    X = X.tocsr()

# Cell type per cell and isoform name per column.
cell_types = adata_i_filtered.obs["gen_cell_type"].values
isoform_ids = adata_i_filtered.var_names

# ENSG id embedded in each isoform name (NaN when the name has none).
ensg_ids = isoform_ids.str.extract(r"(ENSG\d+)")[0].values

# Lookup table of isoforms that do have an ENSG id.
isoform_info = pd.DataFrame({
    "isoform": isoform_ids,
    "ENSG": ensg_ids
}).dropna()

# Columns that carry an ENSG id. Isoforms without one are excluded from the
# per-gene tally: previously they were all added under a single NaN key and
# showed up in df_plot as one spurious "gene" per cell type.
has_ensg = pd.notna(ensg_ids)

# cell type -> gene -> set of isoforms seen expressed in at least one cell.
gene_isoform_counts = defaultdict(lambda: defaultdict(set))

# Iterate by cell
for i in range(X.shape[0]):
    row = X[i]
    if issparse(row):
        row = row.toarray().flatten()
    expressed = (row > 0) & has_ensg
    expressed_isoforms = isoform_ids[expressed]
    expressed_ensg = ensg_ids[expressed]
    ct = cell_types[i]
    for iso, ensg in zip(expressed_isoforms, expressed_ensg):
        gene_isoform_counts[ct][ensg].add(iso)

# One record per (cell type, gene): number of distinct isoforms observed.
records = [
    (ct, ensg, len(isoforms))
    for ct, genes in gene_isoform_counts.items()
    for ensg, isoforms in genes.items()
]

df_plot = pd.DataFrame(records, columns=["cell_type", "ENSG", "n_isoforms"])

In [None]:
# Text label per row: the count itself, or "20+" once it reaches 20.
_capped_labels = ["20+" if n >= 20 else str(n) for n in df_plot["n_isoforms"]]

# Ordered category list "1" ... "20", "20+" ("20" itself is never produced,
# because counts of exactly 20 already fall into "20+").
order = [str(k) for k in range(1, 21)]
order.append("20+")

# Store the labels as an ordered categorical so plots keep numeric order.
df_plot["n_isoforms_capped"] = pd.Categorical(_capped_labels, categories=order, ordered=True)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Cell types to plot (in display order) and their colors.
cell_type_order = ["TCells", "NK Cells", "BCells", "Monocyte-derived", "Megakaryocytes"]
cell_type_colors = {
    "TCells": "#4daf4a",
    "NK Cells": "#fdb462",
    "BCells": "#377eb8",
    "Monocyte-derived": "#ff7f00",
    "Megakaryocytes": "#a6cee3"
}

# Restrict to the plotted cell types and fix their order.
df_plot = df_plot[df_plot["cell_type"].isin(cell_type_order)].copy()
df_plot["cell_type"] = pd.Categorical(df_plot["cell_type"], categories=cell_type_order, ordered=True)

plt.figure(figsize=(10, 5))

# Violin per cell type, colored by the custom palette.
sns.violinplot(
    data=df_plot,
    x="cell_type",
    y="n_isoforms",
    order=cell_type_order,
    palette=cell_type_colors,
    inner=None,
    linewidth=0
)

# Transparent box overlay (no caps, fliers or visible whiskers).
sns.boxplot(
    data=df_plot,
    x="cell_type",
    y="n_isoforms",
    order=cell_type_order,
    showcaps=False,
    showfliers=False,
    boxprops={'facecolor': 'None', 'edgecolor': 'black'},
    whiskerprops={'linewidth': 0}
)

# NOTE(review): the raw "n_isoforms" values are plotted; values above the
# y-limit are cut off rather than capped at 20, although the top tick is
# labelled "20+". Confirm whether the data should be clipped instead.
plt.ylim(0, 21)
plt.yticks(list(range(0, 20)) + [20], labels=list(range(0, 20)) + ["20+"])
plt.ylabel("Number of Isoforms per Gene")
plt.xlabel("Cell Type")
plt.title("Isoform Diversity per Gene by Cell Type (Capped at 20)")

# Q1 / median / Q3 overlay per cell type. The quartiles are computed from
# df_plot — the same data the violins show. The previous code read them from
# `df_iso_per_cell`, a frame this cell neither builds nor plots.
for i, ct in enumerate(cell_type_order):
    vals = df_plot[df_plot["cell_type"] == ct]["n_isoforms"].values
    if vals.size == 0:
        # No genes recorded for this cell type: nothing to annotate.
        continue

    q1, median, q3 = np.percentile(vals, [25, 50, 75])

    # Horizontal dashed lines across the violin (in data space).
    plt.hlines([q1, median, q3], xmin=i - 0.3, xmax=i + 0.3,
               colors=["gray", "black", "gray"],
               linestyles="--", linewidth=1)

    # Values printed slightly to the right of the violin.
    for y, color in zip([q1, median, q3], ["gray", "black", "gray"]):
        plt.text(i + 0.32, y, f"{y:.1f}", va="center", ha="left", fontsize=8, color=color)
plt.tight_layout()
plt.savefig(
    "Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/Summary_Plots/isoform_per_gene_by_celltype_capped.pdf",
    dpi=600, transparent=True, bbox_inches="tight"
)
plt.show()

In [None]:
# Quartiles of n_isoforms within each cell type: one row per cell type,
# one column per requested quantile.
summary_stats = (
    df_plot.groupby("cell_type")["n_isoforms"]
    .quantile([0.25, 0.5, 0.75])
    .unstack()
    .set_axis(["Q1", "Median", "Q3"], axis=1)
)

# Present the rows in the plotting order of the cell types.
summary_stats = summary_stats.loc[cell_type_order]

# Plain-text table rounded to two decimals.
print(summary_stats.round(2).to_string())

In [None]:
# Fraction of genes in T cells with at most 1, 2 and 5 isoforms.
ct = "TCells"
vals = df_plot.loc[df_plot["cell_type"] == ct, "n_isoforms"]
for _threshold in (1, 2, 5):
    print(f"≤{_threshold}:", (vals <= _threshold).mean())

In [None]:
from scipy.sparse import issparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Expression matrix; CSR gives cheap row access when it is sparse.
X = adata_i_filtered.X
if issparse(X):
    X = X.tocsr()

# Per-cell and per-isoform metadata.
cell_types = adata_i_filtered.obs["gen_cell_type"].values
isoform_ids = adata_i_filtered.var_names
ensg_ids = isoform_ids.str.extract(r"(ENSG\d+)")[0].values

# isoform name -> ENSG id (isoforms without an ENSG id are dropped).
isoform_to_ensg = pd.Series(ensg_ids, index=isoform_ids).dropna()

# Integer code per isoform column identifying its gene; pd.factorize assigns
# -1 to missing values, i.e. to isoforms without an ENSG id. Computing this
# once replaces a per-isoform Series lookup inside the per-cell loop.
gene_codes, _ = pd.factorize(ensg_ids)

# Number of distinct ENSG genes with at least one expressed isoform, per cell.
n_genes_per_cell = []
for i in range(X.shape[0]):
    row = X[i]
    if issparse(row):
        row = row.toarray().flatten()
    expressed_codes = gene_codes[row > 0]
    expressed_codes = expressed_codes[expressed_codes >= 0]
    n_genes_per_cell.append(int(np.unique(expressed_codes).size))

# Build DataFrame; .copy() so the categorical assignment acts on an
# independent frame rather than a filtered view.
df_genes_per_cell = pd.DataFrame({
    "cell_type": cell_types,
    "n_genes": n_genes_per_cell
})
df_genes_per_cell = df_genes_per_cell[df_genes_per_cell["cell_type"].isin(cell_type_order)].copy()
df_genes_per_cell["cell_type"] = pd.Categorical(df_genes_per_cell["cell_type"], categories=cell_type_order, ordered=True)

# Violin with a transparent box overlay; both use the same explicit order.
plt.figure(figsize=(10, 5))
sns.violinplot(data=df_genes_per_cell, x="cell_type", y="n_genes", order=cell_type_order,
               palette=cell_type_colors, inner=None, linewidth=0)
sns.boxplot(data=df_genes_per_cell, x="cell_type", y="n_genes", order=cell_type_order,
            showcaps=False, showfliers=False,
            boxprops={'facecolor': 'None', 'edgecolor': 'black'},
            whiskerprops={'linewidth': 0})

plt.ylabel("Number of Genes per Cell")
plt.xlabel("Cell Type")
plt.title("Gene Diversity per Cell by Cell Type")
plt.tight_layout()
plt.savefig("Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/Summary_Plots/genes_per_cell_violin.pdf",
            dpi=600, transparent=True, bbox_inches="tight")
plt.show()

In [None]:
# Print Q1 / median / Q3 of the number of genes per cell for each cell type.
for ct in cell_type_order:
    _in_cell_type = df_genes_per_cell["cell_type"] == ct
    vals = df_genes_per_cell.loc[_in_cell_type, "n_genes"]
    q1, median, q3 = (np.percentile(vals, pct) for pct in (25, 50, 75))
    print(f"{ct:<18} Q1: {q1:<6.1f} Median: {median:<6.1f} Q3: {q3:<6.1f}")

In [None]:
from scipy.sparse import issparse
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Expression matrix (converted to CSR when sparse).
X = adata_i_filtered.X
if issparse(X):
    X = X.tocsr()

# Per-cell and per-isoform metadata.
cell_types = adata_i_filtered.obs["gen_cell_type"].values
isoform_ids = adata_i_filtered.var_names

# Number of isoforms with expression > 0 in each cell, computed in one
# vectorized pass instead of densifying and scanning one row at a time.
# np.asarray(...).ravel() flattens the (n_cells, 1) result a sparse matrix
# returns and leaves a dense 1-D result unchanged.
n_isoforms_per_cell = np.asarray((X > 0).sum(axis=1)).ravel()

# Build DataFrame; .copy() so the categorical assignment acts on an
# independent frame rather than a filtered view.
df_isoforms_per_cell = pd.DataFrame({
    "cell_type": cell_types,
    "n_isoforms": n_isoforms_per_cell
})
df_isoforms_per_cell = df_isoforms_per_cell[df_isoforms_per_cell["cell_type"].isin(cell_type_order)].copy()
df_isoforms_per_cell["cell_type"] = pd.Categorical(df_isoforms_per_cell["cell_type"], categories=cell_type_order, ordered=True)

# Violin with a transparent box overlay; both use the same explicit order.
plt.figure(figsize=(10, 5))
sns.violinplot(data=df_isoforms_per_cell, x="cell_type", y="n_isoforms", order=cell_type_order,
               palette=cell_type_colors, inner=None, linewidth=0)
sns.boxplot(data=df_isoforms_per_cell, x="cell_type", y="n_isoforms", order=cell_type_order,
            showcaps=False, showfliers=False,
            boxprops={'facecolor': 'None', 'edgecolor': 'black'},
            whiskerprops={'linewidth': 0})

plt.ylabel("Number of Isoforms per Cell")
plt.xlabel("Cell Type")
plt.title("Isoform Diversity per Cell by Cell Type")
plt.tight_layout()
plt.savefig("Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/Summary_Plots/isoforms_per_cell_violin.pdf",
            dpi=600, transparent=True, bbox_inches="tight")
plt.show()

In [None]:
# Print Q1 / median / Q3 of the number of isoforms per cell for each cell type.
for ct in cell_type_order:
    # Build the mask from df_isoforms_per_cell itself. The previous code used
    # df_genes_per_cell["cell_type"] as the mask, which only selects the right
    # rows as long as both frames happen to share the same index and labels.
    vals = df_isoforms_per_cell[df_isoforms_per_cell["cell_type"] == ct]["n_isoforms"]
    if vals.empty:
        # np.percentile is undefined for an empty selection.
        print(f"{ct:<18} no cells")
        continue
    q1 = np.percentile(vals, 25)
    median = np.percentile(vals, 50)
    q3 = np.percentile(vals, 75)
    print(f"{ct:<18} Q1: {q1:<6.1f} Median: {median:<6.1f} Q3: {q3:<6.1f}")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Cell types to plot (in display order) and their colors.
cell_type_order = ["TCells", "NK Cells", "BCells", "Monocyte-derived", "Megakaryocytes"]
cell_type_colors = dict(zip(
    cell_type_order,
    ["#4daf4a", "#fdb462", "#377eb8", "#ff7f00", "#a6cee3"],
))

# New isoforms are the var_names containing "BambuTx".
is_new_isoform = adata_i_filtered.var_names.str.contains("BambuTx")

# Expression matrix, densified if it exposes .toarray().
X = adata_i_filtered.X
if hasattr(X, "toarray"):
    X = X.toarray()

# Columns of the new isoforms only.
new_isoform_expr = X[:, is_new_isoform]

# Cells x new-isoforms table, plus the cell type of every cell.
df_expr = pd.DataFrame(
    new_isoform_expr,
    index=adata_i_filtered.obs_names,
    columns=adata_i_filtered.var_names[is_new_isoform],
)
df_expr["cell_type"] = adata_i_filtered.obs["gen_cell_type"]

# Long format, restricted to (cell, isoform) pairs with expression > 0.
df_long = df_expr.melt(id_vars="cell_type", var_name="isoform", value_name="expression")
df_long = df_long[df_long["expression"] > 0]

# Number of expressing cells per (isoform, cell type).
counts = (
    df_long.groupby(["isoform", "cell_type"])
    .size()
    .reset_index(name="n_cells_expressing")
)

# Violin per cell type with an inner box; cut=0 keeps the density within the data range.
fig, ax = plt.subplots(figsize=(7, 4))
sns.violinplot(data=counts, x="cell_type", y="n_cells_expressing",
               order=cell_type_order, palette=cell_type_colors,
               inner="box", cut=0, ax=ax)

ax.set_xlabel("Cell Type")
ax.set_ylabel("Number of Cells Expressing Isoform")
ax.set_title("Prevalence of New Isoforms Across Cells by Cell Type")
sns.despine()

plt.tight_layout()
plt.savefig("Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/Summary_Plots/new_isoform_cellular_prevalence_violin_by_celltype.pdf",
            dpi=600, transparent=True, bbox_inches="tight")
plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Cell types to plot (in display order) and their colors.
cell_type_order = ["TCells", "NK Cells", "BCells", "Monocyte-derived", "Megakaryocytes"]
cell_type_colors = dict(zip(
    cell_type_order,
    ["#4daf4a", "#fdb462", "#377eb8", "#ff7f00", "#a6cee3"],
))

# New isoforms are the var_names containing "BambuTx".
is_new_isoform = adata_i_filtered.var_names.str.contains("BambuTx")

# Expression matrix, densified if it exposes .toarray().
X = adata_i_filtered.X
if hasattr(X, "toarray"):
    X = X.toarray()

# Columns of the new isoforms only.
new_isoform_expr = X[:, is_new_isoform]

# Cells x new-isoforms table, plus the cell type of every cell.
df_expr = pd.DataFrame(
    new_isoform_expr,
    index=adata_i_filtered.obs_names,
    columns=adata_i_filtered.var_names[is_new_isoform],
)
df_expr["cell_type"] = adata_i_filtered.obs["gen_cell_type"]

# Long format, restricted to (cell, isoform) pairs with expression > 0.
df_long = df_expr.melt(id_vars="cell_type", var_name="isoform", value_name="expression")
df_long = df_long[df_long["expression"] > 0]

# Number of expressing cells per (isoform, cell type).
counts = (
    df_long.groupby(["isoform", "cell_type"])
    .size()
    .reset_index(name="n_cells_expressing")
)

# Total number of cells of each type, used as the denominator.
total_cells_per_type = adata_i_filtered.obs["gen_cell_type"].value_counts().to_dict()


def _percent_of_cell_type(row):
    """Expressing cells as a percentage of all cells of the row's cell type."""
    return 100 * row["n_cells_expressing"] / total_cells_per_type[row["cell_type"]]


counts["percent_expressing"] = counts.apply(_percent_of_cell_type, axis=1)

# Violin per cell type with an inner box; cut=0 keeps the density within the data range.
fig, ax = plt.subplots(figsize=(7, 4))
sns.violinplot(data=counts, x="cell_type", y="percent_expressing",
               order=cell_type_order, palette=cell_type_colors,
               inner="box", cut=0, ax=ax)

ax.set_xlabel("Cell Type")
ax.set_ylabel("% of Cells Expressing Isoform")
ax.set_title("Prevalence of New Isoforms by Cell Type (% of Cluster)")
sns.despine()

plt.tight_layout()
plt.savefig("Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/Summary_Plots/new_isoform_percent_expressing_violin_by_celltype_percent.pdf",
            dpi=600, transparent=True, bbox_inches="tight")
plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the GTF as strings (dtype=str avoids mixed-dtype warnings).
gtf_file = "Homo_sapiens.GRCh38.113.gtf"

gtf = pd.read_csv(
    gtf_file, 
    sep="\t", 
    comment="#", 
    header=None, 
    names=["chrom", "source", "feature", "start", "end", "score", "strand", "frame", "attribute"],
    dtype=str
)

# Pull transcript_id, gene_id and gene_name out of the attribute column.
gtf["transcript_id"] = gtf["attribute"].str.extract('transcript_id "([^"]+)"')
gtf["gene_id"] = gtf["attribute"].str.extract('gene_id "([^"]+)"')
gtf["gene_name"] = gtf["attribute"].str.extract('gene_name "([^"]+)"')

# Mapping transcript_id -> gene_name, falling back to gene_id when the name
# is missing. Both columns are taken from the same transcript-indexed frame so
# that fillna aligns row by row; filling from gtf["gene_id"] (integer index)
# never matched the transcript_id index, so the fallback was silently skipped.
gtf_by_transcript = (
    gtf.dropna(subset=["transcript_id"])
    .drop_duplicates("transcript_id")
    .set_index("transcript_id")
)
gtf_mapping = gtf_by_transcript["gene_name"].fillna(gtf_by_transcript["gene_id"])

# One row per isoform (var index), in var order.
isoform_df = pd.DataFrame(adata_i_filtered.var.index)
isoform_df.columns = ["transcript_id"]

# New isoforms are the names containing "BambuTx".
isoform_df["is_new_isoform"] = isoform_df["transcript_id"].str.contains("BambuTx")

# NOTE(review): other cells split var_names on ":" into
# gene_name / gene_id / transcript_id, so the full var name used as the lookup
# key here may never match a GTF transcript_id — confirm the intended key.
isoform_df["gene_name"] = isoform_df["transcript_id"].map(gtf_mapping)

# Display name: "GeneName:TranscriptID" when a gene name was found, else the ID.
isoform_df["clean_name"] = np.where(
    isoform_df["gene_name"].notnull(),
    isoform_df["gene_name"] + ":" + isoform_df["transcript_id"],
    isoform_df["transcript_id"]
)

# Only keep new isoforms
new_isoform_df = isoform_df[isoform_df["is_new_isoform"]].copy()

# Expression matrix, densified if it exposes .toarray().
X = adata_i_filtered.X
if hasattr(X, "toarray"):
    X = X.toarray()

# Column positions of the new isoforms as a plain integer array (isoform_df
# has a default 0..n-1 index, so its labels are the column positions).
new_isoform_indices = new_isoform_df.index.to_numpy()
new_isoform_expr = X[:, new_isoform_indices]

# Cells x new-isoforms table labelled with the display names.
df = pd.DataFrame(new_isoform_expr, columns=new_isoform_df["clean_name"].values, index=adata_i_filtered.obs_names)

# Add cell type
df["cell_type"] = adata_i_filtered.obs["gen_cell_type"].values

# Long format with a boolean "expressed" flag.
df_long = df.melt(id_vars="cell_type", var_name="isoform", value_name="expression")
df_long["expressed"] = df_long["expression"] > 0

# Percentage of cells expressing each isoform, per cell type.
summary = df_long.groupby(["cell_type", "isoform"])["expressed"].mean().unstack(fill_value=0) * 100

# Sort isoforms alphabetically
summary = summary[sorted(summary.columns)]

In [None]:
# Heatmap of `summary` (cell types x new isoforms, % of cells expressing).
# The figure width grows with the number of isoform columns.
_heatmap_width = len(summary.columns) * 0.3
plt.figure(figsize=(_heatmap_width, 5))

_colorbar_options = {"label": "% of Cells Expressing"}
sns.heatmap(
    summary,
    cmap="Reds",
    square=False,
    linecolor='gray',
    linewidths=0.3,
    cbar_kws=_colorbar_options,
)

plt.title("New Isoform Expression by Cell Type")
plt.xlabel("New Isoforms")
plt.ylabel("Cell Type")
plt.xticks(rotation=90)  # isoform names are long; print them vertically
plt.tight_layout()

# Saving is currently disabled.
#plt.savefig("Intermediate_Files/Clustering_05012025/Figures/Summary/new_isoform_celltype_heatmap.pdf",
#            dpi=600, transparent=True, bbox_inches="tight")
plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the GTF as a 9-column, tab-separated table ('#' header lines skipped);
# every column is read as a string.
gtf_file = "Homo_sapiens.GRCh38.113.gtf"

gtf = pd.read_csv(
    gtf_file,
    sep="\t",
    comment="#",
    header=None,
    names=["chrom", "source", "feature", "start", "end", "score", "strand", "frame", "attribute"],
    dtype=str
)

# Pull the identifiers out of the free-text attribute column
# (NaN where the attribute string has no such key)
gtf["transcript_id"] = gtf["attribute"].str.extract('transcript_id "([^"]+)"')
gtf["gene_id"] = gtf["attribute"].str.extract('gene_id "([^"]+)"')
gtf["gene_name"] = gtf["attribute"].str.extract('gene_name "([^"]+)"')

# Build mapping transcript_id -> gene_name (or gene_id fallback).
# The fallback has to come from the same transcript-indexed table: filling from
# gtf["gene_id"] directly aligns on the GTF's integer row labels, which never
# match transcript IDs, so missing gene names would silently stay missing.
_gtf_by_transcript = (
    gtf.dropna(subset=["transcript_id"])  # rows without a transcript_id cannot be looked up
    .drop_duplicates("transcript_id")
    .set_index("transcript_id")
)
gtf_mapping = _gtf_by_transcript["gene_name"].fillna(_gtf_by_transcript["gene_id"])

# One row per isoform, in the same order as adata_i_filtered.var
isoform_df = pd.DataFrame({"transcript_id": adata_i_filtered.var.index})

# Flag isoforms whose ID contains the "BambuTx" tag
isoform_df["is_new_isoform"] = isoform_df["transcript_id"].str.contains("BambuTx")

# Gene name from the GTF mapping (missing when the ID is not in the GTF)
isoform_df["gene_name"] = isoform_df["transcript_id"].map(gtf_mapping)

# Display label: "GeneName:TranscriptID" when a gene name exists, else the bare ID
_has_gene_name = isoform_df["gene_name"].notnull()
_prefixed_name = isoform_df["gene_name"] + ":" + isoform_df["transcript_id"]
isoform_df["clean_name"] = np.where(_has_gene_name, _prefixed_name, isoform_df["transcript_id"])

# Independent copy restricted to the new isoforms
new_isoform_df = isoform_df.loc[isoform_df["is_new_isoform"]].copy()

# Row labels of the new isoforms as a plain integer array; used as column
# positions into the expression matrix
new_isoform_indices = new_isoform_df.index.to_numpy()

# Expression matrix; densified when the stored matrix exposes toarray()
X = adata_i_filtered.X
X = X.toarray() if hasattr(X, "toarray") else X

In [None]:
# Columns of X that belong to new isoforms
new_isoform_expr = X[:, new_isoform_indices]

# Wide table: cells (rows) x new isoforms (columns named by clean_name)
df = pd.DataFrame(
    new_isoform_expr,
    index=adata_i_filtered.obs_names,
    columns=new_isoform_df["clean_name"].values,
)

# Attach each cell's general cell type
df["cell_type"] = adata_i_filtered.obs["gen_cell_type"].values

In [None]:
# Long format: one row per (cell, isoform), with a boolean "expressed" flag
df_long = df.melt(id_vars="cell_type", var_name="isoform", value_name="expression")
df_long["expressed"] = df_long["expression"].gt(0)

# Percentage of cells expressing each isoform within each cell type
_fraction_expressing = df_long.groupby(["cell_type", "isoform"])["expressed"].mean()
summary = _fraction_expressing.unstack(fill_value=0) * 100

# Columns in alphabetical isoform order
summary = summary.loc[:, sorted(summary.columns)]

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- (same up to df_long and summary) ---

# An isoform counts as belonging to a novel gene when its transcript ID itself
# starts with "Bambu"
new_isoform_df["is_novel_gene"] = new_isoform_df["transcript_id"].str.startswith("Bambu")

# Carry the flag over to the long table, matching on the clean isoform name
_novel_flag_by_name = new_isoform_df.set_index("clean_name")["is_novel_gene"]
df_long["is_novel_gene"] = df_long["isoform"].map(_novel_flag_by_name)

# The plotting helpers group on "clean_name"; here it is simply the isoform label
df_long["clean_name"] = df_long["isoform"]

# Two subsets: isoforms of known genes vs. isoforms of novel genes
df_known = df_long.loc[df_long["is_novel_gene"].eq(False)]  # ENSG → known gene
df_novel = df_long.loc[df_long["is_novel_gene"].eq(True)]   # Bambu → novel gene

In [None]:
def plot_heatmap(df_subset, filename):
    """Plot and save a cell-type x isoform heatmap of the percentage of expressing cells.

    Parameters
    ----------
    df_subset : long-format DataFrame with "cell_type", "clean_name" and a
        boolean "expressed" column.
    filename : path the figure is written to.

    Isoform columns are ordered alphabetically by clean_name.
    """
    pct_expressing = (
        df_subset.groupby(["cell_type", "clean_name"])["expressed"]
        .mean()
        .unstack(fill_value=0)
        * 100
    )
    pct_expressing = pct_expressing[sorted(pct_expressing.columns)]

    # Width grows with the number of isoforms so tick labels stay legible
    fig, ax = plt.subplots(figsize=(len(pct_expressing.columns) * 0.3, 4))
    sns.heatmap(
        pct_expressing,
        cmap="Reds",
        linewidths=0.5,
        linecolor='gray',
        cbar_kws={"label": "% of Cluster Expressing Isoform"},
        square=False,
        yticklabels=True,
        ax=ax,
    )
    # Keep the row labels horizontal and right-aligned
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0, ha='right', fontsize=10)

    ax.set_xlabel("")
    ax.set_ylabel("")
    plt.setp(ax.get_xticklabels(), rotation=90)
    fig.tight_layout()

    fig.savefig(filename, dpi=600, transparent=True, bbox_inches="tight")
    plt.show()

# One heatmap per subset: isoforms of known genes first, then isoforms of novel genes
for _subset, _pdf_path in (
    (df_known,
     "Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/Summary_Plots/new_isoform_known_celltype_heatmap.pdf"),
    (df_novel,
     "Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/Summary_Plots/new_isoform_novel_celltype_heatmap.pdf"),
):
    plot_heatmap(_subset, _pdf_path)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- (same up to df_long and summary) ---

# An isoform counts as belonging to a novel gene when its transcript ID itself
# starts with "Bambu"
new_isoform_df["is_novel_gene"] = new_isoform_df["transcript_id"].str.startswith("Bambu")

# Carry the flag over to the long table, matching on the clean isoform name
_novel_flag_by_name = new_isoform_df.set_index("clean_name")["is_novel_gene"]
df_long["is_novel_gene"] = df_long["isoform"].map(_novel_flag_by_name)

# The plotting helpers group on "clean_name"; here it is simply the isoform label
df_long["clean_name"] = df_long["isoform"]

# Two subsets: isoforms of known genes vs. isoforms of novel genes
df_known = df_long.loc[df_long["is_novel_gene"].eq(False)]  # ENSG → known gene
df_novel = df_long.loc[df_long["is_novel_gene"].eq(True)]   # Bambu → novel gene

# Largest absolute expression value over all rows (candidate symmetric colour limit)
abs_max = df_long["expression"].abs().max()

In [None]:
def plot_heatmap(df_subset, filename, vmax):
    """Plot and save a cell-type x isoform heatmap of mean expression.

    Parameters
    ----------
    df_subset : long-format DataFrame with "cell_type", "clean_name" and
        "expression" columns.
    filename : path the figure is written to.
    vmax : colour limit; the diverging "coolwarm" scale spans [-vmax, vmax]
        and is centred on 0.

    Isoform columns are ordered alphabetically by clean_name.
    """
    mean_expr = (
        df_subset.groupby(["cell_type", "clean_name"])["expression"]
        .mean()
        .unstack(fill_value=0)
    )
    mean_expr = mean_expr[sorted(mean_expr.columns)]

    # Width grows with the number of isoforms so tick labels stay legible
    fig, ax = plt.subplots(figsize=(len(mean_expr.columns) * 0.5, 4))
    sns.heatmap(
        mean_expr,
        cmap="coolwarm",
        linewidths=0.5,
        linecolor='gray',
        cbar_kws={"label": "Average Isoform Expression"},
        square=False,
        vmin=-vmax,
        vmax=vmax,
        center=0,
        yticklabels=True,
        ax=ax,
    )
    # Keep the row labels horizontal and right-aligned
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0, ha='right', fontsize=10)

    ax.set_xlabel("")
    ax.set_ylabel("")
    plt.setp(ax.get_xticklabels(), rotation=90)
    fig.tight_layout()

    fig.savefig(filename, dpi=600, transparent=True, bbox_inches="tight")
    plt.show()

# Mean-expression heatmaps for both subsets, sharing the same colour limit
for _subset, _pdf_path in (
    (df_known,
     "Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/Summary_Plots/new_isoform_known_celltype_heatmap_ave_expression.pdf"),
    (df_novel,
     "Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/Summary_Plots/new_isoform_novel_celltype_heatmap_ave_expression.pdf"),
):
    plot_heatmap(_subset, _pdf_path, vmax=2)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

def plot_clustermap_with_coexpression(df_subset, filename, vmax):
    """Draw a column-clustered mean-expression map and an isoform correlation heatmap.

    Parameters
    ----------
    df_subset : long-format DataFrame with "cell_type", "clean_name" and
        "expression" columns.
    filename : base path; ".pdf" is replaced by "_dendrogram.pdf" for the
        clustermap and by "_coexpression.pdf" for the correlation heatmap.
    vmax : colour limit of the clustermap, which spans [-vmax, vmax] around 0.
    """
    # Mean expression per cell type (rows) and isoform (columns, alphabetical)
    mean_expr = (
        df_subset.groupby(["cell_type", "clean_name"])["expression"]
        .mean()
        .unstack(fill_value=0)
    )
    mean_expr = mean_expr[sorted(mean_expr.columns)]
    n_isoforms = len(mean_expr.columns)

    # Only the isoform columns are clustered; cell-type rows keep their order
    grid = sns.clustermap(
        mean_expr,
        cmap="coolwarm",
        linewidths=0.5, linecolor='gray',
        vmin=-vmax, vmax=vmax, center=0,
        figsize=(n_isoforms * 0.3, 5.5),
        row_cluster=False, col_cluster=True,
        cbar_kws={"label": ""},
        dendrogram_ratio=(0.1, 0.5),
        cbar_pos=(0.02, 0.52, 0.03, 0.2),
    )

    heat_ax = grid.ax_heatmap
    heat_ax.set_xlabel("")
    heat_ax.set_ylabel("", labelpad=20)
    heat_ax.yaxis.set_label_position("left")
    heat_ax.set_xticklabels(heat_ax.get_xticklabels(), rotation=90)
    heat_ax.set_yticklabels(heat_ax.get_yticklabels(), rotation=0, ha='right', va='center', fontsize=10)

    # Re-align every row label to the left and nudge it down by 0.3 data units
    for tick_label in heat_ax.get_yticklabels():
        tick_label.set_rotation(0)
        tick_label.set_horizontalalignment('left')
        x_pos, y_pos = tick_label.get_position()
        tick_label.set_position((x_pos, y_pos - 0.3))

    # Force a render so the adjusted labels are laid out before saving
    heat_ax.figure.canvas.draw()

    # Pairwise isoform correlation, with isoforms in dendrogram order
    clustered_isoforms = [mean_expr.columns[k] for k in grid.dendrogram_col.reordered_ind]
    isoform_corr = mean_expr[clustered_isoforms].corr()

    plt.figure(figsize=(n_isoforms * 0.3, 10))
    sns.heatmap(
        isoform_corr,
        cmap="coolwarm",
        linewidths=0.5, linecolor='gray',
        vmin=-1, vmax=1, center=0,
        xticklabels=True, yticklabels=True,
    )

    plt.xlabel("Isoforms (Co-expression)")
    plt.ylabel("Isoforms (Co-expression)")
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.tight_layout()

    # The clustermap is saved through its grid; the correlation heatmap is the
    # current pyplot figure at this point
    grid.savefig(filename.replace(".pdf", "_dendrogram.pdf"), dpi=600, transparent=True, bbox_inches="tight")
    plt.savefig(filename.replace(".pdf", "_coexpression.pdf"), dpi=600, transparent=True, bbox_inches="tight")

    plt.show()

# Clustered map + co-expression heatmap per subset; the novel-gene subset uses
# a tighter colour limit (1) than the known-gene subset (2)
for _subset, _pdf_path, _limit in (
    (df_known,
     "Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/Summary_Plots/new_isoform_known_celltype_heatmap_clustered_coexpression.pdf",
     2),
    (df_novel,
     "Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/Summary_Plots/new_isoform_novel_celltype_heatmap_clustered_coexpression.pdf",
     1),
):
    plot_clustermap_with_coexpression(_subset, _pdf_path, vmax=_limit)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, leaves_list

def plot_clustermap_with_coexpression(df_subset, filename, vmax):
    """Draw a column-clustered mean-expression map and a clustered, lower-triangle
    isoform correlation heatmap.

    Parameters
    ----------
    df_subset : long-format DataFrame with "cell_type", "clean_name" and
        "expression" columns.
    filename : base path; ".pdf" is replaced by "_dendrogram.pdf" for the
        clustermap and by "_coexpression.pdf" for the correlation heatmap.
    vmax : colour limit of the clustermap, which spans [-vmax, vmax] around 0.
    """
    # Step 1: Prepare mean expression matrix (cell types x isoforms, alphabetical columns)
    summary = df_subset.groupby(["cell_type", "clean_name"])["expression"].mean().unstack(fill_value=0)
    summary = summary[sorted(summary.columns)]

    # Step 2: Expression clustermap (only isoform columns are clustered)
    g = sns.clustermap(summary, cmap="coolwarm",
                       linewidths=0.5, linecolor='gray',
                       vmin=-vmax, vmax=vmax, center=0,
                       figsize=(len(summary.columns) * 0.3, 5.5),
                       row_cluster=False, col_cluster=True,
                       cbar_kws={"label": ""},
                       dendrogram_ratio=(0.1, 0.5),
                       cbar_pos=(0.02, 0.52, 0.03, 0.2))

    g.ax_heatmap.set_xlabel("")
    g.ax_heatmap.set_ylabel("", labelpad=20)
    g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), rotation=90)
    g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0, ha='right', va='center', fontsize=10)

    # Re-align every row label to the left and nudge it down by 0.3 data units
    for label in g.ax_heatmap.get_yticklabels():
        label.set_horizontalalignment('left')
        pos = label.get_position()
        label.set_position((pos[0], pos[1] - 0.3))

    g.ax_heatmap.figure.canvas.draw()

    # Step 3: Correlation matrix between isoforms (columns in dendrogram order)
    summary_reordered = summary[[summary.columns[i] for i in g.dendrogram_col.reordered_ind]]
    corr = summary_reordered.corr()

    # Step 4: Cluster the correlation matrix manually.
    # An isoform whose mean expression is constant across cell types has
    # undefined (NaN) correlations, and linkage() refuses non-finite distances.
    # Treat undefined correlations as 0 for ordering purposes only; the plotted
    # matrix keeps its NaNs, which the heatmap leaves blank.
    d = pdist(corr.fillna(0.0))
    linkage_matrix = linkage(d, method='average')
    idx = leaves_list(linkage_matrix)
    corr = corr.iloc[idx, :].iloc[:, idx]

    # Step 5: Mask the upper triangle (diagonal included)
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # Step 6: Plot triangle-only heatmap
    plt.figure(figsize=(len(corr.columns) * 0.3, 10))
    sns.heatmap(corr, cmap="coolwarm", linewidths=0.5, linecolor='gray',
                vmin=-1, vmax=1, center=0,
                xticklabels=True, yticklabels=True,
                mask=mask)

    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.title("Co-expression (Correlation) — Triangular View")
    plt.tight_layout()

    # Save both figures: the clustermap through its grid, the correlation
    # heatmap as the current pyplot figure
    g.savefig(filename.replace(".pdf", "_dendrogram.pdf"), dpi=600, transparent=True, bbox_inches="tight")
    plt.savefig(filename.replace(".pdf", "_coexpression.pdf"), dpi=600, transparent=True, bbox_inches="tight")
    plt.show()
    
# Known-gene isoforms use colour limit 2, novel-gene isoforms use 1
for _subset, _pdf_path, _limit in (
    (df_known,
     "Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/Summary_Plots/new_isoform_known_celltype_heatmap_clustered_coexpression.pdf",
     2),
    (df_novel,
     "Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/Summary_Plots/new_isoform_novel_celltype_heatmap_clustered_coexpression.pdf",
     1),
):
    plot_clustermap_with_coexpression(_subset, _pdf_path, vmax=_limit)

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Destination folder for the per-gene heatmaps
output_dir = "Intermediate_Files/Clustering_05012025/Figures/Bambu_Isoforms/PerGene_Heatmaps"
os.makedirs(output_dir, exist_ok=True)

# Expression matrix; densified when the stored matrix exposes toarray()
X = adata_i_filtered.X
X = X.toarray() if hasattr(X, "toarray") else X

# Grouping key per isoform: the GTF gene name when present, otherwise an
# ENSG identifier found inside the transcript ID (missing if neither exists)
_ensg_in_id = isoform_df["transcript_id"].str.extract("(ENSG[0-9]+)")[0]
isoform_df["gene_id"] = isoform_df["gene_name"].combine_first(_ensg_in_id)

# Display label: "GeneName:TranscriptID" when a gene name exists, else the bare ID
_has_gene_name = isoform_df["gene_name"].notnull()
_prefixed_name = isoform_df["gene_name"] + ":" + isoform_df["transcript_id"]
isoform_df["clean_name"] = np.where(_has_gene_name, _prefixed_name, isoform_df["transcript_id"])

# Wide table of all isoforms (cells x isoforms) plus the cell type of each cell
df_expr = pd.DataFrame(
    X,
    index=adata_i_filtered.obs_names,
    columns=isoform_df["clean_name"],
)
df_expr["cell_type"] = adata_i_filtered.obs["gen_cell_type"].values

# Long format: one row per (cell, isoform)
df_long = df_expr.melt(id_vars="cell_type", var_name="isoform", value_name="expression")

# Attach gene key and novelty flag to every row via the isoform label
meta = isoform_df.set_index("clean_name")[["gene_id", "is_new_isoform"]]
df_long = pd.merge(df_long, meta, left_on="isoform", right_index=True, how="left")

# Distinct gene keys, in order of first appearance (missing keys excluded)
gene_ids = df_long["gene_id"].dropna().unique()

# Loop over each gene and plot one heatmap per gene with at least two isoforms.
# A single groupby pass replaces re-scanning the whole long table for every
# gene; sort=False keeps genes in order of first appearance and rows with a
# missing gene_id are skipped, matching the contents of gene_ids.
for gene, df_sub in df_long.groupby("gene_id", sort=False):

    if df_sub["isoform"].nunique() < 2:
        continue  # skip if only one isoform

    # Compute mean expression (cell types x isoforms)
    summary = df_sub.groupby(["cell_type", "isoform"])["expression"].mean().unstack(fill_value=0)

    # Sort isoforms by novelty (known first). A stable sort keeps the original
    # isoform order within each group.
    isoform_order = (
        df_sub[["isoform", "is_new_isoform"]]
        .drop_duplicates()
        .sort_values(by="is_new_isoform", kind="stable")
        .set_index("isoform")
        .index
    )
    # Select columns in isoform_order. The previous
    # summary.columns.intersection(isoform_order) kept summary's own
    # (alphabetical) column order, so the novelty ordering was never applied.
    summary = summary.loc[:, [iso for iso in isoform_order if iso in summary.columns]]

    # Plot; the colour scale is symmetric around 0 up to the largest |mean|
    plt.figure(figsize=(len(summary.columns) * 0.3 + 1, 2 + 0.4 * len(summary.index)))
    sns.heatmap(summary, cmap="coolwarm", center=0,
                vmax=summary.abs().values.max(),
                linewidths=0.5, linecolor='gray',
                cbar_kws={"label": "Mean Expression"})

    plt.title(f"Isoform Expression for {gene}")
    plt.xlabel("Isoform")
    plt.ylabel("Cell Type")
    plt.xticks(rotation=90, fontsize=6)
    plt.yticks(rotation=0)

    plt.tight_layout()
    plt.savefig(f"{output_dir}/{gene}_isoform_expression_heatmap.pdf",
                dpi=600, transparent=True, bbox_inches="tight")
    plt.show()
    plt.close()  # release the figure before moving to the next gene

In [None]:
# (Removed a stray bare `adata_TCell` expression: mid-cell it displays nothing,
# and it raises NameError on a fresh kernel if this cell runs before the cell
# that assigns adata_TCell.)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec

def plot_clustermap_with_coexpression(df_subset, filename, vmax):
    """Save a two-panel figure: clustered-order mean expression above, isoform
    correlation below.

    Parameters
    ----------
    df_subset : long-format DataFrame with "cell_type", "clean_name" and
        "expression" columns.
    filename : path the two-panel figure is written to.
    vmax : colour limit of the expression panel, which spans [-vmax, vmax].

    A seaborn clustermap (rows and columns clustered) is built first; only its
    column order is reused for the two-panel figure.
    """
    mean_expr = (
        df_subset.groupby(["cell_type", "clean_name"])["expression"]
        .mean()
        .unstack(fill_value=0)
    )
    mean_expr = mean_expr[sorted(mean_expr.columns)]
    panel_width = len(mean_expr.columns) * 0.5

    # Clustermap supplies the isoform (column) ordering
    grid = sns.clustermap(
        mean_expr,
        cmap="coolwarm",
        linewidths=0.5, linecolor='gray',
        vmin=-vmax, vmax=vmax, center=0,
        figsize=(panel_width, 10),
        cbar_kws={"label": "Average Isoform Expression"},
        row_cluster=True, col_cluster=True,
    )
    clustered_isoforms = [mean_expr.columns[k] for k in grid.dendrogram_col.reordered_ind]

    # Expression in clustered column order, and the isoform-isoform correlation
    expr_reordered = mean_expr[clustered_isoforms]
    corr_matrix = expr_reordered.corr()

    # Two stacked panels with a 10:4 height ratio
    fig = plt.figure(figsize=(panel_width, 14))
    layout = gridspec.GridSpec(2, 1, height_ratios=[10, 4])

    # --- Upper panel: mean expression
    expr_ax = fig.add_subplot(layout[0])
    sns.heatmap(
        expr_reordered,
        cmap="coolwarm",
        linewidths=0.5, linecolor='gray',
        vmin=-vmax, vmax=vmax, center=0,
        ax=expr_ax,
        cbar_kws={"label": "Average Isoform Expression"},
    )
    expr_ax.set_xlabel("New Isoforms")
    expr_ax.set_ylabel("Cell Type")
    expr_ax.set_xticklabels(expr_ax.get_xticklabels(), rotation=90)

    # --- Lower panel: isoform co-expression (correlation)
    corr_ax = fig.add_subplot(layout[1])
    sns.heatmap(
        corr_matrix,
        cmap="coolwarm",
        linewidths=0.5, linecolor='gray',
        vmin=-1, vmax=1, center=0,
        ax=corr_ax,
        xticklabels=True, yticklabels=True,
    )
    corr_ax.set_xlabel("New Isoforms (Co-expression)")
    corr_ax.set_ylabel("New Isoforms (Co-expression)")
    corr_ax.set_xticklabels(corr_ax.get_xticklabels(), rotation=90)
    corr_ax.set_yticklabels(corr_ax.get_yticklabels(), rotation=0)

    plt.tight_layout()
    plt.savefig(filename, dpi=600, transparent=True, bbox_inches="tight")
    plt.show()

# Two-panel figure per subset, both with colour limit 2.
# NOTE(review): the output names end in "_TCellsOnly", but df_known / df_novel
# are not subset to T cells in the visible code — confirm the intended input.
for _subset, _pdf_path in (
    (df_known,
     "Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/Summary_Plots/new_isoform_known_celltype_heatmap_clustered_coexpression_TCellsOnly.pdf"),
    (df_novel,
     "Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/Summary_Plots/new_isoform_novel_celltype_heatmap_clustered_coexpression_TCellsOnly.pdf"),
):
    plot_clustermap_with_coexpression(_subset, _pdf_path, vmax=2)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Calculate isoforms expressed per cell
X = adata_i_filtered.X

# If sparse, convert to dense
if hasattr(X, "toarray"):
    X = X.toarray()

# Number of isoforms with expression > 0 in each cell, stored on .obs
isoform_counts = (X > 0).sum(axis=1).ravel()  # ensure flat array
adata_i_filtered.obs['n_isoforms_expressed'] = isoform_counts

# Create dataframe for plotting
df = adata_i_filtered.obs[['gen_cell_type', 'n_isoforms_expressed']].copy()

# Define cell type order and colors
cell_type_order = ["TCells", "NK Cells", "BCells", "Monocyte-derived", "Megakaryocytes"]
color_map = {
    "TCells": "#4daf4a",
    "NK Cells": "#fdb462",
    "BCells": "#377eb8",
    "Monocyte-derived": "#ff7f00",
    "Megakaryocytes": "#a6cee3"
}

# Plot
plt.figure(figsize=(6, 4))

# The palette is tied to an explicit hue (same variable as x, legend hidden),
# as the other violin plots in this notebook do; passing palette without hue is
# deprecated in recent seaborn. hue_order restricts the hue levels to the cell
# types that have an entry in color_map.
sns.violinplot(data=df, x='gen_cell_type', y='n_isoforms_expressed',
               hue='gen_cell_type', order=cell_type_order, hue_order=cell_type_order,
               palette=color_map, legend=False, inner="box", cut=0)

plt.xlabel("Cell Type")
plt.ylabel("Number of isoforms expressed per cell")
plt.xticks(rotation=45, ha='right')
sns.despine()

plt.tight_layout()
#plt.savefig("Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/Summary_Plots/isoform_expression_by_celltype_violin.pdf",
 #           dpi=600, transparent=True, bbox_inches="tight")
plt.show()

In [None]:
# Isoform IDs whose name contains the Ensembl gene ID ENSG00000243829
isos_RPS9P4 = [gene for gene in adata_i_filtered.var_names if "ENSG00000243829" in gene]

# Determine grid size: adjust ncols as needed.
n_genes = len(isos_RPS9P4)
ncols = 3
nrows = (n_genes + ncols - 1) // ncols

if n_genes == 0:
    # Without this guard plt.subplots(nrows=0, ...) fails and the clean-up loop
    # below would reference an undefined loop index.
    print("No isoforms matching ENSG00000243829 found; nothing to plot.")
else:
    # squeeze=False always yields a 2-D array of axes, so flatten() is safe
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 5, nrows * 4), squeeze=False)
    axes = axes.flatten()

    # One UMAP per isoform, colored by that isoform's expression.
    for i, gene in enumerate(isos_RPS9P4):
        sc.pl.umap(
            adata_i_filtered,
            color=gene,
            use_raw=False,
            title=f"{gene}",
            cmap="viridis",
            show=False,
            ax=axes[i],
            #vmax = 1
        )

    # Hide any extra subplots (if nrows*ncols > number of isoforms)
    for j in range(n_genes, len(axes)):
        axes[j].set_visible(False)

    fig.tight_layout()
    plt.show()

In [None]:
import scanpy as sc
import pandas as pd

# Per-cell-type median expression of every feature whose name contains "Bambu",
# computed separately for the gene-level and isoform-level objects.
# NOTE: the result variables are named avg_*, but the statistic is the median.

# --- Gene level -------------------------------------------------------------
bambu_genes = [name for name in adata_g_filtered.var_names if "Bambu" in name]
adata_g_bambu = adata_g_filtered[:, bambu_genes].copy()

# .X as a DataFrame (cells as rows, features as columns) plus the cell type
df_expr_g = adata_g_bambu.to_df()
df_expr_g['gen_cell_type'] = adata_g_bambu.obs['gen_cell_type'].values
#df_expr_g['sub_cell_type'] = adata_g_bambu.obs['sub_cell_type'].values

# Only the numeric (expression) columns enter the aggregation
numeric_cols_g = df_expr_g.select_dtypes(include=[np.number]).columns
avg_expr_by_cluster_gen_g = df_expr_g.groupby("gen_cell_type")[numeric_cols_g].median()
#avg_expr_by_cluster_sub_g = df_expr_g.groupby("sub_cell_type")[numeric_cols_g].median()

# --- Isoform level ----------------------------------------------------------
bambu_isos = [name for name in adata_i_filtered.var_names if "Bambu" in name]
adata_i_bambu = adata_i_filtered[:, bambu_isos].copy()

df_expr_i = adata_i_bambu.to_df()
df_expr_i['gen_cell_type'] = adata_i_bambu.obs['gen_cell_type'].values
#df_expr_i['sub_cell_type'] = adata_i_bambu.obs['sub_cell_type'].values

numeric_cols_i = df_expr_i.select_dtypes(include=[np.number]).columns
avg_expr_by_cluster_gen_i = df_expr_i.groupby("gen_cell_type")[numeric_cols_i].median()
#avg_expr_by_cluster_sub_i = df_expr_i.groupby("sub_cell_type")[numeric_cols_i].median()

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import numpy as np
import os

# Define genes
genes_to_plot = bambu_genes
outdir = "Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes"
os.makedirs(outdir, exist_ok=True)

n_genes = len(genes_to_plot)
ncols = 2
# At least one row so plt.subplots does not fail when the gene list is empty
nrows = max(1, (n_genes + ncols - 1) // ncols)

# squeeze=False always yields a 2-D array of axes, so flatten() is safe
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 5, nrows * 4), squeeze=False)
axes = axes.flatten()

for i, gene in enumerate(genes_to_plot):

    if gene not in adata_g_filtered.var_names:
        print(f"{gene} not found, skipping.")
        axes[i].set_visible(False)
        continue

    safe_gene_name = gene.replace(":", "_")
    expr = adata_g_filtered[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Colour limit: 99th percentile of the non-zero values
    expr_nonzero = expr[expr != 0]
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0
    # TwoSlopeNorm needs vmin < vcenter < vmax; fall back to 1.0 when the
    # percentile is non-positive or not finite
    if not (np.isfinite(vmax_val) and vmax_val > 0):
        vmax_val = 1.0
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric diverging scale centred on 0
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    # Plot in GRID
    with plt.rc_context({'figure.figsize': (5, 5)}):
        sc.pl.embedding(
            adata_g_filtered,
            basis="umap",
            color=gene,
            use_raw=False,
            title=gene,
            cmap="coolwarm",
            ax=axes[i],
            show=False,
            norm=norm,
            frameon=False
        )

    # Plot INDIVIDUAL and save
    fig_indiv = sc.pl.embedding(
        adata_g_filtered,
        basis="umap",
        color=gene,
        use_raw=False,
        title=gene,
        cmap="coolwarm",
        show=False,
        return_fig=True,
        norm=norm,
        frameon=False
    )

    fig_path = os.path.join(outdir, f"{safe_gene_name}_UMAP.pdf")
    fig_indiv.savefig(fig_path, dpi=300)
    plt.close(fig_indiv)
    print(f"Saved individual: {fig_path}")

# Hide extra axes in grid (counted from the number of genes, so this also
# works when the gene list is empty)
for j in range(n_genes, len(axes)):
    axes[j].set_visible(False)

# Address the grid figure explicitly: individual figures were opened and closed
# in the loop, so the implicit "current figure" is not a reliable handle
fig.tight_layout()

# Save GRID
grid_path = os.path.join(outdir, "Bambu_Genes_Grid.pdf")
fig.savefig(grid_path, dpi=300)
print(f"Saved grid figure: {grid_path}")

plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import os

# Create output directory
output_dir = "Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/Violin_Plots"
os.makedirs(output_dir, exist_ok=True)

# Select genes to plot
genes_to_plot = bambu_genes  # Or bambu_genes[:10] for just the first 10 genes

# One violin plot per gene: expression distribution within each cell type
for gene in genes_to_plot:

    # Flatten the single-feature column; sparse matrices have no .flatten(),
    # so densify first when needed
    expr = adata_g_bambu[:, gene].X
    expr = expr.toarray().ravel() if hasattr(expr, "toarray") else np.ravel(expr)

    plot_df = pd.DataFrame({
        "Expression": expr,
        "gen_cell_type": adata_g_bambu.obs["gen_cell_type"]
    })

    # Plot
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.violinplot(data=plot_df, x="gen_cell_type", y="Expression",
                   hue="gen_cell_type", legend=False, inner="quartile", cut=0, ax=ax)

    ax.set_title(gene, fontsize=12, fontweight='bold')
    ax.set_xlabel("Cell Type")
    ax.set_ylabel("Expression")
    plt.setp(ax.get_xticklabels(), rotation=45, fontsize=9)
    plt.setp(ax.get_yticklabels(), fontsize=9)
    fig.tight_layout()

    # Save individual plot, show it, then release the figure so memory does
    # not grow with the number of genes
    filename = os.path.join(output_dir, f"violin_plot_{gene}.pdf")
    fig.savefig(filename, dpi=600, transparent=True, bbox_inches="tight")
    plt.show()
    plt.close(fig)

print("All violin plots saved.")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import os

# Create output directory
output_dir = "Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/Violin_Plots"
os.makedirs(output_dir, exist_ok=True)

# Select isoforms to plot
genes_to_plot = bambu_isos  # Or bambu_isos[:10] for just the first 10 isoforms

# One violin plot per isoform: expression distribution within each cell type
for gene in genes_to_plot:

    # Flatten the single-feature column; sparse matrices have no .flatten(),
    # so densify first when needed
    expr = adata_i_bambu[:, gene].X
    expr = expr.toarray().ravel() if hasattr(expr, "toarray") else np.ravel(expr)

    plot_df = pd.DataFrame({
        "Expression": expr,
        "gen_cell_type": adata_i_bambu.obs["gen_cell_type"]
    })

    # Plot
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.violinplot(data=plot_df, x="gen_cell_type", y="Expression",
                   hue="gen_cell_type", legend=False, inner="quartile", cut=0, ax=ax)

    ax.set_title(gene, fontsize=12, fontweight='bold')
    ax.set_xlabel("Cell Type")
    ax.set_ylabel("Expression")
    plt.setp(ax.get_xticklabels(), rotation=45, fontsize=9)
    plt.setp(ax.get_yticklabels(), fontsize=9)
    fig.tight_layout()

    # Save individual plot, show it, then release the figure so memory does
    # not grow with the number of isoforms
    filename = os.path.join(output_dir, f"violin_plot_{gene}.pdf")
    fig.savefig(filename, dpi=600, transparent=True, bbox_inches="tight")
    plt.show()
    plt.close(fig)

print("All violin plots saved.")

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import numpy as np
import os

# Define isoforms
genes_to_plot = bambu_isos
outdir = "Intermediate_Files/Clustering_05012025/Figures/Bambu_Isoform"
os.makedirs(outdir, exist_ok=True)

n_genes = len(genes_to_plot)
ncols = 2
# At least one row so plt.subplots does not fail when the isoform list is empty
nrows = max(1, (n_genes + ncols - 1) // ncols)

# squeeze=False always yields a 2-D array of axes, so flatten() is safe
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 5, nrows * 4), squeeze=False)
axes = axes.flatten()

for i, gene in enumerate(genes_to_plot):

    if gene not in adata_i_filtered.var_names:
        print(f"{gene} not found, skipping.")
        axes[i].set_visible(False)
        continue

    safe_gene_name = gene.replace(":", "_")
    expr = adata_i_filtered[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Colour limit: 99th percentile of the non-zero values
    expr_nonzero = expr[expr != 0]
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0
    # TwoSlopeNorm needs vmin < vcenter < vmax; fall back to 1.0 when the
    # percentile is non-positive or not finite
    if not (np.isfinite(vmax_val) and vmax_val > 0):
        vmax_val = 1.0
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric diverging scale centred on 0
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    # Plot in GRID
    with plt.rc_context({'figure.figsize': (5, 5)}):
        sc.pl.embedding(
            adata_i_filtered,
            basis="umap",
            color=gene,
            use_raw=False,
            title=gene,
            cmap="coolwarm",
            ax=axes[i],
            show=False,
            norm=norm,
            frameon=False
        )

    # Plot INDIVIDUAL and save
    fig_indiv = sc.pl.embedding(
        adata_i_filtered,
        basis="umap",
        color=gene,
        use_raw=False,
        title=gene,
        cmap="coolwarm",
        show=False,
        return_fig=True,
        norm=norm,
        frameon=False
    )

    fig_path = os.path.join(outdir, f"{safe_gene_name}_UMAP.pdf")
    fig_indiv.savefig(fig_path, dpi=300)
    plt.close(fig_indiv)
    print(f"Saved individual: {fig_path}")

# Hide extra axes in grid (counted from the number of isoforms, so this also
# works when the isoform list is empty)
for j in range(n_genes, len(axes)):
    axes[j].set_visible(False)

# Address the grid figure explicitly: individual figures were opened and closed
# in the loop, so the implicit "current figure" is not a reliable handle
fig.tight_layout()

# Save GRID
grid_path = os.path.join(outdir, "Bambu_Isoforms_Grid.pdf")
fig.savefig(grid_path, dpi=300)
print(f"Saved grid figure: {grid_path}")

plt.show()

In [None]:
# Independent copy of the gene-level object restricted to cells labelled "TCells"
_is_tcell_gene_level = adata_g_filtered.obs["gen_cell_type"] == "TCells"
adata_TCell = adata_g_filtered[_is_tcell_gene_level].copy()

In [None]:
# Independent copy of the isoform-level object restricted to cells labelled "TCells"
_is_tcell_isoform_level = adata_i_filtered.obs["gen_cell_type"] == "TCells"
adata_TCell_i = adata_i_filtered[_is_tcell_isoform_level].copy()

In [None]:
def find_matching_genes(prefixes, gene_list):
    """Return the entries of gene_list that start with at least one prefix.

    Parameters
    ----------
    prefixes : iterable of str prefixes to test against each name.
    gene_list : iterable of str names (e.g. var_names in combined-ID format).

    Returns
    -------
    list of matching names, in the order they occur in gene_list.
    """
    # str.startswith accepts a tuple and is true when any element matches
    prefix_options = tuple(prefixes)
    return [name for name in gene_list if name.startswith(prefix_options)]

# Marker prefix lists, one per T-cell population. Each entry is a name prefix
# tested with str.startswith by find_matching_genes; a trailing ":" makes the
# prefix end at the separator, while an entry without ":" also matches any
# longer name that begins with the same characters.
# NOTE(review): several entries lack the trailing ":" ("CD3Z", "FOXP1", "SELL",
# "CCL5", "STAT5A", "GATA3", "IL2RA"), and "GATA3" appears both with and
# without it — confirm whether the broader match is intended.
# NOTE(review): the original note read "PTPRCAP = CD45"; CD45 is usually the
# product of PTPRC, with PTPRCAP being the CD45-associated protein — confirm
# which gene is intended here.
#PTPRCAP = CD45
TCell_Markers = ["CD4:", "CD3D:", "CD3E:", "CD3G:", "CD3Z", "CD8A:", "CD8B:", "PTPRCAP:"]
Naive_TCell = ["PTPRC:", "TCF7:", "FOXP1", "LEF1:", "PECAM1:"]
Memory_TCell = ["CCR5:", "HLA-DRB1:", "HLA-DRA:", "ITGAE:", "SELL", "TCF7:", "IL7R:", "CCR7:"]
CD8_TCell = ["CD8A:", "CD8B:", "CXCR3:", "KLRB1:", "PTGDR2:", "GATA3:", "IRF4:", "RORC:", "CCL5"]
# Disabled populations, kept for reference:
#Central_Memory_TCell = ["CCR5:", "IL7RA:", "EOMES:", "PRDM1:", "IL7R:", "SELL", "CCR7:"]
#Effector_Memory_TCell = ["CCR5:", "HLA-DRB1:", "HLA-DRA:", "ITGAL:", "GZMA:", "PRDM1:", "SELL"]
Reg_TCell = ["FOXP3:", "IL2RA:", "CTLA4:", "STAT5A"]
#Th1_TCell = ["CXCR3:", "IFNG:", "TNF:", "STAT4:"]
# This list is reported under the label "Helper T-Cells" when matches are printed
CD4_Effector = ["CXCR3:", "TNF:", "STAT4:", "IL17A:", "IL13:", "IL25:", "AHR:", "FOXO4:", "GATA3", "IL2RA"]

# Resolve every marker list against the gene-level feature names first, then
# report the matches (same report order: populations 1-6)
_gene_names = adata_g_filtered.var_names
TCell_genes_1 = find_matching_genes(TCell_Markers, _gene_names)
TCell_genes_2 = find_matching_genes(Naive_TCell, _gene_names)
TCell_genes_3 = find_matching_genes(Memory_TCell, _gene_names)
TCell_genes_4 = find_matching_genes(CD8_TCell, _gene_names)
TCell_genes_5 = find_matching_genes(Reg_TCell, _gene_names)
TCell_genes_6 = find_matching_genes(CD4_Effector, _gene_names)

print(f"Matched {len(TCell_genes_1)} gene-level IDs for T-Cells: {TCell_genes_1}")
print(f"Matched {len(TCell_genes_2)} gene-level IDs for Naive T-Cells: {TCell_genes_2}")
print(f"Matched {len(TCell_genes_3)} gene-level IDs for Memory T-Cells: {TCell_genes_3}")
print(f"Matched {len(TCell_genes_4)} gene-level IDs for CD8+ T-Cells: {TCell_genes_4}")
print(f"Matched {len(TCell_genes_5)} gene-level IDs for Regulatory T-Cells: {TCell_genes_5}")
print(f"Matched {len(TCell_genes_6)} gene-level IDs for Helper T-Cells: {TCell_genes_6}")


# Same again for the isoform-level feature names
_isoform_names = adata_i_filtered.var_names
TCell_iso_1 = find_matching_genes(TCell_Markers, _isoform_names)
TCell_iso_2 = find_matching_genes(Naive_TCell, _isoform_names)
TCell_iso_3 = find_matching_genes(Memory_TCell, _isoform_names)
TCell_iso_4 = find_matching_genes(CD8_TCell, _isoform_names)
TCell_iso_5 = find_matching_genes(Reg_TCell, _isoform_names)
TCell_iso_6 = find_matching_genes(CD4_Effector, _isoform_names)

print(f"Matched {len(TCell_iso_1)} isoform-level IDs for T-Cells: {TCell_iso_1}")
print(f"Matched {len(TCell_iso_2)} isoform-level IDs for Naive T-Cells: {TCell_iso_2}")
print(f"Matched {len(TCell_iso_3)} isoform-level IDs for Memory T-Cells: {TCell_iso_3}")
print(f"Matched {len(TCell_iso_4)} isoform-level IDs for CD8+ T-Cells: {TCell_iso_4}")
print(f"Matched {len(TCell_iso_5)} isoform-level IDs for Regulatory T-Cells: {TCell_iso_5}")
print(f"Matched {len(TCell_iso_6)} isoform-level IDs for Helper T-Cells: {TCell_iso_6}")

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import numpy as np
import os

# Output directory for the per-gene UMAPs of this marker group
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "General TCell (TCell Alone)"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # figures saved via `save=` go to this directory

print("General T-Cell Markers")

# One UMAP per matched marker, coloured by that marker's expression in adata_TCell
for gene in TCell_genes_1:
    # Keep ':' out of the file name
    safe_gene_name = gene.replace(":", "_")

    # Expression of this gene across all plotted cells, as a flat dense vector
    expr = adata_TCell[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only so zeros do not pull the percentile down
    expr_nonzero = expr[expr > 0]

    # Colour limit: 99th percentile of the non-zero values, or 1.0 when no cell
    # expresses the gene (an empty array has no usable percentile)
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero. vmax_val is reused as computed above;
    # recomputing the percentile here would throw away the 1.0 fallback.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centred on zero by the norm
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # << This handles vmin, vmax, and vcenter
    )

In [None]:
# Output directory for the per-gene UMAPs of this marker group
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "Naive TCell (TCell Alone)"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # figures saved via `save=` go to this directory

print("Naive T-Cell Markers")

# One UMAP per matched marker, coloured by that marker's expression in adata_TCell
for gene in TCell_genes_2:
    # Keep ':' out of the file name
    safe_gene_name = gene.replace(":", "_")

    # Expression of this gene across all plotted cells, as a flat dense vector
    expr = adata_TCell[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only so zeros do not pull the percentile down
    expr_nonzero = expr[expr > 0]

    # Colour limit: 99th percentile of the non-zero values, or 1.0 when no cell
    # expresses the gene (an empty array has no usable percentile)
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero. vmax_val is reused as computed above;
    # recomputing the percentile here would throw away the 1.0 fallback.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centred on zero by the norm
        # Leading "_" matches the other marker cells, so the file is named
        # umap_<gene>.pdf rather than umap<gene>.pdf
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # << This handles vmin, vmax, and vcenter
    )

In [None]:
print("Memory T-Cell Markers")

# Output directory for the per-gene UMAPs of this marker group
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "Memory TCell (TCell Alone)"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # figures saved via `save=` go to this directory

# One UMAP per matched marker, coloured by that marker's expression in adata_TCell
for gene in TCell_genes_3:
    # Keep ':' out of the file name
    safe_gene_name = gene.replace(":", "_")

    # Expression of this gene across all plotted cells, as a flat dense vector
    expr = adata_TCell[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only so zeros do not pull the percentile down
    expr_nonzero = expr[expr > 0]

    # Colour limit: 99th percentile of the non-zero values, or 1.0 when no cell
    # expresses the gene (an empty array has no usable percentile)
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero. vmax_val is reused as computed above;
    # recomputing the percentile here would throw away the 1.0 fallback.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centred on zero by the norm
        # Leading "_" matches the other marker cells, so the file is named
        # umap_<gene>.pdf rather than umap<gene>.pdf
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # << This handles vmin, vmax, and vcenter
    )

In [None]:
# Markers whose expression is summed into a single per-cell score
genes = ['TCF7:ENSG00000081059', 'CCR7:ENSG00000126353', 'SELL:ENSG00000188404']

# Summed expression per cell. Densify only when .X is sparse, so the cell
# also runs when the matrix is already a dense array.
marker_expr = adata_TCell[:, genes].X
if hasattr(marker_expr, "toarray"):
    marker_expr = marker_expr.toarray()
adata_TCell.obs['TMem_Markers_Combined'] = np.asarray(marker_expr).sum(axis=1)

# Symmetric colour limits around zero; the 100th percentile is the maximum score
color_limit = np.percentile(adata_TCell.obs['TMem_Markers_Combined'], 100)

# Plot UMAP
sc.pl.umap(
    adata_TCell,
    color='TMem_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Memory TCell Marker Combined Expression",
    vmin=-color_limit,
    vmax=color_limit,
    show=False  # keep the figure open so it can be resized and saved below
)

# Adjust figure size
fig = plt.gcf()
fig.set_size_inches(4, 3)

# Clean up axis
ax = plt.gca()
ax.set_title("Memory TCell Aggregated Gene Marker Expression")  # Replaces the title set by sc.pl.umap
ax.set_xticks([])
ax.set_yticks([])

# Add LEFT SIDE annotation (as a side title)
#fig.text(-0.03, 0.53, "Total Memory TCell\nGene Marker Expression", 
#         va='center', ha='center', rotation=90, fontsize=11)


# Save final figure
plt.tight_layout()
plt.savefig("Intermediate_Files/Clustering_05012025/Figures/UMAP/umap_TMem_TCellOnly_combined_expression_gene.pdf",
            dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
print("CD8+ T-Cell Markers")

# Output directory for the per-gene UMAPs of this marker group
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "CD8 TCell (TCell Alone)"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # figures saved via `save=` go to this directory


# One UMAP per matched marker, coloured by that marker's expression in adata_TCell
for gene in TCell_genes_4:
    # Keep ':' out of the file name
    safe_gene_name = gene.replace(":", "_")

    # Expression of this gene across all plotted cells, as a flat dense vector
    expr = adata_TCell[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only so zeros do not pull the percentile down
    expr_nonzero = expr[expr > 0]

    # Colour limit: 99th percentile of the non-zero values, or 1.0 when no cell
    # expresses the gene (an empty array has no usable percentile)
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero. vmax_val is reused as computed above;
    # recomputing the percentile here would throw away the 1.0 fallback.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centred on zero by the norm
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # << This handles vmin, vmax, and vcenter
    )

In [None]:
# Markers whose expression is summed into a single per-cell score
genes = ['GATA3:ENSG00000107485', 'KLRB1:ENSG00000111796', 'CD8A:ENSG00000153563',
        'CD8B:ENSG00000172116', 'CCL5:ENSG00000271503']

# Summed expression per cell. Densify only when .X is sparse, so the cell
# also runs when the matrix is already a dense array.
marker_expr = adata_TCell[:, genes].X
if hasattr(marker_expr, "toarray"):
    marker_expr = marker_expr.toarray()
adata_TCell.obs['CD8_TCell_Markers_Combined'] = np.asarray(marker_expr).sum(axis=1)

# Symmetric colour limits around zero; the 100th percentile is the maximum score
color_limit = np.percentile(adata_TCell.obs['CD8_TCell_Markers_Combined'], 100)

# Plot UMAP
sc.pl.umap(
    adata_TCell,
    color='CD8_TCell_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Cytotoxic TCell Marker Combined Expression",
    vmin=-color_limit,
    vmax=color_limit,
    show=False  # keep the figure open so it can be resized and saved below
)

# Adjust figure size
fig = plt.gcf()
fig.set_size_inches(4, 3)

# Clean up axis
ax = plt.gca()
ax.set_title("Cytotoxic TCell Aggregate Gene Marker Expression")  # Replaces the title set by sc.pl.umap
ax.set_xticks([])
ax.set_yticks([])

# Add LEFT SIDE annotation (as a side title)
#fig.text(-0.03, 0.53, "Cytotoxic TCell Aggregate Gene Marker Expression", 
#         va='center', ha='center', rotation=90, fontsize=11)


# Save final figure
plt.tight_layout()
plt.savefig("Intermediate_Files/Clustering_05012025/Figures/UMAP/umap_CytotoxicTCell_TCellsOnly_combined_expression_gene_aggregate.pdf",
            dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
# Markers whose expression is summed into a single per-cell score
genes = ['CD3D:ENSG00000167286', 'CD3E:ENSG00000198851', 'CD3G:ENSG00000160654',
         'GATA3:ENSG00000107485', 'KLRB1:ENSG00000111796', 'CD8A:ENSG00000153563',
        'CD8B:ENSG00000172116', 'CCL5:ENSG00000271503']

# Summed expression per cell. Densify only when .X is sparse, so the cell
# also runs when the matrix is already a dense array.
marker_expr = adata_g_filtered[:, genes].X
if hasattr(marker_expr, "toarray"):
    marker_expr = marker_expr.toarray()
adata_g_filtered.obs['CD8_TCell_Markers_Combined'] = np.asarray(marker_expr).sum(axis=1)

# Symmetric colour limits around zero, clipped at the 99th percentile of the score
color_limit = np.percentile(adata_g_filtered.obs['CD8_TCell_Markers_Combined'], 99)

# Plot UMAP
sc.pl.umap(
    adata_g_filtered,
    color='CD8_TCell_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Cytotoxic TCell Marker Combined Expression",
    vmin=-color_limit,
    vmax=color_limit,
    show=False  # keep the figure open so it can be resized and saved below
)

# Adjust figure size
fig = plt.gcf()
fig.set_size_inches(4, 2.4)

# Clean up axis
ax = plt.gca()
ax.set_title("")  # Remove automatic title
ax.set_xticks([])
ax.set_yticks([])

# Add LEFT SIDE annotation (as a side title)
fig.text(-0.03, 0.53, "Total Cytotoxic TCell\nGene Marker Expression", 
         va='center', ha='center', rotation=90, fontsize=11)


# Save final figure
plt.tight_layout()
plt.savefig("Intermediate_Files/Clustering_05012025/Figures/UMAP/umap_CytotoxicTCell_combined_expression_gene.pdf",
            dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
print("Regulatory T-Cell Markers")

# Output directory for the per-gene UMAPs of this marker group
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "Regulatory TCell"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # figures saved via `save=` go to this directory


# One UMAP per matched marker, coloured by that marker's expression in adata_g_filtered
for gene in TCell_genes_5:
    # Keep ':' out of the file name
    safe_gene_name = gene.replace(":", "_")

    # Expression of this gene across all plotted cells, as a flat dense vector
    expr = adata_g_filtered[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only so zeros do not pull the percentile down
    expr_nonzero = expr[expr > 0]

    # Colour limit: 99th percentile of the non-zero values, or 1.0 when no cell
    # expresses the gene (an empty array has no usable percentile)
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero. vmax_val is reused as computed above;
    # recomputing the percentile here would throw away the 1.0 fallback.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_g_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centred on zero by the norm
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # << This handles vmin, vmax, and vcenter
    )

In [None]:
# Keep only the matched regulatory markers whose ID contains one of these symbols
Reg_filtered = [i for i in TCell_genes_5 if any(gene in i for gene in ["FOXP3", "STAT5A", "IL2RA", "CTLA4"])]

# Safety: keep only markers that are present in adata_TCell.var_names
genes = [g for g in Reg_filtered if g in adata_TCell.var_names]

# Calculate summed expression per cell (densify only when .X is sparse)
X = adata_TCell[:, genes].X
if hasattr(X, "toarray"):
    X = X.toarray()

adata_TCell.obs['Reg_TCell_Gene_Markers_Combined'] = X.sum(axis=1)

# Symmetric colour limits around zero; the 100th percentile is the maximum score
color_limit = np.percentile(adata_TCell.obs['Reg_TCell_Gene_Markers_Combined'], 100)

# Plot UMAP
sc.pl.umap(
    adata_TCell,
    color='Reg_TCell_Gene_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="Regulatory TCell Gene Marker Expression",
    vmin=-color_limit,
    vmax=color_limit,
    show=False  # keep the figure open so it can be resized and saved below
)

# Adjust figure size
fig = plt.gcf()
fig.set_size_inches(4, 3)

# Clean up axis
ax = plt.gca()
# Replaces the title set by sc.pl.umap ("Aggrgate" typo in the figure title fixed)
ax.set_title("Total Regulatory TCell Aggregate Gene Marker Expression")
ax.set_xticks([])
ax.set_yticks([])

# Add LEFT annotation
#fig.text(-0.03, 0.53, "Total Regulatory TCell Aggregate Gene Marker Expression", 
#         va='center', ha='center', rotation=90, fontsize=11)

# Save figure
plt.tight_layout()
plt.savefig("Intermediate_Files/Clustering_05012025/Figures/UMAP/umap_RegulatoryTCell_TCellsOnly_combined_expression_gene_aggregate.pdf",
            dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
print("Helper T-Cell Markers")
# Output directory for the per-gene UMAPs of this marker group
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "Helper (CD4) TCell"
output_dir = os.path.join(marker_root, f"Gene_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # figures saved via `save=` go to this directory


# One UMAP per matched marker, coloured by that marker's expression in adata_g_filtered
for gene in TCell_genes_6:
    # Keep ':' out of the file name
    safe_gene_name = gene.replace(":", "_")

    # Expression of this gene across all plotted cells, as a flat dense vector
    expr = adata_g_filtered[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only so zeros do not pull the percentile down
    expr_nonzero = expr[expr > 0]

    # Colour limit: 99th percentile of the non-zero values, or 1.0 when no cell
    # expresses the gene (an empty array has no usable percentile)
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero. vmax_val is reused as computed above;
    # recomputing the percentile here would throw away the 1.0 fallback.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_g_filtered,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centred on zero by the norm
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # << This handles vmin, vmax, and vcenter
    )

In [None]:
# Markers whose expression is summed into a single per-cell score
genes = ['GATA3:ENSG00000107485', 'CD4:ENSG00000010610', 'AHR:ENSG00000106546',
        'IL2RA:ENSG00000134460']

# Summed expression per cell. Densify only when .X is sparse, so the cell
# also runs when the matrix is already a dense array.
marker_expr = adata_TCell[:, genes].X
if hasattr(marker_expr, "toarray"):
    marker_expr = marker_expr.toarray()
adata_TCell.obs['CD4_Effector_TCell_Markers_Combined'] = np.asarray(marker_expr).sum(axis=1)

# Symmetric colour limits around zero; the 100th percentile is the maximum score
color_limit = np.percentile(adata_TCell.obs['CD4_Effector_TCell_Markers_Combined'], 100)

# Plot UMAP
sc.pl.umap(
    adata_TCell,
    color='CD4_Effector_TCell_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="CD4+ Effector TCell Marker Combined Expression",
    vmin=-color_limit,
    vmax=color_limit,
    show=False  # keep the figure open so it can be resized and saved below
)

# Adjust figure size
fig = plt.gcf()
fig.set_size_inches(4, 3)

# Clean up axis
ax = plt.gca()
ax.set_title("CD4+ Effector TCell Aggregate Gene Marker Expression")  # Replaces the title set by sc.pl.umap
ax.set_xticks([])
ax.set_yticks([])

# Add LEFT SIDE annotation (as a side title)
#fig.text(-0.03, 0.53, "CD4+ Effector TCell Aggregate Gene Marker Expression", 
#         va='center', ha='center', rotation=90, fontsize=11)


# Save final figure
plt.tight_layout()
plt.savefig("Intermediate_Files/Clustering_05012025/Figures/UMAP/umap_CD4EffectorTCell_TCellsOnlycombined_expression_gene_aggregate.pdf",
            dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
# Markers whose expression is summed into a single per-cell score
genes = ['CD3D:ENSG00000167286', 'CD3E:ENSG00000198851', 'CD3G:ENSG00000160654',
         'GATA3:ENSG00000107485', 'CD4:ENSG00000010610', 'AHR:ENSG00000106546',
        'IL2RA:ENSG00000134460']

# Summed expression per cell. Densify only when .X is sparse, so the cell
# also runs when the matrix is already a dense array.
marker_expr = adata_g_filtered[:, genes].X
if hasattr(marker_expr, "toarray"):
    marker_expr = marker_expr.toarray()
adata_g_filtered.obs['CD4_Effector_TCell_Markers_Combined'] = np.asarray(marker_expr).sum(axis=1)

# Symmetric colour limits around zero; the 100th percentile is the maximum score
color_limit = np.percentile(adata_g_filtered.obs['CD4_Effector_TCell_Markers_Combined'], 100)

# Plot UMAP
sc.pl.umap(
    adata_g_filtered,
    color='CD4_Effector_TCell_Markers_Combined',
    cmap="coolwarm",
    frameon=True,
    title="CD4+ Effector TCell Marker Combined Expression",
    vmin=-color_limit,
    vmax=color_limit,
    show=False  # keep the figure open so it can be resized and saved below
)

# Adjust figure size
fig = plt.gcf()
fig.set_size_inches(4, 2.4)

# Clean up axis
ax = plt.gca()
ax.set_title("CD4+ Effector TCell Aggregate Gene Marker Expression")  # Replaces the title set by sc.pl.umap
ax.set_xticks([])
ax.set_yticks([])

# Add LEFT SIDE annotation (as a side title)
#fig.text(-0.03, 0.53, "CD4+ Effector TCell Aggregate Gene Marker Expression", 
#         va='center', ha='center', rotation=90, fontsize=11)


# Save final figure
plt.tight_layout()
plt.savefig("Intermediate_Files/Clustering_05012025/Figures/UMAP/umap_CD4EffectorTCell_combined_expression_gene_aggregate.pdf",
            dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
# Manual annotation of the clusters stored in obs["0.5_log_AutoZI"].
# Keys are cluster labels as strings. Any cluster not listed here gets no
# subtype (NaN) from the .map() call below.

tcell_cluster_mapping = {
    "0": "Memory TCells",
    "1": "Effector CD4 TCells #1",
    "2": "Effector CD8 TCells",
    "6": "Effector CD4 TCells #2",
    "7": "Effector CD4 TCells #3"
}

# Create new cell type annotations
adata_TCell.obs["TCell_subtype"] = adata_TCell.obs["0.5_log_AutoZI"].astype(str).map(tcell_cluster_mapping)

# Check assignments. dropna=False also counts cells from unmapped clusters,
# which the default value_counts() would silently hide.
adata_TCell.obs["TCell_subtype"].value_counts(dropna=False)

In [None]:
import matplotlib.pyplot as plt
import scanpy as sc

# Preferred legend order; plotted groups not named here are appended afterwards
legend_order = ["Memory TCells", "Effector CD8 TCells", "Effector CD4 TCells #1", "Effector CD4 TCells #2", "Effector CD4 TCells #3"]

# Plot UMAP using subtype annotation
sc.pl.umap(
    adata_TCell,
    color='TCell_subtype',
    title='',  # No title for publication version
    frameon=True,
    palette=plt.get_cmap('tab20').colors,
    legend_loc='lower right',
    legend_fontsize=10,
    legend_fontoutline=1,
    show=False  # Allow editing
)

# Adjust figure
fig = plt.gcf()
fig.set_size_inches(5,4)

# Get current legend handles and labels
ax = plt.gca()
handles, labels = ax.get_legend_handles_labels()

# Build a dictionary of current labels and handles
label_handle_dict = dict(zip(labels, handles))

# Requested order first, restricted to labels that were actually plotted ...
ordered_labels = [label for label in legend_order if label in label_handle_dict]
# ... then every remaining plotted label, so no group drops out of the legend
ordered_labels += [label for label in label_handle_dict if label not in legend_order]
ordered_handles = [label_handle_dict[label] for label in ordered_labels]

# Remove old legend (ax.get_legend() is None when the axes has no legend)
old_legend = ax.get_legend()
if old_legend is not None:
    old_legend.remove()

# Add new ordered legend
ax.legend(ordered_handles, ordered_labels,
          loc='lower center',
          bbox_to_anchor=(0.5, -0.4),  # Centered below
          fontsize=10,
          frameon=True,
          ncol=3)

plt.tight_layout()

# Save figure
plt.savefig("Intermediate_Files/Clustering_05012025/Figures/UMAP/UMAP_TCell_subtypes.pdf",
            dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
# Create the column if it doesn't exist, starting from the existing annotation
if "gen_cell_type_reannotated" not in adata_g_filtered.obs.columns:
    adata_g_filtered.obs["gen_cell_type_reannotated"] = adata_g_filtered.obs["gen_cell_type"]

# Ensure it's categorical (dtype check replaces the deprecated
# pd.api.types.is_categorical_dtype)
if not isinstance(adata_g_filtered.obs["gen_cell_type_reannotated"].dtype, pd.CategoricalDtype):
    adata_g_filtered.obs["gen_cell_type_reannotated"] = adata_g_filtered.obs["gen_cell_type_reannotated"].astype("category")

# Only cells that actually received a subtype. Cells without one (NaN) are
# left out so their existing label is not overwritten with NaN.
tcell_subtypes = adata_TCell.obs["TCell_subtype"].dropna()

# Extract actual new categories from TCell_subtype
new_categories = tcell_subtypes.astype("category").cat.categories

# Add only the categories that are not present yet. add_categories raises
# ValueError for categories that already exist, which broke re-running this cell.
existing_categories = adata_g_filtered.obs["gen_cell_type_reannotated"].cat.categories
missing_categories = [cat for cat in new_categories if cat not in existing_categories]
if missing_categories:
    adata_g_filtered.obs["gen_cell_type_reannotated"] = adata_g_filtered.obs["gen_cell_type_reannotated"].cat.add_categories(missing_categories)

# Assign new labels (aligned on cell names)
adata_g_filtered.obs.loc[tcell_subtypes.index, "gen_cell_type_reannotated"] = tcell_subtypes

# Check output
print(adata_g_filtered.obs["gen_cell_type_reannotated"].value_counts())

In [None]:
# Drop category levels that no longer label any cell (e.g. the coarse "TCells"
# level, if every T cell was given a subtype)
reannotated_labels = adata_g_filtered.obs["gen_cell_type_reannotated"]
adata_g_filtered.obs["gen_cell_type_reannotated"] = reannotated_labels.cat.remove_unused_categories()

# Per-label cell counts after the clean-up
print(adata_g_filtered.obs["gen_cell_type_reannotated"].value_counts())

In [None]:
# Plot UMAP using subtype annotation
# Preferred legend order; plotted groups not named here are appended afterwards.
# NOTE(review): "Monokaryocytes" must match the category label in
# obs["gen_cell_type_reannotated"] exactly — confirm the spelling against the data.
legend_order = ["NK Cells", "BCells", "Monokaryocytes", "Monocyte-derived", 
                "Memory TCells", "Effector CD8 TCells", "Effector CD4 TCells #1", 
                "Effector CD4 TCells #2", "Effector CD4 TCells #3"]

sc.pl.umap(
    adata_g_filtered,
    color='gen_cell_type_reannotated',
    title='',  # No title for publication version
    frameon=True,
    palette=plt.get_cmap('tab20').colors,
    legend_loc='lower right',
    legend_fontsize=10,
    legend_fontoutline=1,
    show=False  # Allow editing
)

# Adjust figure
fig = plt.gcf()
fig.set_size_inches(5,5)

# Get current legend handles and labels
ax = plt.gca()
handles, labels = ax.get_legend_handles_labels()

# Build a dictionary of current labels and handles
label_handle_dict = dict(zip(labels, handles))

# Requested order first, restricted to labels that were actually plotted ...
ordered_labels = [label for label in legend_order if label in label_handle_dict]
# ... then every remaining plotted label, so no group drops out of the legend
ordered_labels += [label for label in label_handle_dict if label not in legend_order]
ordered_handles = [label_handle_dict[label] for label in ordered_labels]

# Remove old legend (ax.get_legend() is None when the axes has no legend)
old_legend = ax.get_legend()
if old_legend is not None:
    old_legend.remove()

# Add new ordered legend
ax.legend(ordered_handles, ordered_labels,
          loc='lower center',
          bbox_to_anchor=(0.5, -0.35),  # Centered below
          fontsize=10,
          frameon=True,
          ncol=2)

plt.tight_layout()

# Save figure
plt.savefig("Intermediate_Files/Clustering_05012025/Figures/UMAP/UMAP_subtypes.pdf",
            dpi=600, transparent=True, bbox_inches="tight")

plt.show()

In [None]:
# Persist the re-annotated gene-level object.
# Note: this rebinds output_dir, which earlier cells pointed at marker-figure folders.
# NOTE(review): the file name ends in .h5mu; if adata_g_filtered is a plain
# AnnData here, confirm the on-disk format matches that extension.
output_dir = 'Intermediate_Files/Clustering_05012025/'
clustered_path = os.path.join(output_dir, "PBMC_gene_AutoZI_clustered_celltypes_reannotated_AutoZILatent_05112025.h5mu")

adata_g_filtered.write(clustered_path)

In [None]:
import scanpy as sc

# The two T-cell subtypes being contrasted
group1 = "Effector CD4 TCells #1"
group2 = "Effector CD4 TCells #2"

import numpy as np

# log1p-transformed copy of the "counts" layer, stored as its own layer
adata_TCell.layers["log1p_counts"] = np.log1p(adata_TCell.layers["counts"])

# Keep only the cells that belong to one of the two subtypes
in_comparison = adata_TCell.obs["TCell_subtype"].isin([group1, group2])
adata_sub = adata_TCell[in_comparison].copy()

# Short, fixed group names so downstream code does not depend on the subtype labels
comparison_labels = {group1: "group1", group2: "group2"}
adata_sub.obs["comparison_group"] = adata_sub.obs["TCell_subtype"].map(comparison_labels)

# Wilcoxon rank-sum test of every group against the "group1" baseline
sc.tl.rank_genes_groups(
    adata_sub,
    groupby="comparison_group",
    reference="group1",
    method="wilcoxon",
    use_raw=False,
    layer="log1p_counts",  # the log1p-transformed counts layer created above
    pts=True,
)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from adjustText import adjust_text

# Ranking results stored on adata_sub for the "group2" entry
de_results = sc.get.rank_genes_groups_df(adata_sub, group="group2")

# Adjusted p-values of exactly 0 are replaced by 1e-300 so the log stays finite
floored_padj = de_results["pvals_adj"].replace(0, 1e-300)
de_results["-log10(pval)"] = -np.log10(floored_padj)

# Significant = passes both the FDR cutoff and the effect-size cutoff
passes_fdr = de_results["pvals_adj"] < 0.05
passes_effect = abs(de_results["logfoldchanges"]) >= 1
de_results["significant"] = passes_fdr & passes_effect

# Volcano plot
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=de_results,
    x="logfoldchanges",
    y="-log10(pval)",
    hue="significant",
    palette={True: "red", False: "gray"},
    alpha=0.7,
    edgecolor=None,
)

# Dashed guides at the |log2FC| = 1 cutoff
for fold_change_cutoff in (1, -1):
    plt.axvline(fold_change_cutoff, color="blue", linestyle="--", lw=1)

# Label the genes with the smallest adjusted p-values; text is anchored away
# from x = 0 (right-aligned for negative fold changes, left-aligned otherwise)
top_n = 30
top_genes = de_results.nsmallest(top_n, "pvals_adj")
texts = [
    plt.text(
        row["logfoldchanges"],
        row["-log10(pval)"],
        row["names"],
        fontsize=8,
        ha='right' if row["logfoldchanges"] < 0 else 'left',
    )
    for _, row in top_genes.iterrows()
]

# Nudge overlapping labels apart and connect them to their points
adjust_text(texts, arrowprops=dict(arrowstyle="-", lw=0.5))

# Titles, axis labels and legend
plt.title("Volcano Plot: Effector CD4 TCells #1 vs #2")
plt.xlabel("Log2 Fold Change")
plt.ylabel("-log10 Adjusted p-value")
plt.legend(title="FDR < 0.05 & |log2FC| ≥ 1", loc="upper left")
plt.tight_layout()
#plt.savefig("Intermediate_Files/Paper_Figs/volcano_EffectorCD4T2_vs_T1_labeled.pdf", dpi=600)
plt.show()

In [None]:
import scanpy as sc

# The two T-cell subtypes being contrasted
group1 = "Effector CD4 TCells #1"
group2 = "Effector CD4 TCells #3"

import numpy as np

# log1p-transformed copy of the "counts" layer, stored as its own layer
adata_TCell.layers["log1p_counts"] = np.log1p(adata_TCell.layers["counts"])

# Keep only the cells that belong to one of the two subtypes
in_comparison = adata_TCell.obs["TCell_subtype"].isin([group1, group2])
adata_sub = adata_TCell[in_comparison].copy()

# Short, fixed group names so downstream code does not depend on the subtype labels
comparison_labels = {group1: "group1", group2: "group2"}
adata_sub.obs["comparison_group"] = adata_sub.obs["TCell_subtype"].map(comparison_labels)

# Wilcoxon rank-sum test of every group against the "group1" baseline
sc.tl.rank_genes_groups(
    adata_sub,
    groupby="comparison_group",
    reference="group1",
    method="wilcoxon",
    use_raw=False,
    layer="log1p_counts",  # the log1p-transformed counts layer created above
    pts=True,
)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from adjustText import adjust_text

# Ranking results stored on adata_sub for the "group2" entry
de_results = sc.get.rank_genes_groups_df(adata_sub, group="group2")

# Adjusted p-values of exactly 0 are replaced by 1e-300 so the log stays finite
floored_padj = de_results["pvals_adj"].replace(0, 1e-300)
de_results["-log10(pval)"] = -np.log10(floored_padj)

# Significant = passes both the FDR cutoff and the effect-size cutoff
passes_fdr = de_results["pvals_adj"] < 0.05
passes_effect = abs(de_results["logfoldchanges"]) >= 1
de_results["significant"] = passes_fdr & passes_effect

# Volcano plot
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=de_results,
    x="logfoldchanges",
    y="-log10(pval)",
    hue="significant",
    palette={True: "red", False: "gray"},
    alpha=0.7,
    edgecolor=None,
)

# Dashed guides at the |log2FC| = 1 cutoff
for fold_change_cutoff in (1, -1):
    plt.axvline(fold_change_cutoff, color="blue", linestyle="--", lw=1)

# Label the genes with the smallest adjusted p-values; text is anchored away
# from x = 0 (right-aligned for negative fold changes, left-aligned otherwise)
top_n = 30
top_genes = de_results.nsmallest(top_n, "pvals_adj")
texts = [
    plt.text(
        row["logfoldchanges"],
        row["-log10(pval)"],
        row["names"],
        fontsize=8,
        ha='right' if row["logfoldchanges"] < 0 else 'left',
    )
    for _, row in top_genes.iterrows()
]

# Nudge overlapping labels apart and connect them to their points
adjust_text(texts, arrowprops=dict(arrowstyle="-", lw=0.5))

# Titles, axis labels and legend
plt.title("Volcano Plot: Effector CD4 TCells #1 vs #3")
plt.xlabel("Log2 Fold Change")
plt.ylabel("-log10 Adjusted p-value")
plt.legend(title="FDR < 0.05 & |log2FC| ≥ 1", loc="upper left")
plt.tight_layout()
# NOTE(review): the file name below still says T2_vs_T1; rename it for this
# #1 vs #3 comparison before enabling the save.
#plt.savefig("Intermediate_Files/Paper_Figs/volcano_EffectorCD4T2_vs_T1_labeled.pdf", dpi=600)
plt.show()

In [None]:
# Show the isoform-level IDs matched for the general T-cell markers; the
# next cell plots one UMAP per entry of this list.
print(TCell_iso_1)

In [None]:



import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import numpy as np
import os

# Output directory for the per-isoform UMAPs of this marker group
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "TCell (TCellAlone)"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # figures saved via `save=` go to this directory

print("T-Cell Markers")

# One UMAP per matched feature, coloured by its expression in adata_TCell_i
for gene in TCell_iso_1:
    # Keep ':' out of the file name
    safe_gene_name = gene.replace(":", "_")

    # Expression of this feature across all plotted cells, as a flat dense vector
    expr = adata_TCell_i[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only so zeros do not pull the percentile down
    expr_nonzero = expr[expr > 0]

    # Colour limit: 99th percentile of the non-zero values, or 1.0 when no cell
    # expresses the feature (an empty array has no usable percentile)
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero. vmax_val is reused as computed above;
    # recomputing the percentile here would throw away the 1.0 fallback.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell_i,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centred on zero by the norm
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # << This handles vmin, vmax, and vcenter
    )

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import numpy as np
import os

print("Naive T-Cell Markers")

# Output directory for the per-isoform UMAPs of this marker group.
# group_name has no leading space and matches the gene-level
# "Naive TCell (TCell Alone)" naming, so the folder name is well-formed.
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "Naive TCell (TCell Alone)"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # figures saved via `save=` go to this directory

# One UMAP per matched feature, coloured by its expression in adata_TCell_i
for gene in TCell_iso_2:
    # Keep ':' out of the file name
    safe_gene_name = gene.replace(":", "_")

    # Expression of this feature across all plotted cells, as a flat dense vector
    expr = adata_TCell_i[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only so zeros do not pull the percentile down
    expr_nonzero = expr[expr > 0]

    # Colour limit: 99th percentile of the non-zero values, or 1.0 when no cell
    # expresses the feature (an empty array has no usable percentile)
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero. vmax_val is reused as computed above;
    # recomputing the percentile here would throw away the 1.0 fallback.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell_i,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centred on zero by the norm
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # << This handles vmin, vmax, and vcenter
    )

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import numpy as np
import os

print("Memory T-Cell Markers")

# Output directory for the per-isoform UMAPs of this marker group
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "Memory TCell (TCell Alone)"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir  # figures saved via `save=` go to this directory

# One UMAP per matched feature, coloured by its expression in adata_TCell_i
for gene in TCell_iso_3:
    # Keep ':' out of the file name
    safe_gene_name = gene.replace(":", "_")

    # Expression of this feature across all plotted cells, as a flat dense vector
    expr = adata_TCell_i[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only so zeros do not pull the percentile down
    expr_nonzero = expr[expr > 0]

    # Colour limit: 99th percentile of the non-zero values, or 1.0 when no cell
    # expresses the feature (an empty array has no usable percentile)
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th percentile
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero. vmax_val is reused as computed above;
    # recomputing the percentile here would throw away the 1.0 fallback.
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell_i,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # Diverging colormap, centred on zero by the norm
        # Leading "_" matches the other marker cells, so the file is named
        # umap_<feature>.pdf rather than umap<feature>.pdf
        save=f"_{safe_gene_name}.pdf",
        show=True,
        norm=norm             # << This handles vmin, vmax, and vcenter
    )

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import numpy as np
import os

print("CD8+ T-Cell Markers")

# Per-isoform UMAPs for the CD8+ T-cell marker list (TCell_iso_4).
# Figures are written by scanpy into a marker-specific directory.
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "CD8+ TCell (TCell Alone)"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

for gene in TCell_iso_4:
    # ':' is replaced so the isoform name can be used in a file name
    safe_gene_name = gene.replace(":", "_")

    # Expression vector for this isoform, flattened to 1-D (densified if sparse)
    expr = adata_TCell_i[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only
    expr_nonzero = expr[expr > 0]

    # 99th percentile of the non-zero values; 1.0 when no cell expresses the isoform
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th-percentile cap
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero. vmax_val is reused rather than recomputed so the
    # empty-vector fallback above still applies (percentile of an empty array is invalid).
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell_i,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # diverging colormap, centred by `norm`
        save=f"{safe_gene_name}.pdf",
        show=True,
        norm=norm             # carries vmin, vcenter and vmax
    )

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import numpy as np
import os

print("Regulatory T-Cell Markers")

# Per-isoform UMAPs for the regulatory T-cell marker list (TCell_iso_5).
# Figures are written by scanpy into a marker-specific directory.
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "Regulatory TCell (TCell Alone)"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

for gene in TCell_iso_5:
    # ':' is replaced so the isoform name can be used in a file name
    safe_gene_name = gene.replace(":", "_")

    # Expression vector for this isoform, flattened to 1-D (densified if sparse)
    expr = adata_TCell_i[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only
    expr_nonzero = expr[expr > 0]

    # 99th percentile of the non-zero values; 1.0 when no cell expresses the isoform
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th-percentile cap
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero. vmax_val is reused rather than recomputed so the
    # empty-vector fallback above still applies (percentile of an empty array is invalid).
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell_i,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # diverging colormap, centred by `norm`
        save=f"{safe_gene_name}.pdf",
        show=True,
        norm=norm             # carries vmin, vcenter and vmax
    )

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import numpy as np
import os

print("CD4+ Effector T-Cell Markers")

# Per-isoform UMAPs for the CD4+ effector T-cell marker list (TCell_iso_6).
# Figures are written by scanpy into a marker-specific directory.
marker_root = "Intermediate_Files/Clustering_05012025/Figures/Markers"
group_name = "CD4+ Effector TCell (TCell Alone)"
output_dir = os.path.join(marker_root, f"Isoform_{group_name}_Markers")
os.makedirs(output_dir, exist_ok=True)
sc.settings.figdir = output_dir

for gene in TCell_iso_6:
    # ':' is replaced so the isoform name can be used in a file name
    safe_gene_name = gene.replace(":", "_")

    # Expression vector for this isoform, flattened to 1-D (densified if sparse)
    expr = adata_TCell_i[:, gene].X
    if hasattr(expr, "toarray"):
        expr = expr.toarray().flatten()
    else:
        expr = np.ravel(expr)

    # Scale on expressing cells only
    expr_nonzero = expr[expr > 0]

    # 99th percentile of the non-zero values; 1.0 when no cell expresses the isoform
    vmax_val = np.percentile(expr_nonzero, 99) if len(expr_nonzero) > 0 else 1.0

    # Count how many cells exceed the 99th-percentile cap
    num_above_vmax = np.sum(expr > vmax_val)
    print(f"{gene}: {num_above_vmax} cells above 99th percentile (vmax = {vmax_val:.2f})")

    # Symmetric range around zero. vmax_val is reused rather than recomputed so the
    # empty-vector fallback above still applies (percentile of an empty array is invalid).
    norm = TwoSlopeNorm(vmin=-vmax_val, vcenter=0, vmax=vmax_val)

    sc.pl.umap(
        adata_TCell_i,
        color=gene,
        use_raw=False,
        title=f"{gene}",
        cmap="coolwarm",       # diverging colormap, centred by `norm`
        save=f"{safe_gene_name}.pdf",
        show=True,
        norm=norm             # carries vmin, vcenter and vmax
    )

In [None]:
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt

# Isoform-level general T-cell markers: entries of TCell_iso_1 whose name contains
# one of the CD3 subunit symbols (plain substring match).
cd3_symbols = ["CD3D", "CD3E", "CD3G"]
TCells_filtered = [iso for iso in TCell_iso_1 if any(sym in iso for sym in cd3_symbols)]

# Keep only the markers that exist in adata_TCell_i.var_names
genes = [iso for iso in TCells_filtered if iso in adata_TCell_i.var_names]

# Per-cell sum over the selected isoforms (densified when .X is sparse)
X = adata_TCell_i[:, genes].X
X = X.toarray() if hasattr(X, "toarray") else X

combined_key = 'Gen_TCell_Isoform_Markers_Combined'
adata_TCell_i.obs[combined_key] = X.sum(axis=1)

# The 100th percentile is the maximum; the colour range is mirrored around zero
combined_max = np.percentile(adata_TCell_i.obs[combined_key], 100)

# show=False keeps the figure open for the adjustments below
sc.pl.umap(
    adata_TCell_i,
    color=combined_key,
    cmap="coolwarm",
    frameon=True,
    title="General TCell Isoform Marker Expression",
    vmin=-combined_max,
    vmax=combined_max,
    show=False,
)

# Resize, then blank the title and ticks on the current axes
fig = plt.gcf()
fig.set_size_inches(4, 3)
ax = plt.gca()
ax.set(title="", xticks=[], yticks=[])

# Rotated label on the left edge of the figure
fig.text(
    -0.03, 0.53, "Total General TCell\nIsoform Marker Expression",
    va='center', ha='center', rotation=90, fontsize=11,
)

plt.tight_layout()
plt.savefig(
    "Intermediate_Files/Clustering_05012025/Figures/UMAP/umap_GeneralTCell_TCellsOnly_combined_expression_isoform.pdf",
    dpi=600, transparent=True, bbox_inches="tight",
)

plt.show()

In [None]:
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt

# Isoform-level memory T-cell markers: entries of TCell_iso_3 whose name contains
# one of the listed symbols (plain substring match).
memory_symbols = ["CCR7", "SELL", "TCF7"]
Memory_filtered = [iso for iso in TCell_iso_3 if any(sym in iso for sym in memory_symbols)]

# Keep only the markers that exist in adata_TCell_i.var_names
genes = [iso for iso in Memory_filtered if iso in adata_TCell_i.var_names]

# Per-cell sum over the selected isoforms (densified when .X is sparse)
X = adata_TCell_i[:, genes].X
X = X.toarray() if hasattr(X, "toarray") else X

combined_key = 'Mem_TCell_Isoform_Markers_Combined'
adata_TCell_i.obs[combined_key] = X.sum(axis=1)

# The 100th percentile is the maximum; the colour range is mirrored around zero
combined_max = np.percentile(adata_TCell_i.obs[combined_key], 100)

# show=False keeps the figure open for the adjustments below
sc.pl.umap(
    adata_TCell_i,
    color=combined_key,
    cmap="coolwarm",
    frameon=True,
    title="Memory TCell Isoform Marker Expression",
    vmin=-combined_max,
    vmax=combined_max,
    show=False,
)

# Resize, then replace the title and remove ticks on the current axes
fig = plt.gcf()
fig.set_size_inches(4, 3)
ax = plt.gca()
ax.set(title="Memory TCell Aggregate Isoform Marker Expression", xticks=[], yticks=[])

plt.tight_layout()
plt.savefig(
    "Intermediate_Files/Clustering_05012025/Figures/UMAP/umap_MemoryTCell_TCellsOnly_combined_expression_isoform_aggregate.pdf",
    dpi=600, transparent=True, bbox_inches="tight",
)

plt.show()

In [None]:
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt

# Isoform-level CD4+ effector markers: entries of TCell_iso_6 whose name contains
# one of the listed symbols (plain substring match).
# NOTE(review): "CD4" as a substring would also match longer symbols that start
# with it — confirm TCell_iso_6 holds no such entries.
effector_symbols = ["CD4", "IL2RA", "GATA3", "AHR"]
Effector_filtered = [iso for iso in TCell_iso_6 if any(sym in iso for sym in effector_symbols)]

# Keep only the markers that exist in adata_TCell_i.var_names
genes = [iso for iso in Effector_filtered if iso in adata_TCell_i.var_names]

# Per-cell sum over the selected isoforms (densified when .X is sparse)
X = adata_TCell_i[:, genes].X
X = X.toarray() if hasattr(X, "toarray") else X

combined_key = 'CD4_TCell_Isoform_Markers_Combined'
adata_TCell_i.obs[combined_key] = X.sum(axis=1)

# The 100th percentile is the maximum; the colour range is mirrored around zero
combined_max = np.percentile(adata_TCell_i.obs[combined_key], 100)

# show=False keeps the figure open for the adjustments below
sc.pl.umap(
    adata_TCell_i,
    color=combined_key,
    cmap="coolwarm",
    frameon=True,
    title="Helper TCell Isoform Marker Expression",
    vmin=-combined_max,
    vmax=combined_max,
    show=False,
)

# Resize, then replace the title and remove ticks on the current axes
fig = plt.gcf()
fig.set_size_inches(4, 3)
ax = plt.gca()
ax.set(title="CD4+ Effector TCell Aggregate Isoform Marker Expression", xticks=[], yticks=[])

plt.tight_layout()
plt.savefig(
    "Intermediate_Files/Clustering_05012025/Figures/UMAP/umap_HelperTCell_TCellsOnly_combined_expression_isoform_aggregate.pdf",
    dpi=600, transparent=True, bbox_inches="tight",
)

plt.show()

In [None]:
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt

# Isoform-level cytotoxic T-cell markers: entries of TCell_iso_4 whose name contains
# one of the listed symbols (plain substring match).
cytotoxic_symbols = ["CD8A", "CD8B", "GATA3", "KLRB1", "CCL5"]
Cytotoxic_filtered = [iso for iso in TCell_iso_4 if any(sym in iso for sym in cytotoxic_symbols)]

# Keep only the markers that exist in adata_TCell_i.var_names
genes = [iso for iso in Cytotoxic_filtered if iso in adata_TCell_i.var_names]

# Per-cell sum over the selected isoforms (densified when .X is sparse)
X = adata_TCell_i[:, genes].X
X = X.toarray() if hasattr(X, "toarray") else X

combined_key = 'Cytotoxic_TCell_Isoform_Markers_Combined'
adata_TCell_i.obs[combined_key] = X.sum(axis=1)

# Colour range capped at the 99th percentile and mirrored around zero
combined_cap = np.percentile(adata_TCell_i.obs[combined_key], 99)

# show=False keeps the figure open for the adjustments below
sc.pl.umap(
    adata_TCell_i,
    color=combined_key,
    cmap="coolwarm",
    frameon=True,
    title="Cytotoxic TCell Isoform Marker Expression",
    vmin=-combined_cap,
    vmax=combined_cap,
    show=False,
)

# Resize, then replace the title and remove ticks on the current axes
fig = plt.gcf()
fig.set_size_inches(4, 3)
ax = plt.gca()
ax.set(title="CD8 Effector TCell Aggregate Isoform Marker Expression", xticks=[], yticks=[])

plt.tight_layout()
plt.savefig(
    "Intermediate_Files/Clustering_05012025/Figures/UMAP/umap_CytotoxicTCell_TCellsOnly_combined_expression_isoform_aggregate.pdf",
    dpi=600, transparent=True, bbox_inches="tight",
)

plt.show()

In [None]:
# Isoform-level regulatory T-cell markers: entries of TCell_iso_5 whose name contains
# one of the listed symbols (plain substring match).
regulatory_symbols = ["FOXP3", "STAT5A", "IL2RA", "CTLA4"]
Reg_filtered = [iso for iso in TCell_iso_5 if any(sym in iso for sym in regulatory_symbols)]

# Keep only the markers that exist in adata_TCell_i.var_names
genes = [iso for iso in Reg_filtered if iso in adata_TCell_i.var_names]

# Per-cell sum over the selected isoforms (densified when .X is sparse)
X = adata_TCell_i[:, genes].X
X = X.toarray() if hasattr(X, "toarray") else X

combined_key = 'Reg_TCell_Isoform_Markers_Combined'
adata_TCell_i.obs[combined_key] = X.sum(axis=1)

# Colour range capped at the 99th percentile and mirrored around zero
combined_cap = np.percentile(adata_TCell_i.obs[combined_key], 99)

# show=False keeps the figure open for the adjustments below
sc.pl.umap(
    adata_TCell_i,
    color=combined_key,
    cmap="coolwarm",
    frameon=True,
    title="Regulatory TCell Isoform Marker Expression",
    vmin=-combined_cap,
    vmax=combined_cap,
    show=False,
)

# Resize, then blank the title and ticks on the current axes
fig = plt.gcf()
fig.set_size_inches(4, 2.4)
ax = plt.gca()
ax.set(title="", xticks=[], yticks=[])

# Rotated label on the left edge of the figure
fig.text(
    -0.03, 0.53, "Total Regulatory TCell\nIsoform Marker Expression",
    va='center', ha='center', rotation=90, fontsize=11,
)

plt.tight_layout()
plt.savefig(
    "Intermediate_Files/Clustering_05012025/Figures/UMAP/umap_RegulatoryTCell_TCellsOnly_combined_expression_isoform.pdf",
    dpi=600, transparent=True, bbox_inches="tight",
)

plt.show()

In [None]:
# T-cell-only UMAP coloured by the '0.5_log_AutoZI' obs column, with enlarged points.
sc.pl.umap(
    adata_TCell_i,
    color='0.5_log_AutoZI',
    title='UMAP with Clusters (Isoform-Level) ',
    frameon=True,
    legend_fontsize=10,
    legend_fontoutline=2,
    size=50,
)

In [None]:
# Manual annotation of the '0.5_log_AutoZI' cluster ids (as strings) with T-cell
# subtype names. Clusters absent from this dict receive NaN in "TCell_subtype".
tcell_cluster_mapping = {
    "0": "Memory TCells",
    "1": "Effector CD4 TCells",
    "2": "Effector CD8 TCells",
    "3": "Effector CD8 TCells",
    "4": "Unspecified TCells",
    "6": "Unspecified TCells",
    "7": "Unspecified TCells",
}

# Create new cell type annotations
cluster_labels = adata_TCell_i.obs["0.5_log_AutoZI"].astype(str)
adata_TCell_i.obs["TCell_subtype"] = cluster_labels.map(tcell_cluster_mapping)

# Report clusters that have no entry in the mapping instead of leaving them as silent NaN
unmapped_clusters = sorted(set(cluster_labels) - set(tcell_cluster_mapping))
if unmapped_clusters:
    print(f"WARNING: clusters without a TCell_subtype label (left as NaN): {unmapped_clusters}")

# Check assignments (dropna=False so unlabelled cells are counted too)
adata_TCell_i.obs["TCell_subtype"].value_counts(dropna=False)

In [None]:
import matplotlib.pyplot as plt
import scanpy as sc

# Order in which subtypes should appear in the legend
legend_order = ["Memory TCells", "Effector CD8 TCells", "Effector CD4 TCells", "Unspecified TCells"]

# UMAP coloured by subtype; show=False keeps the figure open for legend editing
sc.pl.umap(
    adata_TCell_i,
    color='TCell_subtype',
    title='',  # No title for publication version
    frameon=True,
    palette=plt.get_cmap('tab20').colors,
    legend_loc='lower right',
    legend_fontsize=10,
    legend_fontoutline=1,
    show=False,
)

fig = plt.gcf()
fig.set_size_inches(5,4)

# Legend entries scanpy created, indexed by label
ax = plt.gca()
handles, labels = ax.get_legend_handles_labels()
label_handle_dict = {name: handle for name, handle in zip(labels, handles)}

# Requested order, restricted to labels that are actually present
ordered_labels = [name for name in legend_order if name in label_handle_dict]
ordered_handles = list(map(label_handle_dict.__getitem__, ordered_labels))

# Swap scanpy's legend for a two-column one centred below the axes
ax.get_legend().remove()
ax.legend(
    ordered_handles,
    ordered_labels,
    loc='lower center',
    bbox_to_anchor=(0.5, -0.4),
    fontsize=11,
    frameon=True,
    ncol=2,
)

plt.tight_layout()

plt.savefig(
    "Intermediate_Files/Clustering_05012025/Figures/UMAP/UMAP_TCell_subtypes_isoform.pdf",
    dpi=600, transparent=True, bbox_inches="tight",
)

plt.show()

In [None]:
# Copy T-cell subtype labels from adata_TCell_i into a "reannotated" cell-type
# column on the full isoform object, leaving all other cells with their
# existing general cell-type label.

# Create the column if it doesn't exist
if "gen_cell_type_reannotated" not in adata_i_filtered.obs.columns:
    adata_i_filtered.obs["gen_cell_type_reannotated"] = adata_i_filtered.obs["gen_cell_type"]

# Temporarily convert to plain string so new label values can be assigned
adata_i_filtered.obs["gen_cell_type_reannotated"] = (
    adata_i_filtered.obs["gen_cell_type_reannotated"].astype(str)
)

# Transfer only cells that actually have a subtype. Casting a missing value with
# astype(str) would write the literal label "nan" over the original cell type.
tcell_subtypes = adata_TCell_i.obs["TCell_subtype"].dropna().astype(str)
adata_i_filtered.obs.loc[tcell_subtypes.index, "gen_cell_type_reannotated"] = tcell_subtypes

# Re-cast to categorical for plotting, ordering, etc.
adata_i_filtered.obs["gen_cell_type_reannotated"] = (
    adata_i_filtered.obs["gen_cell_type_reannotated"].astype("category")
)

# Check output
print(adata_i_filtered.obs["gen_cell_type_reannotated"].value_counts())

In [None]:
# Full isoform-level UMAP coloured by the reannotated cell-type column.
sc.pl.umap(
    adata_i_filtered,
    color='gen_cell_type_reannotated',
    title='UMAP with Clusters (Isoform-Level) ',
    frameon=True,
    legend_fontsize=10,
    legend_fontoutline=2,
)

In [None]:
# Full isoform-level UMAP coloured by the '0.5_log_AutoZI' obs column.
sc.pl.umap(
    adata_i_filtered,
    color='0.5_log_AutoZI',
    title='UMAP with Clusters (Isoform-Level) ',
    frameon=True,
    legend_fontsize=10,
    legend_fontoutline=2,
)

In [None]:
# Write the reannotated isoform-level object to the clustering output directory.
# NOTE(review): the file carries an .h5mu suffix but is written through the
# object's own .write(); a later cell reads it back with read_h5ad — confirm the
# suffix is intentional.
output_dir = 'Intermediate_Files/Clustering_05012025/'
reannotated_iso_path = os.path.join(
    output_dir,
    "PBMC_iso_AutoZI_clustered_celltypes_reannotated_AutoZILatent_05112025.h5mu",
)

adata_i_filtered.write(reannotated_iso_path)

In [None]:
import scanpy as sc

import pandas as pd
from scipy.stats import spearmanr

group1 = "Effector CD8 TCells"
group2 = "Unspecified TCells"

# Independent copy holding only the cells of the two subtypes
pair_mask = adata_TCell_i.obs["TCell_subtype"].isin([group1, group2])
adata_sub = adata_TCell_i[pair_mask].copy()

# Dense cells x features table of .X with the subtype attached as a "group" column
X = adata_sub.X
if hasattr(X, "toarray"):
    X = X.toarray()
df_expr = pd.DataFrame(X, index=adata_sub.obs_names, columns=adata_sub.var_names)
df_expr["group"] = adata_sub.obs["TCell_subtype"].values

# Features as rows, one mean-expression column per subtype
mean_expr = df_expr.groupby("group").mean().T

# Rank correlation between the two mean-expression profiles
rho, pval = spearmanr(mean_expr[group1], mean_expr[group2])
print(f"Spearman correlation between {group1} and {group2}: ρ = {rho:.3f}, p = {pval:.3g}")

In [None]:
import scanpy as sc

import numpy as np

# Subtypes to compare
group1 = "Effector CD8 TCells"
group2 = "Unspecified TCells"

# log1p of layers["counts"], stored on the parent object so the subset below carries it
adata_TCell_i.layers["log1p_counts"] = np.log1p(adata_TCell_i.layers["counts"])

# Independent copy holding only the cells of the two subtypes
pair_mask = adata_TCell_i.obs["TCell_subtype"].isin([group1, group2])
adata_sub = adata_TCell_i[pair_mask].copy()

# Short group labels used as the groupby column
simplified_labels = {group1: "group1", group2: "group2"}
adata_sub.obs["comparison_group"] = adata_sub.obs["TCell_subtype"].map(simplified_labels)

# Wilcoxon test on the log1p_counts layer with "group1" as the reference
sc.tl.rank_genes_groups(
    adata_sub,
    groupby="comparison_group",
    reference="group1",
    method="wilcoxon",
    use_raw=False,
    layer="log1p_counts",
    pts=True,
)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from adjustText import adjust_text

# Volcano plot of the differential-expression table stored on adata_sub.
# The rows are scanpy's results for "group2"; the preceding cell ran the test with
# reference="group1" — presumably positive log fold change means higher in group2; confirm.
de_results = sc.get.rank_genes_groups_df(adata_sub, group="group2")

# -log10 of the adjusted p-value; exact zeros are floored at 1e-300 to avoid log(0)
de_results["-log10(pval)"] = -np.log10(de_results["pvals_adj"].replace(0, 1e-300))

# Significant = adjusted p < 0.05 AND |log fold change| >= 1
de_results["significant"] = (de_results["pvals_adj"] < 0.05) & (abs(de_results["logfoldchanges"]) >= 1)

# Filter to significant genes only
sig_genes = de_results[de_results["significant"]].copy()

# Select top N genes by adjusted p-value for labelling
top_n = 10
top_genes = sig_genes.nsmallest(top_n, "pvals_adj")

# Start plot
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=de_results,
    x="logfoldchanges",
    y="-log10(pval)",
    hue="significant",
    palette={True: "red", False: "gray"},
    edgecolor=None,
    alpha=0.7
)

# Fold-change threshold lines
plt.axvline(1, color="blue", linestyle="--", lw=1)
plt.axvline(-1, color="blue", linestyle="--", lw=1)

# Add labels for top genes, anchored away from the y-axis
texts = []
for _, row in top_genes.iterrows():
    texts.append(
        plt.text(
            row["logfoldchanges"],
            row["-log10(pval)"],
            row["names"],
            fontsize=8,
            ha='right' if row["logfoldchanges"] < 0 else 'left'
        )
    )

# Resolve label overlaps once; the original called adjust_text twice on the same texts
adjust_text(texts, arrowprops=dict(arrowstyle="-", lw=0.5))

plt.title("Volcano Plot: Effector CD8 TCells vs Unspecified TCells")
plt.xlabel("Log2 Fold Change")
plt.ylabel("-log10 Adjusted p-value")
plt.legend(title="FDR < 0.05", loc="lower left")
plt.tight_layout()
plt.show()

In [None]:
import scanpy as sc

import pandas as pd
from scipy.stats import spearmanr

group1 = "Effector CD4 TCells"
group2 = "Unspecified TCells"

# Independent copy holding only the cells of the two subtypes
pair_mask = adata_TCell_i.obs["TCell_subtype"].isin([group1, group2])
adata_sub = adata_TCell_i[pair_mask].copy()

# Dense cells x features table of .X with the subtype attached as a "group" column
X = adata_sub.X
if hasattr(X, "toarray"):
    X = X.toarray()
df_expr = pd.DataFrame(X, index=adata_sub.obs_names, columns=adata_sub.var_names)
df_expr["group"] = adata_sub.obs["TCell_subtype"].values

# Features as rows, one mean-expression column per subtype
mean_expr = df_expr.groupby("group").mean().T

# Rank correlation between the two mean-expression profiles
rho, pval = spearmanr(mean_expr[group1], mean_expr[group2])
print(f"Spearman correlation between {group1} and {group2}: ρ = {rho:.3f}, p = {pval:.3g}")

In [None]:
import numpy as np

# Subtypes to compare
group1 = "Effector CD4 TCells"
group2 = "Unspecified TCells"

# log1p of layers["counts"], stored on the parent object so the subset below carries it
adata_TCell_i.layers["log1p_counts"] = np.log1p(adata_TCell_i.layers["counts"])

# Independent copy holding only the cells of the two subtypes
adata_sub = adata_TCell_i[adata_TCell_i.obs["TCell_subtype"].isin([group1, group2])].copy()

# Short group labels used as the groupby column
adata_sub.obs["comparison_group"] = adata_sub.obs["TCell_subtype"].map({
    group1: "group1",
    group2: "group2"
})

# group2 relative to group1, stored under the default key that the volcano cell
# reads. Run once; the original repeated this identical call.
sc.tl.rank_genes_groups(
    adata_sub,
    groupby="comparison_group",
    reference="group1",  # this will be the baseline
    method="wilcoxon",
    use_raw=False,
    layer="log1p_counts",
    pts=True
)

# Each group against the other, stored under its own key. The run above has a
# table for "group2" only (the reference gets none), and the group names are
# "group1"/"group2", not the subtype names, so the original lookups with
# group=group1 / group=group2 could not succeed.
vs_rest_key = "rank_genes_groups_vs_rest"
sc.tl.rank_genes_groups(
    adata_sub,
    groupby="comparison_group",
    reference="rest",
    method="wilcoxon",
    use_raw=False,
    layer="log1p_counts",
    pts=True,
    key_added=vs_rest_key
)

# Get top markers (50 smallest adjusted p-values per group)
top1 = sc.get.rank_genes_groups_df(adata_sub, group="group1", key=vs_rest_key).sort_values("pvals_adj").head(50)["names"]
top2 = sc.get.rank_genes_groups_df(adata_sub, group="group2", key=vs_rest_key).sort_values("pvals_adj").head(50)["names"]

# NOTE(review): with only two groups the two "vs rest" tests are mirror images,
# so ranking by p-value alone presumably selects nearly the same features for
# both and the overlap is high by construction — confirm this is the intended metric.

# Jaccard similarity
shared = set(top1).intersection(set(top2))
jaccard = len(shared) / len(set(top1).union(set(top2)))
print(f"Jaccard similarity of top 50 markers: {jaccard:.2f}")
print("Shared markers:", sorted(shared))

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from adjustText import adjust_text

# Volcano plot of the differential-expression table stored on adata_sub.
# The rows are scanpy's results for "group2"; the preceding cell ran the test with
# reference="group1" — presumably positive log fold change means higher in group2; confirm.
de_results = sc.get.rank_genes_groups_df(adata_sub, group="group2")

# -log10 of the adjusted p-value; exact zeros are floored at 1e-300 to avoid log(0)
de_results["-log10(pval)"] = -np.log10(de_results["pvals_adj"].replace(0, 1e-300))

# Significant = adjusted p < 0.05 AND |log fold change| >= 1
de_results["significant"] = (de_results["pvals_adj"] < 0.05) & (abs(de_results["logfoldchanges"]) >= 1)

# Filter to significant genes only
sig_genes = de_results[de_results["significant"]].copy()

# Select top N genes by adjusted p-value for labelling
top_n = 10
top_genes = sig_genes.nsmallest(top_n, "pvals_adj")

# Start plot
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=de_results,
    x="logfoldchanges",
    y="-log10(pval)",
    hue="significant",
    palette={True: "red", False: "gray"},
    edgecolor=None,
    alpha=0.7
)

# Fold-change threshold lines
plt.axvline(1, color="blue", linestyle="--", lw=1)
plt.axvline(-1, color="blue", linestyle="--", lw=1)

# Add labels for top genes, anchored away from the y-axis
texts = []
for _, row in top_genes.iterrows():
    texts.append(
        plt.text(
            row["logfoldchanges"],
            row["-log10(pval)"],
            row["names"],
            fontsize=8,
            ha='right' if row["logfoldchanges"] < 0 else 'left'
        )
    )

# Resolve label overlaps once; the original called adjust_text twice on the same texts
adjust_text(texts, arrowprops=dict(arrowstyle="-", lw=0.5))

plt.title("Volcano Plot: Effector CD4 TCells vs Unspecified TCells")
plt.xlabel("Log2 Fold Change")
plt.ylabel("-log10 Adjusted p-value")
plt.legend(title="FDR < 0.05", loc="lower left")
plt.tight_layout()
plt.show()

In [None]:
import scanpy as sc

# Subtypes to compare
group1 = "Memory TCells"
group2 = "Unspecified TCells"

# Independent copy holding only the cells of the two subtypes
pair_mask = adata_TCell_i.obs["TCell_subtype"].isin([group1, group2])
adata_sub = adata_TCell_i[pair_mask].copy()

# Short group labels used as the groupby column
simplified_labels = {group1: "group1", group2: "group2"}
adata_sub.obs["comparison_group"] = adata_sub.obs["TCell_subtype"].map(simplified_labels)

# NOTE(review): unlike the CD8 and CD4 comparisons in this notebook, no `layer` or
# `use_raw` is passed here, so scanpy's defaults decide which matrix is tested —
# confirm this difference is intended.
sc.tl.rank_genes_groups(
    adata_sub,
    groupby="comparison_group",
    reference="group1",
    method="wilcoxon",
    pts=True,
)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from adjustText import adjust_text

# Volcano plot of the differential-expression table stored on adata_sub.
# The rows are scanpy's results for "group2"; the preceding cell ran the test with
# reference="group1" — presumably positive log fold change means higher in group2; confirm.
de_results = sc.get.rank_genes_groups_df(adata_sub, group="group2")

# -log10 of the adjusted p-value; exact zeros are floored at 1e-300 to avoid log(0)
de_results["-log10(pval)"] = -np.log10(de_results["pvals_adj"].replace(0, 1e-300))

# Significant = adjusted p < 0.05 AND |log fold change| >= 1
de_results["significant"] = (de_results["pvals_adj"] < 0.05) & (abs(de_results["logfoldchanges"]) >= 1)

# Filter to significant genes only
sig_genes = de_results[de_results["significant"]].copy()

# Select top N genes by adjusted p-value for labelling
top_n = 10
top_genes = sig_genes.nsmallest(top_n, "pvals_adj")

# Start plot
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=de_results,
    x="logfoldchanges",
    y="-log10(pval)",
    hue="significant",
    palette={True: "red", False: "gray"},
    edgecolor=None,
    alpha=0.7
)

# Fold-change threshold lines
plt.axvline(1, color="blue", linestyle="--", lw=1)
plt.axvline(-1, color="blue", linestyle="--", lw=1)

# Add labels for top genes, anchored away from the y-axis
texts = []
for _, row in top_genes.iterrows():
    texts.append(
        plt.text(
            row["logfoldchanges"],
            row["-log10(pval)"],
            row["names"],
            fontsize=8,
            ha='right' if row["logfoldchanges"] < 0 else 'left'
        )
    )

# Resolve label overlaps once; the original called adjust_text twice on the same texts
adjust_text(texts, arrowprops=dict(arrowstyle="-", lw=0.5))

plt.title("Volcano Plot: Memory TCells vs Unspecified TCells")
plt.xlabel("Log2 Fold Change")
plt.ylabel("-log10 Adjusted p-value")
plt.legend(title="FDR < 0.05", loc="lower left")
plt.tight_layout()
plt.show()

In [None]:
from scanpy import read_h5ad

output_dir = 'Intermediate_Files/Clustering_05012025/'

# Reload the reannotated gene- and isoform-level objects with read_h5ad
# (the files carry an .h5mu suffix), rebinding both adata_*_filtered names.
gene_reannotated_path = os.path.join(
    output_dir, "PBMC_gene_AutoZI_clustered_celltypes_reannotated_AutoZILatent_05112025.h5mu"
)
iso_reannotated_path = os.path.join(
    output_dir, "PBMC_iso_AutoZI_clustered_celltypes_reannotated_AutoZILatent_05112025.h5mu"
)

adata_g_filtered = read_h5ad(gene_reannotated_path)
adata_i_filtered = read_h5ad(iso_reannotated_path)

In [None]:
def find_matching_genes(gene_name, adata):
    """Return the entries of ``adata.var_names`` that contain ``gene_name``.

    Matching is a case-insensitive substring test, so a short query also
    matches longer names that merely contain it.

    Parameters
    ----------
    gene_name : str
        Text to look for.
    adata
        Any object exposing an iterable ``var_names`` of strings.

    Returns
    -------
    list of str
        Matching names, in their ``var_names`` order.
    """
    # Upper-case the query once instead of once per candidate name
    query = gene_name.upper()
    return [name for name in adata.var_names if query in name.upper()]

In [None]:
import scanpy as sc
import pandas as pd

# Names containing the substring "Bambu" (anywhere in the name) at gene and isoform level
bambu_genes = [name for name in adata_g_filtered.var_names if "Bambu" in name]
bambu_isos = [name for name in adata_i_filtered.var_names if "Bambu" in name]

# Independent copies restricted to those features
adata_g_bambu = adata_g_filtered[:, bambu_genes].copy()
adata_i_bambu = adata_i_filtered[:, bambu_isos].copy()

# .X as DataFrames (cells as rows, features as columns)
df_expr_g = adata_g_bambu.to_df()
df_expr_i = adata_i_bambu.to_df()

# Attach the two cell-type annotations from .obs as extra columns
for annotation in ('gen_cell_type', 'sub_cell_type'):
    df_expr_g[annotation] = adata_g_bambu.obs[annotation].values
    df_expr_i[annotation] = adata_i_bambu.obs[annotation].values

# Only the numeric (expression) columns take part in the aggregation
numeric_cols_g = df_expr_g.select_dtypes(include=[np.number]).columns
numeric_cols_i = df_expr_i.select_dtypes(include=[np.number]).columns

# Per-group MEDIAN expression (the avg_* names are kept; the statistic is the median)
grouped_gen_g = df_expr_g.groupby("gen_cell_type")
grouped_gen_i = df_expr_i.groupby("gen_cell_type")
grouped_sub_g = df_expr_g.groupby("sub_cell_type")
grouped_sub_i = df_expr_i.groupby("sub_cell_type")

avg_expr_by_cluster_gen_g = grouped_gen_g[numeric_cols_g].median()
avg_expr_by_cluster_gen_i = grouped_gen_i[numeric_cols_i].median()
avg_expr_by_cluster_sub_g = grouped_sub_g[numeric_cols_g].median()
avg_expr_by_cluster_sub_i = grouped_sub_i[numeric_cols_i].median()

In [None]:
# Rebind bambu_genes to a sorted copy and show it
bambu_genes = list(bambu_genes)
bambu_genes.sort()
print(bambu_genes)

In [None]:
# Rebind bambu_isos to a sorted copy and show it
bambu_isos = list(bambu_isos)
bambu_isos.sort()
print(bambu_isos)

In [None]:
# Heading and table on consecutive lines
print(
    "Median expression of 'Bambu' genes by cell-type (gene-level):",
    avg_expr_by_cluster_gen_g,
    sep="\n",
)

In [None]:
# Heading and table on consecutive lines
print(
    "Median expression of 'Bambu' genes by cell-type (iso-level):",
    avg_expr_by_cluster_gen_i,
    sep="\n",
)

In [None]:
# Heading and table on consecutive lines
print(
    "Median expression of 'Bambu' genes by sub-cell-type (gene-level):",
    avg_expr_by_cluster_sub_g,
    sep="\n",
)

In [None]:
# Heading and table on consecutive lines
print(
    "Median expression of 'Bambu' genes by sub-cell-type (iso-level):",
    avg_expr_by_cluster_sub_i,
    sep="\n",
)

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np

# Names containing the substring "Bambu" (anywhere in the name) at gene and isoform level
bambu_genes = [name for name in adata_g_filtered.var_names if "Bambu" in name]
bambu_isos = [name for name in adata_i_filtered.var_names if "Bambu" in name]

# Independent copies restricted to those features
adata_g_bambu = adata_g_filtered[:, bambu_genes].copy()
adata_i_bambu = adata_i_filtered[:, bambu_isos].copy()

# .X as DataFrames (cells as rows, features as columns)
df_expr_g = adata_g_bambu.to_df()
df_expr_i = adata_i_bambu.to_df()

# Attach the two cell-type annotations from .obs as extra columns
for annotation in ('gen_cell_type', 'sub_cell_type'):
    df_expr_g[annotation] = adata_g_bambu.obs[annotation].values
    df_expr_i[annotation] = adata_i_bambu.obs[annotation].values

# Only the numeric (expression) columns take part in the aggregation
numeric_cols_g = df_expr_g.select_dtypes(include=[np.number]).columns
numeric_cols_i = df_expr_i.select_dtypes(include=[np.number]).columns

# Per-group totals, by general cell type and by sub cell type
grouped_gen_g = df_expr_g.groupby("gen_cell_type")
grouped_gen_i = df_expr_i.groupby("gen_cell_type")
grouped_sub_g = df_expr_g.groupby("sub_cell_type")
grouped_sub_i = df_expr_i.groupby("sub_cell_type")

sum_expr_by_cluster_gen_g = grouped_gen_g[numeric_cols_g].sum()
sum_expr_by_cluster_gen_i = grouped_gen_i[numeric_cols_i].sum()
sum_expr_by_cluster_sub_g = grouped_sub_g[numeric_cols_g].sum()
sum_expr_by_cluster_sub_i = grouped_sub_i[numeric_cols_i].sum()

In [None]:
# Heading and table on consecutive lines
print(
    "Sum counts of 'Bambu' genes by general cell type (gene-level):",
    sum_expr_by_cluster_gen_g,
    sep="\n",
)

In [None]:
# Heading and table on consecutive lines
print(
    "Sum counts of 'Bambu' genes by general cell type (iso-level):",
    sum_expr_by_cluster_gen_i,
    sep="\n",
)

In [None]:
# Blank line, heading, then the table
print(
    "\nSum counts of 'Bambu' genes by sub cell type (gene-level):",
    sum_expr_by_cluster_sub_g,
    sep="\n",
)

In [None]:
# Blank line, heading, then the table
print(
    "\nSum counts of 'Bambu' genes by sub cell type (isoform-level):",
    sum_expr_by_cluster_sub_i,
    sep="\n",
)

In [None]:
import numpy as np
import pandas as pd

# Gene-level data: rank the 'Bambu' genes inside every general cell type.

# Expression columns are the numeric ones; the cell-type annotation columns are skipped.
numeric_cols_g = list(df_expr_g.select_dtypes(include=[np.number]).columns)

# One row per general cell type, one column per gene, values = median expression.
median_expr_by_cluster_g = df_expr_g.groupby("gen_cell_type")[numeric_cols_g].median()

# For each cell type keep the names of the 10 genes with the highest median
# (sort_values is kept so ties are ordered exactly as before).
top10_genes_by_cluster = {
    cluster: medians.sort_values(ascending=False).index[:10].tolist()
    for cluster, medians in median_expr_by_cluster_g.iterrows()
}

print("Top 10 genes per cluster (gene-level data) based on median expression:")
for cluster, genes in top10_genes_by_cluster.items():
    print(f"Cluster {cluster}: {genes}")

In [None]:
# Isoform-level data: rank the 'Bambu' isoforms inside every general cell type.

# Expression columns are the numeric ones; the cell-type annotation columns are skipped.
numeric_cols_i = list(df_expr_i.select_dtypes(include=[np.number]).columns)

# One row per general cell type, one column per isoform, values = median expression.
median_expr_by_cluster_i = df_expr_i.groupby("gen_cell_type")[numeric_cols_i].median()

# For each cell type keep the names of the 10 isoforms with the highest median
# (sort_values is kept so ties are ordered exactly as before).
top10_isos_by_cluster = {
    cluster: medians.sort_values(ascending=False).index[:10].tolist()
    for cluster, medians in median_expr_by_cluster_i.iterrows()
}

print("\nTop 10 isoforms per cluster (isoform-level data) based on median expression:")
for cluster, isos in top10_isos_by_cluster.items():
    print(f"Cluster {cluster}: {isos}")

In [None]:
import scanpy as sc
import pandas as pd

# Build per-cell expression tables for the 'Bambu' features and summarise them per cell type.

# Features whose name contains the substring "Bambu" (anywhere in the name, not only as a prefix).
bambu_genes = [name for name in adata_g_filtered.var_names if "Bambu" in name]
bambu_isos = [name for name in adata_i_filtered.var_names if "Bambu" in name]

# Independent copies restricted to those features.
adata_g_bambu = adata_g_filtered[:, bambu_genes].copy()
adata_i_bambu = adata_i_filtered[:, bambu_isos].copy()

# .to_df() exposes .X as a DataFrame (cells as rows, features as columns);
# the two cell-type annotations from .obs are appended as extra columns.
df_expr_g = adata_g_bambu.to_df().assign(
    gen_cell_type=adata_g_bambu.obs['gen_cell_type'].values,
    gen_cell_type_reannotated=adata_g_bambu.obs['gen_cell_type_reannotated'].values,
)
df_expr_i = adata_i_bambu.to_df().assign(
    gen_cell_type=adata_i_bambu.obs['gen_cell_type'].values,
    gen_cell_type_reannotated=adata_i_bambu.obs['gen_cell_type_reannotated'].values,
)

# Only the numeric columns are expression values.
numeric_cols_g = df_expr_g.select_dtypes(include=[np.number]).columns
numeric_cols_i = df_expr_i.select_dtypes(include=[np.number]).columns

# Per-cell-type MEDIAN expression of every feature.
# NOTE: the variables are named avg_* but hold medians, not means.
grouped_gen_g = df_expr_g.groupby("gen_cell_type")
grouped_gen_i = df_expr_i.groupby("gen_cell_type")
avg_expr_by_cluster_gen_g = grouped_gen_g[numeric_cols_g].median()
avg_expr_by_cluster_gen_i = grouped_gen_i[numeric_cols_i].median()

# The *_sub_* tables are grouped by the re-annotated general cell type.
avg_expr_by_cluster_sub_g = df_expr_g.groupby("gen_cell_type_reannotated")[numeric_cols_g].median()
avg_expr_by_cluster_sub_i = df_expr_i.groupby("gen_cell_type_reannotated")[numeric_cols_i].median()

import scanpy as sc
import matplotlib.pyplot as plt

# UMAP panel grid: one panel per gene-level feature whose name starts with "Bambu".
bambu_genes = [gene for gene in adata_g_filtered.var_names if gene.startswith("Bambu")]

# Grid size: adjust ncols as needed; nrows is ceil(n_genes / ncols).
n_genes = len(bambu_genes)
ncols = 3
nrows = (n_genes + ncols - 1) // ncols

if n_genes == 0:
    # plt.subplots cannot build a zero-row grid; report instead of failing.
    print("No features starting with 'Bambu' in adata_g_filtered.var_names; nothing to plot.")
else:
    # squeeze=False always returns a 2-D Axes array, so flatten() is valid for any grid shape.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 5, nrows * 4), squeeze=False)
    axes = axes.flatten()

    # One UMAP per feature, coloured by that feature's expression.
    for ax, gene in zip(axes, bambu_genes):
        sc.pl.umap(
            adata_g_filtered,
            color=gene,
            use_raw=False,
            title=f"{gene}",
            cmap="viridis",
            show=False,
            ax=ax,
        )

    # Hide the unused trailing panels; slicing by n_genes avoids depending on a
    # loop index that may be left over from another cell.
    for ax in axes[n_genes:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt

# UMAP panel grid: one panel per isoform-level feature whose name contains "Bambu".
bambu_isos = [gene for gene in adata_i_filtered.var_names if "Bambu" in gene]

# Grid size: adjust ncols as needed; nrows is ceil(n_genes / ncols).
n_genes = len(bambu_isos)
ncols = 3
nrows = (n_genes + ncols - 1) // ncols

if n_genes == 0:
    # plt.subplots cannot build a zero-row grid; report instead of failing.
    print("No features containing 'Bambu' in adata_i_filtered.var_names; nothing to plot.")
else:
    # squeeze=False always returns a 2-D Axes array, so flatten() is valid for any grid shape.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 5, nrows * 4), squeeze=False)
    axes = axes.flatten()

    # One UMAP per isoform, coloured by that isoform's expression.
    for ax, gene in zip(axes, bambu_isos):
        sc.pl.umap(
            adata_i_filtered,
            color=gene,
            use_raw=False,
            title=f"{gene}",
            cmap="viridis",
            show=False,
            ax=ax,
        )

    # Hide the unused trailing panels; slicing by n_genes avoids depending on a
    # loop index that may be left over from another cell.
    for ax in axes[n_genes:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# UMAP panel grid: one panel per isoform whose ID contains the Ensembl gene ID ENSG00000253409
# (stored as TRBV74_isos; the gene symbol is taken from the variable name only).
TRBV74_isos = [gene for gene in adata_i_filtered.var_names if "ENSG00000253409" in gene]

# Grid size: adjust ncols as needed; nrows is ceil(n_genes / ncols).
n_genes = len(TRBV74_isos)
ncols = 3
nrows = (n_genes + ncols - 1) // ncols

if n_genes == 0:
    # plt.subplots cannot build a zero-row grid; report instead of failing.
    print("No isoforms containing 'ENSG00000253409' in adata_i_filtered.var_names; nothing to plot.")
else:
    # squeeze=False always returns a 2-D Axes array, so flatten() is valid for any grid shape.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 5, nrows * 4), squeeze=False)
    axes = axes.flatten()

    # One UMAP per isoform, coloured by that isoform's expression.
    for ax, gene in zip(axes, TRBV74_isos):
        sc.pl.umap(
            adata_i_filtered,
            color=gene,
            use_raw=False,
            title=f"{gene}",
            cmap="viridis",
            show=False,
            ax=ax,
        )

    # Hide the unused trailing panels; slicing by n_genes avoids depending on a
    # loop index that may be left over from another cell.
    for ax in axes[n_genes:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# UMAP panel grid: one panel per isoform whose ID contains the Ensembl gene ID ENSG00000289746
# (stored as TARP_isos; the gene symbol is taken from the variable name only).
TARP_isos = [gene for gene in adata_i_filtered.var_names if "ENSG00000289746" in gene]

# Grid size: adjust ncols as needed; nrows is ceil(n_genes / ncols).
n_genes = len(TARP_isos)
ncols = 3
nrows = (n_genes + ncols - 1) // ncols

if n_genes == 0:
    # plt.subplots cannot build a zero-row grid; report instead of failing.
    print("No isoforms containing 'ENSG00000289746' in adata_i_filtered.var_names; nothing to plot.")
else:
    # squeeze=False always returns a 2-D Axes array, so flatten() is valid for any grid shape.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 5, nrows * 4), squeeze=False)
    axes = axes.flatten()

    # One UMAP per isoform, coloured by that isoform's expression.
    for ax, gene in zip(axes, TARP_isos):
        sc.pl.umap(
            adata_i_filtered,
            color=gene,
            use_raw=False,
            title=f"{gene}",
            cmap="viridis",
            show=False,
            ax=ax,
        )

    # Hide the unused trailing panels; slicing by n_genes avoids depending on a
    # loop index that may be left over from another cell.
    for ax in axes[n_genes:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# UMAP panel grid: one panel per isoform whose ID contains the Ensembl gene ID ENSG00000187118
# (stored as CMC1_isos; the gene symbol is taken from the variable name only).
CMC1_isos = [gene for gene in adata_i_filtered.var_names if "ENSG00000187118" in gene]

# Grid size: adjust ncols as needed; nrows is ceil(n_genes / ncols).
n_genes = len(CMC1_isos)
ncols = 2
nrows = (n_genes + ncols - 1) // ncols

if n_genes == 0:
    # plt.subplots cannot build a zero-row grid; report instead of failing.
    print("No isoforms containing 'ENSG00000187118' in adata_i_filtered.var_names; nothing to plot.")
else:
    # squeeze=False always returns a 2-D Axes array, so flatten() is valid for any grid shape.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 5, nrows * 4), squeeze=False)
    axes = axes.flatten()

    # One UMAP per isoform; vmax=8 caps the colour scale identically in every panel.
    for ax, gene in zip(axes, CMC1_isos):
        sc.pl.umap(
            adata_i_filtered,
            color=gene,
            use_raw=False,
            title=f"{gene}",
            cmap="viridis",
            show=False,
            ax=ax,
            vmax=8,
        )

    # Hide the unused trailing panels; slicing by n_genes avoids depending on a
    # loop index that may be left over from another cell.
    for ax in axes[n_genes:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# UMAP panel grid: one panel per isoform whose ID contains the Ensembl gene ID ENSG00000145220
# (stored as LYAR_isos; the gene symbol is taken from the variable name only).
LYAR_isos = [gene for gene in adata_i_filtered.var_names if "ENSG00000145220" in gene]

# Grid size: adjust ncols as needed; nrows is ceil(n_genes / ncols).
n_genes = len(LYAR_isos)
ncols = 2
nrows = (n_genes + ncols - 1) // ncols

if n_genes == 0:
    # plt.subplots cannot build a zero-row grid; report instead of failing.
    print("No isoforms containing 'ENSG00000145220' in adata_i_filtered.var_names; nothing to plot.")
else:
    # squeeze=False always returns a 2-D Axes array, so flatten() is valid for any grid shape.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 5, nrows * 4), squeeze=False)
    axes = axes.flatten()

    # One UMAP per isoform, coloured by that isoform's expression.
    for ax, gene in zip(axes, LYAR_isos):
        sc.pl.umap(
            adata_i_filtered,
            color=gene,
            use_raw=False,
            title=f"{gene}",
            cmap="viridis",
            show=False,
            ax=ax,
        )

    # Hide the unused trailing panels; slicing by n_genes avoids depending on a
    # loop index that may be left over from another cell.
    for ax in axes[n_genes:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# UMAP panel grid: one panel per isoform whose ID contains the Ensembl gene ID ENSG00000211689
# (stored as isos_TRGC1; the gene symbol is taken from the variable name only).
isos_TRGC1 = [gene for gene in adata_i_filtered.var_names if "ENSG00000211689" in gene]

# Grid size: adjust ncols as needed; nrows is ceil(n_genes / ncols).
n_genes = len(isos_TRGC1)
ncols = 3
nrows = (n_genes + ncols - 1) // ncols

if n_genes == 0:
    # plt.subplots cannot build a zero-row grid; report instead of failing.
    print("No isoforms containing 'ENSG00000211689' in adata_i_filtered.var_names; nothing to plot.")
else:
    # squeeze=False always returns a 2-D Axes array, so flatten() is valid for any grid shape.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 5, nrows * 4), squeeze=False)
    axes = axes.flatten()

    # One UMAP per isoform, coloured by that isoform's expression.
    for ax, gene in zip(axes, isos_TRGC1):
        sc.pl.umap(
            adata_i_filtered,
            color=gene,
            use_raw=False,
            title=f"{gene}",
            cmap="viridis",
            show=False,
            ax=ax,
        )

    # Hide the unused trailing panels; slicing by n_genes avoids depending on a
    # loop index that may be left over from another cell.
    for ax in axes[n_genes:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# UMAP panel grid: one panel per isoform whose ID contains the Ensembl gene ID ENSG00000227191
# (stored as isos_TRGC2; the gene symbol is taken from the variable name only).
isos_TRGC2 = [gene for gene in adata_i_filtered.var_names if "ENSG00000227191" in gene]

# Grid size: adjust ncols as needed; nrows is ceil(n_genes / ncols).
n_genes = len(isos_TRGC2)
ncols = 3
nrows = (n_genes + ncols - 1) // ncols

if n_genes == 0:
    # plt.subplots cannot build a zero-row grid; report instead of failing.
    print("No isoforms containing 'ENSG00000227191' in adata_i_filtered.var_names; nothing to plot.")
else:
    # squeeze=False always returns a 2-D Axes array, so flatten() is valid for any grid shape.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 5, nrows * 4), squeeze=False)
    axes = axes.flatten()

    # One UMAP per isoform, coloured by that isoform's expression.
    for ax, gene in zip(axes, isos_TRGC2):
        sc.pl.umap(
            adata_i_filtered,
            color=gene,
            use_raw=False,
            title=f"{gene}",
            cmap="viridis",
            show=False,
            ax=ax,
        )

    # Hide the unused trailing panels; slicing by n_genes avoids depending on a
    # loop index that may be left over from another cell.
    for ax in axes[n_genes:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# UMAP panel grid (saved to PDF): one panel per isoform whose ID contains ENSG00000308813.
isos_308813 = [gene for gene in adata_i_filtered.var_names if "ENSG00000308813" in gene]

# Grid size; nrows is ceil(n_genes / ncols).
n_genes = len(isos_308813)
ncols = 3
nrows = (n_genes + ncols - 1) // ncols

if n_genes == 0:
    # plt.subplots cannot build a zero-row grid; report instead of failing (no file is written).
    print("No isoforms containing 'ENSG00000308813' in adata_i_filtered.var_names; nothing to plot.")
else:
    # squeeze=False always returns a 2-D Axes array, so flatten() is valid for any grid shape.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 5, nrows * 4), squeeze=False)
    axes = axes.flatten()

    # One UMAP per isoform; vmin/vmax pin the colour scale to [-2, 8] in every panel.
    for ax, gene in zip(axes, isos_308813):
        sc.pl.umap(
            adata_i_filtered,
            color=gene,
            use_raw=False,
            title=f"{gene}",
            cmap="viridis",
            show=False,
            ax=ax,
            vmax=8,
            vmin=-2,
        )

    # Hide the unused trailing panels; slicing by n_genes avoids depending on a
    # loop index that may be left over from another cell.
    for ax in axes[n_genes:]:
        ax.set_visible(False)

    plt.tight_layout()

    # Save before plt.show(), while the figure is still the current figure.
    # The target directory must already exist.
    plt.savefig("Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/ENSG00000308813_isoform_umaps.pdf",
                dpi=600, transparent=True, bbox_inches="tight")

    plt.show()

In [None]:
# UMAP panel grid: one panel per isoform whose ID contains ENSG00000289740
# (labelled TALAM1 by this notebook).
isos_TALAM1 = [gene for gene in adata_i_filtered.var_names if "ENSG00000289740" in gene]

# Grid size; nrows is ceil(n_genes / ncols).
n_genes = len(isos_TALAM1)
ncols = 3
nrows = (n_genes + ncols - 1) // ncols

if n_genes == 0:
    # plt.subplots cannot build a zero-row grid; report instead of failing.
    print("No isoforms containing 'ENSG00000289740' in adata_i_filtered.var_names; nothing to plot.")
else:
    # squeeze=False always returns a 2-D Axes array, so flatten() is valid for any grid shape.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 5, nrows * 4), squeeze=False)
    axes = axes.flatten()

    # One UMAP per isoform; vmin/vmax pin the colour scale to [-2, 8] in every panel.
    for ax, gene in zip(axes, isos_TALAM1):
        sc.pl.umap(
            adata_i_filtered,
            color=gene,
            use_raw=False,
            title=f"{gene}",
            cmap="viridis",
            show=False,
            ax=ax,
            vmax=8,
            vmin=-2,
        )

    # Hide the unused trailing panels; slicing by n_genes avoids depending on a
    # loop index that may be left over from another cell.
    for ax in axes[n_genes:]:
        ax.set_visible(False)

    plt.tight_layout()

    # Saving is currently disabled; re-enable here (before plt.show()) if needed:
    #plt.savefig("Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/TALAM1_isoform_umaps.pdf",
    #            dpi=600, transparent=True, bbox_inches="tight")

    plt.show()

In [None]:
# UMAP panel grid: one panel per isoform whose ID contains ENSG00000251562
# (labelled MALAT1 by this notebook).
isos_MALAT1 = [gene for gene in adata_i_filtered.var_names if "ENSG00000251562" in gene]

# Grid size; nrows is ceil(n_genes / ncols).
n_genes = len(isos_MALAT1)
ncols = 3
nrows = (n_genes + ncols - 1) // ncols

if n_genes == 0:
    # plt.subplots cannot build a zero-row grid; report instead of failing.
    print("No isoforms containing 'ENSG00000251562' in adata_i_filtered.var_names; nothing to plot.")
else:
    # squeeze=False always returns a 2-D Axes array, so flatten() is valid for any grid shape.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 5, nrows * 4), squeeze=False)
    axes = axes.flatten()

    # One UMAP per isoform; vmin/vmax pin the colour scale to [-2, 8] in every panel.
    for ax, gene in zip(axes, isos_MALAT1):
        sc.pl.umap(
            adata_i_filtered,
            color=gene,
            use_raw=False,
            title=f"{gene}",
            cmap="viridis",
            show=False,
            ax=ax,
            vmax=8,
            vmin=-2,
        )

    # Hide the unused trailing panels; slicing by n_genes avoids depending on a
    # loop index that may be left over from another cell.
    for ax in axes[n_genes:]:
        ax.set_visible(False)

    plt.tight_layout()

    # Saving is currently disabled; re-enable here (before plt.show()) if needed:
    #plt.savefig("Intermediate_Files/Clustering_05012025/Figures/Bambu_Genes/MALAT1_isoform_umaps.pdf",
    #            dpi=600, transparent=True, bbox_inches="tight")

    plt.show()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm

# Clustered heatmap of mean isoform expression per re-annotated cell type
# for the MALAT1 / TALAM1 isoforms; the figure is written to a PDF.

# Step 1: isoforms whose ID contains either target Ensembl gene ID (MALAT1, TALAM1)
target_ensgs = ["ENSG00000251562", "ENSG00000289740"]
isoform_matches = [gene for gene in adata_i_filtered.var_names if any(ensg in gene for ensg in target_ensgs)]

print(f"✅ Found {len(isoform_matches)} isoforms:")
for iso_name in isoform_matches:
    print("  ", iso_name)

# Column clustering needs at least two isoforms; fail early with a readable message.
if len(isoform_matches) < 2:
    raise ValueError(
        f"Need at least 2 isoforms matching {target_ensgs} to cluster columns; "
        f"found {len(isoform_matches)}."
    )

# Step 2: pull ONLY the matched columns, then densify. Converting the whole .X matrix
# to dense first would allocate cells x all-isoforms just to keep a handful of columns.
iso_expr = adata_i_filtered[:, isoform_matches].X
iso_expr = iso_expr.toarray() if hasattr(iso_expr, "toarray") else np.asarray(iso_expr)

# Rows are labelled with each cell's cell type so they can be grouped directly.
df_expr = pd.DataFrame(iso_expr,
                       columns=isoform_matches,
                       index=adata_i_filtered.obs["gen_cell_type_reannotated"])

# Step 3: mean expression per cell type
mean_expr = df_expr.groupby(df_expr.index).mean()

# Step 4: diverging colour scale centred on 0 with FIXED limits of -2 and +2
# (the limits are hard-coded, not derived from the data range).
norm = TwoSlopeNorm(vmin=-2, vcenter=0, vmax=2)

# Step 5: plot — isoforms (columns) are clustered, cell types (rows) keep their order
g = sns.clustermap(mean_expr,
                   cmap="coolwarm", linewidths=0.5, linecolor="gray",
                   figsize=(len(mean_expr.columns) * 0.4 + 2, len(mean_expr.index) * 0.5 + 2),
                   col_cluster=True, row_cluster=False,
                   cbar_kws={"label": "Mean Expression"},
                   norm=norm,
                   cbar_pos=(0.09, 0.65, 0.03, 0.2))

g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), rotation=90, fontsize=8)
g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0)
g.ax_heatmap.set_xlabel("")
g.ax_heatmap.set_ylabel("")
g.ax_heatmap.set_title("", pad=20)

# The target directory must already exist.
g.savefig("Intermediate_Files/Paper_Figs/MALAT1_TALAM1_isoform_expression_clustermap.pdf",
          dpi=600, bbox_inches="tight", transparent=True)
plt.show()

In [None]:
# Add a "log_raw" layer to both AnnData objects: np.log1p (natural log of 1 + x)
# applied element-wise to the existing "counts" layer. Neither .X nor "counts" is
# modified; re-running the cell simply recomputes and overwrites "log_raw".
adata_g_filtered.layers["log_raw"] = np.log1p(adata_g_filtered.layers["counts"])
adata_i_filtered.layers["log_raw"] = np.log1p(adata_i_filtered.layers["counts"])

In [None]:
# --- Split each AnnData into one independent copy per general cell type ---

# Distinct 'gen_cell_type' labels present in each object
unique_cell_types_g = adata_g_filtered.obs['gen_cell_type'].unique()
unique_cell_types_i = adata_i_filtered.obs['gen_cell_type'].unique()

# label -> AnnData copy holding only the cells carrying that label (gene level)
celltype_datasets_g = {
    cell_type: adata_g_filtered[adata_g_filtered.obs['gen_cell_type'] == cell_type].copy()
    for cell_type in unique_cell_types_g
}

# label -> AnnData copy holding only the cells carrying that label (isoform level)
celltype_datasets_i = {
    cell_type: adata_i_filtered[adata_i_filtered.obs['gen_cell_type'] == cell_type].copy()
    for cell_type in unique_cell_types_i
}

# Each value is a full AnnData, so e.g. celltype_datasets_g["TCells"].X is the
# expression matrix of the T-cell subset. A subset can be persisted with
# .write_h5ad(..., compression="gzip") if it needs to be reused outside the notebook.

In [None]:
# Bind the five per-cell-type subsets to short names (gene level, then isoform level).
# A missing label raises KeyError, exactly as direct indexing would.
TCell_g, NKCell_g, Monocyte_g, BCell_g, MK_g = (
    celltype_datasets_g[label]
    for label in ("TCells", "NK Cells", "Monocyte-derived", "BCells", "Megakaryocytes")
)

TCell_i, NKCell_i, Monocyte_i, BCell_i, MK_i = (
    celltype_datasets_i[label]
    for label in ("TCells", "NK Cells", "Monocyte-derived", "BCells", "Megakaryocytes")
)

In [None]:
# Marker panels, one list of name prefixes per cell population.
# Every entry ends with ':' and is used as a prefix by find_matching_genes (str.startswith),
# so "CD4:" cannot match a feature named "CD40:...".
# NOTE(review): presumably var_names look like "<SYMBOL>:<id...>" (the "combined_ID format") — confirm.
# Some prefixes appear in more than one panel (e.g. "CD14:", "GZMB:", "FCGR3A:", "CST3:", "CD1C:").
TCell_Markers = ["CD4:", "CD3D:", "CD3E:", "IL7R:", "CD8A:", "CD8B:","CCR7:", "SELL:", "FOXP3:", "IL2RA:", "CTLA4:"]
BCells = ["MS4A1:","CD19:", "CD79A:", "CD22:"]
NKCells = ["NKG7:", "GNLY:", "KLRD1:", "KLRF1:", "GZMB:", "PRF1:", "FCGR3A:","IFNG:", "CCL5:", "IL2RB:"]
Myeloid = ["ITGAM:", "ITGAX:", "CD33:", "CD14:", "CD1C:", "HLA-DRA:", "HLA-DRB1:"]
Monocytes = ["CD14:", "LYZ:", "VCAN:", "FCN1:", "CST3:", "S100A8:", "S100A9:", "FCGR3A:", "CX3CR1:"]
DC = ["CD1C:", "CLEC10A:", "FCER1A:", "CST3:"]
pDC = ["CLEC4C:", "LILRA4:", "GZMB:", "TCL1A:"]
MK = ["PPBP:", "PF4:", "ITGA2B:", "GP1BA:", "VWF:"]



# Prefix lookup of marker names in var_names (combined_ID format)
def find_matching_genes(prefixes, gene_list):
    """Return the entries of ``gene_list`` that start with any of ``prefixes``.

    Parameters
    ----------
    prefixes : iterable of str
        Name prefixes to look for (e.g. ``"CD4:"``).
    gene_list : iterable of str
        Feature names to scan, e.g. ``adata.var_names``.

    Returns
    -------
    list of str
        Matching names in ``gene_list`` order; each name appears once per
        occurrence in ``gene_list`` even if several prefixes match it.
    """
    matches = []
    for gene in gene_list:
        for prefix in prefixes:
            if gene.startswith(prefix):
                matches.append(gene)
                break  # one hit is enough; move on to the next name
    return matches

In [None]:
# Quick look at the gene-level T-cell subset: index 0 selects the first observation,
# and the cell's last expression is displayed by the notebook.
TCell_g[0]

In [None]:
# Quick look at the isoform-level T-cell subset: index 0 selects the first observation,
# and the cell's last expression is displayed by the notebook.
TCell_i[0]

In [None]:
# Resolve every marker panel against the gene-level feature names.
# Targets and panels are listed in the same order, pair by pair.
(
    TCell_genes, NK_genes, Myeloid_genes, Monocyte_genes,
    BCell_genes, DC_genes, pDC_genes, MK_genes,
) = (
    find_matching_genes(panel, adata_g_filtered.var_names)
    for panel in (TCell_Markers, NKCells, Myeloid, Monocytes, BCells, DC, pDC, MK)
)

# Same panels resolved against the isoform-level feature names.
(
    TCell_isos, NK_isos, Myeloid_isos, Monocyte_isos,
    BCell_isos, DC_isos, pDC_isos, MK_isos,
) = (
    find_matching_genes(panel, adata_i_filtered.var_names)
    for panel in (TCell_Markers, NKCells, Myeloid, Monocytes, BCells, DC, pDC, MK)
)

In [None]:
from scipy.stats import mode

# Per-gene summary statistics of the T-cell markers within the gene-level T-cell subset.

# Keep only the T-cell marker genes (TCell_genes)
TCell_g_subset = TCell_g[:, TCell_genes].copy()

# Expression values: the "log_raw" layer (log1p of the "counts" layer)
X = TCell_g_subset.layers["log_raw"]

# If the expression matrix is sparse, convert it to a dense NumPy array.
if hasattr(X, "toarray"):
    X = X.toarray()

# X has shape (n_cells, n_genes); axis=0 computes one value per gene across cells
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)
gene_mode_result = mode(X, axis=0)
# `.mode` is (1, n_genes) on older SciPy but (n_genes,) on SciPy >= 1.11. Indexing [0]
# on the latter yields a single scalar that pandas broadcasts to every row, so flatten instead.
gene_mode = np.asarray(gene_mode_result.mode).ravel()

# One row per gene
TCell_summary_df_g = pd.DataFrame({
    "gene": TCell_g_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(TCell_summary_df_g)

In [None]:
from scipy.stats import mode

# Per-isoform summary statistics of the T-cell markers within the isoform-level T-cell subset.

# Keep only the isoforms of the T-cell marker genes (TCell_isos)
TCell_i_subset = TCell_i[:, TCell_isos].copy()

# Expression values: the "log_raw" layer (log1p of the "counts" layer)
X = TCell_i_subset.layers["log_raw"]

# If the expression matrix is sparse, convert it to a dense NumPy array.
if hasattr(X, "toarray"):
    X = X.toarray()

# X has shape (n_cells, n_isoforms); axis=0 computes one value per isoform across cells
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)
gene_mode_result = mode(X, axis=0)
# `.mode` is (1, n) on older SciPy but (n,) on SciPy >= 1.11. Indexing [0] on the latter
# yields a single scalar that pandas broadcasts to every row, so flatten instead.
gene_mode = np.asarray(gene_mode_result.mode).ravel()

# One row per isoform
TCell_summary_df_i = pd.DataFrame({
    "gene": TCell_i_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(TCell_summary_df_i)

In [None]:
# One histogram per T-cell marker gene (gene-level T-cell subset).
# The values are read from TCell_g_subset itself rather than from the notebook-global X:
# X is overwritten by every summary cell and, when cells run in order, holds the
# isoform-level matrix at this point, which would be plotted under gene names.
tcell_g_log_expr = TCell_g_subset.layers["log_raw"]
if hasattr(tcell_g_log_expr, "toarray"):
    tcell_g_log_expr = tcell_g_log_expr.toarray()  # sparse -> dense so columns are plain 1-D arrays

n_genes = tcell_g_log_expr.shape[1]
# NOTE: the plotted values are log1p(counts) even though the title says "Raw Count".
for i, gene in enumerate(TCell_g_subset.var_names[:n_genes]):
    plt.figure(figsize=(6, 4))
    plt.hist(tcell_g_log_expr[:, i], bins=50, color='skyblue', edgecolor='black')
    plt.title(f'Raw Count Histogram for {gene}')
    plt.xlabel('Expression Level')
    plt.ylabel('Frequency')
    plt.tight_layout()
    # To keep a file as well, call plt.savefig(f'histogram_{gene}.png') before plt.show().
    plt.show()

In [None]:
# One histogram per T-cell marker isoform (isoform-level T-cell subset).
# The values are read from TCell_i_subset itself rather than from the notebook-global X,
# which is overwritten by every summary cell and is only correct for one execution order.
tcell_i_log_expr = TCell_i_subset.layers["log_raw"]
if hasattr(tcell_i_log_expr, "toarray"):
    tcell_i_log_expr = tcell_i_log_expr.toarray()  # sparse -> dense so columns are plain 1-D arrays

n_genes = tcell_i_log_expr.shape[1]
# NOTE: the plotted values are log1p(counts) even though the title says "Raw Count".
for i, gene in enumerate(TCell_i_subset.var_names[:n_genes]):
    plt.figure(figsize=(6, 4))
    plt.hist(tcell_i_log_expr[:, i], bins=50, color='skyblue', edgecolor='black')
    plt.title(f'Raw Count Histogram for {gene}')
    plt.xlabel('Expression Level')
    plt.ylabel('Frequency')
    plt.tight_layout()
    # To keep a file as well, call plt.savefig(f'histogram_{gene}.png') before plt.show().
    plt.show()

In [None]:
# Per-gene summary statistics of the NK markers within the gene-level NK-cell subset.

# Keep only the NK marker genes (NK_genes)
NKCell_g_subset =NKCell_g[:, NK_genes].copy()

# Expression values: the "log_raw" layer (log1p of the "counts" layer)
X = NKCell_g_subset.layers["log_raw"]

# If the expression matrix is sparse, convert it to a dense NumPy array.
if hasattr(X, "toarray"):
    X = X.toarray()

# X has shape (n_cells, n_genes); axis=0 computes one value per gene across cells
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)
gene_mode_result = mode(X, axis=0)
# `.mode` is (1, n_genes) on older SciPy but (n_genes,) on SciPy >= 1.11. Indexing [0]
# on the latter yields a single scalar that pandas broadcasts to every row, so flatten instead.
gene_mode = np.asarray(gene_mode_result.mode).ravel()

# One row per gene
NKCell_summary_df_g = pd.DataFrame({
    "gene": NKCell_g_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(NKCell_summary_df_g)

In [None]:
# Per-isoform summary statistics of the NK markers within the isoform-level NK-cell subset.

# Keep only the isoforms of the NK marker genes (NK_isos)
NKCell_i_subset =NKCell_i[:, NK_isos].copy()

# Expression values: the "log_raw" layer (log1p of the "counts" layer)
X = NKCell_i_subset.layers["log_raw"]

# If the expression matrix is sparse, convert it to a dense NumPy array.
if hasattr(X, "toarray"):
    X = X.toarray()

# X has shape (n_cells, n_isoforms); axis=0 computes one value per isoform across cells
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)
gene_mode_result = mode(X, axis=0)
# `.mode` is (1, n) on older SciPy but (n,) on SciPy >= 1.11. Indexing [0] on the latter
# yields a single scalar that pandas broadcasts to every row, so flatten instead.
gene_mode = np.asarray(gene_mode_result.mode).ravel()

# One row per isoform
NKCell_summary_df_i = pd.DataFrame({
    "gene": NKCell_i_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(NKCell_summary_df_i)

In [None]:
# One histogram per NK marker gene (gene-level NK-cell subset).
# The layer is densified first, as in the summary cells: column-slicing a sparse
# matrix does not give plt.hist a plain 1-D array.
nk_g_log_expr = NKCell_g_subset.layers["log_raw"]
if hasattr(nk_g_log_expr, "toarray"):
    nk_g_log_expr = nk_g_log_expr.toarray()

# Taken from the matrix being plotted, not from the unrelated notebook-global X.
n_genes = nk_g_log_expr.shape[1]
# NOTE: the plotted values are log1p(counts) even though the title says "Raw Count".
for i, gene in enumerate(NKCell_g_subset.var_names):
    plt.figure(figsize=(6, 4))
    plt.hist(nk_g_log_expr[:, i], bins=50, color='skyblue', edgecolor='black')
    plt.title(f'Raw Count Histogram for {gene}')
    plt.xlabel('Expression Level')
    plt.ylabel('Frequency')
    plt.tight_layout()
    # To keep a file as well, call plt.savefig(f'histogram_{gene}.png') before plt.show().
    plt.show()

In [None]:
# One histogram per NK marker isoform (isoform-level NK-cell subset).
# The layer is densified first, as in the summary cells: column-slicing a sparse
# matrix does not give plt.hist a plain 1-D array.
nk_i_log_expr = NKCell_i_subset.layers["log_raw"]
if hasattr(nk_i_log_expr, "toarray"):
    nk_i_log_expr = nk_i_log_expr.toarray()

# Taken from the matrix being plotted, not from the notebook-global X.
n_genes = nk_i_log_expr.shape[1]
# NOTE: the plotted values are log1p(counts) even though the title says "Raw Count".
for i, gene in enumerate(NKCell_i_subset.var_names):
    plt.figure(figsize=(6, 4))
    plt.hist(nk_i_log_expr[:, i], bins=50, color='skyblue', edgecolor='black')
    plt.title(f'Raw Count Histogram for {gene}')
    plt.xlabel('Expression Level')
    plt.ylabel('Frequency')
    plt.tight_layout()
    # To keep a file as well, call plt.savefig(f'histogram_{gene}.png') before plt.show().
    plt.show()

In [None]:
# Per-gene summary statistics within the gene-level monocyte-derived subset.

# Keep only the marker genes of the Myeloid panel (Myeloid_genes).
# NOTE(review): Monocyte_genes also exists but is not used here — confirm the Myeloid panel is intended.
Monocyte_g_subset =Monocyte_g[:, Myeloid_genes].copy()

# Expression values: the "log_raw" layer (log1p of the "counts" layer)
X = Monocyte_g_subset.layers["log_raw"]

# If the expression matrix is sparse, convert it to a dense NumPy array.
if hasattr(X, "toarray"):
    X = X.toarray()

# X has shape (n_cells, n_genes); axis=0 computes one value per gene across cells
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)
gene_mode_result = mode(X, axis=0)
# `.mode` is (1, n_genes) on older SciPy but (n_genes,) on SciPy >= 1.11. Indexing [0]
# on the latter yields a single scalar that pandas broadcasts to every row, so flatten instead.
gene_mode = np.asarray(gene_mode_result.mode).ravel()

# One row per gene
Monocyte_summary_df_g = pd.DataFrame({
    "gene": Monocyte_g_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(Monocyte_summary_df_g)

In [None]:
# Per-isoform summary statistics within the isoform-level monocyte-derived subset.

# Keep only the isoforms of the Myeloid panel (Myeloid_isos).
# NOTE(review): Monocyte_isos also exists but is not used here — confirm the Myeloid panel is intended.
Monocyte_i_subset =Monocyte_i[:, Myeloid_isos].copy()

# Expression values: the "log_raw" layer (log1p of the "counts" layer)
X =Monocyte_i_subset.layers["log_raw"]

# If the expression matrix is sparse, convert it to a dense NumPy array.
if hasattr(X, "toarray"):
    X = X.toarray()

# X has shape (n_cells, n_isoforms); axis=0 computes one value per isoform across cells
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)
gene_mode_result = mode(X, axis=0)
# `.mode` is (1, n) on older SciPy but (n,) on SciPy >= 1.11. Indexing [0] on the latter
# yields a single scalar that pandas broadcasts to every row, so flatten instead.
gene_mode = np.asarray(gene_mode_result.mode).ravel()

# One row per isoform
Monocyte_summary_df_i = pd.DataFrame({
    "gene": Monocyte_i_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(Monocyte_summary_df_i)

In [None]:
# One histogram per marker gene of the gene-level monocyte-derived subset.
# The layer is densified first, as in the summary cells: column-slicing a sparse
# matrix does not give plt.hist a plain 1-D array.
mono_g_log_expr = Monocyte_g_subset.layers["log_raw"]
if hasattr(mono_g_log_expr, "toarray"):
    mono_g_log_expr = mono_g_log_expr.toarray()

# Taken from the matrix being plotted, not from the unrelated notebook-global X.
n_genes = mono_g_log_expr.shape[1]
# NOTE: the plotted values are log1p(counts) even though the title says "Raw Count".
for i, gene in enumerate(Monocyte_g_subset.var_names):
    plt.figure(figsize=(6, 4))
    plt.hist(mono_g_log_expr[:, i], bins=50, color='skyblue', edgecolor='black')
    plt.title(f'Raw Count Histogram for {gene}')
    plt.xlabel('Expression Level')
    plt.ylabel('Frequency')
    plt.tight_layout()
    # To keep a file as well, call plt.savefig(f'histogram_{gene}.png') before plt.show().
    plt.show()

In [None]:
# One histogram per marker isoform of the isoform-level monocyte-derived subset.
# Data and labels both come from Monocyte_i_subset; the earlier reference to
# `Myeloid_i_subset` pointed at a name that is not defined alongside these cells.
# The layer is densified first, as in the summary cells: column-slicing a sparse
# matrix does not give plt.hist a plain 1-D array.
mono_i_log_expr = Monocyte_i_subset.layers["log_raw"]
if hasattr(mono_i_log_expr, "toarray"):
    mono_i_log_expr = mono_i_log_expr.toarray()

# Taken from the matrix being plotted, not from the notebook-global X.
n_genes = mono_i_log_expr.shape[1]
# NOTE: the plotted values are log1p(counts) even though the title says "Raw Count".
for i, gene in enumerate(Monocyte_i_subset.var_names):
    plt.figure(figsize=(6, 4))
    plt.hist(mono_i_log_expr[:, i], bins=50, color='skyblue', edgecolor='black')
    plt.title(f'Raw Count Histogram for {gene}')
    plt.xlabel('Expression Level')
    plt.ylabel('Frequency')
    plt.tight_layout()
    # To keep a file as well, call plt.savefig(f'histogram_{gene}.png') before plt.show().
    plt.show()

In [None]:
# Per-gene summary statistics of the B-cell markers within the gene-level B-cell subset.

# Keep only the B-cell marker genes (BCell_genes)
BCell_g_subset =BCell_g[:, BCell_genes].copy()

# Expression values: the "log_raw" layer (log1p of the "counts" layer)
X = BCell_g_subset.layers["log_raw"]

# If the expression matrix is sparse, convert it to a dense NumPy array.
if hasattr(X, "toarray"):
    X = X.toarray()

# X has shape (n_cells, n_genes); axis=0 computes one value per gene across cells
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)
gene_mode_result = mode(X, axis=0)
# `.mode` is (1, n_genes) on older SciPy but (n_genes,) on SciPy >= 1.11. Indexing [0]
# on the latter yields a single scalar that pandas broadcasts to every row, so flatten instead.
gene_mode = np.asarray(gene_mode_result.mode).ravel()

# One row per gene
BCell_summary_df_g = pd.DataFrame({
    "gene": BCell_g_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(BCell_summary_df_g)

In [None]:
# Per-isoform summary statistics of the B-cell markers within the isoform-level B-cell subset.

# Keep only the isoforms of the B-cell marker genes (BCell_isos)
BCell_i_subset =BCell_i[:, BCell_isos].copy()

# Expression values: the "log_raw" layer (log1p of the "counts" layer)
X = BCell_i_subset.layers["log_raw"]

# If the expression matrix is sparse, convert it to a dense NumPy array.
if hasattr(X, "toarray"):
    X = X.toarray()

# X has shape (n_cells, n_isoforms); axis=0 computes one value per isoform across cells
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)
gene_mode_result = mode(X, axis=0)
# `.mode` is (1, n) on older SciPy but (n,) on SciPy >= 1.11. Indexing [0] on the latter
# yields a single scalar that pandas broadcasts to every row, so flatten instead.
gene_mode = np.asarray(gene_mode_result.mode).ravel()

# One row per isoform
BCell_summary_df_i = pd.DataFrame({
    "gene": BCell_i_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(BCell_summary_df_i)

In [None]:
# One histogram per B-cell marker gene (gene-level B-cell subset).
# The layer is densified first, as in the summary cells: column-slicing a sparse
# matrix does not give plt.hist a plain 1-D array.
bcell_g_log_expr = BCell_g_subset.layers["log_raw"]
if hasattr(bcell_g_log_expr, "toarray"):
    bcell_g_log_expr = bcell_g_log_expr.toarray()

# Taken from the matrix being plotted, not from the unrelated notebook-global X.
n_genes = bcell_g_log_expr.shape[1]
# NOTE: the plotted values are log1p(counts) even though the title says "Raw Count".
for i, gene in enumerate(BCell_g_subset.var_names):
    plt.figure(figsize=(6, 4))
    plt.hist(bcell_g_log_expr[:, i], bins=50, color='skyblue', edgecolor='black')
    plt.title(f'Raw Count Histogram for {gene}')
    plt.xlabel('Expression Level')
    plt.ylabel('Frequency')
    plt.tight_layout()
    # To keep a file as well, call plt.savefig(f'histogram_{gene}.png') before plt.show().
    plt.show()

In [None]:
# One histogram per B-cell marker isoform (isoform-level B-cell subset).
# The layer is densified first, as in the summary cells: column-slicing a sparse
# matrix does not give plt.hist a plain 1-D array.
bcell_i_log_expr = BCell_i_subset.layers["log_raw"]
if hasattr(bcell_i_log_expr, "toarray"):
    bcell_i_log_expr = bcell_i_log_expr.toarray()

# Taken from the matrix being plotted, not from the notebook-global X.
n_genes = bcell_i_log_expr.shape[1]
# NOTE: the plotted values are log1p(counts) even though the title says "Raw Count".
for i, gene in enumerate(BCell_i_subset.var_names):
    plt.figure(figsize=(6, 4))
    plt.hist(bcell_i_log_expr[:, i], bins=50, color='skyblue', edgecolor='black')
    plt.title(f'Raw Count Histogram for {gene}')
    plt.xlabel('Expression Level')
    plt.ylabel('Frequency')
    plt.tight_layout()
    # To keep a file as well, call plt.savefig(f'histogram_{gene}.png') before plt.show().
    plt.show()

In [None]:
# Restrict the megakaryocyte gene-level AnnData to the megakaryocyte marker genes.
MK_g_subset = MK_g[:, MK_genes].copy()

# Work on the "log_raw" layer.
X = MK_g_subset.layers["log_raw"]

# Sparse matrices expose .toarray(); densify so the NumPy reductions below apply.
if hasattr(X, "toarray"):
    X = X.toarray()

# X is (n_cells, n_genes); reduce over cells (axis=0) to get one value per gene.
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)

# `mode` is assumed to be scipy.stats.mode, imported elsewhere in the notebook (TODO confirm).
# Depending on the SciPy version, `.mode` comes back with shape (1, n_genes) or
# (n_genes,). Indexing with [0] on the latter yields a single scalar that pandas
# would broadcast to every row, so flatten to one value per gene instead.
gene_mode_result = mode(X, axis=0)
gene_mode = np.ravel(gene_mode_result.mode)

# One row per marker gene with its summary statistics.
MK_summary_df_g = pd.DataFrame({
    "gene": MK_g_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(MK_summary_df_g)

In [None]:
# Restrict the megakaryocyte isoform-level AnnData to the megakaryocyte marker isoforms.
MK_i_subset = MK_i[:, MK_isos].copy()

# Work on the "log_raw" layer.
X = MK_i_subset.layers["log_raw"]

# Sparse matrices expose .toarray(); densify so the NumPy reductions below apply.
if hasattr(X, "toarray"):
    X = X.toarray()

# X is (n_cells, n_features); reduce over cells (axis=0) to get one value per feature.
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)

# `mode` is assumed to be scipy.stats.mode, imported elsewhere in the notebook (TODO confirm).
# Depending on the SciPy version, `.mode` comes back with shape (1, n_features) or
# (n_features,). Indexing with [0] on the latter yields a single scalar that pandas
# would broadcast to every row, so flatten to one value per feature instead.
gene_mode_result = mode(X, axis=0)
gene_mode = np.ravel(gene_mode_result.mode)

# One row per marker isoform with its summary statistics.
MK_summary_df_i = pd.DataFrame({
    "gene": MK_i_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(MK_summary_df_i)

In [None]:
# Plot one histogram per marker gene for the megakaryocyte gene-level subset.
# The summary cells treat the "log_raw" layer as possibly sparse, so densify it
# once here as well; plt.hist expects plain array-like input.
MK_g_log_raw = MK_g_subset.layers["log_raw"]
if hasattr(MK_g_log_raw, "toarray"):
    MK_g_log_raw = MK_g_log_raw.toarray()

# Column count of the matrix actually plotted, not of a global X left by another cell.
n_genes = MK_g_log_raw.shape[1]
for i, gene in enumerate(MK_g_subset.var_names):
    plt.figure(figsize=(6, 4))
    # NOTE(review): the title says "Raw Count" but the values come from the "log_raw" layer.
    plt.hist(MK_g_log_raw[:, i], bins=50, color='skyblue', edgecolor='black')
    plt.title(f'Raw Count Histogram for {gene}')
    plt.xlabel('Expression Level')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()
    # If you wish to save the figure instead of or in addition to showing it, use:
    # plt.savefig(f'histogram_{gene}.png')
    # plt.close()

In [None]:
# Plot one histogram per marker isoform for the megakaryocyte isoform-level subset.
# The summary cells treat the "log_raw" layer as possibly sparse, so densify it
# once here as well; plt.hist expects plain array-like input.
MK_i_log_raw = MK_i_subset.layers["log_raw"]
if hasattr(MK_i_log_raw, "toarray"):
    MK_i_log_raw = MK_i_log_raw.toarray()

# Column count of the matrix actually plotted, not of a global X left by another cell.
n_genes = MK_i_log_raw.shape[1]
for i, gene in enumerate(MK_i_subset.var_names):
    plt.figure(figsize=(6, 4))
    # NOTE(review): the title says "Raw Count" but the values come from the "log_raw" layer.
    plt.hist(MK_i_log_raw[:, i], bins=50, color='skyblue', edgecolor='black')
    plt.title(f'Raw Count Histogram for {gene}')
    plt.xlabel('Expression Level')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()
    # If you wish to save the figure instead of or in addition to showing it, use:
    # plt.savefig(f'histogram_{gene}.png')
    # plt.close()

In [None]:
# --- Split the data by annotated cell type ('gen_cell_type_reannotated' in .obs) ---
# Distinct labels present in the gene-level and isoform-level objects.
unique_cell_types_g = adata_g_filtered.obs['gen_cell_type_reannotated'].unique()
unique_cell_types_i = adata_i_filtered.obs['gen_cell_type_reannotated'].unique()

# label -> AnnData subset. The isoform dictionary is only created here, not filled.
subcelltype_datasets_g = dict()
subcelltype_datasets_i = dict()

for cell_type in unique_cell_types_g:
    # Boolean row mask selecting the cells carrying this label.
    is_this_type = adata_g_filtered.obs['gen_cell_type_reannotated'] == cell_type
    subcelltype_datasets_g[cell_type] = adata_g_filtered[is_this_type].copy()
    subset = subcelltype_datasets_g[cell_type]
    # Persist the subset; the cell-type label is embedded verbatim in the file name.
    subset.write_h5ad(f"PBMC_{cell_type}_gene_subtype_raw_counts_06022024.h5ad", compression="gzip")
    print(f"Saved dataset for {cell_type} with {subset.n_obs} cells.")

In [None]:
for cell_type in unique_cell_types_i:
    # Boolean row mask selecting the isoform-level cells carrying this label.
    is_this_type = adata_i_filtered.obs['gen_cell_type_reannotated'] == cell_type
    subcelltype_datasets_i[cell_type] = adata_i_filtered[is_this_type].copy()
    subset = subcelltype_datasets_i[cell_type]
    # Persist the subset; the cell-type label is embedded verbatim in the file name.
    subset.write_h5ad(f"PBMC_{cell_type}_iso_subtype_raw_counts_06022024.h5ad", compression="gzip")
    print(f"Saved dataset for {cell_type} with {subset.n_obs} cells.")

# The per-cell-type subsets now live in 'subcelltype_datasets_i', keyed by label,
# e.g. subcelltype_datasets_i["BCells"].X

In [None]:
# Bind each per-cell-type subset to a short module-level name.
# Every lookup raises KeyError if the label is absent from the dictionary.
# NOTE(review): the gene-level and isoform-level label sets differ — gene-level has
# "Effector CD4 TCells #1/#2/#3", isoform-level has a single "Effector CD4 TCells"
# plus "Unspecified TCells". Confirm this matches the two annotations.

# Gene-level subsets
NKCells_g = subcelltype_datasets_g["NK Cells"]
CD8_TCells_g = subcelltype_datasets_g["Effector CD8 TCells"]
CD4_TCells_1_g = subcelltype_datasets_g["Effector CD4 TCells #1"]
CD4_TCells_2_g = subcelltype_datasets_g["Effector CD4 TCells #2"]
CD4_TCells_3_g = subcelltype_datasets_g["Effector CD4 TCells #3"]
Mem_TCells_g = subcelltype_datasets_g["Memory TCells"]
Monocytes_g = subcelltype_datasets_g["Monocyte-derived"]
BCells_g = subcelltype_datasets_g["BCells"]
Megakaryotes_g = subcelltype_datasets_g["Megakaryocytes"]

# Isoform-level subsets
NKCells_i = subcelltype_datasets_i["NK Cells"]
CD8_TCells_i = subcelltype_datasets_i["Effector CD8 TCells"]
CD4_TCells_i = subcelltype_datasets_i["Effector CD4 TCells"]
Mem_TCells_i = subcelltype_datasets_i["Memory TCells"]
Unspecified_TCells_i = subcelltype_datasets_i["Unspecified TCells"]
Monocytes_i = subcelltype_datasets_i["Monocyte-derived"]
BCells_i = subcelltype_datasets_i["BCells"]
Megakaryotes_i = subcelltype_datasets_i["Megakaryocytes"]

In [None]:
# Marker panels, expressed as var_name prefixes for find_matching_genes.
# Each entry ends with ":" so that a symbol only matches itself (e.g. "CD4:" does
# not match a name starting with "CD40").
# presumably var_names look like "<symbol>:<id>" — verify against the data.
# "NCAM1" and "CLEC7A" were missing the trailing ":" and therefore matched any
# name merely starting with those characters; they now follow the same convention.
TCell_Markers = ["CD4:", "CD3D:", "CD3E:"]
TMem = ["CCR7:", "SELL:", "TCF7:"]
Cytotoxic = ["CD8A:", "CD8B:", "GATA3:", "KLRB1:", "CCL5:"]
CD4Effector = ["CD4:", "IL2RA:", "GATA3:", "AHR:"]
BCells = ["MS4A1:", "CD19:", "CD79A:", "CD22:"]
NKCells = ["KLRD1:", "GZMB:", "FCGR3A:", "IL2RB:", "CD226:", "ITGAM:", "NCAM1:"]
Myeloid = ["ITGAM:", "ITGAX:", "CD33:", "CD14:", "CD1C:", "HLA-DRA:", "HLA-DRB1:", "FCGR3A:", "FCGR2A:", "CLEC7A:", "LILRB4:"]
Monocytes = ["CD14:", "LYZ:", "VCAN:", "FCN1:", "CST3:", "S100A8:", "S100A9:", "FCGR3A:", "CX3CR1:"]
DC = ["CD1C:", "CLEC10A:", "FCER1A:", "CST3:"]
pDC = ["CLEC4C:", "LILRA4:", "GZMB:", "TCL1A:"]
MK = ["PPBP:", "PF4:", "ITGA2B:", "GP1BA:", "VWF:"]

# Select the var_names (combined_ID format) that begin with any marker prefix.
def find_matching_genes(prefixes, gene_list):
    """Return the entries of `gene_list` that start with at least one of `prefixes`.

    Parameters
    ----------
    prefixes : iterable of str
        Prefixes to look for (e.g. "CD4:").
    gene_list : iterable of str
        Candidate names, typically an AnnData ``var_names`` index.

    Returns
    -------
    list of str
        Matching names, in the order they appear in `gene_list`.
    """
    matches = []
    for candidate in gene_list:
        # str.startswith accepts a tuple and is True if any element is a prefix.
        if candidate.startswith(tuple(prefixes)):
            matches.append(candidate)
    return matches

In [None]:
# Resolve every marker panel to the concrete var_names present in each dataset.
_gene_level_names = adata_g_filtered.var_names
_isoform_level_names = adata_i_filtered.var_names

# Gene-level matches
TCell_genes = find_matching_genes(TCell_Markers, _gene_level_names)
Cytotoxic_genes = find_matching_genes(Cytotoxic, _gene_level_names)
CD4Effector_genes = find_matching_genes(CD4Effector, _gene_level_names)
TMem_genes = find_matching_genes(TMem, _gene_level_names)
NK_genes = find_matching_genes(NKCells, _gene_level_names)
Myeloid_genes = find_matching_genes(Myeloid, _gene_level_names)
Monocyte_genes = find_matching_genes(Monocytes, _gene_level_names)
BCell_genes = find_matching_genes(BCells, _gene_level_names)
DC_genes = find_matching_genes(DC, _gene_level_names)
pDC_genes = find_matching_genes(pDC, _gene_level_names)
MK_genes = find_matching_genes(MK, _gene_level_names)

# Isoform-level matches
# NOTE(review): `Treg` is not among the marker lists defined in the cell above and
# must come from elsewhere in the notebook; the Cytotoxic / CD4Effector panels have
# no isoform-level counterpart here. Confirm both are intended.
TCell_isos = find_matching_genes(TCell_Markers, _isoform_level_names)
TReg_isos = find_matching_genes(Treg, _isoform_level_names)
TMem_isos = find_matching_genes(TMem, _isoform_level_names)
NK_isos = find_matching_genes(NKCells, _isoform_level_names)
Myeloid_isos = find_matching_genes(Myeloid, _isoform_level_names)
Monocyte_isos = find_matching_genes(Monocytes, _isoform_level_names)
BCell_isos = find_matching_genes(BCells, _isoform_level_names)
DC_isos = find_matching_genes(DC, _isoform_level_names)
pDC_isos = find_matching_genes(pDC, _isoform_level_names)
MK_isos = find_matching_genes(MK, _isoform_level_names)

In [None]:
# Restrict the CD8 T-cell gene-level AnnData to the T-cell marker genes.
CD8_TCells_g_subset = CD8_TCells_g[:, TCell_genes].copy()

# Work on the "log_raw" layer.
X = CD8_TCells_g_subset.layers["log_raw"]

# Sparse matrices expose .toarray(); densify so the NumPy reductions below apply.
if hasattr(X, "toarray"):
    X = X.toarray()

# X is (n_cells, n_genes); reduce over cells (axis=0) to get one value per gene.
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)

# Label the rows with the var_names of the subset the statistics were computed
# from (CD8_TCells_g_subset), not with those of an unrelated subset.
# NOTE(review): the result keeps its historical name although it now describes
# the CD8 T-cell subset; rename once downstream uses are checked.
Naive_TCell_1_summary_df_g = pd.DataFrame({
    "gene": CD8_TCells_g_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median
})

print(Naive_TCell_1_summary_df_g)

In [None]:
# Plot one histogram per column of X for the CD8 T-cell gene-level subset.
# X is the dense matrix taken from CD8_TCells_g_subset, so the titles
# must come from that same subset's var_names.
n_genes = X.shape[1]
for i, gene in enumerate(CD8_TCells_g_subset.var_names[:n_genes]):
    plt.figure(figsize=(6, 4))
    # NOTE(review): the title says "Raw Count" but X holds the "log_raw" layer.
    plt.hist(X[:, i], bins=50, color='skyblue', edgecolor='black')
    plt.title(f'Raw Count Histogram for {gene}')
    plt.xlabel('Expression Level')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()
    # If you wish to save the figure instead of or in addition to showing it, use:
    # plt.savefig(f'histogram_{gene}.png')
    # plt.close()

In [None]:
# Keep only the T-cell marker genes of the second naive T-cell gene-level dataset.
Naive_TCell_2_g_subset = Naive_TCells_2_g[:, TCell_genes].copy()

# Take the "log_raw" layer, densifying it when it is a sparse matrix.
X = Naive_TCell_2_g_subset.layers["log_raw"]
X = X.toarray() if hasattr(X, "toarray") else X

# Per-gene statistics: X is (n_cells, n_genes), so reduce along axis 0.
gene_min, gene_max = np.min(X, axis=0), np.max(X, axis=0)
gene_mean, gene_median = np.mean(X, axis=0), np.median(X, axis=0)

# Assemble one row per gene.
Naive_TCell_2_summary_df_g = pd.DataFrame(dict(
    gene=Naive_TCell_2_g_subset.var_names,
    min_expression=gene_min,
    max_expression=gene_max,
    mean_expression=gene_mean,
    median_expression=gene_median,
))

print(Naive_TCell_2_summary_df_g)

In [None]:
# Draw one histogram per column of X, titled with the matching var_name.
n_genes = X.shape[1]
for i, gene in enumerate(Naive_TCell_2_g_subset.var_names[:n_genes]):
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(X[:, i], bins=50, color='skyblue', edgecolor='black')
    ax.set_title(f'Raw Count Histogram for {gene}')
    ax.set_xlabel('Expression Level')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()
    # To also write the figure to disk, save it before showing, e.g.:
    # fig.savefig(f'histogram_{gene}.png')
    # plt.close(fig)

In [None]:
# Keep only the T-cell marker isoforms of the first naive T-cell isoform-level dataset.
Naive_TCell_1_i_subset = Naive_TCells_1_i[:, TCell_isos].copy()

# Take the "log_raw" layer, densifying it when it is a sparse matrix.
X = Naive_TCell_1_i_subset.layers["log_raw"]
X = X.toarray() if hasattr(X, "toarray") else X

# Per-feature statistics: X is (n_cells, n_features), so reduce along axis 0.
gene_min, gene_max = np.min(X, axis=0), np.max(X, axis=0)
gene_mean, gene_median = np.mean(X, axis=0), np.median(X, axis=0)

# Assemble one row per isoform.
Naive_TCell_1_summary_df_i = pd.DataFrame(dict(
    gene=Naive_TCell_1_i_subset.var_names,
    min_expression=gene_min,
    max_expression=gene_max,
    mean_expression=gene_mean,
    median_expression=gene_median,
))

print(Naive_TCell_1_summary_df_i)

In [None]:
# Draw one histogram per column of X, titled with the matching var_name.
n_genes = X.shape[1]
for i, gene in enumerate(Naive_TCell_1_i_subset.var_names[:n_genes]):
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(X[:, i], bins=50, color='skyblue', edgecolor='black')
    ax.set_title(f'Raw Count Histogram for {gene}')
    ax.set_xlabel('Expression Level')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()
    # To also write the figure to disk, save it before showing, e.g.:
    # fig.savefig(f'histogram_{gene}.png')
    # plt.close(fig)

In [None]:
# Restrict the memory T-cell AnnData to the T-cell marker genes.
Memory_TCell_subset = Memory_TCells_data[:, TCell_genes].copy()

# Work on the "log_raw" layer.
X = Memory_TCell_subset.layers["log_raw"]

# Sparse matrices expose .toarray(); densify so the NumPy reductions below apply.
if hasattr(X, "toarray"):
    X = X.toarray()

# X is (n_cells, n_genes); reduce over cells (axis=0) to get one value per gene.
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)

# `mode` is assumed to be scipy.stats.mode, imported elsewhere in the notebook (TODO confirm).
# Depending on the SciPy version, `.mode` comes back with shape (1, n_genes) or
# (n_genes,). Indexing with [0] on the latter yields a single scalar that pandas
# would broadcast to every row, so flatten to one value per gene instead.
gene_mode_result = mode(X, axis=0)
gene_mode = np.ravel(gene_mode_result.mode)

# One row per marker gene with its summary statistics.
Memory_TCell_summary_df = pd.DataFrame({
    "gene": Memory_TCell_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(Memory_TCell_summary_df)

In [None]:
# Draw one histogram per column of X, titled with the matching var_name.
n_genes = X.shape[1]
for i, gene in enumerate(Memory_TCell_subset.var_names[:n_genes]):
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(X[:, i], bins=50, color='skyblue', edgecolor='black')
    ax.set_title(f'Raw Count Histogram for {gene}')
    ax.set_xlabel('Expression Level')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()
    # To also write the figure to disk, save it before showing, e.g.:
    # fig.savefig(f'histogram_{gene}.png')
    # plt.close(fig)

In [None]:
# Restrict the CD8 T-cell AnnData to the T-cell marker genes.
CD8_TCell_subset = CD8_TCells_data[:, TCell_genes].copy()

# Work on the "log_raw" layer.
X = CD8_TCell_subset.layers["log_raw"]

# Sparse matrices expose .toarray(); densify so the NumPy reductions below apply.
if hasattr(X, "toarray"):
    X = X.toarray()

# X is (n_cells, n_genes); reduce over cells (axis=0) to get one value per gene.
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)

# `mode` is assumed to be scipy.stats.mode, imported elsewhere in the notebook (TODO confirm).
# Depending on the SciPy version, `.mode` comes back with shape (1, n_genes) or
# (n_genes,). Indexing with [0] on the latter yields a single scalar that pandas
# would broadcast to every row, so flatten to one value per gene instead.
gene_mode_result = mode(X, axis=0)
gene_mode = np.ravel(gene_mode_result.mode)

# Row labels come from the subset the statistics were computed on
# (CD8_TCell_subset), so this cell does not depend on the memory T-cell cell.
CD8_TCell_summary_df = pd.DataFrame({
    "gene": CD8_TCell_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(CD8_TCell_summary_df)

In [None]:
# Draw one histogram per column of X, titled with the matching var_name.
n_genes = X.shape[1]
for i, gene in enumerate(CD8_TCell_subset.var_names[:n_genes]):
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(X[:, i], bins=50, color='skyblue', edgecolor='black')
    ax.set_title(f'Raw Count Histogram for {gene}')
    ax.set_xlabel('Expression Level')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()
    # To also write the figure to disk, save it before showing, e.g.:
    # fig.savefig(f'histogram_{gene}.png')
    # plt.close(fig)

In [None]:
# Restrict the monocyte AnnData to the monocyte marker genes.
Monocyte_subset = Monocyte_data[:, Monocyte_genes].copy()

# Work on the "log_raw" layer.
X = Monocyte_subset.layers["log_raw"]

# Sparse matrices expose .toarray(); densify so the NumPy reductions below apply.
if hasattr(X, "toarray"):
    X = X.toarray()

# X is (n_cells, n_genes); reduce over cells (axis=0) to get one value per gene.
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)

# `mode` is assumed to be scipy.stats.mode, imported elsewhere in the notebook (TODO confirm).
# Depending on the SciPy version, `.mode` comes back with shape (1, n_genes) or
# (n_genes,). Indexing with [0] on the latter yields a single scalar that pandas
# would broadcast to every row, so flatten to one value per gene instead.
gene_mode_result = mode(X, axis=0)
gene_mode = np.ravel(gene_mode_result.mode)

# One row per marker gene with its summary statistics.
Monocyte_summary_df = pd.DataFrame({
    "gene": Monocyte_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(Monocyte_summary_df)

In [None]:
# Draw one histogram per column of X, titled with the matching var_name.
n_genes = X.shape[1]
for i, gene in enumerate(Monocyte_subset.var_names[:n_genes]):
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(X[:, i], bins=50, color='skyblue', edgecolor='black')
    ax.set_title(f'Raw Count Histogram for {gene}')
    ax.set_xlabel('Expression Level')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()
    # To also write the figure to disk, save it before showing, e.g.:
    # fig.savefig(f'histogram_{gene}.png')
    # plt.close(fig)

In [None]:
# Combine the DC and monocyte marker matches into one panel.
# A name matched by both panels would otherwise appear twice and produce duplicate
# columns in the subset; dict.fromkeys drops repeats while keeping first-seen order.
Mono_Derived_DC_genes = list(dict.fromkeys(DC_genes + Monocyte_genes))

# Restrict the monocyte-derived DC AnnData to that combined panel.
Mono_Derived_DC_subset = Mono_Derived_DCs_data[:, Mono_Derived_DC_genes].copy()

# Work on the "log_raw" layer.
X = Mono_Derived_DC_subset.layers["log_raw"]

# Sparse matrices expose .toarray(); densify so the NumPy reductions below apply.
if hasattr(X, "toarray"):
    X = X.toarray()

# X is (n_cells, n_genes); reduce over cells (axis=0) to get one value per gene.
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)

# `mode` is assumed to be scipy.stats.mode, imported elsewhere in the notebook (TODO confirm).
# Depending on the SciPy version, `.mode` comes back with shape (1, n_genes) or
# (n_genes,). Indexing with [0] on the latter yields a single scalar that pandas
# would broadcast to every row, so flatten to one value per gene instead.
gene_mode_result = mode(X, axis=0)
gene_mode = np.ravel(gene_mode_result.mode)

# One row per marker gene with its summary statistics.
Mono_Derived_DC_summary_df = pd.DataFrame({
    "gene": Mono_Derived_DC_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(Mono_Derived_DC_summary_df)

In [None]:
# Draw one histogram per column of X, titled with the matching var_name.
n_genes = X.shape[1]
for i, gene in enumerate(Mono_Derived_DC_subset.var_names[:n_genes]):
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(X[:, i], bins=50, color='skyblue', edgecolor='black')
    ax.set_title(f'Raw Count Histogram for {gene}')
    ax.set_xlabel('Expression Level')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()
    # To also write the figure to disk, save it before showing, e.g.:
    # fig.savefig(f'histogram_{gene}.png')
    # plt.close(fig)

In [None]:
# Restrict the megakaryocyte AnnData to the megakaryocyte marker genes.
Megakaryote_subset = Megakaryotes_data[:, MK_genes].copy()

# Work on the "log_raw" layer.
X = Megakaryote_subset.layers["log_raw"]

# Sparse matrices expose .toarray(); densify so the NumPy reductions below apply.
if hasattr(X, "toarray"):
    X = X.toarray()

# X is (n_cells, n_genes); reduce over cells (axis=0) to get one value per gene.
gene_min = np.min(X, axis=0)
gene_max = np.max(X, axis=0)
gene_mean = np.mean(X, axis=0)
gene_median = np.median(X, axis=0)

# `mode` is assumed to be scipy.stats.mode, imported elsewhere in the notebook (TODO confirm).
# Depending on the SciPy version, `.mode` comes back with shape (1, n_genes) or
# (n_genes,). Indexing with [0] on the latter yields a single scalar that pandas
# would broadcast to every row, so flatten to one value per gene instead.
gene_mode_result = mode(X, axis=0)
gene_mode = np.ravel(gene_mode_result.mode)

# One row per marker gene with its summary statistics.
Megakaryote_summary_df = pd.DataFrame({
    "gene": Megakaryote_subset.var_names,
    "min_expression": gene_min,
    "max_expression": gene_max,
    "mean_expression": gene_mean,
    "median_expression": gene_median,
    "mode_expression": gene_mode,
})

print(Megakaryote_summary_df)

In [None]:
# Draw one histogram per column of X, titled with the matching var_name.
n_genes = X.shape[1]
for i, gene in enumerate(Megakaryote_subset.var_names[:n_genes]):
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(X[:, i], bins=50, color='skyblue', edgecolor='black')
    ax.set_title(f'Raw Count Histogram for {gene}')
    ax.set_xlabel('Expression Level')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()
    # To also write the figure to disk, save it before showing, e.g.:
    # fig.savefig(f'histogram_{gene}.png')
    # plt.close(fig)

In [None]:
# Store log(1 + counts) of the gene-level "counts" layer as a new "log_raw" layer.
gene_counts_layer = adata_g_filtered.layers["counts"]
adata_g_filtered.layers["log_raw"] = np.log1p(gene_counts_layer)
# The isoform-level equivalent is commented out in this cell:
#adata_i_filtered.layers["log_raw"] = np.log1p(adata_i_filtered.layers["counts"])

In [None]:
# Make the log-transformed layer the main matrix of the gene-level object.
# Note this rebinds .X in place, so whatever .X held before is replaced.
adata_g_filtered.X = adata_g_filtered.layers["log_raw"]

# Run PCA on log-transformed data
sc.pp.pca(adata_g_filtered, layer="log_raw")

# Compute UMAP
# Build the neighbor graph with 20 neighbors per cell, then embed.
# NOTE(review): no `use_rep` is passed, so the representation is scanpy's default
# choice — presumably the PCA computed above; confirm.
sc.pp.neighbors(adata_g_filtered, n_neighbors = 20)
sc.tl.umap(adata_g_filtered)

In [None]:
# Leiden clustering over a sweep of resolutions; each run is stored under the
# key '<resolution>_20lat_5e3'.
resolutions = [0.05, 0.08, 0.1, 0.15, 0.18, 0.2, 0.25, 0.28, 0.3, 0.35, 0.38]
for res in resolutions:
    leiden_key = f'{res}_20lat_5e3'
    sc.tl.leiden(
        adata_g_filtered,
        resolution=res,
        key_added=leiden_key,
        flavor="igraph",
        n_iterations=2,
    )

In [None]:
# Leiden clustering over a sweep of resolutions; each run is stored under the
# key '<resolution>_log_AutoZI'.
leiden_resolutions = [0.04, 0.06, 0.1, 0.14, 0.16, 0.2, 0.24, 0.26, 0.3, 0.34, 0.36, 0.4, 0.44, 0.46, 0.5]
for res in leiden_resolutions:
    sc.tl.leiden(
        adata_g_filtered,
        resolution=res,
        key_added=f'{res}_log_AutoZI',
        flavor="igraph",
        n_iterations=2,
    )

# Relabel clusters by size, once per clustering key. The keys are derived with the
# same f-string used for key_added above, so the two lists cannot drift apart.
# `resolutions` ends up holding the key strings, as before.
resolutions = [f'{res}_log_AutoZI' for res in leiden_resolutions]
for cluster_key in resolutions:
    # NOTE(review): the result is bound to `adata_g_filtered_pbmc`, but the plots
    # below use `adata_g_filtered`. This only works if relabel_clusters_by_size
    # modifies its argument in place — confirm.
    adata_g_filtered_pbmc = relabel_clusters_by_size(adata_g_filtered, cluster_key)

sc.pl.umap(adata_g_filtered, color=["batch"], title="UMAP Colored by Batch")

# UMAPs labelled by cluster for every resolution of the sweep.
# (Earlier sweep, kept for reference: 0.05, 0.08, 0.1, 0.15, 0.18, 0.2, 0.25, 0.28, 0.3, 0.35, 0.38)
plot_umap_with_labels_g(
    adata_g_filtered,
    resolutions=list(leiden_resolutions),
    use_rep_key='log_AutoZI',
)