In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import patches as mpatches
from matplotlib.colors import LinearSegmentedColormap, to_hex
import seaborn as sns
import scanpy as sc
import anndata as ad
from scipy import sparse
from pathlib import Path
import warnings
# NOTE(review): this suppresses every Python warning for the rest of the session,
# including deprecation and numerical warnings raised by the libraries imported above.
# Consider narrowing the filter to the specific categories that are known to be noisy.
warnings.filterwarnings('ignore')

# Set scanpy settings
sc.settings.verbosity = 3  # verbosity level (presumably scanpy's detailed "hints" level — TODO confirm)

# Simple confirmation that the import/config cell ran to completion.
print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
# Load the clustered AnnData object; the path is resolved relative to the
# notebook's current working directory.
adata_path = Path('colon_adata_clustered.h5ad')
adata = ad.read_h5ad(adata_path)

In [3]:
# Print the AnnData summary (dimensions and the names of the annotation fields)
# as a quick sanity check that the expected object was loaded.
print(adata)

AnnData object with n_obs × n_vars = 424423 × 3000
    obs: 'fov', 'RNA_RNA.QC.Module_Cell.Typing.InSituType.1_1_clusters', 'RNA_RNA.QC.Module_Cell.Typing.InSituType.1_1_posterior_probability', 'Area', 'AspectRatio', 'CenterX_local_px', 'CenterY_local_px', 'Width', 'Height', 'Mean.PanCK', 'Max.PanCK', 'Mean.G', 'Max.G', 'Mean.Membrane', 'Max.Membrane', 'Mean.CD45', 'Max.CD45', 'Mean.DAPI', 'Max.DAPI', 'SplitRatioToLocal', 'NucArea', 'NucAspectRatio', 'Circularity', 'Eccentricity', 'Perimeter', 'Solidity', 'cell_id', 'assay_type', 'version', 'Run_Tissue_name', 'Panel', 'cellSegmentationSetId', 'cellSegmentationSetName', 'slide_ID', 'CenterX_global_px', 'CenterY_global_px', 'cell_ID', 'unassignedTranscripts', 'median_RNA', 'RNA_quantile_0.75', 'RNA_quantile_0.8', 'RNA_quantile_0.85', 'RNA_quantile_0.9', 'RNA_quantile_0.95', 'RNA_quantile_0.99', 'nCount_RNA', 'nFeature_RNA', 'median_negprobes', 'negprobes_quantile_0.75', 'negprobes_quantile_0.8', 'negprobes_quantile_0.85', 'negprobes_quan

## Cell type marker feature plots

We use a curated set of markers for the epithelial, immune, stromal, and other compartments. First we check which marker genes are absent from the dataset, then we generate feature plots for the genes that are present.

In [29]:
# Curated marker genes, keyed by cell-type / cell-state label.
# Labels mapped to an empty list currently have no markers and are skipped by
# the reporting and plotting loops that iterate over this dictionary.
# A gene may appear under more than one label.
celltype_markers = {
    # --- Epithelial and tumor programs ---
    "Epithelial": [
        "KRT8", "KRT19", "KRT20", "TACSTD2",
        "CEACAM5", "REG4", "LGALS4", "SLC26A3",
        "SLC9A3", "AQP8", "CA1", "CA2",
        "FABP1", "GUCA2A", "MUC2", "FCGBP",
        "CLCA1", "AGR2", "SPINK4", "ZG16",
        "DEFA5", "DEFA6", "LYZ", "PLA2G2A",
        "REG3A", "CHGA", "SCG5", "NEUROD1",
        "OLFM4", "SMOC2", "CLDN1", "CLDN4",
        "CEACAM7", "DMBT1", "GCG", "RGS2",
    ],
    "Tumor_EMT_like": [
        "VIM", "ITGA5", "FN1", "ZEB1", "SNAI2",
        "TWIST1", "ZEB2", "COL1A1", "COL1A2", "TAGLN",
    ],
    "Tumor_Response_TGFB_AP1_JNK": ["TGFB2", "TGFBR2", "FOS"],

    # --- Immune ---
    "PanImmune": ["PTPRC", "CD74", "HLA-DRA", "HLA-DPA1"],
    "BCell": [],
    "PlasmaCell": ["MZB1", "JCHAIN", "IGHG1", "IGKC", "IGHA1"],
    "TCell": ["CD3D", "TRAC", "TRBC1", "CD4", "IL7R", "LTB", "GZMK", "CXCR4"],
    "CD4_TCell": ["CD4"],
    "CD8_TCell": [],
    "Th17": [],
    "NKCell": ["KLRD1", "FCGR3A", "NCAM1", "GNLY"],
    "Myeloid": ["TYROBP", "FCER1G", "AIF1", "FCGR3A"],
    "Monocyte": ["CD14", "S100A9", "LGALS3"],
    "Macrophage": [
        "C1QA", "C1QB", "C1QC", "APOE", "CTSB",
        "CTSD", "MSR1", "CD163", "FCER1G",
    ],
    "SPP1_TAM": ["SPP1", "LGALS3", "TREM2", "GPNMB", "CTSL"],
    "Dendritic": [],
    "MastCell": ["KIT", "IL1RL1"],
    "Treg": ["CTLA4", "TIGIT", "IKZF2"],

    # --- Endothelium ---
    "Endothelial": ["PECAM1", "KDR", "ENG", "PLVAP"],
    "Lymphatic_Endo": ["PDPN", "CCL21"],

    # --- Mural cells ---
    "Pericyte": ["STEAP4", "PDGFRB", "MCAM", "NOTCH3", "ABCC9", "DES"],
    "SMC": [
        "MYH11", "ACTG2", "DES", "ACTA2", "TAGLN",
        "CNN1", "MYL9", "TPM2", "CALD1",
    ],

    # --- Fibroblasts and stromal subsets ---
    "Fibroblast": [
        "VIM", "PDGFRA", "PDGFRB", "LUM", "COL1A1", "COL1A2", "COL3A1",
        "COL6A1", "COL14A1", "PI16", "COL5A1", "COL5A2", "LOX", "FBLN1",
        "FBLN2", "FN1", "VCAM1", "MGP", "SFRP2", "C3", "SPARC",
    ],
    "CCL8_Fibroblast": ["COL3A1"],
    "PDGFRAhi_Fibroblast": ["PDGFRA", "MMP1"],
    "Lymphoid_Stromal": ["CXCL13", "CCL19"],

    # --- Cancer-associated fibroblasts ---
    "CAF_myofibroblast": ["PDPN", "ACTA2", "TAGLN", "POSTN", "THY1", "INHBA"],
    "CAF_inflammatory": ["CXCL12", "IL6", "CCL2", "PTGS2", "CXCL8", "CSF1"],

    # --- Other compartments ---
    "Enteric_Glial": ["PLP1", "APOD"],
    "Adipose": ["FABP4", "ADIPOQ", "ADIRF"],
}

In [30]:
# Report, label by label, which marker genes are missing from adata.var_names.
# Membership is tested against a set so each lookup is constant-time.
genes_in_adata = set(adata.var_names)

print("Genes absent in adata, by label:\n")
for label, genes in celltype_markers.items():
    if genes:  # labels with an empty marker list are not reported
        absent = list(filter(lambda g: g not in genes_in_adata, genes))
        summary = absent if absent else "(all present)"
        print(f"  {label}: {summary}")
print("\nDone checking.")

Genes absent in adata, by label:

  Epithelial: (all present)
  Tumor_EMT_like: (all present)
  Tumor_Response_TGFB_AP1_JNK: (all present)
  PanImmune: (all present)
  PlasmaCell: (all present)
  TCell: (all present)
  CD4_TCell: (all present)
  NKCell: (all present)
  Myeloid: (all present)
  Monocyte: (all present)
  Macrophage: (all present)
  SPP1_TAM: (all present)
  MastCell: (all present)
  Treg: (all present)
  Endothelial: (all present)
  Lymphatic_Endo: (all present)
  Pericyte: (all present)
  SMC: (all present)
  Fibroblast: (all present)
  CCL8_Fibroblast: (all present)
  PDGFRAhi_Fibroblast: (all present)
  Lymphoid_Stromal: (all present)
  CAF_myofibroblast: (all present)
  CAF_inflammatory: (all present)
  Enteric_Glial: (all present)
  Adipose: (all present)

Done checking.


In [13]:
# Colormap shared by all feature plots: low expression is blue, high expression is red.
# (LinearSegmentedColormap is already imported in the notebook's import cell.)
colors = ["blue", "green", "yellow", "red"]
custom_cmap = LinearSegmentedColormap.from_list(
    "BlueGreenYellowRed",
    colors,
    N=256,
)

# Output directories, relative to the notebook's working directory.
# parents=True keeps each mkdir call independent of the order in which the
# directories are created.
feature_plot_root = Path("feature_plots")
feature_plot_root.mkdir(parents=True, exist_ok=True)
print(f"Feature plots will be saved to {feature_plot_root.resolve()}")

celltype_feature_dir = feature_plot_root / "per_celltype"
celltype_feature_dir.mkdir(parents=True, exist_ok=True)

Feature plots will be saved to /Users/brunondibambwayeroy/Documents/Research/YALE DATA/feature_plots


In [14]:
# Use UMAP if available, otherwise spatial.
# The embedding is chosen once and passed to sc.pl.embedding, so the fallback
# described above actually takes effect.
basis = "umap" if "X_umap" in adata.obsm else "spatial"

plots_created = 0
for label, genes in celltype_markers.items():
    if not genes:
        continue  # label has no markers listed
    # Keep only characters that are safe in a file name.
    safe_label = "".join(c if c.isalnum() or c in "_-" else "_" for c in label)

    for gene in genes:
        if gene not in adata.var_names:
            continue  # gene is not measured in this dataset
        sc.pl.embedding(
            adata,
            basis=basis,
            color=gene,
            cmap=custom_cmap,
            size=1.5,
            vmax="p99",  # cap the colour scale at the 99th percentile
            title=f"{label} marker: {gene}",
            show=False,
        )
        fig = plt.gcf()
        # All plots in per_celltype folder; filename includes label for context
        fig.savefig(
            celltype_feature_dir / f"{safe_label}_{gene}_feature.png",
            dpi=200,
            bbox_inches="tight",
        )
        # Close each figure so memory does not grow with the number of genes.
        plt.close(fig)
        plots_created += 1

print(
    f"Saved {plots_created} feature plots (cell type markers) to {celltype_feature_dir.resolve()}"
)

Saved 98 feature plots (cell type markers) to /Users/brunondibambwayeroy/Documents/Research/YALE DATA/feature_plots/per_celltype


In [31]:
# Feature plots with cluster breakdown: embedding plot + horizontal bars.
#   bar length = share (in %) of all cells expressing the gene that fall in each cluster
#   bar colour = mean of the colours that the cluster's cells receive on the embedding plot
from matplotlib.colors import Normalize

cluster_hue_dir = feature_plot_root / "per_celltype_with_cluster_hue"
cluster_hue_dir.mkdir(parents=True, exist_ok=True)
print(f"Cluster-hue feature plots will be saved to {cluster_hue_dir.resolve()}")

cluster_col = "leiden"
basis = "umap"

# Loop-invariant work: convert the cluster labels once and build one boolean
# mask per cluster instead of recomputing them for every gene.
cluster_labels = adata.obs[cluster_col].astype(str).to_numpy()


def _cluster_sort_key(name):
    """Order purely numeric cluster names by value; other names follow alphabetically."""
    return (0, int(name), "") if name.isdecimal() else (1, 0, name)


clusters = sorted(set(cluster_labels), key=_cluster_sort_key)
cluster_masks = [cluster_labels == cl for cl in clusters]

# scanpy colours genes from adata.raw when it is present (use_raw=None default).
# Read expression from that same source, and pass the choice explicitly to the
# plot call, so the bars always describe exactly what the embedding shows.
use_raw = adata.raw is not None
expr_source = adata.raw if use_raw else adata

plots_created = 0

for label, genes in celltype_markers.items():
    if not genes:
        continue  # label has no markers listed
    # Keep only characters that are safe in a file name.
    safe_label = "".join(c if c.isalnum() or c in "_-" else "_" for c in label)

    for gene in genes:
        if gene not in adata.var_names or gene not in expr_source.var_names:
            continue

        expr = np.asarray(expr_source.obs_vector(gene)).ravel()

        expressing = expr > 0
        n_expressing = expressing.sum()
        if n_expressing == 0:
            continue  # nothing to distribute across clusters

        # One colour scale for the embedding, the bars and the colorbar:
        # the 99th percentile over all cells (what vmax="p99" resolves to).
        # When fewer than 1% of cells express the gene that percentile is 0,
        # so fall back to the 99th percentile among expressing cells.
        vmax = float(np.nanpercentile(expr, 99))
        if vmax <= 0:
            vmax = float(np.percentile(expr[expressing], 99))
        norm = Normalize(vmin=0, vmax=vmax)
        # Colour each cell receives on the embedding plot (colormap + norm)
        umap_colors_rgba = custom_cmap(norm(expr))

        cluster_pct = []
        cluster_mean_umap_color = []
        for in_cl in cluster_masks:
            cluster_pct.append(100.0 * (expressing & in_cl).sum() / n_expressing)
            # Every cluster comes from the observed labels, so it has at least one cell.
            # Clip guards against floating-point means landing marginally outside [0, 1].
            cluster_mean_umap_color.append(
                np.clip(umap_colors_rgba[in_cl].mean(axis=0), 0.0, 1.0)
            )

        fig, (ax_umap, ax_bars) = plt.subplots(
            1, 2, figsize=(14, max(6, len(clusters) * 0.25)),
            gridspec_kw={"width_ratios": [1.2, 0.9]}
        )
        sc.pl.embedding(
            adata,
            basis=basis,
            color=gene,
            ax=ax_umap,
            cmap=custom_cmap,
            size=2,
            vmin=0,
            vmax=vmax,
            use_raw=use_raw,
            show=False,
            title=f"{label} marker: {gene}",
        )

        # Reverse the positions so the first cluster is drawn at the top.
        y_pos = np.arange(len(clusters))[::-1]
        bar_widths = np.array(cluster_pct)
        ax_bars.barh(
            y_pos, bar_widths, height=0.75,
            color=cluster_mean_umap_color, edgecolor="gray", linewidth=0.3,
        )
        ax_bars.set_yticks(y_pos)
        ax_bars.set_yticklabels(clusters, fontsize=8)
        ax_bars.set_xlim(0, 100)
        ax_bars.set_xlabel("% of all expressing cells found in cluster")
        ax_bars.set_title("Cluster distribution (bar color = avg UMAP color in cluster)")
        # Colorbar built from the same norm, as a legend for the bar colours.
        sm = plt.cm.ScalarMappable(cmap=custom_cmap, norm=norm)
        sm.set_array([])
        fig.colorbar(sm, ax=ax_bars, shrink=0.6, label="Expression scale (UMAP reference)")
        plt.tight_layout()
        fig.savefig(
            cluster_hue_dir / f"{safe_label}_{gene}_feature.png",
            dpi=200,
            bbox_inches="tight",
        )
        # Close each figure so memory does not grow with the number of genes.
        plt.close(fig)
        plots_created += 1

print(f"Saved {plots_created} cluster-hue feature plots to {cluster_hue_dir.resolve()}")

Cluster-hue feature plots will be saved to /Users/brunondibambwayeroy/Documents/Research/YALE DATA/feature_plots/per_celltype_with_cluster_hue
Saved 161 cluster-hue feature plots to /Users/brunondibambwayeroy/Documents/Research/YALE DATA/feature_plots/per_celltype_with_cluster_hue
