# Signature Scoring on Post-ICI Expansion Enriched T Cells

In [None]:
# Import libraries
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read adata object of processed T cells (AnnData stored as .h5ad) into memory
adata_file = "/scratch_isilon/groups/singlecell/shared/projects/SERPENTINE/DF_July2025/data/Tcells_integrated_annotated.h5ad"
adata = sc.read_h5ad(adata_file)

In [None]:
# List the per-cell metadata columns that are available
adata.obs.columns

In [None]:
# Show the distinct labels in the "dataset" column (the subsetting step filters on this column)
adata.obs['dataset'].unique()

In [None]:
# Keep only the cells labelled "SERPENTINE" in the "dataset" column.
# .copy() turns the selection into an independent object rather than a view.
is_serpentine = adata.obs["dataset"] == "SERPENTINE"
adata = adata[is_serpentine].copy()

In [None]:
# Inspect the per-cell metadata after subsetting
adata.obs

In [None]:
# Define signatures.
# Each key is the signature name (later used as the adata.obs column holding the
# score); each value is the list of gene symbols that make up the signature.
gene_signatures = {
    # Union of the Activation, Effector and Residency lists below
    "Activity": [
        "CD27", "CD82", "TNFRSF4", "TNFRSF9", "CXCR3",
        "IFNG", "NKG7", "PRF1", "GZMK", "GZMA", "GZMB", "GZMH", "FASLG",
        "ITGAE", "ZNF683", "CD44", "ITGA1", "CD69",
    ],
    "Activation": ["CD27", "CD82", "TNFRSF4", "TNFRSF9", "CXCR3"],
    "Effector": ["IFNG", "NKG7", "PRF1", "GZMK", "GZMA", "GZMB", "GZMH", "FASLG"],
    "Residency": ["ITGAE", "ZNF683", "CD44", "ITGA1", "CD69"],
    # 4-1BB is listed under its gene symbol TNFRSF9 (as in "Activation" above).
    # The alias "41BB" is not a gene symbol, so the presence filter applied
    # before scoring would silently drop it from this signature.
    "Tumor_reactivity": ["TNFRSF9", "CXCL13", "ENTPD1"],
    # Single-gene entries: each residency marker scored on its own
    "ITGAE": ["ITGAE"],
    "ZNF683": ["ZNF683"],
    "CD44": ["CD44"],
    "ITGA1": ["ITGA1"],
    "CD69": ["CD69"],
}

In [None]:
# Inspect the gene identifiers of the dataset (signature genes are matched against these)
adata.var_names

In [None]:
# Compute signatures: one module score per entry of gene_signatures,
# stored in adata.obs under the signature name
for sig_name, genes in gene_signatures.items():

    # Keep only genes present in the dataset, and report the ones that are
    # dropped so a misspelled or aliased symbol does not go unnoticed
    genes_present = [g for g in genes if g in adata.var_names]
    genes_missing = [g for g in genes if g not in adata.var_names]
    if genes_missing:
        print(f"Note: genes not found for {sig_name}: {genes_missing}")

    if not genes_present:
        # Nothing to score: fill the column with NaN so it still exists downstream.
        # (Writes to `adata`; the previous `adata_subset` name was never defined.)
        print(f"Warning: no genes found for {sig_name}")
        adata.obs[sig_name] = np.nan
        continue

    # Compute the module score
    sc.tl.score_genes(
        adata,
        gene_list=genes_present,
        score_name=sig_name,
        ctrl_size=50,   # number of control genes for background
        use_raw=False
    )


In [None]:
# Inspect the metadata again; the signature scores are now columns of adata.obs
adata.obs

In [None]:
# Rebuild the barcode so it matches the enrichment df:
#   1. prefix it with the replicate label ("<Replicate>_<barcode>")
#   2. strip the trailing "-<digits>" suffix group(s)
#   3. use the result as the obs index
replicate_prefixed = adata.obs["Replicate"].astype(str) + "_" + adata.obs["barcode"]
adata.obs["barcode"] = replicate_prefixed.str.replace(r"-\d+(?:-\d+)*$", "", regex=True)
adata.obs = adata.obs.set_index("barcode")

In [None]:
# Adjust patient column: take the first run of digits in each label and
# rewrite it as "P" followed by the zero-padded two-digit number (3 -> "P03")
patient_digits = adata.obs["Patient"].str.extract(r"(\d+)", expand=False)
patient_numbers = patient_digits.astype(int)
adata.obs["Patient"] = patient_numbers.apply("P{:02d}".format)

In [None]:
# Read clonotype enrichment data (space-delimited table, first row is the header)
enrichment_csv = "/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE_TCR/out/data/enriched_cells.csv"
enrichment = pd.read_csv(enrichment_csv, sep=" ", header=0)
# Use the same column name as adata.obs so "Patient" can serve as a merge key
enrichment = enrichment.rename(columns={"patient": "Patient"})
# Preview the first rows. head must be called: the bare attribute only
# displays the bound method, not the data.
enrichment.head()

In [None]:
# Merging: left-join the enrichment table onto the per-cell metadata so every
# cell is kept and cells without an enrichment record get missing values.
# "barcode" is the index of adata.obs at this point; pandas accepts index-level
# names in `on`, so no reset_index() is needed before merging.
merged_meta = pd.merge(
    adata.obs,
    enrichment,
    how="left",
    on=["barcode", "Patient"],
)

In [None]:
# Print (rows, columns) of the enrichment table, the cell metadata and the
# merged result, in that order, to sanity-check the merge
for frame in (enrichment, adata.obs, merged_meta):
    print(frame.shape)

In [None]:
# Select relevant columns
signature_cols = list(gene_signatures.keys())
requested_cols = ["barcode", "Patient", "Tissue", "Timepoint", "presence_status", "clonotype_id", "enriched", "LogFC", "ITGAE", "ZNF683", "CD44", "ITGA1", "CD69"] + signature_cols
# The single-gene names are also signature names, so the list above contains
# them twice. dict.fromkeys removes the repeats while keeping first-seen order;
# otherwise the selection (and the saved CSV) would carry duplicated columns.
cols_to_keep = list(dict.fromkeys(requested_cols))
merged_meta = merged_meta[cols_to_keep]

In [None]:
# Subset enriched cells. The comparison against True is explicit because the
# left merge can leave missing values in "enriched"; those rows compare
# unequal and are excluded.
is_enriched = merged_meta["enriched"] == True  # noqa: E712
subset_meta = merged_meta.loc[is_enriched]

In [None]:
# Inspect the metadata of the enriched cells
subset_meta

In [None]:
# Save subsetted data (index=False: the row index is not written to the file)
# NOTE(review): the file name ends in ".csv.csv", which looks like an accidental
# double extension. Confirm no downstream reader depends on it before renaming.
scored_csv = "/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE_TCR/out/data/scored_enriched_cells.csv.csv"
subset_meta.to_csv(scored_csv, index=False)

### Map Clonal Proportions in GEX-based UMAP

In [None]:
# Read adata object of processed T cells again, so this section starts from the
# object as stored on disk (the previous section modified adata.obs in memory)
adata_file = "/scratch_isilon/groups/singlecell/shared/projects/SERPENTINE/DF_July2025/data/Tcells_integrated_annotated.h5ad"
adata = sc.read_h5ad(adata_file)

In [None]:
# Keep only the cells labelled "SERPENTINE" in the "dataset" column.
# .copy() turns the selection into an independent object rather than a view.
is_serpentine = adata.obs["dataset"] == "SERPENTINE"
adata = adata[is_serpentine].copy()

In [None]:
# Rebuild the barcode so it matches the sizes df:
#   1. prefix it with the replicate label ("<Replicate>_<barcode>")
#   2. strip the trailing "-<digits>" suffix group(s)
#   3. use the result as the obs index
replicate_prefixed = adata.obs["Replicate"].astype(str) + "_" + adata.obs["barcode"]
adata.obs["barcode"] = replicate_prefixed.str.replace(r"-\d+(?:-\d+)*$", "", regex=True)
adata.obs = adata.obs.set_index("barcode")

In [None]:
# Read the table mapping each cell barcode to its normalized clonal size
# (comma-separated, first row is the header)
sizes_csv = "/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE_TCR/out/data/Cell_barcodes_sizes_01-2026.csv"
sizes = pd.read_csv(sizes_csv, sep = ",", header = 0)

In [None]:
# Index the sizes table by cell barcode so it can later be aligned by label
sizes = sizes.set_index("barcode")
sizes.head()

In [None]:
# Preview the cell metadata to compare its barcode format with the sizes table
adata.obs.head()

In [None]:
# Make barcodes consistent (remove GEX part).
# Order matters: the numbered tags "_GEX1"/"_GEX2" are rewritten to "_1"/"_2"
# first; only then is any remaining bare "_GEX" stripped.
barcode_index = sizes.index.astype(str)
for gex_tag, replacement in (("_GEX1", "_1"), ("_GEX2", "_2"), ("_GEX", "")):
    barcode_index = barcode_index.str.replace(gex_tag, replacement, regex=False)
sizes.index = barcode_index

In [None]:
# Add clonal proportions to metadata: align the normalized clone sizes to the
# cells by barcode; cells whose barcode is absent from `sizes` get NaN
norm_clone_sizes = sizes["norm_cloneSize"]
adata.obs["Clonal_Proportion"] = norm_clone_sizes.reindex(adata.obs_names)

In [None]:
# Preview the metadata with the new Clonal_Proportion column
adata.obs.head()

In [None]:
# Frequency of each clonal proportion value; dropna=False also counts cells with no value
adata.obs["Clonal_Proportion"].value_counts(dropna=False)

In [None]:
# Number of cells that received a clonal proportion
adata.obs["Clonal_Proportion"].notna().sum()

In [None]:
# log10-transform the clonal proportion; the 1e-4 pseudocount keeps a proportion of 0 finite
adata.obs["Clonal_Proportion_log10"] = np.log10(adata.obs["Clonal_Proportion"]+1e-4)

In [None]:
# Cap range - remove influence of outliers: every value above -2.5 is set to
# -2.5 in a second column; missing values stay missing
log_proportion = adata.obs["Clonal_Proportion_log10"]
mask = log_proportion > -2.5
adata.obs["Clonal_Proportion_log10_2"] = log_proportion.mask(mask, -2.5)

In [None]:
# Sort cells by capped log10 clonal proportion, ascending, with NaNs first.
# na_position="first" is used instead of a fill value: the previous fillna(-1)
# sentinel is larger than every real value (all are <= -2.5 after capping), so
# cells without a clonal proportion ended up last rather than first.
plot_order = adata.obs["Clonal_Proportion_log10_2"].sort_values(na_position="first").index
adata_sorted = adata[plot_order]

In [None]:
# Note on the choice of the pseudocount (epsilon) added before the log10
# transform, and how it affects the colour scale of the UMAP
'''
Small epsilon (1e-6): |-----|-6- -5- -4- -3- -2- -1- 0|
                        extreme small values dominate the color scale
                        -> most points orange

Larger epsilon (1e-4): |----|-4- -3- -2- -1- 0|
                        extreme small values capped
                        -> differences between small/medium clones visible
'''

In [None]:
# Inspect the metadata of the sorted object to check the cell order
adata_sorted.obs

In [None]:
# Show UMAP with Clonal Proportions

fig, ax = plt.subplots(figsize=(10, 10))  # 10x10 inches

# Upper colour limit: the largest capped log10 proportion present in the data
vmax = adata.obs["Clonal_Proportion_log10_2"].max()

# Plot UMAP onto the prepared axes; show=False keeps the figure open so it can
# be adjusted and saved below
umap_kwargs = dict(
    color="Clonal_Proportion_log10_2",
    cmap="inferno",
    vmax=vmax,
    s=5,
    frameon=False,
    show=False,
    ax=ax,
)
sc.pl.umap(adata_sorted, **umap_kwargs)

# Remove title
ax.set_title("")

# Rasterize points (the scatter layers), leaving the rest of the figure as vector graphics
for point_layer in ax.collections:
    point_layer.set_rasterized(True)

# Set colorbar label
# NOTE(review): assumes the colorbar axes is the last axes in the figure; confirm
cbar = fig.axes[-1]
cbar.set_ylabel("Clonal Proportion (Log10)", rotation=270, labelpad=20, size = 12)

# Save as PDF with high resolution
umap_pdf = "/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE_TCR/out/figs/TCR_Fig_Jan/TCR_UMAP_ClonalProportion.pdf"
plt.savefig(umap_pdf, dpi=300, bbox_inches="tight")
plt.close()

In [None]:
# Compare the distribution of log10 clonal proportions under different epsilons

# Original clonal proportions
clonal = adata.obs["Clonal_Proportion"]

# Epsilons to compare, with the matching panel titles
epsilons = [0, 1e-6, 1e-5, 1e-4]
titles = ["No epsilon", "ε = 1e-6", "ε = 1e-5", "ε = 1e-4"]

# One histogram panel per epsilon, all sharing the y axis
fig, axes = plt.subplots(1, 4, figsize=(20, 5), sharey=True)

for ax, eps, title in zip(axes, epsilons, titles):
    # Without an epsilon, zeros are turned into NaN to avoid log10(0);
    # otherwise the proportions are shifted by the epsilon
    shifted = clonal.replace(0, np.nan) if eps == 0 else clonal + eps
    log_vals = np.log10(shifted)

    ax.hist(log_vals.dropna(), bins=50, color='steelblue', edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel("log10(Clonal_Proportion)")
    ax.set_ylabel("Number of cells")

plt.tight_layout()
plt.show()

### Extract Single Cells Clonal Proportions 

In [None]:
# Per-cell table with the replicate and the raw and log10 clonal proportions;
# .copy() makes it independent of adata.obs
adata_props = adata.obs[["Replicate", "Clonal_Proportion", "Clonal_Proportion_log10"]].copy()
adata_props.head()

In [None]:
# Save the per-cell clonal proportions; index=True writes the row index
# (the adjusted cell barcodes) as the first CSV column
adata_props.to_csv('/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE_TCR/out/data/SP_Cells_Clonal_Proportions_01-2026.csv', index=True)