# Extract the T cell annotations from the preprocessed and integrated AnnData object

## Env setup

In [None]:
# Load libraries
import scanpy as sc
import pandas as pd
import numpy as np
import os

In [None]:
# Set input and output data dirs (hard-coded absolute paths).
# in_data_dir: folder the input .h5ad file is read from.
# out_data_dir: folder the annotation CSV files are written to.
in_data_dir = "/scratch_isilon/groups/singlecell/shared/projects/SERPENTINE/DF_July2025/data"
out_data_dir = "/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE_TCR/out/data"

## Read anndata obj

In [None]:
# Read adata object: load the .h5ad file from the input directory
adata = sc.read_h5ad(os.path.join(in_data_dir, "Tcells_integrated_annotated.h5ad"))
# Bare expression: displays the object summary when run as a notebook cell
adata

In [None]:
# Overview UMAPs, one panel per key: gene symbols first, then the
# clustering / annotation keys (presumably adata.obs columns).
umap_features = [
    "CD3E", "CD4", "CD8B", "CD8A", "KLRF1", "TRGC1", "SLC4A10", "ZBTB16", "MKI67",
    "leiden_1", "leiden_2", "leiden_3", "lv2",
]
sc.pl.umap(adata, color=umap_features, cmap="magma")

In [None]:
# UMAP restricted to the cells whose lv2 label is "T Naive (3)",
# colored by CD3E, CD4, CD8B and CD8A.
# NOTE(review): the later cells select naive cells through
# obs["annotation"] == "T Naive" instead of lv2 — confirm both filters
# target the intended population.
sc.pl.umap(
    adata[adata.obs["lv2"] == "T Naive (3)"], 
    color=["CD3E", "CD4", "CD8B", "CD8A"]
)

In [None]:
# UMAP restricted to the cells annotated as "T proliferating",
# colored by CD3E, CD4, CD8B and CD8A.
sc.pl.umap(
    adata[adata.obs["annotation"] == "T proliferating"], 
    color=["CD3E", "CD4", "CD8B", "CD8A"]
)

### Classify T Naive and T proliferating cells as CD4 or CD8 T cells based on their CD4 vs. CD8A + CD8B expression

In [None]:
# Per-cell CD4/CD8 call for the "T Naive" and "T proliferating" clusters:
# each cell's CD4 value is compared against the sum of its CD8A and CD8B
# values, all taken from adata.X. Cells of every other cluster keep a
# missing value in this column.
adata.obs["CD4_CD8_assignment"] = None


def _dense_expression(cell_mask, gene):
    """Return the adata.X values of `gene` for the masked cells as a 1-D array.

    Handles both sparse and dense `.X`. The `.A` shortcut is avoided on
    purpose: it only exists on SciPy sparse *matrix* classes, so it raises
    AttributeError when `.X` is a dense ndarray or a SciPy sparse array.
    """
    values = adata[cell_mask, gene].X
    if hasattr(values, "toarray"):
        # Sparse container -> dense 2-D array of shape (n_cells, 1)
        values = values.toarray()
    return np.asarray(values).ravel()


# T Naive
naive = adata.obs["annotation"] == "T Naive"

cd4 = _dense_expression(naive, "CD4")
cd8a = _dense_expression(naive, "CD8A")
cd8b = _dense_expression(naive, "CD8B")

# "CD8" only when CD8A + CD8B is strictly greater than CD4; ties (including
# cells with no signal for any of the three genes) are labelled "CD4".
adata.obs.loc[naive, "CD4_CD8_assignment"] = np.where(
    (cd8a + cd8b) > cd4, "CD8", "CD4"
)

# T Proliferating
prolif = adata.obs["annotation"] == "T proliferating"

cd4 = _dense_expression(prolif, "CD4")
cd8a = _dense_expression(prolif, "CD8A")
cd8b = _dense_expression(prolif, "CD8B")

# "CD4" only when CD4 is strictly greater than CD8A + CD8B; ties are labelled
# "CD8" here, i.e. the opposite tie-break from the T Naive rule above.
# NOTE(review): confirm the differing tie-breaks are intentional.
adata.obs.loc[prolif, "CD4_CD8_assignment"] = np.where(
    cd4 > (cd8a + cd8b), "CD4", "CD8"
)

In [None]:
# Number of cells per CD4/CD8 label. value_counts() drops missing values by
# default, so cells that are still unassigned are not counted here.
adata.obs["CD4_CD8_assignment"].value_counts()

In [None]:
# UMAP of the "T Naive" cells: the four genes plus the new
# CD4_CD8_assignment label, to check the per-cell call visually.
sc.pl.umap(
    adata[adata.obs["annotation"] == "T Naive"], 
    color=["CD3E", "CD4", "CD8B", "CD8A", "CD4_CD8_assignment"]
)

In [None]:
# UMAP of the "T proliferating" cells: the four genes plus the new
# CD4_CD8_assignment label, to check the per-cell call visually.
sc.pl.umap(
    adata[adata.obs["annotation"] == "T proliferating"], 
    color=["CD3E", "CD4", "CD8B", "CD8A", "CD4_CD8_assignment"]
)

### CD4 / CD8 Assignment

In [None]:
# Cluster-level CD4/CD8 label for every cell that has no label yet.
# Cells whose annotation is in none of the three lists stay unassigned.

cd4_clusters = [
    "Tregs activated",
    "CD4 follicular helper",
    "Tgd-V1",
    "Tregs",
    "Th-17",
    "Th-1",
    "Tregs proliferating",
    "CD4 central memory",
    "CD4 central memory pre-Tfh",
]

cd8_clusters = [
    "CD8 resident",
    "T proliferating",
    "CD8 activated",
    "CD8 NK-like",
    "CD8 resident activated",
    "CD8 pre-exhausted",
    "CD8 IFN",
    "CD8 effector",
    "CD8 resident exhausted",
    "CD8 metabolic",
]

na_clusters = ["MAIT-17", "Tgd-17", "NK-Tgd", "NK"]

# Snapshot of the cells without a label *before* any of the writes below;
# cells that already carry a label are never overwritten.
mask_unassigned = adata.obs["CD4_CD8_assignment"].isna()

# Same order as a sequential CD4 -> CD8 -> NA fill.
for lineage_label, cluster_names in (
    ("CD4", cd4_clusters),
    ("CD8", cd8_clusters),
    ("NA", na_clusters),
):
    in_group = adata.obs["annotation"].isin(cluster_names)
    adata.obs.loc[mask_unassigned & in_group, "CD4_CD8_assignment"] = lineage_label

In [None]:
# UMAP of all cells: the four genes plus the completed CD4_CD8_assignment
# label, laid out five panels per row.
sc.pl.umap(
    adata, 
    color=["CD3E", "CD4", "CD8B", "CD8A", "CD4_CD8_assignment"],
    ncols = 5
)

### Lv1 Annotation

In [None]:
# Build the coarse "lv1" label from the fine-grained annotation.
# Start from an all-missing column; annotations that appear in none of the
# groups below (and are not "T proliferating") keep a missing lv1.
adata.obs["lv1"] = None

# Annotation clusters belonging to each lv1 group
cd4_clusters = ["CD4 follicular helper", "Tgd-V1", "Th-17", "Th-1"]

treg_clusters = ["Tregs activated", "Tregs", "Tregs proliferating"]

cd8_clusters = [
    "CD8 resident",
    "CD8 activated",
    "CD8 NK-like",
    "CD8 resident activated",
    "CD8 pre-exhausted",
    "CD8 IFN",
    "CD8 effector",
    "CD8 resident exhausted",
    "CD8 metabolic",
]

tnaive_cm_clusters = ["CD4 central memory", "CD4 central memory pre-Tfh", "T Naive"]

nc_clusters = ["MAIT-17", "Tgd-17", "NK-Tgd"]

nk_clusters = ["NK"]

# Write the lv1 label group by group, in this fixed order
lv1_groups = [
    ("CD4", cd4_clusters),
    ("Treg", treg_clusters),
    ("CD8", cd8_clusters),
    ("T Naive/CM", tnaive_cm_clusters),
    ("NC", nc_clusters),
    ("NK", nk_clusters),
]
for lv1_label, members in lv1_groups:
    adata.obs.loc[adata.obs["annotation"].isin(members), "lv1"] = lv1_label

# "T proliferating" is in none of the groups: those cells copy their
# CD4_CD8_assignment value into lv1 instead.
tpro_mask = adata.obs["annotation"] == "T proliferating"
adata.obs.loc[tpro_mask, "lv1"] = adata.obs.loc[tpro_mask, "CD4_CD8_assignment"]

In [None]:
# UMAP of all cells: the four genes plus the new lv1 label,
# laid out five panels per row.
sc.pl.umap(
    adata, 
    color=["CD3E", "CD4", "CD8B", "CD8A", "lv1"],
    ncols = 5
)

## Extract GEX phenotypes

In [None]:
# Get metadata
# NOTE: `meta` is the same DataFrame object as adata.obs (no copy is made),
# so in-place changes to one are visible through the other.
meta = adata.obs
meta.head()

In [None]:
# Print the distinct values of the two annotation columns
print(adata.obs["annotation"].unique())
print(adata.obs["lv2"].unique())

In [None]:
# Distinct dataset labels; the subsets below filter on this column
meta["dataset"].unique()

In [None]:
# Distinct batch labels
meta["batch"].unique()

## Serpentine Subset

In [None]:
# Keep only the rows whose dataset label is "SERPENTINE"
serp_meta = meta[meta["dataset"] == "SERPENTINE"]

In [None]:
# Display the SERPENTINE metadata table
serp_meta

In [None]:
# Distinct Cohort values within the SERPENTINE subset
serp_meta["Cohort"].unique()

In [None]:
# Print a summary of the subset (columns, dtypes, non-null counts)
serp_meta.info()

In [None]:
# Subset columns of interest: the "bc", "Replicate" and "cell_type" columns
# plus the annotation levels (lv1, CD4_CD8_assignment, annotation, lv2)
serp_meta_sub = serp_meta[["bc", "Replicate", "cell_type", "lv1", "CD4_CD8_assignment", "annotation", "lv2"]]
serp_meta_sub.head()

## Chen Subset

In [None]:
# Keep only the rows whose dataset label is "Chen2024"
chen_meta = meta[meta["dataset"] == "Chen2024"]

In [None]:
# Subset columns of interest (same columns as for the SERPENTINE subset)
chen_meta_sub = chen_meta[["bc", "Replicate", "cell_type", "lv1", "CD4_CD8_assignment", "annotation", "lv2"]]
# NOTE(review): when this runs as a single notebook cell, only the value of
# the last expression is displayed, so this head() preview is likely not shown.
chen_meta_sub.head()
# Distinct Replicate values in the Chen2024 subset
chen_meta_sub["Replicate"].unique()

## Save GEX annotations

In [None]:
# Save annotations: one CSV per dataset. The DataFrame index is not written
# (index=False); only the selected columns are written.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first (no-op when it is already there).
os.makedirs(out_data_dir, exist_ok=True)
serp_meta_sub.to_csv(os.path.join(out_data_dir, "SERP_T_Annotations_11-2025_v2.csv"), index=False)
chen_meta_sub.to_csv(os.path.join(out_data_dir, "Chen_T_Annotations_11-2025_v2.csv"), index=False)