In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
import anndata
import sys

sys.path.append('../')
import preprocessing_tools as pt

In [2]:
def label_cd4_cd8_from_markers(
    adata,
    t_mask=None,                  # boolean mask for T cells; if None, uses all cells
    layer="counts",               # use raw counts if present
    out_col="cd4_cd8",
    delta=0.25,                   # required separation between CD4 and CD8 scores
    min_score=0.10,               # require at least some signal
):
    # work on a copy to avoid touching your main object (no saving)
    ad = adata.copy()

    # choose expression matrix
    if layer is not None and layer in ad.layers:
        ad.X = ad.layers[layer].copy()

    # log-normalize for scoring
    sc.pp.normalize_total(ad, target_sum=1e4)
    sc.pp.log1p(ad)

    # common markers (gene SYMBOLS)
    t_markers   = ["TRAC", "CD3D", "CD3E"]  # optional sanity check
    cd4_markers = ["CD4", "IL7R", "CCR7", "LTB", "TCF7", "LEF1"]  # MALAT1 is ubiquitous; remove if you dislike
    cd8_markers = ["CD8A", "CD8B"]

    # keep only genes present
    def present(gs): return [g for g in gs if g in ad.var_names]
    cd4g, cd8g = present(cd4_markers), present(cd8_markers)
    if len(cd4g) == 0 or len(cd8g) == 0:
        raise ValueError(
            f"Markers missing from adata.var_names. Present CD4={cd4g}, CD8={cd8g}. "
            "Check that var_names are gene SYMBOLs (or map from Ensembl first)."
        )

    # score genes (adds columns in ad.obs on the copy)
    sc.tl.score_genes(ad, cd4g, score_name="_cd4_score")
    sc.tl.score_genes(ad, cd8g, score_name="_cd8_score")

    cd4s = ad.obs["_cd4_score"].to_numpy()
    cd8s = ad.obs["_cd8_score"].to_numpy()

    if t_mask is None:
        t_mask = np.ones(ad.n_obs, dtype=bool)
    else:
        t_mask = np.asarray(t_mask, dtype=bool)

    labels = pd.Series(pd.NA, index=ad.obs_names, dtype="object")

    # only label within T-mask; others stay NA
    diff = cd4s - cd8s
    labels.loc[t_mask & (diff >  delta) & (cd4s > min_score)] = "CD4T"
    labels.loc[t_mask & (diff < -delta) & (cd8s > min_score)] = "CD8T"
    labels.loc[t_mask & labels.isna()] = "Ambiguous"

    # write back onto the *original* adata (just obs column in memory)
    adata.obs[out_col] = labels

    # optional: also keep the scores if you want
    adata.obs[out_col + "_cd4_score"] = pd.Series(cd4s, index=adata.obs_names)
    adata.obs[out_col + "_cd8_score"] = pd.Series(cd8s, index=adata.obs_names)

    print(adata.obs[out_col].value_counts(dropna=False))

In [3]:
condition_key = 'condition'
covariate_key = 'patient_id'

control_name = 'Baseline'
stim_name = 'On treatment'

# Read and preprocess adata

In [4]:
data_path = "../../preprocessed_datasets/ICI_original_concat.h5ad"

In [5]:
adata = sc.read_h5ad(data_path)

#adata = adata[adata.obs['disease'].isin(['Pneumonitis', 'No irAE'])]

print(adata.X.max(), adata.X.min())

16688.0 0.0


In [6]:
adata_copy = adata.copy()
adata_copy.var_names = adata_copy.var['gene_symbol'].copy()

t_mask = (adata_copy.obs["ch1__cell type"] == "Human T cells")  
label_cd4_cd8_from_markers(adata_copy, t_mask=t_mask, layer="counts", out_col="cd4_cd8")

  return fn(*args_all, **kw)


cd4_cd8
Ambiguous    93284
CD4T         93183
CD8T         36211
Name: count, dtype: int64


In [7]:
adata.obs['cell_type'] = adata_copy.obs['cd4_cd8'].tolist()

In [8]:
pd.crosstab(adata.obs['cell_type'], adata.obs['disease'])

disease,Arthritis,Colitis/ Neurotoxicity,No irAE,Pneumonitis,Thyroiditis,Thyroiditis/ Nephritis,Thyroiditis/ Neurotoxicity / Nephritis,Thyroiditis/ Neurotoxicity/ Nephritis
cell_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Ambiguous,17646,2616,18779,11480,41152,376,876,359
CD4T,15429,1437,41780,32332,707,253,1029,216
CD8T,11580,631,11769,9081,2471,149,394,136


In [9]:
adata = adata[adata.obs['cell_type'] == 'CD8T']

In [10]:
adata

View of AnnData object with n_obs × n_vars = 36211 × 20341
    obs: 'sample_id', 'prefix', 'GEO', 'patient_id', 'T', 'title', 'source', 'organism', 'ch1__cell type', 'tissue', 'disease', 'condition', 'library_strategy', 'library_source', 'library_selection', 'cell_type'
    var: 'gene_symbol', 'gene_id_ensembl'

In [11]:
adata.layers['counts'] = adata.X.copy()
adata.raw = adata.copy()
adata.uns = {}

  adata.layers['counts'] = adata.X.copy()


In [12]:
sc.pp.filter_cells(adata, min_counts=100)
sc.pp.filter_genes(adata, min_counts=5)

sc.pp.normalize_total(
    adata, 
    target_sum=1e4, 
    exclude_highly_expressed=True
    )
sc.pp.log1p(adata)

adata.shape, adata.X.max(), adata.X.min()

((35983, 16838), 11.959834, 0.0)

# Create cov_cond column

In [13]:
adata.obs['cov_cond'] = adata.obs[covariate_key].astype(str) + '_' + adata.obs[condition_key].astype(str)

In [14]:
adata.obs['cov_cond']

AAACCTGAGCTTATCG-1-GSM6668925_P1_T      P1_On treatment
AAACCTGCACCTCGTT-1-GSM6668925_P1_T      P1_On treatment
AAACCTGGTTCGTTGA-1-GSM6668925_P1_T      P1_On treatment
AAACCTGTCCTGTACC-1-GSM6668925_P1_T      P1_On treatment
AAACCTGTCTCATTCA-1-GSM6668925_P1_T      P1_On treatment
                                             ...       
TTGCCGTAGTTCGCAT.1-GSM6668972_P24_TN       P24_Baseline
TTGGAACAGTAGGCCA.1-GSM6668972_P24_TN       P24_Baseline
TTGGCAACAAACTGCT.1-GSM6668972_P24_TN       P24_Baseline
TTTATGCCATGCCCGA.1-GSM6668972_P24_TN       P24_Baseline
TTTGTCAGTGGTCTCG.1-GSM6668972_P24_TN       P24_Baseline
Name: cov_cond, Length: 35983, dtype: object

# Compute DEGs

In [None]:
stim_degs = pt.compute_degs(
    adata, 
    cov_key=covariate_key, 
    cond_key=condition_key, 
    stim_name=stim_name, 
    control_name=control_name,
    condition_names=[control_name, stim_name],
    method='wilcoxon'
)

In [None]:
adata.uns[f'rank_genes_groups_{condition_key}'] = {
    stim_name :stim_degs,
}

In [None]:
adata.X.max(), adata.X.min()

In [None]:
adata.X = adata.layers['counts'].copy()

In [None]:
adata.X.max(), adata.X.min()

# Create split column

In [None]:
adata_split = pt.create_split_cols(
    adata=adata, 
    cov_key=covariate_key, 
    cond_key=condition_key, 
    stim_name=stim_name,
    random_state=42
)

# Create cell ids

In [None]:
adata_split.obs['sc_cell_ids'] = list(range(adata.shape[0]))

In [None]:
adata_split.obs['condition'].value_counts()

In [None]:
adata_split.obs['disease'].value_counts()

In [None]:
pd.crosstab(adata_split.obs['condition'],adata_split.obs['disease'])

# Write

In [None]:
adata_split.write_h5ad('../../preprocessed_datasets/ici.h5ad')

In [None]:
adata_split

In [None]:
adata.obs['patient_id'].value_counts()

In [None]:
len(adata[adata.obs['disease'] == 'No irAE'].obs['patient_id'].unique())

In [None]:
len(adata[adata.obs['disease'] == 'Pneumonitis'].obs['patient_id'].unique())

In [None]:
adata[adata.obs['disease'] == 'Pneumonitis'].obs['patient_id'].unique()

In [None]:
len(adata.obs['patient_id'].unique())

In [None]:
adata.obs['patient_id'].value_counts()

In [None]:
pd.crosstab(adata_split.obs['patient_id'],adata_split.obs['condition'])

In [None]:
adata_split.X.max(), adata_split.X.min()

In [None]:
adata_split.obs['patient_id'].unique().tolist()

In [None]:
['P17','P8','P9','P10','P11','P18','P19','P12','P13','P14','P20','P21','P22','P23','P24']

In [None]:
adata_split