In [12]:
import anndata as ad
import scanpy as sc
from tqdm import tqdm
import pickle as pkl

In [75]:
# Load the QC'd, annotated sci-Plex AnnData object from disk (path is relative to this notebook).
adata_pp = ad.read_h5ad("../../../data/sciplex_qc_ann.h5ad")

In [76]:
# Show the AnnData summary (n_obs × n_vars plus the obs/var column names) as the cell output.
adata_pp

AnnData object with n_obs × n_vars = 401917 × 27544
    obs: 'cell_type', 'dose', 'dose_character', 'dose_pattern', 'g1s_score', 'g2m_score', 'pathway', 'pathway_level_1', 'pathway_level_2', 'product_dose', 'product_name', 'proliferation_index', 'replicate', 'size_factor', 'target', 'vehicle', 'SMILES', 'fmfp', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier', 'drug_celltype_dose', 'match_index'
    var: 'id', 'num_cells_expressed-0-0', 'num_cells_expressed-1-0', 'num_cells_expressed-1', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells'

## All DEGs (differentially expressed genes per drug condition vs. Vehicle)

In [8]:
def calculate_degs(adata_pp, method='wilcoxon', pval_cutoff=0.05, min_abs_logfc=0.1, min_cells=10):
    """Compute differentially expressed genes (DEGs) for each drug condition vs. Vehicle.

    A working copy of ``adata_pp`` is gene-filtered, total-count normalized and
    log1p-transformed. Cells are labelled ``<cell_type>_<product_name>_<dose>`` and
    every non-Vehicle label is tested against the Vehicle cells of the same cell
    type with ``sc.tl.rank_genes_groups``.

    Parameters
    ----------
    adata_pp
        AnnData object with ``cell_type``, ``product_name`` and ``dose`` columns in
        ``.obs``. It is copied, not modified.
    method
        Test passed to ``sc.tl.rank_genes_groups`` (default ``'wilcoxon'``).
    pval_cutoff
        Passed as ``pval_cutoff`` to ``sc.get.rank_genes_groups_df``.
    min_abs_logfc
        Minimum absolute value of ``logfoldchanges`` for a gene to be kept.
    min_cells
        Passed as ``min_cells`` to ``sc.pp.filter_genes``.

    Returns
    -------
    dict
        Maps each tested condition label to the list of gene names that passed
        both cutoffs. Vehicle conditions, and conditions whose cell type has no
        Vehicle cells, are absent from the dict.
    """
    adata = adata_pp.copy()
    sc.pp.filter_genes(adata, min_cells=min_cells)
    sc.pp.normalize_total(adata, inplace=True)
    sc.pp.log1p(adata)
    adata.obs['drug_celltype_dose'] = adata.obs['cell_type'].astype(str) + "_" + adata.obs['product_name'].astype(
        str) + "_" + adata.obs['dose'].astype(str)

    # One row per condition label, in order of first appearance, carrying the
    # cell type and product of the first cell with that label.
    condition_meta = adata.obs[['drug_celltype_dose', 'cell_type', 'product_name']].drop_duplicates(
        'drug_celltype_dose')
    is_vehicle = (adata.obs['product_name'] == "Vehicle").to_numpy()

    # The Vehicle reference only depends on the cell type, so build it once per
    # cell type instead of re-copying the whole cell-type subset per condition.
    vehicle_refs = dict()
    results = dict()

    for condition, cell_type, product_name in tqdm(
            condition_meta.itertuples(index=False, name=None), total=len(condition_meta)):
        if product_name == "Vehicle":
            continue

        if cell_type not in vehicle_refs:
            ref_mask = (adata.obs['cell_type'] == cell_type).to_numpy() & is_vehicle
            vehicle_refs[cell_type] = adata[ref_mask].copy()
        adata_ref = vehicle_refs[cell_type]

        if adata_ref.n_obs == 0:
            # Without Vehicle cells there is nothing to compare against.
            print(condition, "has no Vehicle cells for its cell type, skipping...")
            continue

        # NOTE(review): if the Vehicle cells of one cell type carry several labels
        # (e.g. different dose values), only the first label acts as the reference
        # group - confirm this is intended.
        ref_cond = list(adata_ref.obs['drug_celltype_dose'].unique())[0]

        adata_cond = adata[(adata.obs['drug_celltype_dose'] == condition).to_numpy()]
        adata_comb = ad.concat([adata_ref, adata_cond])

        sc.tl.rank_genes_groups(adata_comb, groupby='drug_celltype_dose', reference=ref_cond, method=method)
        df = sc.get.rank_genes_groups_df(adata_comb, group=condition, pval_cutoff=pval_cutoff)

        # Filter with a boolean mask rather than adding a column to a frame that
        # may itself be a filtered slice.
        passes_logfc = df['logfoldchanges'].abs() >= min_abs_logfc
        results[condition] = list(df.loc[passes_logfc, 'names'])

    return results
        

In [9]:
# Run the DEG computation with its defaults; the recorded progress bar shows
# 2218 condition labels processed in roughly 20 minutes.
all_degs = calculate_degs(adata_pp)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2218/2218 [20:32<00:00,  1.80it/s]


In [1]:
# NOTE(review): the recorded output of this cell is a NameError from a kernel in
# which all_degs had not been computed yet (execution count 1); re-run it after
# the cell that assigns all_degs. It displays the whole dictionary.
all_degs

NameError: name 'all_degs' is not defined

In [13]:
# Serialize the per-condition DEG dictionary to a pickle file next to the notebook.
with open("./all_degs.pkl", 'wb') as out_file:
    pkl.dump(all_degs, out_file)

In [71]:
# Union of the DEG lists over all conditions, with duplicates removed.
# Sorted so the list order is reproducible between kernel sessions; the
# iteration order of a set of strings is not.
gene_degs = sorted(set().union(*all_degs.values()))

In [72]:
# Number of distinct genes that are a DEG in at least one condition.
len(gene_degs)

3210

## Feature agreement between preprocessing (pp) conditions

In [79]:
def get_feature_agreement(adata_pp,
                          feature_select_methods=('cell_ranger', 'seurat_v3', 'seurat'),
                          feature_numbers=(500, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000)):
    """Collect the genes selected by each feature-selection method and feature count.

    Parameters
    ----------
    adata_pp
        AnnData object to select genes from. It is copied, not modified.
    feature_select_methods
        Methods to run, in order. ``'seurat'``, ``'seurat_v3'`` and ``'cell_ranger'``
        are used as the ``flavor`` of ``sc.pp.highly_variable_genes``; ``'random'``
        draws genes with ``random.sample`` (unseeded).
    feature_numbers
        Numbers of genes to select for every method, in order.

    Returns
    -------
    list of dict
        One record per (method, feature number) with the keys
        ``feature_selection_method``, ``n_features`` and ``genes`` (selected gene
        names in ``var_names`` order).

    Raises
    ------
    ValueError
        If a method name is not one of the four supported ones.
    """
    # Local import: the "random" method needs it and the notebook's import cell
    # does not provide it.
    import random

    hvg_flavors = ("seurat", "seurat_v3", "cell_ranger")
    results = list()

    for feature_select_method in feature_select_methods:
        if feature_select_method != "random" and feature_select_method not in hvg_flavors:
            raise ValueError("Unknown feature selection method: " + str(feature_select_method))

        # One working copy per method is enough: the HVG call only writes to
        # .var/.uns, so it can be repeated on the same object for each feature number.
        adata = adata_pp.copy()
        if feature_select_method in ("seurat", "cell_ranger"):
            # These two flavors are run on normalized, log1p-transformed data;
            # seurat_v3 is run on the data as loaded.
            sc.pp.normalize_total(adata, inplace=True)
            sc.pp.log1p(adata)

        for feature_number in feature_numbers:
            print("Runnign HVG Selection for:", feature_select_method, feature_number)

            if adata.n_obs == 0:
                continue

            if feature_select_method == "random":
                genes_random = set(random.sample(list(adata.var_names), feature_number))
                genes = [gene for gene in adata.var_names if gene in genes_random]
            else:
                sc.pp.highly_variable_genes(adata, flavor=feature_select_method, n_top_genes=feature_number)
                # Only the gene names are needed, so read them through the mask
                # instead of copying a column subset of the whole matrix.
                hvg_mask = adata.var['highly_variable'].to_numpy(dtype=bool)
                genes = list(adata.var_names[hvg_mask])

            results.append({
                "feature_selection_method": feature_select_method,
                "n_features": feature_number,
                "genes": genes
            })
    return results

In [80]:
# Run every feature-selection method / feature-count combination on the loaded data.
feature_agreement = get_feature_agreement(adata_pp)

Runnign HVG Selection for: cell_ranger 500
Runnign HVG Selection for: cell_ranger 1000
Runnign HVG Selection for: cell_ranger 2000
Runnign HVG Selection for: cell_ranger 3000
Runnign HVG Selection for: cell_ranger 4000
Runnign HVG Selection for: cell_ranger 5000
Runnign HVG Selection for: cell_ranger 6000
Runnign HVG Selection for: cell_ranger 7000
Runnign HVG Selection for: cell_ranger 8000
Runnign HVG Selection for: seurat_v3 500
Runnign HVG Selection for: seurat_v3 1000
Runnign HVG Selection for: seurat_v3 2000
Runnign HVG Selection for: seurat_v3 3000
Runnign HVG Selection for: seurat_v3 4000
Runnign HVG Selection for: seurat_v3 5000
Runnign HVG Selection for: seurat_v3 6000
Runnign HVG Selection for: seurat_v3 7000
Runnign HVG Selection for: seurat_v3 8000
Runnign HVG Selection for: seurat 500
Runnign HVG Selection for: seurat 1000
Runnign HVG Selection for: seurat 2000
Runnign HVG Selection for: seurat 3000
Runnign HVG Selection for: seurat 4000
Runnign HVG Selection for: seurat 

In [81]:
# Displays the complete list of records, including every selected gene name.
feature_agreement

[{'feature_selection_method': 'cell_ranger',
  'n_features': 500,
  'genes': ['CFH',
   'SLC4A1',
   'THSD7A',
   'ACSM3',
   'PRKAR2B',
   'ETV1',
   'TENM1',
   'TYROBP',
   'ALOX5',
   'IGF1',
   'SLC38A5',
   'CNTN1',
   'CYP24A1',
   'CPS1',
   'GRAMD1B',
   'ABCC2',
   'VIM',
   'ANK1',
   'VCAN',
   'RIPOR3',
   'ARHGAP6',
   'CELF2',
   'TNFRSF9',
   'KCNH2',
   'SLC2A3',
   'BCAT1',
   'LIMCH1',
   'FAM107B',
   'CD84',
   'ACSL4',
   'TGFBR3',
   'FRMPD1',
   'MYO3B',
   'TP63',
   'IGF2BP2',
   'ST6GAL1',
   'CA12',
   'TXK',
   'SEMA3A',
   'SEMA3C',
   'ARHGAP15',
   'MCAM',
   'ARG2',
   'MEF2C',
   'PTPRC',
   'FYB1',
   'SLCO1A2',
   'AKR1B1',
   'CEACAM6',
   'CASS4',
   'HEPH',
   'LYZ',
   'SEL1L3',
   'NLRP1',
   'ESR1',
   'SORBS1',
   'TPTEP1',
   'GRAP2',
   'DHRS2',
   'GATA1',
   'TIMP1',
   'GABRE',
   'FLT1',
   'SLC7A5',
   'ACSBG1',
   'SCG3',
   'CALB1',
   'NCALD',
   'TGFB1',
   'SIGLEC6',
   'TFPI2',
   'STEAP1B',
   'CPED1',
   'EPHB6',
   'TFR2',
   '

In [82]:
# Serialize the per-setting selected-gene records to a pickle file next to the notebook.
with open("./genes_per_pp_condition.pkl", 'wb') as out_file:
    pkl.dump(feature_agreement, out_file)

## DEGs included in each preprocessing (pp) condition

In [32]:
def inspect_degs_overlap(adata_pp, all_degs):
    """Measure how many DEGs of each condition survive each feature selection.

    Gene selections come from ``get_feature_agreement(adata_pp)``. For every
    selection and every ``<cell_type>_<product_name>_<dose>`` label found in
    ``adata_pp.obs`` that is a key of ``all_degs``, the condition's DEGs are
    intersected with the selected genes.

    Parameters
    ----------
    adata_pp
        AnnData object with ``cell_type``, ``product_name`` and ``dose`` columns in
        ``.obs``.
    all_degs
        Dict mapping condition labels to lists of DEG names.

    Returns
    -------
    list of dict
        One record per (selection, condition) with the keys ``cell_type``,
        ``drug``, ``dosage``, ``feature_selection_method``, ``n_features``,
        ``all_degs``, ``all_genes``, ``common_genes``, ``pct_included`` (``None``
        when the condition has no DEGs) and ``number_of_degs``.
    """
    # Keep the label components alongside the label instead of re-splitting the
    # label on "_", which mis-parses any name that itself contains an underscore.
    condition_parts = adata_pp.obs[['cell_type', 'product_name', 'dose']].astype(str)
    condition_parts['condition'] = (
        condition_parts['cell_type'] + "_" + condition_parts['product_name'] + "_" + condition_parts['dose']
    )
    condition_parts = condition_parts.drop_duplicates('condition')
    conditions = list(condition_parts.itertuples(index=False, name=None))

    results = list()

    # The gene selection is identical to get_feature_agreement, so reuse it
    # rather than repeating the HVG code here.
    for selection in get_feature_agreement(adata_pp):
        genes_adata = selection["genes"]
        selected_genes = set(genes_adata)  # built once per selection, not per condition

        for cell_type, drug, dosage, cond in tqdm(conditions):
            if cond not in all_degs:
                print(cond, "not in DEGs, skipping...")
                continue

            cond_degs = all_degs[cond]
            common_genes = list(set(cond_degs) & selected_genes)

            if len(cond_degs) == 0:
                pct_included = None
            else:
                pct_included = len(common_genes) / len(cond_degs)

            results.append({
                "cell_type": cell_type,
                "drug": drug,
                "dosage": dosage,
                "feature_selection_method": selection["feature_selection_method"],
                "n_features": selection["n_features"],
                "all_degs": cond_degs,
                "all_genes": genes_adata,
                "common_genes": common_genes,
                "pct_included": pct_included,
                "number_of_degs": len(cond_degs)
            })

    return results
                    

In [None]:
# Compute the DEG / selected-gene overlap for every selection setting and condition.
# NOTE(review): this cell has no recorded execution count although degs_overlap is
# pickled further down - re-run it before trusting the saved file.
degs_overlap = inspect_degs_overlap(adata_pp, all_degs)

In [34]:
# Serialize the overlap records to a pickle file next to the notebook.
with open("./degs_overlap_stats.pkl", 'wb') as out_file:
    pkl.dump(degs_overlap, out_file)

## DEGs enrichment

In [56]:
import gseapy as gp
import pandas as pd

# Gene-set library used for the over-representation test of each condition's DEGs.
genesets = gp.get_library(name="MSigDB_Hallmark_2020", organism="Human")  # dict

# Maps condition label -> list of enriched terms with adjusted p-value <= 0.05.
# Conditions with an empty DEG list are not added at all; conditions that were
# tested but have no significant term map to an empty list.
results_summary = dict()

for cond, degs in all_degs.items():
    if not degs:
        continue

    enr = gp.enrich(
        gene_list=list(set(degs)),
        gene_sets=genesets,        # dict here
        outdir=None,
        cutoff=0.05,
    )

    res = enr.results

    if res is None or len(res) == 0:
        results_summary[cond] = []
        continue

    df_enr = res if isinstance(res, pd.DataFrame) else pd.DataFrame(res)

    # A single assignment covers the "nothing significant" case too: an
    # all-False mask yields an empty list.
    significant = df_enr["Adjusted P-value"] <= 0.05
    results_summary[cond] = list(df_enr.loc[significant, 'Term'])



2025-08-21 14:48:52,869 [ERROR] No hits returned for all input gene sets!
2025-08-21 14:48:52,893 [ERROR] No hits returned for all input gene sets!
2025-08-21 14:48:52,909 [ERROR] No hits returned for all input gene sets!
2025-08-21 14:48:52,912 [ERROR] No hits returned for all input gene sets!
2025-08-21 14:48:52,920 [ERROR] No hits returned for all input gene sets!
2025-08-21 14:48:52,927 [ERROR] No hits returned for all input gene sets!
2025-08-21 14:48:52,993 [ERROR] No hits returned for all input gene sets!
2025-08-21 14:48:53,003 [ERROR] No hits returned for all input gene sets!
2025-08-21 14:48:53,019 [ERROR] No hits returned for all input gene sets!
2025-08-21 14:48:53,045 [ERROR] No hits returned for all input gene sets!
2025-08-21 14:48:53,048 [ERROR] No hits returned for all input gene sets!
2025-08-21 14:48:53,051 [ERROR] No hits returned for all input gene sets!
2025-08-21 14:48:53,071 [ERROR] No hits returned for all input gene sets!
2025-08-21 14:48:53,074 [ERROR] No hit

In [57]:
# Displays the complete mapping of condition label -> significantly enriched terms.
results_summary

{'A549_Enzastaurin (LY317615)_1000.0': [],
 'A549_MLN8054_100.0': [],
 'A549_Thalidomide_1000.0': ['Hypoxia', 'mTORC1 Signaling'],
 'A549_Linifanib (ABT-869)_10000.0': ['Xenobiotic Metabolism'],
 'A549_Resminostat_1000.0': ['Androgen Response'],
 'A549_Quisinostat (JNJ-26481585) 2HCl_10.0': ['Androgen Response',
  'Complement',
  'Estrogen Response Early',
  'Estrogen Response Late',
  'Xenobiotic Metabolism',
  'mTORC1 Signaling'],
 'A549_Temsirolimus (CCI-779, NSC 683864)_100.0': [],
 'A549_ENMD-2076_1000.0': ['p53 Pathway'],
 'A549_Crizotinib (PF-02341066)_1000.0': ['p53 Pathway'],
 'A549_Panobinostat (LBH589)_1000.0': ['Androgen Response',
  'Cholesterol Homeostasis',
  'Estrogen Response Early',
  'Estrogen Response Late',
  'Fatty Acid Metabolism',
  'G2-M Checkpoint',
  'Mitotic Spindle',
  'UV Response Dn',
  'Xenobiotic Metabolism',
  'mTORC1 Signaling'],
 'A549_Sorafenib Tosylate_1000.0': [],
 'A549_Mesna _1000.0': [],
 'A549_Trametinib (GSK1120212)_1000.0': [],
 'A549_Roscov

In [66]:
# Serialize the per-condition enrichment summary to a pickle file next to the notebook.
with open("./drug_pathway_enrichment.pkl", 'wb') as out_file:
    pkl.dump(results_summary, out_file)