In [1]:
import os
import scanpy as sc
import decoupler as dc
import pandas as pd
import numpy as np
import anndata as an
from sklearn.preprocessing import StandardScaler
#from scripts.workflow.utils.scale_by_celltype import scale_by_celltype

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.sparse import issparse

# Define a function to scale data by cell type within a cohort
def scale_by_cell_type(pdata, disease, cohort_name=None):

     if issparse(pdata.X):
         # Convert the sparse matrix to a dense format
         pdata.X = pdata.X.toarray()

     # Subset data for the given cohort
     #pdata_cohort = pdata[pdata.obs['studyID'] == cohort_name]

     # Get unique cell types
     cell_types = pdata.obs['Level1'].unique()

     # Create a list to store scaled data
     scaled_data_list = []

     # Iterate over each cell type
     for cell_type in cell_types:
         for disease in pdata.obs['disease'].unique(): 
         # Subset data for healthy and disease patients within the cell type
             pdata_h = pdata[(pdata.obs['disease'] ==
    'healthy') & (pdata.obs['Level1'] == cell_type)]
             pdata_r = pdata[(pdata.obs['disease'] ==
    disease) & (pdata.obs['Level1'] == cell_type)]
    
             if pdata_h.shape[0] > 0 and pdata_r.shape[0] > 0:
                 # Scale disease patients based on healthy controls
                 scaler = StandardScaler()
                 scaler.fit(pdata_h.X)
                 pdata_r_scaled = pdata_r.copy()
                 pdata_r_scaled.X = scaler.transform(pdata_r.X)
    
                 # Append scaled data to the list
                 scaled_data_list.append(pdata_r_scaled)
     # Concatenate scaled data for all cell types
     scaled_data = scaled_data_list[0].concatenate(scaled_data_list[1:])

     return scaled_data


In [31]:
def scale_by_cell_type(pdata, disease, cohort_name=None):
     if issparse(pdata.X):
         # Convert the sparse matrix to a dense format
         pdata.X = pdata.X.toarray()
     # Subset data for the given cohort
     if cohort_name is not None: 
         pdata_cohort = pdata[pdata.obs['studyID'] == cohort_name].copy()
     else: 
         pdata_cohort = pdata.copy()

     # Get unique cell types
     cell_types = pdata_cohort.obs['Level1'].unique()

     # Create a list to store scaled data
     scaled_data_list = []

     # Iterate over each cell type
     for cell_type in cell_types:
         # Subset data for healthy and SLE patients within the cell type
         pdata_cohort_h = pdata_cohort[(pdata_cohort.obs['disease'] ==
'healthy') & (pdata_cohort.obs['Level1'] == cell_type)]
         pdata_cohort_r = pdata_cohort[(pdata_cohort.obs['disease'] ==
disease) & (pdata_cohort.obs['Level1'] == cell_type)]

         if pdata_cohort_h.shape[0] > 0 and pdata_cohort_r.shape[0] > 0:
             # Scale SLE patients based on healthy controls
             scaler = StandardScaler()
             scaler.fit(pdata_cohort_h.X)
             pdata_cohort_r_scaled = pdata_cohort_r.copy()
             pdata_cohort_r_scaled.X = scaler.transform(pdata_cohort_r.X)

             # Append scaled data to the list
             scaled_data_list.append(pdata_cohort_r_scaled)

     # Concatenate scaled data for all cell types
     scaled_data = scaled_data_list[0].concatenate(scaled_data_list[1:]) if scaled_data_list else None
     return scaled_data

In [46]:
# Parameters
diseases = ["SLE", "Cirrhosis", "Flu", "HNSCC"]
workDir = "/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/"
# --- Input 
data_path = workDir + "03_downstream_analysis/07_gene_regulatory_network/results/"
net_merged_inputpath = data_path + 'test_revision_20250402/TFactivity_level2/data/net_merged_new.csv'

# --- Output 
output_folder = data_path + 'test_revision_20250402/TFactivity_level2/results/'

In [54]:
adatas_dict = {}
for disease in diseases: 
    inputpath = os.path.join(data_path, f"04_MAIN_geneUniverse.log1p_{disease}subset.h5ad")                     
    dis_adata = sc.read_h5ad(inputpath)
    dis_adata.obs["disease"] = dis_adata.obs["disease"].replace({"cirrhosis": "Cirrhosis", 
                                                                 "flu": "Flu"})
    adatas_dict[disease]=dis_adata

  dis_adata.obs["disease"] = dis_adata.obs["disease"].replace({"cirrhosis": "Cirrhosis",
  dis_adata.obs["disease"] = dis_adata.obs["disease"].replace({"cirrhosis": "Cirrhosis",


In [55]:
adatas_dict

{'SLE': AnnData object with n_obs × n_vars = 996812 × 51
     obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'disease', 'sex', 'binned_age', 'Level1', 'Level2', 'SLEDAI_score', 'Flare', 'Responder'
     var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status', 'highly_variable'
     uns: 'log1p',
 'Cirrhosis': AnnData object with n_obs × n_vars = 78149 × 51
     obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'disease', 'sex', 'binned_age', 'Level1', 'Level2'
     var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status', 'highly_variable'
     uns: 'log1p',
 'Flu': AnnData object with n_obs × n_vars = 106739 × 51
     obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'disease', 'sex', 'binned_age', 'Level1', 'Level2'
     var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status', 'highly_variable'
     uns: 'log1p',
 'HNSCC': AnnData object with n_obs × n_vars = 14842 × 51
     obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'disease', 'sex', 'binned_age', 'Level1', 'Le

In [56]:
for key, adata in adatas_dict.items(): 
    print(adata.obs.disease.unique())

['healthy', 'SLE']
Categories (2, object): ['SLE', 'healthy']
['healthy', 'Cirrhosis']
Categories (2, object): ['Cirrhosis', 'healthy']
['healthy', 'Flu']
Categories (2, object): ['Flu', 'healthy']
['healthy', 'HNSCC']
Categories (2, object): ['HNSCC', 'healthy']


In [12]:
#adata = an.concat(adatas)
#adata.obs.disease.unique()
#adata

In [50]:
net_merged = pd.read_csv(net_merged_inputpath, index_col = 0)
net_merged

Unnamed: 0,source_collectri,target,weight,PMID,index,source_net,Factor_celltype,Factor_function,value
2,SMAD3,JUN,1,10022869;12374795,ENSG00000177606,SPECTRA_0,global,all_IL6-JAK-STAT3_signaling,0.061590
3,SMAD3,JUN,1,10022869;12374795,ENSG00000177606,SPECTRA_65,T_CD4_Naive,Tregs_FoxP3_stabilization,0.000138
4,SMAD4,JUN,1,10022869;12374795,ENSG00000177606,SPECTRA_0,global,all_IL6-JAK-STAT3_signaling,0.061590
5,SMAD4,JUN,1,10022869;12374795,ENSG00000177606,SPECTRA_65,T_CD4_Naive,Tregs_FoxP3_stabilization,0.000138
6,STAT5A,IL2,1,10022878;11435608;17182565;17911616;22854263;2...,ENSG00000109471,SPECTRA_68,T_CD4_Naive,cytokine_and_receptors_proinflammatory,0.001665
...,...,...,...,...,...,...,...,...,...
73841,NFKB,TNFSF12,1,9560343,ENSG00000239697,SPECTRA_87,T_CD4_NonNaive,TNF_receptors_ligands,0.088949
73842,NFKB,TNFSF12,1,9560343,ENSG00000239697,SPECTRA_99,T_CD8_Naive,TNF_receptors_ligands,0.069565
73843,NFKB,TNFSF12,1,9560343,ENSG00000239697,SPECTRA_111,T_CD8_NonNaive,TNF_receptors_ligands,0.109352
73844,NFKB,TNFSF12,1,9560343,ENSG00000239697,SPECTRA_121,UTC,TNF_receptors_ligands,0.114455


In [51]:
collectri = dc.get_collectri(organism='human', split_complexes=False)
net_merged = net_merged[
    (net_merged['Factor_function'] == 'IFN_response') & 
    (net_merged['source_collectri'].isin(['STAT1', 'SP1']))]
net_merged

Unnamed: 0,source_collectri,target,weight,PMID,index,source_net,Factor_celltype,Factor_function,value
225,SP1,FOS,1,10082538;17626239;9528985,ENSG00000170345,SPECTRA_46,Mono,IFN_response,0.001059
334,SP1,RAC2,1,10098607,ENSG00000128340,SPECTRA_34,ILC,IFN_response,0.001280
1456,SP1,SPI1,1,10446912;12706891;22734486,ENSG00000066336,SPECTRA_46,Mono,IFN_response,0.000397
2281,SP1,OAS1,1,10567409,ENSG00000089127,SPECTRA_13,B,IFN_response,0.001815
2282,SP1,OAS1,1,10567409,ENSG00000089127,SPECTRA_24,DC,IFN_response,0.118557
...,...,...,...,...,...,...,...,...,...
58692,STAT1,MX1,1,9781817,ENSG00000157601,SPECTRA_71,T_CD4_Naive,IFN_response,0.061918
58693,STAT1,MX1,1,9781817,ENSG00000157601,SPECTRA_98,T_CD8_Naive,IFN_response,0.003288
58694,STAT1,MX1,1,9781817,ENSG00000157601,SPECTRA_110,T_CD8_NonNaive,IFN_response,0.000468
58695,STAT1,MX1,1,9781817,ENSG00000157601,SPECTRA_130,pDC,IFN_response,0.239647


In [9]:
cells = list(net_merged['Factor_celltype'].unique())
cells

['Mono',
 'ILC',
 'B',
 'DC',
 'Plasma',
 'T_CD4_Naive',
 'T_CD8_Naive',
 'T_CD8_NonNaive',
 'pDC',
 'T_CD4_NonNaive',
 'UTC']

In [65]:
output_folder

'/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/03_downstream_analysis/07_gene_regulatory_network/results/test_revision_20250402/TFactivity_level2/results/'

In [57]:
pdata_objs = {}
for disease, adata in adatas_dict.items(): 
    print(f"---------- {disease} ------------")
    # Arguments
    level = 'Level2'
    if (level == 'Level2'):
        add_levels = ['Level1', 'Level2']
    else:
        add_levels = [level]
    
    # data Load
    adata.var_names = adata.var['symbol'].astype(str)

    
    if disease in pdata_objs: 
        pdata = pdata_objs[disease]
    else:
        pdata = dc.get_pseudobulk(
            adata,
            sample_col = 'sampleID',
            groups_col = level,
            layer=None,
            mode='mean',
            min_cells=10,
            min_counts=0
        )
        pdata_objs[disease] = pdata

---------- SLE ------------
---------- Cirrhosis ------------
---------- Flu ------------
---------- HNSCC ------------


In [38]:
disease = "cirrhosis" 
pdata = pdata_objs[disease]
cohort_name = None
if issparse(pdata.X):
     # Convert the sparse matrix to a dense format
     pdata.X = pdata.X.toarray()
 # Subset data for the given cohort
if cohort_name is not None: 
     pdata_cohort = pdata[pdata.obs['studyID'] == cohort_name].copy()
else: 
     pdata_cohort = pdata.copy()

 # Get unique cell types
cell_types = pdata_cohort.obs['Level1'].unique()

 # Create a list to store scaled data
scaled_data_list = []

 # Iterate over each cell type
for cell_type in cell_types:
     # Subset data for healthy and SLE patients within the cell type
     pdata_cohort_h = pdata_cohort[(pdata_cohort.obs['disease'] ==
'healthy') & (pdata_cohort.obs['Level1'] == cell_type)]
     pdata_cohort_r = pdata_cohort[(pdata_cohort.obs['disease'] ==
disease) & (pdata_cohort.obs['Level1'] == cell_type)]

     if pdata_cohort_h.shape[0] > 0 and pdata_cohort_r.shape[0] > 0:
         # Scale SLE patients based on healthy controls
         scaler = StandardScaler()
         scaler.fit(pdata_cohort_h.X)
         pdata_cohort_r_scaled = pdata_cohort_r.copy()
         pdata_cohort_r_scaled.X = scaler.transform(pdata_cohort_r.X)

         # Append scaled data to the list
         scaled_data_list.append(pdata_cohort_r_scaled)

 # Concatenate scaled data for all cell types
scaled_data = scaled_data_list[0].concatenate(scaled_data_list[1:]) if scaled_data_list else None


In [43]:
pdata_cohort.obs.disease.unique()

array(['healthy', 'cirrhosis'], dtype=object)

In [30]:
for i in scaled_data_list: 
    print(i.shape)
    print(i.obs.disease.unique())
    print(i.obs.Level1.unique())

(65, 51)
['healthy']
['Mono']
(40, 51)
['cirrhosis']
['Mono']


In [58]:
for disease, pdata in pdata_objs.items(): 
    print(f"---------- {disease} ------------")
    print(pdata.obs.disease.unique())
    pdata_scaled = scale_by_cell_type(pdata, disease=disease)
    print(pdata_scaled.obs.disease.unique())

---------- SLE ------------
['SLE' 'healthy']
['SLE']
---------- Cirrhosis ------------
['healthy' 'Cirrhosis']
['Cirrhosis']
---------- Flu ------------
['healthy' 'Flu']
['Flu']
---------- HNSCC ------------
['healthy' 'HNSCC']
['HNSCC']


  scaled_data = scaled_data_list[0].concatenate(scaled_data_list[1:]) if scaled_data_list else None
  scaled_data = scaled_data_list[0].concatenate(scaled_data_list[1:]) if scaled_data_list else None
  scaled_data = scaled_data_list[0].concatenate(scaled_data_list[1:]) if scaled_data_list else None
  scaled_data = scaled_data_list[0].concatenate(scaled_data_list[1:]) if scaled_data_list else None


In [60]:
for disease, adata in adatas_dict.items(): 
    print(f"---------- {disease} ------------")
    # Arguments
    level = 'Level2'
    if (level == 'Level2'):
        add_levels = ['Level1', 'Level2']
    else:
        add_levels = [level]
    
    # data Load
    adata.var_names = adata.var['symbol'].astype(str)

    
    if disease in pdata_objs.keys():
        pdata = pdata_objs[disease]
    else:
        pdata = dc.get_pseudobulk(
            adata,
            sample_col = 'sampleID',
            groups_col = level,
            layer=None,
            mode='mean',
            min_cells=10,
            min_counts=0
        )
        pdata_objs[disease] = pdata
    
    # Scale by cell
    # Apply the function to each cohort
    pdata_scaled = scale_by_cell_type(pdata,disease=disease)
    
    res = {}
    cells = pdata_scaled.obs['Level1'].unique()
    for cell in cells:
    
        print(f"Running: {cell}")
        try:
            # Subet network by cell type
            net = net_merged[net_merged['Factor_celltype'] == cell]
            
            # Subset psbulk by cell type
            pdata_r_ct = pdata_scaled[pdata_scaled.obs['Level1'] == cell].copy()
    
            # Run ulm and store results
            dc.run_ulm(
                pdata_r_ct,
                net,
                source='source_collectri',
                target='target',
                weight='weight',
                use_raw=False,
                min_n=10
            )
            acts = dc.get_acts(pdata_r_ct, obsm_key='ulm_estimate')
            tfs = list(acts.var_names)
            diff = list(set(['STAT1', 'SP1']) - set(tfs))
            df = sc.get.obs_df(acts, tfs + add_levels + ['studyID', 'sex', 'sampleID', 'disease', 'binned_age'])
            if len(diff) > 0:
                df[diff] = np.nan    
            
            res[cell] = df[add_levels + ['STAT1', 'SP1', 'studyID', 'sex', 'sampleID', 'disease', 'binned_age']]
        
        except Exception as e:
            print(f"Error for {cell}")
    
    
    results_df = pd.concat(res).reset_index().set_index(level)
    results_df.to_csv(output_folder + f'/STAT1_SP1_{level}_{disease}.csv')

---------- SLE ------------
Running: B
Error for B
Running: DC


  scaled_data = scaled_data_list[0].concatenate(scaled_data_list[1:]) if scaled_data_list else None


Running: UTC
Error for UTC
Running: Mono
Running: ILC
Running: Cycling_cells
Error for Cycling_cells
Running: Plasma
Running: Platelets
Error for Platelets
Running: T_CD4_NonNaive
Error for T_CD4_NonNaive
Running: T_CD4_Naive
Running: T_CD8_NonNaive
Running: T_CD8_Naive
Running: pDC


  results_df = pd.concat(res).reset_index().set_index(level)


---------- Cirrhosis ------------
Running: Mono


  scaled_data = scaled_data_list[0].concatenate(scaled_data_list[1:]) if scaled_data_list else None


---------- Flu ------------
Running: B
Error for B
Running: DC
Running: UTC
Error for UTC
Running: Mono
Running: ILC
Running: Cycling_cells
Error for Cycling_cells
Running: Plasma
Running: Platelets
Error for Platelets
Running: T_CD4_NonNaive
Error for T_CD4_NonNaive
Running: T_CD4_Naive
Running: T_CD8_NonNaive
Running: T_CD8_Naive
Running: pDC


  scaled_data = scaled_data_list[0].concatenate(scaled_data_list[1:]) if scaled_data_list else None
  results_df = pd.concat(res).reset_index().set_index(level)


---------- HNSCC ------------
Running: Mono


  scaled_data = scaled_data_list[0].concatenate(scaled_data_list[1:]) if scaled_data_list else None
