In [1]:
# Run parameter: selects which SPECTRA result file is loaded (..._lam{LAM}_...) and
# tags the CSV files written by this notebook.
LAM=0.001

In [2]:
# Fail fast when a required run parameter has not been defined in the first cell.
for v in ['LAM']:
    if v not in globals():
        # NameError is the built-in exception for an undefined name.
        raise NameError(f"{v} not defined")
    # Plain namespace lookup; no need to eval() the variable name.
    print(f"{v} = {globals()[v]}")

LAM = 0.001


*env: inflammaSpectra*

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import os
import sys
import scanpy as sc

from tqdm import tqdm
from sklearn.feature_selection import f_classif, mutual_info_classif
from scipy.stats import spearmanr, pearsonr, false_discovery_control

from kneed import DataGenerator, KneeLocator
from pyprojroot import here
import session_info

import Spectra as spc

from Spectra import K_est as kst
from upsetplot import from_contents, UpSet

# Sig 
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
import math

sys.path.insert(1, str(here('bin')))
from customPalette import diseases_palette
from customPythonFunctions import generateID2SymbolDF

import warnings
warnings.filterwarnings('ignore')

from itertools import product

import decoupler as dc

# GENERAL NOTES

## How to use SPECTRA results
We have used SEACells to run SPECTRA. Consequently: 
1. SPECTRA activities have to be used using the spectra_categories. If we want to go deeper, we should transfer the results to the cell level. 
2. To compute cell-activities we have to extract the signatures provided by SPECTRA and use another method at cell level. In this case, there are two approaches:
   - Take topX markers (what we did in version 1). 
   - Put a threshold in the minimum weight for a gene to be included. These weights could also be used as input for the method to compute the signature

## Interpretation
A. GLOBAL FACTORS 
1. Global factors should be interpreted independently of the cell type. In principle, these are factors that affect the cells no matter their cell type. We could plot the activity across cell types to confirm.
2. To make use of the "non-annotated" factors, I was thinking of assessing these global factors to find one that is interesting given its behaviour across diseases. Then, using the genes included, we could propose a function that is global (not cell-type dependent) and specifically active in a set of diseases.
   
B. CELL TYPE FACTORS 

In contrast to the global factors, here I would focus on the annotated factors (unless a clear signal is seen). There is code to assess the activity of all the cell-type-related factors across cell types. I would use this as a starting point.
   - Non-annotated factors: If no interesting pattern is seen, I would remove/ignore them.
   - Annotated factors:
     - *One function - One Factor*: In these cases, I would try to see the relevance of the signature in the cell type and the genes included in the factor.
     - *One function - Multiple factors*: In these cases, I would first assess (a) the concordance of the genes within each factor and (b) the concordance of the activity across diseases. If (a) and (b) are similar, I would merge them. If (a) is similar but (b) is not, I would try to see what makes the factors differ.
    
C. DOWNSTREAM ANALYSIS 

Given that the rest of the analyses (classifiers) are done stratified by cell type, I would not include the global factors. This is another reason to go deeper into the non-annotated factors only for the global ones. This way the analyses could be parallelized: 
1. Identify which factors to merge
2. A. Annotated factors
   - Assess activities across cell types
   - Use them for cell/patient classifier
3. B. Unannotated factors
   - Assess activities across diseases
   - Annotate them
   - Define function 

## Explanation of the script 
Here you have: 
1. Explanation to all the SPECTRA output: factor_names, gene_weights, cell_scores, factor naming (overlapping)
2. Overall analysis of factor-factor correlation
3. Analysis of global factors
4. Analysis of cell-type-specific factors: I did not filter for annotation so that you can first assess all of them together. 


# 0. Load data
In the previous script we generated
1. Adata with spectra results
2. Gene set dictionary
3. We defined a given obs_key
4. Spectra model

In [4]:
# Main post-QC AnnData, opened with backed='r' (read-only backed mode).
# Only its .obs and .var tables are read below before the object is deleted.
adataM = sc.read_h5ad(here("00_data_processing/results/02_INFLAMMATION_main_afterQC.h5ad"), backed='r')

In [5]:
# Distinct combinations of the sample-level columns, later merged onto SEAcells by 'sampleID'.
# NOTE(review): presumably one row per sampleID; if a sample had several combinations the
# left merge on 'sampleID' would duplicate SEAcell rows — confirm.
sampleMetadata = adataM.obs[['sampleID','chemistry','technology','patientID','disease','sex','binned_age']].drop_duplicates()

In [6]:
selected_genes = pd.read_pickle(here('03_downstream_analysis/02_gene_universe_definition/results/04_selected_gene_list.pkl'))

In [7]:
project_dir = str(here('03_downstream_analysis/05_SPECTRA/'))

# Create the output folders. exist_ok=True makes the cell safe to re-run and
# avoids the check-then-create gap of a separate os.path.exists() test.
os.makedirs(f"{project_dir}/results/figures", exist_ok=True)
os.makedirs(f"{project_dir}/results/markers", exist_ok=True)

In [8]:
# Load spectra results
# obs column with the category used by SPECTRA (listed as obs: 'spectra_categories' in the repr below)
obs_key = "spectra_categories" 
# The result file is selected through the LAM run parameter
adataSPECTRA = sc.read_h5ad(f"{project_dir}/results/04_MAIN_SEAcell_scANVInorm_SPECTRA_lam{LAM}_overlap0.3_v2.h5ad")

# We used a pre-subsetted dataset, thus all genes were used
# (every gene is flagged; the flag is used later to pick the gene_weights columns)
adataSPECTRA.var['spectra_vocab'] = True
adataSPECTRA

AnnData object with n_obs × n_vars = 71108 × 8253
    obs: 'spectra_categories'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status', 'mt', 'ribo', 'hb', 'plt', 'spectra_vocab'
    uns: 'SPECTRA_L', 'SPECTRA_factors', 'SPECTRA_markers', 'SPECTRA_overlap', 'log1p'
    obsm: 'SPECTRA_cell_scores'

In [9]:
# Load SEAcell information
SEAdata = sc.read_h5ad(here("03_downstream_analysis/04_integration_with_annotation/results/normalized_adatas_nextflow/SEAcellAdataMerged.log1p.h5ad"))
SEAdata

AnnData object with n_obs × n_vars = 71108 × 8253
    obs: 'Level1', 'sampleID', 'disease'
    uns: 'log1p'

In [10]:
# Swap the SEAcell-level 'disease' column for the full sample metadata.
# The obs names travel through the merge as the 'index' column and are restored afterwards.
SEAdata.obs = (
    SEAdata.obs
    .drop(columns='disease')
    .reset_index()
    .merge(sampleMetadata, how='left', on='sampleID')
    .set_index('index')
)

In [11]:
# Attach sample-level metadata to every SEAcell by matching obs indices
# (left join: every adataSPECTRA row is kept).
adataSPECTRA.obs = adataSPECTRA.obs.merge(SEAdata.obs[['sampleID','chemistry','technology','patientID','disease']], 
                                          left_index=True, right_index=True, how='left')
# Same index-aligned left join for the gene annotation.
# NOTE(review): the AnnData repr printed when the file was loaded already lists 'hgnc_id'
# and 'symbol' in adataSPECTRA.var, so this merge would produce suffixed duplicates
# (hgnc_id_x/_y, symbol_x/_y) — confirm this is intended.
adataSPECTRA.var = adataSPECTRA.var.merge(adataM.var[['hgnc_id','symbol']], left_index=True, right_index=True, how='left')

In [12]:
# Load gene set dict
with open(here('03_downstream_analysis/05_SPECTRA/results/SPECTRA_GeneSet_ensg_v2.pickle'), 'rb') as f:
    spectra_gene_set = pickle.load(f)
spectra_gene_set

{'global': {'global_all_IL6-JAK-STAT3_signaling': ['ENSG00000141506',
   'ENSG00000096996',
   'ENSG00000243646',
   'ENSG00000164400',
   'ENSG00000004468',
   'ENSG00000163734',
   'ENSG00000134460',
   'ENSG00000077238',
   'ENSG00000163737',
   'ENSG00000105397',
   'ENSG00000227507',
   'ENSG00000026103',
   'ENSG00000205755',
   'ENSG00000184371',
   'ENSG00000115145',
   'ENSG00000142166',
   'ENSG00000027697',
   'ENSG00000139567',
   'ENSG00000131724',
   'ENSG00000056736',
   'ENSG00000196396',
   'ENSG00000143575',
   'ENSG00000170458',
   'ENSG00000105329',
   'ENSG00000168610',
   'ENSG00000159128',
   'ENSG00000198223',
   'ENSG00000134470',
   'ENSG00000184557',
   'ENSG00000100368',
   'ENSG00000259207',
   'ENSG00000115232',
   'ENSG00000169245',
   'ENSG00000111321',
   'ENSG00000213928',
   'ENSG00000170581',
   'ENSG00000067182',
   'ENSG00000124334',
   'ENSG00000163823',
   'ENSG00000232810',
   'ENSG00000006327',
   'ENSG00000137193',
   'ENSG00000108688',
   'EN

In [13]:
list(spectra_gene_set['global'].keys())

['global_all_IL6-JAK-STAT3_signaling',
 'global_all_JAK-STAT_signaling',
 'global_all_type-I and II-ifn-response',
 'global_all_TNF-via-NFkB_signaling',
 'global_all_MHC-II-I presentation',
 'global_all_complement_production']

In [14]:
del SEAdata
del adataM

In [15]:
varDF = pd.read_csv(here("03_downstream_analysis/05_SPECTRA/results/varDF.csv"))
varDF.head()

Unnamed: 0,ensembl_gene_id,hgnc_id,symbol,locus_group,HUGO_status,mt,ribo,hb,plt
0,ENSG00000000003,HGNC:11858,TSPAN6,protein_coding,official,False,False,False,False
1,ENSG00000000419,HGNC:3005,DPM1,protein_coding,official,False,False,False,False
2,ENSG00000000457,HGNC:19285,SCYL3,protein_coding,official,False,False,False,False
3,ENSG00000000460,HGNC:25565,FIRRM,protein_coding,official,False,False,False,False
4,ENSG00000000938,HGNC:3697,FGR,protein_coding,official,False,False,False,False


# 1. Exploratory analysis of the factors

First we do a quick exploratory analysis of the obtained factors. Remember that we have both global and cell-type specific factors. For each of these categories we can obtain annotated or unannotated factors depending on the overlap with the input gene-sets. Here we will obtain: 
- Number and naming of the factors
- Gene weights of each gene to each factor
- Overlapping coefficient between gene sets and factors
- Score of each factor in each cell 

## Number and name of the factors

Factors are named as: 

```['index' + '-X-' + 'cell type specificity' + '-X-' + 'cell-type' + '_' +'assigned label']```

Ex: *'0-X-global-X-lymphocyteB_MYC_targets'* 

See Section **SPECTRA_overlap: Overlapping factors-GeneSets** to understand *"assigned label"*

In [16]:
factors = adataSPECTRA.uns['SPECTRA_overlap'].index
print(f"Total number of factors: {len(factors)}")

Total number of factors: 135


In [17]:
for factor in factors: 
    print(factor)

0-X-global-X-global_all_IL6-JAK-STAT3_signaling
1-X-global-X-B_IFN_Type_1_2_Lambda
2-X-global-X-global_all_type-I and II-ifn-response
3-X-global-X-global_all_TNF-via-NFkB_signaling
4-X-global-X-global_all_MHC-II-I presentation
5-X-global-X-global_all_complement_production
6-X-global-X-6
7-X-B-X-effector
8-X-B-X-B_chemokines
9-X-B-X-Plasma_chemokine_receptors
10-X-B-X-UTC_cytokine_and_receptors_proinflammatory
11-X-B-X-T_CD4_Naive_cytokine_and receptors_ antiinflammatory
12-X-B-X-ILC_IFN_Type_1_2_Lambda
13-X-B-X-Mono_IFN_response
14-X-B-X-T_CD8_NonNaive_TNF_receptors_ligands
15-X-B-X-T_CD4_NonNaive_adhesion_molecules
16-X-B-X-T_CD8_NonNaive_antigen_presentation_molecules
17-X-B-X-17
18-X-DC-X-DC_antigen-crosspresentation
19-X-DC-X-T_CD8_Naive_chemokines
20-X-DC-X-T_CD4_NonNaive_chemokine_receptors
21-X-DC-X-UTC_cytokine_and_receptors_proinflammatory
22-X-DC-X-cDC_cytokine_and receptors_ antiinflammatory
23-X-DC-X-T_CD8_Naive_IFN_Type_1_2_Lambda
24-X-DC-X-ILC_IFN_response
25-X-DC-X-T_CD8

In [18]:
def is_integer(element):
    """Return True when `element` is a run of digits, optionally preceded by '-'."""
    if element.isnumeric():
        return element.isdigit()
    return element.startswith('-') and element[1:].isdigit()


# Factor names look like '<index>-X-<category>-X-<label>'; split each one once.
_name_parts = [factor.split("-X-") for factor in factors]

Factor_number = [pieces[0] for pieces in _name_parts]
Factor_spectra_category = [pieces[1] for pieces in _name_parts]
Gene_set_overlap = [pieces[2] for pieces in _name_parts]
# A label that is only a number (e.g. '6-X-global-X-6') marks a factor without an assigned gene set.
Missing_annotation = [is_integer(pieces[2]) for pieces in _name_parts]

factor_df = pd.DataFrame(
    {
        'Factor_number': Factor_number,
        'Factor_spectra_category': Factor_spectra_category,
        'Gene_set_overlap': Gene_set_overlap,
        'Missing_annotation': Missing_annotation,
    }
)

In [19]:
factor_df

Unnamed: 0,Factor_number,Factor_spectra_category,Gene_set_overlap,Missing_annotation
0,0,global,global_all_IL6-JAK-STAT3_signaling,False
1,1,global,B_IFN_Type_1_2_Lambda,False
2,2,global,global_all_type-I and II-ifn-response,False
3,3,global,global_all_TNF-via-NFkB_signaling,False
4,4,global,global_all_MHC-II-I presentation,False
...,...,...,...,...
130,130,pDC,T_CD8_Naive_IFN_response,False
131,131,pDC,T_CD8_NonNaive_TNF_receptors_ligands,False
132,132,pDC,Plasma_adhesion_molecules,False
133,133,pDC,pDC_antigen_presentation_molecules,False


### Factors per celltype

In [20]:
# Print the factor table one category at a time, in order of first appearance.
unique_categories = factor_df['Factor_spectra_category'].unique()
for category in unique_categories:
    in_category = factor_df['Factor_spectra_category'] == category
    filtered_df = factor_df.loc[in_category]

    print(f"Factor_spectra_category: {category}")
    print(filtered_df)
    print()

Factor_spectra_category: global
  Factor_number Factor_spectra_category  \
0             0                  global   
1             1                  global   
2             2                  global   
3             3                  global   
4             4                  global   
5             5                  global   
6             6                  global   

                        Gene_set_overlap  Missing_annotation  
0     global_all_IL6-JAK-STAT3_signaling               False  
1                  B_IFN_Type_1_2_Lambda               False  
2  global_all_type-I and II-ifn-response               False  
3      global_all_TNF-via-NFkB_signaling               False  
4       global_all_MHC-II-I presentation               False  
5       global_all_complement_production               False  
6                                      6                True  

Factor_spectra_category: B
   Factor_number Factor_spectra_category  \
7              7                       B   
8  

# 2. Factor selection: Inflammatory-related factors

**Obtain gene weights to each factor**

In [20]:
adataSPECTRA.uns['SPECTRA_factors']

array([[1.66221956e-15, 3.25536173e-14, 7.10839330e-15, ...,
        8.83772892e-17, 2.97727820e-15, 1.52717275e-16],
       [1.10817069e-15, 2.29933164e-14, 4.75834305e-15, ...,
        6.68683883e-17, 2.28852607e-15, 9.87078124e-17],
       [2.73712048e-15, 5.74495102e-14, 1.25985771e-14, ...,
        1.54261408e-16, 4.60611215e-15, 2.54933156e-16],
       ...,
       [3.80380007e-15, 1.25286818e-13, 2.17760460e-14, ...,
        1.21154038e-13, 1.50187748e-13, 2.53273298e-14],
       [5.80486107e-15, 2.79097310e-13, 3.85169444e-14, ...,
        2.33132073e-14, 1.33310457e-13, 1.15636494e-14],
       [8.97457008e-04, 4.09201960e-02, 1.11760911e-02, ...,
        1.66005224e-02, 3.56527336e-02, 2.58524918e-03]])

In [21]:
# Gene-weight table: one row per factor, one column per gene flagged as SPECTRA vocabulary.
index_labels = adataSPECTRA.uns['SPECTRA_overlap'].index
_vocab_mask = adataSPECTRA.var['spectra_vocab']
gene_weights = pd.DataFrame(
    adataSPECTRA.uns['SPECTRA_factors'],
    index=index_labels,
    columns=adataSPECTRA.var[_vocab_mask].index,
)
gene_weights.head()

ensembl_gene_id,ENSG00000000003,ENSG00000000457,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,ENSG00000001460,ENSG00000001461,ENSG00000001561,...,ENSG00000278619,ENSG00000278637,ENSG00000278662,ENSG00000278677,ENSG00000278705,ENSG00000278817,ENSG00000278828,ENSG00000280670,ENSG00000280789,ENSG00000282608
0-X-global-X-global_all_IL6-JAK-STAT3_signaling,1.66222e-15,3.255362e-14,7.108393e-15,4.3315350000000006e-17,5.497816e-16,3.206445e-15,1.057057e-15,4.152293e-15,1.354309e-15,7.388704e-15,...,8.076534e-17,1.604406e-15,2.332613e-15,1.923066e-15,1.979497e-15,3.506886e-16,5.575245e-15,8.837729000000001e-17,2.977278e-15,1.527173e-16
1-X-global-X-B_IFN_Type_1_2_Lambda,1.108171e-15,2.299332e-14,4.758343e-15,3.819025e-17,3.985897e-16,2.13764e-15,3.022476e-17,3.077916e-15,1.422393e-15,6.019337e-15,...,6.932764e-17,1.214989e-15,1.637209e-15,1.38665e-15,1.631433e-15,2.437689e-16,4.0315e-15,6.686839000000001e-17,2.288526e-15,9.870781e-17
2-X-global-X-global_all_type-I and II-ifn-response,2.73712e-15,5.744951e-14,1.259858e-14,5.1632680000000005e-17,1.11309e-15,5.587422e-15,4.6122089999999995e-20,7.008616e-15,3.292712e-15,1.330521e-14,...,1.408319e-16,2.763175e-15,3.994905e-15,3.144978e-15,3.659637e-15,5.858709e-16,6.734683e-15,1.542614e-16,4.606112e-15,2.549332e-16
3-X-global-X-global_all_TNF-via-NFkB_signaling,8.518205e-16,1.570927e-14,3.516911e-15,3.3033630000000003e-17,3.04563e-16,1.666658e-15,9.245895e-20,2.090478e-15,8.147142e-16,4.175936e-15,...,4.3292380000000006e-17,9.039299e-16,1.197679e-15,1.025573e-15,1.188532e-15,1.864964e-16,2.036101e-15,4.384817e-17,1.668691e-15,6.243284e-17
4-X-global-X-global_all_MHC-II-I presentation,2.003873e-15,5.072165e-14,1.220851e-14,4.3649420000000004e-17,1.004335e-15,4.993081e-15,0.001373877,5.995152e-15,1.772227e-15,1.293963e-14,...,9.679482000000001e-17,2.463584e-15,3.413538e-15,2.854015e-15,1.960062e-15,5.124434e-16,7.823581e-15,1.331931e-16,5.220355e-15,1.964017e-16


#### Gene set enrichment for re-labeling factors

In [22]:
def extract_genes(spectra_gene_set, net_list):
    """Flatten a nested gene-set dictionary into per-gene-set tables.

    Walks `spectra_gene_set` recursively. For every entry whose value is not a
    dict, a DataFrame with the columns 'gene' and 'gene_set' is appended to
    `net_list`. Nothing is returned; the results accumulate in `net_list`.
    """
    for set_name, members in spectra_gene_set.items():
        if not isinstance(members, dict):
            net_list.append(pd.DataFrame({'gene': members, 'gene_set': set_name}))
            continue
        # Nested level (e.g. one dict per cell type): descend into it.
        extract_genes(members, net_list)


In [23]:
net_list = []
extract_genes(spectra_gene_set, net_list)
netDF = pd.concat(net_list, axis=0)

In [24]:
netDF

Unnamed: 0,gene,gene_set
0,ENSG00000141506,global_all_IL6-JAK-STAT3_signaling
1,ENSG00000096996,global_all_IL6-JAK-STAT3_signaling
2,ENSG00000243646,global_all_IL6-JAK-STAT3_signaling
3,ENSG00000164400,global_all_IL6-JAK-STAT3_signaling
4,ENSG00000004468,global_all_IL6-JAK-STAT3_signaling
...,...,...
107,ENSG00000166710,cDC_antigen_presentation_molecules
108,ENSG00000179218,cDC_antigen_presentation_molecules
109,ENSG00000182287,cDC_antigen_presentation_molecules
110,ENSG00000136943,cDC_antigen_presentation_molecules


In [26]:
# Label each row with the top-level key of the dictionary that holds its gene set;
# gene sets not found under any key keep an empty label.
_gene_set_to_level1 = {
    gene_set_name: level1
    for level1 in spectra_gene_set.keys()
    for gene_set_name in spectra_gene_set[level1].keys()
}
netDF['Level1'] = netDF['gene_set'].map(_gene_set_to_level1).fillna('')

In [27]:
netDF['weight'] = 1

In [28]:
gene_weights['Level1'] = [l.split('-X-')[1] for l in gene_weights.index.tolist()]

In [29]:

def _melt_ulm(wide, value_name):
    """Reshape a wide run_ulm output into long (factor, gene_set, <value_name>) rows."""
    long_df = wide.melt(ignore_index=False, value_name=value_name).reset_index()
    return long_df.rename({'variable':'gene_set', 'index':'factor'}, axis=1)


ULMres_list = []
for l1 in tqdm(gene_weights['Level1'].unique()):
    # Score the factors of this category against its own gene sets plus the global ones.
    estimate, pvalues = dc.run_ulm(
        mat=gene_weights.query("Level1==@l1").drop('Level1',axis=1),
        net=netDF.query("Level1==@l1 or Level1=='global'"),
        source='gene_set',
        target='gene',
        use_raw=False,
    )
    ulm_i = _melt_ulm(estimate, 'estimate').merge(
        _melt_ulm(pvalues, 'pValues'),
        on=['gene_set','factor'],
    )
    ulm_i['Level1'] = l1
    ULMres_list.append(ulm_i)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:01<00:00,  6.74it/s]


In [30]:
ULMres = pd.concat(ULMres_list, axis=0)
ULMres

Unnamed: 0,factor,gene_set,estimate,pValues,Level1
0,0-X-global-X-global_all_IL6-JAK-STAT3_signaling,global_all_IL6-JAK-STAT3_signaling,73.236252,0.000000e+00,global
1,1-X-global-X-B_IFN_Type_1_2_Lambda,global_all_IL6-JAK-STAT3_signaling,16.788246,3.183197e-62,global
2,2-X-global-X-global_all_type-I and II-ifn-resp...,global_all_IL6-JAK-STAT3_signaling,0.532302,5.945314e-01,global
3,3-X-global-X-global_all_TNF-via-NFkB_signaling,global_all_IL6-JAK-STAT3_signaling,3.502776,4.628742e-04,global
4,4-X-global-X-global_all_MHC-II-I presentation,global_all_IL6-JAK-STAT3_signaling,-0.437972,6.614182e-01,global
...,...,...,...,...,...
145,130-X-pDC-X-T_CD8_Naive_IFN_response,pDC_cytokine_and_receptors_proinflammatory,-0.290128,7.717257e-01,pDC
146,131-X-pDC-X-T_CD8_NonNaive_TNF_receptors_ligands,pDC_cytokine_and_receptors_proinflammatory,-0.203296,8.389088e-01,pDC
147,132-X-pDC-X-Plasma_adhesion_molecules,pDC_cytokine_and_receptors_proinflammatory,-0.376227,7.067580e-01,pDC
148,133-X-pDC-X-pDC_antigen_presentation_molecules,pDC_cytokine_and_receptors_proinflammatory,-0.630489,5.283924e-01,pDC


In [36]:
ULMres['adj_pV'] = false_discovery_control(ULMres['pValues'])

**Defining factors as annotated or not**

Factors are considered *annotated* if they have at least 1 adj_pV < 0.05 and estimate > 0

In [37]:
ULMres['annotated'] = False

In [38]:
ULMres.loc[(ULMres.adj_pV < 0.05) & (ULMres.estimate > 0), 'annotated'] = True

In [39]:
ULMres

Unnamed: 0,factor,gene_set,estimate,pValues,Level1,adj_pV,annotated
0,0-X-global-X-global_all_IL6-JAK-STAT3_signaling,global_all_IL6-JAK-STAT3_signaling,73.236252,0.000000e+00,global,0.000000e+00,True
1,1-X-global-X-B_IFN_Type_1_2_Lambda,global_all_IL6-JAK-STAT3_signaling,16.788246,3.183197e-62,global,4.130667e-61,True
2,2-X-global-X-global_all_type-I and II-ifn-resp...,global_all_IL6-JAK-STAT3_signaling,0.532302,5.945314e-01,global,9.684836e-01,False
3,3-X-global-X-global_all_TNF-via-NFkB_signaling,global_all_IL6-JAK-STAT3_signaling,3.502776,4.628742e-04,global,2.205400e-03,True
4,4-X-global-X-global_all_MHC-II-I presentation,global_all_IL6-JAK-STAT3_signaling,-0.437972,6.614182e-01,global,9.684836e-01,False
...,...,...,...,...,...,...,...
145,130-X-pDC-X-T_CD8_Naive_IFN_response,pDC_cytokine_and_receptors_proinflammatory,-0.290128,7.717257e-01,pDC,9.684836e-01,False
146,131-X-pDC-X-T_CD8_NonNaive_TNF_receptors_ligands,pDC_cytokine_and_receptors_proinflammatory,-0.203296,8.389088e-01,pDC,9.684836e-01,False
147,132-X-pDC-X-Plasma_adhesion_molecules,pDC_cytokine_and_receptors_proinflammatory,-0.376227,7.067580e-01,pDC,9.684836e-01,False
148,133-X-pDC-X-pDC_antigen_presentation_molecules,pDC_cytokine_and_receptors_proinflammatory,-0.630489,5.283924e-01,pDC,9.684836e-01,False


In [37]:
ULMres.to_csv(f"{project_dir}/results/ULMresDF_{LAM}.csv", index=True)

**The following code returns the best correspondence for each factor (among those that pass the significance threshold)**

In [40]:
# For every factor flag the row(s) holding its highest ULM estimate, then keep only
# those best matches that were marked as annotated.
# The aggregation is given by name: passing the builtin `max` to transform is
# deprecated in recent pandas releases.
idx = ULMres.groupby('factor', observed=True)['estimate'].transform('max') == ULMres['estimate']
factorBestMatch = ULMres[idx].query("annotated==True")
factorBestMatch

Unnamed: 0,factor,gene_set,estimate,pValues,Level1,adj_pV,annotated
0,0-X-global-X-global_all_IL6-JAK-STAT3_signaling,global_all_IL6-JAK-STAT3_signaling,73.236252,0.000000e+00,global,0.000000e+00,True
8,1-X-global-X-B_IFN_Type_1_2_Lambda,global_all_JAK-STAT_signaling,83.666985,0.000000e+00,global,0.000000e+00,True
18,4-X-global-X-global_all_MHC-II-I presentation,global_all_MHC-II-I presentation,78.628578,0.000000e+00,global,0.000000e+00,True
24,3-X-global-X-global_all_TNF-via-NFkB_signaling,global_all_TNF-via-NFkB_signaling,108.212700,0.000000e+00,global,0.000000e+00,True
33,5-X-global-X-global_all_complement_production,global_all_complement_production,48.866062,0.000000e+00,global,0.000000e+00,True
...,...,...,...,...,...,...,...
108,133-X-pDC-X-pDC_antigen_presentation_molecules,pDC_antigen_presentation_molecules,120.828735,0.000000e+00,pDC,0.000000e+00,True
111,126-X-pDC-X-T_CD8_Naive_chemokine_receptors,pDC_chemokine_receptors,50.870296,0.000000e+00,pDC,0.000000e+00,True
120,125-X-pDC-X-T_CD4_NonNaive_chemokines,pDC_chemokines,56.246185,0.000000e+00,pDC,0.000000e+00,True
133,128-X-pDC-X-T_CD8_Naive_cytokine_and receptors...,pDC_cytokine_and receptors_ antiinflammatory,38.498188,2.332368e-298,pDC,4.397609e-297,True


In [39]:
factorBestMatch.to_csv(f"{project_dir}/results/factorBestMatchDF_{LAM}.csv", index=True)

# Assess number of genes

In [40]:
mutual_info_HvsD = dict()
mutual_info_DvsD = dict()
annotated_dict = dict()

# Fixed seed for mutual_info_classif so that the MI values are reproducible
# between runs of this cell.
MI_RANDOM_STATE = 0

# Two targets: healthy vs. not healthy (boolean array) and the disease label itself.
healthy_index = np.array(adataSPECTRA.obs['disease'] == 'healthy')
disease_index = adataSPECTRA.obs['disease']

# Factors with at least one row flagged as annotated; built once instead of per iteration.
annotated_factors = set(ULMres.query("annotated == True").factor)

factor_names = adataSPECTRA.uns['SPECTRA_overlap'].index.to_list()
# tqdm wraps the list (not the enumerate object) so the progress bar knows the total.
for i, f in enumerate(tqdm(factor_names)):

    ct = f.split('-X-')[1]
    annotated_dict[f] = f in annotated_factors

    if ct == 'global':
        # Global factors are evaluated on every SEAcell.
        X = np.array(adataSPECTRA.obsm['SPECTRA_cell_scores'][:, i]).reshape(-1, 1)
        y_healthy = healthy_index
        y_disease = disease_index
    else:
        # Category-specific factors are evaluated only on SEAcells of that category.
        cell_index = (adataSPECTRA.obs['spectra_categories'] == ct).to_numpy()
        X = np.array(adataSPECTRA.obsm['SPECTRA_cell_scores'][cell_index, i]).reshape(-1, 1)
        y_healthy = healthy_index[cell_index]
        y_disease = disease_index[cell_index]

    mutual_info_HvsD[f] = mutual_info_classif(X=X, y=y_healthy, random_state=MI_RANDOM_STATE)
    mutual_info_DvsD[f] = mutual_info_classif(X=X, y=y_disease, random_state=MI_RANDOM_STATE)

135it [00:17,  7.52it/s]


In [41]:
def _metric_column(scores, name):
    """Turn a {factor: values} dict into a factor-indexed frame and name column 0 `name`."""
    return pd.DataFrame.from_dict(scores).T.rename(columns={0: name})


# Per-factor summary: both mutual-information scores plus the annotation flag.
_annotated_column = pd.DataFrame.from_dict(annotated_dict, orient='index').rename(columns={0: 'annotated'})
all_metrics = (
    _metric_column(mutual_info_HvsD, 'MI_HvsD')
    .join(_metric_column(mutual_info_DvsD, 'MI_DvsD'))
    .join(_annotated_column)
)

In [42]:
all_metrics

Unnamed: 0,MI_HvsD,MI_DvsD,annotated
0-X-global-X-global_all_IL6-JAK-STAT3_signaling,0.136088,0.345846,True
1-X-global-X-B_IFN_Type_1_2_Lambda,0.064166,0.286926,True
2-X-global-X-global_all_type-I and II-ifn-response,0.110601,0.212987,True
3-X-global-X-global_all_TNF-via-NFkB_signaling,0.042438,0.315464,True
4-X-global-X-global_all_MHC-II-I presentation,0.056068,0.160609,True
...,...,...,...
130-X-pDC-X-T_CD8_Naive_IFN_response,0.148305,0.431836,True
131-X-pDC-X-T_CD8_NonNaive_TNF_receptors_ligands,0.187521,0.337265,True
132-X-pDC-X-Plasma_adhesion_molecules,0.143572,0.433527,True
133-X-pDC-X-pDC_antigen_presentation_molecules,0.169549,0.532486,True


In [43]:
def get_marker_genes_knee(adata = None, gene_weights = None, S=.5):
    """Select marker genes per factor at the knee of the sorted weight curve.

    For every factor (row of `gene_weights`) the gene weights are sorted in
    ascending order, the knee of that convex, increasing curve is located with
    `KneeLocator`, and the genes whose weight is strictly above the knee value
    are flagged as markers of the factor.

    Parameters
    ----------
    adata
        Not used by the computation; kept so existing calls that pass it keep working.
    gene_weights : pandas.DataFrame
        Factors in rows, genes in columns (numeric weights only).
    S : float
        Sensitivity value forwarded to `KneeLocator`.

    Returns
    -------
    pandas.DataFrame
        Boolean table with one column per factor and one row per gene that is
        a marker of at least one factor.

    Raises
    ------
    ValueError
        If no knee is reported for a factor.
    """
    gene_weightsT = gene_weights.transpose()
    markers_df_list = list()

    for f in gene_weightsT.columns:
        # Knee of the ascending weight curve = weight threshold for this factor.
        y = np.array(gene_weightsT[f].sort_values(ascending = True))
        x = np.array(range(y.shape[0]))
        kneedle = KneeLocator(x, y, S=S, curve="convex", direction="increasing")
        y_thr = kneedle.knee_y
        if y_thr is None:
            # Comparing weights against None would fail with an opaque TypeError.
            raise ValueError(f"No knee found for factor {f!r} with S={S}")

        markers = gene_weightsT.index[gene_weightsT[f] > y_thr].tolist()
        markers_df_list.append(pd.DataFrame(True, index=markers, columns=[f]))

    if not markers_df_list:
        # No factors at all: pd.concat would raise on an empty list.
        return pd.DataFrame(dtype=bool)

    markers_df = pd.concat(markers_df_list, ignore_index=False, axis=1).fillna(False)
    # Pin the dtype so the result is boolean regardless of how fillna treats object columns.
    return markers_df.astype(bool)

In [44]:
def count_genes(S, MI_HvsD_q, MI_DvsD_q, annotatedTF, gene_weights, all_metrics,
                gene_of_interest='ENSG00000051523'):
    """Summarise the gene selection obtained for one parameter configuration.

    Marker genes are computed with `get_marker_genes_knee` (on the notebook-level
    `adataSPECTRA`), then factors are dropped when either mutual-information
    score falls below its quantile threshold.

    Parameters
    ----------
    S : float
        Knee sensitivity forwarded to `get_marker_genes_knee`.
    MI_HvsD_q, MI_DvsD_q : float
        Quantiles (0-1) used to derive the thresholds on `MI_HvsD` / `MI_DvsD`.
    annotatedTF : bool
        If True, thresholds are computed over annotated factors only and
        non-annotated factors are dropped as well.
    gene_weights : pandas.DataFrame
        Factors x genes weight table (numeric columns only).
    all_metrics : pandas.DataFrame
        Per-factor table with the columns `MI_HvsD`, `MI_DvsD` and `annotated`.
    gene_of_interest : str
        Gene whose presence in the selection is reported. The default is the
        gene that was previously hard-coded here.

    Returns
    -------
    list
        [number of selected genes, median number of markers per remaining factor,
        whether `gene_of_interest` is selected, number of remaining factors,
        one-row DataFrame with the number of remaining factors per category].
    """
    markers_df = get_marker_genes_knee(adata = adataSPECTRA, gene_weights = gene_weights, S=S)

    # Thresholds come from the annotated factors only when annotatedTF is set.
    reference = all_metrics.query('annotated == True') if annotatedTF else all_metrics
    MI_HvsD_thr = np.quantile(reference.MI_HvsD, q=MI_HvsD_q)
    MI_DvsD_thr = np.quantile(reference.MI_DvsD, q=MI_DvsD_q)

    drop_rule = 'MI_HvsD < @MI_HvsD_thr or MI_DvsD < @MI_DvsD_thr'
    if annotatedTF:
        drop_rule += ' or annotated == False'
    dropped_mi_factors = all_metrics.query(drop_rule).index

    output = markers_df.drop(dropped_mi_factors, axis=1)
    remain_factors_n = len(output.columns)
    remain_factors_ct = pd.DataFrame(pd.DataFrame([f.split('-X-')[1] for f in output.columns]).value_counts()).transpose()

    median_gene_n = output.sum(axis=0).median()

    # A gene is selected when it is a marker of at least one remaining factor.
    selected = output.sum(axis=1) > 0

    # markers_df only has rows for genes that are a marker somewhere, so the gene
    # of interest may be missing entirely; report False instead of raising KeyError.
    has_gene_of_interest = bool(selected.get(gene_of_interest, False))

    return [selected.sum(), median_gene_n, has_gene_of_interest, remain_factors_n, remain_factors_ct]

In [45]:
# Parameter grid for the assessment. A larger grid was explored previously:
#   S = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2]
#   quantile = [0, .25, .5, .75]
S = [0.5, 0.6, 0.7, 0.8, 0.9]
quantile = [0]

# One result row per (S, MI_HvsD quantile, MI_DvsD quantile, only-annotated) combination:
# the four parameters followed by the values returned by count_genes.
_weights_only = gene_weights.drop('Level1',axis=1)
result = []
for s, q1, q2, TF in tqdm(product(S, quantile, quantile, [False,True])):
    result.append([s, q1, q2, TF] + count_genes(s, q1, q2, TF, _weights_only, all_metrics))

10it [00:10,  1.03s/it]


In [46]:
remain_factors = pd.DataFrame([d[0:8] for d in result], columns=['S', 'MI_HvsD', 'MI_DvsD', 'only_annotated','n_genes', 'median_gene_n', 'withCYBA', 'n_factors'])

In [47]:
remain_factors_ct = pd.concat([d[-1] for d in result]).fillna(0).reset_index().drop('index', axis=1)

In [48]:
data = pd.concat([remain_factors, remain_factors_ct], axis=1)

In [49]:
pd.set_option('display.max_rows', None)

In [50]:
dataSorted = data.sort_values(['n_genes','n_factors'])
dataSorted

Unnamed: 0,S,MI_HvsD,MI_DvsD,only_annotated,n_genes,median_gene_n,withCYBA,n_factors,"(T_CD4_Naive,)","(T_CD4_NonNaive,)","(Mono,)","(T_CD8_Naive,)","(T_CD8_NonNaive,)","(B,)","(DC,)","(ILC,)","(Plasma,)","(UTC,)","(pDC,)","(global,)"
1,0.5,0,0,True,988,25.0,True,125,14,14,12,11,11,10,11,9,9,9,9,6
3,0.6,0,0,True,1020,27.0,True,125,14,14,12,11,11,10,11,9,9,9,9,6
5,0.7,0,0,True,1074,27.0,True,125,14,14,12,11,11,10,11,9,9,9,9,6
7,0.8,0,0,True,1196,30.0,True,125,14,14,12,11,11,10,11,9,9,9,9,6
0,0.5,0,0,False,1353,27.0,True,135,15,15,12,12,12,11,11,10,10,10,10,7
9,0.9,0,0,True,1355,30.0,True,125,14,14,12,11,11,10,11,9,9,9,9,6
2,0.6,0,0,False,1397,30.0,True,135,15,15,12,12,12,11,11,10,10,10,10,7
4,0.7,0,0,False,1472,30.0,True,135,15,15,12,12,12,11,11,10,10,10,10,7
6,0.8,0,0,False,1685,30.0,True,135,15,15,12,12,12,11,11,10,10,10,10,7
8,0.9,0,0,False,1862,30.0,True,135,15,15,12,12,12,11,11,10,10,10,10,7


In [54]:
# NOTE(review): this cell repeats the previous one. The table stored below it lists
# S values starting at 0.1 and MI quantiles up to 0.75, so it appears to come from a run
# with the larger parameter grid (see the commented-out S / quantile lists above) —
# confirm and drop if stale.
dataSorted = data.sort_values(['n_genes','n_factors'])
dataSorted

Unnamed: 0,S,MI_HvsD,MI_DvsD,only_annotated,n_genes,median_gene_n,withCYBA,n_factors,"(T_CD4_Naive,)","(T_CD4_NonNaive,)","(Mono,)","(T_CD8_Naive,)","(T_CD8_NonNaive,)","(B,)","(DC,)","(ILC,)","(Plasma,)","(UTC,)","(pDC,)","(global,)"
31,0.1,0.75,0.75,True,427,18.0,True,23,7,4,0.0,4,0.0,1,3,0.0,0.0,0.0,4,0.0
63,0.2,0.75,0.75,True,427,18.0,True,23,7,4,0.0,4,0.0,1,3,0.0,0.0,0.0,4,0.0
95,0.3,0.75,0.75,True,427,18.0,True,23,7,4,0.0,4,0.0,1,3,0.0,0.0,0.0,4,0.0
127,0.4,0.75,0.75,True,453,18.0,True,23,7,4,0.0,4,0.0,1,3,0.0,0.0,0.0,4,0.0
159,0.5,0.75,0.75,True,453,18.0,True,23,7,4,0.0,4,0.0,1,3,0.0,0.0,0.0,4,0.0
191,0.6,0.75,0.75,True,479,19.0,True,23,7,4,0.0,4,0.0,1,3,0.0,0.0,0.0,4,0.0
223,0.7,0.75,0.75,True,479,19.0,True,23,7,4,0.0,4,0.0,1,3,0.0,0.0,0.0,4,0.0
255,0.8,0.75,0.75,True,479,19.0,True,23,7,4,0.0,4,0.0,1,3,0.0,0.0,0.0,4,0.0
23,0.1,0.5,0.75,True,523,19.5,True,30,9,5,0.0,7,0.0,2,3,0.0,0.0,0.0,4,0.0
55,0.2,0.5,0.75,True,523,19.5,True,30,9,5,0.0,7,0.0,2,3,0.0,0.0,0.0,4,0.0


In [67]:
dataSorted.to_csv(f"{project_dir}/results/GeneSelection_assessmentDF_{LAM}.csv", index=True)

### Selecting the best parameter configuration

In [None]:
max_n_genes = 1100

In [None]:
rowIdx = dataSorted.query('n_genes <= @max_n_genes')['n_genes'].idxmax()

In [None]:
dataSorted.query('n_genes <= @max_n_genes')

In [None]:
dataSorted.loc[rowIdx,:]

In [53]:
# Manually chosen configuration. In the assessment table above, S=0.6 with
# only_annotated=True and both MI quantiles at 0 yields n_genes=1020.
# NOTE(review): these values are hard-coded rather than read from dataSorted at rowIdx
# (see the commented-out lines in the next cell) — confirm this is intended.
S_sel = 0.6
MI_HvsD_q = 0
MI_DvsD_q = 0
annotatedTF = True

In [54]:
# Marker genes per factor for the selected knee sensitivity.
markers_df = get_marker_genes_knee(
    adata=adataSPECTRA,
    gene_weights=gene_weights.drop('Level1',axis=1),
    S=S_sel,
)

# MI thresholds are quantiles taken over the annotated factors only when annotatedTF is set.
_metrics_for_thr = all_metrics.query('annotated == True') if annotatedTF else all_metrics
MI_HvsD_thr = np.quantile(_metrics_for_thr.MI_HvsD, q=MI_HvsD_q)
MI_DvsD_thr = np.quantile(_metrics_for_thr.MI_DvsD, q=MI_DvsD_q)

# Factors below either threshold are dropped; non-annotated ones too when annotatedTF is set.
_drop_rule = 'MI_HvsD < @MI_HvsD_thr or MI_DvsD < @MI_DvsD_thr'
if annotatedTF:
    _drop_rule += ' or annotated == False'
dropped_mi_factors = all_metrics.query(_drop_rule).index

output = markers_df.drop(dropped_mi_factors, axis=1)

In [55]:
# Keep only the genes that are a marker of at least one retained factor.
# A boolean row mask replaces the former "write NaN into a boolean frame, dropna,
# fillna" round-trip, which depends on implicit dtype upcasting that newer pandas
# deprecates, and it leaves the frame with a boolean dtype.
output = output.astype(bool)
output = output.loc[output.any(axis=1)]

In [56]:
geneSelection_forDownstream = output.index

In [57]:
len(geneSelection_forDownstream)

1020

In [None]:
#geneSelection_forDownstream

In [58]:
# Load the gene-set dictionary that is compared with the selected genes below.
# NOTE: unpickling can execute arbitrary code stored in the file — only load
# pickles that were produced by this project.
with open(here('03_downstream_analysis/05_SPECTRA/results/SPECTRA_GeneSet_ensg_v2.pickle'), 'rb') as f:
    spectra_gene_set = pickle.load(f)

In [59]:
def extract_genes(dictionary):
    """Lazily yield every non-dict value of a (possibly nested) dictionary.

    Nested dictionaries are traversed depth-first in insertion order; each
    leaf value is yielded unchanged (it is not flattened or copied).

    Parameters
    ----------
    dictionary : dict
        Mapping whose values are either further dictionaries or leaf values.

    Yields
    ------
    object
        Each leaf value, in traversal order.
    """
    for entry in dictionary.values():
        if not isinstance(entry, dict):
            yield entry
            continue
        # Descend into the nested mapping and forward its leaves.
        yield from extract_genes(entry)

In [60]:
import itertools  # imported here because this cell is the only visible user of the module

# Chain all leaf collections of the nested gene-set dictionary and de-duplicate
# their elements (presumably Ensembl gene IDs, given the file name — confirm).
leaf_gene_collections = extract_genes(spectra_gene_set)
gene_set_dictionary_ensg = list(set(itertools.chain.from_iterable(leaf_gene_collections)))
len(gene_set_dictionary_ensg)

734

In [61]:
# Genes shared by the gene-set dictionary and the selected genes.
intersection = list(set(gene_set_dictionary_ensg) & set(geneSelection_forDownstream))
# Fraction of the smaller of the two collections that is shared with the other one.
# NOTE(review): assumes `geneSelection_forDownstream` has no duplicated IDs — confirm.
len(intersection)/min(len(gene_set_dictionary_ensg), len(geneSelection_forDownstream))

0.7901907356948229

In [68]:
# Sanity check: (selected genes, retained factors).
output.shape

(1020, 125)

In [62]:
# Preview the marker matrix: rows are genes, columns are factors.
output.head()

Unnamed: 0,0-X-global-X-global_all_IL6-JAK-STAT3_signaling,1-X-global-X-B_IFN_Type_1_2_Lambda,2-X-global-X-global_all_type-I and II-ifn-response,3-X-global-X-global_all_TNF-via-NFkB_signaling,4-X-global-X-global_all_MHC-II-I presentation,5-X-global-X-global_all_complement_production,7-X-B-X-effector,8-X-B-X-B_chemokines,9-X-B-X-Plasma_chemokine_receptors,10-X-B-X-UTC_cytokine_and_receptors_proinflammatory,...,123-X-UTC-X-cDC_antigen_presentation_molecules,125-X-pDC-X-T_CD4_NonNaive_chemokines,126-X-pDC-X-T_CD8_Naive_chemokine_receptors,127-X-pDC-X-T_CD4_Naive_cytokine_and_receptors_proinflammatory,128-X-pDC-X-T_CD8_Naive_cytokine_and receptors_ antiinflammatory,129-X-pDC-X-pDC_IFN_Type_1_2_Lambda,130-X-pDC-X-T_CD8_Naive_IFN_response,131-X-pDC-X-T_CD8_NonNaive_TNF_receptors_ligands,132-X-pDC-X-Plasma_adhesion_molecules,133-X-pDC-X-pDC_antigen_presentation_molecules
ENSG00000004468,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
ENSG00000006327,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
ENSG00000010278,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
ENSG00000026103,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
ENSG00000026508,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [63]:
# Save the gene x factor marker matrix. The file name is built from the run
# parameters (LAM, S_sel) instead of hard-coded values, so a run with different
# parameters cannot silently overwrite or mislabel these results.
output.to_csv(
    here(f'03_downstream_analysis/05_SPECTRA/results/SPECTRAFactor_Processed_LAM{LAM}_S{S_sel}_markersDF.csv'),
    index=True,
)

In [72]:
# Remove the 'Level1' column so that only the per-gene weight columns remain.
# errors='ignore' makes this self-referential reassignment idempotent: re-running
# the cell after 'Level1' is already gone no longer raises a KeyError.
gene_weights = gene_weights.drop(columns='Level1', errors='ignore')

In [73]:
# Sanity check: (factors, genes) after removing 'Level1'.
gene_weights.shape

(135, 8253)

In [74]:
# Preview the weight matrix: rows are factors, columns are genes.
gene_weights.head()

ensembl_gene_id,ENSG00000000003,ENSG00000000457,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,ENSG00000001460,ENSG00000001461,ENSG00000001561,...,ENSG00000278619,ENSG00000278637,ENSG00000278662,ENSG00000278677,ENSG00000278705,ENSG00000278817,ENSG00000278828,ENSG00000280670,ENSG00000280789,ENSG00000282608
0-X-global-X-global_all_IL6-JAK-STAT3_signaling,1.66222e-15,3.255362e-14,7.108393e-15,4.3315350000000006e-17,5.497816e-16,3.206445e-15,1.057057e-15,4.152293e-15,1.354309e-15,7.388704e-15,...,8.076534e-17,1.604406e-15,2.332613e-15,1.923066e-15,1.979497e-15,3.506886e-16,5.575245e-15,8.837729000000001e-17,2.977278e-15,1.527173e-16
1-X-global-X-B_IFN_Type_1_2_Lambda,1.108171e-15,2.299332e-14,4.758343e-15,3.819025e-17,3.985897e-16,2.13764e-15,3.022476e-17,3.077916e-15,1.422393e-15,6.019337e-15,...,6.932764e-17,1.214989e-15,1.637209e-15,1.38665e-15,1.631433e-15,2.437689e-16,4.0315e-15,6.686839000000001e-17,2.288526e-15,9.870781e-17
2-X-global-X-global_all_type-I and II-ifn-response,2.73712e-15,5.744951e-14,1.259858e-14,5.1632680000000005e-17,1.11309e-15,5.587422e-15,4.6122089999999995e-20,7.008616e-15,3.292712e-15,1.330521e-14,...,1.408319e-16,2.763175e-15,3.994905e-15,3.144978e-15,3.659637e-15,5.858709e-16,6.734683e-15,1.542614e-16,4.606112e-15,2.549332e-16
3-X-global-X-global_all_TNF-via-NFkB_signaling,8.518205e-16,1.570927e-14,3.516911e-15,3.3033630000000003e-17,3.04563e-16,1.666658e-15,9.245895e-20,2.090478e-15,8.147142e-16,4.175936e-15,...,4.3292380000000006e-17,9.039299e-16,1.197679e-15,1.025573e-15,1.188532e-15,1.864964e-16,2.036101e-15,4.384817e-17,1.668691e-15,6.243284e-17
4-X-global-X-global_all_MHC-II-I presentation,2.003873e-15,5.072165e-14,1.220851e-14,4.3649420000000004e-17,1.004335e-15,4.993081e-15,0.001373877,5.995152e-15,1.772227e-15,1.293963e-14,...,9.679482000000001e-17,2.463584e-15,3.413538e-15,2.854015e-15,1.960062e-15,5.124434e-16,7.823581e-15,1.331931e-16,5.220355e-15,1.964017e-16


In [75]:
# Factors retained in the marker matrix (its columns).
output_columns = output.columns
# Keep only those factors (rows) in the gene-weight matrix, in the same order;
# .loc raises a KeyError if a retained factor is missing from `gene_weights`.
subset_gene_weights = gene_weights.loc[output_columns]
subset_gene_weights.shape

(125, 8253)

In [76]:
# Save the gene weights of the retained factors. The file name is built from the
# run parameters (LAM, S_sel) instead of hard-coded values, so a run with
# different parameters cannot silently overwrite or mislabel these results.
subset_gene_weights.to_csv(
    here(f'03_downstream_analysis/05_SPECTRA/results/SPECTRAFactor_Processed_LAM{LAM}_S{S_sel}_GeneWeights.csv'),
    index=True,
)