In [1]:
import numpy as np
import json 
import scanpy as sc
from collections import OrderedDict
import scipy 
import pandas as pd
import matplotlib.pyplot as plt
import os

import pickle

import warnings
import sys

from pyprojroot import here

warnings.filterwarnings("ignore")
sys.path.insert(1, str(here('bin')))

# Import functions
from customPythonFunctions import generateID2SymbolDF

In [2]:
#spectra imports 
import Spectra as spc
from Spectra import Spectra_util as spc_tl
from Spectra import K_est as kst
from Spectra import default_gene_sets

#KnowledgeBase imports
import cytopus as cp

# Import GeneSets

In [3]:
workDir = "/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/"

## Global

In [4]:
global_GeneSet_path = workDir + "03_downstream_analysis/02_gene_universe_definition/data/CuratedLiterature_geneSets.xlsx"
print(global_GeneSet_path)

/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/03_downstream_analysis/02_gene_universe_definition/data/CuratedLiterature_geneSets.xlsx


In [5]:
global_GS = pd.read_excel(global_GeneSet_path)

In [6]:
global_GS

Unnamed: 0,chemokines,chemokine_receptors,cytokine_and_receptors_proinflammatory,cytokine_and receptors_ antiinflammatory,global_all_IL6-JAK-STAT3_signaling,global_all_JAK-STAT_signaling,global_all_type-I and II-ifn-response,IFN_Type_1_2_Lambda,IFN_response,global_all_TNF-via-NFkB_signaling,TNF_receptors_ligands,adhesion_molecules,antigen_presentation_molecules,global_all_MHC-II-I presentation,global_all_complement_production
0,XCL1,CCR1,IL1A,CSF1,TYK2,PIK3R2,LY6E,IFNA1,IFIT1,PDE4B,TNFRSF1A,ALCAM,ABCB5,HLA-DQB3,C1S
1,XCL2,CCR2,IL1B,IL1F10,IL18R1,IL10RB,TREX1,IFNA2,IFIT2,PTGER4,TNFRSF1B,CADM1,B2M,HLA-DQA1,C1QA
2,CX3CL1,CCR3,IL6,IL10,ITGA4,SPRED2,DDX58,IFNA4,IFIT3,ATP2B1,TNFRSF4,CADM3,CALR,HLA-DPA2,C4A
3,CCL1,CCR4,LIF,IL12A,CSF2RA,IFNAR1,IFI44L,IFNA5,IFIT5,CCL2,TNFRSF6B,CD22,CANX,HLA-DQA2,CFB
4,CCL2,CCR5,CSF2,IL12B,SOCS1,LIF,MCOLN2,IFNA6,ISG15,TAP1,TNFRSF8,CD226,ERAP1,HLA-DPB2,C1QB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,,,,,,,,,,MXD1,,,,,
198,,,,,,,,,,PTGS2,,,,,
199,,,,,,,,,,TUBB2A,,,,,
200,,,,,,,,,,ACKR3,,,,,


In [7]:
# Number of UNIQUE genes
np.unique(np.array(global_GS.fillna('', )).flatten()).shape

(796,)

### Obtain spectra global curated by Juan and Juan gene sets 
After discussing with the developers of SPECTRA they suggest that: 

*"If you just want to quantify the activity of one homogenous program and you want to compare this across cell types you should just add it as Global. Using the same gene set for several cell types also makes sense if you assume that the processes in question exist in different variations in the cell types (you can check how much the marker genes differ between the cell type specific factors). In either case from my standpoint either global or only cell type specific programs (can be in many cell types) makes sense, I’m not sure whether adding this as global and cell type specific make sense I suspect that it might give you weird possibly unstable solutions. We have never tried, let me know if it works empirically."*

As our assumption is the latter, we will use: 
- Global Gene Sets: Spectra default global gene sets curated by Juan. He removed those that are not important for our questions (i.e not inflammatory related global gene sets)
- Cell type Gene Sets: Spectra default + Juan defined signatures/GeneSets

In [8]:
global_GS_dict = {}
for col in global_GS.columns:
    dict_values = global_GS[col].dropna().tolist()
    global_GS_dict[col] = dict_values

In [9]:
# Obtain global and signature dictionaries
spectra_global_curated_GS_dict = {}
immunologist_signatures_GS_dict = {}

# Iterate over keys in global_GS_dict
for key, value in global_GS_dict.items():
    # Check if the key starts with "global"
    if key.startswith("global"):
        spectra_global_curated_GS_dict[key] = value
    else:
        immunologist_signatures_GS_dict[key] = value

Now, we need to include *immunologist_signatures_GS_dict* in each of the cell type specific gene sets

## Cell-type specific

In [10]:
CellTypeDF = pd.read_excel(here('03_downstream_analysis/02_gene_universe_definition/data/Final_gene_sets_per_celltype.xlsx'))
CellTypeDF

Unnamed: 0,B_effector,CD4-T_CD4T_TH17_UP,CD4-T_CD4T_TH2_UP,CD4-T_CD4T_TFH_UP,CD4-T_CD4T_TH1_UP,Tregs_Tregs_FoxP3_stabilization,CD8-T_CD8T_exhaustion,CD8-T_CD8T_tcr_activation,cDC_DC_antigen-crosspresentation,Mono_IFNG_response,Mono_IL4-IL13_response
0,IL2,CXCL8,IL5,IL6,CXCR3,NFATC1,TOX,ALCAM,MRC1,TNFAIP3,ARG1
1,IL13,IL17A,IL4,CD84,IFNG,ATF1,LAG3,JAG1,SEC61A2,SOCS3,PPARG
2,IL4,IL17F,HAVCR1,S1PR1,IL12RB2,CREB1,PDCD1,LTA,SEC61A1,IL23A,SOCS2
3,CSF2,IL1R1,GATA3,IL21R,TBX21,NFATC2,HAVCR2,CCL3,IFI30,IL1B,RNASE2
4,IL6,KLRB1,IL13,CXCR5,STAT4,STAT5A,EOMES,CCL4,SEC61G,IL1A,IL4R
...,...,...,...,...,...,...,...,...,...,...,...
61,,,,,,,,STAT5A,,,
62,,,,,,,,ALPL,,,
63,,,,,,,,IL5,,,
64,,,,,,,,MAFF,,,


In [11]:
all_cell_types_dictionary =  dict()
for c in CellTypeDF.columns:
    s = c.split("_", maxsplit=1)
    ct = s[0]
    function = s[1]
    if ct not in all_cell_types_dictionary.keys():
        all_cell_types_dictionary[ct] = dict()
    all_cell_types_dictionary[ct][function] = CellTypeDF[c].dropna().tolist()

In [12]:
all_cell_types_dictionary.keys()

dict_keys(['B', 'CD4-T', 'Tregs', 'CD8-T', 'cDC', 'Mono'])

In [13]:
key_counts = {}

# Loop through the keys and calculate the number of keys within each key
total_count = 0
for key, value in all_cell_types_dictionary.items():
    if isinstance(value, dict):
        key_counts[key] = len(value)
        total_count += len(value)
    else:
        key_counts[key] = 0

# Print the number of keys within each key
for key, count in key_counts.items():
    print(f"{key}: {count}")

print(f"Total: {total_count}")

B: 1
CD4-T: 4
Tregs: 1
CD8-T: 2
cDC: 1
Mono: 2
Total: 11


Cell type names will be changed afterwards

### Edit Cell type GeneSets

#### (JUST IN CASE) Merge gene-sets
For SPECTRA, we need a 1:1 correspondence between categories and gene_set_dictionary keys (+global). There could be the case in which the current granularity of the dictionary does not match the desired granularity of the annotated cell types. 

For example: Tregs are included within the CD4-T category and TNK within the CD8. 

To solve this we: 
1. Include the gene-sets in the higher annotation level (CD4-T or CD8-T respectively) so that we don't lose those gene sets.
2. We remove the lower level (Tregs and TNK) dictionaries.  

In [14]:
all_cell_types_dictionary.keys()

dict_keys(['B', 'CD4-T', 'Tregs', 'CD8-T', 'cDC', 'Mono'])

In [15]:
if 'Tregs' in all_cell_types_dictionary.keys():
    all_cell_types_dictionary['CD4-T'].update(all_cell_types_dictionary.pop('Tregs'))

if 'TNK' in all_cell_types_dictionary.keys():
    all_cell_types_dictionary['CD8-T'].update(all_cell_types_dictionary.pop('TNK'))
    
if 'NK' in all_cell_types_dictionary.keys():
    all_cell_types_dictionary['ILC'].update(all_cell_types_dictionary.pop('NK'))



In [16]:
all_cell_types_dictionary.keys()

dict_keys(['B', 'CD4-T', 'CD8-T', 'cDC', 'Mono'])

#### (JUST IN CASE) Duplicate gene sets in sublineages
However, it is also possible to happen the contrary, that the granularity of the gene_set is not granular enough. 

For example: TCD4 and TCD8 should be divided into Naive and NonNaive. 

In this case, what we do is: 
- Generate two new gene_set_dictionary keys with the categories not present (Naive, NonNaive) and include the GeneSets from the higher annotation. Ideally, these genesets should be divided into Naive and NonNaive specific if possible. I had a *quick* discussion with Juan and he said that in this case, all GeneSets applied to both Naive and NonNaive. I would confirm this with Juan; if it is actually the case, there is no problem in tearms of SPECTRA it is just for a better explanation. Ex: *Does it make sense to try to find a TCR_activity factor in a cell type that is Naive?* (similar to other gene sets included)
- Remove the high annotation gene sets
  - This is not done as at the time of generating this notebook the conclusion regarding the division of Naive and NonNaive is not done yet. It would be as simple as doing: *all_cell_types_dictionary.pop('CD4-T')*

**TCD4 Naive**

In [17]:
subpops = ["T_CD4_Naive", "T_CD4_NonNaive"]

In [18]:
for pop in subpops: 
    all_cell_types_dictionary[pop] = {}
    for key, value in all_cell_types_dictionary['CD4-T'].items():
        new_key = pop + '_' + key.replace('CD4-T_', '')
        all_cell_types_dictionary[pop][new_key] = value

**TCD8 Naive**

In [19]:
subpops = ["T_CD8_Naive", "T_CD8_NonNaive"]

In [20]:
for pop in subpops: 
    all_cell_types_dictionary[pop] = {}
    for key, value in all_cell_types_dictionary['CD8-T'].items():
        new_key = pop + '_' + key.replace('CD8-T_', '')
        all_cell_types_dictionary[pop][new_key] = value

In [21]:
all_cell_types_dictionary.keys()

dict_keys(['B', 'CD4-T', 'CD8-T', 'cDC', 'Mono', 'T_CD4_Naive', 'T_CD4_NonNaive', 'T_CD8_Naive', 'T_CD8_NonNaive'])

#### Include GeneSets for non-present celltypes
There could be cell type for which we do not have a gene set defined. SPECTRA requires all categories in the *cell type column* to be present as keys in the dictionaries. As such, we generate an empty dictionary for these cell types (we will fill it afterwards with the Immunologist GeneSets). This will mean that there is no default defined gene set for these cell types, but we will include the Immunologist GeneSets in the next section

In [22]:
celltypes_of_interest = ["Plasma", "UTC", 'pDC', 'ILC']

In [23]:
for key in celltypes_of_interest:
    all_cell_types_dictionary[key] = {}

In [24]:
del all_cell_types_dictionary['CD8-T']
del all_cell_types_dictionary['CD4-T']

In [25]:
all_cell_types_dictionary.keys()

dict_keys(['B', 'cDC', 'Mono', 'T_CD4_Naive', 'T_CD4_NonNaive', 'T_CD8_Naive', 'T_CD8_NonNaive', 'Plasma', 'UTC', 'pDC', 'ILC'])

#### Include Immunologist GeneSets in Cell-type GeneSets
As we mentioned, Immunologist GeneSets should be present in each cell type and not in global.

In [26]:
for key, value in all_cell_types_dictionary.items():
    merged_dict = {}
    for signature_key, signature_genes in immunologist_signatures_GS_dict.items():
        new_key = key + "_" + signature_key
        merged_dict[new_key] = signature_genes
    all_cell_types_dictionary[key].update(merged_dict)

In [27]:
key_counts = {}

# Loop through the keys and calculate the number of keys within each key
total_count = 0
for key, value in all_cell_types_dictionary.items():
    if isinstance(value, dict):
        key_counts[key] = len(value)
        total_count += len(value)
    else:
        key_counts[key] = 0

# Print the number of keys within each key
for key, count in key_counts.items():
    print(f"{key}: {count}")

print(f"Total: {total_count}")

B: 10
cDC: 10
Mono: 11
T_CD4_Naive: 14
T_CD4_NonNaive: 14
T_CD8_Naive: 11
T_CD8_NonNaive: 11
Plasma: 9
UTC: 9
pDC: 9
ILC: 9
Total: 117


These will be the total gene sets per cell type that will be composed of: **Default cell type gene sets** (with modifiations depending on merging and splitting) + **immunologist gene sets** 

# Generate final GeneSet
Merge Global dictionary and cell type dictionary into a single gene set dictionary

In [28]:
gene_set_dictionary = {}
gene_set_dictionary["global"] = spectra_global_curated_GS_dict
for key, value in all_cell_types_dictionary.items():
    gene_set_dictionary[key] = value

## Modify cell type names
**Important: The cell type labels have to match with the cell type labels in the gene set dictionary (+ global)**

In [29]:
gene_set_dictionary.keys()

dict_keys(['global', 'B', 'cDC', 'Mono', 'T_CD4_Naive', 'T_CD4_NonNaive', 'T_CD8_Naive', 'T_CD8_NonNaive', 'Plasma', 'UTC', 'pDC', 'ILC'])

In [30]:
# Define a mapping of old keys to new keys
key_mapping = {
    'Mac': 'Mono',
    'p-DC': 'pDC',
    'cDC': 'DC'
}

# Rename the keys in gene_set_dictionary
for old_key, new_key in key_mapping.items():
    if old_key in gene_set_dictionary:
        gene_set_dictionary[new_key] = gene_set_dictionary.pop(old_key)

In [31]:
gene_set_dictionary.keys()

dict_keys(['global', 'B', 'Mono', 'T_CD4_Naive', 'T_CD4_NonNaive', 'T_CD8_Naive', 'T_CD8_NonNaive', 'Plasma', 'UTC', 'pDC', 'ILC', 'DC'])

## Modify gene names to ENSG names

In [32]:
varDF = pd.read_csv(here('03_downstream_analysis/05_SPECTRA/results/varDF.csv'), index_col=0)
varDF

Unnamed: 0_level_0,hgnc_id,symbol,locus_group,HUGO_status,mt,ribo,hb,plt
ensembl_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ENSG00000000003,HGNC:11858,TSPAN6,protein_coding,official,False,False,False,False
ENSG00000000419,HGNC:3005,DPM1,protein_coding,official,False,False,False,False
ENSG00000000457,HGNC:19285,SCYL3,protein_coding,official,False,False,False,False
ENSG00000000460,HGNC:25565,FIRRM,protein_coding,official,False,False,False,False
ENSG00000000938,HGNC:3697,FGR,protein_coding,official,False,False,False,False
...,...,...,...,...,...,...,...,...
ENSG00000288253,,ENSG00000288253,lncRNA,non-official,False,False,False,False
ENSG00000288302,,LOC124903386,lncRNA,non-official,False,False,False,False
ENSG00000288398,,ENSG00000288398,lncRNA,non-official,False,False,False,False
ENSG00000288558,HGNC:54422,DUS4L-BCAP29,other,official,False,False,False,False


In [33]:
import warnings
warnings.filterwarnings("ignore")
# Initialize the new gene_set_dictionary_ensg
gene_set_dictionary_ensg = {}

# Iterate over each key-value pair in gene_set_dictionary
for cell_type, gene_sets in gene_set_dictionary.items():
    # Initialize a new dictionary for the cell type
    cell_type_dict_ensg = {}
    
    # Iterate over each gene set in the current cell type
    for gene_set_name, gene_set_genes in gene_sets.items():
        ID2SymbolDF = generateID2SymbolDF(varDF = varDF, 
                                          symbolList = set(gene_set_genes), 
                                          ID_col = 'index', 
                                          symbols_col = 'symbol', 
                                          HUGOstatus_col = 'HUGO_status', 
                                          behaviour = 'all')
        
        # Find genes with multiple ENSG IDs
        multiple_ids = ID2SymbolDF[ID2SymbolDF.duplicated(subset='symbol', keep=False)]
        if not multiple_ids.empty:
            print(f"Warning: Gene set '{gene_set_name}' in cell type '{cell_type}' contains {len(multiple_ids)} genes with multiple ENSG IDs:")
            print(multiple_ids)
        
        # Map symbols to ENSG IDs using ID2SymbolDF
        gene_set_genes_ensg = ID2SymbolDF['gene_id'].tolist()
        
        # Add the gene set to the new dictionary
        cell_type_dict_ensg[gene_set_name] = gene_set_genes_ensg
    
    # Add the new dictionary to gene_set_dictionary_ensg
    gene_set_dictionary_ensg[cell_type] = cell_type_dict_ensg
warnings.filterwarnings("default")

In [34]:
key_counts = {}

# Loop through the keys and calculate the number of keys within each key
total_count = 0
for key, value in gene_set_dictionary_ensg.items():
    if isinstance(value, dict):
        key_counts[key] = len(value)
        total_count += len(value)
    else:
        key_counts[key] = 0

# Print the number of keys within each key
for key, count in key_counts.items():
    print(f"{key}: {count}")

print(f"Total: {total_count}")

global: 6
B: 10
Mono: 11
T_CD4_Naive: 14
T_CD4_NonNaive: 14
T_CD8_Naive: 11
T_CD8_NonNaive: 11
Plasma: 9
UTC: 9
pDC: 9
ILC: 9
DC: 10
Total: 123


# Save GeneSet

In [35]:
with open('{}/03_downstream_analysis/05_SPECTRA/results/SPECTRA_GeneSet_symbol_v2.pickle'.format(workDir), 'wb') as f:
    pickle.dump(gene_set_dictionary, f, pickle.HIGHEST_PROTOCOL)

In [36]:
with open('{}/03_downstream_analysis/05_SPECTRA/results/SPECTRA_GeneSet_ensg_v2.pickle'.format(workDir), 'wb') as f:
    pickle.dump(gene_set_dictionary_ensg, f, pickle.HIGHEST_PROTOCOL)

In [37]:
def extract_genes(dictionary):
    for key, value in dictionary.items():
        if isinstance(value, dict):
            yield from extract_genes(value)
        else:
            yield value

In [38]:
import itertools
gene_set_dictionary_symbol = list(set(itertools.chain(*extract_genes(gene_set_dictionary))))
gene_set_dictionary_ensg = list(set(itertools.chain(*extract_genes(gene_set_dictionary_ensg))))

In [39]:
len(gene_set_dictionary_symbol)

865

In [40]:
len(gene_set_dictionary_ensg)

734