# Analysis of the selection

## The component selection (i.e., the list of components remaining after strong-constant removal) <a class="anchor" id="component_list"></a>

In [1]:
# Selected components, kept as an alphabetically ordered tuple of gene symbols.
components = (
    "ATF3", "ATF7", "CEBPA", "CREM", "E2F1", "EBF1", "ESR1", "FLI1",
    "FOS", "FOXO3", "FOXP2", "GATA1", "GFI1B", "HBP1", "HNF4A", "IKZF1",
    "IRF1", "IRF2", "JUN", "JUNB", "KLF1", "KLF6", "MEF2C", "MYB",
    "MYC", "NFATC1", "NRF1", "RELA", "RUNX1", "SP1", "SPI1", "STAT1",
    "STAT3", "STAT6", "TBP", "TBX21", "TCF12", "TRP53", "ZEB2",
)
print(f"Number of nodes : {len(components)}")

Number of nodes : 39


## Comparison with state-of-the-art models
* Hamey et al, 2017, Reconstructing blood stem cell regulatory network models from single-cell molecular profiles.
* Collombet et al, 2017, Logical modeling of lymphoid and myeloid cell specification and transdifferentiation.
* Moignard et al, 2015, Decoding the regulatory network of early blood development from single-cell gene expression measurements.

In [3]:
# Gene names exactly as written in each publication; they are mapped to
# reference symbols in a later step, so no renaming is done here.

# Hamey et al, 2017
hamey = {
    "BPTF", "PRDM16", "GATA3", "ETV6", "NKX2.3", "HOXA5", "ETS1", "NOTCH",
    "HOXB4", "GFI1B", "TAL1", "HHEX", "PBX1", "GATA2", "NFE2", "MEIS1",
    "CBFA2T3H", "ETS2", "GATA1", "LMO2", "LDB1", "IKZF1", "SMARCC1", "FLI1",
    "LYL1", "HOXA9", "MITF", "RUNX1", "TCF7", "ERG", "MYB",
}
print(f"Hamey: {len(hamey)}")

# Collombet et al, 2017
collombet = {
    "CSF1", "CSF1R", "CEBPA", "CEBPB", "MAC1", "RUNX1", "SPI1", "MEF2C",
    "IKZF1", "GFI1", "EGR1", "E2A", "ID2", "ETS1", "FOXO1", "EBF1",
    "PAX5", "CD19", "IL7", "IL7R",
}
print(f"Collombet: {len(collombet)}")

# Moignard et al, 2015
moignard = {
    "ETV2", "FLI1", "SCL", "GATA1", "NOTCH1", "SOX7", "HOXB4", "LYL1",
    "IKAROS", "ERG", "PU.1", "MYB", "NFE2", "ETS1", "ETO2", "HHEX",
    "LMO2", "SOX17", "GFI1", "GFI1B",
}
print(f"Moignard: {len(moignard)}")

Hamey: 31
Collombet: 20
Moignard: 20


**Gene name standardization:**

In [10]:
def get_dict_synonyms(path_NCBIgenedata: str, keep_first: bool = False) -> dict:
    """
    Create a dictionary matching each possible gene name to its NCBI symbol.

    The NCBI gene data table is streamed line by line in pure Python: no shell
    command is run and no temporary file is written next to the input, so the
    function works with any path (spaces included) and on any platform.

    Parsing rules:
      * the first line of the file is skipped (header);
      * columns are tab-separated; column 3 is the reference symbol, column 5
        the "|"-separated synonyms, and column 11 an additional symbol
        (presumably Symbol_from_nomenclature_authority in the NCBI gene_info
        layout -- confirm against the file header);
      * every name is upper-cased;
      * "-" placeholders, empty names and names equal to the row's own symbol
        are ignored.

    Collision rules:
      * a reference symbol always maps to itself, whatever the row order;
      * a name that is a synonym of several symbols is mapped to the LAST such
        symbol in the file by default, or to the FIRST one if keep_first is
        True.
        NOTE(review): an earlier inline comment stated that the first symbol
        was kept whereas the code kept the last one; the default preserves the
        behaviour of the code. Confirm which rule is wanted.

    INPUT
        path_NCBIgenedata: path to the NCBI gene data
        keep_first: if True, an ambiguous synonym keeps the first symbol met
                    instead of the last one (default: False)
    OUTPUT
        dictionary (key: gene name, value: reference gene name (being the NCBI symbol))
    """
    # 0-based positions of the columns that carry gene names (columns 3, 5, 11).
    name_columns = (2, 4, 10)

    gene_synonyms_dict = dict()
    symbols = set()

    with open(path_NCBIgenedata, "r", encoding="utf-8") as gene_info:
        # Drop the header line; the default avoids StopIteration on an empty file.
        next(gene_info, None)

        for row in gene_info:
            columns = row.rstrip("\r\n").upper().split("\t")
            if len(columns) <= name_columns[0]:
                # Blank or truncated line: there is no symbol column to read.
                continue

            # Flatten the selected columns into one list of names; "|" separates
            # several names inside a single column.
            names = []
            for position in name_columns:
                if position < len(columns):
                    names.extend(name.strip() for name in columns[position].split("|"))

            # The first name comes from the symbol column: it is the reference.
            ncbi_symbol = names[0]
            if not ncbi_symbol:
                continue

            gene_synonyms_dict[ncbi_symbol] = ncbi_symbol
            symbols.add(ncbi_symbol)

            for name in names[1:]:
                if not name or name == "-" or name == ncbi_symbol:
                    continue
                if name in symbols:
                    # Never redirect a known reference symbol to another gene.
                    continue
                if keep_first and name in gene_synonyms_dict:
                    continue
                gene_synonyms_dict[name] = ncbi_symbol

    return gene_synonyms_dict

In [4]:
def get_reference_gene_name(gene_name: str, dict_synonyms: dict) -> str:
    """
    Look up the reference name of a gene.

    The lookup is case-insensitive on the query side: the given name is
    upper-cased before being searched among the dictionary keys.

    INPUT
        gene_name: the gene name to translate
        dict_synonyms: dictionary (key: gene name, value: reference gene name)
    OUTPUT
        the reference name stored for the upper-cased gene name, or the
        upper-cased gene name itself when the dictionary has no entry for it
    """
    query = gene_name.upper()
    # Unknown names fall back to their own upper-cased form.
    return dict_synonyms.get(query, query)

In [8]:
def set_standardization(gene_set: set, synonyms) -> set:
    """
    Translate every gene name of a set into its reference name.

    INPUT
        gene_set: the gene names to translate
        synonyms: dictionary passed as is to get_reference_gene_name
    OUTPUT
        a new set holding the reference name of each gene of gene_set
        (names translating to the same reference name collapse into one entry)
    """
    standardized = set()
    for name in gene_set:
        standardized.add(get_reference_gene_name(name, synonyms))
    return standardized

In [12]:
import os
# Name -> reference symbol lookup, built from the Mus musculus gene_info file
# (the file name carries the date stamp 20221005).
synonyms = get_dict_synonyms(
    path_NCBIgenedata="data/Mus_musculus.gene_info.20221005.tsv"
)

In [42]:
# Standardize Hamey's gene names, then report the names that were replaced
# ("before") and the reference names that replaced them ("after").
hamey_ref = set_standardization(gene_set=hamey, synonyms=synonyms)
print(f"Hamey's genes standardization:\nbefore:\t{hamey - hamey_ref}\nafter:\t{hamey_ref - hamey}")

Hamey's genes standardization:
before:	{'CBFA2T3H', 'NKX2.3'}
after:	{'CBFA2T3', 'NKX2-3'}


In [43]:
# Standardize Collombet's gene names, then report the names that were replaced
# ("before") and the reference names that replaced them ("after").
collombet_ref = set_standardization(gene_set=collombet, synonyms=synonyms)
print(f"Collombet's genes standardization:\nbefore:\t{collombet - collombet_ref}\nafter:\t{collombet_ref - collombet}")

Collombet's genes standardization:
before:	{'MAC1', 'E2A'}
after:	{'TCF3', 'ITGAM'}


In [45]:
# Standardize Moignard's gene names, then report the names that were replaced
# ("before") and the reference names that replaced them ("after").
moignard_ref = set_standardization(gene_set=moignard, synonyms=synonyms)
print(f"Moignard' genes standardization:\nbefore:\t{moignard - moignard_ref}\nafter:\t{moignard_ref - moignard}")

Moignard' genes standardization:
before:	{'SCL', 'ETO2', 'IKAROS', 'PU.1'}
after:	{'CBFA2T3', 'IKZF1', 'SCLY', 'SPI1'}


**Comparison of the state-of-the-art models with each other, and with the component selection obtained via BoNesis:**

In [53]:
# Compute each overlap between the standardized models once, then report its
# size and its alphabetically sorted content.
shared_hamey_moignard = hamey_ref & moignard_ref
shared_hamey_collombet = hamey_ref & collombet_ref
shared_collombet_moignard = collombet_ref & moignard_ref
shared_by_all_models = hamey_ref & moignard_ref & collombet_ref

print(f"Hamey - Moignard intersection: {len(shared_hamey_moignard)} genes\n{sorted(shared_hamey_moignard)}\n")
print(f"Hamey - Collombet intersection: {len(shared_hamey_collombet)} genes\n{sorted(shared_hamey_collombet)}\n")
print(f"Collombet - Moignard intersection: {len(shared_collombet_moignard)} genes\n{sorted(shared_collombet_moignard)}\n")
print(f"Hamey - Collombet - Moignard: {len(shared_by_all_models)} genes\n{sorted(shared_by_all_models)}\n")

Hamey - Moignard intersection: 13 genes
['CBFA2T3', 'ERG', 'ETS1', 'FLI1', 'GATA1', 'GFI1B', 'HHEX', 'HOXB4', 'IKZF1', 'LMO2', 'LYL1', 'MYB', 'NFE2']

Hamey - Collombet intersection: 3 genes
['ETS1', 'IKZF1', 'RUNX1']

Collombet - Moignard intersection: 4 genes
['ETS1', 'GFI1', 'IKZF1', 'SPI1']

Hamey - Collombet - Moignard: 2 genes
['ETS1', 'IKZF1']



In [66]:
# Overlap between the component selection and each standardized model.
# Every gene list is printed sorted: a set has no stable iteration order for
# strings across interpreter runs, so printing list(<set>) would make the
# output of a re-run differ as soon as more than one gene is shared.
common_hamey = set(components) & hamey_ref
common_moignard = set(components) & moignard_ref
common_collombet = set(components) & collombet_ref
common_all_models = common_hamey & moignard_ref & collombet_ref

print("Comparison of component selection via BoNesis with:\n")
print(f" - Hamey ({len(hamey)} nds), nb of common genes: {len(common_hamey)}\n{sorted(common_hamey)}\n")
print(f" - Moignard ({len(moignard)} nds), nb of common genes: {len(common_moignard)}\n{sorted(common_moignard)}\n")
print(f" - Collombet ({len(collombet)} nds), nb of common genes: {len(common_collombet)}\n{sorted(common_collombet)}\n")
print(f"Genes in common with the intersection of the three state-of-the-art models and the selection via BoNesis: {len(common_all_models)}\n{sorted(common_all_models)}")

Comparison of component selection via BoNesis with:

 - Hamey (31 nds), nb of common genes: 6
['FLI1', 'GATA1', 'GFI1B', 'IKZF1', 'MYB', 'RUNX1']

 - Moignard (20 nds), nb of common genes: 6
['FLI1', 'GATA1', 'GFI1B', 'IKZF1', 'MYB', 'SPI1']

 - Collombet (20 nds), nb of common genes: 6
['CEBPA', 'EBF1', 'IKZF1', 'MEF2C', 'RUNX1', 'SPI1']

Genes in common with the intersection of the three state-of-the-art models and the selection via BoNesis: 1
['IKZF1']


In [70]:
# Genes of the selection that also appear in each standardized model.
with_hamey = set(components) & hamey_ref
with_moignard = set(components) & moignard_ref
with_collombet = set(components) & collombet_ref

# Genes of the selection found in at least one of the three models.
selected_genes = with_hamey | with_moignard | with_collombet

# Every distinct gene over the three standardized models.
model_genes_union = hamey_ref | moignard_ref | collombet_ref

print(f"Hamey, Moignard, Collombet models have {len(model_genes_union)} distinct genes.")
print(f"{len(selected_genes)} over the {len(components)} genes of our selection are in common with those being in state-of-the-art models: \n{sorted(selected_genes)}")

Hamey, Moignard, Collombet models have 53 distinct genes.
10 over the 39 genes of our selection are in common with those being in state-of-the-art models: 
['CEBPA', 'EBF1', 'FLI1', 'GATA1', 'GFI1B', 'IKZF1', 'MEF2C', 'MYB', 'RUNX1', 'SPI1']
