In [1]:
from biocypher import BioCypher
from genespectrakg.adapters.genespectra_adapter_individual import (
    GeneSpectraAdapter,
    GeneSpectraAdapterNodeType,
    GeneSpectraAdapterEdgeType,
    GeneSpectraAdapterCellTypeField,
    GeneSpectraAdapterGeneField,
    GeneSpectraAdapterOrthologousGroupField,
    GeneSpectraAdapterSpeciesField,
    GeneSpectraAdapterEdgeField,
)

# Set up the BioCypher interface.
# The framework settings are read from the YAML file named below; they can
# alternatively be passed to the constructor as keyword arguments.
BIOCYPHER_CONFIG_PATH = "config/biocypher_config.yaml"
bc = BioCypher(biocypher_config_path=BIOCYPHER_CONFIG_PATH)

# Node types to include in the knowledge graph.
# The available members come from `GeneSpectraAdapterNodeType`, imported above
# from `genespectrakg.adapters.genespectra_adapter_individual`.
node_types = [
    GeneSpectraAdapterNodeType.CELL_TYPE,
    GeneSpectraAdapterNodeType.GENE,
    GeneSpectraAdapterNodeType.SPECIES,
    GeneSpectraAdapterNodeType.ORTHOLOGOUS_GROUP,
]

# Node fields to include in the knowledge graph, listed per node type.
# The available members come from the adapter's `*Field` classes imported
# above from `genespectrakg.adapters.genespectra_adapter_individual`.
node_fields = [
    # Cell type fields
    GeneSpectraAdapterCellTypeField.CELL_TYPE_ID,
    GeneSpectraAdapterCellTypeField.CELL_TYPE_NAME,
    GeneSpectraAdapterCellTypeField.TISSUE_ID,
    GeneSpectraAdapterCellTypeField.TISSUE_NAME,
    GeneSpectraAdapterCellTypeField.BROAD_TYPE,
    GeneSpectraAdapterCellTypeField.BROAD_TYPE_2,
    GeneSpectraAdapterCellTypeField.BROAD_TYPE_3, 
    # Gene fields
    GeneSpectraAdapterGeneField.GENE_ID,
    GeneSpectraAdapterGeneField.GENE_NAME,
    GeneSpectraAdapterGeneField.PEPTIDE_ID,
    # Species fields
    GeneSpectraAdapterSpeciesField.SPECIES_ID,
    GeneSpectraAdapterSpeciesField.SPECIES_NAME,
    # Orthologous group fields
    GeneSpectraAdapterOrthologousGroupField.EGGNOG_DATASET_NAME,
    GeneSpectraAdapterOrthologousGroupField.ORTHOLOGOUS_GROUP_ID,
    GeneSpectraAdapterOrthologousGroupField.EGGNOG_DATASET_ID,
]

# Edge types to include in the knowledge graph.
# The available members come from `GeneSpectraAdapterEdgeType`.
edge_types = [
    GeneSpectraAdapterEdgeType.CELL_TYPE_FROM_SPECIES,
    GeneSpectraAdapterEdgeType.GENE_ENHANCED_IN_CELL_TYPE,
    GeneSpectraAdapterEdgeType.GENE_ENRICHED_IN_CELL_TYPE,
    GeneSpectraAdapterEdgeType.GENE_FROM_SPECIES,
    GeneSpectraAdapterEdgeType.GENE_IN_ORTHOLOGOUS_GROUP,
    GeneSpectraAdapterEdgeType.GENE_LOW_SPECIFICITY_IN_SPECIES,
]

# Edge fields to include in the knowledge graph.
# The available members come from `GeneSpectraAdapterEdgeField`.
edge_fields = [
    GeneSpectraAdapterEdgeField.SPECIFICITY_CATEGORY,
    GeneSpectraAdapterEdgeField.DISTRIBUTION_CATEGORY,
    GeneSpectraAdapterEdgeField.SPECIFICITY_CATEGORY_TYPE,
    GeneSpectraAdapterEdgeField.FRACTION_EXPRESSED,
    GeneSpectraAdapterEdgeField.MAX_EXPRESSION,
    GeneSpectraAdapterEdgeField.MEAN_EXPRESSION,
    GeneSpectraAdapterEdgeField.NUMBER_EXPRESSED,
    GeneSpectraAdapterEdgeField.SPECIFICITY_SCORE,
    GeneSpectraAdapterEdgeField.GROUPS_EXPRESSED,
]

%load_ext autoreload
%autoreload 2


INFO -- This is BioCypher v0.5.19.
INFO -- Logging into `biocypher-log/biocypher-20240402-162544.log`.


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Exploratory: print the built-in help for `bc.write_import_call`.
# NOTE(review): nothing later in the notebook appears to depend on this cell;
# consider removing it from the final notebook.
help(bc.write_import_call)

Help on method write_import_call in module biocypher._core:

write_import_call() -> None method of biocypher._core.BioCypher instance
    Write a shell script to import the database depending on the chosen
    DBMS.



In [3]:

# Build the GeneSpectra adapter, restricted to the node/edge types and fields
# selected earlier in the notebook.
adapter = GeneSpectraAdapter(
    node_types=node_types,
    node_fields=node_fields,
    edge_types=edge_types,
    edge_fields=edge_fields,
)

# Input CSV files handed to the adapter's loader (paths are relative to the
# directory the notebook is run from).
eggnog_file = "data/MTG_eggnog_ensembl_mapped_5_taxa.csv"
cell_ontology_file = "data/MTG_cell_type_to_ontology_taxo_cs_only.csv"
genespectra_file = "data/1TPM_lim_taxo_species/all_species_classes_taxo_species_1TPM_ensembl_name_processed_all_spec_and_lowspec.csv"

adapter.load_genespectra_data(
    eggnog_file=eggnog_file,
    cell_ontology_file=cell_ontology_file,
    genespectra_file=genespectra_file,
)



INFO -- Loading data.


In [4]:
# Inspect the cell type table held by the adapter after loading.
adapter.cell_type

Unnamed: 0,cell_ontology_id,cell_type_name,uberon_tissue_id,tissue_name,broad_type,broad_type_2,broad_type_3
0,,Astro,UBERON:0002771,middle temporal gyrus,Non-neuronal,Non-neuronal,Non-neuronal
1,,Oligo,UBERON:0002771,middle temporal gyrus,Non-neuronal,Non-neuronal,Non-neuronal
2,,VLMC,UBERON:0002771,middle temporal gyrus,Non-neuronal,Non-neuronal,Non-neuronal
3,,Micro-PVM,UBERON:0002771,middle temporal gyrus,Non-neuronal,Non-neuronal,Non-neuronal
4,,OPC,UBERON:0002771,middle temporal gyrus,Non-neuronal,Non-neuronal,Non-neuronal
5,,Endo,UBERON:0002771,middle temporal gyrus,Non-neuronal,Non-neuronal,Non-neuronal
6,,Non-IT,UBERON:0002771,middle temporal gyrus,Deep Exc,Exc,Neuronal
7,,CGE-derived,UBERON:0002771,middle temporal gyrus,Inhibitory,Inhibitory,Neuronal
8,,MGE-derived,UBERON:0002771,middle temporal gyrus,Inhibitory,Inhibitory,Neuronal
9,,IT,UBERON:0002771,middle temporal gyrus,IT Exc,Exc,Neuronal


In [8]:
# Inspect the species table held by the adapter after loading.
adapter.species

Unnamed: 0,ncbi_txid,species_scientific_name
0,9606,hsapiens
18962,9544,mmulatta
35734,9483,cjacchus
49529,9598,ptroglodytes
67206,9593,ggorilla


In [11]:
# Inspect the low-specificity gene table held by the adapter after loading.
# NOTE(review): this table looks large (tens of thousands of rows); consider
# displaying only `.head()` to keep the saved notebook output small.
adapter.genespectra_lowspec


Unnamed: 0,cell_ontology_id,cell_type_name,uberon_tissue_id,tissue_name,ensembl_gene_id,external_gene_name,ncbi_txid,species_scientific_name,specificity_category,distribution_catehory,specificity_category_type,fraction_expressed,max_expression,mean_expression,n_expressed,specificity_score,groups_expressed
0,,,UBERON:0002771,middle temporal gyrus,ENSPTRG00000012018,AAK1,9598,ptroglodytes,low cell type specificity,expressed in over 90%,low cell type specificity,100.0,1285.5156,578.16156,10,0.0,Astro;CGE-derived;Endo;IT;MGE-derived;Micro-PV...
1,,,UBERON:0002771,middle temporal gyrus,ENSPTRG00000051945,ABCA7,9598,ptroglodytes,low cell type specificity,expressed in over 90%,low cell type specificity,100.0,36.4581,14.60127,10,0.0,Astro;CGE-derived;Endo;IT;MGE-derived;Micro-PV...
2,,,UBERON:0002771,middle temporal gyrus,ENSPTRG00000011878,ABCG5,9598,ptroglodytes,low cell type specificity,expressed in over 30%,low cell type specificity,40.0,1.379,0.74211,4,0.0,CGE-derived;Endo;IT;Non-IT
3,,,UBERON:0002771,middle temporal gyrus,ENSPTRG00000011749,ABHD1,9598,ptroglodytes,low cell type specificity,expressed in over 30%,low cell type specificity,60.0,4.0714,1.6765101,6,0.0,Astro;CGE-derived;Endo;Micro-PVM;OPC;Oligo
4,,,UBERON:0002771,middle temporal gyrus,ENSPTRG00000013340,ABHD12,9598,ptroglodytes,low cell type specificity,expressed in over 90%,low cell type specificity,100.0,265.4951,113.730995,10,0.0,Astro;CGE-derived;Endo;IT;MGE-derived;Micro-PV...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47634,,,UBERON:0002771,middle temporal gyrus,ENSCJAG00000080833,ZNF791,9483,cjacchus,low cell type specificity,expressed in over 90%,low cell type specificity,100.0,22.1833,10.14906,10,0.0,Astro;CGE-derived;Endo;IT;MGE-derived;Micro-PV...
47635,,,UBERON:0002771,middle temporal gyrus,ENSCJAG00000007041,ZNHIT6,9483,cjacchus,low cell type specificity,expressed in over 90%,low cell type specificity,100.0,91.9614,69.602036,10,0.0,Astro;CGE-derived;Endo;IT;MGE-derived;Micro-PV...
47636,,,UBERON:0002771,middle temporal gyrus,ENSCJAG00000043512,ZSCAN22,9483,cjacchus,low cell type specificity,expressed in over 90%,low cell type specificity,100.0,4.0712,2.27635,10,0.0,Astro;CGE-derived;Endo;IT;MGE-derived;Micro-PV...
47637,,,UBERON:0002771,middle temporal gyrus,ENSCJAG00000002903,ZSCAN29,9483,cjacchus,low cell type specificity,expressed in over 90%,low cell type specificity,100.0,8.6226,3.4595902,10,0.0,Astro;CGE-derived;Endo;IT;MGE-derived;Micro-PV...


In [5]:

# Obtain the nodes from the adapter and pass them to BioCypher for writing.
nodes = adapter.get_nodes()
bc.write_nodes(nodes)

INFO -- Loading ontologies...
INFO -- Instantiating OntologyAdapter class for https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl.
INFO -- Creating output directory `/Users/ysong/SOFTWARE/GeneSpectraKG/biocypher-out/20240402162547`.
INFO -- Generating nodes.


get gene nodes from genespectra and EggNOG




get OG nodes from EggNOG
get species nodes from EggNOG
get cell type nodes from cell ontology info


INFO -- Writing 86758 entries to Gene-part000.csv
INFO -- Writing 76839 entries to OrthologousGroup-part000.csv
INFO -- Writing 5 entries to Species-part000.csv
INFO -- Writing 10 entries to CellType-part000.csv


True

In [6]:

# Obtain the edges from the adapter and pass them to BioCypher for writing.
edges = adapter.get_edges()
bc.write_edges(edges)


INFO -- Generating edges.


Get cell type from species edges
Get gene from species edges
Get gene from OG edges
Get gene enriched in cell type edges
Get gene enhanced in cell type edges
Get gene low specifricity in species edges


INFO -- Writing 50 entries to CellTypeFromSpecies-part000.csv
INFO -- Writing 86758 entries to GeneFromSpecies-part000.csv
INFO -- Writing 414386 entries to GeneInOrthologousGroup-part000.csv
INFO -- Writing 31461 entries to GeneEnrichedInCellType-part000.csv
INFO -- Writing 4130 entries to GeneEnhancedInCellType-part000.csv
INFO -- Writing 47345 entries to GeneLowSpecificityInSpecies-part000.csv


True

In [7]:

# Write the admin import statement (a shell script for importing the written
# files into the configured DBMS).
bc.write_import_call()

# Print a summary of the run.
# TODO: duplicate gene nodes are still reported here. According to the
# original author's note, the same gene ID paired with different peptide IDs
# leads to duplicate gene node entries; this needs to be resolved upstream of
# the write step.
bc.summary()

INFO -- Writing broadtaxo import call to `/Users/ysong/SOFTWARE/GeneSpectraKG/biocypher-out/20240402162547/neo4j-admin-import-call.sh`.
INFO -- Duplicate node types encountered (IDs in log): 
    gene



INFO -- Duplicate edge types encountered (IDs in log): 
    gene low specificity in species
    gene enriched in cell type
    gene in orthologous group
    gene enhanced in cell type
    gene from species

INFO -- No missing labels in input.


Showing ontology structure based on https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl
entity
├── association
│   ├── cell type from species
│   ├── gene enhanced in cell type
│   ├── gene enriched in cell type
│   ├── gene from species
│   ├── gene low specificity in species
│   └── gene to gene family association
│       └── gene in orthologous group
└── named thing
    └── biological entity
        ├── gene
        ├── gene family
        │   └── orthologous group
        └── organismal entity
            ├── anatomical entity
            │   └── cell
            │       └── cell type
            └── cellular organism
                └── species

