In [1]:
from biocypher import BioCypher
from genespectrakg.adapters.genespectra_adapter_individual import (
    GeneSpectraAdapter,
    GeneSpectraAdapterNodeType,
    GeneSpectraAdapterEdgeType,
    GeneSpectraAdapterCellTypeField,
    GeneSpectraAdapterGeneField,
    GeneSpectraAdapterOrthologousGroupField,
    GeneSpectraAdapterSpeciesField,
    GeneSpectraAdapterEdgeField,
)

# Set up the BioCypher interface. Framework settings are read from
# `config/biocypher_config.yaml`; additional settings can instead be passed
# as keyword arguments to the constructor.
bc = BioCypher(biocypher_config_path="config/biocypher_config.yaml")

# Node types to include in the knowledge graph.
# The members come from `GeneSpectraAdapterNodeType`, imported from
# `genespectrakg.adapters.genespectra_adapter_individual`.
node_types = [
    GeneSpectraAdapterNodeType.CELL_TYPE,
    GeneSpectraAdapterNodeType.GENE,
    GeneSpectraAdapterNodeType.SPECIES,
    GeneSpectraAdapterNodeType.ORTHOLOGOUS_GROUP,
]

# Node fields to include in the knowledge graph, listed per node type
# (cell type, gene, species, orthologous group). The field classes are
# imported from `genespectrakg.adapters.genespectra_adapter_individual`.
#
# NOTE(review): the recorded run of `bc.write_nodes(adapter.get_nodes())` in
# this notebook failed with `KeyError: 'eggnog_dataset_name'`, and the
# displayed `adapter.eggnog` table has no column of that name. Presumably
# EGGNOG_DATASET_NAME requires that column in the EggNOG input file — confirm
# against the adapter, then either add the column to the data or drop the
# field from this list.
node_fields = [
    # Cell type fields, followed by gene, species and orthologous-group fields
    GeneSpectraAdapterCellTypeField.CELL_TYPE_ID,
    GeneSpectraAdapterCellTypeField.CELL_TYPE_NAME,
    GeneSpectraAdapterCellTypeField.TISSUE_ID,
    GeneSpectraAdapterCellTypeField.TISSUE_NAME,
    GeneSpectraAdapterGeneField.GENE_ID,
    GeneSpectraAdapterGeneField.GENE_NAME,
    GeneSpectraAdapterSpeciesField.SPECIES_ID,
    GeneSpectraAdapterSpeciesField.SPECIES_NAME,
    GeneSpectraAdapterOrthologousGroupField.EGGNOG_DATASET_NAME,
    GeneSpectraAdapterOrthologousGroupField.ORTHOLOGOUS_GROUP_ID,
    GeneSpectraAdapterOrthologousGroupField.EGGNOG_DATASET_ID,
]

# Edge types to include in the knowledge graph; passed to the adapter as
# `edge_types`.
edge_types = [
    GeneSpectraAdapterEdgeType.CELL_TYPE_FROM_SPECIES,
    GeneSpectraAdapterEdgeType.GENE_ENHANCED_IN_CELL_TYPE,
    GeneSpectraAdapterEdgeType.GENE_ENRICHED_IN_CELL_TYPE,
    GeneSpectraAdapterEdgeType.GENE_FROM_SPECIES,
    GeneSpectraAdapterEdgeType.GENE_IN_ORTHOLOGOUS_GROUP,
]

# Edge fields to include in the knowledge graph; passed to the adapter as
# `edge_fields`.
# NOTE(review): presumably these apply to the gene enriched/enhanced in cell
# type edges — confirm against the adapter.
edge_fields = [
    GeneSpectraAdapterEdgeField.SPECIFICITY_CATEGORY,
    GeneSpectraAdapterEdgeField.DISTRIBUTION_CATEGORY,
    GeneSpectraAdapterEdgeField.SPECIFICITY_CATEGORY_TYPE,
    GeneSpectraAdapterEdgeField.FRACTION_EXPRESSED,
    GeneSpectraAdapterEdgeField.MAX_EXPRESSION,
    GeneSpectraAdapterEdgeField.MEAN_EXPRESSION,
    GeneSpectraAdapterEdgeField.NUMBER_EXPRESSED,
    GeneSpectraAdapterEdgeField.SPECIFICITY_SCORE,
]



INFO -- This is BioCypher v0.5.19.
INFO -- Logging into `biocypher-log/biocypher-20231130-101005.log`.


In [5]:

# Build the GeneSpectra adapter, restricted to the node/edge types and fields
# chosen in the configuration cell.
adapter = GeneSpectraAdapter(
    node_types=node_types, node_fields=node_fields,
    edge_types=edge_types, edge_fields=edge_fields,
)

# Input tables handed to the adapter, keyed by the loader's keyword names:
# the EggNOG mapping, the cell type to ontology mapping, and the processed
# GeneSpectra classes.
input_files = {
    "eggnog_file": 'data/MTG_eggnog_ensembl_ncbi_genename_mapped_mammalia.csv',
    "cell_ontology_file": 'data/MTG_cell_type_to_ontology.csv',
    "genespectra_file": 'data/human_classes_subclass_processed.csv',
}
adapter.load_genespectra_data(**input_files)



INFO -- Loading data.


In [6]:
# Display the EggNOG table held by the adapter after loading, to inspect its
# columns and size.
adapter.eggnog

Unnamed: 0,ensembl_gene_id,ncbi_gene_name,ncbi_txid,species_scientific_name,eggnog_og_id,eggnog_dataset_id
0,ENSG00000121410,A1BG,9606,human,8ZMKA,40674
1,ENSG00000175899,A2M,9606,human,8ZNU1,40674
2,ENSG00000171428,NAT1,9606,human,8Z4IU,40674
3,ENSG00000156006,NAT2,9606,human,8Z4IU,40674
4,ENSG00000114771,AADAC,9606,human,8ZJEX,40674
...,...,...,...,...,...,...
50462,ENSGGOG00000002196,ZNF768,9593,gorilla,8ZKSF,40674
50463,ENSGGOG00000025968,VKORC1,9593,gorilla,8ZEBQ,40674
50464,ENSGGOG00000016094,SLC25A53,9593,gorilla,8ZF2G,40674
50465,ENSGGOG00000004937,SPMIP6,9593,gorilla,8ZBQ1,40674


In [4]:

# Write the knowledge graph nodes produced by the adapter through BioCypher.
# NOTE(review): the recorded output of this cell ends in
# `KeyError: 'eggnog_dataset_name'` while generating orthologous-group nodes;
# confirm the loaded EggNOG table provides that column before re-running.
bc.write_nodes(adapter.get_nodes())



INFO -- Loading ontologies...
INFO -- Instantiating OntologyAdapter class for https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl.
INFO -- Creating output directory `/Users/ysong/SOFTWARE/GeneSpectraKG/biocypher-out/20231130101102`.
INFO -- Generating nodes.


get gene nodes from EggNOG




get OG nodes from EggNOG


KeyError: 'eggnog_dataset_name'

In [5]:
# Display the `genespectra_enriched` table held by the adapter.
# NOTE(review): the displayed table has a column spelled
# `distribution_catehory` — presumably a typo for `distribution_category` in
# the input data; verify that the adapter reads the spelling actually present.
adapter.genespectra_enriched

Unnamed: 0,cell_ontology_id,cell_type_name,uberon_tissue_id,tissue_name,ensembl_gene_id,external_gene_name,ncbi_txid,species_scientific_name,specificity_category,distribution_catehory,specificity_category_type,fraction_expressed,max_expression,mean_expression,n_expressed,specificity_score
0,CL:1001602,cerebral cortex endothelial cell,UBERON:0002771,middle temporal gyrus,ENSG00000175899,A2M,9606,hsapiens,group enriched,expressed in less than 30%,enriched,19.047619047619047,2500.244384765625,203.2610626220703,4,8.942085200463916
1,CL:0000129,microglial cell,UBERON:0002771,middle temporal gyrus,ENSG00000175899,A2M,9606,hsapiens,group enriched,expressed in less than 30%,enriched,19.047619047619047,2500.244384765625,203.2610626220703,4,8.942085200463916
3,CL:0000129,microglial cell,UBERON:0002771,middle temporal gyrus,ENSG00000108798,ABI3,9606,hsapiens,cell type enriched,expressed in single,enriched,4.761904761904762,24.42460060119629,1.4966096878051758,1,4.0
7,CL:0000129,microglial cell,UBERON:0002771,middle temporal gyrus,ENSG00000183549,ACSM5,9606,hsapiens,cell type enriched,expressed in single,enriched,4.761904761904762,50.14519882202149,2.9263617992401123,1,4.0
14,CL:0000129,microglial cell,UBERON:0002771,middle temporal gyrus,ENSG00000169252,ADRB2,9606,hsapiens,cell type enriched,expressed in single,enriched,4.761904761904762,71.07710266113281,3.959357023239136,1,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5318,CL:0000128,oligodendrocyte,UBERON:0002771,middle temporal gyrus,ENSG00000198597,ZNF536,9606,hsapiens,group enriched,expressed in over 30%,enriched,47.61904761904761,2684.9912109375,577.9708862304688,10,7.050118059707626
5319,CL:4023018,pvalb GABAergic cortical interneuron,UBERON:0002771,middle temporal gyrus,ENSG00000198597,ZNF536,9606,hsapiens,group enriched,expressed in over 30%,enriched,47.61904761904761,2684.9912109375,577.9708862304688,10,7.050118059707626
5320,CL:4023015,sncg GABAergic cortical interneuron,UBERON:0002771,middle temporal gyrus,ENSG00000198597,ZNF536,9606,hsapiens,group enriched,expressed in over 30%,enriched,47.61904761904761,2684.9912109375,577.9708862304688,10,7.050118059707626
5321,CL:4023016,vip GABAergic cortical interneuron,UBERON:0002771,middle temporal gyrus,ENSG00000198597,ZNF536,9606,hsapiens,group enriched,expressed in over 30%,enriched,47.61904761904761,2684.9912109375,577.9708862304688,10,7.050118059707626


In [6]:

# Write the knowledge graph edges produced by the adapter through BioCypher.
# The recorded run logs one CSV part file per edge type and displays `True`.
bc.write_edges(adapter.get_edges())


INFO -- Generating edges.


Get cell type from species edges
Get gene from species edges
Get gene from OG edges
Get gene enriched in cell type edges
Get gene enhanced in cell type edges


INFO -- Writing 21 entries to CellTypeFromSpecies-part000.csv
INFO -- Writing 72595 entries to GeneFromSpecies-part000.csv
INFO -- Writing 72697 entries to GeneInOrthologousGroup-part000.csv
INFO -- Writing 3101 entries to GeneEnrichedInCellType-part000.csv
INFO -- Writing 2223 entries to GeneEnhancedInCellType-part000.csv


True

In [7]:

# Write the neo4j admin import call; the recorded run logs it being written
# to `neo4j-admin-import-call.sh` inside the BioCypher output directory.
bc.write_import_call()

# Print a summary: the recorded run reports duplicate/missing-label checks
# and shows the ontology structure used for the graph.
bc.summary()

INFO -- Writing neo4j import call to `/Users/ysong/SOFTWARE/GeneSpectraKG/biocypher-out/20231122172537/neo4j-admin-import-call.sh`.
INFO -- No duplicate nodes in input.
INFO -- No duplicate edges in input.
INFO -- No missing labels in input.


Showing ontology structure based on https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl
entity
├── association
│   ├── cell type from species
│   ├── gene enhanced in cell type
│   ├── gene enriched in cell type
│   ├── gene from species
│   └── gene to gene family association
│       └── gene in orthologous group
└── named thing
    └── biological entity
        ├── gene
        ├── gene family
        │   └── orthologous group
        └── organismal entity
            ├── anatomical entity
            │   └── cell type
            └── cellular organism
                └── species

