In [1]:
from biocypher import BioCypher
from genespectrakg.adapters.genespectra_adapter_individual import (
    GeneSpectraAdapter,
    GeneSpectraAdapterNodeType,
    GeneSpectraAdapterEdgeType,
    GeneSpectraAdapterCellTypeField,
    GeneSpectraAdapterGeneField,
    GeneSpectraAdapterOrthologousGroupField,
    GeneSpectraAdapterSpeciesField,
    GeneSpectraAdapterEdgeField,
)

# Instantiate the BioCypher interface.
# Framework settings are read from `config/biocypher_config.yaml`; they can
# alternatively be supplied as keyword arguments to `BioCypher(...)`.
bc = BioCypher(
    biocypher_config_path="config/biocypher_config.yaml",
)

# Node types to include in the knowledge graph.
# The available types are defined alongside the adapter in
# `genespectrakg.adapters.genespectra_adapter_individual`.
node_types = [
    GeneSpectraAdapterNodeType.CELL_TYPE,
    GeneSpectraAdapterNodeType.GENE,
    GeneSpectraAdapterNodeType.SPECIES,
    GeneSpectraAdapterNodeType.ORTHOLOGOUS_GROUP,
]

# Node property fields to include in the knowledge graph, grouped by the node
# type they belong to. The available fields are defined alongside the adapter
# in `genespectrakg.adapters.genespectra_adapter_individual`.
node_fields = [
    # Cell type fields
    GeneSpectraAdapterCellTypeField.CELL_TYPE_ID,
    GeneSpectraAdapterCellTypeField.CELL_TYPE_NAME,
    GeneSpectraAdapterCellTypeField.TISSUE_ID,
    GeneSpectraAdapterCellTypeField.TISSUE_NAME,
    # Gene fields
    GeneSpectraAdapterGeneField.GENE_ID,
    GeneSpectraAdapterGeneField.GENE_NAME,
    # Species fields
    GeneSpectraAdapterSpeciesField.SPECIES_ID,
    GeneSpectraAdapterSpeciesField.SPECIES_NAME,
    # Orthologous group fields
    GeneSpectraAdapterOrthologousGroupField.EGGNOG_DATASET_NAME,
    GeneSpectraAdapterOrthologousGroupField.ORTHOLOGOUS_GROUP_ID,
    GeneSpectraAdapterOrthologousGroupField.EGGNOG_DATASET_ID,
]

# Edge (relationship) types to include in the knowledge graph.
edge_types = [
    GeneSpectraAdapterEdgeType.CELL_TYPE_FROM_SPECIES,
    GeneSpectraAdapterEdgeType.GENE_ENHANCED_IN_CELL_TYPE,
    GeneSpectraAdapterEdgeType.GENE_ENRICHED_IN_CELL_TYPE,
    GeneSpectraAdapterEdgeType.GENE_FROM_SPECIES,
    GeneSpectraAdapterEdgeType.GENE_IN_ORTHOLOGOUS_GROUP,
]

# Edge property fields to include in the knowledge graph.
# NOTE(review): judging by the names these look like expression/specificity
# attributes of the gene-to-cell-type edges -- confirm against the adapter.
edge_fields = [
    GeneSpectraAdapterEdgeField.SPECIFICITY_CATEGORY,
    GeneSpectraAdapterEdgeField.DISTRIBUTION_CATEGORY,
    GeneSpectraAdapterEdgeField.SPECIFICITY_CATEGORY_TYPE,
    GeneSpectraAdapterEdgeField.FRACTION_EXPRESSED,
    GeneSpectraAdapterEdgeField.MAX_EXPRESSION,
    GeneSpectraAdapterEdgeField.MEAN_EXPRESSION,
    GeneSpectraAdapterEdgeField.NUMBER_EXPRESSED,
    GeneSpectraAdapterEdgeField.SPECIFICITY_SCORE,
    GeneSpectraAdapterEdgeField.GROUPS_EXPRESSED,
]

%load_ext autoreload
%autoreload 2


INFO -- This is BioCypher v0.5.19.
INFO -- Logging into `biocypher-log/biocypher-20240123-124355.log`.


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:

# Input tables consumed by the GeneSpectra adapter, named here so they are
# easy to find and change (all are relative paths).
EGGNOG_FILE = 'data/MTG_eggnog_ensembl_mapped_5_taxa.csv'
CELL_ONTOLOGY_FILE = 'data/MTG_cell_type_to_ontology.csv'
GENESPECTRA_FILE = 'data/1TPM_lim/all_species_classes_subclass_1TPM_ensembl_name_processed_all.csv'

# Create the GeneSpectra adapter, restricted to the node/edge types and the
# node/edge fields selected earlier in the notebook.
adapter = GeneSpectraAdapter(
    node_types=node_types,
    node_fields=node_fields,
    edge_types=edge_types,
    edge_fields=edge_fields,
)

# Load the EggNOG mapping, the cell-type-to-ontology mapping and the
# GeneSpectra results into the adapter.
adapter.load_genespectra_data(
    eggnog_file=EGGNOG_FILE,
    cell_ontology_file=CELL_ONTOLOGY_FILE,
    genespectra_file=GENESPECTRA_FILE,
)



INFO -- Loading data.


In [3]:
# Quick sanity check after loading: the `.shape` of the adapter's `gene`
# attribute (presumably a table of genes -- confirm in the adapter).
adapter.gene.shape

(84743, 2)

In [4]:

# Pull the nodes out of the adapter and hand them to BioCypher, which writes
# them to its output directory.
node_stream = adapter.get_nodes()
bc.write_nodes(node_stream)

INFO -- Loading ontologies...
INFO -- Instantiating OntologyAdapter class for https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl.
INFO -- Creating output directory `/Users/ysong/SOFTWARE/GeneSpectraKG/biocypher-out/20240123124404`.
INFO -- Generating nodes.


get gene nodes from genespectra and EggNOG




get OG nodes from EggNOG
get species nodes from EggNOG
get cell type nodes from cell ontology info


INFO -- Writing 84741 entries to Gene-part000.csv
INFO -- Writing 76839 entries to OrthologousGroup-part000.csv
INFO -- Writing 5 entries to Species-part000.csv
INFO -- Writing 22 entries to CellType-part000.csv


True

In [5]:

# Pull the edges out of the adapter and hand them to BioCypher, which writes
# them to its output directory.
edge_stream = adapter.get_edges()
bc.write_edges(edge_stream)


INFO -- Generating edges.


Get cell type from species edges
Get gene from species edges
Get gene from OG edges
Get gene enriched in cell type edges
Get gene enhanced in cell type edges


INFO -- Writing 110 entries to CellTypeFromSpecies-part000.csv
INFO -- Writing 84741 entries to GeneFromSpecies-part000.csv
INFO -- Writing 414386 entries to GeneInOrthologousGroup-part000.csv
INFO -- Writing 23672 entries to GeneEnrichedInCellType-part000.csv
INFO -- Writing 15145 entries to GeneEnhancedInCellType-part000.csv


True

In [6]:

# Write the neo4j-admin import call (a shell script) into the BioCypher
# output directory.
bc.write_import_call()

# Print a summary of the run: duplicate node/edge types, missing labels and
# the ontology structure.
# NOTE(review): the recorded run reports duplicate `gene` nodes and duplicate
# edges for three edge types -- check the input tables for repeated rows.
bc.summary()

INFO -- Writing neo4j import call to `/Users/ysong/SOFTWARE/GeneSpectraKG/biocypher-out/20240123124404/neo4j-admin-import-call.sh`.
INFO -- Duplicate node types encountered (IDs in log): 
    gene

INFO -- Duplicate edge types encountered (IDs in log): 
    gene from species
    gene enriched in cell type
    gene enhanced in cell type

INFO -- No missing labels in input.


Showing ontology structure based on https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl
entity
├── association
│   ├── cell type from species
│   ├── gene enhanced in cell type
│   ├── gene enriched in cell type
│   ├── gene from species
│   └── gene to gene family association
│       └── gene in orthologous group
└── named thing
    └── biological entity
        ├── gene
        ├── gene family
        │   └── orthologous group
        └── organismal entity
            ├── anatomical entity
            │   └── cell
            │       └── cell type
            └── cellular organism
                └── species

