In [1]:
import os
# Move into the project root so the relative `config/` and `data/` paths used
# by the cells below resolve. Set the GENESPECTRAKG_ROOT environment variable
# to override the default checkout location on another machine.
PROJECT_ROOT = os.environ.get('GENESPECTRAKG_ROOT', '/Users/ysong/SOFTWARE/GeneSpectraKG')
os.chdir(PROJECT_ROOT)
# Last expression of the cell, so the notebook displays the directory now in effect.
os.getcwd()

In [2]:
from biocypher import BioCypher
from genespectrakg.adapters.genespectra_adapter_individual import (
    GeneSpectraAdapter,
    GeneSpectraAdapterNodeType,
    GeneSpectraAdapterEdgeType,
    GeneSpectraAdapterCellTypeField,
    GeneSpectraAdapterGeneField,
    GeneSpectraAdapterOrthologousGroupField,
    GeneSpectraAdapterSpeciesField,
    GeneSpectraAdapterEdgeField,
)

# Set up the BioCypher interface. Framework settings are read from
# `config/biocypher_config.yaml`; they can alternatively be supplied as
# keyword arguments to the constructor.
biocypher_config_path = "config/biocypher_config.yaml"
bc = BioCypher(biocypher_config_path=biocypher_config_path)

# Node types to include in the knowledge graph. The enum is defined in the
# adapter module (`genespectrakg.adapters.genespectra_adapter_individual`).
NodeType = GeneSpectraAdapterNodeType
node_types = [
    NodeType.CELL_TYPE,
    NodeType.GENE,
    NodeType.SPECIES,
    NodeType.ORTHOLOGOUS_GROUP,
]

# Node properties to include in the knowledge graph, grouped by the node type
# they describe. The field enums are defined in the adapter module
# (`genespectrakg.adapters.genespectra_adapter_individual`).
cell_type_fields = [
    GeneSpectraAdapterCellTypeField.CELL_TYPE_ID,
    GeneSpectraAdapterCellTypeField.CELL_TYPE_NAME,
    GeneSpectraAdapterCellTypeField.TISSUE_ID,
    GeneSpectraAdapterCellTypeField.TISSUE_NAME,
    GeneSpectraAdapterCellTypeField.BROAD_TYPE,
    GeneSpectraAdapterCellTypeField.BROAD_TYPE_2,
    GeneSpectraAdapterCellTypeField.BROAD_TYPE_3,
]
gene_fields = [
    GeneSpectraAdapterGeneField.GENE_ID,
    GeneSpectraAdapterGeneField.GENE_NAME,
    GeneSpectraAdapterGeneField.PEPTIDE_ID,
]
species_fields = [
    GeneSpectraAdapterSpeciesField.SPECIES_ID,
    GeneSpectraAdapterSpeciesField.SPECIES_NAME,
]
orthologous_group_fields = [
    GeneSpectraAdapterOrthologousGroupField.EGGNOG_DATASET_NAME,
    GeneSpectraAdapterOrthologousGroupField.ORTHOLOGOUS_GROUP_ID,
    GeneSpectraAdapterOrthologousGroupField.EGGNOG_DATASET_ID,
]

# Flatten the groups into the single list passed to the adapter, keeping the
# order: cell type, gene, species, orthologous group.
node_fields = [
    *cell_type_fields,
    *gene_fields,
    *species_fields,
    *orthologous_group_fields,
]

# Relationship types to include in the knowledge graph.
EdgeType = GeneSpectraAdapterEdgeType
edge_types = [
    EdgeType.CELL_TYPE_FROM_SPECIES,
    EdgeType.GENE_ENHANCED_IN_CELL_TYPE,
    EdgeType.GENE_ENRICHED_IN_CELL_TYPE,
    EdgeType.GENE_FROM_SPECIES,
    EdgeType.GENE_IN_ORTHOLOGOUS_GROUP,
    EdgeType.GENE_LOW_SPECIFICITY_IN_SPECIES,
]

# Relationship properties to include in the knowledge graph.
EdgeField = GeneSpectraAdapterEdgeField
edge_fields = [
    EdgeField.SPECIFICITY_CATEGORY,
    EdgeField.DISTRIBUTION_CATEGORY,
    EdgeField.SPECIFICITY_CATEGORY_TYPE,
    EdgeField.FRACTION_EXPRESSED,
    EdgeField.MAX_EXPRESSION,
    EdgeField.MEAN_EXPRESSION,
    EdgeField.NUMBER_EXPRESSED,
    EdgeField.SPECIFICITY_SCORE,
    EdgeField.GROUPS_EXPRESSED,
]

%load_ext autoreload
%autoreload 2


INFO -- This is BioCypher v0.5.19.
INFO -- Logging into `biocypher-log/biocypher-20240729-142635.log`.


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:

# Build the GeneSpectra adapter, limited to the node/edge types and
# properties selected in the previous cell.
adapter_selection = dict(
    node_types=node_types,
    node_fields=node_fields,
    edge_types=edge_types,
    edge_fields=edge_fields,
)
adapter = GeneSpectraAdapter(**adapter_selection)

# Input tables, given relative to the current working directory.
eggnog_file = 'data/MTG_eggnog_ensembl_mapped_5_taxa.csv'
cell_ontology_file = 'data/MTG_cell_type_to_ontology_broad.csv'
genespectra_file = 'data/1TPM_lim/all_species_classes_subclass_1TPM_ensembl_name_processed_all_spec_and_lowspec.csv'

# Load the three tables into the adapter.
adapter.load_genespectra_data(
    eggnog_file=eggnog_file,
    cell_ontology_file=cell_ontology_file,
    genespectra_file=genespectra_file,
)



INFO -- Loading data.


In [4]:
# Shrink the dataset: keep only orthologous-group rows whose EggNOG dataset
# name is in the list below, and store the result back on the adapter.
# NOTE(review): the recorded outputs further down still list all five EggNOG
# datasets, so this filter does not appear to have been in effect in the saved
# run -- confirm by re-running the notebook top to bottom.
kept_eggnog_datasets = ['primates',  'vertebrata']
is_kept_dataset = adapter.orthologous_group.eggnog_dataset_name.isin(kept_eggnog_datasets)
adapter.orthologous_group = adapter.orthologous_group.loc[is_kept_dataset]

In [4]:
# Tally the rows of the orthologous-group table per EggNOG dataset name.
adapter.orthologous_group["eggnog_dataset_name"].value_counts()

eggnog_dataset_name
primates      18894
mammalia      17172
vertebrata    14465
bilateria     13280
metazoa       13028
Name: count, dtype: int64

In [6]:
# Preview the cell-type table. `.head(10)` bounds the rendered output instead
# of dumping the whole frame into the notebook.
adapter.cell_type.head(10)

Unnamed: 0,cell_ontology_id,cell_type_name,uberon_tissue_id,tissue_name,broad_type,broad_type_2,broad_type_3
0,CL:0002605,astrocyte of the cerebral cortex,UBERON:0002771,middle temporal gyrus,Non-neuronal,Non-neuronal,Non-neuronal
1,CL:0000128,oligodendrocyte,UBERON:0002771,middle temporal gyrus,Non-neuronal,Non-neuronal,Non-neuronal
2,CL:4023051,vascular leptomeningeal cell,UBERON:0002771,middle temporal gyrus,Non-neuronal,Non-neuronal,Non-neuronal
3,CL:0000129,microglial cell,UBERON:0002771,middle temporal gyrus,Non-neuronal,Non-neuronal,Non-neuronal
4,CL:0002453,oligodendrocyte precursor cell,UBERON:0002771,middle temporal gyrus,Non-neuronal,Non-neuronal,Non-neuronal
5,CL:1001602,cerebral cortex endothelial cell,UBERON:0002771,middle temporal gyrus,Non-neuronal,Non-neuronal,Non-neuronal
6,CL:4023012,near-projecting glutamatergic cortical neuron,UBERON:0002771,middle temporal gyrus,Deep Exc,Exc,Neuronal
7,CL:4023013,corticothalamic-projecting glutamatergic corti...,UBERON:0002771,middle temporal gyrus,Deep Exc,Exc,Neuronal
8,CL:4023038,L6b glutamatergic cortical neuron,UBERON:0002771,middle temporal gyrus,Deep Exc,Exc,Neuronal
9,CL:4023041,L5 extratelencephalic projecting glutamatergic...,UBERON:0002771,middle temporal gyrus,Deep Exc,Exc,Neuronal


In [5]:

# Hand the adapter's nodes to BioCypher for writing. The write call stays the
# last expression of the cell so its return value is displayed.
nodes = adapter.get_nodes()
bc.write_nodes(nodes)

INFO -- Loading ontologies...
INFO -- Instantiating OntologyAdapter class for https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl.
INFO -- Creating output directory `/Users/ysong/SOFTWARE/GeneSpectraKG/biocypher-out/20240729142653`.
INFO -- Generating nodes.


get gene nodes from genespectra and EggNOG




get OG nodes from EggNOG
get species nodes from EggNOG
get cell type nodes from cell ontology info


INFO -- Writing 86803 entries to Gene-part000.csv
INFO -- Writing 76839 entries to OrthologousGroup-part000.csv
INFO -- Writing 5 entries to Species-part000.csv
INFO -- Writing 22 entries to CellType-part000.csv


True

In [6]:

# Hand the adapter's edges to BioCypher for writing. The write call stays the
# last expression of the cell so its return value is displayed.
edges = adapter.get_edges()
bc.write_edges(edges)


INFO -- Generating edges.


Get cell type from species edges
Get gene from species edges
Get gene from OG edges
Get gene enriched in cell type edges
Get gene enhanced in cell type edges
Get gene low specifricity in species edges


INFO -- Writing 110 entries to CellTypeFromSpecies-part000.csv
INFO -- Writing 86803 entries to GeneFromSpecies-part000.csv
INFO -- Writing 414386 entries to GeneInOrthologousGroup-part000.csv
INFO -- Writing 23672 entries to GeneEnrichedInCellType-part000.csv
INFO -- Writing 15145 entries to GeneEnhancedInCellType-part000.csv
INFO -- Writing 42630 entries to GeneLowSpecificityInSpecies-part000.csv


True

In [7]:

# Write the admin import call script (the recorded log below shows it saved
# as `neo4j-admin-import-call.sh` in the BioCypher output directory).
bc.write_import_call()

# Print a summary of the run; in the recorded output this reports duplicate
# node/edge types, missing labels, and the ontology structure.
bc.summary()

INFO -- Writing mtg-pv import call to `/Users/ysong/SOFTWARE/GeneSpectraKG/biocypher-out/20240729142653/neo4j-admin-import-call.sh`.
INFO -- Duplicate node types encountered (IDs in log): 
    gene

INFO -- Duplicate edge types encountered (IDs in log): 
    gene enriched in cell type
    gene from species
    gene in orthologous group
    gene enhanced in cell type
    gene low specificity in species

INFO -- No missing labels in input.


Showing ontology structure based on https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl
entity
├── association
│   ├── cell type from species
│   ├── gene enhanced in cell type
│   ├── gene enriched in cell type
│   ├── gene from species
│   ├── gene low specificity in species
│   └── gene to gene family association
│       └── gene in orthologous group
└── named thing
    └── biological entity
        ├── gene
        ├── gene family
        │   └── orthologous group
        └── organismal entity
            ├── anatomical entity
            │   └── cell
            │       └── cell type
            └── cellular organism
                └── species

