In [1]:
from biocypher import BioCypher
from enum import Enum, auto
from itertools import chain
from typing import Optional
from biocypher._logger import logger
import pandas as pd
import hashlib

INFO -- This is BioCypher v0.5.19.
INFO -- Logging into `biocypher-log/biocypher-20241107-205131.log`.


In [2]:
import os

# Run from the repository root so the relative `config/...` and `data/...`
# paths used by later cells resolve. Set the GENESPECTRAKG_ROOT environment
# variable to run this notebook from another checkout; when it is unset the
# original location is used, so existing behaviour is unchanged.
PROJECT_ROOT = os.environ.get('GENESPECTRAKG_ROOT', '/Users/ysong/SOFTWARE/GeneSpectraKG')
os.chdir(PROJECT_ROOT)

In [3]:
from genespectrakg.adapters.genespectra_adapter_wilcox_logit import *

In [4]:

# Build the BioCypher interface for this run. Framework settings come from
# the YAML file named here (path relative to the working directory);
# settings can alternatively be supplied as keyword arguments.
bc = BioCypher(biocypher_config_path="config/biocypher_config.yaml")


In [5]:

# Node types to include in the knowledge graph (enums come from the adapter module).
node_types = [
    GeneSpectraAdapterNodeType.CELL_TYPE,
    GeneSpectraAdapterNodeType.GENE,
    GeneSpectraAdapterNodeType.ORTHOLOGOUS_GROUP,
]

# Node properties to include, grouped by the node type they describe.
cell_type_fields = [
    GeneSpectraAdapterCellTypeField.SPECIES_OF_ORIGIN,
    GeneSpectraAdapterCellTypeField.CELL_TYPE_NAME,
    GeneSpectraAdapterCellTypeField.CELL_TYPE_ONTOLOGY_NAME,
    GeneSpectraAdapterCellTypeField.CELL_TYPE_NAME_AND_SPECIES,
    GeneSpectraAdapterCellTypeField.CELL_TYPE_ID,
    GeneSpectraAdapterCellTypeField.BROAD_TYPE,
    GeneSpectraAdapterCellTypeField.BROAD_TYPE_2,
    GeneSpectraAdapterCellTypeField.BROAD_TYPE_3,
    GeneSpectraAdapterCellTypeField.BROAD_TAXO_CS,
]
gene_fields = [
    GeneSpectraAdapterGeneField.PEPTIDE_ID,
    GeneSpectraAdapterGeneField.PREFERRED_NAME_AND_SPECIES,
    GeneSpectraAdapterGeneField.PREFERRED_NAME_WILCOX,
    GeneSpectraAdapterGeneField.SPECIES_OF_ORIGIN,
    GeneSpectraAdapterGeneField.IS_A_GO_TF,
    GeneSpectraAdapterGeneField.DESCRIPTION,
    GeneSpectraAdapterGeneField.PREFERRED_NAME,
    GeneSpectraAdapterGeneField.PFAMS,
    GeneSpectraAdapterGeneField.GOS,
    GeneSpectraAdapterGeneField.KEGG_KO,
    GeneSpectraAdapterGeneField.KEGG_PATHWAY,
]
orthologous_group_fields = [
    GeneSpectraAdapterOrthologousGroupField.EGGNOG_DATASET_NAME,
    GeneSpectraAdapterOrthologousGroupField.ORTHOLOGOUS_GROUP_ID,
    GeneSpectraAdapterOrthologousGroupField.EGGNOG_DATASET_ID,
]
# One flat list in the order: cell type, gene, orthologous group.
node_fields = [*cell_type_fields, *gene_fields, *orthologous_group_fields]

# Edge types to include in the knowledge graph.
edge_types = [
    GeneSpectraAdapterEdgeType.GENE_WILCOX_MARKER_IN_CELL_TYPE,
    GeneSpectraAdapterEdgeType.GENE_IN_ORTHOLOGOUS_GROUP,
]

# Edge properties to include.
edge_fields = [
    GeneSpectraAdapterEdgeField.AVG_LOG2FC,
    GeneSpectraAdapterEdgeField.P_VAL,
    GeneSpectraAdapterEdgeField.P_VAL_ADJ,
    GeneSpectraAdapterEdgeField.P_VAL_ADJ_RANKING,
    GeneSpectraAdapterEdgeField.AVG_LOG2FC_RANKING,
]

%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:

# Create the GeneSpectra adapter from the node/edge selections chosen above.
adapter_selection = {
    "node_types": node_types,
    "node_fields": node_fields,
    "edge_types": edge_types,
    "edge_fields": edge_fields,
}
adapter = GeneSpectraAdapter(**adapter_selection)


In [7]:
# Show the node fields the adapter holds after construction.
adapter.node_fields

[<GeneSpectraAdapterCellTypeField.SPECIES_OF_ORIGIN: 'species_scientific_name'>,
 <GeneSpectraAdapterCellTypeField.CELL_TYPE_NAME: 'cell_type'>,
 <GeneSpectraAdapterCellTypeField.CELL_TYPE_ONTOLOGY_NAME: 'cell_type_name'>,
 <GeneSpectraAdapterCellTypeField.CELL_TYPE_NAME_AND_SPECIES: 'cell_type_name_and_species'>,
 <GeneSpectraAdapterCellTypeField.CELL_TYPE_ID: 'cell_ontology_id'>,
 <GeneSpectraAdapterCellTypeField.BROAD_TYPE: 'broad_type'>,
 <GeneSpectraAdapterCellTypeField.BROAD_TYPE_2: 'broad_type_2'>,
 <GeneSpectraAdapterCellTypeField.BROAD_TYPE_3: 'broad_type_3'>,
 <GeneSpectraAdapterCellTypeField.BROAD_TAXO_CS: 'broad_taxo_cs'>,
 <GeneSpectraAdapterGeneField.PEPTIDE_ID: 'peptide'>,
 <GeneSpectraAdapterGeneField.PREFERRED_NAME_AND_SPECIES: 'preferred_name_and_species'>,
 <GeneSpectraAdapterGeneField.PREFERRED_NAME_WILCOX: 'ensembl_gene_name_use'>,
 <GeneSpectraAdapterGeneField.SPECIES_OF_ORIGIN: 'species_of_origin'>,
 <GeneSpectraAdapterGeneField.IS_A_GO_TF: 'is_a_GO_tf'>,
 <GeneS

In [8]:
# Show the edge fields the adapter holds after construction.
adapter.edge_fields

[<GeneSpectraAdapterEdgeField.AVG_LOG2FC: 'logfoldchanges'>,
 <GeneSpectraAdapterEdgeField.P_VAL: 'pvals'>,
 <GeneSpectraAdapterEdgeField.P_VAL_ADJ: 'pvals_adj'>,
 <GeneSpectraAdapterEdgeField.P_VAL_ADJ_RANKING: 'pvals_adj_rank'>,
 <GeneSpectraAdapterEdgeField.AVG_LOG2FC_RANKING: 'logfc_rank'>]

In [9]:
# Show the edge types the adapter holds after construction.
adapter.edge_types

[<GeneSpectraAdapterEdgeType.GENE_WILCOX_MARKER_IN_CELL_TYPE: 'gene_wilcox_marker_in_cell_type'>,
 <GeneSpectraAdapterEdgeType.GENE_IN_ORTHOLOGOUS_GROUP: 'gene_in_orthologous_group'>]

In [10]:
# Load the three input tables (eggNOG annotation, cell type -> ontology
# mapping, wilcox markers) into the adapter. Paths are relative to the
# working directory.
adapter.load_genespectra_data(
    eggnog_file='data/wilcox/emapper_mammalia_all_species.csv',
    cell_ontology_file='data/wilcox/MTG_cell_type_to_ontology_broad_with_taxo_cs_wilcox.csv',
    wilcox_marker_file='data/wilcox/all_species_wilcox_marker_processed_ensembl_name_use.csv',
)



INFO -- Loading data.


In [11]:
# Sanity check: list gene rows whose combined name/species key is missing.
missing_key_mask = adapter.gene['preferred_name_and_species'].isna()
adapter.gene.loc[missing_key_mask, :]

Unnamed: 0,peptide,species_of_origin,is_a_GO_tf,Description,Preferred_name,PFAMs,GOs,KEGG_ko,KEGG_Pathway,ensembl_gene_name_use,gene_name_use,preferred_name_and_species


In [12]:
# Unique (gene name, species) pairs present in the wilcox marker table.
marker_key_columns = ['ensembl_gene_name_use', 'species_of_origin']
adapter.wilcox_markers[marker_key_columns].drop_duplicates()

Unnamed: 0,ensembl_gene_name_use,species_of_origin
0,RNF219-AS1,P.troglodytes
1,SLC1A3,P.troglodytes
2,ATP1A2,P.troglodytes
3,ETNPPL,P.troglodytes
4,PRDM16,P.troglodytes
...,...,...
491173,RPS26,C.jacchus
491283,RPL14,C.jacchus
491303,RPL18A,C.jacchus
491323,RPL28,C.jacchus


In [13]:
# Unique (preferred name, species) pairs present in the eggNOG table.
eggnog_key_columns = ['Preferred_name', 'species_of_origin']
adapter.eggnog[eggnog_key_columns].drop_duplicates()

Unnamed: 0,Preferred_name,species_of_origin
0,ARRB2,P.troglodytes
1,CRY2,P.troglodytes
2,ARRB1,P.troglodytes
3,CRY1,P.troglodytes
5,IGF1R,P.troglodytes
...,...,...
158339,DAK,C.jacchus
158345,BANP,C.jacchus
158347,SPACA6,C.jacchus
158348,FDPS,C.jacchus


In [14]:
# Why do the primate OGs cover fewer annotations of primate genes?
# TODO: figure this out.

In [15]:
# Unique (species, name-and-species key) pairs in the merged gene table.
gene_key_columns = ['species_of_origin', 'preferred_name_and_species']
adapter.gene[gene_key_columns].drop_duplicates()

Unnamed: 0,species_of_origin,preferred_name_and_species
0,P.troglodytes,ARRB2_P.troglodytes
1,P.troglodytes,CRY2_P.troglodytes
2,P.troglodytes,ARRB1_P.troglodytes
3,P.troglodytes,CRY1_P.troglodytes
4,P.troglodytes,IGF1R_P.troglodytes
...,...,...
92544,C.jacchus,KCNJ12_C.jacchus
92545,C.jacchus,LOC100411273_C.jacchus
92546,C.jacchus,LOC108589496_C.jacchus
92547,C.jacchus,SNX8_C.jacchus


In [16]:
# Preview the first rows of the adapter's cell type table.
adapter.cell_type.head()

Unnamed: 0,species_scientific_name,cell_type_name,cell_ontology_id,cell_type,broad_type,broad_type_2,broad_type_3,broad_taxo_cs,cell_type_name_and_species
0,H.sapiens,astrocyte of the cerebral cortex,CL:0002605,Astro,Non-neuronal,Non-neuronal,Non-neuronal,Astro,Astro_H.sapiens
1,H.sapiens,oligodendrocyte,CL:0000128,Oligo,Non-neuronal,Non-neuronal,Non-neuronal,Oligo,Oligo_H.sapiens
2,H.sapiens,vascular leptomeningeal cell,CL:4023051,VLMC,Non-neuronal,Non-neuronal,Non-neuronal,VLMC,VLMC_H.sapiens
3,H.sapiens,microglial cell,CL:0000129,Micro-PVM,Non-neuronal,Non-neuronal,Non-neuronal,Micro-PVM,Micro-PVM_H.sapiens
4,H.sapiens,oligodendrocyte precursor cell,CL:0002453,OPC,Non-neuronal,Non-neuronal,Non-neuronal,OPC,OPC_H.sapiens


In [17]:
# TOO MANY NAs in the gene table, figure this out - only 9664 entries have an eggnog annotation
# 87605 gene names are involved in markers
# 34710 gene names have an eggnog annotation

# TODO: still need to fix the gene merging because there are many NaNs
# after merging on gene names there are 91667 entries; it is an outer merge, so this means that many marker genes did not have an eggnog annotation
# too few genes have an eggnog annotation - a problem with the emapper run? they should easily have orthology mapping between species
# most likely it is a gene naming problem
# need to do: ncbi gene name (available from cellxgene) -> ensembl gene name, peptide id

# 92970 entries in eggnog but lots of duplication of gene names there

In [18]:
# Found the issue: many genes seem not to be annotated with an OG in Primates.
# Go back to Mammalia?

In [19]:
# Hand the adapter's nodes to BioCypher, which writes them to its output directory.
node_stream = adapter.get_nodes()
bc.write_nodes(node_stream)

INFO -- Loading ontologies...
INFO -- Instantiating OntologyAdapter class for https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl.
INFO -- Creating output directory `/Users/ysong/SOFTWARE/GeneSpectraKG/biocypher-out/20241107205143`.
INFO -- Generating nodes.


get gene nodes from genespectra and EggNOG
finish writing gene nodes
get OG nodes from EggNOG
finish writing OG nodes
get cell type nodes from cell ontology info


INFO -- Writing 92549 entries to Gene-part000.csv
INFO -- Writing 14948 entries to OrthologousGroup-part000.csv
INFO -- Writing 120 entries to CellType-part000.csv


True

In [20]:

# Hand the adapter's edges to BioCypher, which writes them to its output directory.
edge_stream = adapter.get_edges()
bc.write_edges(edge_stream)


INFO -- Generating edges.


Get gene belongs to OG edges
Get gene is a wilcox marker in cell type edges
Yield gene is a wilcox marker in cell type edges
Index(['species_of_origin', 'ensembl_gene_name_use', 'cell_type', 'pvals',
       'logfoldchanges', 'pvals_adj', 'pvals_adj_rank', 'logfc_rank',
       'preferred_name_and_species', 'cell_type_name_and_species'],
      dtype='object')
  species_of_origin ensembl_gene_name_use cell_type pvals logfoldchanges  \
0     P.troglodytes            RNF219-AS1     Astro   0.0      12.537694   
1     P.troglodytes                SLC1A3     Astro   0.0      12.396414   
2     P.troglodytes                ATP1A2     Astro   0.0      11.736455   
3     P.troglodytes                ETNPPL     Astro   0.0      11.129451   
4     P.troglodytes                PRDM16     Astro   0.0      11.017203   

  pvals_adj pvals_adj_rank logfc_rank preferred_name_and_species  \
0       0.0          154.0        1.0   RNF219-AS1_P.troglodytes   
1       0.0          154.0        2.0       SLC

INFO -- Writing 49548 entries to GeneInOrthologousGroup-part000.csv
INFO -- Writing 490218 entries to GeneWilcoxMarkerInCellType-part000.csv


True

In [21]:
# Dimensions (rows, columns) of the adapter's wilcox marker table.
adapter.wilcox_markers.shape

(491558, 8)

In [22]:

# Write the neo4j-admin import call script into the BioCypher output directory.
bc.write_import_call()

# TODO: still happens - the same gene id with different peptide ids leads to
# duplicate gene nodes.
# Print the BioCypher summary (duplicate / missing-label report and ontology structure).
bc.summary()

# NOTE: contains all positive significant markers, regardless of their
# log2fc or p-value ranking.

INFO -- Writing mtg-wilcox import call to `/Users/ysong/SOFTWARE/GeneSpectraKG/biocypher-out/20241107205143/neo4j-admin-import-call.sh`.


INFO -- No duplicate nodes in input.
INFO -- Duplicate edge types encountered (IDs in log): 
    gene wilcox marker in cell type

INFO -- No missing labels in input.


Showing ontology structure based on https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl
entity
├── association
│   ├── gene to gene family association
│   │   └── gene in orthologous group
│   └── gene wilcox marker in cell type
└── named thing
    └── biological entity
        ├── gene
        ├── gene family
        │   └── orthologous group
        └── organismal entity
            └── anatomical entity
                └── cell
                    └── cell type



In [23]:
# Import statement used for the run without one-hot:
#  `/Users/ysong/SOFTWARE/Platy/GeneSpectraKG_Platy/biocypher-out/20241003144626/neo4j-admin-import-call.sh`.

In [24]:
# Import statement used for the run with one-hot:
# /Users/ysong/SOFTWARE/Platy/GeneSpectraKG_Platy/biocypher-out/20241003212531/neo4j-admin-import-call.sh
# TODO: need to fix the one-hot dtype: it is currently a string, but it needs to be a list
# TODO: run this and reconnect to the fastRP part

In [29]:
# Count non-null values per column for every (species, orthologous group)
# pair, then show the 20 pairs with the most `Preferred_name` entries.
og_counts = adapter.eggnog.groupby(['species_of_origin', 'og_id']).count()
og_counts.sort_values('Preferred_name', ascending=False).head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,peptide,is_a_GO_tf,Description,Preferred_name,PFAMs,GOs,KEGG_ko,KEGG_Pathway,eggnog_dataset_name,eggnog_dataset_id
species_of_origin,og_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
H.sapiens,3JAMA,40,40,40,40,40,40,40,40,40,40
P.troglodytes,3JAMA,33,33,33,33,33,33,33,33,33,33
G.gorilla,3JAMA,26,26,26,26,26,26,26,26,26,26
M.mulatta,3JAMA,19,19,19,19,19,19,19,19,19,19
H.sapiens,3J6D4,15,15,15,15,15,15,15,15,15,15
H.sapiens,3JE91,14,14,14,14,14,14,14,14,14,14
G.gorilla,3JE91,10,10,10,10,10,10,10,10,10,10
G.gorilla,3J6D4,9,9,9,9,9,9,9,9,9,9
P.troglodytes,3JE91,9,9,9,9,9,9,9,9,9,9
P.troglodytes,3J6D4,8,8,8,8,8,8,8,8,8,8
