# Neo4j format converter

This notebook converts the first version of the ontology-based annotated files to the Neo4j CSV import format (one nodes file and one edges file).
### Imports

In [40]:
import pandas as pd
import datetime

# current local date; interpolated into the output file names when saving
today = datetime.datetime.now().date()

### Workflow

In [41]:
# list the *txt files in the working directory (IPython shell escape);
# these are the annotation tables read by the cells below
!ls *txt

top25.chebi-celltype_human.txt.txt  top25.chebi-pathway_human.txt.txt
top25.chebi-celltype_plant.txt.txt  top25.chebi-Pathway_plant.txt.txt
top25.chebi-doid_human.txt.txt	    top25.chebi-PHENO_plant.txt.txt
top25.chebi-envo_human.txt.txt	    top25.chebi-PO_plant.txt.txt
top25.chebi-envo_plant.txt.txt	    top25.chebi-uberon_human.txt.txt
top25.chebi-go_human.txt.txt	    top25.human.chebi-pheno.txt.txt
top25.chebi-GO_plant.txt.txt


In [42]:
# load the tab-separated ChEBI / cell-type annotation file (human)
df = pd.read_csv('./top25.chebi-celltype_human.txt.txt', sep='\t')
print('data structure: {0} annotations and {1} data fields'.format(*df.shape))

# edges in neo4j CSV format: every row is typed with
# RO:0002616 ("related via evidence or inference to")
clh = (
    df.assign(
        TYPE='RO:0002616',
        PROPERTY_LABEL='related via evidence or inference to',
        SPECIES='Human',
        SENTENCE=None,  # column is created but left empty in this cell
        PMCID=None,     # column is created but left empty in this cell
    )
    .rename(columns={
        'ID#1': ':START_ID',
        'TYPE': ':TYPE',
        'ID#2': ':END_ID',
        '#PMCIDs': 'PMCIDs',
    })
)
clh_edges = clh[[
    ':START_ID', ':TYPE', ':END_ID', 'PROPERTY_LABEL', 'SPECIES',
    'SENTENCE', 'PMCID', 'NPMI_score', 'PMCIDs',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*clh_edges.shape))

# nodes, subject side: ID#1 / Name#1 tagged with ChEBI metadata
clh_sub = (
    df[['ID#1', 'Name#1']]
    .assign(
        LABEL='METABOLITE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/chebi.owl',
        ONTOLOGY_NAME='Chemical Entities of Biological Interest Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/chebi/183/chebi.owl',
    )
    .rename(columns={'ID#1': 'ID:ID', 'LABEL': ':LABEL', 'Name#1': 'NAME'})
)
# nodes, object side: ID#2 / Name#2 tagged with Cell Ontology metadata
clh_obj = (
    df[['ID#2', 'Name#2']]
    .assign(
        LABEL='CELL_TYPE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/cl.owl',
        ONTOLOGY_NAME='Cell Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/cl/releases/2020-01-06',
    )
    .rename(columns={'ID#2': 'ID:ID', 'LABEL': ':LABEL', 'Name#2': 'NAME'})
)
# stack both sides, drop exact duplicate rows, then fix the column order
clh_nodes = pd.concat([clh_sub, clh_obj]).drop_duplicates()
clh_nodes = clh_nodes[[
    'ID:ID', ':LABEL', 'NAME', 'ONTOLOGY_IRI', 'ONTOLOGY_NAME', 'ONTOLOGY_VERSION_IRI',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*clh_nodes.shape))

data structure: 48701 annotations and 6 data fields
data structure: 48701 annotations and 9 data fields
data structure: 3207 annotations and 6 data fields


In [43]:
# load the tab-separated ChEBI / cell-type annotation file (plant)
df = pd.read_csv('./top25.chebi-celltype_plant.txt.txt', sep='\t')
print('data structure: {0} annotations and {1} data fields'.format(*df.shape))

# edges in neo4j CSV format: every row is typed with
# RO:0002616 ("related via evidence or inference to")
clp = (
    df.assign(
        TYPE='RO:0002616',
        PROPERTY_LABEL='related via evidence or inference to',
        SPECIES='Plant',
        SENTENCE=None,  # column is created but left empty in this cell
        PMCID=None,     # column is created but left empty in this cell
    )
    .rename(columns={
        'ID#1': ':START_ID',
        'TYPE': ':TYPE',
        'ID#2': ':END_ID',
        '#PMCIDs': 'PMCIDs',
    })
)
clp_edges = clp[[
    ':START_ID', ':TYPE', ':END_ID', 'PROPERTY_LABEL', 'SPECIES',
    'SENTENCE', 'PMCID', 'NPMI_score', 'PMCIDs',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*clp_edges.shape))

# nodes, subject side: ID#1 / Name#1 tagged with ChEBI metadata
clp_sub = (
    df[['ID#1', 'Name#1']]
    .assign(
        LABEL='METABOLITE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/chebi.owl',
        ONTOLOGY_NAME='Chemical Entities of Biological Interest Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/chebi/183/chebi.owl',
    )
    .rename(columns={'ID#1': 'ID:ID', 'LABEL': ':LABEL', 'Name#1': 'NAME'})
)
# nodes, object side: ID#2 / Name#2 tagged with Cell Ontology metadata
clp_obj = (
    df[['ID#2', 'Name#2']]
    .assign(
        LABEL='CELL_TYPE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/cl.owl',
        ONTOLOGY_NAME='Cell Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/cl/releases/2020-01-06',
    )
    .rename(columns={'ID#2': 'ID:ID', 'LABEL': ':LABEL', 'Name#2': 'NAME'})
)
# stack both sides, drop exact duplicate rows, then fix the column order
clp_nodes = pd.concat([clp_sub, clp_obj]).drop_duplicates()
clp_nodes = clp_nodes[[
    'ID:ID', ':LABEL', 'NAME', 'ONTOLOGY_IRI', 'ONTOLOGY_NAME', 'ONTOLOGY_VERSION_IRI',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*clp_nodes.shape))

data structure: 40109 annotations and 6 data fields
data structure: 40109 annotations and 9 data fields
data structure: 3042 annotations and 6 data fields


In [44]:
# load the tab-separated ChEBI / disease (DOID) annotation file (human)
df = pd.read_csv('./top25.chebi-doid_human.txt.txt', sep='\t')
print('data structure: {0} annotations and {1} data fields'.format(*df.shape))

# edges in neo4j CSV format: every row is typed with
# RO:0002616 ("related via evidence or inference to")
do = (
    df.assign(
        TYPE='RO:0002616',
        PROPERTY_LABEL='related via evidence or inference to',
        SPECIES='Human',
        SENTENCE=None,  # column is created but left empty in this cell
        PMCID=None,     # column is created but left empty in this cell
    )
    .rename(columns={
        'ID#1': ':START_ID',
        'TYPE': ':TYPE',
        'ID#2': ':END_ID',
        '#PMCIDs': 'PMCIDs',
    })
)
do_edges = do[[
    ':START_ID', ':TYPE', ':END_ID', 'PROPERTY_LABEL', 'SPECIES',
    'SENTENCE', 'PMCID', 'NPMI_score', 'PMCIDs',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*do_edges.shape))

# nodes, subject side: ID#1 / Name#1 tagged with ChEBI metadata
do_sub = (
    df[['ID#1', 'Name#1']]
    .assign(
        LABEL='METABOLITE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/chebi.owl',
        ONTOLOGY_NAME='Chemical Entities of Biological Interest Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/chebi/183/chebi.owl',
    )
    .rename(columns={'ID#1': 'ID:ID', 'LABEL': ':LABEL', 'Name#1': 'NAME'})
)
# nodes, object side: ID#2 / Name#2 tagged with Human Disease Ontology metadata
do_obj = (
    df[['ID#2', 'Name#2']]
    .assign(
        LABEL='DISEASE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/doid.owl',
        ONTOLOGY_NAME='Human Disease Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/doid/releases/2020-01-15/doid.owl',
    )
    .rename(columns={'ID#2': 'ID:ID', 'LABEL': ':LABEL', 'Name#2': 'NAME'})
)
# stack both sides, drop exact duplicate rows, then fix the column order
do_nodes = pd.concat([do_sub, do_obj]).drop_duplicates()
do_nodes = do_nodes[[
    'ID:ID', ':LABEL', 'NAME', 'ONTOLOGY_IRI', 'ONTOLOGY_NAME', 'ONTOLOGY_VERSION_IRI',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*do_nodes.shape))

data structure: 52635 annotations and 6 data fields
data structure: 52635 annotations and 9 data fields
data structure: 6827 annotations and 6 data fields


In [45]:
# load the tab-separated ChEBI / environment (ENVO) annotation file (human)
df = pd.read_csv('./top25.chebi-envo_human.txt.txt', sep='\t')
print('data structure: {0} annotations and {1} data fields'.format(*df.shape))

# edges in neo4j CSV format: every row is typed with
# RO:0002616 ("related via evidence or inference to")
envoh = (
    df.assign(
        TYPE='RO:0002616',
        PROPERTY_LABEL='related via evidence or inference to',
        SPECIES='Human',
        SENTENCE=None,  # column is created but left empty in this cell
        PMCID=None,     # column is created but left empty in this cell
    )
    .rename(columns={
        'ID#1': ':START_ID',
        'TYPE': ':TYPE',
        'ID#2': ':END_ID',
        '#PMCIDs': 'PMCIDs',
    })
)
envoh_edges = envoh[[
    ':START_ID', ':TYPE', ':END_ID', 'PROPERTY_LABEL', 'SPECIES',
    'SENTENCE', 'PMCID', 'NPMI_score', 'PMCIDs',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*envoh_edges.shape))

# nodes, subject side: ID#1 / Name#1 tagged with ChEBI metadata
envoh_sub = (
    df[['ID#1', 'Name#1']]
    .assign(
        LABEL='METABOLITE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/chebi.owl',
        ONTOLOGY_NAME='Chemical Entities of Biological Interest Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/chebi/183/chebi.owl',
    )
    .rename(columns={'ID#1': 'ID:ID', 'LABEL': ':LABEL', 'Name#1': 'NAME'})
)
# nodes, object side: ID#2 / Name#2 tagged with Environment Ontology metadata
# (the version IRI here is the same unversioned PURL as the ontology IRI)
envoh_obj = (
    df[['ID#2', 'Name#2']]
    .assign(
        LABEL='ENVIRONMENTAL',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/envo.owl',
        ONTOLOGY_NAME='Environment Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/envo.owl',
    )
    .rename(columns={'ID#2': 'ID:ID', 'LABEL': ':LABEL', 'Name#2': 'NAME'})
)
# stack both sides, drop exact duplicate rows, then fix the column order
envoh_nodes = pd.concat([envoh_sub, envoh_obj]).drop_duplicates()
envoh_nodes = envoh_nodes[[
    'ID:ID', ':LABEL', 'NAME', 'ONTOLOGY_IRI', 'ONTOLOGY_NAME', 'ONTOLOGY_VERSION_IRI',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*envoh_nodes.shape))

data structure: 44547 annotations and 6 data fields
data structure: 44547 annotations and 9 data fields
data structure: 3760 annotations and 6 data fields


In [46]:
# load the tab-separated ChEBI / environment (ENVO) annotation file (plant)
df = pd.read_csv('./top25.chebi-envo_plant.txt.txt', sep='\t')
print('data structure: {0} annotations and {1} data fields'.format(*df.shape))

# edges in neo4j CSV format: every row is typed with
# RO:0002616 ("related via evidence or inference to")
envop = (
    df.assign(
        TYPE='RO:0002616',
        PROPERTY_LABEL='related via evidence or inference to',
        SPECIES='Plant',
        SENTENCE=None,  # column is created but left empty in this cell
        PMCID=None,     # column is created but left empty in this cell
    )
    .rename(columns={
        'ID#1': ':START_ID',
        'TYPE': ':TYPE',
        'ID#2': ':END_ID',
        '#PMCIDs': 'PMCIDs',
    })
)
envop_edges = envop[[
    ':START_ID', ':TYPE', ':END_ID', 'PROPERTY_LABEL', 'SPECIES',
    'SENTENCE', 'PMCID', 'NPMI_score', 'PMCIDs',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*envop_edges.shape))

# nodes, subject side: ID#1 / Name#1 tagged with ChEBI metadata
envop_sub = (
    df[['ID#1', 'Name#1']]
    .assign(
        LABEL='METABOLITE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/chebi.owl',
        ONTOLOGY_NAME='Chemical Entities of Biological Interest Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/chebi/183/chebi.owl',
    )
    .rename(columns={'ID#1': 'ID:ID', 'LABEL': ':LABEL', 'Name#1': 'NAME'})
)
# nodes, object side: ID#2 / Name#2 tagged with Environment Ontology metadata
# (the version IRI here is the same unversioned PURL as the ontology IRI)
envop_obj = (
    df[['ID#2', 'Name#2']]
    .assign(
        LABEL='ENVIRONMENTAL',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/envo.owl',
        ONTOLOGY_NAME='Environment Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/envo.owl',
    )
    .rename(columns={'ID#2': 'ID:ID', 'LABEL': ':LABEL', 'Name#2': 'NAME'})
)
# stack both sides, drop exact duplicate rows, then fix the column order
envop_nodes = pd.concat([envop_sub, envop_obj]).drop_duplicates()
envop_nodes = envop_nodes[[
    'ID:ID', ':LABEL', 'NAME', 'ONTOLOGY_IRI', 'ONTOLOGY_NAME', 'ONTOLOGY_VERSION_IRI',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*envop_nodes.shape))

data structure: 36116 annotations and 6 data fields
data structure: 36116 annotations and 9 data fields
data structure: 3782 annotations and 6 data fields


In [47]:
# load the tab-separated ChEBI / anatomy (Uberon) annotation file (human)
df = pd.read_csv('./top25.chebi-uberon_human.txt.txt', sep='\t')
print('data structure: {0} annotations and {1} data fields'.format(*df.shape))

# edges in neo4j CSV format: every row is typed with
# RO:0002616 ("related via evidence or inference to")
uber = (
    df.assign(
        TYPE='RO:0002616',
        PROPERTY_LABEL='related via evidence or inference to',
        SPECIES='Human',
        SENTENCE=None,  # column is created but left empty in this cell
        PMCID=None,     # column is created but left empty in this cell
    )
    .rename(columns={
        'ID#1': ':START_ID',
        'TYPE': ':TYPE',
        'ID#2': ':END_ID',
        '#PMCIDs': 'PMCIDs',
    })
)
uber_edges = uber[[
    ':START_ID', ':TYPE', ':END_ID', 'PROPERTY_LABEL', 'SPECIES',
    'SENTENCE', 'PMCID', 'NPMI_score', 'PMCIDs',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*uber_edges.shape))

# nodes, subject side: ID#1 / Name#1 tagged with ChEBI metadata
uber_sub = (
    df[['ID#1', 'Name#1']]
    .assign(
        LABEL='METABOLITE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/chebi.owl',
        ONTOLOGY_NAME='Chemical Entities of Biological Interest Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/chebi/183/chebi.owl',
    )
    .rename(columns={'ID#1': 'ID:ID', 'LABEL': ':LABEL', 'Name#1': 'NAME'})
)
# nodes, object side: ID#2 / Name#2 tagged with Uberon metadata
uber_obj = (
    df[['ID#2', 'Name#2']]
    .assign(
        LABEL='ANATOMY',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/uberon.owl',
        ONTOLOGY_NAME='Uberon multi-species anatomy ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/uberon/releases/2019-11-22/uberon.owl',
    )
    .rename(columns={'ID#2': 'ID:ID', 'LABEL': ':LABEL', 'Name#2': 'NAME'})
)
# stack both sides, drop exact duplicate rows, then fix the column order
uber_nodes = pd.concat([uber_sub, uber_obj]).drop_duplicates()
uber_nodes = uber_nodes[[
    'ID:ID', ':LABEL', 'NAME', 'ONTOLOGY_IRI', 'ONTOLOGY_NAME', 'ONTOLOGY_VERSION_IRI',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*uber_nodes.shape))

data structure: 55317 annotations and 6 data fields
data structure: 55317 annotations and 9 data fields
data structure: 7248 annotations and 6 data fields


In [48]:
# load the tab-separated ChEBI / Plant Ontology (PO) annotation file (plant)
df = pd.read_csv('./top25.chebi-PO_plant.txt.txt', sep='\t')
print('data structure: {0} annotations and {1} data fields'.format(*df.shape))

# edges in neo4j CSV format: every row is typed with
# RO:0002616 ("related via evidence or inference to")
po = (
    df.assign(
        TYPE='RO:0002616',
        PROPERTY_LABEL='related via evidence or inference to',
        SPECIES='Plant',
        SENTENCE=None,  # column is created but left empty in this cell
        PMCID=None,     # column is created but left empty in this cell
    )
    .rename(columns={
        'ID#1': ':START_ID',
        'TYPE': ':TYPE',
        'ID#2': ':END_ID',
        '#PMCIDs': 'PMCIDs',
    })
)
po_edges = po[[
    ':START_ID', ':TYPE', ':END_ID', 'PROPERTY_LABEL', 'SPECIES',
    'SENTENCE', 'PMCID', 'NPMI_score', 'PMCIDs',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*po_edges.shape))

# nodes, subject side: ID#1 / Name#1 tagged with ChEBI metadata
po_sub = (
    df[['ID#1', 'Name#1']]
    .assign(
        LABEL='METABOLITE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/chebi.owl',
        ONTOLOGY_NAME='Chemical Entities of Biological Interest Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/chebi/183/chebi.owl',
    )
    .rename(columns={'ID#1': 'ID:ID', 'LABEL': ':LABEL', 'Name#1': 'NAME'})
)
# nodes, object side: ID#2 / Name#2 tagged with Plant Ontology metadata.
# FIX: the name and version IRI previously described Uberon (copy-paste from
# the Uberon cell) although the ontology IRI is po.owl.
# TODO: replace ONTOLOGY_VERSION_IRI with the versioned release IRI of the PO
# build used for annotation; until then the unversioned PURL is used.
po_obj = (
    df[['ID#2', 'Name#2']]
    .assign(
        LABEL='ANATOMY',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/po.owl',
        ONTOLOGY_NAME='Plant Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/po.owl',
    )
    .rename(columns={'ID#2': 'ID:ID', 'LABEL': ':LABEL', 'Name#2': 'NAME'})
)
# stack both sides, drop exact duplicate rows, then fix the column order
po_nodes = pd.concat([po_sub, po_obj]).drop_duplicates()
po_nodes = po_nodes[[
    'ID:ID', ':LABEL', 'NAME', 'ONTOLOGY_IRI', 'ONTOLOGY_NAME', 'ONTOLOGY_VERSION_IRI',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*po_nodes.shape))

data structure: 26133 annotations and 6 data fields
data structure: 26133 annotations and 9 data fields
data structure: 2952 annotations and 6 data fields


In [49]:
# load the tab-separated ChEBI / phenotype annotation file (human)
df = pd.read_csv('./top25.human.chebi-pheno.txt.txt', sep='\t')
print('data structure: {0} annotations and {1} data fields'.format(*df.shape))

# edges in neo4j CSV format: every row is typed with
# RO:0002616 ("related via evidence or inference to")
pheno = (
    df.assign(
        TYPE='RO:0002616',
        PROPERTY_LABEL='related via evidence or inference to',
        # FIX: this is the human file; it was previously tagged 'Plant'
        SPECIES='Human',
        SENTENCE=None,  # column is created but left empty in this cell
        PMCID=None,     # column is created but left empty in this cell
        # unlike the other cells, PMCIDs is set to None for every row here
        # instead of being renamed from a '#PMCIDs' column
        PMCIDs=None,
    )
    .rename(columns={
        'ID#1': ':START_ID',
        'TYPE': ':TYPE',
        'ID#2': ':END_ID',
    })
)
pheno_edges = pheno[[
    ':START_ID', ':TYPE', ':END_ID', 'PROPERTY_LABEL', 'SPECIES',
    'SENTENCE', 'PMCID', 'NPMI_score', 'PMCIDs',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*pheno_edges.shape))

# nodes, subject side: ID#1 / Name#1 tagged with ChEBI metadata
pheno_sub = (
    df[['ID#1', 'Name#1']]
    .assign(
        LABEL='METABOLITE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/chebi.owl',
        ONTOLOGY_NAME='Chemical Entities of Biological Interest Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/chebi/183/chebi.owl',
    )
    .rename(columns={'ID#1': 'ID:ID', 'LABEL': ':LABEL', 'Name#1': 'NAME'})
)
# nodes, object side: ID#2 / Name#2 tagged with phenotype ontology metadata
# (each field is a JSON-style list written as a single string).
# NOTE(review): ONTOLOGY_IRI lists flopo.owl while the names and version IRIs
# describe the Human Phenotype Ontology (hp) -- confirm which is intended.
pheno_obj = (
    df[['ID#2', 'Name#2']]
    .assign(
        LABEL='PHENOTYPE',
        ONTOLOGY_IRI='["http://purl.obolibrary.org/obo/flopo.owl","http://purl.obolibrary.org/obo/mp.owl"]',
        ONTOLOGY_NAME='["Human Phenotype Ontology","Mammalian Phenotype Ontology"]',
        ONTOLOGY_VERSION_IRI='["http://purl.obolibrary.org/obo/hp/releases/2019-11-08/hp.owl","http://purl.obolibrary.org/obo/mp/releases/2019-11-07"]',
    )
    .rename(columns={'ID#2': 'ID:ID', 'LABEL': ':LABEL', 'Name#2': 'NAME'})
)
# stack both sides, drop exact duplicate rows, then fix the column order
pheno_nodes = pd.concat([pheno_sub, pheno_obj]).drop_duplicates()
pheno_nodes = pheno_nodes[[
    'ID:ID', ':LABEL', 'NAME', 'ONTOLOGY_IRI', 'ONTOLOGY_NAME', 'ONTOLOGY_VERSION_IRI',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*pheno_nodes.shape))

data structure: 77009 annotations and 6 data fields
data structure: 77009 annotations and 9 data fields
data structure: 11418 annotations and 6 data fields


In [50]:
# load the tab-separated ChEBI / phenotype annotation file (plant)
df = pd.read_csv('./top25.chebi-PHENO_plant.txt.txt', sep='\t')
print('data structure: {0} annotations and {1} data fields'.format(*df.shape))

# edges in neo4j CSV format: every row is typed with
# RO:0002616 ("related via evidence or inference to")
phenop = (
    df.assign(
        TYPE='RO:0002616',
        PROPERTY_LABEL='related via evidence or inference to',
        SPECIES='Plant',
        SENTENCE=None,  # column is created but left empty in this cell
        PMCID=None,     # column is created but left empty in this cell
        # unlike the other cells, PMCIDs is set to None for every row here
        # instead of being renamed from a '#PMCIDs' column
        PMCIDs=None,
    )
    .rename(columns={
        'ID#1': ':START_ID',
        'TYPE': ':TYPE',
        'ID#2': ':END_ID',
    })
)
phenop_edges = phenop[[
    ':START_ID', ':TYPE', ':END_ID', 'PROPERTY_LABEL', 'SPECIES',
    'SENTENCE', 'PMCID', 'NPMI_score', 'PMCIDs',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*phenop_edges.shape))

# nodes, subject side: ID#1 / Name#1 tagged with ChEBI metadata
phenop_sub = (
    df[['ID#1', 'Name#1']]
    .assign(
        LABEL='METABOLITE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/chebi.owl',
        ONTOLOGY_NAME='Chemical Entities of Biological Interest Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/chebi/183/chebi.owl',
    )
    .rename(columns={'ID#1': 'ID:ID', 'LABEL': ':LABEL', 'Name#1': 'NAME'})
)
# nodes, object side: ID#2 / Name#2 tagged with phenotype ontology metadata
# (each field is a JSON-style list written as a single string).
# NOTE(review): this is the plant file, yet the names and version IRIs describe
# the Human and Mammalian Phenotype Ontologies while ONTOLOGY_IRI lists
# flopo.owl -- confirm the intended ontologies before relying on these fields.
phenop_obj = (
    df[['ID#2', 'Name#2']]
    .assign(
        LABEL='PHENOTYPE',
        ONTOLOGY_IRI='["http://purl.obolibrary.org/obo/flopo.owl","http://purl.obolibrary.org/obo/mp.owl"]',
        ONTOLOGY_NAME='["Human Phenotype Ontology","Mammalian Phenotype Ontology"]',
        ONTOLOGY_VERSION_IRI='["http://purl.obolibrary.org/obo/hp/releases/2019-11-08/hp.owl","http://purl.obolibrary.org/obo/mp/releases/2019-11-07"]',
    )
    .rename(columns={'ID#2': 'ID:ID', 'LABEL': ':LABEL', 'Name#2': 'NAME'})
)
# stack both sides, drop exact duplicate rows, then fix the column order
phenop_nodes = pd.concat([phenop_sub, phenop_obj]).drop_duplicates()
phenop_nodes = phenop_nodes[[
    'ID:ID', ':LABEL', 'NAME', 'ONTOLOGY_IRI', 'ONTOLOGY_NAME', 'ONTOLOGY_VERSION_IRI',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*phenop_nodes.shape))

data structure: 12385 annotations and 6 data fields
data structure: 12385 annotations and 9 data fields
data structure: 2225 annotations and 6 data fields


In [51]:
# load the tab-separated ChEBI / Gene Ontology (GO) annotation file (human)
df = pd.read_csv('./top25.chebi-go_human.txt.txt', sep='\t')
print('data structure: {0} annotations and {1} data fields'.format(*df.shape))

# edges in neo4j CSV format: every row is typed with
# RO:0002616 ("related via evidence or inference to")
go = (
    df.assign(
        TYPE='RO:0002616',
        PROPERTY_LABEL='related via evidence or inference to',
        SPECIES='Human',
        SENTENCE=None,  # column is created but left empty in this cell
        PMCID=None,     # column is created but left empty in this cell
    )
    .rename(columns={
        'ID#1': ':START_ID',
        'TYPE': ':TYPE',
        'ID#2': ':END_ID',
        '#PMCIDs': 'PMCIDs',
    })
)
go_edges = go[[
    ':START_ID', ':TYPE', ':END_ID', 'PROPERTY_LABEL', 'SPECIES',
    'SENTENCE', 'PMCID', 'NPMI_score', 'PMCIDs',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*go_edges.shape))

# nodes, subject side: ID#1 / Name#1 tagged with ChEBI metadata
go_sub = (
    df[['ID#1', 'Name#1']]
    .assign(
        LABEL='METABOLITE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/chebi.owl',
        ONTOLOGY_NAME='Chemical Entities of Biological Interest Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/chebi/183/chebi.owl',
    )
    .rename(columns={'ID#1': 'ID:ID', 'LABEL': ':LABEL', 'Name#1': 'NAME'})
)
# nodes, object side: ID#2 / Name#2 tagged with Gene Ontology metadata.
# FIX: the name and version IRI previously described Uberon (copy-paste from
# the Uberon cell) although the ontology IRI is go.owl.
# TODO: replace ONTOLOGY_VERSION_IRI with the versioned release IRI of the GO
# build used for annotation; until then the unversioned PURL is used.
go_obj = (
    df[['ID#2', 'Name#2']]
    .assign(
        LABEL='GO',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/go.owl',
        ONTOLOGY_NAME='Gene Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/go.owl',
    )
    .rename(columns={'ID#2': 'ID:ID', 'LABEL': ':LABEL', 'Name#2': 'NAME'})
)
# stack both sides, drop exact duplicate rows, then fix the column order
go_nodes = pd.concat([go_sub, go_obj]).drop_duplicates()
go_nodes = go_nodes[[
    'ID:ID', ':LABEL', 'NAME', 'ONTOLOGY_IRI', 'ONTOLOGY_NAME', 'ONTOLOGY_VERSION_IRI',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*go_nodes.shape))

data structure: 58055 annotations and 6 data fields
data structure: 58055 annotations and 9 data fields
data structure: 5124 annotations and 6 data fields


In [52]:
# load the tab-separated ChEBI / Gene Ontology (GO) annotation file (plant)
df = pd.read_csv('./top25.chebi-GO_plant.txt.txt', sep='\t')
print('data structure: {0} annotations and {1} data fields'.format(*df.shape))

# edges in neo4j CSV format: every row is typed with
# RO:0002616 ("related via evidence or inference to")
gop = (
    df.assign(
        TYPE='RO:0002616',
        PROPERTY_LABEL='related via evidence or inference to',
        SPECIES='Plant',
        SENTENCE=None,  # column is created but left empty in this cell
        PMCID=None,     # column is created but left empty in this cell
    )
    .rename(columns={
        'ID#1': ':START_ID',
        'TYPE': ':TYPE',
        'ID#2': ':END_ID',
        '#PMCIDs': 'PMCIDs',
    })
)
gop_edges = gop[[
    ':START_ID', ':TYPE', ':END_ID', 'PROPERTY_LABEL', 'SPECIES',
    'SENTENCE', 'PMCID', 'NPMI_score', 'PMCIDs',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*gop_edges.shape))

# nodes, subject side: ID#1 / Name#1 tagged with ChEBI metadata
gop_sub = (
    df[['ID#1', 'Name#1']]
    .assign(
        LABEL='METABOLITE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/chebi.owl',
        ONTOLOGY_NAME='Chemical Entities of Biological Interest Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/chebi/183/chebi.owl',
    )
    .rename(columns={'ID#1': 'ID:ID', 'LABEL': ':LABEL', 'Name#1': 'NAME'})
)
# nodes, object side: ID#2 / Name#2 tagged with Gene Ontology metadata.
# FIX: the name and version IRI previously described Uberon (copy-paste from
# the Uberon cell) although the ontology IRI is go.owl.
# TODO: replace ONTOLOGY_VERSION_IRI with the versioned release IRI of the GO
# build used for annotation; until then the unversioned PURL is used.
gop_obj = (
    df[['ID#2', 'Name#2']]
    .assign(
        LABEL='GO',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/go.owl',
        ONTOLOGY_NAME='Gene Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/go.owl',
    )
    .rename(columns={'ID#2': 'ID:ID', 'LABEL': ':LABEL', 'Name#2': 'NAME'})
)
# stack both sides, drop exact duplicate rows, then fix the column order
gop_nodes = pd.concat([gop_sub, gop_obj]).drop_duplicates()
gop_nodes = gop_nodes[[
    'ID:ID', ':LABEL', 'NAME', 'ONTOLOGY_IRI', 'ONTOLOGY_NAME', 'ONTOLOGY_VERSION_IRI',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*gop_nodes.shape))

data structure: 43586 annotations and 6 data fields
data structure: 43586 annotations and 9 data fields
data structure: 4509 annotations and 6 data fields


In [53]:
# load the tab-separated ChEBI / pathway (PW) annotation file (human)
df = pd.read_csv('./top25.chebi-pathway_human.txt.txt', sep='\t')
print('data structure: {0} annotations and {1} data fields'.format(*df.shape))

# edges in neo4j CSV format: every row is typed with
# RO:0002616 ("related via evidence or inference to")
pw = (
    df.assign(
        TYPE='RO:0002616',
        PROPERTY_LABEL='related via evidence or inference to',
        SPECIES='Human',
        SENTENCE=None,  # column is created but left empty in this cell
        PMCID=None,     # column is created but left empty in this cell
    )
    .rename(columns={
        'ID#1': ':START_ID',
        'TYPE': ':TYPE',
        'ID#2': ':END_ID',
        '#PMCIDs': 'PMCIDs',
    })
)
pw_edges = pw[[
    ':START_ID', ':TYPE', ':END_ID', 'PROPERTY_LABEL', 'SPECIES',
    'SENTENCE', 'PMCID', 'NPMI_score', 'PMCIDs',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*pw_edges.shape))

# nodes, subject side: ID#1 / Name#1 tagged with ChEBI metadata
pw_sub = (
    df[['ID#1', 'Name#1']]
    .assign(
        LABEL='METABOLITE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/chebi.owl',
        ONTOLOGY_NAME='Chemical Entities of Biological Interest Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/chebi/183/chebi.owl',
    )
    .rename(columns={'ID#1': 'ID:ID', 'LABEL': ':LABEL', 'Name#1': 'NAME'})
)
# nodes, object side: ID#2 / Name#2 tagged with Pathway Ontology metadata.
# FIX: the name and version IRI previously described Uberon (copy-paste from
# the Uberon cell) although the ontology IRI is pw.owl.
# TODO: replace ONTOLOGY_VERSION_IRI with the versioned release IRI of the PW
# build used for annotation; until then the unversioned PURL is used.
pw_obj = (
    df[['ID#2', 'Name#2']]
    .assign(
        LABEL='PATHWAY',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/pw.owl',
        ONTOLOGY_NAME='Pathway Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/pw.owl',
    )
    .rename(columns={'ID#2': 'ID:ID', 'LABEL': ':LABEL', 'Name#2': 'NAME'})
)
# stack both sides, drop exact duplicate rows, then fix the column order
pw_nodes = pd.concat([pw_sub, pw_obj]).drop_duplicates()
pw_nodes = pw_nodes[[
    'ID:ID', ':LABEL', 'NAME', 'ONTOLOGY_IRI', 'ONTOLOGY_NAME', 'ONTOLOGY_VERSION_IRI',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*pw_nodes.shape))

data structure: 27149 annotations and 6 data fields
data structure: 27149 annotations and 9 data fields
data structure: 2393 annotations and 6 data fields


In [54]:
# load the tab-separated ChEBI / pathway (PW) annotation file (plant)
df = pd.read_csv('./top25.chebi-Pathway_plant.txt.txt', sep='\t')
print('data structure: {0} annotations and {1} data fields'.format(*df.shape))

# edges in neo4j CSV format: every row is typed with
# RO:0002616 ("related via evidence or inference to")
pwp = (
    df.assign(
        TYPE='RO:0002616',
        PROPERTY_LABEL='related via evidence or inference to',
        SPECIES='Plant',
        SENTENCE=None,  # column is created but left empty in this cell
        PMCID=None,     # column is created but left empty in this cell
    )
    .rename(columns={
        'ID#1': ':START_ID',
        'TYPE': ':TYPE',
        'ID#2': ':END_ID',
        '#PMCIDs': 'PMCIDs',
    })
)
pwp_edges = pwp[[
    ':START_ID', ':TYPE', ':END_ID', 'PROPERTY_LABEL', 'SPECIES',
    'SENTENCE', 'PMCID', 'NPMI_score', 'PMCIDs',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*pwp_edges.shape))

# nodes, subject side: ID#1 / Name#1 tagged with ChEBI metadata
pwp_sub = (
    df[['ID#1', 'Name#1']]
    .assign(
        LABEL='METABOLITE',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/chebi.owl',
        ONTOLOGY_NAME='Chemical Entities of Biological Interest Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/chebi/183/chebi.owl',
    )
    .rename(columns={'ID#1': 'ID:ID', 'LABEL': ':LABEL', 'Name#1': 'NAME'})
)
# nodes, object side: ID#2 / Name#2 tagged with Pathway Ontology metadata.
# FIX: the name and version IRI previously described Uberon (copy-paste from
# the Uberon cell) although the ontology IRI is pw.owl.
# TODO: replace ONTOLOGY_VERSION_IRI with the versioned release IRI of the PW
# build used for annotation; until then the unversioned PURL is used.
pwp_obj = (
    df[['ID#2', 'Name#2']]
    .assign(
        LABEL='PATHWAY',
        ONTOLOGY_IRI='http://purl.obolibrary.org/obo/pw.owl',
        ONTOLOGY_NAME='Pathway Ontology',
        ONTOLOGY_VERSION_IRI='http://purl.obolibrary.org/obo/pw.owl',
    )
    .rename(columns={'ID#2': 'ID:ID', 'LABEL': ':LABEL', 'Name#2': 'NAME'})
)
# stack both sides, drop exact duplicate rows, then fix the column order
pwp_nodes = pd.concat([pwp_sub, pwp_obj]).drop_duplicates()
pwp_nodes = pwp_nodes[[
    'ID:ID', ':LABEL', 'NAME', 'ONTOLOGY_IRI', 'ONTOLOGY_NAME', 'ONTOLOGY_VERSION_IRI',
]].copy()
print('data structure: {0} annotations and {1} data fields'.format(*pwp_nodes.shape))

data structure: 13094 annotations and 6 data fields
data structure: 13094 annotations and 9 data fields
data structure: 1667 annotations and 6 data fields


In [55]:
# edges: gather the per-file edge tables in a fixed order
edataframes = [
    clh_edges, clp_edges,
    do_edges,
    envoh_edges, envop_edges,
    uber_edges, po_edges,
    pheno_edges, phenop_edges,
    pw_edges, pwp_edges,
    go_edges, gop_edges,
]

# stack them and keep only the first occurrence of fully identical rows
edges = pd.concat(edataframes).drop_duplicates()
print('data structure: {0} annotations and {1} data fields'.format(*edges.shape))

# write a date-stamped CSV; missing values are written as the string 'NA'
edges_out = edges.fillna('NA')
edges_out.to_csv('./edges_v{}.csv'.format(today), index=False)
del edges_out

data structure: 534836 annotations and 9 data fields


In [56]:
# nodes: gather the per-file node tables in a fixed order
ndataframes = [
    clh_nodes, clp_nodes,
    do_nodes,
    envoh_nodes, envop_nodes,
    uber_nodes, po_nodes,
    pheno_nodes, phenop_nodes,
    pw_nodes, pwp_nodes,
    go_nodes, gop_nodes,
]

# stack them and keep only the first occurrence of fully identical rows
# (rows are compared on all columns, not on 'ID:ID' alone)
nodes = pd.concat(ndataframes).drop_duplicates()
print('data structure: {0} annotations and {1} data fields'.format(*nodes.shape))

# write a date-stamped CSV; missing values are written as the string 'NA'
nodes_out = nodes.fillna('NA')
nodes_out.to_csv('./nodes_v{}.csv'.format(today), index=False)
del nodes_out

data structure: 30425 annotations and 6 data fields


In [57]:
# preview the first two rows of the combined edge table
edges.head(2)

Unnamed: 0,:START_ID,:TYPE,:END_ID,PROPERTY_LABEL,SPECIES,SENTENCE,PMCID,NPMI_score,PMCIDs
0,CHEBI:25830,RO:0002616,CL:0000508,related via evidence or inference to,Human,,,0.16056,2
1,CHEBI:25830,RO:0002616,CL:0002180,related via evidence or inference to,Human,,,0.160489,2


In [58]:
# preview the first two rows of the combined node table
nodes.head(2)

Unnamed: 0,ID:ID,:LABEL,NAME,ONTOLOGY_IRI,ONTOLOGY_NAME,ONTOLOGY_VERSION_IRI
0,CHEBI:25830,METABOLITE,[p-quinones],http://purl.obolibrary.org/obo/chebi.owl,Chemical Entities of Biological Interest Ontology,http://purl.obolibrary.org/obo/chebi/183/chebi...
26,CHEBI:60980,METABOLITE,[beta-glucoside],http://purl.obolibrary.org/obo/chebi.owl,Chemical Entities of Biological Interest Ontology,http://purl.obolibrary.org/obo/chebi/183/chebi...


In [59]:
# report (rows, columns) of every per-file edge table next to its node table,
# one line per input file
for edge_table, node_table in (
    (clh_edges, clh_nodes),
    (clp_edges, clp_nodes),
    (do_edges, do_nodes),
    (envoh_edges, envoh_nodes),
    (envop_edges, envop_nodes),
    (uber_edges, uber_nodes),
    (po_edges, po_nodes),
    (pheno_edges, pheno_nodes),
    (phenop_edges, phenop_nodes),
    (pw_edges, pw_nodes),
    (pwp_edges, pwp_nodes),
    (go_edges, go_nodes),
    (gop_edges, gop_nodes),
):
    print(edge_table.shape, node_table.shape)

(48701, 9) (3207, 6)
(40109, 9) (3042, 6)
(52635, 9) (6827, 6)
(44547, 9) (3760, 6)
(36116, 9) (3782, 6)
(55317, 9) (7248, 6)
(26133, 9) (2952, 6)
(77009, 9) (11418, 6)
(12385, 9) (2225, 6)
(27149, 9) (2393, 6)
(13094, 9) (1667, 6)
(58055, 9) (5124, 6)
(43586, 9) (4509, 6)
