The variables below should be updated accordingly.  
`INPUT_FILE_PATH` is where the processed and integrated SemRep triples are stored (output of Stage 4).  
`NODE_MAPPING_FILE_PATH` is where node information for the processed triples is stored (output of Stage 2).  
`NEO4J_OUTPUT_DIR` is where the data to be imported into Neo4j is written.  

In [1]:
# Stage 4 output: the processed and integrated SemRep triples (edge list).
INPUT_FILE_PATH = '../data/semrep_relationships_integrated.csv'
# Stage 2 output: node information for the processed triples.
NODE_MAPPING_FILE_PATH = '../data/node_id_mapping.csv'
# Directory that receives the CSV files to be imported into Neo4j.
NEO4J_OUTPUT_DIR = '../neo4j-import'

In [2]:
import pandas as pd
import os, json

In [3]:
# Column names used in the edges file.
# Every per-entity attribute has a base name; the "1" / "2" suffix picks the
# column for the first or second entity of a triple.

# the preferred name of the entity
id_col = 'id'
id_col_1, id_col_2 = f"{id_col}1", f"{id_col}2"

# the CUI of the entity
identifier_col = 'identifier'
identifier_col_1, identifier_col_2 = f"{identifier_col}1", f"{identifier_col}2"

# the semantic group of the entity
type_col = 'type'
type_col_1, type_col_2 = f"{type_col}1", f"{type_col}2"

# synonyms/original texts for the entity
info_col = 'info'
info_col_1, info_col_2 = f"{info_col}1", f"{info_col}2"

# per-triple columns
source_col = 'evidence_source'  # what source the triple is from
rel_col = ':TYPE'  # relationship
weight_col = 'weight'  # name of column with triple edge weight

# Names of the same attributes after aggregation (base name + "s").
identifier_col_agg, type_col_agg, info_col_agg = (
    f"{base}s" for base in (identifier_col, type_col, info_col)
)

In [4]:
# Load the integrated triples. A missing text_id is replaced by -1 so the
# column can be held as integers.
edges_df = pd.read_csv(INPUT_FILE_PATH)
edges_df['text_id'] = edges_df['text_id'].fillna(-1).astype(int)

# Derived edge columns, appended in this order: source, target, label, id.
# The edge id has the form "<id1>|<relationship>|<id2>".
edges_df = edges_df.assign(
    source=edges_df[id_col_1],
    target=edges_df[id_col_2],
    label=edges_df[':TYPE'],
    id=edges_df[id_col_1] + "|" + edges_df[":TYPE"] + "|" + edges_df[id_col_2],
)

# Snapshot of the plain column names, taken before any neo4j type suffixes
# are attached to them.
header = edges_df.columns

In [5]:
# start and end indices as integer
edges_df['start1'] = edges_df['start1'].fillna(-1).astype('int32')
edges_df['end1'] = edges_df['end1'].fillna(-1).astype('int32')
edges_df['start2'] = edges_df['start2'].fillna(-1).astype('int32')
edges_df['end2'] = edges_df['end2'].fillna(-1).astype('int32')
edges_df['startr'] = edges_df['startr'].fillna(-1).astype('int32')
edges_df['endr'] = edges_df['endr'].fillna(-1).astype('int32')

In [6]:
# format the header for neo4j import
edges_neo4j_header = []
for h in header:
    if h == id_col_1:
        edges_neo4j_header.append(h + ":START_ID(Item)")
    elif h == id_col_2:
        edges_neo4j_header.append(h + ":END_ID(Item)")
    elif h == weight_col:
        edges_neo4j_header.append(h + ":FLOAT")
    elif h == ":TYPE":
        edges_neo4j_header.append(h)
    elif 'start' in h or 'end' in h:
        edges_neo4j_header.append(h + ":INT")
    else:
        edges_neo4j_header.append(h + ":STRING")
edges_df.columns = edges_neo4j_header    
edges_df.to_csv(os.path.join(NEO4J_OUTPUT_DIR, 'edges.csv'), index=False)

In [7]:
# Build the neo4j node table from the node mapping file.
# Source columns and the neo4j header they are renamed to, position by position.
node_source_columns = [id_col, identifier_col_agg, type_col_agg, info_col_agg]
nodes_neo4j_header = [
    id_col + ":ID(Item)",
    identifier_col + ":STRING",
    type_col + ":STRING",
    info_col + ":STRING",
]

nodes_df = pd.read_csv(NODE_MAPPING_FILE_PATH)
nodes_df = nodes_df[node_source_columns]
# keep each distinct row once, then drop rows with any missing value
nodes_df = nodes_df.drop_duplicates()
nodes_df = nodes_df.dropna()

nodes_df.columns = nodes_neo4j_header
# every node gets the single label "Item"
nodes_df[":LABEL"] = 'Item'

nodes_csv_path = os.path.join(NEO4J_OUTPUT_DIR, 'nodes.csv')
nodes_df.to_csv(nodes_csv_path, index=False)

In [8]:
# Show the distinct node type values, sorted, as a JSON array.
print('Node types', '====================', sep='\n')
node_types = sorted(nodes_df['type:STRING'].drop_duplicates())
# last expression of the cell, so the notebook displays the JSON string
json.dumps(node_types)

Node types


'["Activities & Behaviors", "Anatomy", "Anatomy;Chemicals & Drugs", "COVID-19 (disease)", "Chemicals & Drugs", "Concepts & Ideas", "Devices", "Disorders", "Genes & Molecular Sequences", "Genes & Molecular Sequences;Chemicals & Drugs", "Geographic Areas", "Living Beings", "Objects", "Objects;Chemicals & Drugs", "Occupations", "Organizations", "Organizations;Objects", "Phenomena", "Physiology", "Procedures", "SARS-CoV-2 (virus)"]'

In [9]:
# Show the distinct relationship types found in the edges, sorted, as a JSON array.
print('Relationship types', '====================', sep='\n')
relationship_types = sorted(edges_df[':TYPE'].drop_duplicates())
# last expression of the cell, so the notebook displays the JSON string
json.dumps(relationship_types)

Relationship types


'["ADMINISTERED_TO", "AFFECTS", "ASSOCIATED_WITH", "AUGMENTS", "CAUSES", "COEXISTS_WITH", "COMPLICATES", "CONVERTS_TO", "DIAGNOSES", "DISRUPTS", "INHIBITS", "INTERACTS_WITH", "ISA", "LOCATION_OF", "MANIFESTATION_OF", "MEASURES", "METHOD_OF", "OCCURS_IN", "PART_OF", "PRECEDES", "PREDISPOSES", "PREVENTS", "PROCESS_OF", "PRODUCES", "STIMULATES", "TREATS", "USES", "compared_with", "disease-gene", "disease-phenotype", "drug-gene", "gene-GO", "gene-gene", "gene-phenotype", "higher_than", "lower_than", "same_as"]'

In [10]:
# Show the distinct evidence sources found in the edges, sorted, as a JSON array.
print('Evidence sources', '====================', sep='\n')
evidence_sources = sorted(edges_df['evidence_source:STRING'].drop_duplicates())
# last expression of the cell, so the notebook displays the JSON string
json.dumps(evidence_sources)

Evidence sources


'["CORD-19", "DGIdb", "DisGeNET", "HPO", "STRING", "Uniprot"]'