# Neo4j Sample Code

Helpful tips for Neo4j:

1. Installation that worked for me (Woojae - WSL Ubuntu) https://www.techrepublic.com/article/how-to-install-neo4j-ubuntu-server/
2. If localhost doesn't respond, check the /etc/neo4j/neo4j.conf file and uncomment:
    - server.bolt.enabled=true
    - server.bolt.listen_address=:7687
    - server.http.enabled=true
    - server.http.listen_address=:7474
    - server.default_listen_address=0.0.0.0
    - dbms.security.auth_enabled=false 
        - If you want to bypass authentication temporarily
3. WSL commands to manage the Neo4j service (once it is running, open http://localhost:7474 in the browser)
    - sudo systemctl start neo4j
    - sudo systemctl status neo4j
    - sudo systemctl restart neo4j
    - sudo systemctl stop neo4j


In [1]:
!pip install neo4j

Collecting neo4j
  Downloading neo4j-5.25.0-py3-none-any.whl.metadata (5.7 kB)
Downloading neo4j-5.25.0-py3-none-any.whl (296 kB)
Installing collected packages: neo4j
Successfully installed neo4j-5.25.0


In [10]:
import neo4j
from neo4j import GraphDatabase
import pandas as pd
from IPython.display import display

import numpy as np
import ast 

In [3]:
# Connection to Neo4j running in WSL.
# 7687 is the Bolt port (server.bolt.listen_address in neo4j.conf).
uri = "bolt://localhost:7687"
# Credentials are only needed while dbms.security.auth_enabled is left on.
# If authentication is re-enabled, read them from the environment rather than
# hardcoding them in the notebook.
#username = "neo4j"
#password = "capstone" 

#driver = GraphDatabase.driver(uri, auth=(username, password))
# Unauthenticated driver: relies on authentication being disabled on the server.
driver = GraphDatabase.driver(uri)

# Module-level session used by the helper functions defined below.
session = driver.session()

In [4]:
# DATASCI205 Helper Functions

def my_neo4j_wipe_out_database():
    """Wipe out the entire database by deleting all nodes and relationships.

    Runs on the module-level ``session``. Returns None.
    """
    # DETACH DELETE removes every node together with all of its relationships,
    # so a separate relationship-deleting pass is not needed.
    query = "MATCH (n) DETACH DELETE n"
    # consume() exhausts the result so any server-side error is raised here
    # instead of being silently dropped.
    session.run(query).consume()

def my_neo4j_run_query_pandas(query, **kwargs):
    """Execute a query on the shared session and return the records as a DataFrame.

    Keyword arguments are forwarded to ``session.run`` as query parameters.
    Each record becomes one row; the column names come from the result's keys.
    """
    records = session.run(query, **kwargs)
    rows = []
    for record in records:
        rows.append(record.values())
    # Keys are read only after all records have been collected
    return pd.DataFrame(rows, columns=records.keys())

def my_neo4j_nodes_relationships():
    """Print a banner and display a table for the nodes, then for the relationships.

    Both tables are fetched with my_neo4j_run_query_pandas and rendered with
    display(). Returns None.
    """
    nodes_query = """
        MATCH (n) 
        RETURN n.name as node_name, labels(n) as labels
        ORDER BY n.name
    """
    relationships_query = """
        MATCH (n1)-[r]->(n2) 
        RETURN n1.name as node_name_1, labels(n1) as node_1_labels, 
               type(r) as relationship_type, n2.name as node_name_2, labels(n2) as node_2_labels
        ORDER BY node_name_1, node_name_2
    """
    divider = "-------------------------"

    # Same banner / query / display sequence for each section, nodes first
    sections = (("  Nodes:", nodes_query), ("  Relationships:", relationships_query))
    for heading, cypher in sections:
        print(divider)
        print(heading)
        print(divider)
        display(my_neo4j_run_query_pandas(cypher))

## Data

In [6]:
# Path for datasets.
# Relative path, so it is resolved against the notebook's current working directory.
# Keep the trailing slash: file names are appended to it by plain string concatenation.

datapath = '../../../data/'

In [13]:
# Read the train / validation / test splits from the data directory

df_train = pd.read_csv(f'{datapath}OfficialTrainingSet1.csv')
df_val = pd.read_csv(f'{datapath}OfficialValidationSet1.csv')
df_test = pd.read_csv(f'{datapath}OfficialTestSet1.csv')

# Report the (rows, columns) shape of each split
print("Shape of train dataset:", df_train.shape)
print("Shape of validation dataset:", df_val.shape)
print("Shape of test dataset:", df_test.shape)

# Preview the first rows of the training split
df_train.head(3)

Shape of train dataset: (500, 13)
Shape of validation dataset: (500, 13)
Shape of test dataset: (500, 13)


Unnamed: 0,article_code,title,abstract,chemicals,diseases,chemical_start_indices,chemical_end_indices,disease_start_indices,disease_end_indices,chemical_ids,disease_ids,CID_chemical,CID_disease
0,227508,Naloxone reverses the antihypertensive effect ...,"In unanesthetized, spontaneously hypertensive ...","['Naloxone', 'clonidine', 'clonidine', 'nalozo...","['hypertensive', 'hypotensive', 'hypertensive'...","['0', '49', '181', '244', '306', '354', '364',...","['8', '58', '190', '252', '322', '362', '372',...","['93', '274', '469', '750']","['105', '285', '481', '762']","['D009270', 'D003000', 'D003000', '-1', 'D0087...","['D006973', 'D007022', 'D006973', 'D006973']",['D008750'],['D007022']
1,354896,Lidocaine-induced cardiac asystole.,Intravenous administration of a single 50-mg b...,"['Lidocaine', 'lidocaine', 'lidocaine']","['cardiac asystole', 'depression', 'bradyarrhy...","['0', '90', '409']","['9', '99', '418']","['18', '142', '331']","['34', '152', '347']","['D008012', 'D008012', 'D008012']","['D006323', 'D003866', 'D001919']",['D008012'],['D006323']
2,435349,Suxamethonium infusion rate and observed fasci...,Suxamethonium chloride (Sch) was administered ...,"['Suxamethonium', 'Suxamethonium chloride', 'S...","['fasciculations', 'tetanic', 'Fasciculations'...","['0', '80', '104', '312']","['13', '102', '107', '315']","['41', '265', '395', '483', '523', '538', '561...","['55', '272', '409', '496', '536', '544', '568...","['D013390', 'D013390', 'D013390', 'D013390']","['D005207', 'D013746', 'D005207', 'D005207', '...",['D013390'],['D005207']


In [14]:
# Data transformation functions

def convert_col_to_list(string):
    """
    Parse one cell value that holds the text form of a Python list into a real list.

    Meant to be applied element-wise (e.g. with Series.apply) to the list-like
    columns read from the CSV files.

    Args:
        string: text such as "['a', 'b']", or a list that has already been parsed.

    Returns:
        The object produced by ast.literal_eval; a list passed in is returned unchanged.

    Raises:
        ValueError, SyntaxError: from ast.literal_eval when the value is not a valid literal.
    """
    # Re-running the conversion on a column that was already converted hands us
    # real lists; pass them through instead of failing inside literal_eval.
    if isinstance(string, list):
        return string
    return ast.literal_eval(string)


def lowercase_cols(lst):
    """
    Return a new list holding the lowercased form of every entry in ``lst``.

    The input list itself is not modified.
    """
    lowered = []
    for entry in lst:
        lowered.append(entry.lower())
    return lowered


def map_cid_to_chemical_name(row):
    """
    Translate each ID in the row's CID_chemical entry into a chemical name.

    For every ID, the name is the entry of row['chemicals'] at the position where
    that ID first occurs in row['chemical_ids']. IDs that do not occur there are
    reported with the placeholder 'unknown'.
    """
    wanted_ids = row['CID_chemical']
    mention_ids = row['chemical_ids']
    mentions = row['chemicals']

    return [
        mentions[mention_ids.index(cid)] if cid in mention_ids else 'unknown'
        for cid in wanted_ids
    ]


def map_cid_to_disease_name(row):
    """
    Translate each ID in the row's CID_disease entry into a disease name.

    For every ID, the name is the entry of row['diseases'] at the position where
    that ID first occurs in row['disease_ids']. IDs that do not occur there are
    reported with the placeholder 'unknown'.
    """
    wanted_ids = row['CID_disease']
    mention_ids = row['disease_ids']
    mentions = row['diseases']

    return [
        mentions[mention_ids.index(cid)] if cid in mention_ids else 'unknown'
        for cid in wanted_ids
    ]


# Function to handle "unknown" for chemical names
def map_cid_to_chemical_name_unknown(data):
    '''
    Fills in the 'unknown' entries of CID_chemical_name, e.g. those caused by
    chemical IDs written with pipe (|) notation.

    A dataset-wide ID -> chemical name lookup is built from the chemical_ids and
    chemicals columns. Each 'unknown' entry is then replaced by the names of the
    '|'-separated parts of its ID, joined with ' | '; parts that are not in the
    lookup stay 'unknown'. Names that were already resolved are kept as they are.

    Args:
        data: DataFrame with list-valued columns chemical_ids, chemicals,
            CID_chemical and CID_chemical_name.

    Returns:
        The same DataFrame object; its CID_chemical_name column is replaced in place.
    '''
    # Later mentions of an ID overwrite earlier ones (last one wins)
    chemical_id_map = {}
    for ids, mentions in zip(data['chemical_ids'], data['chemicals']):
        chemical_id_map.update(zip(ids, mentions))

    def resolve_unknown_chemical_name(cid):
        # Split combined IDs (separated by '|') and look each part up in the map
        return ' | '.join(chemical_id_map.get(part, 'unknown') for part in cid.split('|'))

    def fill_unknown_names(cids, names):
        if 'unknown' not in names:
            return names
        if len(cids) != len(names):
            # IDs and names cannot be paired by position: rebuild every name from its ID
            return [resolve_unknown_chemical_name(cid) for cid in cids]
        # Only touch the placeholders so names resolved from the row itself are not
        # overwritten by a different mention taken from another row
        return [resolve_unknown_chemical_name(cid) if name == 'unknown' else name
                for cid, name in zip(cids, names)]

    filled = [fill_unknown_names(cids, names)
              for cids, names in zip(data['CID_chemical'], data['CID_chemical_name'])]
    # Explicit object Series so each list stays a single cell value
    data['CID_chemical_name'] = pd.Series(filled, index=data.index, dtype=object)
    return data

# Function to handle "unknown" for disease names
def map_cid_to_disease_name_unknown(data):
    '''
    Fills in the 'unknown' entries of CID_disease_name, e.g. those caused by
    disease IDs written with pipe (|) notation.

    A dataset-wide ID -> disease name lookup is built from the disease_ids and
    diseases columns. Each 'unknown' entry is then replaced by the names of the
    '|'-separated parts of its ID, joined with ' | '; parts that are not in the
    lookup stay 'unknown'. Names that were already resolved are kept as they are.

    Args:
        data: DataFrame with list-valued columns disease_ids, diseases,
            CID_disease and CID_disease_name.

    Returns:
        The same DataFrame object; its CID_disease_name column is replaced in place.
    '''
    # Later mentions of an ID overwrite earlier ones (last one wins)
    disease_id_map = {}
    for ids, mentions in zip(data['disease_ids'], data['diseases']):
        disease_id_map.update(zip(ids, mentions))

    def resolve_unknown_disease_name(cid):
        # Split combined IDs (separated by '|') and look each part up in the map
        return ' | '.join(disease_id_map.get(part, 'unknown') for part in cid.split('|'))

    def fill_unknown_names(cids, names):
        if 'unknown' not in names:
            return names
        if len(cids) != len(names):
            # IDs and names cannot be paired by position: rebuild every name from its ID
            return [resolve_unknown_disease_name(cid) for cid in cids]
        # Only touch the placeholders so names resolved from the row itself are not
        # overwritten by a different mention taken from another row
        return [resolve_unknown_disease_name(cid) if name == 'unknown' else name
                for cid, name in zip(cids, names)]

    filled = [fill_unknown_names(cids, names)
              for cids, names in zip(data['CID_disease'], data['CID_disease_name'])]
    # Explicit object Series so each list stays a single cell value
    data['CID_disease_name'] = pd.Series(filled, index=data.index, dtype=object)
    return data

In [15]:
# Run every data transformation on the train, validation and test frames

list_columns = ['chemicals', 'diseases', 'chemical_ids', 'disease_ids', 'CID_chemical', 'CID_disease']

# 1) Turn the stringified lists into real lists
for col in list_columns:
    for frame in (df_train, df_val, df_test):
        frame[col] = frame[col].apply(convert_col_to_list)

for frame in (df_train, df_val, df_test):
    # 2) Lowercase the chemical and disease mentions
    for mention_col in ('chemicals', 'diseases'):
        frame[mention_col] = frame[mention_col].apply(lowercase_cols)

    # 3) Translate the CID IDs into names using each row's own mentions
    frame['CID_chemical_name'] = frame.apply(map_cid_to_chemical_name, axis=1)
    frame['CID_disease_name'] = frame.apply(map_cid_to_disease_name, axis=1)

# 4) Resolve the remaining 'unknown' names (chemicals first, then diseases)
df_train = map_cid_to_disease_name_unknown(map_cid_to_chemical_name_unknown(df_train))
df_val = map_cid_to_disease_name_unknown(map_cid_to_chemical_name_unknown(df_val))
df_test = map_cid_to_disease_name_unknown(map_cid_to_chemical_name_unknown(df_test))

df_train.head(3)

Unnamed: 0,article_code,title,abstract,chemicals,diseases,chemical_start_indices,chemical_end_indices,disease_start_indices,disease_end_indices,chemical_ids,disease_ids,CID_chemical,CID_disease,CID_chemical_name,CID_disease_name
0,227508,Naloxone reverses the antihypertensive effect ...,"In unanesthetized, spontaneously hypertensive ...","[naloxone, clonidine, clonidine, nalozone, alp...","[hypertensive, hypotensive, hypertensive, hype...","['0', '49', '181', '244', '306', '354', '364',...","['8', '58', '190', '252', '322', '362', '372',...","['93', '274', '469', '750']","['105', '285', '481', '762']","[D009270, D003000, D003000, -1, D008750, D0092...","[D006973, D007022, D006973, D006973]",[D008750],[D007022],[alpha-methyldopa],[hypotensive]
1,354896,Lidocaine-induced cardiac asystole.,Intravenous administration of a single 50-mg b...,"[lidocaine, lidocaine, lidocaine]","[cardiac asystole, depression, bradyarrhythmias]","['0', '90', '409']","['9', '99', '418']","['18', '142', '331']","['34', '152', '347']","[D008012, D008012, D008012]","[D006323, D003866, D001919]",[D008012],[D006323],[lidocaine],[cardiac asystole]
2,435349,Suxamethonium infusion rate and observed fasci...,Suxamethonium chloride (Sch) was administered ...,"[suxamethonium, suxamethonium chloride, sch, sch]","[fasciculations, tetanic, fasciculations, fasc...","['0', '80', '104', '312']","['13', '102', '107', '315']","['41', '265', '395', '483', '523', '538', '561...","['55', '272', '409', '496', '536', '544', '568...","[D013390, D013390, D013390, D013390]","[D005207, D013746, D005207, D005207, D005207, ...",[D013390],[D005207],[suxamethonium],[fasciculations]


## Neo4j Build

- This is sample code that builds a graph database just from the training data of the BC5CDR dataset.
- Consequently, this skips the NER model and the RE model, and works only with the `CID_chemical_name` and `CID_disease_name` columns.
- At this stage, I do not know what the exact output of the RE model is going to be. We'll need to change the formatting of the input/output accordingly as we finalize the modeling parts. I'm also not sure whether we'll be able to find different types of relationships, or whether every pair will just be left with a single generic relationship.
- This is run on a local machine. In the actual project, we'll need to connect to AWS and be able to retrieve the entities and relationships. 

In [20]:
# Node columns: the dataframe columns whose values are written to the graph as node names
chemical_column = 'CID_chemical_name'  # becomes Chemical nodes
disease_column = 'CID_disease_name'  # becomes Disease nodes

In [22]:
# New df with only the node columns (chemical and disease).
# .copy() gives an independent frame, so later edits to df_train_nodes do not touch df_train.
df_train_nodes = df_train[[chemical_column, disease_column]].copy()
df_train_nodes.head(1)

Unnamed: 0,CID_chemical_name,CID_disease_name
0,[alpha-methyldopa],[hypotensive]


In [27]:
def convert_to_list(value):
    """
    Best-effort parse of a cell value with ast.literal_eval.

    Returns the parsed object when ``value`` is a valid Python literal. When
    parsing raises ValueError or SyntaxError (for instance because the value is
    already a list), the value is handed back untouched.
    """
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError):
        # Not parseable as a literal: keep whatever was passed in
        return value
    return parsed

# Parse any cells still stored as text; cells that are already lists come back unchanged
df_train_nodes[chemical_column] = df_train_nodes[chemical_column].apply(convert_to_list)
df_train_nodes[disease_column] = df_train_nodes[disease_column].apply(convert_to_list)

In [29]:
# Sanity check before exploding: (rows, columns) and a one-row preview
print(df_train_nodes.shape)
df_train_nodes.head(1)

(500, 2)


Unnamed: 0,CID_chemical_name,CID_disease_name
0,[alpha-methyldopa],[hypotensive]


In [30]:
# Explode both columns so each element in the lists gets its own row.
# Chaining the two explodes produces every chemical x disease combination of a row.
# NOTE(review): this treats every listed chemical as related to every listed disease of
# the same article; if the two lists are meant to be paired by position, this adds
# extra pairs -- confirm against the dataset definition.
# Exploded rows keep their original index label, so the index is no longer unique.
df_train_nodes = df_train_nodes.explode(chemical_column).explode(disease_column)

In [32]:
def create_chemical_disease_graph_from_flat_df(df, chemical_column, disease_column, batch_size=1000):
    """
    Create relationships between chemicals and diseases from a flat dataframe with no lists.

    Every distinct (chemical, disease) pair becomes a Chemical node, a Disease node and
    a RELATED_TO relationship from the chemical to the disease. MERGE is used throughout,
    so nodes and relationships that already exist are reused. Runs on the module-level
    ``session``.

    Args:
        df: DataFrame with one chemical name and one disease name per row.
        chemical_column: name of the column holding the chemical names.
        disease_column: name of the column holding the disease names.
        batch_size: maximum number of pairs sent to the database per query (must be > 0).

    Returns:
        None.
    """
    def _is_name(value):
        # NaN (what DataFrame.explode leaves behind for an empty list) is a truthy
        # float, so a plain truthiness test would let it through as a node name.
        return isinstance(value, str) and value != ''

    # dict keys keep insertion order: de-duplicates the pairs while preserving row order
    unique_pairs = {}
    for chemical, disease in zip(df[chemical_column], df[disease_column]):
        # Only proceed if both chemical and disease are non-empty strings
        if _is_name(chemical) and _is_name(disease):
            unique_pairs[(chemical, disease)] = None
    rows = [{'chemical': chemical, 'disease': disease} for chemical, disease in unique_pairs]

    # One query per batch instead of one auto-commit query per dataframe row
    query = """
        UNWIND $rows AS row
        MERGE (c:Chemical {name: row.chemical})
        MERGE (d:Disease {name: row.disease})
        MERGE (c)-[:RELATED_TO]->(d)
    """
    for start in range(0, len(rows), batch_size):
        # consume() so a failed write raises here rather than going unnoticed
        session.run(query, rows=rows[start:start + batch_size]).consume()

In [33]:
# Write the exploded chemical/disease pairs to Neo4j (the queries use MERGE, so existing nodes are reused)
create_chemical_disease_graph_from_flat_df(df_train_nodes, chemical_column, disease_column)

In [34]:
# Show the nodes and relationships now stored in the database
my_neo4j_nodes_relationships()

-------------------------
  Nodes:
-------------------------


Unnamed: 0,node_name,labels
0,"1,1-dichloro-2,2,2-trifluoroethane",[Chemical]
1,"1,2-dimethylhydrazine",[Chemical]
2,"1-chloro-1,2,2,2-tetrafluoroethane",[Chemical]
3,2-acetylaminofluorene,[Chemical]
4,2-chloroprocaine-ce,[Chemical]
...,...,...
882,white matter edema,[Disease]
883,withdrawal symptoms,[Disease]
884,ximelagatran,[Chemical]
885,zidovudine,[Chemical]


-------------------------
  Relationships:
-------------------------


Unnamed: 0,node_name_1,node_1_labels,relationship_type,node_name_2,node_2_labels
0,"1,1-dichloro-2,2,2-trifluoroethane",[Chemical],RELATED_TO,liver disease,[Disease]
1,"1,2-dimethylhydrazine",[Chemical],RELATED_TO,initiation induced by carcinogens,[Disease]
2,"1-chloro-1,2,2,2-tetrafluoroethane",[Chemical],RELATED_TO,liver disease,[Disease]
3,2-acetylaminofluorene,[Chemical],RELATED_TO,initiation induced by carcinogens,[Disease]
4,2-chloroprocaine-ce,[Chemical],RELATED_TO,paralysis,[Disease]
...,...,...,...,...,...
1030,warfarin,[Chemical],RELATED_TO,nerve entrapment,[Disease]
1031,ximelagatran,[Chemical],RELATED_TO,abnormal liver function,[Disease]
1032,zidovudine,[Chemical],RELATED_TO,bone marrow suppression,[Disease]
1033,zidovudine,[Chemical],RELATED_TO,cardiomyopathy,[Disease]
