# Neo4j to heterogeneous PyG files
This code connects to a Neo4j database of choice and gets the needed information to construct the files that are needed for a GNN analysis with pytorch geometric, using the heterogeneous input format. For this format, the code below should output the following:
1.	The node files: for each node category, the features of every node are stored. In our case these are the following node types:
-	Molecules
-	Atoms
-	Bonds
-	Rings
-	Reactions
2.	The edge files, for every edge category an adjacency matrix is needed and it also has to be specified between which nodes these edges occur. This is done by stating ‘Node_name_1’, ‘Edge_name’, ‘Node_name_2’ in this order. When doing this for all nodes and edges the following is created:
-	'molecule', 'has_atom1', 'atom'
-	'molecule', 'has_bond1', 'bond'
-	'molecule', 'has_ring', 'ring'
-	'atom', 'bonded_with', 'bond'
-	'ring', 'has_atom2', 'atom'
-	'ring', 'has_bond2', 'bond'
-	'molecule', 'reacts_in', 'reaction'
-	'reaction', 'produces', 'molecule'


## Import necessary packages

In [15]:
from neo4j import GraphDatabase
import pandas as pd
import numpy as np
from functions import to_int
import shutil
import random
import os

## Configure Neo4j connection
Using the official Neo4j Python driver, the code below lets you connect to a Neo4j database of choice.
<br>This code is based on Neo4j's example: https://neo4j.com/docs/api/python-driver/current/

In [16]:
from neo4j import GraphDatabase
import pandas as pd

class Neo4jConnection:
    """Small wrapper around a Neo4j driver for running Cypher queries.

    The driver is created in the constructor. When that fails, the error is
    printed and the instance is left without a driver; ``query`` then raises
    ``AssertionError`` and ``close`` does nothing.
    """

    def __init__(self, uri, user, pwd):
        """Store the connection details and try to create the driver.

        Args:
            uri: Connection URI passed to ``GraphDatabase.driver``.
            user: User name used for authentication.
            pwd: Password used for authentication.
        """
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            # Best effort: report the problem and keep the instance usable
            # enough for ``close`` to be called safely.
            print("Failed to create the driver:", e)

    def close(self):
        """Close the driver if one was created."""
        if self.__driver is not None:
            self.__driver.close()

    def query(self, query, db=None, parameters=None):
        """Run a Cypher query and return all result records as a list.

        Args:
            query: The Cypher statement to run.
            db: Optional database name; when ``None`` the driver's default
                session is used.
            parameters: Optional dictionary of query parameters, passed to
                ``session.run`` so that values do not have to be formatted
                into the query text.

        Returns:
            A list with the records of the result, or ``None`` when opening
            the session or running the query failed (the error is printed).

        Raises:
            AssertionError: If the driver could not be created.
        """
        # Raised explicitly rather than with ``assert`` so that the check is
        # still performed when Python runs with -O (which strips asserts).
        if self.__driver is None:
            raise AssertionError("Driver not initialized!")
        session = None
        response = None
        try:
            session = self.__driver.session(database=db) if db is not None else self.__driver.session()
            # Materialise the result before the session is closed
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            if session is not None:
                session.close()
        return response

Below, the Neo4j username, password and URL can be given. Note that you need the credentials of your own database; the ones shown below will not work.<br>After this, the connection with the Neo4j database is made.

In [17]:
# Give username, password and database URL.
# Each value can be overridden with an environment variable of the same name,
# so real credentials do not have to be written into this file; the literals
# are only the fallback defaults used when the variable is not set.
NEO4J_USER = os.environ.get('NEO4J_USER', 'neo4j')
NEO4J_PWD = os.environ.get('NEO4J_PWD', '0611362444')
BOLT_URL = os.environ.get('BOLT_URL', 'bolt://localhost:7687/')

# Establish connection with Neo4j
conn = Neo4jConnection(uri=BOLT_URL, user=NEO4J_USER, pwd=NEO4J_PWD)

## Check for necessary folders

In [18]:
# Make sure an empty output folder exists before any file is written
raw_dir = '2-Pytorch geometric data/hetero/MOLfiles/raw'

# If the folder is left over from an earlier run, delete it first so that
# old files cannot end up mixed with the newly generated ones
if os.path.exists(raw_dir):
    shutil.rmtree(raw_dir)

# Creates the folder together with every parent folder that is still missing
os.makedirs(raw_dir, exist_ok=True)

## Obtain the data
First, for each node type, a dataframe with the ID numbers of the nodes is constructed.

In [19]:
def _id_column(cypher):
    """Run *cypher* through ``conn`` and return column 0 of the resulting dataframe."""
    return pd.DataFrame(conn.query(cypher))[0]


# One series of node IDs per node type
Molecule_id = _id_column("MATCH (m:Molecule) RETURN ID(m)")
Atom_id = _id_column("MATCH (a:Atom) RETURN ID(a)")
Bond_id = _id_column("MATCH (b:Bond) RETURN ID(b)")
Ring_id = _id_column("MATCH (r:Ring) RETURN ID(r)")
Reaction_id = _id_column("MATCH (rxn:Reaction) RETURN ID(rxn)")

In [20]:
def new_index(df):
    """
    Re-number the node IDs of an edge table so that every node type starts at 0.

    ``df`` is modified in place and nothing is returned. Column 1 holds the
    node label belonging to the IDs in column 0, and column 3 holds the label
    belonging to the IDs in column 2. Each ID is replaced by its position
    among the unique values of the matching module-level ID series
    (``Molecule_id``, ``Atom_id``, ``Bond_id``, ``Ring_id`` or
    ``Reaction_id``). IDs that do not occur in that series, and columns whose
    label is none of these five, are left unchanged.

    The label of a column is read from its first row (the table is assumed
    to hold a single label per column), so tables with only one row are
    handled as well. An empty table is left untouched.
    """
    if df.empty:
        return

    ids_by_label = {
        'Molecule': Molecule_id,
        'Atom': Atom_id,
        'Bond': Bond_id,
        'Ring': Ring_id,
        'Reaction': Reaction_id,
    }

    # Columns 1 and 3 hold the labels; the IDs sit directly left of them
    for label_col in (1, 3):
        # Positional access: does not depend on which row labels exist
        label = df[label_col].iloc[0]
        node_ids = ids_by_label.get(label)
        if node_ids is None:
            continue

        # Position of every ID within the unique IDs of its node type
        position = {node_id: i for i, node_id in enumerate(node_ids.unique())}

        id_col = label_col - 1
        # One dictionary lookup per row; unknown IDs keep their value
        df[id_col] = df[id_col].map(lambda node_id: position.get(node_id, node_id))

### Get the different adjacency matrices

In [21]:
def _export_adjacency(source_label, relationship, file_name):
    """
    Query one edge type and write its edge list to the raw folder.

    When ``source_label`` is given, only edges starting at nodes with that
    label are matched; otherwise any start node is matched. The IDs are
    re-numbered with ``new_index`` and the two ID columns are written to
    ``file_name`` without index or header.
    """
    source = f":{source_label}" if source_label else ""
    cypher = f"MATCH (a{source})-[r:{relationship}]->(b) RETURN ID(a), labels(a), ID(b), labels(b)"
    edges = pd.DataFrame(conn.query(cypher))

    # Keep only the first entry of each labels(...) value
    for label_col in (1, 3):
        edges[label_col] = edges[label_col].str[0]

    new_index(edges)
    edges[[0, 2]].to_csv(f"2-Pytorch geometric data/hetero/MOLfiles/raw/{file_name}", index=False, header=False)


# Four edge types that are queried without restricting the start node
for rel_type in ('BONDED_WITH', 'HAS_RING', 'REACTS_IN', 'PRODUCES'):
    _export_adjacency(None, rel_type, f"A_{rel_type}.txt")

# Both molecules and rings are connected to atoms and bonds, so these two
# edge types are exported separately for each start node type
for rel_type in ('HAS_BOND', 'HAS_ATOM'):
    for source_label in ('Molecule', 'Ring'):
        _export_adjacency(source_label, rel_type, f"A_{rel_type}_{source_label}.txt")


### Get the attributes of every node type
Starting with the molecule nodes

In [22]:
# Query the attributes of the specific node
att_molecule = pd.DataFrame(conn.query("""MATCH (m:Molecule) RETURN m.Hdonors"""), columns=['Hdonors'])

# Convert categorical values to integers
# (to_int comes from the project's functions module)
att_molecule = to_int(att_molecule)

# One hot encode the necessary features.
# dtype=int keeps the columns numeric: since pandas 2.0 get_dummies returns
# booleans by default, which would be written to the file as True/False
Hd = pd.get_dummies(att_molecule['Hdonors'], prefix='Hdonors', dtype=int)
att_molecule = Hd

# Write to .txt
att_molecule.to_csv("2-Pytorch geometric data/hetero/MOLfiles/raw/att_molecule.txt", index=False, header=False)

Atom nodes

In [23]:
# Query the attributes of the specific node
att_atom = pd.DataFrame(conn.query("""MATCH (a:Atom) RETURN a.symbol, a.degree, a.valence, a.atom_type, a.charge, a.hybridization, a.x, a.y"""), columns=['symbol', 'degree', 'valence', 'atom_type', 'charge', 'hybridization', 'x', 'y'])

# Convert categorical values to integers
# (to_int comes from the project's functions module)
att_atom = to_int(att_atom)

# One hot encode the necessary features.
# dtype=int keeps the columns numeric: since pandas 2.0 get_dummies returns
# booleans by default, which would be written to the file as True/False
sym = pd.get_dummies(att_atom['symbol'], prefix='symbol', dtype=int)
val = pd.get_dummies(att_atom['valence'], prefix='valence', dtype=int)
at = pd.get_dummies(att_atom['atom_type'], prefix='atom_type', dtype=int)
hyb = pd.get_dummies(att_atom['hybridization'], prefix='hybridization', dtype=int)

# Join all created dataframes, drop the columns that were one hot encoded.
# The remaining columns (degree, charge, x, y) are kept as they are
att_atom = (sym.join(att_atom).join(val).join(at).join(hyb)).drop(columns=['symbol', 'valence', 'atom_type', 'hybridization'])

# Write to .txt
att_atom.to_csv("2-Pytorch geometric data/hetero/MOLfiles/raw/att_atom.txt", index=False, header=False)

Bond nodes

In [24]:
# Query the attributes of the specific node
att_bond = pd.DataFrame(conn.query("""MATCH (b:Bond) RETURN b.bond_type, b.distance"""), columns=['bond_type', 'distance'])

# Convert categorical values to integers
# (to_int comes from the project's functions module)
att_bond = to_int(att_bond)

# One hot encode the necessary features.
# dtype=int keeps the columns numeric: since pandas 2.0 get_dummies returns
# booleans by default, which would be written to the file as True/False
bt = pd.get_dummies(att_bond['bond_type'], prefix='bond_type', dtype=int)

# Drop the column that was one hot encoded; 'distance' is kept as it is
att_bond = bt.join(att_bond).drop(columns=['bond_type'])

# Write to .txt
att_bond.to_csv("2-Pytorch geometric data/hetero/MOLfiles/raw/att_bond.txt", index=False, header=False)

Ring nodes

In [25]:
# Query the attributes of the specific node
att_ring = pd.DataFrame(conn.query("""MATCH (r:Ring) RETURN r.ring_type, r.size"""), columns=['ring_type', 'size'])

# Convert categorical values to integers
# (to_int comes from the project's functions module)
att_ring = to_int(att_ring)

# One hot encode the necessary features.
# dtype=int keeps the columns numeric: since pandas 2.0 get_dummies returns
# booleans by default, which would be written to the file as True/False
rt = pd.get_dummies(att_ring['ring_type'], prefix='ring_type', dtype=int)

# Drop the column that was one hot encoded; 'size' is kept as it is
att_ring = rt.join(att_ring).drop(columns=['ring_type'])

# Write to .txt
att_ring.to_csv("2-Pytorch geometric data/hetero/MOLfiles/raw/att_ring.txt", index=False, header=False)

Reaction nodes

In [26]:
# Fetch the 'type' property of every Reaction node
reaction_rows = conn.query("""MATCH (rxn:Reaction) RETURN rxn.type""")
att_reaction = pd.DataFrame(reaction_rows, columns=['type'])

# Run the frame through the project's to_int helper to obtain the reaction feature
att_reaction = to_int(att_reaction)

# Export only the 'type' column, without index or header
reaction_feature = att_reaction[['type']]
reaction_feature.to_csv("2-Pytorch geometric data/hetero/MOLfiles/raw/att_reaction.txt", index=False, header=False)

### Determine the y variables
If the molecule is in a reaction the y value will be 1, otherwise it will be 0

In [27]:
# Query the molecules that are in a reaction.
# The pattern returns one row per matching path, so the same molecule can
# occur several times in this result
y1 = pd.DataFrame(conn.query("""MATCH (m:Molecule)--(rxn:Reaction)--(m2:Molecule) RETURN ID(m)"""), columns=['ID'])
reacting_ids = set(y1['ID'])

# Build exactly one label per molecule, in the order of Molecule_id.
# That order defines the molecule indices used in the adjacency files and
# the length of the train/test masks, so the labels have to follow it;
# concatenating separate "in a reaction" / "not in a reaction" query results
# would change both the number of rows and their order
y = pd.DataFrame({'ID': Molecule_id})

# y is 1 if the molecule is in a reaction, otherwise 0
y['y'] = y['ID'].isin(reacting_ids).astype(int)

# Write to .txt files
y['y'].to_csv("2-Pytorch geometric data/hetero/MOLfiles/raw/mol_y.txt", index=False, header=False)


## Get the train and test labels
Determine if a molecule belongs to the train or test dataset

In [28]:
# Assign every molecule to either the train or the test set

# Number of molecule nodes
mols = len(Molecule_id)

# Draw 10 distinct molecule positions at random; these form the test set
test = random.sample(range(mols), 10)
held_out = set(test)

# A 1 marks membership: every position is in exactly one of the two masks
test_mask = [1 if position in held_out else 0 for position in range(mols)]
train_mask = [1 - flag for flag in test_mask]

# Write both masks to .txt files, one value per line
pd.DataFrame(train_mask).to_csv("2-Pytorch geometric data/hetero/MOLfiles/raw/train_mask.txt", index=False, header=False)
pd.DataFrame(test_mask).to_csv("2-Pytorch geometric data/hetero/MOLfiles/raw/test_mask.txt", index=False, header=False)