# Neo4j to pytorch files
This notebook connects to a Neo4j database of choice and retrieves the information needed to construct the input files for a GNN analysis with PyTorch Geometric (PyG). These files follow the format used by TUDataset, which provides multiple datasets in PyG.
<br>
In our case, this notebook writes the following tensors as .txt files:
1.	An adjacency matrix, which determines which atoms are linked to each other.
2.	Node labels, which are the atom types.
3.	A ‘graph indicator’ that determines to which graph the atoms belong.
4.	Edge labels, for additional information storage about the bonds.
5.	Node attributes, for additional information storage of the atoms.
6.	Graph attributes, for additional information about the molecules.
<br>
<br>
These files represent the molecules with their atoms and bonds, not the reactions.

## Import necessary packages

In [1]:
from sklearn.preprocessing import OneHotEncoder
from neo4j import GraphDatabase
import pandas as pd
import numpy as np
from functions import to_int
import shutil
import os

## Configure Neo4j connection
Using the official Neo4j Python driver, the code below lets you connect to a Neo4j database of choice.
<br>This code is based on Neo4j's example: https://neo4j.com/docs/api/python-driver/current/

In [2]:

class Neo4jConnection:
    """Small wrapper around a Neo4j driver for running Cypher queries.

    The driver is created when the object is constructed. If that fails, the
    error is printed and the instance is left without a driver, in which case
    ``query`` raises ``AssertionError``.

    Instances can be used as context managers; the driver is closed on exit.
    """

    def __init__(self, uri, user, pwd):
        """Create the driver for ``uri`` using ``(user, pwd)`` as auth."""
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            # Best effort: report the problem and keep the instance alive;
            # the missing driver is detected on the first query() call.
            print("Failed to create the driver:", e)

    def __enter__(self):
        """Return the connection itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver when leaving a ``with`` block; never suppress errors."""
        self.close()
        return False

    def close(self):
        """Close the underlying driver if one was created."""
        if self.__driver is not None:
            self.__driver.close()

    def query(self, query, db=None, parameters=None):
        """Run a Cypher query and return its records as a list.

        Args:
            query: The Cypher query text.
            db: Optional database name; the driver's default session is used
                when omitted.
            parameters: Optional mapping of query parameters, forwarded to
                ``session.run`` so values need not be formatted into the
                query string.

        Returns:
            A list with the records produced by the query, or ``None`` if
            opening the session or running the query failed (the error is
            printed).

        Raises:
            AssertionError: If the driver was never initialised.
        """
        # Explicit raise instead of `assert`, which is stripped under `python -O`.
        if self.__driver is None:
            raise AssertionError("Driver not initialized!")
        session = None
        response = None
        try:
            if db is not None:
                session = self.__driver.session(database=db)
            else:
                session = self.__driver.session()
            # Materialise the result before the session is closed.
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            if session is not None:
                session.close()
        return response

Below, the Neo4j username, password and URL can be given. Note that you need the credentials to your own database; the ones shown below will not work. <br> After this, the connection with the Neo4j database is made.

In [3]:
# Give username, password and database URL.
# Each value can be overridden with an environment variable of the same name,
# so real credentials do not have to be written into the notebook. The
# literals are only the fallback defaults used when the variable is not set.
NEO4J_USER = os.environ.get('NEO4J_USER', 'neo4j')
NEO4J_PWD = os.environ.get('NEO4J_PWD', '0611362444')
BOLT_URL = os.environ.get('BOLT_URL', 'bolt://localhost:7687/')

# Establish connection with Neo4j
conn = Neo4jConnection(uri=BOLT_URL, user=NEO4J_USER, pwd=NEO4J_PWD)

## Check for necessary folders
The path '2-Pytorch geometric data/TU/MOLfiles/raw' has to exist. The cell below creates it, deleting an existing 'raw' folder first if there is one.

In [4]:
# (Re)create the folder that receives the exported .txt files
raw_dir = '2-Pytorch geometric data/TU/MOLfiles/raw'

# If the raw folder already exists, delete it first so that files from an
# earlier run do not remain next to the new export
if os.path.exists(raw_dir):
    shutil.rmtree(raw_dir)

# makedirs creates every missing parent folder in one call
os.makedirs(raw_dir, exist_ok=True)

## Obtain the data
Every file listed above is created with the code below. First, the adjacency matrix together with the bond types is taken from the database.

In [5]:
# Cypher query returning, per molecule, every pair of atoms that share a bond
# together with that bond's type
query = "MATCH (m:Molecule)-[:HAS_ATOM]->(a1:Atom)-[:BONDED_WITH]->(b:Bond)<-[:BONDED_WITH]-(a2:Atom)<-[:HAS_ATOM]-(m) RETURN ID(a1), ID(a2), b.bond_type"
bonds = pd.DataFrame(conn.query(query), columns=['ID(a1)', 'ID(a2)', 'bond_type'])

# Transform all categorical values to integers
bonds = to_int(bonds)

# Ensure that all IDs are sequential for the adjacency matrix.
# The renumbering is done on a copy: a plain `bonds2 = bonds` would only
# create a second name for the same frame and silently overwrite `bonds`.
id_columns = ['ID(a1)', 'ID(a2)']
# IDs are numbered 0..n-1 in order of first appearance in the ID(a1) column;
# values that never occur in ID(a1) are left untouched by replace().
source_ids = list(bonds['ID(a1)'].unique())
# NOTE(review): this numbering is derived from the bond rows only - confirm
# that it lines up with the row order of the other exported atom files.
bonds2 = bonds.copy()
bonds2[id_columns] = bonds2[id_columns].replace(source_ids, list(range(len(source_ids))))
bonds2

Unnamed: 0,ID(a1),ID(a2),bond_type
0,0,1,1
1,1,0,1
2,2,3,2
3,3,2,2
4,4,5,1
...,...,...,...
431,227,226,1
432,231,226,1
433,224,231,1
434,226,231,1


Split the output above into the adjacency matrix and the edge labels (bond types) and export them in the right .txt format.

In [6]:
# Adjacency list: both atom-ID columns shifted up by one before export
# (presumably because the target format counts nodes from 1 - TODO confirm)
adj_path = '2-Pytorch geometric data/TU/MOLfiles/raw/mol_A.txt'
adj_matrix = bonds2.loc[:, ['ID(a1)', 'ID(a2)']].add(1)
adj_matrix.to_csv(adj_path, header=False, index=False)  # plain comma-separated text, no header/index

# Edge labels: one bond type per row, in the same row order as the adjacency list
edge_path = '2-Pytorch geometric data/TU/MOLfiles/raw/mol_edge_labels.txt'
edge_labels = bonds2.loc[:, 'bond_type']
edge_labels.to_csv(edge_path, header=False, index=False)

Query the graph ID from Neo4j (each molecule has a different ID) and export it to the .txt file.

In [7]:
# One row per atom: the atom's ID and the name of the molecule it is linked to
graph_query = "MATCH (a:Atom)<-[]-(m:Molecule) RETURN ID(a), m.name"
graph_rows = conn.query(graph_query)
graph_id = to_int(pd.DataFrame(graph_rows, columns=['ID', 'molecule']))

# Only the molecule column is written; it is the graph indicator of each atom row
indicator_path = '2-Pytorch geometric data/TU/MOLfiles/raw/mol_graph_indicator.txt'
graph_id['molecule'].to_csv(indicator_path, header=False, index=False)

Get all the atom features from Neo4j

In [8]:
# Query every atom together with the properties used as node features
atom_columns = ['symbol', 'degree', 'valence', 'atom_type', 'charge', 'hybridization', 'x', 'y']
atom_query = """MATCH (a:Atom) RETURN a.symbol, a.degree,
 a.valence, a.atom_type, a.charge, a.hybridization, a.x, a.y"""
atom_rows = conn.query(atom_query)

# Build the frame and convert its values with the project's to_int helper
att_atom = to_int(pd.DataFrame(atom_rows, columns=atom_columns))
att_atom

Unnamed: 0,symbol,degree,valence,atom_type,charge,hybridization,x,y
0,1,4,1,1,0.2175,1,8.8112,-3.0812
1,2,2,1,2,0.2175,1,8.8112,-3.0812
2,1,3,2,3,0.2794,2,13.9761,-10.0415
3,2,1,2,4,0.2794,2,13.9761,-10.0415
4,1,4,3,1,0.1522,1,7.6315,-7.6253
...,...,...,...,...,...,...,...,...
227,1,4,2,1,0.0304,1,10.1014,-12.5464
228,1,4,2,1,0.0025,1,10.1014,-12.5464
229,1,4,2,1,0.0025,1,10.1014,-12.5464
230,1,4,2,1,0.0002,1,10.1014,-12.5464


One hot encode the categorical variables

In [9]:
# Columns that are expanded into one indicator column per distinct value
categorical = ['valence', 'atom_type', 'hybridization']

# One dummy frame per categorical column, each prefixed with the column name
val, at, hyb = (pd.get_dummies(att_atom[name], prefix=name) for name in categorical)

# Drop the raw categorical columns and append the indicator columns in their place
att_atom = pd.concat([att_atom.drop(columns=categorical), val, at, hyb], axis=1)
att_atom

Unnamed: 0,symbol,degree,charge,x,y,valence_1,valence_2,valence_3,valence_4,atom_type_1,atom_type_2,atom_type_3,atom_type_4,atom_type_5,hybridization_1,hybridization_2
0,1,4,0.2175,8.8112,-3.0812,1,0,0,0,1,0,0,0,0,1,0
1,2,2,0.2175,8.8112,-3.0812,1,0,0,0,0,1,0,0,0,1,0
2,1,3,0.2794,13.9761,-10.0415,0,1,0,0,0,0,1,0,0,0,1
3,2,1,0.2794,13.9761,-10.0415,0,1,0,0,0,0,0,1,0,0,1
4,1,4,0.1522,7.6315,-7.6253,0,0,1,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227,1,4,0.0304,10.1014,-12.5464,0,1,0,0,1,0,0,0,0,1,0
228,1,4,0.0025,10.1014,-12.5464,0,1,0,0,1,0,0,0,0,1,0
229,1,4,0.0025,10.1014,-12.5464,0,1,0,0,1,0,0,0,0,1,0
230,1,4,0.0002,10.1014,-12.5464,0,1,0,0,1,0,0,0,0,1,0


Export the node labels (atom type) and attributes

In [10]:
# Node labels: the 'symbol' column as integers, with missing values mapped to 0.
# `np.nan` is used because the `np.NAN` alias was removed in NumPy 2.0.
node_labels = att_atom['symbol'].replace(np.nan, 0).astype(int)
node_labels.to_csv('2-Pytorch geometric data/TU/MOLfiles/raw/mol_node_labels.txt', index=False, header=False, na_rep='NaN') # Export csv as .txt

# Node attributes: every remaining column except the symbol; missing values are written as 'NaN'
node_att = att_atom.drop(columns=['symbol'])
node_att.to_csv('2-Pytorch geometric data/TU/MOLfiles/raw/mol_node_attributes.txt', index=False, header=False, na_rep='NaN') # Export csv as .txt

### Determine the y variables
If the molecule is in a reaction, the y value will be 1; otherwise it will be 0.

In [11]:
# Query the molecules that are in a reaction.
# DISTINCT is needed because the pattern matches once per
# (reaction, partner molecule) combination; without it a molecule would get
# several label rows instead of one.
y1 = pd.DataFrame(conn.query("""MATCH (m:Molecule)--(rxn:Reaction)--(m2:Molecule) RETURN DISTINCT ID(m)"""), columns=['ID'])
# Query the molecules that are not in a reaction
y2 = pd.DataFrame(conn.query("""MATCH (m:Molecule) WHERE NOT (m)--(:Reaction)--(:Molecule) RETURN ID(m)"""), columns=['ID'])

# Add y-values of 1
y1['y'] = 1
# Add y-values of 0
y2['y'] = 0

# Concatenate both dataframes (all reacting molecules first, then the rest)
# NOTE(review): rows are written in this concatenation order, not sorted by
# graph ID - confirm that this matches the numbering in the graph indicator file.
y = pd.concat([y1, y2])

# Write to .txt files
y['y'].to_csv("2-Pytorch geometric data/TU/MOLfiles/raw/mol_graph_labels.txt", index=False, header=False)