# Neo4j to pytorch files
This code connects to a Neo4j database of choice and retrieves the information needed to construct the files required for a GNN analysis with PyTorch Geometric (PyG). These files use the same format as PyG's TUDataset, which covers multiple datasets.<br>
In our case, we want the following tensors, written as .txt files, as output of this notebook:
1.	An adjacency matrix, which determines which atoms are linked to each other.
2.	Node labels, which are the atom types.
3.	A ‘graph indicator’ that determines to which graph the atoms belong.
4.	Edge labels, for additional information storage about the bonds.
5.	Node attributes, for additional information storage of the atoms.
6.	Graph attributes, for additional information about the molecules.

In [6]:
from neo4j import GraphDatabase
import pandas as pd
import numpy as np
from functions import to_int
import shutil
import os

## Configure Neo4j connection
Using the official Neo4j driver, the code below lets you connect to a Neo4j database of choice.
<br>This code is based on Neo4j's example: https://neo4j.com/docs/api/python-driver/current/

In [7]:

class Neo4jConnection:
    """Small wrapper around a Neo4j driver for running Cypher queries.

    The driver is created when the object is constructed. If that fails, the
    error is printed and the instance is left without a driver, in which case
    ``query`` raises ``AssertionError``.
    """

    def __init__(self, uri, user, pwd):
        """Store the connection settings and try to create the driver.

        Args:
            uri: Database address handed to ``GraphDatabase.driver``.
            user: User name used for authentication.
            pwd: Password used for authentication.
        """
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            # Best effort: report the problem and keep the object usable for close()
            print("Failed to create the driver:", e)

    def close(self):
        """Close the driver if one was created; otherwise do nothing."""
        if self.__driver is not None:
            self.__driver.close()

    def query(self, query, db=None, parameters=None):
        """Run a Cypher query and return all result records as a list.

        Args:
            query: The Cypher statement to execute.
            db: Optional database name; when ``None`` the session is opened
                without a ``database`` argument.
            parameters: Optional dict of query parameters (``$name``
                placeholders in ``query``). Prefer this over formatting
                values into the query string.

        Returns:
            A list with the records produced by the query, or ``None`` when
            the query failed (the error is printed, not raised).

        Raises:
            AssertionError: If the driver could not be created.
        """
        # Raised explicitly instead of using `assert`, so the check is still
        # performed when Python runs with -O (which strips assert statements).
        if self.__driver is None:
            raise AssertionError("Driver not initialized!")
        session = None
        response = None
        try:
            session = self.__driver.session(database=db) if db is not None else self.__driver.session()
            # Materialise the result while the session is still open
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            if session is not None:
                session.close()
        return response

Below, the Neo4j username, password and URL can be given. Note that you need the credentials of your own database; the ones shown below will not work. <br> After this, the connection with the Neo4j database is made.

In [8]:
# Give username, password and database URL.
# Each value can be overridden with an environment variable of the same name,
# so real credentials do not have to be stored in the notebook; the literals
# below are only the fallback defaults.
NEO4J_USER = os.environ.get('NEO4J_USER', 'neo4j')
NEO4J_PWD = os.environ.get('NEO4J_PWD', 'metapath')
BOLT_URL = os.environ.get('BOLT_URL', 'bolt://localhost:7687/')

# Establish connection with Neo4j
conn = Neo4jConnection(uri=BOLT_URL, user=NEO4J_USER, pwd=NEO4J_PWD)

## Check for necessary folders

In [9]:
# Start every run from an empty 'raw' folder: if it already exists it is
# removed together with its contents, then it is (re)created along with any
# parent folders that are still missing.
RAW_DIR = '2-Pytorch geometric data/TU/SMILES/raw'

if os.path.exists(RAW_DIR):
    shutil.rmtree(RAW_DIR)

# makedirs creates every missing intermediate folder in one call
os.makedirs(RAW_DIR)

## Obtain the data
Every file listed above is created with the code below. First, the adjacency matrix together with the bond types is taken from the database.

In [10]:
# Fetch every pair of atoms of the same molecule that share a bond, together
# with the type of that bond
bond_query = (
    "MATCH (m:Molecule)-[:HAS_ATOM]->(a1:Atom)-[:BONDED_WITH]->(b:Bond)"
    "<-[:BONDED_WITH]-(a2:Atom)<-[:HAS_ATOM]-(m) "
    "RETURN ID(a1), ID(a2), b.bond_type"
)
bond_records = conn.query(bond_query)
bonds = pd.DataFrame(bond_records, columns=['ID(a1)', 'ID(a2)', 'bond_type'])

# Turn the categorical values into integers
bonds = to_int(bonds)

# Re-number the atom IDs: each distinct value found in 'ID(a1)' is replaced,
# in both ID columns, by its position in the list of distinct values
id_columns = ['ID(a1)', 'ID(a2)']
original_ids = list(bonds['ID(a1)'].unique())
sequential_ids = range(0, bonds['ID(a1)'].nunique())

bonds2 = bonds.copy()
bonds2[id_columns] = bonds2[id_columns].replace(original_ids, sequential_ids)
bonds2

Failed to write data to connection IPv4Address(('localhost', 7687)) (IPv4Address(('127.0.0.1', 7687)))


Unnamed: 0,ID(a1),ID(a2),bond_type
0,0,1,1
1,0,2,2
2,1,0,1
3,2,0,2
4,3,0,2
...,...,...,...
1365,664,661,2
1366,659,661,2
1367,661,659,2
1368,660,659,2


Split the output above into the adjacency matrix and the edge labels (bond type), and export both in the right .txt format.

In [11]:
# Adjacency list: the two re-numbered atom ID columns, each shifted up by one
# (presumably because the target file format counts nodes from 1 - confirm)
adj_matrix = bonds2[['ID(a1)', 'ID(a2)']].add(1)
adj_matrix.to_csv(
    '2-Pytorch geometric data/TU/SMILES/raw/SMILES_A.txt',
    header=False,
    index=False,
)  # comma-separated values written to a .txt file

# Edge labels: one bond type per row, in the same row order as the adjacency list
edge_labels = bonds2['bond_type']
edge_labels.to_csv(
    '2-Pytorch geometric data/TU/SMILES/raw/SMILES_edge_labels.txt',
    header=False,
    index=False,
)  # comma-separated values written to a .txt file

Query the graph ID from Neo4j (each molecule has a different ID) and export them to the .txt file.

In [12]:
# Look up, for every atom, the molecule (graph) it is attached to
atom_molecule_query = "MATCH (a:Atom)<-[]-(m:Molecule) RETURN ID(a), m.id"
graph_id = pd.DataFrame(conn.query(atom_molecule_query), columns=['ID', 'molecule'])

# Work on a copy so the raw query result stays available in `graph_id`
graph_id2 = to_int(graph_id.copy())

# NOTE(review): rows are written in whatever order the query returned them -
# confirm this matches the node numbering used in SMILES_A.txt
graph_id2['molecule'].to_csv(
    '2-Pytorch geometric data/TU/SMILES/raw/SMILES_graph_indicator.txt',
    header=False,
    index=False,
)  # comma-separated values written to a .txt file

Get all the atom features from Neo4j

In [13]:
# Fetch the properties of every atom; one row per atom, one column per property
atom_columns = ['symbol', 'degree', 'valence', 'atom_type', 'charge', 'hybridization']
atom_query = (
    "MATCH (a:Atom) "
    "RETURN a.symbol, a.degree, a.valence, a.atom_type, a.charge, a.hybridization"
)
att_atom = pd.DataFrame(conn.query(atom_query), columns=atom_columns)

# Turn the categorical values into integers
att_atom = to_int(att_atom)
att_atom

Unnamed: 0,symbol,degree,valence,atom_type,charge,hybridization
0,1,4,1,1,0.1711,1
1,2,4,4,2,0.3126,1
2,1,4,1,1,0.1711,1
3,1,4,1,1,0.1711,1
4,1,4,2,1,0.1429,1
...,...,...,...,...,...,...
660,3,2,2,3,0.1798,1
661,4,4,5,6,0.5404,1
662,3,2,1,3,0.1143,1
663,3,2,1,3,0.1143,1


One hot encode the categorical variables

In [14]:
# Replace each categorical column by one indicator column per category,
# named '<column>_<category>'
categorical_columns = ['valence', 'atom_type', 'hybridization']
val, at, hyb = (
    pd.get_dummies(att_atom[name], prefix=name) for name in categorical_columns
)

# Append the indicator columns in the same order as the columns they replace,
# then drop the original categorical columns
encoded = att_atom
for dummies in (val, at, hyb):
    encoded = encoded.join(dummies)
att_atom = encoded.drop(columns=categorical_columns)
att_atom

Unnamed: 0,symbol,degree,charge,valence_1,valence_2,valence_3,valence_4,valence_5,valence_6,atom_type_1,...,atom_type_7,atom_type_8,atom_type_9,atom_type_10,atom_type_11,atom_type_12,atom_type_13,atom_type_14,hybridization_1,hybridization_2
0,1,4,0.1711,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,2,4,0.3126,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,4,0.1711,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,1,4,0.1711,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,1,4,0.1429,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
660,3,2,0.1798,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
661,4,4,0.5404,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
662,3,2,0.1143,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
663,3,2,0.1143,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


Export the node labels (atom type) and attributes

In [15]:
# Get the node labels (atom type): the integer-coded 'symbol' column, with
# missing values mapped to 0.
# np.nan is used because the np.NAN alias was removed in NumPy 2.0.
node_labels = att_atom['symbol'].replace(np.nan, 0).astype(int)
node_labels.to_csv('2-Pytorch geometric data/TU/SMILES/raw/SMILES_node_labels.txt', index=False, header=False, na_rep='NaN') # Export csv as .txt

# Get the node attributes: every remaining column except 'symbol'.
# Missing values are written as the literal text 'NaN'.
node_att = att_atom.drop(columns=['symbol'])
node_att.to_csv('2-Pytorch geometric data/TU/SMILES/raw/SMILES_node_attributes.txt', index=False, header=False, na_rep='NaN') # Export csv as .txt

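Create the desired y variable. For this test, all molecules that are part of a pathway ending in the chosen end molecule get label 1; all other molecules get label 0.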

In [16]:
# Build the graph-level target y: a molecule gets label 1 when it belongs to a
# study whose final product is `end_mol`, and label 0 otherwise
end_mol = 'gamma-butyrobetaine-CoA' # Determine end molecule name

# Get a list of studies that have the end molecule as final product (it is
# produced by something and does not react any further).
# NOTE(review): `end_mol` is formatted straight into the Cypher text; that is
# fine for the constant above but must not be fed with untrusted input.
# .tolist() converts NumPy scalars to plain Python values, so that formatting
# `mols` into the next query gives a valid Cypher list (NumPy 2 would
# otherwise render entries as e.g. np.int64(1)).
mols = pd.DataFrame(conn.query(f"MATCH ()-[:PRODUCES]->(m:Molecule) WHERE m.name = '{end_mol}' AND NOT (m)-[:REACTS_IN]->()  RETURN (m.studyid)"))[0].unique().tolist()

# Get a list of all molecules that are in a pathway with the end molecule as final product
one = pd.DataFrame(conn.query(f"MATCH (m:Molecule) WHERE m.studyid in {mols} RETURN ID(m), m.id"), columns=['ID','molecule'])

# Get a list of all molecules
graph = pd.DataFrame(conn.query("MATCH (m:Molecule) RETURN ID(m), m.id"), columns=['ID','molecule'])

# Label is 1 for every molecule whose ID occurs in `one`, 0 for all others
# (a vectorised membership test instead of comparing every pair of IDs)
graph['label'] = graph['ID'].isin(one['ID']).astype(int)

# Output the label
graph['label'].to_csv('2-Pytorch geometric data/TU/SMILES/raw/SMILES_graph_labels.txt', index=False, header=False, na_rep='NaN') # Export csv as .txt