# MOLfiles to Neo4j
This notebook converts MOLfiles to a Neo4j graph.

## Import of the necessary packages 

In [1]:
from __future__ import print_function
import os, glob
from pathlib import Path
import subprocess
import shutil
from rdkit import Chem
from rdkit.Chem import AllChem, rdmolops, Lipinski
import itertools
import numpy as np
from functions import *
import pandas as pd
from neo4j import GraphDatabase
import openbabel

## Connect with Neo4j
Using the official Neo4j Python driver, the code below lets you connect to a Neo4j database of your choice.
<br>This code is based on Neo4j's example: https://neo4j.com/docs/api/python-driver/current/

In [2]:
class Neo4jConnection:
    """
    This class establishes the connection with Neo4j and enables storing and retrieving data with Cypher queries.
    The code is based on the official Neo4j documentation:
    https://neo4j.com/docs/api/python-driver/current/

    Parameters
    ----------
    uri : str
        Address of the database, e.g. a bolt URL.
    user : str
        User name used for authentication.
    pwd : str
        Password used for authentication.
    """
    def __init__(self, uri, user, pwd):
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            # Best effort: report the problem and leave the driver unset;
            # query() refuses to run without a driver.
            print("Failed to create the driver:", e)

    def close(self):
        """Close the underlying driver, if one was created."""
        if self.__driver is not None:
            self.__driver.close()

    def query(self, query, db=None, parameters=None):
        """
        Run a Cypher query and return all resulting records as a list.

        Parameters
        ----------
        query : str
            The Cypher text to run.
        db : str, optional
            Name of the database to run against; the driver's default
            database is used when omitted.
        parameters : dict, optional
            Values for `$name` placeholders in `query`. Passing values this
            way avoids pasting them into the query text, so they need no
            manual quoting or escaping.

        Returns
        -------
        list or None
            The records produced by the query, or None when the query
            failed (the error is printed instead of raised).
        """
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        response = None
        try:
            session = self.__driver.session(database=db) if db is not None else self.__driver.session()
            # Materialise the result before the session is closed
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            if session is not None:
                session.close()
        return response

Below, the Neo4j username, password and URL can be given. Note that you need the credentials of your own database; any values shown in this notebook will not work for you. <br> After this, the connection with the Neo4j database is made.

In [3]:
# Give username, password and database URL.
# The values are read from the environment so that no secret is stored in the
# notebook; if NEO4J_PWD is not set, the password is asked for interactively.
from getpass import getpass

NEO4J_USER = os.environ.get('NEO4J_USER', 'neo4j')
NEO4J_PWD = os.environ.get('NEO4J_PWD') or getpass('Neo4j password: ')
BOLT_URL = os.environ.get('BOLT_URL', 'bolt://localhost:7687/')

# Establish connection with Neo4j
conn = Neo4jConnection(uri=BOLT_URL, user=NEO4J_USER, pwd=NEO4J_PWD)

## MOL to cypher

Choose the folder containing the MOLfiles that have to be converted to Cypher (**no spaces in the filenames**). Reactant filenames start with the letter R and product filenames start with the letter P, both followed by a number. A reactant and its corresponding product share the same number. <br>
Note that this code is written to deal with one reaction at a time only, not with reaction pathways where the product of one reaction is the reactant of another.

In [4]:
# Folder that holds the input MOLfiles
mols = '1.1-MOL'
# Collect the file names found in that folder, in sorted order
files = sorted(os.listdir(mols))

If the MOL2 folder does not exist, create it

In [5]:
# Make sure the MOL2 output folder exists.
# exist_ok=True makes this a no-op when the folder is already there, without
# the separate "check, then create" step that can race with another process.
os.makedirs('1-MOL2', exist_ok=True)

The molecules have to be sorted so that the reactant and product are run in the algorithm successively.

In [6]:
# Work list in processing order: a reactant is placed directly before its product
f_sorted = []

for fname in files:
    if fname.startswith('P'):
        # Product: queue the matching reactant (same name, leading 'R') first, if it exists
        reactant = 'R' + fname[1:]
        if reactant in files:
            f_sorted.append(reactant)
        f_sorted.append(fname)
        continue

    # Any other file is queued once, unless it was already pulled in above
    if fname not in f_sorted:
        f_sorted.append(fname)


The reaction folder name has to be given below. As only the filenames are used, it is also possible to provide a list of names that combine the reactant and the product instead. For example, 'R01P01' is reactant R01 reacting to form product P01.

In [7]:
# Folder that holds the reaction files
rxn = '1.1-reactions'
# Entries found in that folder; these serve as the list of reactions
rxn_lst = list(os.listdir(rxn))

Delete, if it is there, the existing 'MOL_to_cypher.cypher' file

In [8]:
# Start clean: drop a query file left behind by an earlier run
if os.path.isfile('MOL_to_cypher.cypher'):
    os.remove('MOL_to_cypher.cypher')

### The main algorithm

The code below creates Cypher queries and runs them in Neo4j. Queries are created for the reaction, molecule, atom, bond and ring nodes together with their edges. This is done with the following steps for each molecule:<br>
1. Load the MOLfiles with RDKit, this is used to get the most features for the different nodes
2. Create the molecule node and its features
3. Convert to a MOL2 file with OpenBabel for additional atom features
4. Use the ringinfo function from the functions.py file to get ring information
5. If applicable, create the ring nodes and an edge with the molecule
6. Create the atom nodes and their edges with the molecule node, and the ring node(s) if this is applicable
7. Create the bond nodes and their edges with the molecule and atom nodes
8. Run the queries created above in the Neo4j database
9. Create all edges between the rings and their corresponding bonds, if applicable, and run it in Neo4j
10. If the molecule is a product, find the corresponding reactant and create a reaction node connecting them, then run it in Neo4j

In [9]:
# Main conversion loop.
# For every MOLfile in f_sorted this builds the Cypher text for the molecule,
# its rings, atoms and bonds, runs it in Neo4j, links rings to their bonds and,
# for products, creates the reaction node that connects reactant and product.
for f in f_sorted:
    # Molecule name = file name without extension; it is used both as node id
    # and as Cypher variable name
    mol_name = os.path.splitext(f)[0]
    mol_path = mols + '/' + f
    mol2_path = '1-MOL2/' + mol_name + '.mol2'

    # Load the molecule with RDKit and fetch its conformer (atom positions)
    rdmol = Chem.MolFromMolFile(mol_path)
    if rdmol is None:
        # MolFromMolFile signals a parse failure by returning None
        raise ValueError(f"RDKit could not read '{mol_path}' as a MOLfile")
    conf = rdmol.GetConformer()
    # Get SMILES string
    smiles = Chem.MolToSmiles(rdmol)
    # SMILES may contain backslashes (bond stereo); escape them, and quotes,
    # so the value is a valid single-quoted Cypher string literal
    smiles_cypher = smiles.replace('\\', '\\\\').replace("'", "\\'")

    # All Cypher fragments for this molecule, joined into one statement below
    queries = []

    # Cypher: create the molecule
    queries.append(
        f"MERGE ({mol_name}:Molecule {{id:'{mol_name}'}})\n"
        f"ON CREATE\n"
        f"SET {mol_name}.smiles = '{smiles_cypher}', {mol_name}.Hdonors = {Chem.Lipinski.NumHDonors(rdmol)}, {mol_name}.name = '{mol_name}'\n"
    )

    # Convert to MOL2 with openbabel. An argument list (no shell) is used so
    # the file names are passed through unchanged; the tool's messages are
    # echoed so they still show up in the notebook.
    converter = subprocess.run(
        ['obabel', '-imol', mol_path, '-omol2', '-O' + mol2_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    if converter.stdout:
        print(converter.stdout.strip())

    # Get features from MOL2 file: one [charge, atom_type] pair per atom line
    mf = []
    in_atom_section = False
    with open(mol2_path) as mol2:
        for line in mol2:
            stripped = line.strip()
            # A record type indicator starts a new section; only the ATOM
            # section is of interest here
            if stripped.startswith('@<TRIPOS>'):
                in_atom_section = stripped.startswith('@<TRIPOS>ATOM')
                continue
            if not in_atom_section or not stripped:
                continue

            # Split on whitespace instead of slicing fixed character columns,
            # which can clip a sign or digit when a field is wider than expected.
            # Field order: id, name, x, y, z, atom type, subst id, subst name, charge
            fields = stripped.split()
            if len(fields) < 9:
                raise ValueError(f"Unexpected atom line in '{mol2_path}': {stripped!r}")
            mf.append([fields[8], fields[5]])
    # NOTE(review): mf is indexed with the RDKit atom index below; this assumes
    # RDKit and the MOL2 file list the atoms in the same order - confirm for
    # inputs that contain explicit hydrogens.

    # Get the information of all the rings using the ringinfo function from functions.py
    ringlst, asys, arings, rings = ringinfo(rdmol)

    # Cypher: create all the rings
    for ring in ringlst:
        ring_id = ring[0]
        rtype = ring_id[:ring_id.index("_")]  # Ring type = part of the ring id before the first underscore
        set_clause = f"SET {ring_id}.ring_type = '{rtype}', {ring_id}.size = {ring[1]}\n"
        set_clause = set_clause.replace("ring_type = 'ring'", "ring_type = 'aliphatic'")  # Replace ring by aliphatic
        queries.append(
            f"MERGE ({ring_id}:Ring {{id:'{mol_name}_{ring_id}'}})\n"  # Create the ring node
            f"ON CREATE\n"
            + set_clause  # Add the additional features
            + f"MERGE ({mol_name}) -[:HAS_RING]-> ({ring_id})\n"  # Edge between the molecule and ring nodes
        )

    # Write cypher line for each atom
    for atom in rdmol.GetAtoms():
        idx = atom.GetIdx()  # Get atom index
        at_name = mol_name + '_' + str(idx)  # Create atom name by adding the index to the mol name
        # Position of THIS atom (previously the position of atom 0 was used for every atom)
        coords = np.array(conf.GetAtomPosition(idx))

        com = f"MERGE ({at_name}:Atom {{id:'{at_name}'}})\n"  # Create the atom node

        # Add the additional atom features
        com1 = f"ON CREATE\n"
        com2 = f"""
                    SET {at_name}.symbol = '{atom.GetSymbol()}', {at_name}.degree = {atom.GetTotalDegree()}, {at_name}.charge = {mf[idx][0]}, {at_name}.atom_type = '{mf[idx][1]}',
                    {at_name}.hybridization = '{atom.GetHybridization()}', {at_name}.valence = {atom.GetExplicitValence()}, {at_name}.x = {coords[0]}, {at_name}.y = {coords[1]}\n
                    """

        com3 = f"MERGE ({mol_name}) -[:HAS_ATOM]-> ({at_name})\n"  # Create the edge between the molecule and atom nodes
        queries.append(com + com1 + com2 + com3)

        # For every ring collection returned by ringinfo, check if the atom is
        # in one of its rings; if so create a ring-atom edge
        for membership in (asys, arings, rings):
            for ring_key in membership:
                if idx in membership[ring_key]:
                    queries.append(f"MERGE ({ring_key}) -[:HAS_ATOM]-> ({at_name})\n")

    # Write cypher line for each bond
    for bond in rdmol.GetBonds():
        begin = bond.GetBeginAtomIdx()  # Get the begin atom of the bond
        end = bond.GetEndAtomIdx()  # Get the end atom of the bond
        bo_name = mol_name + '_' + str(begin) + '_' + str(end)  # Create bond name

        # Get the coordinates of the two atoms and calculate the distance
        at1Coords = np.array(conf.GetAtomPosition(begin))
        at2Coords = np.array(conf.GetAtomPosition(end))
        dist = np.linalg.norm(at2Coords - at1Coords)

        # Get the bond type; aromatic bonds are always labelled 'aromatic'
        btype = str(bond.GetBondType()).lower()
        if bond.GetIsAromatic():
            btype = 'aromatic'

        queries.append(
            f"MERGE ({bo_name}:Bond {{id:'{bo_name}'}})\n"  # Create bond node
            f"ON CREATE\n"
            f"SET {bo_name}.bond_type = '{btype}', {bo_name}.distance = {dist}\n"  # Additional bond features
            f"MERGE ({mol_name}) -[:HAS_BOND]-> ({bo_name})\n"  # Create molecule-bond edge
            f"MERGE ({mol_name+'_'+str(begin)}) -[:BONDED_WITH]-> ({bo_name}) <-[:BONDED_WITH]- ({mol_name+'_'+str(end)})\n"  # Create atom-bond edges
        )

    queries.append(';\n')  # Add to close cypher part of the specific molecule
    mol_cypher = ''.join(queries)

    # Write the statement to the cypher file (opening with 'w' replaces any
    # leftover file), run it in Neo4j and remove the file again
    with open('MOL_to_cypher.cypher', 'w') as w:
        w.write(mol_cypher)
    conn.query(mol_cypher)
    os.remove('MOL_to_cypher.cypher')

    # Create cypher command to create an edge between the ring and its bonds
    com = f"MATCH (m:Ring)-[:HAS_ATOM]->(a1:Atom)-[:BONDED_WITH]->(b:Bond)<-[:BONDED_WITH]-(a2:Atom)<-[:HAS_ATOM]-(m)\n"
    com1 = f"MERGE (m)-[:HAS_BOND]->(b)\n"
    com2 = f";\n"
    # Run in Neo4j
    conn.query(com + com1 + com2)

    # Add reaction node and connected edges
    if mol_name.startswith('P'):
        # The reactant shares the product's name with a leading 'R'; using the
        # two names directly works for names of any length
        reactant_name = 'R' + mol_name[1:]
        name = reactant_name + mol_name

        reaction_cypher = (
            f"MERGE ({name}:Reaction{{id:'{name}'}})\n"  # Create reaction node
            f"ON CREATE\n"
            f"SET {name}.type = 'alcohol'\n"  # Additional reaction features
            # Match the two molecules and create the molecule-reaction edges
            f"WITH {name}\n"
            f"MATCH (p:Molecule),(r:Molecule)\n"
            f"WHERE p.name = '{mol_name}' AND r.name = '{reactant_name}'\n"
            f"MERGE (r)-[:REACTS_IN]->({name})-[:PRODUCES]->(p)\n"
            f";\n"
        )

        # Write the statement to the cypher file and run it in Neo4j
        with open('MOL_to_cypher.cypher', 'w') as w:
            w.write(reaction_cypher)
        conn.query(reaction_cypher)

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
