In [None]:
# Reload imported modules automatically before each cell runs, so edits to
# local source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import logging
import sys

In [None]:
from tqdm import tqdm
tqdm.pandas()  # Show progress bars on pandas functions

In [None]:
import numpy as np
import pandas as pd
from IPython.display import SVG
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import MolsToGridImage, MolToImage, rdMolDraw2D

In [None]:
# Prefer an installed ppqm; if it is not importable, add the parent of the
# current working directory to sys.path and retry
# (presumably the repository root when the notebook lives in a sub-folder).
try:
    import ppqm
except ModuleNotFoundError:
    import pathlib

    # NOTE: despite its name, `cwd` is the *parent* of the current working directory.
    cwd = pathlib.Path().resolve().parent
    sys.path.append(str(cwd))
    import ppqm

In [None]:
# Send log records to stdout (so they show up in the cell output) at INFO level,
# and set the "ppqm" and "xtb" loggers to INFO explicitly.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger("ppqm").setLevel(logging.INFO)
logging.getLogger("xtb").setLevel(logging.INFO)
# NOTE(review): not referenced by any cell visible in this notebook; the xTB
# calculator is created with a hard-coded show_progress=False. Confirm it is still needed.
show_progress = True

# Example: Fast and accurate prediction of the regioselectivity of electrophilic aromatic substitution reactions

The RegioSQM method protonates all aromatic C–H carbon atoms and identifies those whose protonated forms have the lowest free energies in **solvent**, computed with a semiempirical quantum chemical **method**, as the most nucleophilic centers.

As in the RegioSQM20 version, this example uses
**xTB GFN2** in **methanol**.

References
- https://doi.org/10.1039/C7SC04156J
- https://doi.org/10.1186/s13321-021-00490-7


In [None]:
# Protonation reactions, written for the Kekulé form of the molecule.
# In both, atom :1 is a non-aromatic ring carbon with one hydrogen that is
# double-bonded to a ring C or N (atom :2). In the product, atom :1 becomes a
# CH2 and atom :2 carries the positive charge.
# reaction1: atom :2 has one hydrogen.
reaction1 = AllChem.ReactionFromSmarts("[C;R;H1:1]=[C,N;R;H1:2]>>[CH2:1][*H+:2]")
# reaction2: atom :2 has no hydrogen.
reaction2 = AllChem.ReactionFromSmarts("[C;R;H1:1]=[C,N;R;H0:2]>>[CH2:1][*+;H0:2]")

In [None]:
reaction1

In [None]:
reaction2

## Define a molecule you like

In [None]:
# Input molecule as SMILES; parsed into the RDKit molecule used by all later cells.
smiles = "Cc1cc(NCCO)nc(-c2ccc(Br)cc2)n1"  # CHEMBL1956589
molobj = Chem.MolFromSmiles(smiles)

In [None]:
molobj

In [None]:
# Switch `molobj` to an explicit Kekulé form and clear its aromatic flags.
# The SMARTS patterns in this notebook match explicit double bonds ("=") between
# non-aromatic atoms ("C"), so they would not match an aromatic representation.
Chem.Kekulize(molobj, clearAromaticFlags=True)

## Protonate all aromatic carbons




In [None]:
def get_target_atoms(molobj, target):
    """Return the atom indices of every match of a SMARTS pattern, flattened.

    ``molobj.GetSubstructMatches(target)`` yields one tuple of atom indices per
    match; the tuples are concatenated, in match order, into a single flat list.
    """
    flat_indices = []
    for match in molobj.GetSubstructMatches(target):
        # each match is a tuple of atom indices; append them in order
        flat_indices.extend(match)
    return flat_indices

In [None]:
# Code by Jan H. Jensen github.com/jensengroup/regiosqm


def _reparse_product(product):
    """Re-read a reaction product from its SMILES, rewriting "NH2+" to "N+" first."""
    product_smiles = Chem.MolToSmiles(product)
    product_smiles = product_smiles.replace("NH2+", "N+")
    return Chem.MolFromSmiles(product_smiles)


# One entry per protonated molecule; the two lists are kept in the same order.
molobjs = []
target_atoms = []

# Same patterns as the reactant sides of reaction1 / reaction2.
smarts_1 = Chem.MolFromSmarts("[C;R;H1:1]=[C,N;R;H1:2]")
smarts_2 = Chem.MolFromSmarts("[C;R;H1:1]=[C,N;R;H0:2]")
# Flattened match lists: [c_0, n_0, c_1, n_1, ...] (atom :1, atom :2 per match).
atoms_1 = get_target_atoms(molobj, smarts_1)
atoms_2 = get_target_atoms(molobj, smarts_2)

products_1 = reaction1.RunReactants((molobj,))
for product_index, product_set in enumerate(products_1):
    molobjs.append(_reparse_product(product_set[0]))
    # The flat list is indexed directly by product position, as in the original
    # RegioSQM code.
    # NOTE(review): presumably this relies on reaction1 yielding one product per
    # direction of its symmetric pattern (two products per match) - confirm.
    target_atoms.append(atoms_1[product_index])

products_2 = reaction2.RunReactants((molobj,))
for product_index, product_set in enumerate(products_2):
    molobjs.append(_reparse_product(product_set[0]))
    # The protonated carbon (atom :1) of the k-th match sits at position 2*k of
    # the flattened list. The previous `2 * (i - isav) - 2` used a counter that
    # was incremented *after* use, so the first product read index -2 and every
    # label was shifted by one match.
    target_atoms.append(atoms_2[2 * product_index])

In [None]:
MolsToGridImage(
    molobjs,
    molsPerRow=3,
    subImgSize=(250, 250),
    useSVG=True,
)

In [None]:
[Chem.MolToSmiles(m) for m in molobjs]

## Now let's find out which are most stable using quantum chemistry

In [None]:
# One row per protonated molecule, paired with the index of the atom it was protonated at.
df = pd.DataFrame(molobjs, columns=["molobj"]).assign(atom_index=target_atoms)

In [None]:
df

In [None]:
# Module-level xTB calculator; calculate_energy() reads it as a global.
# NOTE(review): presumably `scr` is the scratch directory, `n_cores` the number of
# cores to use and `cmd` the xtb executable name - confirm against the ppqm docs.
xtb = ppqm.xtb.XtbCalculator(
    scr="_tmp_directory_", n_cores=2, cmd="xtb", show_progress=False
)

Let's define a function that we can apply to each row of a pandas DataFrame. We want to calculate the energy for each protonation site, which requires some conformer expansion. We are only interested in the lowest conformer energy per site.

In [None]:
def calculate_energy(molobj):
    """Return the lowest conformer energy of one protonated molecule.

    Steps, for the given RDKit Mol object:

    - Generate conformers (at most 20) on a copy of the molecule
    - Run xTB (GFN2, ALPB methanol, with optimization) on all conformers
    - Collect the "scc_energy" of each conformer
    - Return the minimum, multiplied by ``ppqm.units.hartree_to_kcalmol``

    Uses the module-level ``xtb`` calculator.
    """

    # Expand conformers on a copy (via ppqm's copy helper), not on the caller's object
    conformers = ppqm.tasks.generate_conformers(
        ppqm.chembridge.copy_molobj(molobj), max_conformers=20
    )

    options = dict(gfn=2, alpb="Methanol", opt=None)

    energies = []
    for properties in xtb.calculate(conformers, options):
        energies.append(properties["scc_energy"])

    # Lowest conformer energy, converted with ppqm's hartree -> kcal/mol factor
    return np.min(energies) * ppqm.units.hartree_to_kcalmol

In [None]:
# example: reference_energy = calculate_energy(molobj)

In [None]:
%%time
# Run calculate_energy on every protonated molecule and store the result in a new
# "energy" column. progress_apply is the progress-bar variant of apply that
# tqdm.pandas() makes available.
df["energy"] = df["molobj"].progress_apply(calculate_energy)

In [None]:
# Energy of each site relative to the most stable one (lowest energy -> 0.0)
site_energies = df["energy"].to_numpy()
df["rel_energy"] = site_energies - site_energies.min()

In [None]:
df

In [None]:
cutoff1 = 1.0  # kcal/mol
cutoff2 = 3.0  # kcal/mol

# Atom indices of the sites whose relative energy lies below each cutoff
below_cutoff1 = df.loc[df["rel_energy"] < cutoff1, "atom_index"].values
below_cutoff2 = df.loc[df["rel_energy"] < cutoff2, "atom_index"].values

# green: below cutoff1; red: below cutoff2 but not already green
green = []
for atom_idx in below_cutoff1:
    green.append(int(atom_idx))  # rdkit int type

red = []
for atom_idx in below_cutoff2:
    if atom_idx in green:
        continue
    red.append(int(atom_idx))

highlights = green + red

# atom index -> list with a single RGB colour tuple
colormap = {}
for key in green:
    colormap[key] = [(0.0, 1.0, 0.0)]
for key in red:
    colormap[key] = [(1.0, 0.0, 0.0)]

In [None]:
# should be working, but does not respect colors
# MolToImage(
#    molobj,
#    highlightAtoms=highlights,
#    highlightMap=colormap,
#    size=(500,500),
# )

In [None]:
# http://rdkit.blogspot.com/2020/04/new-drawing-options-in-202003-release.html
d2d = rdMolDraw2D.MolDraw2DSVG(500, 500)
d2d.DrawMoleculeWithHighlights(
    molobj, "Regioselective site(s)", dict(colormap), {}, {}, {}
)
d2d.FinishDrawing()
SVG(d2d.GetDrawingText())