# Align Murcko Scaffolds

In [1]:
%load_ext autoreload
%autoreload 2

### Libraries

In [2]:
import open3d as o3d
import numpy as np
import seaborn as sns
import pandas as pd

from numpy.random import default_rng

import re, os
from io import StringIO

import tqdm
from tqdm.auto import trange

import py3Dmol

import molgrid

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import PandasTools
from rdkit.Chem.Scaffolds import MurckoScaffold as MS

from openbabel import pybel

import ipywidgets as widgets

import copy

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


  return f(*args, **kwds)
  return f(*args, **kwds)
INFO - 2021-09-13 18:02:50,012 - __init__ - Enabling RDKit 2021.03.1 jupyter extensions


In [3]:
from utils import show_molecule_idx, show_all_conformers
from utils import AlignShow, translate_and_rotate

import sys

sys.path.append("../../ligan-EVOTEC")

from molgrid_to_pcd import mol_to_grid, grid_to_pcd
from molgrid_diff import grid_diff
from fit_to_grid import molgrid_diff_to_mol

In [4]:
path = "ligands/CDK2"
files = [
    os.path.join(path, f)
    for f in os.listdir(path)
    if os.path.splitext(f)[-1] == ".pcd" and os.path.splitext(f)[0][-4:] == "tran"
]

# Order ligands
# This should make the three chemical series pop-up in the PCD fit
names = {
    "4ek4_B_1CK": "CS1",
    "4ek5_B_03K": "CS3",
    "4fkg_B_4CK": "CS4",
    "4fki_B_09K": "CS9",
    "4fkj_B_11K": "CS11",
    "3sw4_B_18K": "CS18",
    "3sw7_B_19K": "CS19",
    "4fko_B_20K": "CS20",
    "4fkp_B_LS5": "CS241",
    "4fkq_B_42K": "CS242",
    "4fkr_B_45K": "CS245",
    "4fks_B_46K": "CS246",
    "4fkt_B_48K": "CS248",
    "4fku_D_60K": "CS260",
    "4fkv_B_61K": "CS261",
    "4fkw_B_62K": "CS262",
}

files.sort(
    key=lambda f: int(
        names[os.path.splitext(os.path.basename(f))[0].replace("_tran", "")].replace(
            "CS", ""
        )
    )
)

print(files)

pcds = []
mols = []
for f in files:
    pcd = o3d.io.read_point_cloud(f)
    pcds.append(pcd)

    s = Chem.SDMolSupplier(f.replace(".pcd", ".sdf"))
    mol = next(s)
    mols.append(mol)

['ligands/CDK2/4ek4_B_1CK_tran.pcd', 'ligands/CDK2/4ek5_B_03K_tran.pcd', 'ligands/CDK2/4fkg_B_4CK_tran.pcd', 'ligands/CDK2/4fki_B_09K_tran.pcd', 'ligands/CDK2/4fkj_B_11K_tran.pcd', 'ligands/CDK2/3sw4_B_18K_tran.pcd', 'ligands/CDK2/3sw7_B_19K_tran.pcd', 'ligands/CDK2/4fko_B_20K_tran.pcd', 'ligands/CDK2/4fkp_B_LS5_tran.pcd', 'ligands/CDK2/4fkq_B_42K_tran.pcd', 'ligands/CDK2/4fkr_B_45K_tran.pcd', 'ligands/CDK2/4fks_B_46K_tran.pcd', 'ligands/CDK2/4fkt_B_48K_tran.pcd', 'ligands/CDK2/4fku_D_60K_tran.pcd', 'ligands/CDK2/4fkv_B_61K_tran.pcd', 'ligands/CDK2/4fkw_B_62K_tran.pcd']


In [5]:
_ = widgets.interact(
    lambda index: show_molecule_idx(index, mols),
    index=widgets.IntSlider(min=0, max=len(mols) - 1, step=1, value=1),
)

interactive(children=(IntSlider(value=1, description='index', max=15), Output()), _dom_classes=('widget-intera…

In [6]:
msMols = [MS.GetScaffoldForMol(mol) for mol in mols]

In [7]:
_ = widgets.interact(
    lambda index: show_molecule_idx(index, msMols),
    index=widgets.IntSlider(min=0, max=len(mols) - 1, step=1, value=1),
)

interactive(children=(IntSlider(value=1, description='index', max=15), Output()), _dom_classes=('widget-intera…

Write scaffolds to file:

In [8]:
for idx, mol in enumerate(msMols):
    # Randomly translate and rotate Murcko scaffolds
    translate_and_rotate(mol)

    with Chem.SDWriter(os.path.join(path, f"murcko_{idx}.sdf")) as w:
        w.write(mol, confId=0)

In [9]:
mkpcds = []
mkmols = []

files = [os.path.join(path, f"murcko_{i}.sdf") for i in range(len(mols))]

print(files)

typer = molgrid.FileMappedGninaTyper("../files/ligmap")

for f in tqdm.tqdm(files):
    # Load molecule as RDKit molecule
    s = Chem.SDMolSupplier(f)
    mol = next(s)
    mkmols.append(mol)

    # Load molecule as OpenBabel molecule
    obmol = next(pybel.readfile("sdf", f))

    grid, center = mol_to_grid(obmol, 23.5, 0.5, typer)
    pcd = grid_to_pcd(
        grid,
        center,
        23.5,
        0.5,
        typer,
    )

    # pcd = o3d.io.read_point_cloud(f)
    mkpcds.append(pcd)

print(mkpcds)

['ligands/CDK2/murcko_0.sdf', 'ligands/CDK2/murcko_1.sdf', 'ligands/CDK2/murcko_2.sdf', 'ligands/CDK2/murcko_3.sdf', 'ligands/CDK2/murcko_4.sdf', 'ligands/CDK2/murcko_5.sdf', 'ligands/CDK2/murcko_6.sdf', 'ligands/CDK2/murcko_7.sdf', 'ligands/CDK2/murcko_8.sdf', 'ligands/CDK2/murcko_9.sdf', 'ligands/CDK2/murcko_10.sdf', 'ligands/CDK2/murcko_11.sdf', 'ligands/CDK2/murcko_12.sdf', 'ligands/CDK2/murcko_13.sdf', 'ligands/CDK2/murcko_14.sdf', 'ligands/CDK2/murcko_15.sdf']


100%|██████████| 16/16 [00:03<00:00,  5.00it/s]

[PointCloud with 425 points., PointCloud with 490 points., PointCloud with 509 points., PointCloud with 572 points., PointCloud with 858 points., PointCloud with 602 points., PointCloud with 606 points., PointCloud with 597 points., PointCloud with 572 points., PointCloud with 633 points., PointCloud with 934 points., PointCloud with 729 points., PointCloud with 540 points., PointCloud with 541 points., PointCloud with 893 points., PointCloud with 567 points.]





In [10]:
# molid = 8

# Add reference molecule to fragments
# allmols = [mols[molid]] + mkmols
# allpcds = [pcds[molid]] + mkpcds

#als = AlignShow(allmols, allpcds)

Look at the best alignment between molecule `molid` (which has been appended at position `0`) any all Murcko's scaffolds.

#s, (i,j) = als.best_with(0) # molid has been appended in position 0

In [11]:
# als.scores

With this dataset (where ligands come from the same chemical series), the best aligned scaffold is not necessarily `molid` because other scaffold can be similar but larger. The best scaffold is clearly well aligned and has a reasonable score:

In [12]:
# _ = widgets.interact(lambda index: als.show(0, index), index=widgets.IntSlider(min=0, max=len(mols)-1, step=1, value=1))

## Align Murcko Scaffold with Original Molecule

### Reconstruction

In [13]:
def reconstruction(fname, verbose=False):
    """
    Reconstruct molecule from file.
    Files is assumed to contained the scaffold (confId=0)
    aligned to its original molecule (confId=1)
    """
    typer = molgrid.FileMappedGninaTyper("../files/ligmap")

    sdfile = pybel.readfile("sdf", fname)
    obmol1 = next(sdfile)
    obmol2 = next(sdfile)

    gdiff, c = grid_diff(obmol1, obmol2, 23.5, 0.5, typer)

    # Convert center to numpy array
    c = np.array([c[0], c[1], c[2]])

    npgdiff = gdiff.cpu().detach().numpy()

    # Load Murcko scaffold as RDKit molecule
    rdscaffold = next(Chem.SDMolSupplier(fname, removeHs=True))

    # Fit atoms into density difference
    # Link nearest atom from the fit to the scaffold to build whole molecule
    rdmolfinal = molgrid_diff_to_mol(
        npgdiff, c, 0.5, "../files/ligmap", rdscaffold, verbose=verbose
    )

    return rdmolfinal

In [14]:
molid = 0
als = AlignShow([mols[molid], mkmols[molid]], [pcds[molid], mkpcds[molid]])
s = als.align(1, 0)
print("Score:", s)
als.save(1, 0, f"murcko_self_{molid}.sdf")
als.show(1, 0)

Score: 0.7858823529411765


<py3Dmol.view at 0x7f8c64d17748>

In [15]:
rdmol0 = reconstruction("murcko_self_0.sdf")
show_all_conformers(rdmol0)

In [16]:
molid = 1
als = AlignShow([mols[molid], mkmols[molid]], [pcds[molid], mkpcds[molid]])
s = als.align(1, 0)
print("Score:", s)
als.save(1, 0, f"murcko_self_{molid}.sdf")
als.show(1, 0)

Score: 0.9489795918367347


<py3Dmol.view at 0x7f8c64d174e0>

In [17]:
rdmol1 = reconstruction("murcko_self_1.sdf")
show_all_conformers(rdmol1)

In [18]:
molid = 2
als = AlignShow([mols[molid], mkmols[molid]], [pcds[molid], mkpcds[molid]])
s = als.align(1, 0)
print("Score:", s)
als.save(1, 0, f"murcko_self_{molid}.sdf")
als.show(1, 0)

Score: 0.9489194499017681


<py3Dmol.view at 0x7f8c64d17898>

In [19]:
rdmol2 = reconstruction("murcko_self_2.sdf")
show_all_conformers(rdmol2)

In [20]:
molid = 3
als = AlignShow([mols[molid], mkmols[molid]], [pcds[molid], mkpcds[molid]])
s = als.align(1, 0)
print("Score:", s)
als.save(1, 0, f"murcko_self_{molid}.sdf")
als.show(1, 0)

Score: 0.9230769230769231


<py3Dmol.view at 0x7f8c64d32828>

In [21]:
rdmol3 = reconstruction("murcko_self_3.sdf")
show_all_conformers(rdmol3)

In [22]:
molid = 4
als = AlignShow([mols[molid], mkmols[molid]], [pcds[molid], mkpcds[molid]])
s = als.align(1, 0)
print("Score:", s)
als.save(1, 0, f"murcko_self_{molid}.sdf")
als.show(1, 0)

Score: 0.9393939393939394


<py3Dmol.view at 0x7f8c64cb6080>

In [23]:
# For molecule 4 the molecule itself is already a Murcko scaffold
# rdmol4 = reconstruction("murcko_self_4.sdf")
# show_all_conformers(rdmol4)

In [24]:
molid = 5
als = AlignShow([mols[molid], mkmols[molid]], [pcds[molid], mkpcds[molid]])
s = als.align(1, 0)
print("Score:", s)
als.save(1, 0, f"murcko_self_{molid}.sdf")
als.show(1, 0)

Score: 0.9518272425249169


<py3Dmol.view at 0x7f8c6b80a7f0>

In [25]:
rdmol5 = reconstruction("murcko_self_5.sdf")
show_all_conformers(rdmol5)

In [26]:
molid = 6
als = AlignShow([mols[molid], mkmols[molid]], [pcds[molid], mkpcds[molid]])
s = als.align(1, 0)
print("Score:", s)
als.save(1, 0, f"murcko_self_{molid}.sdf")
als.show(1, 0)

Score: 0.9356435643564357


<py3Dmol.view at 0x7f8d8c6b3160>

In [27]:
rdmol6 = reconstruction("murcko_self_6.sdf")
show_all_conformers(rdmol6)

In [28]:
molid = 7
als = AlignShow([mols[molid], mkmols[molid]], [pcds[molid], mkpcds[molid]])
s = als.align(1, 0)
print("Score:", s)
als.save(1, 0, f"murcko_self_{molid}.sdf")
als.show(1, 0)

Score: 0.6030150753768844


<py3Dmol.view at 0x7f8c64d32c18>

In [29]:
rdmol7 = reconstruction("murcko_self_7.sdf")
show_all_conformers(rdmol7)

RDKit ERROR: [18:03:26] Can't kekulize mol.  Unkekulized atoms: 28 29 30 32 34
RDKit ERROR: 


KekulizeException: Can't kekulize mol.  Unkekulized atoms: 28 29 30 32 34


In [30]:
molid = 8
als = AlignShow([mols[molid], mkmols[molid]], [pcds[molid], mkpcds[molid]])
s = als.align(1, 0)
print("Score:", s)
als.save(1, 0, f"murcko_self_{molid}.sdf")
als.show(1, 0)

Score: 0.7097902097902098


<py3Dmol.view at 0x7f8c64d32d30>

In [31]:
rdmol8 = reconstruction("murcko_self_8.sdf")
show_all_conformers(rdmol8)

RDKit ERROR: [18:04:35] Can't kekulize mol.  Unkekulized atoms: 27 28 29 30 32
RDKit ERROR: 


KekulizeException: Can't kekulize mol.  Unkekulized atoms: 27 28 29 30 32


In [32]:
molid = 9
als = AlignShow([mols[molid], mkmols[molid]], [pcds[molid], mkpcds[molid]])
s = als.align(1, 0)
print("Score:", s)
als.save(1, 0, f"murcko_self_{molid}.sdf")
als.show(1, 0)

Score: 0.5165876777251185


<py3Dmol.view at 0x7f8c64cb6518>

In [33]:
rdmol9 = reconstruction("murcko_self_9.sdf")
show_all_conformers(rdmol9)

Too many atoms?
