# Align Murcko Scaffold to Molecule

In this notebook we explore whether we can use SENSAAS with point clouds from `libmolgrid`-generated densities in order to align Murcko scaffolds to their original molecule.

In [1]:
%load_ext autoreload
%autoreload 2
%load_ext jupyter_black

In [2]:
# Coloring method of the pre-computed point clouds (PCDs) used in this notebook.
# It selects which "*_{method}.pcd" files are loaded and is part of the names of
# the "failed-{method}/" output directory and of the final CSV file.
#   sensaas: sensaas coloring method for PCDs
#   molgrid: libmolgrid coloring method for PCDs
method = "molgrid"

In [3]:
# Threshold to show failures: alignments whose final RMSD (RMSDf) exceeds this
# value are listed and saved as failed.
# NOTE(review): units are not stated in this notebook — presumably Å; confirm.
rmsd_threshold = 0.25

In [4]:
# Start from an empty output directory for failed alignments, so that files
# written by a previous run do not mix with the results of this run.
import os  # needed here: this cell runs before the main imports cell

# Do not call this variable `dir`: that would shadow the builtin for the rest
# of the kernel session.
failed_dir = f"failed-{method}/"

# Create the directory on a fresh checkout instead of failing in listdir().
os.makedirs(failed_dir, exist_ok=True)

for stale in os.listdir(failed_dir):
    os.remove(os.path.join(failed_dir, stale))

### Libraries

In [5]:
import sys

sys.path.append("../../")
sys.path.append("../")

In [6]:
import open3d as o3d
import pandas as pd
from collections import defaultdict

import re, os
from rdkit import Chem

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [7]:
from utils import align_and_show_scaffold, show_scaffold

Store RMSD before and after alignment (for both BRD4 and CDK2)

In [8]:
# Column name -> list of per-ligand values. Filled by the BRD4 and CDK2
# alignment loops and converted to a DataFrame for the result tables.
rmsds = defaultdict(list)

## BRD4 Inhibitors

In [9]:
# Directory with the BRD4 ligand files (.sdf molecules and pre-computed .pcd
# point clouds); re-assigned later for the CDK2 section.
path = "../ligands/BRD4"

Load original BRD4 inhibitors together with their point cloud representation (pre-computed):

In [10]:
# Collect the point clouds of the original (non-transformed) BRD4 ligands that
# were generated with the selected coloring method.
files = [
    os.path.join(path, f)
    for f in os.listdir(path)
    if os.path.splitext(f)[1] == ".pcd" and method in f and "tran" not in f
]

# Sort BRD4 ligand files by number.
# Raw string: "\d" in a plain string literal is an invalid escape sequence.
r = re.compile(r"\d{1,2}")
files.sort(key=lambda f: int(r.search(os.path.basename(f)).group()))

print(files)

# pcds[i] and mols[i] describe the same ligand (same order as `files`).
pcds = []
mols = []
for f in files:
    pcd = o3d.io.read_point_cloud(f)
    pcds.append(pcd)

    # The molecule is stored next to the point cloud: same file name with the
    # "_{method}" suffix removed and the .sdf extension.
    s = Chem.SDMolSupplier(f.replace(".pcd", ".sdf").replace(f"_{method}", ""))
    mol = next(s)  # only the first molecule of the SDF file is used
    mols.append(mol)

['../ligands/BRD4/ligand-1_molgrid.pcd', '../ligands/BRD4/ligand-2_molgrid.pcd', '../ligands/BRD4/ligand-3_molgrid.pcd', '../ligands/BRD4/ligand-4_molgrid.pcd', '../ligands/BRD4/ligand-5_molgrid.pcd', '../ligands/BRD4/ligand-6_molgrid.pcd', '../ligands/BRD4/ligand-7_molgrid.pcd', '../ligands/BRD4/ligand-8_molgrid.pcd', '../ligands/BRD4/ligand-9_molgrid.pcd', '../ligands/BRD4/ligand-10_molgrid.pcd']


Load original BRD4 Murcko scaffolds:

In [11]:
# Collect the SDF files of the original (non-transformed) BRD4 Murcko scaffolds.
files = [
    os.path.join(path, f)
    for f in os.listdir(path)
    if os.path.splitext(f)[1] == ".sdf" and "murcko" in f and "tran" not in f
]

# Sort BRD4 ligand files by number.
# Raw string: "\d" in a plain string literal is an invalid escape sequence.
r = re.compile(r"\d{1,2}")
files.sort(key=lambda f: int(r.search(os.path.basename(f)).group()))

print(files)

# MSmols[i] is the Murcko scaffold of ligand i (same order as `files`).
MSmols = []
for f in files:
    # Original Murcko scaffold PCDs are not needed, only the molecules.
    # The replace() calls are a no-op for names such as "ligand-1_murcko.sdf".
    s = Chem.SDMolSupplier(f.replace(".pcd", ".sdf").replace(f"_{method}", ""))
    mol = next(s)  # only the first molecule of the SDF file is used
    MSmols.append(mol)

['../ligands/BRD4/ligand-1_murcko.sdf', '../ligands/BRD4/ligand-2_murcko.sdf', '../ligands/BRD4/ligand-3_murcko.sdf', '../ligands/BRD4/ligand-4_murcko.sdf', '../ligands/BRD4/ligand-5_murcko.sdf', '../ligands/BRD4/ligand-6_murcko.sdf', '../ligands/BRD4/ligand-7_murcko.sdf', '../ligands/BRD4/ligand-8_murcko.sdf', '../ligands/BRD4/ligand-9_murcko.sdf', '../ligands/BRD4/ligand-10_murcko.sdf']


Load translated and rotated BRD4 Murcko scaffolds with their point-cloud representation (pre-computed):

In [12]:
# Collect the point clouds of the translated and rotated BRD4 Murcko scaffolds
# that were generated with the selected coloring method.
files = []
for f in os.listdir(path):
    fname, ext = os.path.splitext(f)
    if ext == ".pcd" and f"_murcko_tran_{method}" in fname:
        files.append(os.path.join(path, f))

# Sort BRD4 ligand files by number.
# Raw string: "\d" in a plain string literal is an invalid escape sequence.
r = re.compile(r"\d{1,2}")
files.sort(key=lambda f: int(r.search(os.path.basename(f)).group()))

print(files)

# tMSpcds[i] and tMSmols[i] describe the same transformed scaffold
# (same order as `files`).
tMSpcds = []
tMSmols = []
for f in files:
    pcd = o3d.io.read_point_cloud(f)
    tMSpcds.append(pcd)

    # "*_murcko_tran_{method}.pcd" -> "*_murcko_tran.sdf"
    s = Chem.SDMolSupplier(f.replace(".pcd", ".sdf").replace(f"_{method}", ""))
    mol = next(s)  # only the first molecule of the SDF file is used
    tMSmols.append(mol)

['../ligands/BRD4/ligand-1_murcko_tran_molgrid.pcd', '../ligands/BRD4/ligand-2_murcko_tran_molgrid.pcd', '../ligands/BRD4/ligand-3_murcko_tran_molgrid.pcd', '../ligands/BRD4/ligand-4_murcko_tran_molgrid.pcd', '../ligands/BRD4/ligand-5_murcko_tran_molgrid.pcd', '../ligands/BRD4/ligand-6_murcko_tran_molgrid.pcd', '../ligands/BRD4/ligand-7_murcko_tran_molgrid.pcd', '../ligands/BRD4/ligand-8_murcko_tran_molgrid.pcd', '../ligands/BRD4/ligand-9_murcko_tran_molgrid.pcd', '../ligands/BRD4/ligand-10_murcko_tran_molgrid.pcd']


In [13]:
# Align every transformed BRD4 scaffold and record the RMSD before (RMSDi) and
# after (RMSDf) alignment, together with the ligand name and its list index.
tag = f"_murcko_tran_{method}"
for i, f in enumerate(files):
    rmsd_i, rmsd_f, _, _ = align_and_show_scaffold(
        mols[i], pcds[i], tMSmols[i], tMSpcds[i], MSmols[i]
    )

    # Ligand name = file name without extension and without the suffix
    lig_name = os.path.splitext(os.path.basename(f))[0].replace(tag, "")

    record = (
        ("system", "BRD4"),
        ("lig", lig_name),
        ("RMSDi", rmsd_i),
        ("RMSDf", rmsd_f),
        ("idx", i),
    )
    for key, value in record:
        rmsds[key].append(value)

### Failures

In [14]:
# Tabulate the RMSDs recorded so far (only BRD4 at this point of the notebook).
df = pd.DataFrame(rmsds)

In [15]:
# BRD4 alignments whose final RMSD exceeds the failure threshold.
df[df.RMSDf > rmsd_threshold]

Unnamed: 0,system,lig,RMSDi,RMSDf,idx
2,BRD4,ligand-3,16.842481,5.467946,2
5,BRD4,ligand-6,18.104774,6.226867,5


In [16]:
# Visualise every failed BRD4 alignment and save the scaffold for inspection.
failed = df.loc[df.RMSDf > rmsd_threshold, ["idx", "lig"]]
for i, lig in zip(failed["idx"], failed["lig"]):
    p = show_scaffold(mols[i], tMSmols[i])

    # Writes conformer ID 1 of the transformed scaffold.
    # NOTE(review): presumably the conformer holding the aligned coordinates —
    # confirm against utils.align_and_show_scaffold.
    out_sdf = f"failed-{method}/{lig}_alignfail.sdf"
    with Chem.SDWriter(out_sdf) as w:
        w.write(tMSmols[i], confId=1)

    display(p)

<py3Dmol.view at 0x7f230708e520>

<py3Dmol.view at 0x7f230708e850>

## CDK2 Inhibitors

In [17]:
# Directory with the CDK2 ligand files (.sdf molecules and pre-computed .pcd
# point clouds); replaces the BRD4 path used above.
path = "../ligands/CDK2"

Load original CDK2 inhibitors together with their point cloud representation (pre-computed):

In [18]:
# Collect the point clouds of the original (non-transformed) CDK2 ligands that
# were generated with the selected coloring method.
files = [
    os.path.join(path, f)
    for f in os.listdir(path)
    if os.path.splitext(f)[1] == ".pcd" and method in f and "tran" not in f
]

# Order ligands
# This should make the three chemical series pop-up in the PCD fit
# Maps the ligand file name (without suffix) to its "CS<number>" label.
names = {
    "4ek4_B_1CK": "CS1",
    "4ek5_B_03K": "CS3",
    "4fkg_B_4CK": "CS4",
    "4fki_B_09K": "CS9",
    "4fkj_B_11K": "CS11",
    "3sw4_B_18K": "CS18",
    "3sw7_B_19K": "CS19",
    "4fko_B_20K": "CS20",
    "4fkp_B_LS5": "CS241",
    "4fkq_B_42K": "CS242",
    "4fkr_B_45K": "CS245",
    "4fks_B_46K": "CS246",
    "4fkt_B_48K": "CS248",
    "4fku_D_60K": "CS260",
    "4fkv_B_61K": "CS261",
    "4fkw_B_62K": "CS262",
}

# Sort by the numeric part of the "CS<number>" label.
files.sort(
    key=lambda f: int(
        names[
            os.path.splitext(os.path.basename(f))[0].replace(f"_{method}", "")
        ].replace("CS", "")
    )
)

print(files)

# pcds[i] and mols[i] describe the same ligand (same order as `files`).
pcds = []
mols = []
for f in files:
    pcd = o3d.io.read_point_cloud(f)
    pcds.append(pcd)

    # The molecule is stored next to the point cloud: same file name with the
    # "_{method}" suffix removed and the .sdf extension.
    s = Chem.SDMolSupplier(f.replace(".pcd", ".sdf").replace(f"_{method}", ""))
    mol = next(s)  # only the first molecule of the SDF file is used
    mols.append(mol)

['../ligands/CDK2/4ek4_B_1CK_molgrid.pcd', '../ligands/CDK2/4ek5_B_03K_molgrid.pcd', '../ligands/CDK2/4fkg_B_4CK_molgrid.pcd', '../ligands/CDK2/4fki_B_09K_molgrid.pcd', '../ligands/CDK2/4fkj_B_11K_molgrid.pcd', '../ligands/CDK2/3sw4_B_18K_molgrid.pcd', '../ligands/CDK2/3sw7_B_19K_molgrid.pcd', '../ligands/CDK2/4fko_B_20K_molgrid.pcd', '../ligands/CDK2/4fkp_B_LS5_molgrid.pcd', '../ligands/CDK2/4fkq_B_42K_molgrid.pcd', '../ligands/CDK2/4fkr_B_45K_molgrid.pcd', '../ligands/CDK2/4fks_B_46K_molgrid.pcd', '../ligands/CDK2/4fkt_B_48K_molgrid.pcd', '../ligands/CDK2/4fku_D_60K_molgrid.pcd', '../ligands/CDK2/4fkv_B_61K_molgrid.pcd', '../ligands/CDK2/4fkw_B_62K_molgrid.pcd']


Load original CDK2 Murcko scaffolds:

In [19]:
# Collect the SDF files of the original (non-transformed) CDK2 Murcko scaffolds.
files = [
    os.path.join(path, f)
    for f in os.listdir(path)
    if os.path.splitext(f)[1] == ".sdf" and "murcko" in f and "tran" not in f
]

# Order ligands
# This should make the three chemical series pop-up in the PCD fit
# Maps the ligand file name (without suffix) to its "CS<number>" label.
names = {
    "4ek4_B_1CK": "CS1",
    "4ek5_B_03K": "CS3",
    "4fkg_B_4CK": "CS4",
    "4fki_B_09K": "CS9",
    "4fkj_B_11K": "CS11",
    "3sw4_B_18K": "CS18",
    "3sw7_B_19K": "CS19",
    "4fko_B_20K": "CS20",
    "4fkp_B_LS5": "CS241",
    "4fkq_B_42K": "CS242",
    "4fkr_B_45K": "CS245",
    "4fks_B_46K": "CS246",
    "4fkt_B_48K": "CS248",
    "4fku_D_60K": "CS260",
    "4fkv_B_61K": "CS261",
    "4fkw_B_62K": "CS262",
}

# Sort by the numeric part of the "CS<number>" label.
files.sort(
    key=lambda f: int(
        names[os.path.splitext(os.path.basename(f))[0].replace("_murcko", "")].replace(
            "CS", ""
        )
    )
)

print(files)

# MSmols[i] is the Murcko scaffold of ligand i (same order as `files`).
MSmols = []
for f in files:
    # The replace() calls are a no-op for names such as "4ek4_B_1CK_murcko.sdf".
    s = Chem.SDMolSupplier(f.replace(".pcd", ".sdf").replace(f"_{method}", ""))
    mol = next(s)  # only the first molecule of the SDF file is used
    MSmols.append(mol)

['../ligands/CDK2/4ek4_B_1CK_murcko.sdf', '../ligands/CDK2/4ek5_B_03K_murcko.sdf', '../ligands/CDK2/4fkg_B_4CK_murcko.sdf', '../ligands/CDK2/4fki_B_09K_murcko.sdf', '../ligands/CDK2/4fkj_B_11K_murcko.sdf', '../ligands/CDK2/3sw4_B_18K_murcko.sdf', '../ligands/CDK2/3sw7_B_19K_murcko.sdf', '../ligands/CDK2/4fko_B_20K_murcko.sdf', '../ligands/CDK2/4fkp_B_LS5_murcko.sdf', '../ligands/CDK2/4fkq_B_42K_murcko.sdf', '../ligands/CDK2/4fkr_B_45K_murcko.sdf', '../ligands/CDK2/4fks_B_46K_murcko.sdf', '../ligands/CDK2/4fkt_B_48K_murcko.sdf', '../ligands/CDK2/4fku_D_60K_murcko.sdf', '../ligands/CDK2/4fkv_B_61K_murcko.sdf', '../ligands/CDK2/4fkw_B_62K_murcko.sdf']


Load translated and rotated CDK2 Murcko scaffolds with their point-cloud representation (pre-computed):

In [20]:
# Point clouds of the translated and rotated CDK2 Murcko scaffolds generated
# with the selected coloring method.
suffix = f"_murcko_tran_{method}"

files = []
for entry in os.listdir(path):
    stem, extension = os.path.splitext(entry)
    if extension != ".pcd" or suffix not in stem:
        continue
    files.append(os.path.join(path, entry))

# Order ligands
# This should make the three chemical series pop-up in the PCD fit
names = {
    "4ek4_B_1CK": "CS1",
    "4ek5_B_03K": "CS3",
    "4fkg_B_4CK": "CS4",
    "4fki_B_09K": "CS9",
    "4fkj_B_11K": "CS11",
    "3sw4_B_18K": "CS18",
    "3sw7_B_19K": "CS19",
    "4fko_B_20K": "CS20",
    "4fkp_B_LS5": "CS241",
    "4fkq_B_42K": "CS242",
    "4fkr_B_45K": "CS245",
    "4fks_B_46K": "CS246",
    "4fkt_B_48K": "CS248",
    "4fku_D_60K": "CS260",
    "4fkv_B_61K": "CS261",
    "4fkw_B_62K": "CS262",
}


def _series_number(pcd_path):
    """Return the integer in the "CS<number>" label of the ligand behind `pcd_path`."""
    ligand_id = os.path.splitext(os.path.basename(pcd_path))[0].replace(suffix, "")
    return int(names[ligand_id].replace("CS", ""))


files.sort(key=_series_number)

print(files)

# tMSpcds[i] and tMSmols[i] describe the same transformed scaffold.
tMSpcds, tMSmols = [], []
for f in files:
    tMSpcds.append(o3d.io.read_point_cloud(f))

    # "*_murcko_tran_{method}.pcd" -> "*_murcko_tran.sdf"
    sdf_path = f.replace(".pcd", ".sdf").replace(f"_{method}", "")
    supplier = Chem.SDMolSupplier(sdf_path)
    tMSmols.append(next(supplier))

['../ligands/CDK2/4ek4_B_1CK_murcko_tran_molgrid.pcd', '../ligands/CDK2/4ek5_B_03K_murcko_tran_molgrid.pcd', '../ligands/CDK2/4fkg_B_4CK_murcko_tran_molgrid.pcd', '../ligands/CDK2/4fki_B_09K_murcko_tran_molgrid.pcd', '../ligands/CDK2/4fkj_B_11K_murcko_tran_molgrid.pcd', '../ligands/CDK2/3sw4_B_18K_murcko_tran_molgrid.pcd', '../ligands/CDK2/3sw7_B_19K_murcko_tran_molgrid.pcd', '../ligands/CDK2/4fko_B_20K_murcko_tran_molgrid.pcd', '../ligands/CDK2/4fkp_B_LS5_murcko_tran_molgrid.pcd', '../ligands/CDK2/4fkq_B_42K_murcko_tran_molgrid.pcd', '../ligands/CDK2/4fkr_B_45K_murcko_tran_molgrid.pcd', '../ligands/CDK2/4fks_B_46K_murcko_tran_molgrid.pcd', '../ligands/CDK2/4fkt_B_48K_murcko_tran_molgrid.pcd', '../ligands/CDK2/4fku_D_60K_murcko_tran_molgrid.pcd', '../ligands/CDK2/4fkv_B_61K_murcko_tran_molgrid.pcd', '../ligands/CDK2/4fkw_B_62K_murcko_tran_molgrid.pcd']


In [21]:
# Align every transformed CDK2 scaffold and record the RMSD before (RMSDi) and
# after (RMSDf) alignment, together with the "CS<number>" label and list index.
tag = f"_murcko_tran_{method}"
for i, f in enumerate(files):
    rmsd_i, rmsd_f, _, _ = align_and_show_scaffold(
        mols[i], pcds[i], tMSmols[i], tMSpcds[i], MSmols[i]
    )

    # File name without extension and suffix is the key into `names`
    ligand_id = os.path.splitext(os.path.basename(f))[0].replace(tag, "")

    record = (
        ("system", "CDK2"),
        ("lig", names[ligand_id]),
        ("RMSDi", rmsd_i),
        ("RMSDf", rmsd_f),
        ("idx", i),
    )
    for key, value in record:
        rmsds[key].append(value)

### Failures

In [22]:
# Keep only the CDK2 rows, so that `idx` refers to the CDK2 lists loaded above.
df = pd.DataFrame(rmsds).loc[lambda frame: frame["system"] == "CDK2"]

In [23]:
# CDK2 alignments whose final RMSD exceeds the failure threshold.
# Uses a strict ">" like every other failure filter in this notebook, so the
# table always matches the structures shown and saved in the next cell.
df[df.RMSDf > rmsd_threshold]

Unnamed: 0,system,lig,RMSDi,RMSDf,idx
17,CDK2,CS20,52.549371,3.734318,7


In [24]:
# Visualise every failed CDK2 alignment and save the scaffold for inspection.
failed = df.loc[df.RMSDf > rmsd_threshold, ["idx", "lig"]]
for i, lig in zip(failed["idx"], failed["lig"]):
    p = show_scaffold(mols[i], tMSmols[i])

    # Writes conformer ID 1 of the transformed scaffold.
    # NOTE(review): presumably the conformer holding the aligned coordinates —
    # confirm against utils.align_and_show_scaffold.
    out_sdf = f"failed-{method}/{lig}_alignfail.sdf"
    with Chem.SDWriter(out_sdf) as w:
        w.write(tMSmols[i], confId=1)

    display(p)

<py3Dmol.view at 0x7f230708e100>

## Show Results

In [25]:
# Rebuild the full table with both systems; `df` was narrowed to the CDK2 rows
# in the failure analysis above.
df = pd.DataFrame(rmsds)
df

Unnamed: 0,system,lig,RMSDi,RMSDf,idx
0,BRD4,ligand-1,14.741494,0.068554,0
1,BRD4,ligand-2,3.476632,0.036455,1
2,BRD4,ligand-3,16.842481,5.467946,2
3,BRD4,ligand-4,16.532072,0.169241,3
4,BRD4,ligand-5,7.522788,0.05597,4
5,BRD4,ligand-6,18.104774,6.226867,5
6,BRD4,ligand-7,22.284243,0.059618,6
7,BRD4,ligand-8,3.884586,0.092889,7
8,BRD4,ligand-9,7.182288,0.094402,8
9,BRD4,ligand-10,23.021753,0.091005,9


In [26]:
# Failed alignments (final RMSD above the threshold) across both systems.
df[df.RMSDf > rmsd_threshold]

Unnamed: 0,system,lig,RMSDi,RMSDf,idx
2,BRD4,ligand-3,16.842481,5.467946,2
5,BRD4,ligand-6,18.104774,6.226867,5
17,CDK2,CS20,52.549371,3.734318,7


In [27]:
# Save the RMSDs of both systems for the selected method; floats are written
# with five decimals and the DataFrame index is omitted.
# NOTE(review): "alig" in the file name looks like a typo for "align" — confirm
# that nothing reads this file by name before renaming it.
df.to_csv(f"alig_scaffolds_rmsds_{method}.csv", index=False, float_format="%.5f")