To open on Google Colab [link](https://colab.research.google.com/github/RodrigoAVargasHdz/CHEM-4PB3/blob/main/Course_Notes/Week9/molecular_fingerprints.ipynb)

In [None]:
# Install the third-party packages for this notebook (the notebook is meant to be opened on Google Colab).
# NOTE(review): cairosvg and avogadro are not imported anywhere in this notebook — confirm they are still needed.
!pip3 install cairosvg
!pip install rdkit-pypi
!pip install avogadro
!pip install py3Dmol

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import py3Dmol
from rdkit import Chem
from rdkit.Chem import AllChem, Draw, rdMolDescriptors, rdDistGeom, rdMolTransforms, QED
from rdkit.Chem.Scaffolds.MurckoScaffold import GetScaffoldForMol
from rdkit.Chem.rdmolops import GetAdjacencyMatrix
from rdkit.Chem.Draw import IPythonConsole

import networkx as nx

# Notebook-wide RDKit display options for molecules shown as cell output.
IPythonConsole.ipython_useSVG = True  # render molecules as SVG
IPythonConsole.drawOptions.addAtomIndices = True  # annotate every atom with its index
IPythonConsole.molSize = 300,300  # drawing size as (width, height) — presumably in pixels

## ChemInformatics

*Molecular descriptors capture diverse parts of the structural information of molecules and they are the support of many contemporary computer-assisted toxicological and chemical applications.*

Being able to numerically represent molecules aids with the prediction problem,
$$
y = f(\mathbf{{\cal M}}),
$$
where $\mathbf{{\cal M}}$ is the set of molecular descriptors used in regression/classification.

<img src="https://raw.github.com/RodrigoAVargasHdz/CHEM-4PB3/master/Course_Notes/Figures/440801_1_En_1_Fig1_HTML.png"  width="500" height="700">

Figure from [link](https://link.springer.com/protocol/10.1007/978-1-4939-7899-1_1)


Molecular descriptors can describe different levels of information, from bulk properties to complex three-dimensional definitions or substructure frequency.
1. **0-Dimensional**:\
   atom counts (e.g., number of carbon atoms), molecular weight, and sum or average of atomic properties (e.g., atomic van der Waals volumes).
2. **1-Dimensional**:\
   molecules are perceived as a set of substructures, such as functional groups or atom-centered fragments.
3. **2-Dimensional**:\
   molecule is represented as a graph, whose vertices are the atoms and edges are the bonds, and specific chemical properties of atoms are considered.
4. **3-Dimensional**:\
   descriptors deriving from 3D representation.
5. **4-Dimensional**:\
   molecular geometry combined with an “additional dimension/information”, e.g., representing each ligand by an ensemble of conformations, protonation states, and/or orientations.


## Classical molecular descriptors and binary fingerprints


**Classical molecular descriptors** (MDs) are designed to encode a precise structural/chemical feature (or a set of features of different complexity) into one, single number. 
They can also be combined with other molecular properties that could be efficiently estimated. 

**RDKit molecular descriptors** ([list](https://www.rdkit.org/docs/source/rdkit.Chem.rdMolDescriptors.html?highlight=calcnumhba#rdkit.Chem.rdMolDescriptors.CalcNumHBA))


In [None]:
# one-hot-encoding
# ALPHABET: every character that may appear in a SMILES string handled here

max_len = 30
SMILES_CHARS = ["7", "6", "o", "]", "3", "s", "(", "-", "S", "/", "B", "4", "[", ")", "#", "I",
                "l", "O", "H", "c", "1", "@", "=", "n", "P", "8", "C", "2", "F", "5", "r", "N", "+", "\\", " "]
# lookup table: character -> column index in the one-hot matrix
smi2index = {char: idx for idx, char in enumerate(SMILES_CHARS)}


def smiles_to_one_hot(smiles, maxlen=max_len):
    """One-hot encode a SMILES string character by character.

    Parameters
    ----------
    smiles : str
        SMILES string; newline characters are removed before encoding.
    maxlen : int
        Number of rows of the returned matrix (defaults to ``max_len``).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(maxlen, len(SMILES_CHARS))``. Row ``k`` has a single
        1 in the column of the k-th character; rows past the end of the
        string stay all-zero.

    Raises
    ------
    KeyError
        If a character is not part of ``SMILES_CHARS``.
    IndexError
        If the string is longer than ``maxlen``.
    """
    cleaned = smiles.replace('\n', '')
    one_hot = np.zeros((maxlen, len(SMILES_CHARS)))  # (positions, alphabet)
    for position, token in enumerate(cleaned):
        one_hot[position, smi2index[token]] = 1
    return one_hot


# caffeine one hot
caffeine_smiles = 'CN1C=NC2=C1C(=O)N(C(=O)N2C)C'
print(caffeine_smiles.split())

caffeine_one_hot = smiles_to_one_hot(caffeine_smiles)

print(caffeine_one_hot.shape)  # (max_len, len(SMILES_CHARS)) = (30, 35)

# Pad the SMILES with blanks up to max_len so that every column of the
# image (including the all-zero padding columns) gets a tick label.
caffeine_smiles_pad = caffeine_smiles + " " * (max_len - len(caffeine_smiles))

plt.figure(figsize=(10, 10))
# Transposed: string positions run along x, alphabet characters along y.
plt.imshow(caffeine_one_hot.T, cmap='binary')
plt.xlabel('SMILES')
plt.ylabel('Tokens')

plt.title('One-hot encoding for %s' % caffeine_smiles)
plt.xticks(np.arange(len(caffeine_smiles_pad)),
           list(caffeine_smiles_pad), fontsize=8)
plt.yticks(np.arange(len(SMILES_CHARS)),
           list(SMILES_CHARS), fontsize=8)
plt.show()


In [None]:
def get_classical_md(m):
    """Compute a set of classical molecular descriptors for one RDKit molecule.

    Parameters
    ----------
    m : rdkit.Chem.Mol
        Molecule to describe.

    Returns
    -------
    dict
        Maps descriptor name to value. The keys are 'Canonical SMILE', 'HBA',
        'HBD', 'NumRings', 'RTB', 'TPSA', 'lopP' (the Crippen logP; the key
        spelling is kept so existing column look-ups keep working), 'MR',
        'MW', 'Csp3', 'fmf', 'QED', 'HAC', 'ChiralCenters' and 'MaxRingSize'.
        'PBF' is added only when ``m`` carries a 3D conformer, because the
        plane of best fit is computed from atomic coordinates.
    """
    canon_smiles = AllChem.MolToSmiles(m, canonical=True)
    # number of H-bond acceptors for a molecule
    hba = rdMolDescriptors.CalcNumHBA(m)

    # number of H-bond donors for a molecule
    hbd = rdMolDescriptors.CalcNumHBD(m)

    # number of rings for a molecule
    nrings = rdMolDescriptors.CalcNumRings(m)

    # number of rotatable bonds for a molecule
    rtb = rdMolDescriptors.CalcNumRotatableBonds(m)

    # topological polar surface area (TPSA) of a molecule (used medicinal chemistry metric
    # for the optimization of a drug's ability to permeate cells.)
    psa = rdMolDescriptors.CalcTPSA(m)

    # logP and mr from https://pubs.acs.org/doi/10.1021/ci990307l:
    # logP ->  water partition coefficient as measure of lipophilicity
    # MR -> molar refractivity
    logp, mr = rdMolDescriptors.CalcCrippenDescriptors(m)

    # molecular weight
    mw = rdMolDescriptors._CalcMolWt(m)

    # Csp3: fraction of sp3 carbons
    csp3 = rdMolDescriptors.CalcFractionCSP3(m)

    # number of heavy atoms for a molecule
    hac = m.GetNumHeavyAtoms()

    # fraction of atoms belonging to Murcko framework
    # (defined as 0 when there are no heavy atoms, to avoid dividing by zero)
    fmf = GetScaffoldForMol(m).GetNumHeavyAtoms() / hac if hac else 0.0

    # max_ring_size: maximum ring size in a molecule (0 for acyclic molecules)
    max_ring_size = len(max(m.GetRingInfo().AtomRings(), key=len, default=()))

    # QED: quantitative estimate of drug-likeness (https://www.rdkit.org/docs/source/rdkit.Chem.QED.html)
    qed = QED.qed(m)

    # ChiralCenters: number of chiral centers (assigned and unassigned)
    n_chiral_centers = len(Chem.FindMolChiralCenters(m, includeUnassigned=True))

    r = {'Canonical SMILE': canon_smiles,
         'HBA': hba,
         'HBD': hbd,
         'NumRings': nrings,
         'RTB': rtb,
         'TPSA': psa,
         'lopP': logp,
         'MR': mr,
         'MW': mw,
         'Csp3': csp3,
         'fmf': fmf,
         'QED': qed,
         'HAC': hac,
         'ChiralCenters': n_chiral_centers,
         'MaxRingSize': max_ring_size,
    }

    # plane of best fit (PBF), to quantify and characterize the 3D character of molecules
    # (https://pubs.acs.org/doi/pdf/10.1021/ci300293f). It needs 3D coordinates, so it is
    # only reported for molecules that already have a 3D conformer (a molecule freshly
    # parsed from SMILES has none).
    if m.GetNumConformers() > 0 and m.GetConformer().Is3D():
        r['PBF'] = rdMolDescriptors.CalcPBF(m)

    return r


In [None]:
# m = "O=C(Oc1cccnc1)c1cccs1"
# Example molecule given as a SMILES string (the commented line above is an alternative input).
m = 'OCCC'
mol = AllChem.MolFromSmiles(m)
# Print the descriptor dictionary returned by get_classical_md for this molecule.
print(get_classical_md(mol))
mol  # last expression of the cell: the molecule is shown as the cell output

In [None]:
import pandas as pd

url = 'https://raw.github.com/sp8rks/MaterialsInformatics/master/worked_examples/RDKit_tutorial/full_dataset_dd.csv'

# The raw table is only needed for its SMILES column; `data` is rebound to the
# descriptor table at the end of this cell.
data = pd.read_csv(url, index_col=0)

smiles_all = data['SMILES_STD']
print('Total number of molecules: ', len(smiles_all))


# Descriptor dictionaries keyed by the position of the molecule in `smiles_all`.
D = {}
for i, smi in enumerate(smiles_all):
    m = AllChem.MolFromSmiles(smi)
    if m is None:
        # MolFromSmiles returns None when the SMILES cannot be parsed;
        # report and skip it instead of failing inside get_classical_md.
        print('Skipping unparsable SMILES at position %d: %s' % (i, smi))
        continue
    r = get_classical_md(m)
    D[i] = r
# Summarise instead of dumping the whole dictionary to the output.
print('Descriptors computed for %d molecules' % len(D))
data = pd.DataFrame.from_dict(D, orient='index')
data.head()

In [None]:
cols = data.columns
ncols = len(cols)
print(ncols,cols)

# One histogram per descriptor; the first column holds the canonical SMILES
# string and is skipped.
plot_cols = cols[1:]
grid_ncols = 3
# Ceiling division so that every descriptor gets a panel, however many there are.
grid_nrows = max(1, -(-len(plot_cols) // grid_ncols))

fig = plt.figure(layout="constrained", figsize=(20, 20))
axs = fig.subplots(nrows=grid_nrows, ncols=grid_ncols, squeeze=False)
# Column-major order: panels are filled top-to-bottom, one grid column at a time.
axes_flat = axs.flatten(order='F')

for ax, ci in zip(axes_flat, plot_cols):
    ax.hist(data[ci].to_numpy(), bins=25)
    ax.set_xlabel(ci, fontsize=15)

# Hide the panels that received no descriptor.
for ax in axes_flat[len(plot_cols):]:
    ax.set_visible(False)
plt.show()

## 1D and 2D molecular fingerprints

1. **1-Dimensional**:\
   molecules are perceived as a set of substructures, such as functional groups or atom-centered fragments.
2. **2-Dimensional**:\
   molecule is represented as a graph, whose vertices are the atoms and edges are the bonds, and specific chemical properties of atoms are considered.

 
### Molecular/Circular Fingerprints aka Morgan Fingerprints ([paper](https://pubs.acs.org/doi/10.1021/ci100050t))



* topological fingerprints for molecular characterization.
* capture molecular features relevant to molecular activity.
* used in drug-activity prediction.
* Given a *radius*, each atom is characterized by the set of neighboring atoms ("candidates") found within that radius around it.

**Problem**\
They are non-reversible, meaning given some fingerprints we can't infer the original molecule.\
People have attempted to use deep-learning for this problem.

Papers:
1. [data-driven molecular descriptors](https://pubs.rsc.org/en/content/articlelanding/2019/sc/c8sc04175j)
2. [Neuraldecipher](https://pubs.rsc.org/en/content/articlelanding/2020/sc/d0sc03115a)


Code below is based on [link](https://rdkit.blogspot.com/2018/10/using-new-fingerprint-bit-rendering-code.html)

In [None]:
# m = "O=C(Oc1cccnc1)c1cccs1"
# Molecule used in the fingerprint examples below, given as a SMILES string.
m = 'OCCC'
mol = AllChem.MolFromSmiles(m)
mol  # last expression of the cell: the molecule is shown as the cell output

In [None]:
# fp = AllChem.GetMorganFingerprintAsBitVect(mol,3, nBits=2048)
# `bi` is filled in by RDKit during the call and is what DrawMorganBit uses
# to locate the environment behind a bit.
bi = {}
fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=1, bitInfo=bi)
print(bi)

# indices of the bits that are set in the fingerprint
fp_indx = np.flatnonzero(np.array(fp))

# draw the environment behind the lowest set bit
Draw.DrawMorganBit(mol, fp_indx[0], bi)

In [None]:
# One (molecule, bit, bitInfo) entry per set bit, each labelled with its bit index.
on_bits = list(fp.GetOnBits())
prints = [(mol, bit, bi) for bit in on_bits]
Draw.DrawMorganBits(prints, molsPerRow=5, legends=[str(bit) for bit in on_bits])

In [None]:
def get_fingerprints(m_smiles, radius=2, nbits=2048):
    """Return the Morgan fingerprint of a SMILES string as a numpy array.

    Parameters
    ----------
    m_smiles : str
        SMILES string of the molecule.
    radius : int
        Radius passed to the Morgan fingerprint.
    nbits : int
        Number of bits of the fingerprint.

    Returns
    -------
    numpy.ndarray
        The bit vector converted with ``np.asarray``.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``m_smiles``.
    """
    m = Chem.MolFromSmiles(m_smiles)
    if m is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with a readable message rather than inside the fingerprint call.
        raise ValueError('Could not parse SMILES: %r' % (m_smiles,))
    m_fingerprints = AllChem.GetMorganFingerprintAsBitVect(
        m, radius=radius, nBits=nbits)
    return np.asarray(m_fingerprints)

In [None]:
# Morgan fingerprint of caffeine, plotted as one point per bit.
caffeine_smiles = 'CN1C=NC2=C1C(=O)N(C(=O)N2C)C'
# NOTE(review): `m` is not used below — get_fingerprints parses the SMILES itself.
m = AllChem.MolFromSmiles(caffeine_smiles)
r = get_fingerprints(caffeine_smiles)
print(r)

# plt.imshow(r[np.newaxis,:],)
plt.figure(figsize=(8,8))
# x axis: bit index; y axis: value of that bit
plt.scatter(np.arange(r.shape[0]),r)
plt.xlabel('Caffeine Finger Prints',fontsize=18)
plt.ylabel('Binary',fontsize=18)

In [None]:
# Fingerprint of every molecule in the data set, keyed by its position in
# `smiles_all`; `data` is rebound to the resulting table (one row per molecule).
D = {}
for i, smi in enumerate(smiles_all):
    r = get_fingerprints(smi)
    D[i] = r

data = pd.DataFrame.from_dict(D, orient='index')
data.head()

In [None]:
cols = data.columns
ncols = len(cols)

# For every fingerprint column, count the rows in which that bit is non-zero.
print('List of FingerPrints that appear in our data set:')
count_fp = []
for col in cols:
    n_set = np.nonzero(data[col].to_numpy())[0].shape[0]
    count_fp.append(n_set)
    if n_set > 0:
        print('FP-%s : %s' % (col, n_set))

count_fp = np.asarray(count_fp)
# argsort is ascending, so the last ten entries are the ten most frequent bits
# (listed from least to most frequent).
i0 = np.argsort(count_fp)[-10:]
print('Top 10 FingerPrints: ', i0)
print(count_fp[i0])


In [None]:
# What is FP-1?
# NOTE(review): the bit drawn is the lowest set bit of this molecule, which is
# assumed to be bit 1 — confirm against the printed bit info.

smi = smiles_all[2]
print(smi)
mol = AllChem.MolFromSmiles(smi)

bi = {}
fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=1, bitInfo=bi)
fp_indx = np.flatnonzero(np.array(fp))  # indices of the set bits
print(bi)

lowest_bit = fp_indx[0]
Draw.DrawMorganBit(mol, lowest_bit, bi, legend=str(lowest_bit))


In [None]:
# What is FP-1380?
# NOTE(review): this fingerprint uses radius=1, while get_fingerprints (used to
# build the data set table) defaults to radius=2. If bit 1380 is not set at
# radius=1 for this molecule, the look-up below raises IndexError — confirm.

# print(data[1380])
smi = smiles_all[662]
print(smi)
mol = AllChem.MolFromSmiles(smi)

bi = {}
fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=1, bitInfo=bi)
fp_indx = np.flatnonzero(np.array(fp))  # indices of the set bits
# print(bi)

# position of bit 1380 within the list of set bits
target_positions = np.where(fp_indx == 1380)[0]
i0 = target_positions[0]
bit_id = fp_indx[i0]
Draw.DrawMorganBit(mol, bit_id, bi, legend=str(bit_id))


# Conformer search with RDKit

In [None]:
def get_xyz_coordinates(m_rdkit):
    """Serialize the atoms and coordinates of a molecule as an XYZ-format string.

    The coordinates are read from the molecule's default conformer through the
    conformer API rather than by splitting the fixed-width text of a MolBlock,
    so the result does not depend on the MolBlock column layout. No conformer
    is generated here: the molecule must already have been embedded.

    Parameters
    ----------
    m_rdkit : rdkit.Chem.Mol
        Molecule to serialize.

    Returns
    -------
    str
        First line: the number of atoms. Second line: a fixed comment line.
        Then one line per atom: element symbol followed by x, y, z with four
        decimals. If the molecule has no conformer, all coordinates are
        written as 0.0000 (which is what a MolBlock would contain too).
    """
    n_atoms = m_rdkit.GetNumAtoms()  # total number of atoms
    conf = m_rdkit.GetConformer() if m_rdkit.GetNumConformers() > 0 else None

    lines = ['%s\n * (null), Energy   -1000.0000000\n' % (n_atoms)]
    for idx in range(n_atoms):
        symbol = m_rdkit.GetAtomWithIdx(idx).GetSymbol()
        if conf is None:
            x = y = z = 0.0
        else:
            pos = conf.GetAtomPosition(idx)
            x, y, z = pos.x, pos.y, pos.z
        lines.append('%s     %.4f     %.4f     %.4f\n' % (symbol, x, y, z))
    return ''.join(lines)

def draw_3d(smiles, bool_add_H=True):
    """Embed a molecule in 3D and show it in an interactive py3Dmol viewer.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.
    bool_add_H : bool
        If True, explicit hydrogens are added before the embedding.

    Raises
    ------
    ValueError
        If the SMILES cannot be parsed or no 3D conformer could be generated.
    """
    m3 = AllChem.MolFromSmiles(smiles)
    if m3 is None:
        raise ValueError('Could not parse SMILES: %r' % (smiles,))
    if bool_add_H:
        m3 = Chem.AddHs(m3)

    # EmbedMolecule returns the id of the new conformer, or -1 on failure;
    # without a conformer there are no coordinates worth drawing.
    if AllChem.EmbedMolecule(m3, randomSeed=0xf00d) < 0:
        raise ValueError('Could not generate a 3D conformer for %r' % (smiles,))

    xyz_str = get_xyz_coordinates(m3)
    xyzview = py3Dmol.view(width=400, height=400)
    xyzview.addModel(xyz_str, 'xyz')
    xyzview.setStyle({'sphere': {'radius': 0.35}, 'stick': {'radius': 0.1}})
    xyzview.setBackgroundColor('0xeeeeee')
    xyzview.zoomTo()
    xyzview.show()


In [None]:
# Molecule used for the conformer search below, given as a SMILES string.
m = 'OCCF'
draw_3d(m)
mol = AllChem.MolFromSmiles(m)
# Copy of the molecule with explicit hydrogens; used by the following cells.
mol_wH = Chem.AddHs(mol)
# mol_wH

In [None]:
# Show the molecule with explicit hydrogens as the cell output.
mol_wH

In [None]:
# Distance-bounds matrix of the molecule (one row/column per atom).
bounds = rdDistGeom.GetMoleculeBoundsMatrix(mol_wH)
# Entries for the atom pair (0, 3), i.e. the pair whose distance is histogrammed later as 'O--F'.
# NOTE(review): presumably one triangle holds upper bounds and the other lower bounds — confirm in the RDKit docs.
print(bounds[0,3],bounds[3,0])
print(bounds)

paper [link](https://pubs.acs.org/doi/pdf/10.1021/acs.jcim.5b00654)
[drugbank](https://go.drugbank.com/drugs/DB01001)

In [None]:

# Generate many conformers of mol_wH and histogram the distance between atoms 0 and 3.
ps = rdDistGeom.ETKDGv3()
ps.randomSeed = 0xf00d  # fixed seed so the embedding is reproducible
ps.SetBoundsMat(bounds)  # use the bounds matrix computed in the previous cell

# ps = rdDistGeom.EmbedParameters()
# ps.useExpTorsionAnglePrefs = False

ps.useBasicKnowledge = False
# Request 1500 conformers; cids holds the ids of the conformers that were generated.
cids = rdDistGeom.EmbedMultipleConfs(mol_wH,1500,ps)
# Distance between atoms 0 and 3 in every conformer (labelled 'O--F distance' below).
dists_etkdg = [rdMolTransforms.GetBondLength(conf,0,3) for conf in mol_wH.GetConformers()]

plt.figure(figsize = (6, 6))
plt.hist(dists_etkdg, bins=20)
plt.title('ETKDG')
plt.xlabel('O--F distance')


In [None]:
# draw a single conformer with RDKit
# cids[2] is the id of the third conformer returned by EmbedMultipleConfs.
IPythonConsole.drawMol3D(mol_wH,confId=cids[2])


In [None]:
# draw a single conformer with py3MOL

# Serialize the conformer with id cids[2] as a MolBlock and hand it to the viewer as 'sdf'.
conf0 = Chem.MolToMolBlock(mol_wH, confId=cids[2])
print(conf0)
xyzview = py3Dmol.view(width=400, height=400)
xyzview.addModel(conf0, 'sdf')
xyzview.setStyle({'sphere': {'radius': 0.35}, 'stick': {'radius': 0.1}})
xyzview.setBackgroundColor('0xeeeeee')
xyzview.zoomTo()
xyzview.show()


In [None]:
# Overlay the first conformers (at most 10) in a single py3Dmol view.
xyzview = py3Dmol.view(width=400, height=400)
# Bounded by len(cids) so the cell still works when fewer than 10 conformers
# were generated.
for i in range(min(10, len(cids))):
    conf0 = Chem.MolToMolBlock(mol_wH, confId=cids[i])
    xyzview.addModel(conf0, 'sdf')
# Thin sticks only, so that the overlaid conformers stay distinguishable.
xyzview.setStyle({'stick':{'radius': 0.05}})
xyzview.setBackgroundColor('0xeeeeee')
xyzview.zoomTo()
xyzview.show()

Extra references:
1. https://pubs.rsc.org/en/content/articlehtml/2022/sc/d2sc02739a?page=search
2. https://github.com/atomistic-machine-learning/schnetpack
3. https://doi.org/10.1186/s13321-018-0258-y