In [None]:
!pip install rdkit-pypi


In [None]:
import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem, Draw, rdMolDescriptors, rdDistGeom, rdMolTransforms, QED
from rdkit.Chem.Scaffolds.MurckoScaffold import GetScaffoldForMol
from rdkit.Chem.rdmolops import GetAdjacencyMatrix
from rdkit.Chem.Draw import IPythonConsole

import matplotlib 
import matplotlib.pyplot as plt

In [None]:
# HIV dataset listed on MoleculeNet (https://moleculenet.org/datasets-1);
# the CSV is read from the copy hosted in the course repository.
data_url = "https://github.com/RodrigoAVargasHdz/CHEM-4PB3/raw/main/Course_Notes/data/HIV.csv"
data = pd.read_csv(filepath_or_buffer=data_url)
data.head(n=5)

In [None]:
# Split the activity labels into the two classes and report how many samples each has.
y = data['HIV_active']
y0 = y.eq(0)  # boolean mask: label is 0
y1 = y.eq(1)  # boolean mask: label is 1
y0_class = y[y0]
y1_class = y[y1]
print(y0_class.shape, y1_class.shape)

In [None]:
def get_fingerprints(m_smiles, radius=2, nbits=2048):
    """Return the Morgan fingerprint of a molecule given as a SMILES string.

    The SMILES string is parsed with RDKit, explicit hydrogens are added, and
    the Morgan bit-vector fingerprint is computed with the requested `radius`
    and length `nbits`. The bit vector is converted with `np.asarray` before
    being returned.
    """
    mol_with_hs = Chem.AddHs(Chem.MolFromSmiles(m_smiles))
    bit_vect = AllChem.GetMorganFingerprintAsBitVect(
        mol_with_hs, radius=radius, nBits=nbits)
    return np.asarray(bit_vect)

def get_classical_md(m):
    """Compute a vector of classical molecular descriptors for one molecule.

    Parameters
    ----------
    m : RDKit molecule object
        The molecule to describe. It must be a Mol object (the function calls
        methods such as ``m.GetNumHeavyAtoms()``), not a SMILES string.

    Returns
    -------
    numpy.ndarray
        1D array with 14 entries, in this order:
        ``hba, hbd, nrings, rtb, logp, mr, mw, csp3, fmf, qed, hac,
        n_chiral_centers, max_ring_size, psa``.
    """
    # number of H-bond acceptors for a molecule
    hba = rdMolDescriptors.CalcNumHBA(m)

    # number of H-bond donors for a molecule
    hbd = rdMolDescriptors.CalcNumHBD(m)

    # number of rings for a molecule
    nrings = rdMolDescriptors.CalcNumRings(m)

    # number of rotatable bonds for a molecule
    rtb = rdMolDescriptors.CalcNumRotatableBonds(m)

    # topological polar surface area (TPSA), a medicinal-chemistry metric
    # used when optimizing a drug's ability to permeate cells
    psa = rdMolDescriptors.CalcTPSA(m)

    # logP and MR from https://pubs.acs.org/doi/10.1021/ci990307l:
    # logP -> water partition coefficient as a measure of lipophilicity
    # MR   -> molar refractivity
    logp, mr = rdMolDescriptors.CalcCrippenDescriptors(m)

    # molecular weight
    mw = rdMolDescriptors._CalcMolWt(m)

    # Csp3: fraction of sp3 carbons
    csp3 = rdMolDescriptors.CalcFractionCSP3(m)

    # hac: number of heavy atoms (computed once and reused below)
    hac = m.GetNumHeavyAtoms()

    # fmf: fraction of heavy atoms belonging to the Murcko framework.
    # Guard against molecules without heavy atoms to avoid a ZeroDivisionError.
    fmf = GetScaffoldForMol(m).GetNumHeavyAtoms() / hac if hac else 0.0

    # max_ring_size: size of the largest ring (0 when the molecule has no rings)
    max_ring_size = len(max(m.GetRingInfo().AtomRings(), key=len, default=()))

    # QED: quantitative estimate of drug-likeness
    # (https://www.rdkit.org/docs/source/rdkit.Chem.QED.html)
    qed = QED.qed(m)

    # number of chiral centers (assigned and unassigned)
    n_chiral_centers = len(Chem.FindMolChiralCenters(m, includeUnassigned=True))

    return np.array([hba, hbd, nrings, rtb, logp, mr, mw, csp3, fmf, qed,
                     hac, n_chiral_centers, max_ring_size, psa])


In [None]:
# Build one descriptor vector per molecule, keyed by its position in `data`.
D = {}
smiles_all = data['smiles']
for i, smi in enumerate(smiles_all):
    # get_classical_md works on an RDKit Mol, so the SMILES string has to be
    # parsed first (get_fingerprints, by contrast, takes the SMILES directly).
    m = Chem.MolFromSmiles(smi)
    if m is None:
        # Fail loudly instead of skipping: dropping rows would misalign the
        # features with the labels in data['HIV_active'].
        raise ValueError(f"Could not parse SMILES at row {i}: {smi!r}")
    D[i] = get_classical_md(m)

# Printing the whole dictionary would flood the output; report its size only.
print(len(D))

## Supervised learning
So far we have studied how to use labeled data to parametrize functions that predict target values,
$$
\mathbf{y} = f(\mathbf{x};\boldsymbol{\theta}).
$$

To achieve this learning we rely on error functions that compare the prediction between our model and the ground truth values,
$$
{\cal L}(\boldsymbol{\theta}) = \|\hat{\mathbf{y}} - f(\mathbf{X};\boldsymbol{\theta})\|_2\\
{\cal L}(\boldsymbol{\theta}) = \|\hat{\mathbf{y}} - f(\mathbf{X};\boldsymbol{\theta})\|_2 + \lambda \|\boldsymbol{\theta}\|_2 \\
{\cal L}(\boldsymbol{\theta}) = |\hat{\mathbf{y}} - f(\mathbf{X};\boldsymbol{\theta})|,
$$
or other loss functions that depend on $\hat{\mathbf{y}}$.

What if $\hat{\mathbf{y}}$ is unknown? 

## Unsupervised learning

In unsupervised learning the data consist only of a set of input vectors $\{\mathbf{x}_i\}_{i=1}^N$ without any corresponding target values. Unsupervised learning algorithms can be grouped into three main approaches,
1. **clustering**, discover groups of similar examples within the data.
2. **data generation**, determine the distribution of data within the input space.
3. **dimensionality reduction**, project the data from a high-dimensional space down to two or three dimensions for the purpose of visualization.

We will focus only on **dimensionality reduction**, and cover two of the most common algorithms,
1. Principal component analysis (PCA)
2. t-distributed stochastic neighbor embedding (t-SNE)


## (auto-encoders) Dimensionality reduction
Since the only source of information we have is the training data, one possible error function that we could use is,
$$
{\cal L}(\boldsymbol{\theta}) = \| \mathbf{X} - f(\mathbf{Z};\boldsymbol{\theta})\|_2^2 = \sum_{i=1}^N \|\mathbf{x}_i - f(\mathbf{z}_i;\boldsymbol{\theta})\|_2^2,
$$
where $\mathbf{z}_i$ is some "encoded" version of $\mathbf{x}_i$  and $f(\cdot)$ is the function that transforms back to the original feature representation. Usually, $\mathbf{z}$'s dimension is smaller than $\mathbf{x}$.  

* What about $\mathbf{z}$? \
We can also learn this encoding using another function, $\mathbf{z} = g(\mathbf{x};\boldsymbol{\theta})$.

$$
{\cal L}(\boldsymbol{\theta}_d) = \| \mathbf{X} - f(\mathbf{Z};\boldsymbol{\theta}_d)\|_2\\
{\cal L}(\boldsymbol{\theta}_e,\boldsymbol{\theta}_d) = \| \mathbf{X} - f(g(\mathbf{X};\boldsymbol{\theta}_e);\boldsymbol{\theta}_d)\|_2
$$

<!-- $f(g(\mathbf{X};\boldsymbol{\theta}_d),\boldsymbol{\theta}_e)$\ -->
<img src="https://raw.github.com/RodrigoAVargasHdz/CHEM-4PB3/master/Course_Notes/Figures/autoencoder.png"  width="250" height="200">



I believe you have all the tools to construct such models (auto-encoders) in torch. 

```python
import torch
from torch import nn
import torch.functional as F

class AutoEncoder(nn.Module):
    """Minimal auto-encoder with one linear layer in each direction.

    Parameters
    ----------
    x_dim : int
        Dimension of the input (and reconstructed) feature vectors.
    latent_dim : int
        Dimension of the encoded representation z.
    """

    def __init__(self, x_dim, latent_dim):
        # The original called super(MolecularVAE, self), but no class of that
        # name exists here, so construction raised a NameError.
        super().__init__()

        self.x_dim = x_dim
        self.latent_dim = latent_dim

        # Encoder g: x -> z
        self.e = nn.Sequential(
            nn.Linear(self.x_dim, self.latent_dim),
            nn.ReLU(),
            )

        # Decoder f: z -> reconstructed x.
        # The final ReLU makes every reconstructed value non-negative.
        self.d = nn.Sequential(
            nn.Linear(self.latent_dim, self.x_dim),
            nn.ReLU(),
            )

    def encoder(self, x):
        """Encode inputs `x` into the latent representation."""
        z = self.e(x)
        return z

    def decoder(self, x):
        """Decode a latent representation (passed as `x`) back to feature space."""
        z = self.d(x)
        return z

    def forward(self, x):
        """Encode and then decode `x`, returning the reconstruction."""
        z = self.encoder(x)
        x_recon = self.decoder(z)
        return x_recon
```

## Principal Component Analysis (PCA)

PCA is by far the most used technique to compress data into lower dimensions, ([review paper](https://royalsocietypublishing.org/doi/10.1098/rsta.2015.0202)).\
I will try to motivate PCA again under the perspective of, 
$$
{\cal L}(\boldsymbol{\theta}) =  \| \mathbf{X} - \tilde{\mathbf{X}} \|_2 
$$
where $\tilde{\mathbf{X}}$ is the reconstructed $\mathbf{X}$ using the PCA model.

In PCA, each reconstructed point can be computed as, 
$$
\tilde{\mathbf{x}}_n = \sum_{i=1}^M z_{ni}\mathbf{u}_i + \sum_{i=M+1}^D b_{i}\mathbf{u}_i,
$$
where all $\mathbf{u}_i$ form an orthonormal set of $D$-dimensional basis vectors ($\mathbf{u}_j^\top\mathbf{u}_i = \delta_{ij}$).
The parameters of PCA are all $z_{ni}$, $b_{i}$, and $\mathbf{u}_i$, and we can optimize them so that $\tilde{\mathbf{x}}_n \approx \mathbf{x}_n$.

The loss function of PCA is also the squared distance between the original and the reconstructed points,
$$
{\cal L}(\boldsymbol{\theta}) =  \frac{1}{N} \sum_{n=1}^N \|\mathbf{x}_n - \tilde{\mathbf{x}}_n\|^2
$$
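Using the average point $\overline{\mathbf{x}} = \frac{1}{N}\sum_{n=1}^N\mathbf{x}_n$ and the optimal coefficients $z_{ni} = \mathbf{x}_n^\top\mathbf{u}_i$ and $b_i = \overline{\mathbf{x}}^\top\mathbf{u}_i$ (obtained by setting the derivatives of ${\cal L}$ with respect to $z_{ni}$ and $b_i$ to zero), the loss reduces to,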
$$
{\cal L}(\boldsymbol{\theta}) =  \frac{1}{N} \sum_n^N \sum_{i=M+1}^D (\mathbf{x}_n^\top\mathbf{u}_i - \overline{\mathbf{x}}^\top\mathbf{u}_i)^2 = \sum_{i=M+1}^D \mathbf{u}_i^\top\mathbf{S}\mathbf{u}_i
$$

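Remember that the $\mathbf{u}_i$ must be orthonormal, so minimizing $\sum_{i=M+1}^D\mathbf{u}_i^\top\mathbf{S}\mathbf{u}_i$ is an eigenvalue problem: the optimal $\mathbf{u}_i$ are eigenvectors of $\mathbf{S}$, which in practice are obtained from the **singular value decomposition (SVD)** of the centered data matrix.\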
**S** is the data covariance matrix defined by
$$
\mathbf{S} = \frac{1}{N} \sum_{n=1}^N(\mathbf{x}_n-\overline{\mathbf{x}})(\mathbf{x}_n-\overline{\mathbf{x}})^\top
$$


<!-- By setting,
$$
\frac{\partial {\cal L}}{\partial z_{ni}} = 0, \text{ we get, } z_{ni} = \mathbf{x}_i^\top \mathbf{u}_i\\
\frac{\partial {\cal L}}{\partial b_j} = 0, \text{ we get, } b_j = \overline{\mathbf{x}}^\top \mathbf{u}_j
$$
$\overline{\mathbf{x}} = \frac{1}{N}\sum_i^N\mathbf{x}_i$ -->

In [None]:
# PCA Sklearn https://github.com/scikit-learn/scikit-learn/blob/9aaed4987/sklearn/decomposition/_pca.py#L118
import numpy as np
from sklearn.decomposition import PCA

# Stack the feature vectors stored in D (in insertion order) into a single array.
X = np.array(list(D.values()))
print(X.shape)

In [None]:
# Keep up to 100 principal components. PCA cannot return more components than
# min(n_samples, n_features), so cap the request by the shape of X; otherwise
# fitting raises a ValueError for low-dimensional feature sets.
n_components = min(100, *X.shape)
pca = PCA(n_components=n_components)
pca.fit(X)

In [None]:
# Left panel: cumulative explained-variance ratio as a function of how many
# components are kept. Right panel: singular value of each component.
_, axs = plt.subplots(ncols=2)
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
# Start the x axis at 1 so it matches the label: the first entry of the
# cumulative sum is the variance explained by one component, not zero.
axs[0].plot(np.arange(1, len(cumulative_ratio) + 1), cumulative_ratio)
axs[0].set_xlabel('Number of components')
axs[0].set_ylabel('Explained variance')

axs[1].plot(np.arange(len(pca.singular_values_)), np.array(pca.singular_values_))
# Raw string: '\l' is not a valid escape sequence, so a plain (or f-) string
# triggers an invalid-escape warning. The label text itself is unchanged.
axs[1].set_ylabel(r'$\lambda_i$')
axs[1].set_xlabel('Index')
plt.tight_layout()

In [None]:
pca_comp = pca.components_[:n_components]

# Overlay the entries of the first three principal components, one curve each,
# labelled by the component index.
for idx, component in enumerate(pca.components_[:3]):
    plt.plot(np.arange(component.shape[0]), component, label='%i' % idx)
plt.ylabel('Value of the PCA component')
plt.xlabel('Finger Print index')
plt.legend()


In [None]:
%matplotlib inline

X_r = pca.transform(X)

y = data['HIV_active']

y_c = ['blue' if yi == 0 else 'red' for yi in y]

# plt.scatter(X_r[:,0],X_r[:,1],color=y_c)

fig = plt.figure()
ax = fig.add_subplot(projection='3d')
ax.scatter3D(X_r[:, 0], X_r[:, 1], X_r[:, 2], c=y_c,s=2)



## t-distributed Stochastic Neighbor Embedding (t-SNE)
[paper](https://www.jmlr.org/papers/volume9/vandermaaten08a/vandermaaten08a.pdf)

Notation,
* $p_{j|i}$, probability that point $x_i$ would pick $x_j$ as its neighbor if neighbors were picked in proportion to their probability density under a Gaussian centered at $x_i$.
* $y_i$ is a low dimensional representation of $x_i$.
* $q_{j|i}$ is the analogue of $p_{j|i}$ in the low-dimensional representation.

If the map points $y_i$ and $y_j$ correctly model the similarity between the high-dimensional data
points $x_i$ and $x_j$, the conditional probabilities $p_{j|i}$ and $q_{j|i}$ will be equal. 

In t-SNE, the loss function is motivated by the similarity between the $p_{j|i}$ and $q_{j|i}$ distributions,
$$
{\cal L} = \sum_i KL(P_i||Q_i) = \sum_i\sum_j p_{j|i}\log\frac{p_{j|i}}{q_{j|i}}
$$

The similarity of map point  $y_j$  to map point  $y_i$ is given by,
$$
q_{j|i} = \frac{\exp(-\|y_i-y_j\|^2)}{\sum_{k\neq i}\exp(-\|y_i-y_k\|^2)}
$$


SNE minimizes the sum of Kullback-Leibler (KL) divergences over all data points using a gradient descent
method.

In [None]:
from sklearn import manifold

# Embed X into three dimensions with t-SNE; random initialisation with a fixed
# random_state keeps the result repeatable between runs.
# NOTE(review): newer scikit-learn releases appear to rename `n_iter` to
# `max_iter` - confirm against the installed version.
t_sne_model = manifold.TSNE(
    n_components=3, perplexity=30, init="random", n_iter=250, random_state=0
)
X_t_sne = t_sne_model.fit_transform(X)

In [None]:
# The previous cell already fitted t-SNE with exactly these settings (including
# random_state=0), so refitting here would only repeat an expensive computation.
# Validate the existing embedding instead: one 3D point per sample in X.
assert X_t_sne.shape == (X.shape[0], 3), X_t_sne.shape

In [None]:
# 3D scatter of the t-SNE embedding, coloured with the per-sample colours in y_c.
print(X_t_sne.shape)
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
# Unpack the first three embedding columns as the x, y and z coordinates.
ax.scatter3D(*X_t_sne[:, :3].T, c=y_c, s=2)

## More embeddings for data

[Sklearn](https://scikit-learn.org/stable/modules/manifold.html#spectral-embedding) has a very nice tutorial on other data embedding methods. They all rely on different heuristics for the distance between pairs of data points, 
$$
{\cal L} \approx \sum_{i>j}d_{ij}(\mathbf{X}) - d_{ij}(\mathbf{\tilde{X}})
$$