To open on Google Colab [link](https://colab.research.google.com/github/RodrigoAVargasHdz/CHEM-4PB3/blob/main/Course_Notes/Week9/property_prediction.ipynb)



In [None]:
!pip3 install cairosvg
!pip install rdkit-pypi
!pip install avogadro
!pip install py3Dmol


In [None]:
import tqdm
import numpy as np
import pandas as pd

import torch
from torch import nn
import torch.functional as F
from torch.utils.data import Dataset, DataLoader

from rdkit import Chem
from rdkit.Chem import AllChem, Draw, rdMolDescriptors, rdDistGeom, rdMolTransforms, QED
from rdkit.Chem.Scaffolds.MurckoScaffold import GetScaffoldForMol
from rdkit.Chem.rdmolops import GetAdjacencyMatrix
from rdkit.Chem.Draw import IPythonConsole

import matplotlib
import matplotlib.pyplot as plt


In [None]:
# Read the QM9 CSV (one row per molecule) straight from the course repository.
data_url = "https://github.com/RodrigoAVargasHdz/CHEM-4PB3/raw/main/Course_Notes/data/qm9.csv"
data_big = pd.read_csv(data_url)

# Work on a random subset of n0 molecules to keep training fast.
# A fixed random_state makes the subset identical on every Restart & Run All.
n0 = 30000
data = data_big.sample(n0, random_state=0)
print(data.shape)
print('Properties', data.columns)



# Data loading

In [None]:
# SMILES strings of the sampled molecules and the longest length (+5 padding).
smiles_all = data['smiles']
smiles_len = [len(s) for s in smiles_all]
max_len = max(smiles_len) + 5

# Vocabulary of the 35 characters that may appear in a SMILES string
# (the trailing " " is included as a symbol of its own).
SMILES_CHARS = ["7", "6", "o", "]", "3", "s", "(", "-", "S", "/", "B", "4", "[", ")", "#", "I",
                "l", "O", "H", "c", "1", "@", "=", "n", "P", "8", "C", "2", "F", "5", "r", "N", "+", "\\", " "]
# character -> column index in the one-hot encoding
smi2index = {c: i for i, c in enumerate(SMILES_CHARS)}

def smiles_to_one_hot(smiles, maxlen=max_len):
    """Encode a SMILES string as a one-hot matrix of shape (maxlen, len(SMILES_CHARS)).

    Row ``i`` holds a single 1 in the column that ``smi2index`` assigns to the
    i-th character; rows beyond the end of the string stay all-zero. Newline
    characters are removed before encoding.
    """
    cleaned = smiles.replace('\n', '')
    # Column index of every character (KeyError for symbols outside the vocabulary).
    columns = np.fromiter((smi2index[ch] for ch in cleaned), dtype=np.intp)
    one_hot = np.zeros((maxlen, len(SMILES_CHARS)))  # (maxlen, dictionary)
    # One fancy-indexed assignment sets the (position, symbol) entries to 1.
    one_hot[np.arange(columns.size), columns] = 1
    return one_hot

In [None]:
def smiles_to_MorganFP(smiles,radius=2):
    """Return the Morgan (circular) fingerprint of a SMILES string as a NumPy array.

    Args:
        smiles: SMILES string of the molecule.
        radius: Morgan radius passed on to RDKit (default 2).

    Returns:
        1-D ``np.ndarray`` with one entry per fingerprint bit. The number of
        bits is RDKit's default because ``nBits`` is not passed.

    Raises:
        ValueError: if RDKit cannot parse ``smiles``.
    """
    mol = AllChem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None.
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")

    # The bitInfo map is not needed here, so it is not requested.
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=radius)
    return np.array(fp)

In [None]:
def smiles_to_classicalMD(smiles):
    """Compute a vector of 12 classical molecular descriptors for a SMILES string.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        ``np.ndarray`` with, in this order:
        ``[hba, hbd, nrings, rtb, mr, mw, csp3, fmf, qed, hac,
        n_chiral_centers, max_ring_size]``.
        logP is computed by the same Crippen call as ``mr`` but is left out of
        the vector (see the commented-out alternative at the end).

    Raises:
        ValueError: if RDKit cannot parse ``smiles``.
    """
    m = AllChem.MolFromSmiles(smiles)
    if m is None:
        # MolFromSmiles signals a parse failure by returning None.
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")

    # number of H-bond acceptors for a molecule
    hba = rdMolDescriptors.CalcNumHBA(m)

    # number of H-bond donors for a molecule
    hbd = rdMolDescriptors.CalcNumHBD(m)

    # number of rings for a molecule
    nrings = rdMolDescriptors.CalcNumRings(m)

    # number of rotatable bonds for a molecule
    rtb = rdMolDescriptors.CalcNumRotatableBonds(m)

    # The topological polar surface area (TPSA), a medicinal-chemistry metric
    # for a drug's ability to permeate cells, is available through
    # rdMolDescriptors.CalcTPSA(m); it is not part of the returned vector.

    # logP and mr from https://pubs.acs.org/doi/10.1021/ci990307l:
    # logP ->  water partition coefficient as measure of lipophilicity
    # MR -> molar refractivity
    logp, mr = rdMolDescriptors.CalcCrippenDescriptors(m)

    # molecular weight
    mw = rdMolDescriptors._CalcMolWt(m)

    # Csp3: fraction of sp3 carbons
    csp3 = rdMolDescriptors.CalcFractionCSP3(m)

    # number of heavy atoms for a molecule
    hac = m.GetNumHeavyAtoms()

    # fraction of atoms belonging to Murcko framework
    # (defined as 0 when there are no heavy atoms, to avoid dividing by zero)
    fmf = GetScaffoldForMol(m).GetNumHeavyAtoms() / hac if hac else 0.0

    # max_ring_size: maximum ring size in a molecule (0 if there are no rings)
    max_ring_size = len(max(m.GetRingInfo().AtomRings(), key=len, default=()))

    # QED: quantitative estimate of drug-likeness (https://www.rdkit.org/docs/source/rdkit.Chem.QED.html)
    qed = QED.qed(m)

    # ChiralCenters: number of chiral centers (assigned and unassigned)
    n_chiral_centers = len(
        Chem.FindMolChiralCenters(m, includeUnassigned=True))

    # r = [hba, hbd, nrings, rtb, logp, mr, mw, csp3, fmf, qed, hac, n_chiral_centers, max_ring_size]
    r = [hba, hbd, nrings, rtb, mr, mw, csp3, fmf, qed, hac, n_chiral_centers, max_ring_size]
    return np.array(r)


In [None]:
# Data class
class MolecDataset(Dataset):
    """Torch ``Dataset`` that turns SMILES strings into feature tensors on the fly.

    Args:
        smiles_all: indexable sequence of SMILES strings.
        molecules_labels_all: indexable sequence of targets aligned with
            ``smiles_all`` (its length defines the dataset length).
        fp_style: molecular representation, one of
            ``'one_hot'`` - one-hot encoding from ``smiles_to_one_hot``,
                tensor of shape (1, max_len, len(SMILES_CHARS));
            ``'ecf'`` - Morgan fingerprint from ``smiles_to_MorganFP`` with
                ``radius``, tensor of shape (1, n_bits);
            ``'md'`` - descriptor vector from ``smiles_to_classicalMD``,
                tensor of shape (1, n_descriptors).
        bool_flatten: if True, flatten everything after the leading (channel)
            axis, e.g. (1, max_len, n_chars) -> (1, max_len * n_chars).
        max_len: number of rows of the one-hot encoding. The default of 35
            gives the 35 x 35 input that the CNN in this notebook expects by
            default.
        radius: Morgan radius used for ``'ecf'`` (default 3).

    Raises:
        ValueError: if ``fp_style`` is not one of the supported styles.
    """

    FP_STYLES = ('one_hot', 'ecf', 'md')

    def __init__(self, smiles_all, molecules_labels_all, fp_style='one_hot', bool_flatten = False,
                 max_len=35, radius=3):
        if fp_style not in self.FP_STYLES:
            raise ValueError(
                f"Unknown fp_style {fp_style!r}; expected one of {self.FP_STYLES}")
        self.labels_all = molecules_labels_all
        self.smiles_all = smiles_all
        self.fp_style = fp_style
        self.bool_flatten = bool_flatten
        self.max_len = max_len
        self.radius = radius

    def __len__(self):
        return len(self.labels_all)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for sample ``idx``; features are float32."""
        smiles = self.smiles_all[idx]
        label = self.labels_all[idx]

        if self.fp_style == 'one_hot':
            features = smiles_to_one_hot(smiles, self.max_len)
        elif self.fp_style == 'ecf':
            features = smiles_to_MorganFP(smiles, self.radius)
        elif self.fp_style == 'md':
            features = smiles_to_classicalMD(smiles)
        else:
            # Only reachable if fp_style was modified after construction.
            raise ValueError(
                f"Unknown fp_style {self.fp_style!r}; expected one of {self.FP_STYLES}")

        # float32 tensor with a leading channel axis (what Conv2d expects).
        z = torch.tensor(features, dtype=torch.float32).unsqueeze(0)

        if self.bool_flatten:
            z = torch.flatten(z, start_dim=1)

        return z, label

In [None]:
from sklearn.model_selection import train_test_split

# Inputs are the raw SMILES strings; targets are the columns listed in `properties`.
properties = ['homo', 'lumo']  # data[['smiles','homo']]
X_all = smiles_all.to_list()
y_all = data[properties].to_numpy()

# Ordered (unshuffled) split: the first 25 % of the rows form the training set,
# the remaining 75 % the test set.
Xtr, Xtest, ytr, ytest = train_test_split(X_all, y_all, shuffle=False, test_size=0.75)


# Models

In [None]:
class MLP(nn.Module):
    '''
    Multilayer Perceptron.

    Args:
        x_dim: number of input features after flattening each sample.
        z_dims: layer widths; every entry but the last is a hidden layer
            (Linear + ReLU) and the last entry is the output size. For the
            four-entry form ``[h1, h2, h3, n_out]`` the parameter names are
            ``encoder_layers.{0,2,4}`` and ``final_layer``.

    Raises:
        ValueError: if ``z_dims`` is empty.
    '''

    def __init__(self,x_dim,z_dims):
        super().__init__()
        z_dims = list(z_dims)
        if not z_dims:
            raise ValueError("z_dims must contain at least the output size")
        *hidden_dims, out_dim = z_dims

        # Hidden stack: Linear -> ReLU for every hidden width, in order.
        layers = []
        in_dim = x_dim
        for width in hidden_dims:
            layers += [nn.Linear(in_dim, width), nn.ReLU()]
            in_dim = width
        self.encoder_layers = nn.Sequential(*layers)
        self.final_layer = nn.Linear(in_dim, out_dim)

    def encoder(self,x):
        '''Flatten each sample and return the last hidden representation.'''
        return self.encoder_layers(x.flatten(start_dim=1))

    def forward(self, x):
        '''Forward pass'''
        # encoder() already flattens, so x is passed through unchanged.
        return self.final_layer(self.encoder(x))



In [None]:

class CNN(nn.Module):
    """Three-block convolutional encoder followed by one linear output layer.

    Args:
        n_outputs: number of values predicted per sample.
        input_size: (height, width) of the single-channel input. The default
            (35, 35) gives 16 * 9 * 9 = 1296 encoder features, as before.

    Input to ``forward`` is expected as (batch, 1, height, width).
    """

    def __init__(self,n_outputs, input_size=(35, 35)):
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1,
                out_channels=32,
                kernel_size=5,
                stride=1,
                padding=2,
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(
                in_channels=32,
                out_channels=32,
                kernel_size=5,
                stride=1,
                padding=2,
            ),
            nn.ReLU(),
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(32, 16, 3, 1, 2),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )

        # Size the linear head from a dummy forward pass instead of hard-coding
        # 16 * 9 * 9, so other input sizes work too. No random numbers are
        # consumed here, so weight initialisation is unchanged.
        with torch.no_grad():
            probe = self.conv3(self.conv2(self.conv1(torch.zeros(1, 1, *input_size))))
        n_features = probe.numel()

        # fully connected layer mapping the flattened features to n_outputs values
        self.out = nn.Linear(n_features, n_outputs)

        # The same three blocks grouped as the "encoder" (parameters are shared
        # with conv1/conv2/conv3, not duplicated).
        self.encoder_layers = nn.Sequential(
            self.conv1,
            self.conv2,
            self.conv3,
        )

    def encoder(self,x):
        """Return the convolutional feature maps for a batch ``x``."""
        return self.encoder_layers(x)

    def forward(self, x):
        x = self.encoder(x)
        x = x.view(x.size(0), -1)  # flatten everything but the batch axis
        output = self.out(x)
        return output


# Training

In [None]:
# Training set with the one-hot SMILES representation.
molecdata_tr = MolecDataset(Xtr, torch.tensor(ytr), 'one_hot')

# Draw a single mini-batch of 10 samples to inspect the tensor shapes.
train_dataloader = DataLoader(molecdata_tr, batch_size=10, shuffle=True)
d_epoch = next(iter(train_dataloader))
x, y = d_epoch
print('Batch size')
print(x.shape,y.shape)
print('===================')

def get_x_dim(x):
    """Return the product of all entries of ``x`` (1 for an empty sequence).

    Used to get the number of features per sample from a tensor shape.
    """
    total = 1
    for extent in x:
        total = total * extent
    return total
        
n_outputs = 2  # one output per target property (homo, lumo)
x_dim = get_x_dim(x.shape[1:])  # features per sample once flattened
print(x.shape[1:],x_dim)

print('===================')
print('MLP')
mlp_model = MLP(x_dim, [3000, 3000, 3000, n_outputs])
for param_name, param in mlp_model.named_parameters():
    print(param_name, param.shape)

print('===================')
print('CNN')
cnn_model = CNN(n_outputs)
# Shapes after the convolutional encoder and after the full network.
print(cnn_model.encoder(x).shape)
print(cnn_model(x).shape)
for param_name, param in cnn_model.named_parameters():
    print(param_name, param.shape)

In [None]:
def train(model, training_data, training_epochs=60, *, batch_size=128, lr=1e-4,
          weight_decay=0.02, progress=True):
    """Train ``model`` with an MSE loss and AdamW, returning the per-batch losses.

    Args:
        model: ``nn.Module`` mapping a batch of inputs to predictions.
        training_data: dataset yielding ``(inputs, targets)`` pairs.
        training_epochs: number of passes over ``training_data``.
        batch_size: mini-batch size of the (shuffled) data loader.
        lr: AdamW learning rate.
        weight_decay: AdamW weight-decay coefficient.
        progress: show a tqdm progress bar over epochs; set False to run
            silently (e.g. outside a notebook).

    Returns:
        List with one entry per epoch; each entry is the list of per-batch loss
        values (Python floats) of that epoch.
    """
    # Define the loss function and optimizer
    loss_function = nn.MSELoss()
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=lr, weight_decay=weight_decay)

    trainloader = torch.utils.data.DataLoader(
        training_data, batch_size=batch_size, shuffle=True)

    iterator = range(training_epochs)
    if progress:
        # `import tqdm` alone does not load the tqdm.notebook submodule, so the
        # bar is imported explicitly; tqdm.auto picks the notebook widget when
        # available and falls back to the text bar otherwise.
        from tqdm.auto import tqdm as progress_bar
        iterator = progress_bar(iterator)

    # Make sure the model is in training mode (it may have been put in eval mode).
    model.train()

    # Run the training loop (epochs)
    loss_trajectory = []
    for _ in iterator:
        current_loss = []
        for inputs, targets in trainloader:
            optimizer.zero_grad()

            outputs = model(inputs)

            # Give the targets exactly the shape of the predictions: 1-D
            # targets against (batch, 1) outputs would otherwise be silently
            # broadcast to (batch, batch) inside MSELoss.
            targets = targets.float()
            if targets.shape != outputs.shape:
                targets = targets.reshape(outputs.shape)

            loss = loss_function(outputs, targets)
            loss.backward()
            optimizer.step()

            current_loss.append(loss.item())

        if progress:
            mean_loss = sum(current_loss) / len(current_loss) if current_loss else float('nan')
            iterator.set_postfix(loss=mean_loss)
        loss_trajectory.append(current_loss)
    return loss_trajectory


In [None]:
# Train the CNN on the HOMO/LUMO training set for 2 epochs;
# `loss_trajectory` holds one list of per-batch losses per epoch.
loss_trajectory = train(cnn_model,molecdata_tr,2)

In [None]:
# Per-batch training loss over ALL epochs (one point per optimiser step),
# not only the first epoch.
losses = np.concatenate([np.asarray(epoch_losses) for epoch_losses in loss_trajectory])

fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(np.arange(len(losses)), losses)
ax.set_xlabel('Iterations', fontsize=15)
ax.set_ylabel('Loss function', fontsize=15);


# How good is our model? 
Using the remaining (test) data set, make a ```predicted vs. exact``` plot and compute the RMSE value.

In [None]:
molecdata_tst = MolecDataset(Xtest, torch.tensor(ytest), 'one_hot')
testloader = torch.utils.data.DataLoader(
    molecdata_tst, batch_size=128, shuffle=False)

# Predict the whole test set batch by batch (no gradients needed).
# The loop does not use the name `data`, so the DataFrame is not overwritten.
cnn_model.eval()
batch_predictions = []
with torch.no_grad():
    for inputs, _ in testloader:
        batch_predictions.append(cnn_model(inputs))
outputs_pred = torch.cat(batch_predictions)

# Same dtype as the predictions (float32) before comparing.
y_true = torch.tensor(ytest).float()
print(y_true.shape)
print(outputs_pred.shape)

loss_function = nn.MSELoss()
mse = loss_function(outputs_pred, y_true)
print(mse)
print('RMSE', torch.sqrt(mse).item())

In [None]:
# figure here

# Transfer learning

(from [Wiki](https://en.wikipedia.org/wiki/Transfer_learning))\
Transfer learning (TL) is a research problem in machine learning (ML) that focuses on storing knowledge gained while solving one problem and applying it to a different but related problem.\
For example, knowledge gained while learning to recognize cars could be applied when trying to recognize trucks.\
From the practical standpoint, reusing or transferring information from previously learned tasks for the learning of new tasks has the potential to significantly improve the sample efficiency of a reinforcement learning agent.

**How can we use transfer learning for Chemistry?**

<img src="https://raw.github.com/cs231n/cs231n.github.io/master/assets/nn1/neural_net2.jpeg" width="500" height="300">

In [None]:
def print_net_parameters(net):
    """Print the name, shape and ``requires_grad`` flag of every parameter of ``net``."""
    separator = "-" * 20
    for label, tensor in net.named_parameters():
        print(separator)
        print(f"name: {label}")
        print("size: ", tensor.shape)
        print("Grad: ", tensor.requires_grad)


In [None]:
# List every CNN parameter with its shape and whether it is currently trainable.
print_net_parameters(cnn_model)

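Optimization in ML is based on gradients: during training, only parameters for which a gradient is computed (`requires_grad=True`) are updated.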

If we exclude some of the parameters from the updates during training, they stay fixed, and we can use them as a "fixed" encoding representation.

In [None]:
# Freeze the convolutional ("encoder") parameters: with requires_grad switched
# off they receive no gradient and are left untouched by further training.
for name, param in cnn_model.named_parameters():
    if 'conv' in name and param.requires_grad:
        param.requires_grad_(False)
print_net_parameters(cnn_model)

# In-Class exercise  
Re-train the model to predict ```logP``` without touching the "encoder", i.e. update only the final (output) layer.


In [None]:
# Fresh random subset for the logP task; the fixed random_state keeps it
# reproducible across Restart & Run All.
n0 = 10000
data = data_big.sample(n0, random_state=1)
smiles_logP_all = data['smiles'].to_list()

# Target: Crippen logP computed with RDKit for every molecule
# (CalcCrippenDescriptors returns the pair (logP, MR); only logP is kept).
logP_all = np.array([
    rdMolDescriptors.CalcCrippenDescriptors(AllChem.MolFromSmiles(s))[0]
    for s in smiles_logP_all
])


In [None]:
X_log_all = smiles_logP_all
y_logP_all = logP_all

# Ordered (unshuffled) 50/50 split for the logP task.
# Note: this reuses, and therefore overwrites, Xtr / Xtest / ytr / ytest.
Xtr, Xtest, ytr, ytest = train_test_split(X_log_all, y_logP_all, shuffle=False, test_size=0.5)


In [None]:
# logP targets are scalars; store them as a column so that every batch of
# targets has the same (batch, 1) shape as the output of CNN(1).
logPdata_tr = MolecDataset(Xtr, torch.tensor(ytr).reshape(-1, 1), 'one_hot')

cnn_model_logP = CNN(1)
print(cnn_model_logP.conv1)

# Transfer learning: copy the trained convolutional encoder of `cnn_model`
# into the new model and freeze it, so that only the output layer is trained.
cnn_model_logP.encoder_layers.load_state_dict(cnn_model.encoder_layers.state_dict())
for name, param in cnn_model_logP.named_parameters():
    if 'conv' in name:
        param.requires_grad = False

# Sanity check: largest absolute difference per parameter between both models.
# It must be 0 for every conv parameter; the `out` layers differ.
with torch.no_grad():
    for (name, param), (name1, param1) in zip(cnn_model.named_parameters(),cnn_model_logP.named_parameters()):
        print(name, (param - param1).abs().max().item())

# Next step of the exercise: train the new head on the logP data, e.g.
# loss_trajectory_logP = train(cnn_model_logP, logPdata_tr, 2)

In [None]:
cnn_model.eval()
testloader = torch.utils.data.DataLoader(
    molecdata_tst, batch_size=2, shuffle=False)

with torch.no_grad():
    # Sanity check: print the predictions for the first mini-batch only.
    # (`inputs` is the batch; `input` is the Python builtin and must not be
    # passed to the model.) `outputs_pred` from the evaluation cell is left
    # untouched.
    inputs, _ = next(iter(testloader))
    print(cnn_model(inputs))