In [1]:
import sys
import os

!pip install rdkit-pypi



In [2]:
!git clone https://github.com/Romain-MIPI/Reinforcement-Learning-for-De-Novo-Drug-Design.git

fatal: destination path 'Reinforcement-Learning-for-De-Novo-Drug-Design' already exists and is not an empty directory.


In [2]:
# Make the cloned repository the working directory. Later cells read
# './data/...' and import from `models.*`, which presumably live inside this
# checkout (the repository layout is not visible from the notebook itself).
# NOTE(review): running this cell twice in one session attempts a nested cd.
%cd Reinforcement-Learning-for-De-Novo-Drug-Design

/content/Reinforcement-Learning-for-De-Novo-Drug-Design


In [4]:
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole

In [5]:
from sklearn.metrics import balanced_accuracy_score, f1_score, recall_score, precision_score
import pandas as pd
import copy
import pickle
import numpy as np
import matplotlib.pyplot as plt

In [6]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.optim.lr_scheduler import ExponentialLR
import torch.nn.functional as F

In [7]:
from rdkit.Chem import QED

# Load the CSV once, then pull out the SMILES strings and the labels
# (labels are cast to int64).
data = pd.read_csv('./data/clean_020724_all_with_update.csv')
smiles, labels = (
    data['SMILES'].values,
    data['Labels'].values.astype(np.int64),
)

In [8]:
# Peek at the first ten SMILES strings and their labels (one array per line).
print(smiles[:10], labels[:10], sep='\n')

['O=C(N(C[C@H](O)[C@@H](O)[C@H](O)[C@H](O)CO)C)c1cnc(CN(Cc2ccccc2)Cc2ccccc2)cc1'
 'O=C(SC2=NN=C(C3=C(O)C=CC=C3)N2N)NC1=CC=CC=C1'
 'OC([C@@H](N)CN[C@H](C(O)=O)CN[C@@H](CC(O)=O)C)=O'
 'CC(=CCC[C@@](C)([C@H]1CC[C@@]2([C@@H]1[C@@H](C[C@H]3[C@]2(CC[C@@H]4[C@@]3(CC[C@@H](C4(C)C)O[C@H]5[C@@H]([C@H]([C@@H]([C@H](O5)CO)O)O)O[C@H]6[C@@H]([C@H]([C@@H]([C@H](O6)CO)O)O)O)C)C)O)C)O)C '
 'C[C@@]12CCC[C@@]([C@H]1CC[C@]34[C@H]2CC[C@](C3)(C(=C)C4)O[C@H]5[C@@H]([C@H]([C@@H]([C@H](O5)CO)O)O)O[C@H]6[C@@H]([C@H]([C@@H]([C@H](O6)CO)O)O)O)(C)C(=O)O[C@H]7[C@@H]([C@H]([C@@H]([C@H](O7)CO)O)O)O '
 'C[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)OC[C@@H]2[C@H]([C@@H]([C@H]([C@@H](O2)OC3=CC(=C4C(=O)C[C@H](OC4=C3)C5=CC(=C(C=C5)OC)O)O)O)O)O)O)O)O '
 'C[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)OC[C@@H]2[C@H]([C@@H]([C@H]([C@@H](O2)OC3=CC(=C4C(=C3)OC(=CC4=O)C5=CC(=C(C=C5)OC)O)O)O)O)O)O)O)O '
 'C1[C@H]([C@@H]([C@H]([C@@H]([C@H]1N)O[C@@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O)O)N)O[C@H]3[C@@H]([C@@H]([C@H](O3)CO)O[C@@H]4[C@@H]([C@H]([C@@H]([C

In [9]:
from models.utils import get_tokens

# get_tokens returns three values; only the first (the token collection) is
# used here. It is joined into one string so a token's index is its position.
token_collection, _, _ = get_tokens(smiles)
tokens = ''.join(token_collection)

tokens

' #%()+-./0123456789=@BCFHIKNOPS[\\]aceilnorst'

In [9]:
from sklearn.model_selection import KFold, train_test_split

# Shuffle with a fixed seed so the five folds (and therefore every metric
# reported below) are identical after Restart Kernel -> Run All.
cross_validation_split = KFold(n_splits=5, shuffle=True, random_state=42)

# Materialise the (train_indices, test_indices) pairs once so later cells can
# iterate over them.
# NOTE: this rebinds `data`, which previously held the DataFrame read from the
# CSV; from here on `data` is the list of fold index pairs.
data = list(cross_validation_split.split(smiles, labels))

In [11]:
from models.smiles_dataset import SmilesDataset
from models.encoder import Encoder
from models.mlp import MLP
from models.smiles2label import Smiles2Label
from models.utils import identity

# Model class instantiated once per cross-validation fold.
model_object = Smiles2Label

# Hyper-parameters reused by several sub-configurations below.
n_hidden = 128
batch_size = 128
num_epochs = 26
lr = 0.005

# Full training/model configuration. Keys and insertion order are unchanged;
# nested sections are written with dict(...) instead of literals.
model_params = dict(
    use_cuda=True,
    random_seed=42,
    world_size=1,
    data_layer=SmilesDataset,
    use_clip_grad=False,
    batch_size=batch_size,
    num_epochs=num_epochs,
    logdir='./data/checkpoints/classification/',
    print_every=1,
    save_every=5,
    # Placeholders: the per-fold datasets are assigned before each training run.
    train_data_layer=None,
    val_data_layer=None,
    eval_metrics=balanced_accuracy_score,  # f1_score, recall_score, precision_score
    average="macro",
    criterion=nn.CrossEntropyLoss(),
    optimizer=Adam,
    optimizer_params=dict(lr=lr),
    lr_scheduler=ExponentialLR,
    lr_scheduler_params=dict(gamma=0.98),
    embedding=nn.Embedding,
    embedding_params=dict(
        num_embeddings=len(tokens),
        embedding_dim=n_hidden,
        # The blank character in the token string is used as the padding index.
        padding_idx=tokens.index(' '),
    ),
    encoder=Encoder,
    encoder_params=dict(
        input_size=n_hidden,
        layer="LSTM",
        encoder_dim=n_hidden,
        n_layers=2,
        dropout=0.8,
        is_bidirectional=False,
    ),
    mlp=MLP,
    mlp_params=dict(
        input_size=n_hidden,
        n_layers=2,
        # Last layer width is 3 — presumably the number of label classes; confirm.
        hidden_size=[n_hidden, 3],
        activation=[F.relu, identity],
        dropout=0.0,
    ),
)

In [14]:
# Output locations used by this and the following cells.
params_path = './data/checkpoints/classification/model_parameters.pkl'
log_dir = model_params['logdir']
tmp_data_dir = './data/tmp/'

# Create every directory (including missing parents) BEFORE writing into it.
# The previous version pickled first and created the checkpoint directory
# afterwards, which fails on a fresh checkout; it also hid all errors behind a
# bare `except:`.
for directory in (os.path.dirname(params_path), log_dir, tmp_data_dir):
    os.makedirs(directory, exist_ok=True)

# Persist the configuration; the context manager guarantees the file is
# flushed and closed.
with open(params_path, 'wb') as params_file:
    pickle.dump(model_params, params_file)

In [17]:
from torch.utils.data import DataLoader
from models.utils import save_smiles_property_file
from models.smiles2label import fit, build_training

# Train one model per cross-validation fold.
# `models[i]` / `results[i]` hold the trained model and the value returned by
# `fit` for fold i.
models = []
results = []
for i, (train, test) in enumerate(data):
    # Slice this fold's SMILES strings and labels by index.
    X_train, y_train = smiles[train], labels[train].reshape(-1)
    X_test, y_test = smiles[test], labels[test].reshape(-1)

    # Write the fold to disk; SmilesDataset below is constructed from these
    # files, and the evaluation cell re-reads the *_test.smi files.
    train_path = tmp_data_dir + str(i) + '_train.smi'
    test_path = tmp_data_dir + str(i) + '_test.smi'
    save_smiles_property_file(train_path, X_train, y_train.reshape(-1, 1))
    save_smiles_property_file(test_path, X_test, y_test.reshape(-1, 1))

    train_dataset = SmilesDataset(train_path,
                                  delimiter=',', cols_to_read=[0, 1], tokens=tokens,
                                  flip=False)
    # Replace the targets with a squeezed int64 array (CrossEntropyLoss is the
    # configured criterion).
    train_dataset.target = np.array(train_dataset.target, dtype=np.int64).squeeze()
    test_dataset = SmilesDataset(test_path,
                                 delimiter=',', cols_to_read=[0, 1], tokens=tokens,
                                 flip=False)
    test_dataset.target = np.array(test_dataset.target, dtype=np.int64).squeeze()

    # NOTE: model_params is mutated in place; after the loop it still points at
    # the last fold's datasets and log directory.
    model_params['train_data_layer'] = train_dataset
    model_params['val_data_layer'] = test_dataset
    model_params['logdir'] = log_dir + 'fold_' + str(i)

    # Creates both the fold directory and its checkpoint sub-directory; it is a
    # no-op when they already exist and lets real errors (e.g. permissions)
    # surface instead of being swallowed by a bare `except:`.
    ckpt_dir = model_params['logdir'] + '/checkpoint/'
    os.makedirs(ckpt_dir, exist_ok=True)

    train_loader = DataLoader(train_dataset,
                              batch_size=model_params['batch_size'],
                              shuffle=True,
                              num_workers=4,
                              pin_memory=True)
    val_loader = DataLoader(test_dataset,
                            batch_size=model_params['batch_size'],
                            shuffle=False,
                            num_workers=1,
                            pin_memory=True)

    # The model is moved to the GPU unconditionally, so a CUDA device is required.
    model = model_object(params=model_params).cuda()
    models.append(model)
    criterion, optimizer, lr_scheduler = build_training(model, model_params)
    results.append(fit(model, lr_scheduler, train_loader, optimizer, criterion,
                       model_params, val_loader=val_loader))



In [18]:
from models.smiles2label import evaluate

# Re-evaluate every trained fold model on its own held-out split.
rmse = []
bal_acc_score = []
# Loop-invariant: the averaging mode is the same for every fold.
average = model_params['average']
# Iterate over the models that were actually trained rather than a hard-coded
# fold count, so this cell stays in sync with the number of CV splits.
for i, fold_model in enumerate(models):
    test_dataset = SmilesDataset(tmp_data_dir + str(i) + '_test.smi',
                                 delimiter=',', cols_to_read=[0, 1], tokens=tokens,
                                 flip=False)
    test_dataset.target = np.array(test_dataset.target, dtype=np.int64).squeeze()
    val_loader = DataLoader(test_dataset,
                            batch_size=model_params['batch_size'],
                            shuffle=False,
                            num_workers=1,
                            pin_memory=True)
    # `criterion` is the one left over from the last training fold.
    metrics = evaluate(fold_model, val_loader, criterion, average)
    # NOTE(review): metrics[0] is presumably the criterion (cross-entropy) value;
    # if so its square root is not a true RMSE — confirm against `evaluate`.
    rmse.append(np.sqrt(metrics[0]))
    bal_acc_score.append(metrics[1])



In [19]:
# Report the mean of each per-fold metric list.
for caption, fold_scores in (
    ("Cross-validated RMSE: ", rmse),
    ("Cross-validated balanced accuracy score: ", bal_acc_score),
):
    print(caption, np.mean(fold_scores))

Cross-validated RMSE:  0.9105862208944178
Cross-validated balanced accuracy score:  0.5678339006421722


In [20]:
! mv ./data/checkpoints/classification/fold_0/checkpoint/epoch_25 ./data/checkpoints/classification/fold_0.pkl
! mv ./data/checkpoints/classification/fold_1/checkpoint/epoch_25 ./data/checkpoints/classification/fold_1.pkl
! mv ./data/checkpoints/classification/fold_2/checkpoint/epoch_25 ./data/checkpoints/classification/fold_2.pkl
! mv ./data/checkpoints/classification/fold_3/checkpoint/epoch_25 ./data/checkpoints/classification/fold_3.pkl
! mv ./data/checkpoints/classification/fold_4/checkpoint/epoch_25 ./data/checkpoints/classification/fold_4.pkl

## Test prediction

In [10]:
# Locations of the pickled configuration and the common prefix of the
# per-fold checkpoint files.
path_to_params = './data/checkpoints/classification/model_parameters.pkl'
path_to_checkpoint = './data/checkpoints/classification/fold_'

# The predictor is given the vocabulary as a list of single characters.
predictor_tokens = [*tokens]
print(predictor_tokens)

[' ', '#', '%', '(', ')', '+', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '=', '@', 'B', 'C', 'F', 'H', 'I', 'K', 'N', 'O', 'P', 'S', '[', '\\', ']', 'a', 'c', 'e', 'i', 'l', 'n', 'o', 'r', 's', 't']


In [11]:
from models.rnn_predictor import RNNPredictor

# Build the predictor from the pickled parameter file, the 'fold_' checkpoint
# prefix and the character vocabulary.
# NOTE(review): presumably RNNPredictor appends the fold index and '.pkl' to the
# prefix to load each fold's checkpoint — confirm in models/rnn_predictor.py.
my_predictor = RNNPredictor(path_to_params, path_to_checkpoint, predictor_tokens)

In [23]:
# Only the second of the three values returned by predict() is used.
# NOTE(review): labels and predictions are compared by position, which assumes
# predict() returns exactly one prediction per input SMILES, in order — confirm.
_, predictions, _ = my_predictor.predict(smiles)

count = 0
miss_count = 0
total = len(smiles)

for idx in range(total):
    expected, predicted = labels[idx], predictions[idx]
    if expected != predicted:
        # Log every disagreement as: running index | SMILES, label, prediction.
        miss_count += 1
        print(f"{miss_count} | {smiles[idx]} {expected} {predicted}")
        continue
    count += 1

print(f"accuracy : {(count/total*100):.2f}% ({count}/{total})")

1 | O=C(SC2=NN=C(C3=C(O)C=CC=C3)N2N)NC1=CC=CC=C1 1 0
2 | C[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)OC[C@@H]2[C@H]([C@@H]([C@H]([C@@H](O2)OC3=CC(=C4C(=O)C[C@H](OC4=C3)C5=CC(=C(C=C5)OC)O)O)O)O)O)O)O)O  1 0
3 | C[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)OC[C@@H]2[C@H]([C@@H]([C@H]([C@@H](O2)OC3=C(OC4=CC(=CC(=C4C3=O)O)O)C5=CC(=C(C=C5)O)O)O)O)O)O)O)O  1 0
4 | O=S(NCC1=CC=CC=C1)(C2=CC=C(OC4=C3C=CC(N5C=NN=N5)=C4)C3=C2)=O 1 2
5 | CC1=CC=C(S(N2CCCCC2C(NC3=CC=C(N4C=NN=N4)C=C3)=O)(=O)=O)C=C1 1 0
6 | [H][C@@]12N([C@H](C(OC)=O)[C@@H](C)O2)[C@@H](CS)SC1 1 0
7 | O=C(C1=CC=C(OCC)C=C1)N(OCC2=CC=CC=C2)C3NC(C3)=O 0 1
8 | O=C(C1=CC=C([N+]([O-])=O)C=C1)N(OCC2=CC=CC=C2)C3N(SC)C(C3)=O 1 0
9 | O=C(C1=CC=C(OCC)C=C1)N(OCC2=CC=CC=C2)C3N(SC)C(C3)=O 2 1
10 | O=C(C1=CC=C([N+]([O-])=O)C=C1)N(OCC=C)C2N(SC)C(C2)=O 2 1
11 | O=C(C1=CC(C(F)(F)F)=CC([N+]([O-])=O)=C1)N(OCC=C)C2N(SC)C(C2)=O 2 1
12 | O=C(C1=CC=CC=C1)N(OCC2=CC=CC=C2)[C@@H]3NC([C@H]3NC(CC4=CC=CC=C4)=O)=O 1 0
13 | O=C(C1=CC=CC=C1)N(OCC2=CC=CC=C2)[C@@H]3N(SC)C([C@H]3NC(CC4=C