In [7]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# load dependencies
import sys
import pickle
import os
import shutil
import json
import pandas as pd
import numpy as np
import rdkit.Chem as Chem
from numpy.random import default_rng
import torch
from ast import literal_eval
from torch import nn, optim
from rdkit import Chem
from rdkit.Chem import AllChem
from tdc import Oracle
import subprocess

### Loading the main dataset

In [9]:
from helper import compute_fingerprints

drd2_final_test_large = pd.read_csv("data/drd2_final_large.csv")

print("The number of compounds in the final test set is: ", drd2_final_test_large.shape[0])

smiles = drd2_final_test_large["smiles"].to_numpy()
# features = compute_fingerprints(smiles)
labels = drd2_final_test_large["label"].to_numpy()

num_with_drd2 = len([x for x in labels if x == 1])
num_without_drd2 = len([x for x in labels if x == 0])
print("The number of compounds with drd2 is: ", num_with_drd2)
print("The number of compounds without drd2 is: ", num_without_drd2)

print("Ratio of compounds without drd2: ", num_without_drd2/len(labels))
print("Ratio of compounds with drd2: ", num_with_drd2/len(labels))
print("Ratio of compounds with drd2/compounds without drd2: ", num_with_drd2/num_without_drd2)

# delete the drd2_final_test_large dataframe to free up memory
# del drd2_final_test_large

The number of compounds in the final test set is:  620480
The number of compounds with drd2 is:  7759
The number of compounds without drd2 is:  612721
Ratio of compounds without drd2:  0.9874951650335224
Ratio of compounds with drd2:  0.012504834966477566
Ratio of compounds with drd2/compounds without drd2:  0.012663186017779706


### Constructing the training and testing dataset

In [10]:
# Calculate nCr 
def nCr(n, r):
    return (np.math.factorial(n) / (np.math.factorial(r) * np.math.factorial(n - r)))

number_of_values = nCr(1000, 3)
print("The number of values is: ", number_of_values)

# fps finger print has dimension 2048
number_of_values_fps = number_of_values * 2048
print("The number of values after converting to finger prints is: ", number_of_values_fps)
# Each is float 32, now we need to know in MB
size_of_values = number_of_values_fps * 4 / 1024 / 1024

num_batches = 10000
print("The size of the values in MB is: ", size_of_values)  
print("The size of the values in MB/batch is: ", size_of_values/num_batches)
print("The size of the values in GB is: ", size_of_values/1024)
print("The size of the values in GB/batch is: ", size_of_values/1024/num_batches)


The number of values is:  166167000.0
The number of values after converting to finger prints is:  340310016000.0
The size of the values in MB is:  1298179.6875
The size of the values in MB/batch is:  129.81796875
The size of the values in GB is:  1267.7536010742188
The size of the values in GB/batch is:  0.12677536010742188


In [11]:
num_points_test = 1000
num_points_train = 200
num_points = num_points_test + num_points_train

ratio_drd2_over_num_points = 0.5 # Even though the actual ratio is 0.015, we want to have a higher ratio of compounds with drd2 in the final test set

num_with_drd2_to_select = int(num_points * ratio_drd2_over_num_points)
num_without_drd2_to_select = num_points - num_with_drd2_to_select

# get the indices of the compounds with drd2
indices_without_drd2 = [i for i, x in enumerate(labels) if x == 0]
indices_with_drd2 = [i for i, x in enumerate(labels) if x == 1]


# randomly select compounds with drd2
rng = default_rng()
selected_indices_with_drd2 = rng.choice(indices_with_drd2, num_with_drd2_to_select, replace=False)
selected_indices_without_drd2 = rng.choice(indices_without_drd2, num_without_drd2_to_select, replace=False)

chosen_indices = np.concatenate((selected_indices_without_drd2, selected_indices_with_drd2))

print(chosen_indices)
# get the smiles and labels of the chosen indices
chosen_smiles = smiles[chosen_indices]
chosen_labels = [0] * num_without_drd2_to_select + [1] * num_with_drd2_to_select
chosen_labels = np.array(chosen_labels)

# train test split

from sklearn.model_selection import train_test_split

chosen_smiles_train, chosen_smiles_test, chosen_labels_train, chosen_labels_test = train_test_split(chosen_smiles, chosen_labels, 
                                                                                                    test_size=num_points_test/(num_points_test + num_points_train), 
                                                                                                    random_state=42, stratify=chosen_labels)
drd2_final_train_small = pd.DataFrame()
drd2_final_test_small = pd.DataFrame()

drd2_final_train_small["smiles"] = chosen_smiles_train
drd2_final_train_small["label"] = chosen_labels_train

drd2_final_test_small["smiles"] = chosen_smiles_test
drd2_final_test_small["label"] = chosen_labels_test

# Sanity checking

num_with_drd2_train = len([x for x in chosen_labels_train if x == 1])
num_without_drd2_train = len([x for x in chosen_labels_train if x == 0])
print("The number of compounds with drd2 in the training set is: ", num_with_drd2_train)
print("The number of compounds without drd2 in the training set is: ", num_without_drd2_train)

num_with_drd2_test = len([x for x in chosen_labels_test if x == 1])
num_without_drd2_test = len([x for x in chosen_labels_test if x == 0])

print("The number of compounds with drd2 in the test set is: ", num_with_drd2_test)
print("The number of compounds without drd2 in the test set is: ", num_without_drd2_test)

drd2_final_train_small.to_csv("data/drd2_final_train_small.csv", index=False)
drd2_final_test_small.to_csv("data/drd2_final_test_small.csv", index=False)

[452802 415423 597784 ... 612910 618730 613726]
The number of compounds with drd2 in the training set is:  100
The number of compounds without drd2 in the training set is:  100
The number of compounds with drd2 in the test set is:  500
The number of compounds without drd2 in the test set is:  500


### Checking if the Oracle agrees with the labels

In [12]:
from helper import compute_fingerprints

drd2_final_test_small = pd.read_csv("data/drd2_final_test_small.csv")
smiles_list = drd2_final_test_small["smiles"].tolist()
print(smiles_list)
labels = drd2_final_test_small["label"].to_numpy()

oracle = Oracle(name='DRD2')
proba_molecules = [oracle(smiles) for smiles in smiles_list]

# Probability list of molecules where label is 0
proba_molecules_0 = [proba for i, proba in enumerate(proba_molecules) if labels[i] == 0]

# Probability list of molecules where label is 1
proba_molecules_1 = [proba for i, proba in enumerate(proba_molecules) if labels[i] == 1]

# Average values 
average_proba_molecules_0 = np.mean(proba_molecules_0)
average_proba_molecules_1 = np.mean(proba_molecules_1)

print("The average probability of molecules with label 0 is: ", average_proba_molecules_0)
print("The average probability of molecules with label 1 is: ", average_proba_molecules_1)


Found local copy...


['CCCNC(=O)COC(=O)c1ccccc1OCC(=O)Nc1ccc(Br)cc1', 'O=C(NCCC(F)CN1CCN(c2cccc(Cl)c2Cl)CC1)c1ccc(-c2ccccn2)cc1', 'Cn1c(SCc2nnc(-c3ccccc3Br)o2)nc2sc3c(c2c1=O)CCC3', 'Cc1nc(S(=O)(=O)c2ccc(Cl)cc2)c(N(C)C)o1', 'CN1CCN(C2Cc3ccccc3Sc3ccc(-c4ccccn4)cc32)CC1', 'Cc1ccc2c(-c3nnc(SCCCN4CCc5cc6c(cc5CC4)OCCN6)n3C)cccc2n1', 'COc1ccc(N2CCN(CCCOc3ccc(-c4nc5ccccc5[nH]4)cc3)CC2)cc1', 'COc1ccccc1N1CCN(CC(O)COc2ccc3c(c2)OCO3)CC1', 'CCC(=NNC(N)=O)c1ccc(OC)c(OC)c1', 'COc1ccc(C(=O)NNC(=O)C23CC4CC(CC(C4)C2)C3)c(OC)c1', 'COc1ccc(C=CC(=O)c2ccc(OC)cc2OC)c(OC)c1', 'O=C(c1cccc2ccccc12)N1CCN(Cc2ccccc2F)CC1', 'COc1ccc(C(=O)NNC(=O)c2ccc3ccccc3n2)cc1', 'O=C(NCN1CCN(c2ccccc2Cl)CC1)c1cccc(Cl)c1', 'N#Cc1ccc2[nH]c(CN3CCN(c4ccccc4Cl)CC3)cc2c1', 'COc1cccc(CCc2ccccc2OCCCN2CCN(c3cccc(C(F)(F)F)c3)CC2)c1', 'O=C(Oc1ccc(-n2cnnn2)cc1)c1ccco1', 'COc1ccc(-c2cccc3c2CCC2C3c3cc(O)c(Cl)cc3CCN2C)cc1', 'CCNC(=O)Nc1ccc2c(c1)CCC1C2c2cc(O)c(Cl)cc2CCN1C', 'CCOC(=O)c1sc2nc(N3CCOCC3)c3c(c2c1N)CC(C(C)C)OC3', 'Cn1c(CN2CCC(c3ccccc3)CC2)nc2ccccc21', 'O