In [30]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to imported source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
# load dependencies
# -- standard library --
import json
import math
import os
import pickle
import shutil
import subprocess
import sys
from ast import literal_eval

# -- third-party --
import numpy as np
import pandas as pd
import rdkit.Chem as Chem
import torch
from numpy.random import default_rng
from rdkit import Chem
from rdkit.Chem import AllChem
from tdc import Oracle
from torch import nn, optim

### Loading the main dataset

In [32]:
from helper import compute_fingerprints

# Read the large labelled DRD2 table; only the "smiles" and "label" columns
# are used below.
drd2_final_test_large = pd.read_csv("data/drd2_final_large.csv")

print("The number of compounds in the final test set is: ", drd2_final_test_large.shape[0])

# Keep the two columns as NumPy arrays; later cells index into them.
smiles = drd2_final_test_large["smiles"].to_numpy()
labels = drd2_final_test_large["label"].to_numpy()

# Vectorised class counts: comparing the whole array at once avoids a
# Python-level loop over every label. Cast to plain int so the printed values
# and the ratios below are ordinary Python numbers.
num_with_drd2 = int(np.count_nonzero(labels == 1))
num_without_drd2 = int(np.count_nonzero(labels == 0))
print("The number of compounds with drd2 is: ", num_with_drd2)
print("The number of compounds without drd2 is: ", num_without_drd2)

# Class balance of the full dataset (label 1 = "with drd2", label 0 = "without").
print("Ratio of compounds without drd2: ", num_without_drd2/len(labels))
print("Ratio of compounds with drd2: ", num_with_drd2/len(labels))
print("Ratio of compounds with drd2/compounds without drd2: ", num_with_drd2/num_without_drd2)

The number of compounds in the final test set is:  620480
The number of compounds with drd2 is:  7759
The number of compounds without drd2 is:  612721
Ratio of compounds without drd2:  0.9874951650335224
Ratio of compounds with drd2:  0.012504834966477566
Ratio of compounds with drd2/compounds without drd2:  0.012663186017779706


### Constructing the training and testing dataset

In [33]:
# Calculate nCr
def nCr(n, r):
    """Return the binomial coefficient "n choose r" as an exact integer.

    The previous implementation went through ``np.math.factorial`` and a float
    division. ``np.math`` is no longer available in current NumPy releases, and
    the float division loses exactness for large results (and overflows for
    very large ones). ``math.comb`` computes the same quantity with exact
    integer arithmetic.

    Args:
        n: Total number of items (non-negative integer).
        r: Number of items chosen (non-negative integer).

    Returns:
        int: Number of ways to choose ``r`` items out of ``n`` without
        repetition and without order. ``0`` when ``r > n``.

    Raises:
        ValueError: If ``n`` or ``r`` is negative.
        TypeError: If ``n`` or ``r`` is not an integer.
    """
    return math.comb(n, r)

# Rough memory budget: how much space would fingerprints for every
# 3-element combination drawn from 200 points take?
number_of_combinations = int(nCr(200, 3))
print("The number of combinations is: ", number_of_combinations)

# One 2048-dimensional fingerprint vector per combination.
number_of_values_fps = 2048 * number_of_combinations
print("The number of values after converting to fingerprints is: ", number_of_values_fps)

# 4 bytes per float32 value; dividing by 1024 twice converts bytes to MB.
size_of_values = number_of_values_fps * 4 / 1024 / 1024

# Number of chunks needed when handling `batch_pred` combinations at a time.
batch_pred = 100000
num_batch = np.ceil(number_of_combinations / batch_pred)

# Report totals and per-batch figures, in MB and then in GB.
_size_report = (
    ("The number of batches is: ", num_batch),
    ("The size of the values in MB is: ", size_of_values),
    ("The size of the values in MB/batch is: ", size_of_values/num_batch),
    ("The size of the values in GB is: ", size_of_values/1024),
    ("The size of the values in GB/batch is: ", size_of_values/1024/num_batch),
)
for _caption, _amount in _size_report:
    print(_caption, _amount)


The number of combinations is:  1313400
The number of values after converting to fingerprints is:  2689843200
The number of batches is:  14.0
The size of the values in MB is:  10260.9375
The size of the values in MB/batch is:  732.9241071428571
The size of the values in GB is:  10.02044677734375
The size of the values in GB/batch is:  0.7157461983816964


In [34]:
from sklearn.model_selection import train_test_split

# One seed drives both the random subsampling and the train/test split, so a
# fresh "Restart & Run All" regenerates exactly the same CSV files.
SEED = 42

num_points_test = 200
num_points_train = 200
num_points = num_points_test + num_points_train

# The full dataset contains only ~1.25% positives (see the ratio printed when
# the data was loaded); the small sets are deliberately balanced 50/50.
ratio_drd2_over_num_points = 0.5

num_with_drd2_to_select = int(num_points * ratio_drd2_over_num_points)
num_without_drd2_to_select = num_points - num_with_drd2_to_select

# `labels` is rebound to the small test-set labels by a later cell. Re-running
# this cell afterwards would silently index `smiles` with the wrong labels, so
# fail loudly instead.
assert len(labels) == len(smiles), (
    "`labels` and `smiles` are out of sync - re-run the dataset loading cell first"
)

# Positions of each class in the full dataset (vectorised; returns index arrays).
indices_without_drd2 = np.flatnonzero(labels == 0)
indices_with_drd2 = np.flatnonzero(labels == 1)

# Draw the requested number of compounds from each class without replacement.
rng = default_rng(SEED)
selected_indices_with_drd2 = rng.choice(indices_with_drd2, num_with_drd2_to_select, replace=False)
selected_indices_without_drd2 = rng.choice(indices_without_drd2, num_without_drd2_to_select, replace=False)

# Negatives first, then positives; `chosen_labels` below mirrors this order.
chosen_indices = np.concatenate((selected_indices_without_drd2, selected_indices_with_drd2))

# get the smiles and labels of the chosen indices
chosen_smiles = smiles[chosen_indices]
chosen_labels = np.repeat([0, 1], [num_without_drd2_to_select, num_with_drd2_to_select])

# Stratified train/test split. An absolute test size is passed so the split
# sizes are exact rather than derived from a rounded float ratio.
chosen_smiles_train, chosen_smiles_test, chosen_labels_train, chosen_labels_test = train_test_split(
    chosen_smiles,
    chosen_labels,
    test_size=num_points_test,
    random_state=SEED,
    stratify=chosen_labels,
)

drd2_final_train_small = pd.DataFrame({"smiles": chosen_smiles_train, "label": chosen_labels_train})
drd2_final_test_small = pd.DataFrame({"smiles": chosen_smiles_test, "label": chosen_labels_test})

# Sanity checking: split sizes must match the request, and the class counts
# are printed for visual confirmation of the balance.
assert len(drd2_final_train_small) == num_points_train
assert len(drd2_final_test_small) == num_points_test

num_with_drd2_train = int(np.count_nonzero(chosen_labels_train == 1))
num_without_drd2_train = int(np.count_nonzero(chosen_labels_train == 0))
print("The number of compounds with drd2 in the training set is: ", num_with_drd2_train)
print("The number of compounds without drd2 in the training set is: ", num_without_drd2_train)

num_with_drd2_test = int(np.count_nonzero(chosen_labels_test == 1))
num_without_drd2_test = int(np.count_nonzero(chosen_labels_test == 0))

print("The number of compounds with drd2 in the test set is: ", num_with_drd2_test)
print("The number of compounds without drd2 in the test set is: ", num_without_drd2_test)

# Persist both splits; a later cell reads the test split back from disk.
drd2_final_train_small.to_csv("data/drd2_final_train_small.csv", index=False)
drd2_final_test_small.to_csv("data/drd2_final_test_small.csv", index=False)

The number of compounds with drd2 in the training set is:  100
The number of compounds without drd2 in the training set is:  100
The number of compounds with drd2 in the test set is:  100
The number of compounds without drd2 in the test set is:  100


### Checking if the Oracle agrees with the labels

In [35]:
from helper import compute_fingerprints

# Read back the small test split that the dataset-construction cell saved.
drd2_final_test_small = pd.read_csv("data/drd2_final_test_small.csv")
smiles_list = drd2_final_test_small["smiles"].tolist()
print(smiles_list)
# NOTE: this rebinds the notebook-level `labels` (previously the labels of the
# full dataset) to the labels of the small test split.
labels = drd2_final_test_small["label"].to_numpy()

# Score every SMILES string with the TDC 'DRD2' oracle, one call per molecule.
oracle = Oracle(name='DRD2')
proba_molecules = list(map(oracle, smiles_list))

# Partition the oracle scores by the label stored in the CSV.
proba_molecules_0 = [score for score, y in zip(proba_molecules, labels) if y == 0]
proba_molecules_1 = [score for score, y in zip(proba_molecules, labels) if y == 1]

# Mean oracle score per label group.
average_proba_molecules_0 = np.mean(proba_molecules_0)
average_proba_molecules_1 = np.mean(proba_molecules_1)

print("The average probability of molecules with label 0 is: ", average_proba_molecules_0)
print("The average probability of molecules with label 1 is: ", average_proba_molecules_1)


Found local copy...


['CC1CC(C)N1C(=O)C1Cc2c(cc3c[nH]c4c3c2C=CC4)N(C)C1', 'CN(C)c1ccc(C=C2C(=O)OC3(CCCC3)OC2=O)cc1', 'COc1ccc(C(=O)C=Cc2ccccc2F)c(OC(=O)c2ccc([N+](=O)[O-])cc2)c1', 'O=C(CN1CCN(Cc2ccc(Cl)cc2)CC1)N1c2ccccc2CC12CCCC2', 'CCc1ccc(NC2=NC(=O)N(C3CCCCC3)C(=O)C2)cc1', 'O=C1c2ccccc2C(=O)N1CCCCCCCN1CC=C(c2c[nH]c3ccc(F)cc23)CC1', 'Clc1ccc(N2CCN(Cc3cn4cc(Br)ccc4n3)CC2)cc1Cl', 'Clc1ccc(-c2cc(C3CCN(CC4CCCCC4)CC3)[nH]n2)cc1', 'Oc1cc2c(cc1O)C1c3ccncc3CNC1CC2', 'C=CCN1CCC2c3cccc(OC)c3CCC21', 'Cc1nnc(NC(=O)CSc2nc3nc4ccccc4c-3n[nH]2)s1', 'Cc1cc([N+](=O)[O-])nn1CC(=O)NC(C)c1ccccc1', 'Cc1cccc(NC(=O)c2ccc(CN3CCc4ccccc4C3)cc2)c1C', 'Oc1c2c(c(O)n1C1CCCCC1)C1CC2C2ON=C(c3ccc(Cl)cc3)C12', 'CCCc1nnc(SCc2ccccc2Cl)[nH]1', 'CN(C)CCC=C1c2ccccc2Sc2ccc(Cl)cc21', 'Cc1ccc(CC(=O)NCC=S(O)c2ccc(C)cc2)cc1', 'Cn1ncc2c1-c1c(Cl)sc(Cl)c1CCC2', 'CCOC(=O)C(C)On1c(-c2cccc(Cl)c2)nc2ccc([N+](=O)[O-])cc21', 'CN(C(=O)COC(=O)c1cc(S(=O)(=O)N2CCCCC2)ccc1N1CCOCC1)C1C=S(O)(O)=CC1', 'Cc1c(S(=O)(=O)NCCCCN2CCC(c3noc4cc(F)ccc34)CC2)sc2ccc(F)cc12', 'O