In [52]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [53]:
import pandas as pd
import numpy as np
import torch
from torch import nn, optim
from rdkit import Chem
from rdkit.Chem import AllChem
from tdc import Oracle

In [54]:
# Load data and Oracle
oracle = Oracle(name='DRD2')
data = pd.read_csv("small_drd2_data.csv")
smiles_list = data['smiles'].tolist()
proba_molecules = [oracle(smiles) for smiles in smiles_list]
print(proba_molecules)

Found local copy...


[0.0005901676463998309, 0.001288978921112421, 0.00023099921452992108, 0.0014480634352253118, 0.00023229149413919086, 0.007226910417238079, 0.03177009486102104, 0.00018465196104625156, 0.0006181948403864203, 0.017767959705749174, 0.0001353658415040822, 7.584214925439865e-05, 0.00011932658307753439, 0.00019596215439119096, 5.0310012366721855e-05, 0.00153997550154151, 0.004863899555067754, 0.0013627517075909072, 0.00022636359589811632, 0.9999907104594737, 0.002364173240172854, 0.9999987724917501, 0.0004128064910287372, 4.515114620151249e-05, 0.9999899941967955, 3.439700377056164e-05, 0.005426522090583173, 0.00015732934690030053, 0.0003269993975438858, 0.0003363411034820678, 0.0055611952891646, 0.9890921212669289, 0.9999988552735308, 0.9905578204031097, 0.00045109684719108454, 0.0012732716327158269, 0.0005288131087850552, 0.0021161386976471714, 0.014029352175312171, 0.013244877011903917, 0.9999973055784004, 0.9999998974481366, 0.0032787785378776348, 0.0002981816185162872, 0.000420367794340

### Computing morgan fingerprints

In [55]:
def compute_fingerprints(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        return AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    else:
        return np.zeros((2048,), dtype=int)

features = torch.tensor([compute_fingerprints(smiles) for smiles in smiles_list])
print(features.shape)

torch.Size([240, 2048])


### Generate pairs and labels

In [56]:
def generate_pairs_labels(features, proba_molecules, smiles_list, num_sets = 100):
    n = len(features)
    features_1 = []
    features_2 = []
    features_3 = []
    labels_1_proba = []
    labels_1_rank = []
    labels_2_proba = []
    labels_2_rank = []
    labels_3_proba = []
    labels_3_rank = []
    smiles_1 = []
    smiles_2 = []
    smiles_3 = []

    # set seed
    np.random.seed(42) 
    
    count = 0
    # sample unrepeated num_sets number of pairs
    while count < num_sets:
        i = np.random.randint(0, n)
        j = np.random.randint(0, n)
        k = np.random.randint(0, n)
        if i != j and i != k and j != k:
            features_1.append(features[i])
            features_2.append(features[j])
            features_3.append(features[k])

            smiles_1.append(smiles_list[i])
            smiles_2.append(smiles_list[j])
            smiles_3.append(smiles_list[k])

            # convert to float
            proba_list = [proba_molecules[i], proba_molecules[j], proba_molecules[k]]
            proba_softmax = nn.Softmax(dim=0)(torch.tensor(proba_list))

            # Now we need to rank them
            # Rank 1 has lowest value, Rank 3 has highest value
            ranks = np.argsort(np.argsort(proba_softmax)) + 1

            labels_1_proba.append(proba_softmax[0])
            labels_1_rank.append(ranks[0])
            labels_2_proba.append(proba_softmax[1])
            labels_2_rank.append(ranks[1])
            labels_3_proba.append(proba_softmax[2])
            labels_3_rank.append(ranks[2])

            count += 1
        
    # Convert to numpy
    features_1 = torch.stack(features_1).float()
    features_2 = torch.stack(features_2).float()
    features_3 = torch.stack(features_3).float()
    labels_1_proba = torch.stack(labels_1_proba).float()
    labels_1_rank = torch.tensor(labels_1_rank)
    labels_2_proba = torch.stack(labels_2_proba).float()
    labels_2_rank = torch.tensor(labels_2_rank)
    labels_3_proba = torch.stack(labels_3_proba).float()
    labels_3_rank = torch.tensor(labels_3_rank)

    return features_1, features_2, features_3,\
        labels_1_proba, labels_1_rank, labels_2_proba, labels_2_rank, labels_3_proba, labels_3_rank,\
            smiles_1, smiles_2, smiles_3

features_1, features_2, features_3,\
        labels_1_proba, labels_1_rank, labels_2_proba, labels_2_rank, labels_3_proba, labels_3_rank,\
            smiles_1, smiles_2, smiles_3 =\
    generate_pairs_labels(features, proba_molecules, smiles_list, num_sets=len(smiles_list))

In [57]:
# Calculation for choosing 3 elements from 240
import math
def comb(n, k):
    return math.factorial(n) / (math.factorial(k) * math.factorial(n-k))
num_ways_125 = comb(250, 3)
num_ways_125

2573000.0

In [58]:
print(features_1.shape)
print(features_2.shape)
print(features_3.shape)

torch.Size([240, 2048])
torch.Size([240, 2048])
torch.Size([240, 2048])


In [59]:
from sklearn.model_selection import train_test_split

# Why is test_size very high here?
# This is because we are using active learning later
# Where the Human in the loop process would generate more better training data for the model
# So the amount of training data to train the initial model is not important, as the model should not be better than random guess

features_1_train, features_1_test,\
    features_2_train, features_2_test,\
    features_3_train, features_3_test,\
    labels_1_proba_train, labels_1_proba_test,\
    labels_1_rank_train, labels_1_rank_test,\
    labels_2_proba_train, labels_2_proba_test,\
    labels_2_rank_train, labels_2_rank_test,\
    labels_3_proba_train, labels_3_proba_test,\
    labels_3_rank_train, labels_3_rank_test,\
    smiles_1_train, smiles_1_test,\
    smiles_2_train, smiles_2_test,\
    smiles_3_train, smiles_3_test\
            = train_test_split(features_1, features_2, features_3, 
                                 labels_1_proba, labels_1_rank, labels_2_proba, labels_2_rank, labels_3_proba, labels_3_rank,
                                    smiles_1, smiles_2, smiles_3, test_size=0.9, random_state=42)

# Now we need to save them 

small_drd2_training_data = pd.DataFrame()
small_drd2_training_data['smiles_1'] = smiles_1_train
small_drd2_training_data['smiles_2'] = smiles_2_train
small_drd2_training_data['smiles_3'] = smiles_3_train
small_drd2_training_data['label_1_proba'] = labels_1_proba_train
small_drd2_training_data['label_2_proba'] = labels_2_proba_train
small_drd2_training_data['label_3_proba'] = labels_3_proba_train
small_drd2_training_data['label_1_rank'] = labels_1_rank_train
small_drd2_training_data['label_2_rank'] = labels_2_rank_train
small_drd2_training_data['label_3_rank'] = labels_3_rank_train

small_drd2_training_data.to_csv("small_drd2_training_data.csv", index=False)

small_drd2_testing_data = pd.DataFrame()
small_drd2_testing_data['smiles_1'] = smiles_1_test
small_drd2_testing_data['smiles_2'] = smiles_2_test
small_drd2_testing_data['smiles_3'] = smiles_3_test
small_drd2_testing_data['label_1_proba'] = labels_1_proba_test
small_drd2_testing_data['label_2_proba'] = labels_2_proba_test
small_drd2_testing_data['label_3_proba'] = labels_3_proba_test
small_drd2_testing_data['label_1_rank'] = labels_1_rank_test
small_drd2_testing_data['label_2_rank'] = labels_2_rank_test
small_drd2_testing_data['label_3_rank'] = labels_3_rank_test

small_drd2_testing_data.to_csv("small_drd2_testing_data.csv", index=False)

In [60]:
from rank_listnet import RankListNetModel
    
# Training the model
model = RankListNetModel(feature_dim=2048)

# When using Binary Cross-Entropy Loss (BCELoss) in neural networks, the input expected by the 
# loss function is a list of probabilities, not binary values (0 or 1)

# Define the loss function
criterion = nn.KLDivLoss(reduction='batchmean')

# Correct Usage of KLDivLoss
# Model Outputs: Should be log-probabilities.
# True Labels: Should be probabilities.

optimizer = optim.Adam(model.parameters(), lr=0.001)

# Create torch data loader

from torch.utils.data import TensorDataset, DataLoader

# Assuming X_train and y_train are numpy arrays, convert them to PyTorch tensors
features_1_train_tensor = torch.tensor(features_1_train).float()  
features_2_train_tensor = torch.tensor(features_2_train).float()
features_3_train_tensor = torch.tensor(features_3_train).float()
labels_1_proba_train_tensor = torch.tensor(labels_1_proba_train).float()
labels_2_proba_train_tensor = torch.tensor(labels_2_proba_train).float()
labels_3_proba_train_tensor = torch.tensor(labels_3_proba_train).float()

features_1_test_tensor = torch.tensor(features_1_test).float()
features_2_test_tensor = torch.tensor(features_2_test).float()
features_3_test_tensor = torch.tensor(features_3_test).float()
labels_1_proba_test_tensor = torch.tensor(labels_1_proba_test).float()
labels_2_proba_test_tensor = torch.tensor(labels_2_proba_test).float()
labels_3_proba_test_tensor = torch.tensor(labels_3_proba_test).float()

# Create a TensorDataset
train_dataset = TensorDataset(features_1_train_tensor, features_2_train_tensor, 
                              features_3_train_tensor, labels_1_proba_train_tensor,
                                labels_2_proba_train_tensor, labels_3_proba_train_tensor)
              
test_dataset = TensorDataset(features_1_test_tensor, features_2_test_tensor,
                                features_3_test_tensor, labels_1_proba_test_tensor,
                                    labels_2_proba_test_tensor, labels_3_proba_test_tensor)

# Create a DataLoader
batch_size = 64  # You can adjust the batch size as needed
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

def train(model, train_loader, epochs=1):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for features_1, features_2, features_3, labels_proba_1, labels_proba_2, labels_proba_3 in train_loader:
            optimizer.zero_grad()
            ranking_scores = model(features_1, features_2, features_3) # softmax scores
            true_label = torch.stack([labels_proba_1, labels_proba_2, labels_proba_3], dim=1)
            softmax_label = torch.softmax(true_label, dim=1) # true labels should also be softmax

            # Add a small epsilon to avoid log(0)
            epsilon = 1e-9
            ranking_scores = ranking_scores + epsilon

            # Taking the logs of the ranking scores
            log_ranking_scores = torch.log(ranking_scores)  
            
            # As we see softmax_label is not log
            loss = criterion(log_ranking_scores, softmax_label)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            
        print(f'Epoch {epoch+1}, Loss: {total_loss / len(features)}')

train(model, train_loader)

# save state dict
torch.save(model.state_dict(), "rank_listnet_model.pth")

print("Model trained and saved")

Epoch 1, Loss: 3.4451767957458895e-05
Model trained and saved


### Perform prediction on the testing dataset

In [61]:
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef

def compute_metrics(labels, predictions):
    num_classes = labels.shape[1]
    class_metrics = {}

    for i in range(0, num_classes):
        # Selecting each class: True for the current class, False for the others
        true_class = labels[:, i]
        pred_class = predictions[:, i]

        # Confusion matrix for the current class
        cm = confusion_matrix(true_class, pred_class)
        
        # Calculate each metric
        accuracy = accuracy_score(true_class, pred_class)
        precision = precision_score(true_class, pred_class, average="weighted")
        recall = recall_score(true_class, pred_class, average="weighted")
        f1 = f1_score(true_class, pred_class, average="weighted")
        mcc = matthews_corrcoef(true_class, pred_class)

        # Store metrics for the current class
        class_metrics[f'Rank {i+1}'] = {
            'Confusion Matrix': cm,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1,
            'MCC': mcc
        }

    return class_metrics

In [62]:
# Initialize the model
model = RankListNetModel(feature_dim=2048)

# Load the state dict
model.load_state_dict(torch.load("rank_listnet_model.pth"))

# Finally we perform prediction
model.eval()

prediction_rank_test = []

with torch.no_grad():
    for features_1, features_2, features_3, labels_proba_1, labels_proba_2, labels_proba_3 in test_loader:
        ranking_scores = model(features_1, features_2, features_3) # shape (batch_size, 3)
        
        # print(outputs.shape)
        true_rankings = torch.argsort(ranking_scores, dim=1) + 1
        
        #print(output_rank)
        prediction_rank_test.extend(true_rankings)

prediction_rank_test = torch.stack(prediction_rank_test).numpy()
labels_rank_test = np.stack([labels_1_rank_test, labels_2_rank_test, labels_3_rank_test]).T
# print(prediction_rank_test.shape)
# print(labels_rank_test.shape)
metrics = compute_metrics(labels_rank_test, prediction_rank_test)

for rank in metrics:
    print(f"Metrics for {rank}")
    print(metrics[rank])

Metrics for Rank 1
{'Confusion Matrix': array([[38, 17, 16],
       [23, 23, 20],
       [ 9, 30, 40]]), 'Accuracy': 0.4675925925925926, 'Precision': 0.4713311055416318, 'Recall': 0.4675925925925926, 'F1 Score': 0.469292715306718, 'MCC': 0.20039302718898558}
Metrics for Rank 2
{'Confusion Matrix': array([[21, 26, 24],
       [21, 25, 21],
       [32, 14, 32]]), 'Accuracy': 0.3611111111111111, 'Precision': 0.3626549251549252, 'Recall': 0.3611111111111111, 'F1 Score': 0.3618090589494, 'MCC': 0.03973061935191471}
Metrics for Rank 3
{'Confusion Matrix': array([[28, 38,  8],
       [35, 32, 16],
       [ 9, 11, 39]]), 'Accuracy': 0.4583333333333333, 'Precision': 0.4541282905480436, 'Recall': 0.4583333333333333, 'F1 Score': 0.4559959121374281, 'MCC': 0.18188926125547827}


#### The model performs weakly, which means that we can now use this model as a human component for HITL. Ideally, the initial model should be no better than random guess