In [9]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
import pandas as pd
import numpy as np
import torch
from torch import nn, optim
from rdkit import Chem
from rdkit.Chem import AllChem
from tdc import Oracle

In [11]:
# Load data and Oracle
oracle = Oracle(name='DRD2')
data = pd.read_csv("small_drd2_data.csv")
smiles_list = data['smiles'].tolist()
proba_molecules = [oracle(smiles) for smiles in smiles_list]
print(proba_molecules)

Found local copy...


[0.0005901676463998309, 0.001288978921112421, 0.00023099921452992108, 0.0014480634352253118, 0.00023229149413919086, 0.007226910417238079, 0.03177009486102104, 0.00018465196104625156, 0.0006181948403864203, 0.017767959705749174, 0.0001353658415040822, 7.584214925439865e-05, 0.00011932658307753439, 0.00019596215439119096, 5.0310012366721855e-05, 0.00153997550154151, 0.004863899555067754, 0.0013627517075909072, 0.00022636359589811632, 0.9999907104594737, 0.002364173240172854, 0.9999987724917501, 0.0004128064910287372, 4.515114620151249e-05, 0.9999899941967955, 3.439700377056164e-05, 0.005426522090583173, 0.00015732934690030053, 0.0003269993975438858, 0.0003363411034820678, 0.0055611952891646, 0.9890921212669289, 0.9999988552735308, 0.9905578204031097, 0.00045109684719108454, 0.0012732716327158269, 0.0005288131087850552, 0.0021161386976471714, 0.014029352175312171, 0.013244877011903917, 0.9999973055784004, 0.9999998974481366, 0.0032787785378776348, 0.0002981816185162872, 0.000420367794340

### Computing morgan fingerprints

In [12]:
def compute_fingerprints(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        return AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    else:
        return np.zeros((2048,), dtype=int)

features = torch.tensor([compute_fingerprints(smiles) for smiles in smiles_list])
labels_proba = torch.tensor(proba_molecules)
labels_binary = torch.tensor([1 if proba > 0.5 else 0 for proba in proba_molecules])

In [13]:
from sklearn.model_selection import train_test_split

# Why is test_size very high here?
# This is because we are using active learning later
# Where the Human in the loop process would generate more better training data for the model
# So the amount of training data to train the initial model is not important, as the model should not be better than random guess

features_train, features_test,\
    labels_proba_train, labels_proba_test,\
    labels_binary_train, labels_binary_test,\
    smiles_train, smiles_test =\
    train_test_split(features, labels_proba, labels_binary, smiles_list, test_size=0.9, random_state=42)


# Now we need to save them 

small_drd2_training_data = pd.DataFrame()
small_drd2_training_data['smiles'] = smiles_train
small_drd2_training_data['label_proba'] = labels_proba_train
small_drd2_training_data['label_binary'] = labels_binary_train

small_drd2_training_data.to_csv("small_drd2_training_data.csv", index=False)

small_drd2_testing_data = pd.DataFrame()
small_drd2_testing_data['smiles'] = smiles_test
small_drd2_testing_data['label_proba'] = labels_proba_test
small_drd2_testing_data['label_binary'] = labels_binary_test

small_drd2_testing_data.to_csv("small_drd2_testing_data.csv", index=False)

In [14]:
from score_regression import ScoreRegressionModel
    
# Training the model
model = ScoreRegressionModel(feature_dim=2048)

# When using Binary Cross-Entropy Loss (BCELoss) in neural networks, the input expected by the 
# loss function is a list of probabilities, not binary values (0 or 1)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Create torch data loader

from torch.utils.data import TensorDataset, DataLoader

# Assuming X_train and y_train are numpy arrays, convert them to PyTorch tensors
features_train_tensor = torch.tensor(features_train).float()  # Ensure dtype is float32 for features
labels_proba_train_tensor = torch.tensor(labels_proba_train).float()  # Ensure dtype is float32 if regression, or long if classification
features_test_tensor = torch.tensor(features_test).float()  # Ensure dtype is float32 for features
labels_proba_test_tensor = torch.tensor(labels_proba_test).float()  # Ensure dtype is float32 if regression, or long if classification

# Create a TensorDataset
train_dataset = TensorDataset(features_train_tensor, labels_proba_train_tensor)
test_dataset = TensorDataset(features_test_tensor, labels_proba_test_tensor)

# Create a DataLoader
batch_size = 16  # You can adjust the batch size as needed
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

def train(model, train_loader, epochs=1):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for features, labels_proba in train_loader:
            optimizer.zero_grad()
            output = model(features)
            loss = criterion(output, labels_proba.unsqueeze(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}, Loss: {total_loss / len(features)}')

train(model, train_loader)

# save state dict
torch.save(model.state_dict(), "score_regression_model.pth")

print("Model trained and saved")


Epoch 1, Loss: 0.17033924162387848
Model trained and saved


### Perform prediction on the testing dataset

In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef

def compute_metrics(labels, predictions):
    """
    Compute classification metrics: accuracy, precision, recall, F1 score, and MCC.
    
    Args:
    labels (list[int]): True binary labels.
    predictions (list[int]): Predicted binary labels.

    Returns:
    dict: A dictionary containing the computed metrics.
    """
    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions)
    recall = recall_score(labels, predictions)
    f1 = f1_score(labels, predictions)
    mcc = matthews_corrcoef(labels, predictions)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'F1': f1,
        'MCC': mcc
    }

In [16]:
# Initialize the model
model = ScoreRegressionModel(feature_dim=2048)
# Load the state dict
model.load_state_dict(torch.load("score_regression_model.pth"))

# Finally we perform prediction
model.eval()

prediction_binary_test = []
with torch.no_grad():
    for features, labels in test_loader:
        outputs = model(features)
        prediction_binary_test.extend(outputs.squeeze().tolist())

prediction_binary_test = (torch.tensor(prediction_binary_test) > 0.5).int().tolist()

metrics = compute_metrics(labels_binary_test, prediction_binary_test)

print("Accuracy:", metrics['accuracy'])
print("Precision:", metrics['precision'])
print("Recall:", metrics['recall'])
print("F1:", metrics['F1'])
print("MCC:", metrics['MCC'])

Accuracy: 0.8287037037037037
Precision: 1.0
Recall: 0.38333333333333336
F1: 0.5542168674698795
MCC: 0.5566368291673596


#### The model performs weakly, which means that we can now use this model as a human component for HITL. Ideally, the initial model should be no better than random guess