In [3]:
# Enable IPython's autoreload extension in mode 2 so that edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [35]:
# load dependencies
import sys
import pickle
import os
import shutil
import json
import pandas as pd
import numpy as np
import rdkit.Chem as Chem
from rdkit.Chem import Crippen
from rdkit import DataStructs
from numpy.random import default_rng
import torch
from ast import literal_eval
from torch import nn, optim
from rdkit import Chem
from rdkit.Chem import AllChem
from tdc import Oracle
import subprocess

from rdkit.Chem import RDConfig
import os
import sys
sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
# now you can import sascore!
import sascorer

In [5]:
# Show the working directory: the relative paths used in the cells below
# (e.g. the output_* and results_* folders) are resolved against it.
print(os.getcwd())

/home/springnuance/reinvent-hitl/Base-Code-Binh


In [45]:
from training_Bradley_Terry_model.bradley_terry import BradleyTerryModel
from training_Rank_ListNet_model.rank_listnet import RankListNetModel
from training_Score_Regression_model.score_regression import ScoreRegressionModel
from scripts.helper import load_drd2_dataset, write_REINVENT_config, change_config_json, \
                    read_scaffold_result, load_feedback_model, smiles_human_score, \
                    compute_fingerprints, retrain_feedback_model,\
                    create_drd2_dataset, combine_drd2_dataset, save_drd2_dataset
                        
from sklearn.metrics import roc_curve, auc, precision_recall_fscore_support, matthews_corrcoef

def predict_proba_from_model_fps(feedback_type, feedback_model, features):
    """
    Run a feedback model on fingerprint features and return its output as a NumPy array.

    Args:
        feedback_type: kind of feedback model; accepted for call-site symmetry,
            it is not read inside this function.
        feedback_model: object exposing ``predict_proba(tensor)`` that returns a torch tensor.
        features: array-like of fingerprint features; converted to a float32 tensor.

    Returns:
        The value returned by ``feedback_model.predict_proba`` moved to the CPU,
        detached from the autograd graph and converted to ``numpy.ndarray``.
    """
    # float32 is enough here; the workload is not computationally heavy yet
    fps_tensor = torch.tensor(features, dtype=torch.float32)
    proba_tensor = feedback_model.predict_proba(fps_tensor)
    return proba_tensor.cpu().detach().numpy()


def check_create(path):
    """
    Make sure the directory ``path`` exists, creating it (and any parents) if needed.

    Args:
        path: directory path to create.

    Raises:
        FileExistsError: if ``path`` already exists but is not a directory.
            (A separate "exists?" check would silently accept that case and
            would also race with other processes creating the same folder.)
    """
    # exist_ok=True makes the call idempotent without a check-then-create race
    os.makedirs(path, exist_ok=True)
    
def _compute_threshold_metrics(label_test, predicted_scores):
    """
    Sweep 101 evenly spaced thresholds in [0, 1] and collect classification metrics.

    Returns a dict with per-threshold lists ('thresholds', 'TP', 'TN', 'FP', 'FN',
    'accuracy', 'precision', 'recall', 'F1', 'MCC') plus the ROC curve
    ('fpr', 'tpr') and its area ('roc_auc').
    """
    predicted_scores = np.asarray(predicted_scores)
    # NOTE(review): the output shape of predict_proba is not visible here. If it is a
    # column vector (N, 1), comparing it with the (N,) labels would broadcast to (N, N)
    # and corrupt TP/TN/FP/FN, so that case is flattened; 1-D scores pass through untouched.
    if predicted_scores.ndim == 2 and predicted_scores.shape[1] == 1:
        predicted_scores = predicted_scores.ravel()

    metrics_model = {
        'thresholds': [],
        'TP': [], 'TN': [], 'FP': [], 'FN': [],
        'accuracy': [], 'precision': [], 'recall': [], 'F1': [], 'MCC': [],
    }

    for threshold in np.linspace(0, 1, 101):
        predicted_labels = (predicted_scores > threshold).astype(int)
        TP = np.sum((predicted_labels == 1) & (label_test == 1))
        TN = np.sum((predicted_labels == 0) & (label_test == 0))
        FP = np.sum((predicted_labels == 1) & (label_test == 0))
        FN = np.sum((predicted_labels == 0) & (label_test == 1))

        accuracy = (TP + TN) / len(label_test)
        # zero_division=0 yields the same value as the default but without emitting a
        # warning at thresholds where no sample is predicted positive.
        precision, recall, f1, _ = precision_recall_fscore_support(
            label_test, predicted_labels, average='binary', zero_division=0)
        mcc = matthews_corrcoef(label_test, predicted_labels)

        metrics_model['thresholds'].append(threshold)
        metrics_model['TP'].append(TP)
        metrics_model['TN'].append(TN)
        metrics_model['FP'].append(FP)
        metrics_model['FN'].append(FN)
        metrics_model['accuracy'].append(accuracy)
        metrics_model['precision'].append(precision)
        metrics_model['recall'].append(recall)
        metrics_model['F1'].append(f1)
        metrics_model['MCC'].append(mcc)

    # Threshold-independent summary: ROC curve and AUC
    fpr, tpr, _ = roc_curve(label_test, predicted_scores, pos_label=1)
    metrics_model['fpr'] = fpr
    metrics_model['tpr'] = tpr
    metrics_model['roc_auc'] = auc(fpr, tpr)
    return metrics_model


def _load_round_smiles(output_dir, reinvent_round_name):
    """Return the 'SMILES' column of a round's scaffold memory, or None (after printing a notice) if the file is missing."""
    scaffold_memory_path = f"{output_dir}/{reinvent_round_name}/results/scaffold_memory.csv"
    if not os.path.exists(scaffold_memory_path):
        print(f"Scaffold memory file {scaffold_memory_path} does not exist.")
        return None
    return pd.read_csv(scaffold_memory_path)['SMILES']


def _score_valid_smiles(smiles_list, score_fn):
    """Apply ``score_fn(smiles, mol)`` to every parsable SMILES; unparsable ones get a None placeholder."""
    scores = []
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        scores.append(score_fn(smiles, mol) if mol is not None else None)
    return scores


def _collect_round_scores(output_dir, num_rounds, score_smiles_list):
    """Build {REINVENT_round_k: np.array(scores)} for every round whose scaffold memory file exists."""
    results = {}
    for REINVENT_round in range(1, num_rounds + 2):
        reinvent_round_name = f"REINVENT_round_{REINVENT_round}"
        smiles_list = _load_round_smiles(output_dir, reinvent_round_name)
        if smiles_list is not None:
            results[reinvent_round_name] = np.array(score_smiles_list(smiles_list))
    return results


def evaluate_results(
        output_dir, benchmark, # Choose whether to evaluate the results of the benchmark
        feedback_type, # scoring, comparing, ranking
        initial_feedback_model_path, # path to the initial feedback model
        final_testing_dataset_path, # path to the final testing dataset
        model_pretrained_name, # File name of the feedback model stored in each HITL iteration folder
        num_rounds, # number of rounds, corresponding to R in the paper
        num_iters, # number of iterations of showing molecules to the human for feedback at each round, corresponding to T in the paper
):
    """
    Evaluate one experiment folder and return five dictionaries of benchmark results.

    Args:
        output_dir: experiment folder containing ``REINVENT_round_<k>`` sub-folders.
        benchmark: dict of flags with keys "metrics", "drd2_proba", "novelty_score",
            "sa_score" and "logP_score"; a disabled benchmark yields an empty dict.
        feedback_type: forwarded to ``load_feedback_model`` (scoring, comparing, ranking).
        initial_feedback_model_path: path of the feedback model evaluated as REINVENT_round_1.
        final_testing_dataset_path: CSV with columns 'smiles' and 'label'.
        model_pretrained_name: file name of the feedback model looked up under
            ``<output_dir>/REINVENT_round_<k>/HITL_iteration_<num_iters>/``.
        num_rounds: number of rounds (R in the paper).
        num_iters: number of HITL iterations per round (T in the paper); only the model
            saved after the last iteration of each round is evaluated.

    Returns:
        Tuple ``(metrics, drd2_proba, novelty_score, sa_score, logP_score)``:

        benchmark_1: ML model metrics (metrics dict)
            - REINVENT_round_1 (initial feedback model)
              - thresholds, TP, TN, FP, FN, accuracy, precision, recall, F1, MCC
              - fpr, tpr, roc_auc
            - REINVENT_round_2 (model saved at the end of round 1)
            - ...

        benchmark_2: DRD2 probability distribution of REINVENT generated molecules (drd2_proba dict)
            - REINVENT_round_1
              - np.array of probabilities
            - ...

        benchmark_3: Novelty of REINVENT generated molecules (novelty dict)
            - REINVENT_round_1
              - np.array of novelty scores (1 - max Tanimoto similarity to the test set)
            - ...

        benchmark_4: Synthetic accessibility of REINVENT generated molecules (sa dict)
            - REINVENT_round_1
              - np.array of synthetic accessibility scores
            - ...

        benchmark_5: logP of REINVENT generated molecules (logP dict)
            - REINVENT_round_1
              - np.array of logP scores
            - ...

        In benchmarks 3-5 an unparsable SMILES is recorded as ``None``. Rounds whose
        model or scaffold memory file is missing are reported on stdout and skipped.
    """

    final_testing_dataset = pd.read_csv(final_testing_dataset_path)

    print("Loading final testing dataset successfully")

    smiles_test = final_testing_dataset['smiles'].to_numpy()
    label_test = final_testing_dataset['label'].to_numpy()

    # Test-set fingerprints are only consumed by the metrics and novelty benchmarks,
    # so skip this (potentially slow) step when neither is requested.
    features_list = []
    if benchmark["metrics"] or benchmark["novelty_score"]:
        features_list = [compute_fingerprints(smiles) for smiles in smiles_test]

    ###########################
    # BENCHMARK 1: ML METRICS #
    ###########################

    metrics = {}

    if benchmark["metrics"]:
        features = np.array(features_list, dtype=np.float32)

        # REINVENT_round_1 is scored with the initial (pre-HITL) feedback model
        initial_feedback_model = load_feedback_model(feedback_type=feedback_type,
                                                    feedback_model_path=initial_feedback_model_path)
        predicted_scores = predict_proba_from_model_fps(feedback_type, initial_feedback_model, features)
        metrics["REINVENT_round_1"] = _compute_threshold_metrics(label_test, predicted_scores)

        ########################
        # HITL MODEL BENCHMARK #
        ########################

        # Round k (k >= 2) is scored with the model saved after the last HITL
        # iteration of round k - 1.
        HITL_iteration = num_iters
        for REINVENT_round in range(2, num_rounds + 2):
            reinvent_round_name = f"REINVENT_round_{REINVENT_round}"
            feedback_model_path = f"{output_dir}/REINVENT_round_{REINVENT_round - 1}/HITL_iteration_{HITL_iteration}/{model_pretrained_name}"

            if not os.path.exists(feedback_model_path):
                print(f"Feedback model {feedback_model_path} does not exist.")
                continue

            feedback_model = load_feedback_model(feedback_type=feedback_type, feedback_model_path=feedback_model_path)
            predicted_scores = predict_proba_from_model_fps(feedback_type, feedback_model, features)
            metrics[reinvent_round_name] = _compute_threshold_metrics(label_test, predicted_scores)

    ##############################################
    # BENCHMARK 2: DRD2 PROBABILITY DISTRIBUTION #
    ##############################################

    drd2_proba = {}

    if benchmark["drd2_proba"]:
        # The oracle does not depend on the round, so build it once
        oracle = Oracle(name='DRD2')
        drd2_proba = _collect_round_scores(
            output_dir, num_rounds,
            lambda smiles_list: [oracle(smiles) for smiles in smiles_list],
        )

    ##############################
    # BENCHMARK 3: NOVELTY SCORE #
    ##############################

    novelty_score = {}

    if benchmark["novelty_score"]:

        def _novelty(smiles, mol):
            # Reference fingerprints are the testing features.
            # NOTE(review): assumes compute_fingerprints returns RDKit bit vectors
            # accepted by BulkTanimotoSimilarity - TODO confirm.
            fps = compute_fingerprints(smiles)
            similarities = DataStructs.BulkTanimotoSimilarity(fps, features_list)
            return 1 - max(similarities)

        novelty_score = _collect_round_scores(
            output_dir, num_rounds,
            lambda smiles_list: _score_valid_smiles(smiles_list, _novelty),
        )

    ##############################################
    # BENCHMARK 4: SYNTHETIC ACCESSIBILITY SCORE #
    ##############################################

    sa_score = {}

    if benchmark["sa_score"]:
        sa_score = _collect_round_scores(
            output_dir, num_rounds,
            lambda smiles_list: _score_valid_smiles(
                smiles_list, lambda smiles, mol: sascorer.calculateScore(mol)),
        )

    ###########################
    # BENCHMARK 5: LOGP SCORE #
    ###########################

    logP_score = {}

    if benchmark["logP_score"]:
        logP_score = _collect_round_scores(
            output_dir, num_rounds,
            lambda smiles_list: _score_valid_smiles(
                smiles_list, lambda smiles, mol: Crippen.MolLogP(mol)),
        )

    return metrics, drd2_proba, novelty_score, sa_score, logP_score

### Running score regression model

In [46]:
feedback_type = "scoring"

# Feedback type "scoring":
# given a molecule, what is the probability that the molecule is active regarding DRD2?

num_rounds = 3 # number of rounds, corresponding to R in the paper
num_iters = 5 # number of iterations of showing molecules to the human for feedback at each round
num_queries = 20 # number of molecules, pairs or a set of molecules, depending on the task,
                 # shown to the simulated chemist at each HITL_iteration

# Resolve the inputs against the working directory (the project root) instead of a
# machine-specific absolute path, so the cell also runs from another checkout.
project_dir = os.getcwd()
initial_feedback_model_path = os.path.join(project_dir, "training_Score_Regression_model", "score_regression_model.pth")
final_testing_dataset_path = os.path.join(project_dir, "data", "drd2_final_test_large.csv")

model_pretrained_name = "score_regression_model.pth"

# np.save does not create missing parent folders, so make sure the target exists
results_dir = "results_score_regression"
check_create(results_dir)

# Which benchmarks to compute; the same selection applies to every configuration below
benchmark = {
    "metrics": True,
    "drd2_proba": False,
    "novelty_score": False,
    "sa_score": False,
    "logP_score": False,
}

for acquisition in ["random", "uncertainty", "greedy"]:
    for sigma_noise in [0.0, 0.1]:
        output_dir = f"output_score_regression/R{num_rounds}_T{num_iters}_Q{num_queries}_acq_{acquisition}_noise_{sigma_noise}"

        metrics, drd2_proba, novelty_score, sa_score, logP_score = evaluate_results(
                output_dir, benchmark,
                feedback_type, # scoring, comparing, ranking
                initial_feedback_model_path,
                final_testing_dataset_path, # path to the final testing dataset
                model_pretrained_name, # file name of the feedback model in each HITL iteration folder
                num_rounds, # number of rounds, corresponding to R in the paper
                num_iters, # number of iterations of showing molecules to the human for feedback at each round, corresponding to T in the paper
        )

        # Persist only the benchmarks that were actually computed
        results_by_benchmark = {
            "metrics": metrics,
            "drd2_proba": drd2_proba,
            "novelty_score": novelty_score,
            "sa_score": sa_score,
            "logP_score": logP_score,
        }
        for benchmark_name, benchmark_result in results_by_benchmark.items():
            if benchmark[benchmark_name]:
                np.save(f"{results_dir}/acq_{acquisition}_noise_{sigma_noise}_{benchmark_name}.npy", benchmark_result)

Loading final testing dataset successfully
Loading Score Regression model from /home/springnuance/reinvent-hitl/Base-Code-Binh/training_Score_Regression_model/score_regression_model.pth
Loading Score Regression model from output_score_regression/R3_T5_Q20_acq_random_noise_0.0/REINVENT_round_1/HITL_iteration_5/score_regression_model.pth
Loading Score Regression model from output_score_regression/R3_T5_Q20_acq_random_noise_0.0/REINVENT_round_2/HITL_iteration_5/score_regression_model.pth
Loading Score Regression model from output_score_regression/R3_T5_Q20_acq_random_noise_0.0/REINVENT_round_3/HITL_iteration_5/score_regression_model.pth
Loading final testing dataset successfully
Loading Score Regression model from /home/springnuance/reinvent-hitl/Base-Code-Binh/training_Score_Regression_model/score_regression_model.pth
Loading Score Regression model from output_score_regression/R3_T5_Q20_acq_random_noise_0.1/REINVENT_round_1/HITL_iteration_5/score_regression_model.pth
Loading Score Regres

### Running Bradley Terry model

In [47]:
feedback_type = "comparing" # scoring, comparing, ranking

# Feedback type "comparing":
# given two molecules, what is the probability that the first molecule is more active
# than the second molecule regarding DRD2?

num_rounds = 3 # number of rounds, corresponding to R in the paper
num_iters = 5 # number of iterations of showing molecules to the human for feedback at each round
num_queries = 5 # number of molecules, pairs or a set of molecules, depending on the task,
                # shown to the simulated chemist at each HITL_iteration

# Resolve the inputs against the working directory (the project root) instead of a
# machine-specific absolute path, so the cell also runs from another checkout.
project_dir = os.getcwd()
initial_feedback_model_path = os.path.join(project_dir, "training_Bradley_Terry_model", "bradley_terry_model.pth")
final_testing_dataset_path = os.path.join(project_dir, "data", "drd2_final_test_large.csv")

model_pretrained_name = "bradley_terry_model.pth"

# np.save does not create missing parent folders, so make sure the target exists
results_dir = "results_bradley_terry"
check_create(results_dir)

# Which benchmarks to compute; the same selection applies to every configuration below
benchmark = {
    "metrics": True,
    "drd2_proba": False,
    "novelty_score": False,
    "sa_score": False,
    "logP_score": False,
}

for acquisition in ["random", "uncertainty", "greedy"]:
    for sigma_noise in [0.0, 0.1]:
        output_dir = f"output_bradley_terry/R{num_rounds}_T{num_iters}_Q{num_queries}_acq_{acquisition}_noise_{sigma_noise}"

        metrics, drd2_proba, novelty_score, sa_score, logP_score = evaluate_results(
                output_dir, benchmark,
                feedback_type, # scoring, comparing, ranking
                initial_feedback_model_path,
                final_testing_dataset_path, # path to the final testing dataset
                model_pretrained_name, # file name of the feedback model in each HITL iteration folder
                num_rounds, # number of rounds, corresponding to R in the paper
                num_iters, # number of iterations of showing molecules to the human for feedback at each round, corresponding to T in the paper
        )

        # Persist only the benchmarks that were actually computed
        results_by_benchmark = {
            "metrics": metrics,
            "drd2_proba": drd2_proba,
            "novelty_score": novelty_score,
            "sa_score": sa_score,
            "logP_score": logP_score,
        }
        for benchmark_name, benchmark_result in results_by_benchmark.items():
            if benchmark[benchmark_name]:
                np.save(f"{results_dir}/acq_{acquisition}_noise_{sigma_noise}_{benchmark_name}.npy", benchmark_result)

Loading final testing dataset successfully
Loading Bradley Terry model successfully from /home/springnuance/reinvent-hitl/Base-Code-Binh/training_Bradley_Terry_model/bradley_terry_model.pth
Loading Bradley Terry model successfully from output_bradley_terry/R3_T5_Q5_acq_random_noise_0.0/REINVENT_round_1/HITL_iteration_5/bradley_terry_model.pth
Loading Bradley Terry model successfully from output_bradley_terry/R3_T5_Q5_acq_random_noise_0.0/REINVENT_round_2/HITL_iteration_5/bradley_terry_model.pth
Loading Bradley Terry model successfully from output_bradley_terry/R3_T5_Q5_acq_random_noise_0.0/REINVENT_round_3/HITL_iteration_5/bradley_terry_model.pth
Loading final testing dataset successfully
Loading Bradley Terry model successfully from /home/springnuance/reinvent-hitl/Base-Code-Binh/training_Bradley_Terry_model/bradley_terry_model.pth
Loading Bradley Terry model successfully from output_bradley_terry/R3_T5_Q5_acq_random_noise_0.1/REINVENT_round_1/HITL_iteration_5/bradley_terry_model.pth


### Running Rank ListNet model

In [49]:
feedback_type = "ranking" # scoring, comparing, ranking

# Feedback type "ranking":
# given N molecules, what is the order of preference of these molecules regarding DRD2?

num_rounds = 3 # number of rounds, corresponding to R in the paper
num_iters = 5 # number of iterations of showing molecules to the human for feedback at each round
num_queries = 6 # number of molecules, pairs or a set of molecules, depending on the task,
                # shown to the simulated chemist at each HITL_iteration

# Resolve the inputs against the working directory (the project root) instead of a
# machine-specific absolute path, so the cell also runs from another checkout.
project_dir = os.getcwd()
initial_feedback_model_path = os.path.join(project_dir, "training_Rank_ListNet_model", "rank_listnet_model.pth")
final_testing_dataset_path = os.path.join(project_dir, "data", "drd2_final_test_large.csv")

model_pretrained_name = "rank_listnet_model.pth"

# np.save does not create missing parent folders, so make sure the target exists
results_dir = "results_rank_listnet"
check_create(results_dir)

# Which benchmarks to compute; the same selection applies to every configuration below
benchmark = {
    "metrics": True,
    "drd2_proba": False,
    "novelty_score": False,
    "sa_score": False,
    "logP_score": False,
}

for acquisition in ["random", "uncertainty", "greedy"]:
    for sigma_noise in [0.0, 0.1]:
        output_dir = f"output_rank_listnet/R{num_rounds}_T{num_iters}_Q{num_queries}_acq_{acquisition}_noise_{sigma_noise}"

        metrics, drd2_proba, novelty_score, sa_score, logP_score = evaluate_results(
                output_dir, benchmark,
                feedback_type, # scoring, comparing, ranking
                initial_feedback_model_path,
                final_testing_dataset_path, # path to the final testing dataset
                model_pretrained_name, # file name of the feedback model in each HITL iteration folder
                num_rounds, # number of rounds, corresponding to R in the paper
                num_iters, # number of iterations of showing molecules to the human for feedback at each round, corresponding to T in the paper
        )

        # Persist only the benchmarks that were actually computed
        results_by_benchmark = {
            "metrics": metrics,
            "drd2_proba": drd2_proba,
            "novelty_score": novelty_score,
            "sa_score": sa_score,
            "logP_score": logP_score,
        }
        for benchmark_name, benchmark_result in results_by_benchmark.items():
            if benchmark[benchmark_name]:
                np.save(f"{results_dir}/acq_{acquisition}_noise_{sigma_noise}_{benchmark_name}.npy", benchmark_result)

Loading final testing dataset successfully
Loading Rank ListNet model successfully from /home/springnuance/reinvent-hitl/Base-Code-Binh/training_Rank_ListNet_model/rank_listnet_model.pth
Loading Rank ListNet model successfully from output_rank_listnet/R3_T5_Q6_acq_random_noise_0.0/REINVENT_round_1/HITL_iteration_5/rank_listnet_model.pth
Loading Rank ListNet model successfully from output_rank_listnet/R3_T5_Q6_acq_random_noise_0.0/REINVENT_round_2/HITL_iteration_5/rank_listnet_model.pth
Loading Rank ListNet model successfully from output_rank_listnet/R3_T5_Q6_acq_random_noise_0.0/REINVENT_round_3/HITL_iteration_5/rank_listnet_model.pth
Loading final testing dataset successfully
Loading Rank ListNet model successfully from /home/springnuance/reinvent-hitl/Base-Code-Binh/training_Rank_ListNet_model/rank_listnet_model.pth
Loading Rank ListNet model successfully from output_rank_listnet/R3_T5_Q6_acq_random_noise_0.1/REINVENT_round_1/HITL_iteration_5/rank_listnet_model.pth
Loading Rank ListN