In [15]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


##### Please run this code with the kernel reinvent.v3.2

In [16]:
# load dependencies
import sys
import pickle
import os
import shutil
import json
import pandas as pd
import numpy as np
import rdkit.Chem as Chem
from numpy.random import default_rng
import torch
from ast import literal_eval
from torch import nn, optim
from rdkit import Chem
from rdkit.Chem import AllChem
from tdc import Oracle
import subprocess

from utils import fingerprints_from_mol
from scripts.write_config_bradley_terry import write_REINVENT_config_bradley_terry
from models.RandomForest import RandomForestReg, RandomForestClf
from scripts.acquisition import select_query

### First, we need to install the custom reinvent scoring package to support the Bradley-Terry model

In [17]:
# print python version
import sys

# Print Python version
print(f"Python version: {sys.version}")

Python version: 3.7.6 | packaged by conda-forge | (default, Jun  1 2020, 18:57:50) 
[GCC 7.5.0]


In [18]:
! pip show reinvent_scoring

Name: reinvent-scoring
Version: 0.0.73
Summary: Scoring functions for Reinvent
Home-page: https://github.com/MolecularAI/reinvent-scoring.git
Author: MolecularAI
Author-email: patronov@gmail.com
License: UNKNOWN
Location: /home/springnuance/reinvent-hitl/reinvent-scoring
Requires: 
Required-by: 


In [19]:
# ! conda install scikit-learn=0.21.3

##### If there already exists reinvent_scoring, we should uninstall it

In [20]:
! pip uninstall -y reinvent_scoring

Found existing installation: reinvent-scoring 0.0.73
Uninstalling reinvent-scoring-0.0.73:
  Successfully uninstalled reinvent-scoring-0.0.73


##### Now we install the custom reinvent scoring package
##### The flag -e means that the package is installed in editable mode, so that changes to the code will be immediately available without reinstalling the package. All package info is stored in the setup.py file.

In [21]:
! pip install -e "/home/springnuance/reinvent-hitl/reinvent-scoring"
! pip install -e "/home/springnuance/reinvent-hitl/reinvent-chemistry"
! pip install -e "/home/springnuance/reinvent-hitl/reinvent-models"

Obtaining file:///home/springnuance/reinvent-hitl/reinvent-scoring
  Preparing metadata (setup.py) ... [?25ldone
[?25hInstalling collected packages: reinvent-scoring
  Running setup.py develop for reinvent-scoring
Successfully installed reinvent-scoring-0.0.73
Obtaining file:///home/springnuance/reinvent-hitl/reinvent-chemistry
  Preparing metadata (setup.py) ... [?25ldone
[?25hInstalling collected packages: reinvent-chemistry
  Attempting uninstall: reinvent-chemistry
    Found existing installation: reinvent-chemistry 0.0.51
    Uninstalling reinvent-chemistry-0.0.51:
      Successfully uninstalled reinvent-chemistry-0.0.51
  Running setup.py develop for reinvent-chemistry
Successfully installed reinvent-chemistry-0.0.51
Obtaining file:///home/springnuance/reinvent-hitl/reinvent-models
  Preparing metadata (setup.py) ... [?25ldone
[?25hInstalling collected packages: reinvent-models
  Attempting uninstall: reinvent-models
    Found existing installation: reinvent-models 0.0.15rc

In [22]:
# ! conda install -y scikit-learn=0.21.3
! pip list | grep reinvent_scoring

In [23]:
# ! pip install PyTDC
# ! pip install chemprop
# ! pip install absl-py==1.1.0
# ! pip install astor==0.8.1
# ! pip install cloudpickle==2.1.0
# ! pip install dataclasses==0.6
# ! pip install deprecated==1.2.13
# ! pip install dm-tree==0.1.7
# ! pip install flatbuffers==1.12
# ! pip install future==0.18.2
# ! pip install gast==0.3.2
# ! pip install gpflow==2.3.1
# ! pip install grpcio==1.27.2
# ! pip install keras==2.7.0
# ! pip install keras-applications==1.0.8
# ! pip install libclang==14.0.1
# ! pip install markdown==3.2.1
# ! pip install multipledispatch==0.6.0
# ! pip install opt-einsum==3.2.0
# ! pip install protobuf==3.11.3
# ! pip install reinvent-chemistry==0.0.51
# ! pip install reinvent-models==0.0.15rc1
# ! pip install tabulate==0.8.9
# ! pip install tensorboard==2.9.1
# ! pip install tensorboard-data-server==0.6.1
# ! pip install tensorflow==2.7.0
# ! pip install tensorflow-estimator==2.7.0
# ! pip install tensorflow-io-gcs-filesystem==0.26.0
# ! pip install tensorflow-probability==0.17.0
# ! pip install werkzeug==2.1.2

In [24]:
def extract_ECFP_dataset(init_train_set_path, num_train_samples):
    """
        Load background training data used to pre-train the predictive model    
    """
    
    print("Loading D0")
    train_set = pd.read_csv(init_train_set_path)
    feature_cols = [f"bit{i}" for i in range(2048)]
    target_col = ["activity"]
    smiles_train = train_set["smiles"].values.reshape(-1)
    x_train = train_set[feature_cols].values
    y_train = train_set[target_col].values.reshape(-1)
    sample_weight = np.array([1. for i in range(len(x_train))])
    print("The feature matrix shape: ", x_train.shape)
    print("The labels shape: ", y_train.shape)

    train_sample = train_set[train_set["activity"] == 1].sample(num_train_samples).smiles.tolist()
    return x_train, y_train, sample_weight, smiles_train, train_sample

In [25]:
from training_Bradley_Terry_model.bradley_terry import BradleyTerryModel

def run_HITL_classify(
        seed, dirname, reinvent_dir, reinvent_env,
        feedback_type, # score, compare, rank
        init_model_path, 
        init_train_set_path, 
        num_rounds, # number of rounds, corresponding to R in the paper
        num_iters, # number of molecules shown at each iteration to the human for feedback, corresponding to T in the paper
        num_queries, # number of molecules shown to the simulated chemist at each iteration
        num_train_samples, # number of training samples to select from the training set
        REINVENT_n_steps, # number of REINVENT optimization steps
        train_similarity, # if True, use the similarity of the training set to select queries
        acquisition, # acquisition: 'uncertainty', 'random', 'thompson', 'greedy' (if None run with no human interaction)
        sigma_noise, # noise level for simulated chemist's responses
        threshold # threshold for high scoring molecules
        ):

    np.random.seed(seed)
    rng = default_rng(seed)

    if acquisition:
        jobname = "fine-tune predictive component HITL"
        jobid = f"{dirname}_rounds_{num_rounds}_iters_{num_iters}_queries_{num_queries}_{acquisition}_noise_{sigma_noise}"
        output_dir = f"{jobid}_seed_{seed}"
    else:
        jobname = "fine-tune predictive component no HITL"
        jobid = f"{dirname}_rounds_{num_rounds}_None"
        output_dir = f"{jobid}_seed_{seed}"
    
    # initial configuration
    conf_filename = "config.json"

    # create root output dir
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)
    
    print(f"Creating output directory: {output_dir}.")
    if feedback_type == "scoring":
        raise NotImplementedError("Score feedback not implemented yet.")
    elif feedback_type == "comparing":
        configuration_JSON_path = write_REINVENT_config_bradley_terry(reinvent_dir, reinvent_env, output_dir, conf_filename, jobid, jobname)
    elif feedback_type == "ranking":
        raise NotImplementedError("Rank feedback not implemented yet.")
    
    print(f"Creating config file: {configuration_JSON_path}.")

    configuration = json.load(open(os.path.join(output_dir, conf_filename)))

    # write specified number of RL optimization steps in configuration
    # (example: if num_rounds = 5 (rounds) and Reinvent REINVENT_n_steps = 100, we will do 5*100 RL optimization steps)
    configuration["parameters"]["reinforcement_learning"]["n_steps"] = REINVENT_n_steps

    # write initial model path in configuration
    configuration_scoring_function = configuration["parameters"]["scoring_function"]["parameters"]
    
    for i in range(len(configuration_scoring_function)):
        configuration_scoring_function[i]["specific_parameters"]["model_path"] = init_model_path
        if feedback_type == "scoring":
            raise NotImplementedError("Score feedback not implemented yet.")
        elif feedback_type == "comparing":
            configuration_scoring_function[i]["specific_parameters"]["bradley_terry"] = "classification"
            predictive_model_name = "bradley_terry_model"
        elif feedback_type == "ranking":
            raise NotImplementedError("Rank feedback not implemented yet.")
        
    # write the updated configuration file to the disc
    configuration_JSON_path = f"{os.getcwd()}/{output_dir}/{conf_filename}"
    print("The configuration file path: ", configuration_JSON_path)

    with open(configuration_JSON_path, 'w') as f:
        json.dump(configuration, f, indent=4, sort_keys=True)
    
    # initialize the active learning with the same pool of generated compounds resulting from a standard Reinvent run
    initial_dir = f"{dirname}_rounds_{num_rounds}_None_seed_{seed}"
    
    if os.path.exists(initial_dir): # if you already have a standard Reinvent run
        # copy the file containing the initial unlabelled pool in your current directory
        os.makedirs(os.path.join(output_dir, "iteration_0"))
        try:
            initial_unlabelled_pool = os.path.join(initial_dir, "results/scaffold_memory.csv")
            shutil.copy(initial_unlabelled_pool, os.path.join(output_dir, "iteration_0"))
        # if this file does not exist, skip this step
        except FileNotFoundError:
            pass
    else: # if you do not have a standard Reinvent run, skip this step
        pass
    
    # multi-parameter optimization (MPO) loop
    print(f"Running MPO experiment with rounds {num_rounds}, iters {num_iters}, queries {num_queries}, seed {seed}. \n Results will be saved at {output_dir}")

    # Initialize human feedback model

    print(f"Loading {feedback_type} feedback model.")

    if feedback_type == "scoring":
        raise NotImplementedError("Score feedback not implemented yet.")
    elif feedback_type == "comparing":
        from training_Bradley_Terry_model.bradley_terry import BradleyTerryModel
        # pickle that torch model of bradley_terry_model.pth
        feedback_model = BradleyTerryModel(feature_dim=2048)
        feedback_model.load_state_dict(torch.load("training_Bradley_Terry_model/bradley_terry_model.pth"))
        print("Loading Bradley Terry model successfully")
    elif feedback_type == "ranking":
        raise NotImplementedError("Rank feedback not implemented yet.")
    
    # store expert scores
    expert_score = []

    READ_ONLY = False # if folder exists, do not overwrite results there

    for REINVENT_round in np.arange(1, num_rounds + 1):
        print("=====================================")
        print(f"REINVENT round = {REINVENT_round}")

        if REINVENT_round == 1:
            if os.path.exists(f"{output_dir}/iteration_0/scaffold_memory.csv"):
                # start from your pre-existing pool of unlabelled compounds
                with open(f"{output_dir}/iteration_0/scaffold_memory.csv", 'r') as file:
                    data = pd.read_csv(file)
                data = data[data["Step"] < 100]
                data.reset_index(inplace=True)
            else:
                # generate a pool of unlabelled compounds with REINVENT
                # print("Run REINVENT")
                # exit_code = os.system(f"{reinvent_env}/bin/python {reinvent_dir}/input.py {configuration_JSON_path} &> {output_dir} /run.err")
                # print("The exit code: ", exit_code)
                
                command = f"{reinvent_env}/bin/python"
                script = f"{reinvent_dir}/input.py"
                config_path = configuration_JSON_path
                stderr_file = f"{output_dir}/run.err"
                stdout_file = f"{output_dir}/run.out"

                # Construct the full command to run
                cmd = [command, script, config_path]

                # Open the file to which you want to redirect stderr and stdout
                
                with open(stderr_file, 'w') as ferr, open(stdout_file, 'w') as fout:
                    # Execute the command
                    result = subprocess.run(cmd, text=True, stdout=fout, stderr=ferr)

                # Check the result
                print("Exit code:", result.returncode)

                with open(f"{output_dir}/results/scaffold_memory.csv", 'r') as file:
                    data = pd.read_csv(file)

        N = len(data)
        colnames = list(data) 
        smiles = data['SMILES']
        bioactivity_score = data['bioactivity'] # the same as raw_bioactivity since no transformation applied
        raw_bioactivity_score = data['raw_bioactivity']
        high_scoring_threshold = threshold
        # save the indexes of high scoring molecules for bioactivity
        high_scoring_idx = bioactivity_score > high_scoring_threshold

        # Scoring component values
        scoring_component_names = [s.split("raw_")[1] for s in colnames if "raw_" in s]
        print(f"scoring components: {scoring_component_names}")
        x = np.array(data[scoring_component_names])
        print(f'Scoring component matrix dimensions: {x.shape}')
        x = x[high_scoring_idx,:]

        # Only analyse highest scoring molecules
        smiles = smiles[high_scoring_idx]
        bioactivity_score = bioactivity_score[high_scoring_idx]
        raw_bioactivity_score = raw_bioactivity_score[high_scoring_idx]
        print(f'{len(smiles)} high-scoring (> {high_scoring_threshold}) molecules')

        if len(smiles) == 0:
            smiles = data['SMILES']
            print(f'{len(smiles)} molecules')

               
        # store molecule indexes selected for feedback
        selected_feedback = np.empty(0).astype(int)
        human_sample_weight = np.empty(0).astype(float)
        # store number of accepted queries (y = 1) at each iteration
        n_accept = []

        ########################### HITL rounds ######################################
        
        for iteration in np.arange(num_iters): # T number of HITL iterations
            print("=====================================")
            print(f"Round = {REINVENT_round}, Iteration = {iteration}")
            
            # classify 
            model = RandomForestClf(fitted_model)
            
            if len(smiles) > num_queries:
                new_query = select_query(data, num_queries, list(smiles), model, selected_feedback, acquisition, rng) # select n smiles with Active Learning
            else:
                new_query = select_query(data, len(smiles), list(smiles), model, selected_feedback, acquisition, rng)
            
            # Initialize the expert values vector
            s_bioactivity = [] # for scores (between 0 and 1)
            v_bioactivity = [] # for continuous feedback (regression)
            
            # Get expert feedback on selected queries
            print(new_query)
            for i in new_query:
                cur_mol = data.iloc[i]["SMILES"]
                print(cur_mol)
                value = feedback_model.human_score(cur_mol, sigma_noise)
                s_bioactivity.append(value)
            
            # Get raw scores and transformed score (if any) from the high scoring molecules in U
            raw_scoring_component_names = ["raw_"+name for name in scoring_component_names] 
            x_raw = data[raw_scoring_component_names].to_numpy()
            x =  data[scoring_component_names].to_numpy()

            # get (binary) simulated chemist's responses
            
            new_y = np.array([1 if s > 0.5 else 0 for s in s_bioactivity])
            accepted = new_y.tolist()
            
            expert_score += [accepted]
            n_accept += [sum(accepted)]

            print(f"Feedback idx at iteration {REINVENT_round}, {iteration}: {new_query}")
            print(f"Number of accepted molecules at iteration {REINVENT_round}, {iteration}: {n_accept[iteration]}")   
            
            # append feedback
            if len(new_y) > 0:
                selected_feedback = np.hstack((selected_feedback, new_query))

            mask = np.ones(N, dtype=bool)
            mask[selected_feedback] = False

            # use the augmented training data to retrain the model
            new_smiles = data.iloc[new_query].SMILES.tolist()
            new_mols = [Chem.MolFromSmiles(s) for s in new_smiles]
            new_x = fingerprints_from_mol(new_mols, type = "counts")
            new_human_sample_weight = np.array([s if s > 0.5 else 1-s for s in s_bioactivity])
            sample_weight = np.concatenate([sample_weight, new_human_sample_weight])
            print(len(new_x), len(new_y))
            x_train = np.concatenate([x_train, new_x])
            y_train = np.concatenate([y_train, new_y])
            smiles_train = np.concatenate([smiles_train, new_smiles])
            print(f"Augmented train set size at iteration {REINVENT_round}: {x_train.shape[0]} {y_train.shape[0]}")
            # save augmented training data
            D_r = pd.DataFrame(np.concatenate([smiles_train.reshape(-1,1), x_train, y_train.reshape(-1,1)], 1))
            D_r.columns = ["SMILES"] + [f"bit{i}" for i in range(x_train.shape[1])] + ["target"]
            D_r.to_csv(os.path.join(output_dir, f"augmented_train_set_iter{REINVENT_round}.csv"))

            # re-fit and save the model using the augmented train set and save to new directory
            model_new_savefile = output_dir + '/{}_iteration_{}.pkl'.format(predictive_model_name, REINVENT_round)
            model._retrain(x_train, y_train, sample_weight = sample_weight, save_to_path = model_new_savefile)
            fitted_model = pickle.load(open(model_new_savefile, 'rb'))

            # get current configuration
            configuration = json.load(open(os.path.join(output_dir, conf_filename)))
            conf_filename = "iteration{}_config.json".format(REINVENT_round)    

            # modify model path in configuration
            configuration_scoring_function = configuration["parameters"]["scoring_function"]["parameters"]
            for i in range(len(configuration_scoring_function)):
                if configuration_scoring_function[i]["component_type"] == "predictive_property":
                    configuration_scoring_function[i]["specific_parameters"]["model_path"] = model_new_savefile

            # Keep agent checkpoint
            if REINVENT_round == 1:
                configuration["parameters"]["reinforcement_learning"]["agent"] = os.path.join(initial_dir, "results/Agent.ckpt")
            else:
                configuration["parameters"]["reinforcement_learning"]["agent"] = os.path.join(output_dir, "results/Agent.ckpt")

        root_output_dir = os.path.expanduser("{}_seed{}".format(jobid, seed))

        # Define new directory for the next round
        output_dir = os.path.join(root_output_dir, "iteration{}_{}".format(REINVENT_round, acquisition))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        print(output_dir)

        # modify log and result paths in configuration
        configuration["logging"]["logging_path"] = os.path.join(output_dir, "progress.log")
        configuration["logging"]["result_folder"] = os.path.join(output_dir, "results")

        # write the updated configuration file to the disc
        configuration_JSON_path = os.path.join(output_dir, conf_filename)
        with open(configuration_JSON_path, 'w') as f:
            json.dump(configuration, f, indent=4, sort_keys=True)

    r = np.arange(len(expert_score))
    m_score = [np.mean(expert_score[i]) for i in r]
    print("Mean expert score : ", m_score)

In [26]:
print(os.getcwd())

/home/springnuance/reinvent-hitl/Base-Code-Binh


In [27]:
seed = 42
dirname = "outputs"

# change these path variables as required
reinvent_dir = os.path.expanduser("/home/springnuance/reinvent-hitl/Reinvent") # We must use absolute path
#reinvent_env = os.path.expanduser("/home/springnuance/miniconda3/envs/cc_env_hitl") # We must use absolute path
#reinvent_env = os.path.expanduser("/home/springnuance/miniconda3/envs/reinvent.v3.2") # We must use absolute path
reinvent_env = os.path.expanduser("/home/springnuance/miniconda3/envs/ReinventCommunity") # We must use absolute path



# the performance should not be good, around random accuracy 
# If the model is too good, retrain the model to become weaker, we are trying to make the model to learn via HITL

feedback_type = "comparing" # scoring, comparing, ranking

# feedback type as scoring:
# Given a molecule, what is the probability that the molecule is active regarding DRD2?  
# init_model_path = "/home/springnuance/reinvent-hitl/Base-Code-Binh/training_Random_Forest_model/random_forest_model.pkl"

# feedback type as comparing:
# Given two molecules, what is the probability that the first molecule is more active than the second molecule regarding DRD2?
init_model_path = "/home/springnuance/reinvent-hitl/Base-Code-Binh/training_Bradley_Terry_model/bradley_terry_model.pth"
init_train_set_path = "/home/springnuance/reinvent-hitl/Base-Code-Binh/training_Bradley_Terry_model/small_drd2_rank_data.csv" 


# feedback type as ranking:
# Given N molecules, what are the orders of preference of these molecules regarding DRD2?
# init_model_path = "/home/springnuance/reinvent-hitl/Base-Code-Binh/training_List_Net_model/list_net_model.pth"

num_rounds = 2 # number of rounds
REINVENT_n_steps = 100 # number of REINVENT optimization steps
train_similarity = False # if True, use the similarity of the training set to select queries

# Please look at the thompson sampling code and fix it!
acquisition = "thompson" # acquisition: 'uncertainty', 'random', 'thompson', 'greedy' (if None run with no human interaction)

sigma_noise = 0.0 # noise level for simulated chemist's responses
num_iters = 10 # number of molecules shown at each iteration to the human for feedback
num_queries = 10 # number of molecules shown to the simulated chemist at each iteration (10 pairs)
num_train_samples = 30 # number of training samples to select from the training set, ignore if train_similarity is False
threshold = 0.5 # threshold for high scoring molecules

run_HITL_classify(
        seed, dirname, reinvent_dir, reinvent_env,
        feedback_type, # score, compare, rank
        init_model_path, 
        init_train_set_path, 
        num_rounds, # number of rounds, corresponding to R in the paper
        num_iters, # number of molecules shown at each iteration to the human for feedback, corresponding to T in the paper
        num_queries, # number of molecules shown to the simulated chemist at each iteration
        num_train_samples, # number of training samples to select from the training set
        REINVENT_n_steps, # number of REINVENT optimization steps
        train_similarity, # if True, use the similarity of the training set to select queries
        acquisition, # acquisition: 'uncertainty', 'random', 'thompson', 'greedy' (if None run with no human interaction)
        sigma_noise, # noise level for simulated chemist's responses
        threshold # threshold for high scoring molecules
)


Creating output directory: outputs_rounds_2_iters_10_queries_10_thompson_noise_0.0_seed_42.
Creating config file: outputs_rounds_2_iters_10_queries_10_thompson_noise_0.0_seed_42/config.json.
The configuration file path:  /home/springnuance/reinvent-hitl/Base-Code-Binh/outputs_rounds_2_iters_10_queries_10_thompson_noise_0.0_seed_42/config.json
Running MPO experiment with rounds 2, iters 10, queries 10, seed 42. 
 Results will be saved at outputs_rounds_2_iters_10_queries_10_thompson_noise_0.0_seed_42
Loading comparing feedback model.
Loading Bradley Terry model successfully
REINVENT round = 1
Exit code: 1


FileNotFoundError: [Errno 2] No such file or directory: 'outputs_rounds_2_iters_10_queries_10_thompson_noise_0.0_seed_42/results/scaffold_memory.csv'