In [31]:
# Re-import edited project modules automatically before each cell runs,
# so changes to local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
# load dependencies
import json
import os
import pickle
import shutil
import subprocess
import sys
from ast import literal_eval

import numpy as np
import pandas as pd
import rdkit.Chem as Chem
from numpy.random import default_rng

from utils import fingerprints_from_mol
from scripts.simulated_expert import ActivityEvaluationModel, logPEvaluationModel
from scripts.write_config import write_REINVENT_config, write_sample_file
from models.RandomForest import RandomForestReg, RandomForestClf
from scripts.acquisition import select_query

### First, we need to install the custom reinvent scoring package to support the Bradley-Terry model

In [23]:
! pip show reinvent_scoring

Name: reinvent-scoring
Version: 0.0.73
Summary: Scoring functions for Reinvent
Home-page: https://github.com/MolecularAI/reinvent-scoring.git
Author: MolecularAI
Author-email: patronov@gmail.com
License: UNKNOWN
Location: /home/springnuance/reinvent-hitl/reinvent_scoring
Requires: 
Required-by: 


### If reinvent_scoring is already installed, uninstall it first

In [24]:
! pip uninstall -y reinvent_scoring

Found existing installation: reinvent-scoring 0.0.73
Can't uninstall 'reinvent-scoring'. No files were found to uninstall.


### Now we install the custom reinvent scoring package

The `-e` flag installs the package in editable mode, so changes to its code take effect immediately without reinstalling. All package metadata is defined in `setup.py`.

In [25]:
! pip install -e "../reinvent_scoring"

Obtaining file:///home/springnuance/reinvent-hitl/reinvent_scoring
  Preparing metadata (setup.py) ... [?25ldone
[?25hInstalling collected packages: reinvent-scoring-bradley-terry
  Attempting uninstall: reinvent-scoring-bradley-terry
    Found existing installation: reinvent-scoring-bradley-terry 0.0.73
    Uninstalling reinvent-scoring-bradley-terry-0.0.73:
      Successfully uninstalled reinvent-scoring-bradley-terry-0.0.73
  Running setup.py develop for reinvent-scoring-bradley-terry
Successfully installed reinvent-scoring-bradley-terry-0.0.73


In [26]:
# !pip install PyTDC

In [33]:
def extract_ECFP_dataset(init_train_set_path, num_train_samples):
    """
    Load the background training data used to pre-train the predictive model.

    The CSV at ``init_train_set_path`` must contain a ``smiles`` column, the
    fingerprint columns ``bit0`` ... ``bit2047`` and an ``activity`` column.

    Parameters
    ----------
    init_train_set_path : str
        Path to the training-set CSV file.
    num_train_samples : int
        Number of active molecules (``activity == 1``) to sample. If fewer
        actives are available, all of them are returned and a notice is printed.

    Returns
    -------
    x_train : numpy.ndarray
        Feature matrix with one row per molecule and 2048 columns.
    y_train : numpy.ndarray
        Flat array with the ``activity`` labels.
    sample_weight : numpy.ndarray
        Array of ones, one weight per training row.
    smiles_train : numpy.ndarray
        Flat array with the SMILES strings.
    train_sample : list
        SMILES of the randomly sampled active molecules.
    """

    print("Loading D0")
    train_set = pd.read_csv(init_train_set_path)
    feature_cols = [f"bit{i}" for i in range(2048)]
    smiles_train = train_set["smiles"].values.reshape(-1)
    x_train = train_set[feature_cols].values
    y_train = train_set["activity"].values.reshape(-1)
    sample_weight = np.ones(len(x_train), dtype=float)
    print("The feature matrix shape: ", x_train.shape)
    print("The labels shape: ", y_train.shape)

    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (sampling without replacement), so cap the request at the number of actives.
    actives = train_set[train_set["activity"] == 1]
    n_samples = min(num_train_samples, len(actives))
    if n_samples < num_train_samples:
        print(f"Only {len(actives)} active molecules available; sampling {n_samples} instead of {num_train_samples}.")
    train_sample = actives.sample(n_samples).smiles.tolist()
    return x_train, y_train, sample_weight, smiles_train, train_sample

In [28]:
def run_HITL(
        seed, dirname, reinvent_dir, reinvent_env,
        init_model_path, 
        init_train_set_path, 
        model_type, # either "regression" or "classification"
        num_rounds, # number of rounds, corresponding to R in the paper
        num_iters, # number of HITL feedback iterations per round, corresponding to T in the paper
        num_queries, # number of molecules shown to the simulated chemist at each iteration
        num_train_samples, # number of training samples to select from the training set
        REINVENT_n_steps, # number of REINVENT optimization steps
        train_similarity, # if True, use the similarity of the training set to select queries
        pretrained_prior, # if True, use a pre-trained prior
        acquisition, # acquisition: 'uncertainty', 'random', 'thompson', 'greedy' (if None run with no human interaction)
        sigma_noise, # noise level for simulated chemist's responses
        threshold # threshold for high scoring molecules
        ):
    """
    Run the human-in-the-loop (HITL) loop around REINVENT.

    For each of ``num_rounds`` rounds the function obtains a pool of generated
    molecules (by running REINVENT or reusing a previous pool), performs
    ``num_iters`` feedback iterations in which up to ``num_queries`` molecules
    are selected with ``select_query``, scored by the feedback model and added
    to the training set, retrains the predictive model, and finally writes the
    REINVENT configuration for the next round into a new per-round directory
    nested under the root output directory.

    Parameters
    ----------
    seed : int
        Seed for ``np.random.seed`` and the ``default_rng`` generator.
    dirname : str
        Prefix used to build the job id and output directory names.
    reinvent_dir, reinvent_env : str
        REINVENT installation (containing ``input.py``) and the Python
        environment (containing ``bin/python``) used to run it.
    init_model_path : str
        Path to the pickled initial predictive model.
    init_train_set_path : str
        CSV passed to ``extract_ECFP_dataset``.
    model_type : str
        ``"regression"`` or ``"classification"``.
    num_rounds, num_iters, num_queries : int
        Number of rounds, feedback iterations per round, and queries per iteration.
    num_train_samples : int
        Number of training-set molecules sampled for the ``tanimoto_similarity`` component.
    REINVENT_n_steps : int
        Written to the configuration as the number of RL steps per REINVENT run.
    train_similarity : bool
        Only printed by this function.
    pretrained_prior : bool
        If truthy, the agent in the configuration is replaced by a fixed prior path.
    acquisition : str or None
        Acquisition strategy forwarded to ``select_query``; a falsy value
        selects the "no HITL" job naming and always runs REINVENT in round 1.
    sigma_noise : float
        Noise level forwarded to ``feedback_model.human_score``.
    threshold : float
        Molecules with ``bioactivity`` above this value form the query pool.

    Side effects: deletes and recreates the root output directory, writes
    configuration, training-set and model files, and runs REINVENT as a
    subprocess. Returns ``None``; the mean expert score per feedback
    iteration is printed at the end.
    """

    np.random.seed(seed)
    rng = default_rng(seed)

    if acquisition:
        jobname = "fine-tune predictive component HITL"
        jobid = f"{dirname}_rounds_{num_rounds}_iters_{num_iters}_queries_{num_queries}_{acquisition}_noise_{sigma_noise}"
    else:
        jobname = "fine-tune predictive component no HITL"
        jobid = f"{dirname}_rounds_{num_rounds}_None"
    output_dir = f"{jobid}_seed_{seed}"
    # Per-round directories are nested under this root so that every output of
    # one run lives in (and is cleaned together with) a single directory.
    root_output_dir = os.path.expanduser(output_dir)

    # initial configuration
    conf_filename = "config.json"

    # create root output dir
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    print(f"Creating output directory: {output_dir}.")
    configuration_JSON_path = write_REINVENT_config(reinvent_dir, reinvent_env, output_dir, conf_filename, jobid, jobname)
    print(f"Creating config file: {configuration_JSON_path}.")

    configuration = _read_json(os.path.join(output_dir, conf_filename))
    print(configuration)
    # load background training data used to pre-train the predictive model
    x_train, y_train, sample_weight, smiles_train, train_sample = extract_ECFP_dataset(init_train_set_path, num_train_samples)

    print("The training similarity used to select queries: ", train_similarity)
    print("The number of training samples: ", len(train_sample))

    # write specified number of RL optimization steps in configuration
    # (example: if num_rounds = 5 (rounds) and Reinvent REINVENT_n_steps = 100, we will do 5*100 RL optimization steps)
    configuration["parameters"]["reinforcement_learning"]["n_steps"] = REINVENT_n_steps

    # write initial model path in configuration
    configuration_scoring_function = configuration["parameters"]["scoring_function"]["parameters"]
    print(configuration_scoring_function)
    for component in configuration_scoring_function:
        if component["component_type"] == "predictive_property":
            component["specific_parameters"]["model_path"] = init_model_path
            component["specific_parameters"]["scikit"] = model_type
            if model_type == "classification":
                component["specific_parameters"]["transformation"] = {"transformation_type": "no_transformation"}
        if component["component_type"] == "tanimoto_similarity":
            component["specific_parameters"]["smiles"] = train_sample
    if pretrained_prior:
        # NOTE(review): machine-specific absolute path; it must exist on the host running this code.
        configuration["parameters"]["reinforcement_learning"]["agent"] = "/home/klgx638/Generations/HITL_qsar_experiments_final/priors/logp/focused.agent"

    # write the updated configuration file to the disc
    configuration_JSON_path = os.path.join(output_dir, conf_filename)
    with open(configuration_JSON_path, 'w') as f:
        json.dump(configuration, f, indent=4, sort_keys=True)

    # initialize the active learning with the same pool of generated compounds resulting from a standard Reinvent run
    initial_dir = f"{dirname}_rounds_{num_rounds}_None_seed_{seed}"
    if os.path.exists(initial_dir): # if you already have a standard Reinvent run
        # copy the file containing the initial unlabelled pool in your current directory
        os.makedirs(os.path.join(output_dir, "iteration_0"))
        try:
            initial_unlabelled_pool = os.path.join(initial_dir, "results/scaffold_memory.csv")
            shutil.copy(initial_unlabelled_pool, os.path.join(output_dir, "iteration_0"))
        # if this file does not exist, skip this step
        except FileNotFoundError:
            pass

    print(f"Running MPO experiment with rounds {num_rounds}, iters {num_iters}, queries {num_queries}, seed {seed}. \n Results will be saved at {output_dir}")

    # initialize human feedback model
    # FIXME: placeholder. `...` (Ellipsis) has no `human_score` / `utility`
    # methods, so the first feedback query will raise AttributeError until a
    # real feedback model (e.g. the Bradley-Terry model loaded with Torch) is
    # assigned here.
    feedback_model = ... 
    print("Loading feedback model.")

    # load the predictive model
    predictive_model_name = init_model_path.split("/")[-1].split(".")[0]
    print("The predictive model name: ", predictive_model_name)
    model_load_path = output_dir + '/{}_iteration_0.pkl'.format(predictive_model_name)
    if not os.path.exists(model_load_path):
        shutil.copy(init_model_path, output_dir)
    fitted_model = _load_pickled_model(init_model_path)
    print("Loading predictive model.")

    # store expert scores
    expert_score = []

    READ_ONLY = False # if folder exists, do not overwrite results there

    for REINVENT_round in np.arange(1, num_rounds + 1):

        if REINVENT_round == 1 and acquisition:
            initial_pool_path = os.path.join(output_dir, "iteration_0/scaffold_memory.csv")
            if os.path.exists(initial_pool_path):
                # start from your pre-existing pool of unlabelled compounds
                data = pd.read_csv(initial_pool_path)
                data = data[data["Step"] < 100]
                data.reset_index(inplace=True)
            else:
                # generate a pool of unlabelled compounds with REINVENT
                print("Run REINVENT")
                _run_reinvent(reinvent_env, reinvent_dir, configuration_JSON_path, output_dir)
                data = pd.read_csv(os.path.join(output_dir, "results/scaffold_memory.csv"))

        else:
            if not READ_ONLY:
                # run REINVENT
                print("Run REINVENT")
                _run_reinvent(reinvent_env, reinvent_dir, configuration_JSON_path, output_dir)
            else:
                print("Reading REINVENT results from file, no re-running.")

            data = pd.read_csv(os.path.join(output_dir, "results/scaffold_memory.csv"))

        colnames = list(data)
        smiles = data['SMILES']
        bioactivity_score = data['bioactivity'] # the same as raw_bioactivity since no transformation applied
        high_scoring_threshold = threshold
        # boolean mask of high scoring molecules for bioactivity
        high_scoring_idx = bioactivity_score > high_scoring_threshold

        # Scoring component values
        scoring_component_names = [s.split("raw_")[1] for s in colnames if "raw_" in s]
        print(f"scoring components: {scoring_component_names}")
        x = np.array(data[scoring_component_names])
        print(f'Scoring component matrix dimensions: {x.shape}')

        # Only analyse highest scoring molecules
        smiles = smiles[high_scoring_idx]
        print(f'{len(smiles)} high-scoring (> {high_scoring_threshold}) molecules')

        # fall back to the whole pool when nothing passes the threshold
        if len(smiles) == 0:
            smiles = data['SMILES']
            print(f'{len(smiles)} molecules')

        # store molecule indexes selected for feedback
        selected_feedback = np.empty(0).astype(int)
        # store number of accepted queries (y = 1) at each iteration
        n_accept = []
        # path of the model retrained in this round (stays None if no iteration runs)
        model_new_savefile = None

        ########################### HITL rounds ######################################

        for iteration in np.arange(num_iters): # T number of HITL iterations
            print(f"Round = {REINVENT_round}, Iteration = {iteration}")
            # query selection
            if model_type == "regression":
                model = RandomForestReg(fitted_model)
            if model_type == "classification":
                model = RandomForestClf(fitted_model)
            # never ask for more queries than there are candidate molecules
            n_to_query = min(num_queries, len(smiles))
            new_query = select_query(data, n_to_query, list(smiles), model, selected_feedback, acquisition, rng) # select n smiles with Active Learning

            # Initialize the expert values vector
            s_bioactivity = [] # for scores (between 0 and 1)
            v_bioactivity = [] # for continuous feedback (regression)
            # Get expert feedback on selected queries
            print(new_query)
            for i in new_query:
                cur_mol = data.iloc[i]["SMILES"]
                print(cur_mol)
                value = feedback_model.human_score(cur_mol, sigma_noise)
                s_bioactivity.append(value)
                if model_type == "regression":
                    v_bioactivity.append(feedback_model.utility(value, low = 2, high = 4))

            # get (binary) simulated chemist's responses
            if model_type == "regression":
                new_y = np.array(v_bioactivity)
                s_bioactivity = [1 if s > 0.5 else 0 for s in v_bioactivity]
                accepted = s_bioactivity
            if model_type == "classification":
                new_y = np.array([1 if s > 0.5 else 0 for s in s_bioactivity])
                accepted = new_y.tolist()
            expert_score += [accepted]
            n_accept += [sum(accepted)]

            print(f"Feedback idx at iteration {REINVENT_round}, {iteration}: {new_query}")
            print(f"Number of accepted molecules at iteration {REINVENT_round}, {iteration}: {n_accept[iteration]}")   

            # append feedback
            if len(new_y) > 0:
                selected_feedback = np.hstack((selected_feedback, new_query))

            # use the augmented training data to retrain the model
            new_smiles = data.iloc[new_query].SMILES.tolist()
            new_mols = [Chem.MolFromSmiles(s) for s in new_smiles]
            new_x = fingerprints_from_mol(new_mols, type = "counts")
            # weight each new label by how far the score is from the 0.5 decision boundary
            new_human_sample_weight = np.array([s if s > 0.5 else 1-s for s in s_bioactivity])
            sample_weight = np.concatenate([sample_weight, new_human_sample_weight])
            print(len(new_x), len(new_y))
            x_train = np.concatenate([x_train, new_x])
            y_train = np.concatenate([y_train, new_y])
            smiles_train = np.concatenate([smiles_train, new_smiles])
            print(f"Augmented train set size at iteration {REINVENT_round}: {x_train.shape[0]} {y_train.shape[0]}")
            # save augmented training data
            D_r = pd.DataFrame(np.concatenate([smiles_train.reshape(-1,1), x_train, y_train.reshape(-1,1)], 1))
            D_r.columns = ["SMILES"] + [f"bit{i}" for i in range(x_train.shape[1])] + ["target"]
            D_r.to_csv(os.path.join(output_dir, f"augmented_train_set_iter{REINVENT_round}.csv"))

            # re-fit and save the model using the augmented train set and save to new directory
            model_new_savefile = output_dir + '/{}_iteration_{}.pkl'.format(predictive_model_name, REINVENT_round)
            model._retrain(x_train, y_train, sample_weight = sample_weight, save_to_path = model_new_savefile)
            fitted_model = _load_pickled_model(model_new_savefile)

        # get current configuration
        # This runs once per round: the file named by the updated conf_filename
        # only exists in the next round's directory, so reloading it after
        # every iteration would fail from the second iteration onwards.
        configuration = _read_json(os.path.join(output_dir, conf_filename))
        conf_filename = "iteration{}_config.json".format(REINVENT_round)    

        # modify model path in configuration
        if model_new_savefile is not None:
            for component in configuration["parameters"]["scoring_function"]["parameters"]:
                if component["component_type"] == "predictive_property":
                    component["specific_parameters"]["model_path"] = model_new_savefile

        # Keep agent checkpoint
        if REINVENT_round == 1:
            configuration["parameters"]["reinforcement_learning"]["agent"] = os.path.join(initial_dir, "results/Agent.ckpt")
        else:
            configuration["parameters"]["reinforcement_learning"]["agent"] = os.path.join(output_dir, "results/Agent.ckpt")

        # Define new directory for the next round
        output_dir = os.path.join(root_output_dir, "iteration{}_{}".format(REINVENT_round, acquisition))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        print(output_dir)

        # modify log and result paths in configuration
        configuration["logging"]["logging_path"] = os.path.join(output_dir, "progress.log")
        configuration["logging"]["result_folder"] = os.path.join(output_dir, "results")

        # write the updated configuration file to the disc
        configuration_JSON_path = os.path.join(output_dir, conf_filename)
        with open(configuration_JSON_path, 'w') as f:
            json.dump(configuration, f, indent=4, sort_keys=True)

    m_score = [np.mean(scores) for scores in expert_score]
    print("Mean expert score : ", m_score)


def _read_json(path):
    """Parse the JSON file at ``path`` and close the file handle afterwards."""
    with open(path) as handle:
        return json.load(handle)


def _load_pickled_model(path):
    """Unpickle the predictive model stored at ``path``, closing the file afterwards."""
    # NOTE: pickle can execute arbitrary code while loading; only load trusted model files.
    with open(path, 'rb') as handle:
        try:
            return pickle.load(handle)
        except pickle.UnpicklingError as err:
            # Same exception type as before, but naming the file and the likely cause.
            raise pickle.UnpicklingError(
                f"Could not unpickle predictive model '{path}': {err} "
                "The file must be a plain pickle of a model object; checkpoints written "
                "with torch.save (e.g. .pt/.pth files) cannot be read with pickle.load."
            ) from err


def _run_reinvent(reinvent_env, reinvent_dir, configuration_JSON_path, output_dir):
    """Run REINVENT and wait for it, sending stdout and stderr to ``<output_dir>/run.err``."""
    # The bash-only `&>` redirection is not understood by a POSIX /bin/sh such
    # as dash, where it starts the command in the background instead. Running
    # the interpreter directly avoids the shell and always blocks until done.
    command = [reinvent_env + '/bin/python', reinvent_dir + '/input.py', configuration_JSON_path]
    log_path = output_dir + '/run.err'
    with open(log_path, 'w') as log_file:
        completed = subprocess.run(command, stdout=log_file, stderr=subprocess.STDOUT)
    if completed.returncode != 0:
        print(f"REINVENT exited with code {completed.returncode}; see {log_path} for details.")
    return completed.returncode

In [29]:
# Show the working directory: relative paths used in this notebook
# (e.g. "models/..." and "../reinvent_scoring") are resolved against it.
print(os.getcwd())

/home/springnuance/reinvent-hitl/Base-Code-Binh


In [30]:
# Experiment configuration: every value below is passed positionally to run_HITL.
seed = 42
dirname = "outputs"

# change these path variables as required
reinvent_dir = os.path.expanduser("/home/springnuance/reinvent-hitl/Reinvent") # We must use absolute path
reinvent_env = os.path.expanduser("/home/springnuance/miniconda3/envs/cc_env_reinvent") # We must use absolute path

init_model_path = "models/bradley_terry_model.pth" 
# TODO: check the performance of this Bradley-Terry model.
# It should start out weak (around random accuracy); if it is already too good,
# retrain a weaker one, because the point of the experiment is to learn the model.
# NOTE(review): run_HITL reads this file with pickle.load, and the recorded run
# below failed with an UnpicklingError on it.

init_train_set_path = "/home/springnuance/reinvent-hitl/Base-Code-Binh/data/drd2/small_drd2_rank_data.csv" 

model_type = "classification" # either "regression" or "classification"
num_rounds = 2 # number of rounds
REINVENT_n_steps = 100 # number of REINVENT optimization steps
train_similarity = False # if True, use the similarity of the training set to select queries
pretrained_prior = False # if True, use a pre-trained prior


# TODO: review the Thompson sampling code and fix it.
acquisition = "thompson" # acquisition: 'uncertainty', 'random', 'thompson', 'greedy' (if None run with no human interaction)

sigma_noise = 0.0 # noise level for simulated chemist's responses
num_iters = 10 # number of HITL feedback iterations per round
num_queries = 10 # number of molecules shown to the simulated chemist at each iteration (10 pairs)
num_train_samples = 30 # number of active training molecules sampled from the training set
threshold = 0.5 # threshold for high scoring molecules

run_HITL(seed, dirname, reinvent_dir, reinvent_env,
         init_model_path, init_train_set_path, 
         model_type, num_rounds, 
         num_iters, num_queries, 
         num_train_samples,
         REINVENT_n_steps, 
         train_similarity, pretrained_prior, 
         acquisition, sigma_noise, 
         threshold)



Creating output directory: outputs_rounds_2_iters_10_queries_10_thompson_noise_0.0_seed_42.
Creating config file: outputs_rounds_2_iters_10_queries_10_thompson_noise_0.0_seed_42/config.json.
{'logging': {'job_id': 'outputs_rounds_2_iters_10_queries_10_thompson_noise_0.0', 'job_name': 'fine-tune predictive component HITL', 'logging_frequency': 0, 'logging_path': 'outputs_rounds_2_iters_10_queries_10_thompson_noise_0.0_seed_42/progress.log', 'recipient': 'local', 'result_folder': 'outputs_rounds_2_iters_10_queries_10_thompson_noise_0.0_seed_42/results', 'sender': 'http://127.0.0.1'}, 'model_type': 'default', 'parameters': {'diversity_filter': {'bucket_size': 25, 'minscore': 0.2, 'minsimilarity': 0.4, 'name': 'IdenticalMurckoScaffold'}, 'inception': {'memory_size': 20, 'sample_size': 5, 'smiles': []}, 'reinforcement_learning': {'agent': '../Reinvent/data/random.prior.new', 'batch_size': 128, 'learning_rate': 0.0001, 'margin_threshold': 50, 'n_steps': 250, 'prior': '../Reinvent/data/random

UnpicklingError: A load persistent id instruction was encountered,
but no persistent_load function was specified.