In [None]:
# imports
import pickle
import sys
sys.path.append(f"../active_learning/regression")
sys.path.append(f"../utils")
import acqf as a
import acquisition as aq
import modelling as md
from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor
from rdkit.Chem import rdFMCS
import pandas as pd
import numpy as np
from sklearn.base import clone


In [2]:
# initialization with SMILES and model-descriptor.
# Everything a reader might tune lives in this cell; later cells only read these names.

# Substrate written to the "Reactant_SMILES" column of target.csv.
target_SMILES = "CC1(C)CCC[C@]2(C)[C@H]3CC(=O)O[C@]3(C)CC[C@@H]12"
# Product SMILES; one row per product is written to target.csv.
products = ["O=C(O[C@@]1(CC[C@@]23[H])C)C[C@@H]1[C@@]3(C)C[C@@H](O)CC2(C)C",
            "O=C(O[C@@]1(CC[C@@]23[H])C)C[C@@H]1[C@@]3(C)CCC(C2(C)C)=O",
            "O=C(O[C@@]1(CC[C@@]23[H])C)C[C@@H]1[C@@]3(C)CC(CC2(C)C)=O"]
# Optional per-product selectivities; when falsy, `yields` is used instead
# and the "Selectivity (%)" column is filled with NaN.
sels = None
# Per-product yields, in the same order as `products`.
yields = [25, 57, 18]
# Descriptor set; selects which descriptor families are computed below.
feature_choice = "custom"
# Forwarded as the `atom` argument of md.prepare_reactivity_mapping.
atom = "O"
# Forwarded as the `df_folder` argument of a.make_descriptors_basic.
df_folder = 'preprocessed_reactions_no_unspec_no_intra_unnorm'
# Pickled results to analyse. Built from the settings above so the path cannot
# drift out of sync with target_SMILES / feature_choice.
file = f"../results/active_learning/regression/experimental/results_{target_SMILES}_1_coldstart_{feature_choice}.pkl"

In [3]:
# Build one row per product for the target reaction and persist it as
# target.csv, which the descriptor cell reads back by file name.
n_products = len(products)
target_columns = {
    "Reactant_SMILES": [target_SMILES] * n_products,
    "Product_SMILES": products,
    "rxn_ID": [0] * n_products,
}
if sels:
    # Selectivities were supplied: no yield column is written.
    target_columns["Selectivity (%)"] = sels
else:
    # Only yields are known: selectivity is left as NaN.
    target_columns["Yield (%)"] = yields
    target_columns["Selectivity (%)"] = [np.nan] * n_products
target_df = pd.DataFrame(target_columns)
target_df.to_csv("target.csv")

In [None]:
# Descriptor families that can be computed for the target, keyed by the name
# passed to md.prepare_reactivity_mapping. Each value lists the
# `feature_choice` settings that enable that family. Insertion order is the
# order in which the families are computed and appended to `dfs`.
descriptor_choices = {
    'XTB': ["xtb", "custom", "selected"],
    'BDE': ["bde", "custom", "selected"],
    'Gasteiger': ["gas", "custom", "selected"],
    'ENV-1': ["env1", "selected"],
    'ENV-2': ["env2", "selected"],
    'DBSTEP': ["dbstep", "selected"],
}

dfs = []
for descriptor_name, enabling_choices in descriptor_choices.items():
    if feature_choice not in enabling_choices:
        continue
    # NOTE: `target_df` is rebound here, replacing the frame that was written
    # to target.csv; after the loop it holds the last descriptor frame computed.
    target_df = md.prepare_reactivity_mapping(descriptor_name, file="target.csv",
                                              preprocess=True,
                                              normalize=False, threshold_correlated=1,
                                              rxn_folder="target_data", atom=atom)
    dfs.append(target_df)

# Reference descriptor table; only its column names are used in this cell.
df = a.make_descriptors_basic(option=feature_choice, df_folder=df_folder)
if len(dfs) == 1:
    # A single descriptor family: use its frame as-is (all of its columns).
    targ = dfs[0]
else:
    # Several families: keep only the columns present in `df`, in `df`'s column
    # order. If a column appears in more than one frame, the later frame in
    # `dfs` wins. With no frames at all, `targ` stays empty.
    targ = pd.DataFrame()
    for col in df.columns:
        for df_ in dfs:
            if col in df_.columns:
                targ[col] = df_[col]

In [5]:
# Set the atom of interest to the most reactive carbon, or to any other atom of interest.
# This value is compared against the keys of each prediction dict in rank_of_atom.
# NOTE(review): presumably an atom index in the target molecule - confirm the indexing convention.
atom_of_interest = 3

In [6]:
def rank_of_atom(y_pt, atom_of_interest):
    """Return the 1-based rank of ``atom_of_interest`` among predicted values.

    Entries of ``y_pt`` are ordered from highest to lowest value; entries with
    equal values are ordered by key, highest key first.

    Parameters
    ----------
    y_pt : dict
        Mapping from atom key to predicted value.
    atom_of_interest
        Key to look up in ``y_pt``.

    Returns
    -------
    int or None
        Rank of the key (1 = highest value), or ``None`` when the key is not
        present in ``y_pt``.
    """
    ranked = sorted(y_pt.items(), key=lambda item: (item[1], item[0]), reverse=True)
    for position, (atom_key, _) in enumerate(ranked, start=1):
        if atom_key == atom_of_interest:
            return position
    return None

In [7]:
# Load the pickled results and split them into their three components.
# NOTE: pickle can execute arbitrary code on load - only open result files
# produced by a trusted run of this project.
with open(file, 'rb') as results_handle:
    results_1 = pickle.load(results_handle)
(t5, smis, carbon_preds) = results_1

In [None]:
from rdkit import Chem
# Render the target molecule. Uses the configured target_SMILES rather than a
# second hard-coded copy, so the picture always matches the analysed target.
Chem.MolFromSmiles(target_SMILES)

In [None]:
# Render a second structure from a hard-coded SMILES.
# NOTE(review): this SMILES is not referenced anywhere else in the visible
# notebook - confirm what it is for.
Chem.MolFromSmiles('C[C@@H]1CC[C@H]2C(C)(C)[C@H]3C[C@@]12CC[C@@]3(C)O')

In [10]:
# Acquisition-function names. They are matched to the entries of
# `carbon_preds` purely by position, so the order must not change.
afs = [
    'random',
    'acqf_1',
    'acqf_10',
    'acqf_2-1',
    'acqf_3',
    'acqf_4-1',
    'acqf_5',
    'acqf_6',
    'acqf_7',
    'acqf_9',
]

In [11]:
# For every acquisition function, rank the atom of interest in each prediction
# of each run, then take the mean and standard deviation across runs.
af_mean_ranks = {}
af_std_ranks = {}
for af_index, af_runs in enumerate(carbon_preds):
    af = afs[af_index]
    # One row per run, one column per prediction dict in that run's first element.
    rank_matrix = np.array([
        [rank_of_atom(y_pt, atom_of_interest) for y_pt in run[0]]
        for run in af_runs
    ])
    af_mean_ranks[af] = rank_matrix.mean(axis=0)
    af_std_ranks[af] = rank_matrix.std(axis=0)

In [12]:
# Display label and line colour for each plotted series, kept together as
# pairs so the two lists cannot fall out of step.
series_style = [
    ("Random", "gray"),
    ("AL", "skyblue"),
    ("AL + Small molecules", "tab:blue"),
]
labels = [series_label for series_label, _ in series_style]
colors = [series_color for _, series_color in series_style]

In [13]:
import matplotlib.pyplot as plt

In [None]:
# Plot mean rank (line) +/- one standard deviation (band) of the atom of
# interest for the selected acquisition functions.

# Number of points along the x axis, taken from the "random" curve.
num_acqf = len(af_mean_ranks["random"])
afs_of_interest = ["random", "acqf_1", "acqf_10"]

fig, ax = plt.subplots(figsize=(6, 4))
for i, af in enumerate(afs_of_interest):
    mean_ranks = af_mean_ranks[af]
    std_ranks = af_std_ranks[af]
    ax.plot(mean_ranks, color=colors[i], linewidth=2)
    ax.fill_between(range(num_acqf), mean_ranks - std_ranks, mean_ranks + std_ranks,
                    color=colors[i], alpha=0.2)

# Put rank 1 at the top. invert_yaxis() toggles the axis direction, so it must
# be called exactly once; inside the loop the result depended on how many
# series were plotted (an even count would undo the inversion).
ax.invert_yaxis()
#plt.legend(labels)
fig.tight_layout()
# Save before showing so the file is written from the fully laid-out figure
# regardless of how the backend handles figures after show().
fig.savefig("rank_of_atom_fig_5.png", dpi=600)
plt.show()