In [1]:
%%capture
!pip install deepsig

In [2]:
import pandas as pd

task_to_df = {
    "bbbp": pd.read_pickle('/content/bbbp_test_fs_final.pkl'),
    "bace": pd.read_pickle('/content/bace_test_fs_final.pkl'),
    "esol": pd.read_pickle('/content/esol_test_fs_final.pkl'),
    "freesolv": pd.read_pickle('/content/freesolv_test_fs_final.pkl'),
    "clintox": pd.read_pickle('/content/clintox_test_fs_final.pkl')
}

In [3]:
import json

def index(s):
  return int(s.split("_")[-1])


file_paths = ["/content/id_to_pred_mistral-large_fewshot.json", "/content/id_to_pred_llama-3.1-405b-instruct_fewshot.json", "/content/id_to_pred_gpt-4o_fewshot.json", "/content/id_to_pred_gemini-1.5-pro-002_fewshot_final.json"]
responses = {}

for file_path in file_paths:
  with open(file_path, 'r') as f:
    data = json.load(f)
    responses[file_path] = {}
    for task_name in ['bbbp', 'bace', 'clintox', 'esol', 'freesolv']:
      responses[file_path][task_name] = {}
      for colname in ['smiles', 'deepsmiles', 'selfies', 'inchi', 'iupac']:
        responses[file_path][task_name][colname] = [data[k] for k in list(sorted(data.keys(), key = index)) if (k.split("_")[0] == task_name) and (k.split("_")[1] == colname)]

In [4]:
for file_path in file_paths:
  for colname in ['smiles', 'deepsmiles', 'selfies', 'inchi', 'iupac']:
    task_to_df['bbbp'][f'{file_path}_{colname}_response'] = responses[file_path]['bbbp'][colname]
    task_to_df['bace'][f'{file_path}_{colname}_response'] = responses[file_path]['bace'][colname]
    task_to_df['clintox'][f'{file_path}_{colname}_response'] = responses[file_path]['clintox'][colname]
    task_to_df['esol'][f'{file_path}_{colname}_response'] = responses[file_path]['esol'][colname]
    task_to_df['freesolv'][f'{file_path}_{colname}_response'] = responses[file_path]['freesolv'][colname]

In [5]:
def extract_boolean_answer(s):
  if "yes" in s.lower().split("decision:")[-1]:
    return True
  elif "no" in s.lower().split("decision:")[-1]:
    return False
  else:
    return False


import re
def extract_float_answer(s):

    if "decision:" in s.lower():
      s = s.lower().split('decision:')[-1]
    elif "decision" in s.lower():
      s = s.lower().split('decision')[-1]

    float_pattern = r'(-?\d+\.?\d*)'

    match = re.search(float_pattern, s)

    if match:
        return float(match.group())
    else:
       return 0


for file_path in file_paths:
  for df in list(task_to_df.values())[:2]: # Classification tasks
    for colname in ['smiles', 'deepsmiles', 'selfies', 'inchi', 'iupac']:
      df[f'{file_path}_{colname}_pred'] = df[f'{file_path}_{colname}_response'].apply(extract_boolean_answer)

for file_path in file_paths:
  for df in list(task_to_df.values())[-1:]: # Classification tasks
    for colname in ['smiles', 'deepsmiles', 'selfies', 'inchi', 'iupac']:
      df[f'{file_path}_{colname}_pred'] = df[f'{file_path}_{colname}_response'].apply(extract_boolean_answer)

for file_path in file_paths:
  for df in list(task_to_df.values())[2:4]: # Regression tasks
    for colname in ['smiles', 'deepsmiles', 'selfies', 'inchi', 'iupac']:
      df[f'{file_path}_{colname}_pred'] = df[f'{file_path}_{colname}_response'].apply(extract_float_answer)


In [None]:
list(task_to_df.values())[:2]

In [6]:
# Score each example with accuracy

task_to_label_colname = {
    "bbbp" : "p_np",
    "bace" : "Class",
    "clintox" : "CT_TOX"
}

for file_path in file_paths:
  for k, df in list(task_to_df.items())[:2]: # Classification tasks
    for colname in ['smiles', 'deepsmiles', 'selfies', 'inchi', 'iupac']:
      df[f'{file_path}_{colname}_score'] = df[f'{file_path}_{colname}_pred'] == df[task_to_label_colname[k]]

for file_path in file_paths:
  for k, df in list(task_to_df.items())[-1:]: # Classification tasks
    for colname in ['smiles', 'deepsmiles', 'selfies', 'inchi', 'iupac']:
      df[f'{file_path}_{colname}_score'] = df[f'{file_path}_{colname}_pred'] == df[task_to_label_colname[k]]

In [7]:
# Score each regression example with negative absolute error

task_to_label_colname = {
    "bbbp": "p_np",
    "bace": "Class",
    "esol": "measured log solubility in mols per litre",
    "freesolv": "expt",
    "clintox": "CT_TOX"
}

for file_path in file_paths:
  for k, df in list(task_to_df.items())[2:4]: # Regression tasks
    for colname in ['smiles', 'deepsmiles', 'selfies', 'inchi', 'iupac']:
      df[f'{file_path}_{colname}_score'] = abs(df[f'{file_path}_{colname}_pred'] - df[task_to_label_colname[k]]) * -1

In [None]:
task_to_df['esol'].head()

Unnamed: 0,Compound ID,ESOL predicted log solubility in mols per litre,Minimum Degree,Molecular Weight,Number of H-Bond Donors,Number of Rings,Number of Rotatable Bonds,Polar Surface Area,measured log solubility in mols per litre,smiles,...,/content/id_to_pred_gpt-4o.json_smiles_score,/content/id_to_pred_gpt-4o.json_deepsmiles_score,/content/id_to_pred_gpt-4o.json_selfies_score,/content/id_to_pred_gpt-4o.json_inchi_score,/content/id_to_pred_gpt-4o.json_iupac_score,/content/id_to_pred_gemini-1.5-pro-002_final.json_smiles_score,/content/id_to_pred_gemini-1.5-pro-002_final.json_deepsmiles_score,/content/id_to_pred_gemini-1.5-pro-002_final.json_selfies_score,/content/id_to_pred_gemini-1.5-pro-002_final.json_inchi_score,/content/id_to_pred_gemini-1.5-pro-002_final.json_iupac_score
0,Dieldrin,-4.533,1,380.913,0,5,0,12.53,-6.29,ClC4=C(Cl)C5(Cl)C3C1CC(C2OC12)C3C4(Cl)C5(Cl)Cl,...,-0.29,-1.79,-2.79,-1.29,-1.29,-1.79,-6.29,-1.79,-0.71,-1.29
1,Valeraldehyde,-1.103,1,86.134,0,0,3,17.07,-0.85,CCCCC=O,...,-0.95,-0.35,-0.95,-0.15,-0.05,-1.15,-0.15,-0.65,-0.15,-0.05
2,4-Pentene-1-ol,-0.791,1,86.134,1,0,3,20.23,-0.15,OCCCC=C,...,-1.35,-0.75,-1.35,-1.2,-1.05,-2.75,-3.35,-3.65,-1.35,-0.45
3,brompyrazone,-3.005,1,266.098,1,2,1,60.91,-3.127,c1ccccc1n2ncc(N)c(Br)c2(=O),...,-0.627,-0.373,-0.373,-3.627,-0.627,-0.627,-0.373,-0.127,-0.127,-0.127
4,p-Aminophenol,-1.231,1,109.128,2,1,0,46.25,-0.8,Nc1ccc(O)cc1,...,-1.3,-0.7,-1.7,-0.8,-1.5,-1.3,-0.3,-2.2,-1.3,-1.3


# ASO Significance Testing

In [8]:
from itertools import product

import numpy as np
from deepsig import multi_aso

seed = 1234
np.random.seed(seed)

scores_by_representation = [list(np.concatenate(
    [task_to_df[task_name][f'{file_path}_{colname}_score'].astype(int) for file_path in file_paths for task_name in
     ['bbbp', 'bace', 'clintox', 'esol', 'freesolv']])) for colname in
                            ['smiles', 'deepsmiles', 'selfies', 'inchi', 'iupac']]

In [9]:
scores_smiles = scores_by_representation[0]
scores_deepsmiles = scores_by_representation[1]
scores_selfies = scores_by_representation[2]
scores_inchi = scores_by_representation[3]
scores_iupac = scores_by_representation[4]

In [10]:
from deepsig import aso

eps_min_deepsmiles = aso(scores_deepsmiles, scores_smiles, confidence_level=0.95, seed=seed, num_comparisons=4, show_progress=True)
eps_min_selfies = aso(scores_selfies, scores_smiles, confidence_level=0.95, seed=seed, num_comparisons=4, show_progress=True)
eps_min_inchi = aso(scores_inchi, scores_smiles, confidence_level=0.95, seed=seed, num_comparisons=4, show_progress=True)
eps_min_iupac = aso(scores_iupac, scores_smiles, confidence_level=0.95, seed=seed, num_comparisons=4, show_progress=True)

Bootstrap iterations: 100%|█████████▉| 999/1000 [04:43<00:00,  3.52it/s]
Bootstrap iterations: 100%|█████████▉| 999/1000 [04:37<00:00,  3.60it/s]
Bootstrap iterations: 100%|█████████▉| 999/1000 [04:29<00:00,  3.71it/s]
Bootstrap iterations: 100%|█████████▉| 999/1000 [04:32<00:00,  3.67it/s]


In [11]:
print(f"Eps min DeepSMILES: {eps_min_deepsmiles}\nEps min SELFIES: {eps_min_selfies}\nEps min InChI: {eps_min_inchi}\nEps min IUPAC: {eps_min_iupac}")

Eps min DeepSMILES: 0.9059049909174397
Eps min SELFIES: 1.0
Eps min InChI: 0.3552702939103346
Eps min IUPAC: 0.1618812351576461
