In [3]:
import pandas as pd

REP_COLNAMES = ["smiles", "deepsmiles", "selfies", "inchi", "iupac"]

task_to_df = {
    "bbbp": pd.read_pickle('/content/bbbp_test_fs_final.pkl'),
    "bace": pd.read_pickle('/content/bace_test_fs_final.pkl'),
    "esol": pd.read_pickle('/content/esol_test_fs_final.pkl'),
    "freesolv": pd.read_pickle('/content/freesolv_test_fs_final.pkl'),
    "clintox": pd.read_pickle('/content/clintox_test_fs_final.pkl')
}

### Llama 3.1

In [10]:
REP_NAMES = ['smiles', 'deepsmiles', 'selfies', 'inchi', 'iupac']
COL_NAMES = ['bbbp', 'bace', 'clintox', 'esol', 'freesolv']

In [84]:
import json

with open("/content/id_to_pred_llama-3.1-405b-instruct_fewshot.json", 'r') as f:
  data = json.load(f)


def index(s):
  return int(s.split("_")[-1])

responses = {}

for task_name in COL_NAMES:
  responses[task_name] = {}

  for colname in REP_NAMES:
    responses[task_name][colname] = [data[k] for k in list(sorted(data.keys(), key = index)) if (k.split("_")[0] == task_name) and (k.split("_")[1] == colname)]

In [85]:
responses.keys()

dict_keys(['bbbp', 'bace', 'clintox', 'esol', 'freesolv'])

In [115]:
for colname in REP_NAMES:
  task_to_df['bbbp'][f'{colname}_response'] = responses['bbbp'][colname]
  task_to_df['bace'][f'{colname}_response'] = responses['bace'][colname]
  task_to_df['clintox'][f'{colname}_response'] = responses['clintox'][colname]
  task_to_df['esol'][f'{colname}_response'] = responses['esol'][colname]
  task_to_df['freesolv'][f'{colname}_response'] = responses['freesolv'][colname]

In [116]:
def extract_boolean_answer(s):
  if "yes" in s.lower().split("decision:")[-1]:
    return True
  elif "no" in s.lower().split("decision:")[-1]:
    return False
  else:
    return False


import re
def extract_float_answer(s):

    if "decision:" in s.lower():
      s = s.lower().split('decision:')[-1]
    elif "decision" in s.lower():
      s = s.lower().split('decision')[-1]

    float_pattern = r'(-?\d+\.?\d*)'

    match = re.search(float_pattern, s)

    if match:
        return float(match.group())
    else:
       return 0

for df in list(task_to_df.values())[:2]: # Classification tasks
  for colname in REP_NAMES:
    df[f'{colname}_pred'] = df[f'{colname}_response'].apply(extract_boolean_answer)

for df in list(task_to_df.values())[-1:]: # Classification tasks
  for colname in REP_NAMES:
    df[f'{colname}_pred'] = df[f'{colname}_response'].apply(extract_boolean_answer)

for df in list(task_to_df.values())[2:4]: # Regression tasks
  for colname in REP_NAMES:
    df[f'{colname}_pred'] = df[f'{colname}_response'].apply(extract_float_answer)

In [103]:
task_to_df.keys()

dict_keys(['bbbp', 'bace', 'esol', 'freesolv', 'clintox'])

In [105]:
task_to_df['esol'].head()

Unnamed: 0,Compound ID,ESOL predicted log solubility in mols per litre,Minimum Degree,Molecular Weight,Number of H-Bond Donors,Number of Rings,Number of Rotatable Bonds,Polar Surface Area,measured log solubility in mols per litre,smiles,...,selfies_response,inchi_response,iupac_response,smiles_pred,deepsmiles_pred,selfies_pred,inchi_pred,iupac_pred,majority_vote_5,majority_vote_3
0,Dieldrin,-4.533,1,380.913,0,5,0,12.53,-6.29,ClC4=C(Cl)C5(Cl)C3C1CC(C2OC12)C3C4(Cl)C5(Cl)Cl,...,Reasoning:\n\nThe molecule presented is a comp...,"The given molecule, C12H8Cl6O, is similar in s...","Reasoning:\nThe given molecule, 3,4,5,6,13,13-...",-6.5,-6.5,-4.0,-5.9,-6.2,-6.78,-7.733333
1,Valeraldehyde,-1.103,1,86.134,0,0,3,17.07,-0.85,CCCCC=O,...,Reasoning:\n\nThe given examples suggest a rel...,"The given molecule, C5H10O (InChI=1S/C5H10O/c1...",Reasoning:\nThe provided examples suggest a re...,-1.3,-1.3,-0.6,-0.7,-0.66,-1.76,-1.5
2,4-Pentene-1-ol,-0.791,1,86.134,1,0,3,20.23,-0.15,OCCCC=C,...,"Reasoning:\n\nThe molecule provided, simplifie...","The given molecule, C5H10O (InChI=1S/C5H10O/c1...",Reasoning:\nThe given data shows a trend of de...,-1.5,-0.2,-2.5,-1.5,-1.5,-1.1,-0.5
3,brompyrazone,-3.005,1,266.098,1,2,1,60.91,-3.127,c1ccccc1n2ncc(N)c(Br)c2(=O),...,The molecule contains a large aromatic ring sy...,The given molecule is very similar to the firs...,The provided examples suggest a structure-acti...,-3.0,-2.5,-3.5,-3.0,-3.0,-3.62,-3.566667
4,p-Aminophenol,-1.231,1,109.128,2,1,0,46.25,-0.8,Nc1ccc(O)cc1,...,The provided examples suggest a relationship b...,The molecule is 2-aminophenol (C6H7NO). It has...,Reasoning:\nThe given data suggests a relation...,-1.0,-0.8,-1.0,-0.85,-0.9,-1.49,-1.15


In [117]:
for df in list(task_to_df.values())[:2]: # Classification tasks
  for colname in REP_NAMES:
    df['majority_vote_5'] = df[['smiles_pred', 'deepsmiles_pred', 'selfies_pred', 'inchi_pred', 'iupac_pred']].mode(axis=1)[0]
    df['majority_vote_3'] = df[['smiles_pred', 'inchi_pred', 'iupac_pred']].mode(axis=1)[0]

for df in list(task_to_df.values())[-1:]: # Classification tasks
  for colname in REP_NAMES:
    df['majority_vote_5'] = df[['smiles_pred', 'deepsmiles_pred', 'selfies_pred', 'inchi_pred', 'iupac_pred']].mode(axis=1)[0]
    df['majority_vote_3'] = df[['smiles_pred', 'inchi_pred', 'iupac_pred']].mode(axis=1)[0]

for df in list(task_to_df.values())[2:4]: # Regression tasks
  for colname in REP_NAMES:
    df['majority_vote_5'] = df[['smiles_pred', 'deepsmiles_pred', 'selfies_pred', 'inchi_pred', 'iupac_pred']].mean(axis=1)
    df['majority_vote_3'] = df[['smiles_pred', 'inchi_pred', 'iupac_pred']].mean(axis=1)

In [95]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import root_mean_squared_error


task_to_label_column = {
    "bbbp" : 'p_np',
    "bace" : 'Class',
    "clintox" : 'CT_TOX',
    "esol" : 'measured log solubility in mols per litre',
    "freesolv" : 'expt'
}

print("Llama 3.1")
print("Classification tasks. Scores reported as ROC-AUC.")

for task, df in list(task_to_df.items())[:2]:
  print(f"----{task}----")
  for colname in ['smiles_pred', 'iupac_pred', 'inchi_pred', 'majority_vote_5', 'majority_vote_3']:
    print(f"{colname}: {round(100 * roc_auc_score(df[task_to_label_column[task]], df[colname]), 1)}")
  print()

for task, df in list(task_to_df.items())[-1:]:
  print(f"----{task}----")
  for colname in ['smiles_pred', 'iupac_pred', 'inchi_pred', 'majority_vote_5', 'majority_vote_3']:
    print(f"{colname}: {round(100 * roc_auc_score(df[task_to_label_column[task]], df[colname]), 1)}")
  print()


print("Regression tasks. Scores reported as RMSE.")

for task, df in list(task_to_df.items())[2:4]:
  print(f"----{task}----")
  for colname in ['smiles_pred', 'iupac_pred', 'inchi_pred', 'majority_vote_5', 'majority_vote_3']:
    print(f"{colname}: {round(root_mean_squared_error(df[task_to_label_column[task]], df[colname]), 2)}")
  print()


Llama 3.1
Classification tasks. Scores reported as ROC-AUC.
----bbbp----
smiles_pred: 85.4
iupac_pred: 83.3
inchi_pred: 84.4
majority_vote_5: 86.1
majority_vote_3: 85.0

----bace----
smiles_pred: 74.9
iupac_pred: 68.7
inchi_pred: 79.7
majority_vote_5: 79.7
majority_vote_3: 78.1

----clintox----
smiles_pred: 59.6
iupac_pred: 60.8
inchi_pred: 58.5
majority_vote_5: 59.6
majority_vote_3: 59.6

Regression tasks. Scores reported as RMSE.
----esol----
smiles_pred: 1.41
iupac_pred: 1.05
inchi_pred: 1.02
majority_vote_5: 1.12
majority_vote_3: 1.01

----freesolv----
smiles_pred: 4.67
iupac_pred: 4.08
inchi_pred: 2.86
majority_vote_5: 3.18
majority_vote_3: 3.23



### Gemini

In [100]:
import json

with open("/content/fewshot_response_files/id_to_pred_gemini-1.5-pro-002_fewshot_final.json", 'r') as f:
  data = json.load(f)


def index(s):
  return int(s.split("_")[-1])

responses = {}

for task_name in COL_NAMES:
  responses[task_name] = {}

  for colname in REP_NAMES:
    responses[task_name][colname] = [data[k] for k in list(sorted(data.keys(), key = index)) if (k.split("_")[0] == task_name) and (k.split("_")[1] == colname)]

In [107]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import root_mean_squared_error


task_to_label_column = {
    "bbbp" : 'p_np',
    "bace" : 'Class',
    "clintox" : 'CT_TOX',
    "esol" : 'measured log solubility in mols per litre',
    "freesolv" : 'expt'
}

print("Gemini 1.5")
print("Classification tasks. Scores reported as ROC-AUC.")

for task, df in list(task_to_df.items())[:2]:
  print(f"----{task}----")
  for colname in ['smiles_pred', 'iupac_pred', 'inchi_pred', 'majority_vote_5', 'majority_vote_3']:
    print(f"{colname}: {round(100 * roc_auc_score(df[task_to_label_column[task]], df[colname]), 1)}")
  print()

for task, df in list(task_to_df.items())[-1:]:
  print(f"----{task}----")
  for colname in ['smiles_pred', 'iupac_pred', 'inchi_pred', 'majority_vote_5', 'majority_vote_3']:
    print(f"{colname}: {round(100 * roc_auc_score(df[task_to_label_column[task]], df[colname]), 1)}")
  print()


print("Regression tasks. Scores reported as RMSE.")

for task, df in list(task_to_df.items())[2:4]:
  print(f"----{task}----")
  for colname in ['smiles_pred', 'iupac_pred', 'inchi_pred', 'majority_vote_5', 'majority_vote_3']:
    print(f"{colname}: {round(root_mean_squared_error(df[task_to_label_column[task]], df[colname]), 2)}")
  print()


Gemini 1.5
Classification tasks. Scores reported as ROC-AUC.
----bbbp----
smiles_pred: 68.1
iupac_pred: 77.3
inchi_pred: 74.3
majority_vote_5: 69.1
majority_vote_3: 76.8

----bace----
smiles_pred: 69.2
iupac_pred: 73.1
inchi_pred: 74.7
majority_vote_5: 74.6
majority_vote_3: 74.0

----clintox----
smiles_pred: 68.8
iupac_pred: 51.2
inchi_pred: 60.4
majority_vote_5: 62.7
majority_vote_3: 58.1

Regression tasks. Scores reported as RMSE.
----esol----
smiles_pred: 1.0
iupac_pred: 0.9
inchi_pred: 0.92
majority_vote_5: 0.82
majority_vote_3: 0.84

----freesolv----
smiles_pred: 2.2
iupac_pred: 2.23
inchi_pred: 2.1
majority_vote_5: 1.82
majority_vote_3: 1.94



### Mistral

In [108]:
import json

with open("/content/fewshot_response_files/id_to_pred_mistral-large_fewshot.json", 'r') as f:
  data = json.load(f)


def index(s):
  return int(s.split("_")[-1])

responses = {}

for task_name in COL_NAMES:
  responses[task_name] = {}

  for colname in REP_NAMES:
    responses[task_name][colname] = [data[k] for k in list(sorted(data.keys(), key = index)) if (k.split("_")[0] == task_name) and (k.split("_")[1] == colname)]

In [112]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import root_mean_squared_error


task_to_label_column = {
    "bbbp" : 'p_np',
    "bace" : 'Class',
    "clintox" : 'CT_TOX',
    "esol" : 'measured log solubility in mols per litre',
    "freesolv" : 'expt'
}

print("Mistral Large 2")
print("Classification tasks. Scores reported as ROC-AUC.")

for task, df in list(task_to_df.items())[:2]:
  print(f"----{task}----")
  for colname in ['smiles_pred', 'iupac_pred', 'inchi_pred', 'majority_vote_5', 'majority_vote_3']:
    print(f"{colname}: {round(100 * roc_auc_score(df[task_to_label_column[task]], df[colname]), 1)}")
  print()

for task, df in list(task_to_df.items())[-1:]:
  print(f"----{task}----")
  for colname in ['smiles_pred', 'iupac_pred', 'inchi_pred', 'majority_vote_5', 'majority_vote_3']:
    print(f"{colname}: {round(100 * roc_auc_score(df[task_to_label_column[task]], df[colname]), 1)}")
  print()


print("Regression tasks. Scores reported as RMSE.")

for task, df in list(task_to_df.items())[2:4]:
  print(f"----{task}----")
  for colname in ['smiles_pred', 'iupac_pred', 'inchi_pred', 'majority_vote_5', 'majority_vote_3']:
    print(f"{colname}: {round(root_mean_squared_error(df[task_to_label_column[task]], df[colname]), 2)}")
  print()


Mistral Large 2
Classification tasks. Scores reported as ROC-AUC.
----bbbp----
smiles_pred: 74.1
iupac_pred: 82.2
inchi_pred: 79.2
majority_vote_5: 77.0
majority_vote_3: 80.8

----bace----
smiles_pred: 61.8
iupac_pred: 76.4
inchi_pred: 74.0
majority_vote_5: 78.3
majority_vote_3: 76.0

----clintox----
smiles_pred: 50.4
iupac_pred: 48.5
inchi_pred: 51.5
majority_vote_5: 59.2
majority_vote_3: 49.2

Regression tasks. Scores reported as RMSE.
----esol----
smiles_pred: 1.56
iupac_pred: 1.25
inchi_pred: 1.12
majority_vote_5: 1.22
majority_vote_3: 1.04

----freesolv----
smiles_pred: 3.81
iupac_pred: 1.43
inchi_pred: 2.07
majority_vote_5: 1.68
majority_vote_3: 1.94



In [113]:
task_to_df['freesolv'].head()

Unnamed: 0,iupac,smiles,expt,calc,deepsmiles,inchi,selfies,fewshot,smiles_response,deepsmiles_response,selfies_response,inchi_response,iupac_response,smiles_pred,deepsmiles_pred,selfies_pred,inchi_pred,iupac_pred,majority_vote_5,majority_vote_3
0,2-methylpentan-2-ol,CCCC(C)(C)O,-3.92,-2.779,CCCCC)C)O,"InChI=1S/C6H14O/c1-4-5-6(2,3)7/h7H,4-5H2,1-3H3",[C][C][C][C][Branch1][C][C][Branch1][C][C][O],"[(0.5333333333333333, [2-methylbutan-2-ol, CCC...",To predict the hydration free energy of the mo...,To predict the hydration free energy of the mo...,To predict the hydration free energy of the mo...,To predict the hydration free energy of the mo...,To predict the hydration free energy of 2-meth...,-5.5,-4.45,-4.5,-4.35,-4.35,-4.63,-4.733333
1,diethoxyphosphinothioylsulfanylmethylsulfanyl-...,CCOP(=S)(OCC)SCSP(=S)(OCC)OCC,-6.1,-10.644,CCOP=S)OCC)))SCSP=S)OCC)))OCC,"InChI=1S/C9H22O4P2S4/c1-5-10-14(16,11-6-2)18-9...",[C][C][O][P][=Branch1][C][=S][Branch1][Ring2][...,"[(0.34782608695652173, [triethyl phosphate, CC...",To predict the hydration free energy of the gi...,To predict the hydration free energy of the gi...,To predict the hydration free energy of the gi...,To predict the hydration free energy of the gi...,To predict the hydration free energy of dietho...,-15.0,-6.5,-7.0,-7.8,-7.2,-8.7,-10.0
2,"penta-1,4-diene",C=CCC=C,0.93,2.357,C=CCC=C,"InChI=1S/C5H8/c1-3-5-4-2/h3-4H,1-2,5H2",[C][=C][C][C][=C],"[(0.45454545454545453, [hexa-1,5-diene, C=CCCC...",To predict the hydration free energy of the mo...,To predict the hydration free energy of the mo...,To predict the hydration free energy of the mo...,To predict the hydration free energy of the mo...,To predict the hydration free energy of penta-...,2.2,1.2,1.5,1.5,1.25,1.53,1.65
3,2-hydroxybenzaldehyde,c1ccc(c(c1)C=O)O,-4.68,-8.809,cccccc6)C=O)))O,"InChI=1S/C7H6O2/c8-5-6-3-1-2-4-7(6)9/h1-5,9H",[C][=C][C][=C][Branch1][=Branch2][C][=Branch1]...,"[(0.375, [4-hydroxybenzaldehyde, c1cc(ccc1C=O)...",To predict the hydration free energy of the mo...,To predict the hydration free energy of the mo...,To predict the hydration free energy of the gi...,To predict the hydration free energy of the mo...,To predict the hydration free energy of 2-hydr...,-7.5,-8.5,-6.5,-8.5,-8.5,-7.9,-8.166667
4,"6-chloro-2-N,4-N-diethyl-1,3,5-triazine-2,4-di...",CCNc1nc(nc(n1)Cl)NCC,-10.22,-10.914,CCNcncncn6)Cl)))NCC,InChI=1S/C7H12ClN5/c1-3-9-6-11-5(8)12-7(13-6)1...,[C][C][N][C][=N][C][=Branch1][=Branch2][=N][C]...,"[(0.4375, [2-N-tert-butyl-4-N-ethyl-6-methylsu...",To predict the hydration free energy of the mo...,To predict the hydration free energy of the mo...,To predict the hydration free energy of the gi...,To predict the hydration free energy of the mo...,To predict the hydration free energy of 6-chlo...,-7.0,-6.5,-7.5,-7.0,-7.0,-7.0,-7.0


# GPT-4o

In [114]:
import json

with open("/content/fewshot_response_files/id_to_pred_gpt-4o_fewshot.json", 'r') as f:
  data = json.load(f)


def index(s):
  return int(s.split("_")[-1])

responses = {}

for task_name in COL_NAMES:
  responses[task_name] = {}

  for colname in REP_NAMES:
    responses[task_name][colname] = [data[k] for k in list(sorted(data.keys(), key = index)) if (k.split("_")[0] == task_name) and (k.split("_")[1] == colname)]

In [118]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import root_mean_squared_error


task_to_label_column = {
    "bbbp" : 'p_np',
    "bace" : 'Class',
    "clintox" : 'CT_TOX',
    "esol" : 'measured log solubility in mols per litre',
    "freesolv" : 'expt'
}

print("GPT-4o")
print("Classification tasks. Scores reported as ROC-AUC.")

for task, df in list(task_to_df.items())[:2]:
  print(f"----{task}----")
  for colname in ['smiles_pred', 'iupac_pred', 'inchi_pred', 'majority_vote_5', 'majority_vote_3']:
    print(f"{colname}: {round(100 * roc_auc_score(df[task_to_label_column[task]], df[colname]), 1)}")
  print()

for task, df in list(task_to_df.items())[-1:]:
  print(f"----{task}----")
  for colname in ['smiles_pred', 'iupac_pred', 'inchi_pred', 'majority_vote_5', 'majority_vote_3']:
    print(f"{colname}: {round(100 * roc_auc_score(df[task_to_label_column[task]], df[colname]), 1)}")
  print()


print("Regression tasks. Scores reported as RMSE.")

for task, df in list(task_to_df.items())[2:4]:
  print(f"----{task}----")
  for colname in ['smiles_pred', 'iupac_pred', 'inchi_pred', 'majority_vote_5', 'majority_vote_3']:
    print(f"{colname}: {round(root_mean_squared_error(df[task_to_label_column[task]], df[colname]), 2)}")
  print()


GPT-4o
Classification tasks. Scores reported as ROC-AUC.
----bbbp----
smiles_pred: 80.8
iupac_pred: 81.3
inchi_pred: 77.7
majority_vote_5: 76.8
majority_vote_3: 79.9

----bace----
smiles_pred: 75.9
iupac_pred: 78.8
inchi_pred: 73.6
majority_vote_5: 78.9
majority_vote_3: 76.9

----clintox----
smiles_pred: 58.1
iupac_pred: 56.9
inchi_pred: 54.6
majority_vote_5: 66.9
majority_vote_3: 57.7

Regression tasks. Scores reported as RMSE.
----esol----
smiles_pred: 0.98
iupac_pred: 0.75
inchi_pred: 0.79
majority_vote_5: 0.82
majority_vote_3: 0.77

----freesolv----
smiles_pred: 1.62
iupac_pred: 1.54
inchi_pred: 1.65
majority_vote_5: 1.52
majority_vote_3: 1.55

