In [19]:
import pandas as pd
from LocalTemplate.template_decoder import *
import pickle
dataset = 'USPTO_50K'
test_file = pd.read_csv('data/%s/raw_test.csv' % dataset)

In [13]:
rxn_ps = [rxn.split('>>')[1] for rxn in test_file['reactants>reagents>production']]

In [14]:
ground_truth = [demap(Chem.MolFromSmiles(rxn.split('>>')[0])) for rxn in test_file['reactants>reagents>production']]
ground_truth_MaxFrag = [get_MaxFrag(g) for g in ground_truth]

In [53]:
class_given = False

result_dir = 'outputs/decoded_prediction' 
if class_given:
    result_dir += '_class'

result_file = '%s/LocalRetro_%s.txt' % (result_dir, dataset)

results = {}       
results_MaxFrag = {}
with open(result_file, 'r') as f:
    for i, line in enumerate(f.readlines()):
        line = line.split('\n')[0]
        i = int(line.split('\t')[0])
        predictions = line.split('\t')[1:]
        MaxFrags = []
        results[i] = [eval(p)[0] for p in predictions]
        for p in results[i]:
            if p not in MaxFrags:
                MaxFrags.append(get_MaxFrag(p))
        results_MaxFrag[i] = MaxFrags

In [22]:
with open(result_file, 'r') as f:
    for i, line in enumerate(f.readlines()):
        line = line.split('\n')[0]
        i = int(line.split('\t')[0])
        predictions = line.split('\t')[1:]
        break

In [5]:
stereo_aware = False

Exact_matches = []
MaxFrag_matches = [] # Only compares the largest reactant fragment

Exact_matches_multi = []
MaxFrag_matches_multi = [] 
for i in range(len(results)):
    if stereo_aware:
        metric = isomer_match
    else:
        metric = exact_match
    match_exact = metric(results[i], ground_truth[i])
    match_maxfrag = metric(results_MaxFrag[i], ground_truth_MaxFrag[i])
    if len(rxn_ps[i].split('.')) > 1:
        Exact_matches_multi.append(match_exact)
        MaxFrag_matches_multi.append(match_maxfrag)
    Exact_matches.append(match_exact)
    MaxFrag_matches.append(match_maxfrag)
    if i % 100 == 0:
        print ('\rCalculating accuracy... %s/%s' % (i, len(results)), end='', flush=True)

Calculating accuracy... 5000/5007

In [6]:
# Stereo-unaware
ks = [1, 3, 5, 10, 50]
exact_k = {k:0 for k in ks}
MaxFrag_k = {k:0 for k in ks}

print(len(Exact_matches))
for i in range(len(Exact_matches)):
    for k in ks:
        if Exact_matches[i] <= k and Exact_matches[i] != -1:
            exact_k[k] += 1
        if MaxFrag_matches[i] <= k and MaxFrag_matches[i] != -1:
            MaxFrag_k[k] += 1

for k in ks:
    print ('Top-%d Exact accuracy: %.3f, MaxFrag accuracy: %.3f' % (k, exact_k[k]/len(Exact_matches), MaxFrag_k[k]/len(MaxFrag_matches)))

5007
Top-1 Exact accuracy: 0.549, MaxFrag accuracy: 0.593
Top-3 Exact accuracy: 0.772, MaxFrag accuracy: 0.811
Top-5 Exact accuracy: 0.845, MaxFrag accuracy: 0.874
Top-10 Exact accuracy: 0.903, MaxFrag accuracy: 0.924
Top-50 Exact accuracy: 0.959, MaxFrag accuracy: 0.967


In [7]:
stereo_aware = True

Exact_matches = []
MaxFrag_matches = [] # Only compares the largest reactant fragment

Exact_matches_multi = []
MaxFrag_matches_multi = [] 
for i in range(len(results)):
    if stereo_aware:
        metric = isomer_match
    else:
        metric = exact_match
    match_exact = metric(results[i], ground_truth[i])
    match_maxfrag = metric(results_MaxFrag[i], ground_truth_MaxFrag[i])
    if len(rxn_ps[i].split('.')) > 1:
        Exact_matches_multi.append(match_exact)
        MaxFrag_matches_multi.append(match_maxfrag)
    Exact_matches.append(match_exact)
    MaxFrag_matches.append(match_maxfrag)
    if i % 100 == 0:
        print ('\rCalculating accuracy... %s/%s' % (i, len(results)), end='', flush=True)

Calculating accuracy... 0/5007

Calculating accuracy... 5000/5007

In [8]:
# Stereo-aware
ks = [1, 3, 5, 10, 50]
exact_k = {k:0 for k in ks}
MaxFrag_k = {k:0 for k in ks}

print(len(Exact_matches))
for i in range(len(Exact_matches)):
    for k in ks:
        if Exact_matches[i] <= k and Exact_matches[i] != -1:
            exact_k[k] += 1
        if MaxFrag_matches[i] <= k and MaxFrag_matches[i] != -1:
            MaxFrag_k[k] += 1

for k in ks:
    print ('Top-%d Exact accuracy: %.3f, MaxFrag accuracy: %.3f' % (k, exact_k[k]/len(Exact_matches), MaxFrag_k[k]/len(MaxFrag_matches)))

5007
Top-1 Exact accuracy: 0.561, MaxFrag accuracy: 0.602
Top-3 Exact accuracy: 0.789, MaxFrag accuracy: 0.825
Top-5 Exact accuracy: 0.864, MaxFrag accuracy: 0.889
Top-10 Exact accuracy: 0.923, MaxFrag accuracy: 0.940
Top-50 Exact accuracy: 0.981, MaxFrag accuracy: 0.986


### On LLM results

In [35]:
import re

In [71]:
llm_output = pickle.load(open('/root/Projects/LLM4Retro/finetuned_llm_uspto_50k_test.pkl', 'rb'))
len(llm_output)

1000

In [72]:
ion_pattern = r"^\[[^\]]+[+-]\]$"

In [73]:
llm_results = {}       
llm_results_MaxFrag = {}
for i, predictions in enumerate(llm_output):
    MaxFrags = []
    llm_results[i] = ['.'.join([tmp for tmp in p if not bool(re.match(ion_pattern, tmp))]) for p in predictions] # remove ions/reagents
    for p in llm_results[i]:
        if p not in MaxFrags:
            MaxFrags.append(get_MaxFrag(p))
    llm_results_MaxFrag[i] = MaxFrags

In [74]:
llm_output[2]

[['CCOC(=O)c1nn(-c2ccc(Cl)cc2Cl)c(-c2ccc(OC)cc2)c1CO',
  'Cc1ccccc1',
  'O=C([O-])O',
  'O=P(Br)(Br)Br',
  '[Na+]'],
 ['CCOC(=O)c1nn(-c2ccc(Cl)cc2Cl)c(-c2ccc(OC)cc2)c1CO',
  'ClC(Cl)(Cl)Cl',
  'O',
  'O=C1CCC(=O)N1Br'],
 ['CCOC(=O)c1nn(-c2ccc(Cl)cc2Cl)c(-c2ccc(OC)cc2)c1C',
  'ClC(Cl)(Cl)Cl',
  'O=C1CCC(=O)N1Br'],
 ['CCOC(=O)c1nn(-c2ccc(Cl)cc2Cl)c(-c2ccc(OC)cc2)c1C',
  'ClC(Cl)(Cl)Cl',
  'O',
  'O=C1CCC(=O)N1Br'],
 ['CCOC(=O)c1nn(-c2ccc(Cl)cc2Cl)c(-c2ccc(OC)cc2)c1CO',
  'ClC(Cl)(Cl)Cl',
  'O=C(OOC(=O)c1ccccc1)c1ccccc1',
  'O=C1CCC(=O)N1Br']]

In [75]:
llm_results[2]

['CCOC(=O)c1nn(-c2ccc(Cl)cc2Cl)c(-c2ccc(OC)cc2)c1CO.Cc1ccccc1.O=C([O-])O.O=P(Br)(Br)Br',
 'CCOC(=O)c1nn(-c2ccc(Cl)cc2Cl)c(-c2ccc(OC)cc2)c1CO.ClC(Cl)(Cl)Cl.O.O=C1CCC(=O)N1Br',
 'CCOC(=O)c1nn(-c2ccc(Cl)cc2Cl)c(-c2ccc(OC)cc2)c1C.ClC(Cl)(Cl)Cl.O=C1CCC(=O)N1Br',
 'CCOC(=O)c1nn(-c2ccc(Cl)cc2Cl)c(-c2ccc(OC)cc2)c1C.ClC(Cl)(Cl)Cl.O.O=C1CCC(=O)N1Br',
 'CCOC(=O)c1nn(-c2ccc(Cl)cc2Cl)c(-c2ccc(OC)cc2)c1CO.ClC(Cl)(Cl)Cl.O=C(OOC(=O)c1ccccc1)c1ccccc1.O=C1CCC(=O)N1Br']

In [76]:
stereo_aware = False

Exact_matches = []
MaxFrag_matches = [] # Only compares the largest reactant fragment

Exact_matches_multi = []
MaxFrag_matches_multi = [] 
for i in range(len(llm_results)):
    if stereo_aware:
        metric = isomer_match
    else:
        metric = exact_match
    match_exact = metric(llm_results[i], ground_truth[i])
    match_maxfrag = metric(llm_results_MaxFrag[i], ground_truth_MaxFrag[i])
    if len(rxn_ps[i].split('.')) > 1:
        Exact_matches_multi.append(match_exact)
        MaxFrag_matches_multi.append(match_maxfrag)
    Exact_matches.append(match_exact)
    MaxFrag_matches.append(match_maxfrag)
    if i % 100 == 0:
        print ('\rCalculating accuracy... %s/%s' % (i, len(llm_results)), end='', flush=True)

Calculating accuracy... 900/1000

In [77]:
ks = [1, 3, 5]
exact_k = {k:0 for k in ks}
MaxFrag_k = {k:0 for k in ks}

print(len(Exact_matches))
for i in range(len(Exact_matches)):
    for k in ks:
        if Exact_matches[i] <= k and Exact_matches[i] != -1:
            exact_k[k] += 1
        if MaxFrag_matches[i] <= k and MaxFrag_matches[i] != -1:
            MaxFrag_k[k] += 1

for k in ks:
    print ('Top-%d Exact accuracy: %.3f, MaxFrag accuracy: %.3f' % (k, exact_k[k]/len(Exact_matches), MaxFrag_k[k]/len(MaxFrag_matches)))

1000
Top-1 Exact accuracy: 0.024, MaxFrag accuracy: 0.195
Top-3 Exact accuracy: 0.043, MaxFrag accuracy: 0.324
Top-5 Exact accuracy: 0.052, MaxFrag accuracy: 0.360


In [78]:
stereo_aware = True

Exact_matches = []
MaxFrag_matches = [] # Only compares the largest reactant fragment

Exact_matches_multi = []
MaxFrag_matches_multi = [] 
for i in range(len(llm_results)):
    if stereo_aware:
        metric = isomer_match
    else:
        metric = exact_match
    match_exact = metric(llm_results[i], ground_truth[i])
    match_maxfrag = metric(llm_results_MaxFrag[i], ground_truth_MaxFrag[i])
    if len(rxn_ps[i].split('.')) > 1:
        Exact_matches_multi.append(match_exact)
        MaxFrag_matches_multi.append(match_maxfrag)
    Exact_matches.append(match_exact)
    MaxFrag_matches.append(match_maxfrag)
    if i % 100 == 0:
        print ('\rCalculating accuracy... %s/%s' % (i, len(llm_results)), end='', flush=True)

Calculating accuracy... 900/1000

In [79]:
# Stereo-aware
ks = [1, 3, 5]
exact_k = {k:0 for k in ks}
MaxFrag_k = {k:0 for k in ks}

print(len(Exact_matches))
for i in range(len(Exact_matches)):
    for k in ks:
        if Exact_matches[i] <= k and Exact_matches[i] != -1:
            exact_k[k] += 1
        if MaxFrag_matches[i] <= k and MaxFrag_matches[i] != -1:
            MaxFrag_k[k] += 1

for k in ks:
    print ('Top-%d Exact accuracy: %.3f, MaxFrag accuracy: %.3f' % (k, exact_k[k]/len(Exact_matches), MaxFrag_k[k]/len(MaxFrag_matches)))

1000
Top-1 Exact accuracy: 0.026, MaxFrag accuracy: 0.205
Top-3 Exact accuracy: 0.048, MaxFrag accuracy: 0.344
Top-5 Exact accuracy: 0.061, MaxFrag accuracy: 0.388
