# 1. Train test split

In [None]:
import sys
sys.path.append('../../')
import pandas as pd
from SynTemp.SynUtils.utils import load_database, save_database
# Build the reaction table from the atom-mapped archive, read the original CSV,
# and append the CSV's 'class' column to the table (rows are paired by index).
aam_reactions_path = '../../Data/DPO/USPTO_50K/USPTO_50K_aam_reactions.json.gz'
original_csv_path = '../../Data/USPTO_50K/USPTO_50K_original.csv'

data = pd.DataFrame(load_database(aam_reactions_path))
original_data = pd.read_csv(original_csv_path)
data = pd.concat((data, original_data['class']), axis='columns')

In [None]:
from sklearn.model_selection import train_test_split
# Split with a fixed seed, stratified on 'class', holding out 20% as the test
# part; each part is then written out as a list of record dicts.
train, test = train_test_split(
    data,
    test_size=0.2,
    stratify=data['class'],
    random_state=42,
)

train_records = train.to_dict('records')
save_database(train_records, '../../Data/DPO/USPTO_50K/train.json.gz')
test_records = test.to_dict('records')
save_database(test_records, '../../Data/DPO/USPTO_50K/test.json.gz')

# 2. Hydrogen adjustments

In [None]:
from SynTemp.SynUtils.utils import load_from_pickle
# Load the pickled records stored under the Hydrogen/ data directory.
cluster_pickle_path = '../../Data/DPO/USPTO_50K/Hydrogen/USPTO_50K_its_graph_rules_cluster.pkl.gz'
data = load_from_pickle(cluster_pickle_path)

In [None]:
from SynTemp.SynUtils.utils import stratified_random_sample
# Sample from `data` keyed on 'naive_cluster'.
# NOTE(review): the positional 1 and 42 are presumably the per-group sample
# size and the random seed -- confirm against stratified_random_sample.
samples = stratified_random_sample(
    data,
    'naive_cluster',
    1,
    42,
)

In [None]:
# Tabulate how often each 'Reaction Type' value occurs among the samples;
# the trailing bare expression is what the notebook cell displays.
samples_frame = pd.DataFrame(samples)
samples_frame['Reaction Type'].value_counts()

In [None]:
# Display the first sampled record for a quick visual check.
samples[0]

In [None]:
from SynTemp.SynUtils.utils import stratified_random_sample

# Draw the sample again with the same arguments, then keep only the records
# whose 'Reaction Type' is one of the two cyclic labels.
samples = stratified_random_sample(data, 'naive_cluster', 1, 42)
cyclic_types = ('Single Cyclic', 'Complex Cyclic')
samples_good = [
    record
    for record in samples
    if record['Reaction Type'] in cyclic_types
]

In [None]:
from SynTemp.SynRule.rule_writing import RuleWriting
# Run rule extraction over the full sample, identified by 'naive_cluster',
# with the Hydrogen/Rules directory passed as the save path.
rules_dir = '../../Data/DPO/USPTO_50K/Hydrogen/Rules'
rules = RuleWriting.auto_extraction(
    data_dicts=samples,
    id_column='naive_cluster',
    save_path=rules_dir,
)

In [None]:
from SynTemp.SynRule.rule_writing import RuleWriting
# Run rule extraction over the filtered sample only. The original call passed
# `samples`, which made the output under Rules_good identical to the
# unfiltered rule set and left `samples_good` unused.
rules_good = RuleWriting.auto_extraction(
    data_dicts=samples_good,
    id_column='naive_cluster',
    save_path='../../Data/DPO/USPTO_50K/Hydrogen/Rules_good',
)

In [None]:
from SynTemp.SynRule.rule_benchmark import RuleBenchmark
from SynTemp.SynChemistry.sf_similarity import SFSimilarity
from SynTemp.SynChemistry.sf_maxfrag import SFMaxFrag

In [None]:
import logging
import pandas as pd

# Route INFO-level log records, prefixed with a timestamp, to the log file.
# The path is a plain string: the original f-prefix had no placeholders.
logging.basicConfig(
    filename='../../Data/DPO/USPTO_50K/Hydrogen/topk_accuracy_good.log',
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
)

# Load the test split and keep only the 'R-id', 'reactions' and 'class' fields.
database = load_database('../../Data/DPO/USPTO_50K/test.json.gz')
database = [
    {
        'R-id': value['R-id'],
        'reactions': value['reactions'],
        'class': value['class'],
    }
    for value in database
]

In [None]:
# Collect every database entry whose 'reactions' string equals `bug`.
# NOTE(review): `bug` is assigned in a later cell of this notebook; that cell
# must be run first, otherwise this line raises NameError.
value = [i for i in database if i['reactions']==bug]

In [None]:
# Display the entries matched by the lookup.
value

In [None]:
# Reaction string (reactants>>products) used as the lookup key when searching
# `database` by its 'reactions' field.
bug = 'CC(C)(C)OC(=O)NCCCN(CCCCN(CCCN(CCCN1C(=O)c2ccccc2C1=O)C(=O)OC(C)(C)C)C(=O)OC(C)(C)C)C(=O)OC(C)(C)C.O.O>>CC(C)(C)OC(=O)NCCCN(CCCCN(CCCN(CCCN)C(=O)OC(C)(C)C)C(=O)OC(C)(C)C)C(=O)OC(C)(C)C.O=C(O)c1ccccc1C(=O)O'



In [13]:
import sys
sys.path.append('../../')
import pandas as pd
from SynTemp.SynUtils.utils import load_from_pickle, save_database
# Reload the pickled records stored under the Hydrogen/ data directory.
cluster_pickle_path = '../../Data/DPO/USPTO_50K/Hydrogen/USPTO_50K_its_graph_rules_cluster.pkl.gz'
data = load_from_pickle(cluster_pickle_path)

In [16]:
# Reduce each record to its 'R-id' and its 'naive_cluster' value, the latter
# stored under the shorter key 'cluster'.
data = [
    {'R-id': record['R-id'], 'cluster': record['naive_cluster']}
    for record in data
]

In [20]:
# Collect the 'R-id' of every record, in record order.
# NOTE(review): the name `id` shadows the builtin; it is kept because a later
# cell reads it under this name.
id = [record['R-id'] for record in data]

In [24]:
from SynTemp.SynUtils.utils import load_database
# Load the training split and keep only the 'R-id', 'class' and 'reactions'
# fields of each record.
original = load_database('../../Data/DPO/USPTO_50K/train.json.gz')
original = [
    {
        'R-id': record['R-id'],
        'class': record['class'],
        'reactions': record['reactions'],
    }
    for record in original
]

In [26]:
# Keep only the training records whose 'R-id' also appears in the cluster data.
# The ids are put into a set once so each membership test is O(1) instead of
# a scan over the whole `id` list for every record.
clustered_ids = set(id)
original_reduce = [
    record for record in original if record['R-id'] in clustered_ids
]

In [31]:
# Attach each reaction's cluster label by joining on 'R-id'. A positional
# pd.concat(axis=1) pairs row i with row i, which silently assigns labels to
# the wrong reactions whenever the two tables differ in order or length.
# The left join keeps every row of `original_reduce`; the resulting columns
# are R-id, class, reactions, cluster.
cluster_labels = pd.DataFrame(data)[['R-id', 'cluster']]
all_data = pd.DataFrame(original_reduce).merge(
    cluster_labels, on='R-id', how='left'
)

In [39]:
# Write the combined table to CSV without the index column.
cluster_csv_path = '../../Data/DPO/USPTO_50K/Hydrogen/data_cluster.csv'
all_data.to_csv(cluster_csv_path, index=False)

In [44]:
# Count the rows per cluster and display the ten largest clusters; the
# trailing bare expression is what the notebook cell displays.
cluster_sizes = all_data['cluster'].value_counts()
pd.DataFrame(cluster_sizes).head(10)

Unnamed: 0_level_0,count
cluster,Unnamed: 1_level_1
2,5967
7,4761
18,2182
0,2036
23,1944
15,1438
12,1280
9,1250
14,1003
3,874
