In [1]:
import json
from tqdm import tqdm

In [2]:
# Natural-language sentence templates used to wrap a reaction in prose.
# Each template carries two named placeholders, {reactants} and {product},
# which are filled in later through str.format(reactants=..., product=...).
# There are 40 templates: the first twenty describe the reaction in the forward
# direction, the remaining twenty are worded from a synthesis-planning /
# retrosynthesis point of view.
txt_templates = [  
    # --- forward-reaction phrasing ---
    "In the presence of a suitable catalyst, {reactants} can undergo a transformation to yield {product}.",  
    "{reactants} have the potential to chemically interact and form {product}.",  
    "Through various chemical processes, {reactants} can be converted into {product}.",  
    "Under certain conditions, {reactants} can combine to produce {product}.",  
    "When subjected to specific temperatures and pressures, {reactants} can result in the formation of {product}.",  
    "The chemical union of {reactants} can lead to the formation of {product}.",  
    "A chemical reaction involving {reactants} has the potential to synthesize {product}.",  
    "The interaction between {reactants} can facilitate the production of {product}.",  
    "Under optimal conditions, {reactants} can yield {product} as a result of a chemical reaction.",  
    "Through a series of chemical reactions, {reactants} can result in {product}.",  
    "Upon reacting, {reactants} can give rise to {product}.",  
    "{reactants} can be chemically combined to produce {product}.",  
    "Through the process of chemical synthesis, {reactants} can lead to the formation of {product}.",  
    "When chemically reacted, {reactants} can produce {product}.",  
    "{reactants} have the ability to chemically transform into {product}.",  
    "With the aid of a catalyst, {reactants} can react to form {product}.",  
    "The combination of {reactants} can result in the generation of {product}.",  
    "In a controlled environment, {reactants} can be manipulated to produce {product}.",  
    "In a chemical reaction, {reactants} can be transformed into {product}.",  
    "When properly catalyzed, {reactants} can yield {product}.",
    # --- synthesis-planning / retrosynthesis phrasing ---
    "{product} can be synthesized from {reactants} using appropriate catalysts.",  
    "The retrosynthesis of {product} can be achieved from {reactants}.",  
    "The compound {product} can be obtained from the precursors {reactants}.",  
    "The successful synthesis of {product} can be achieved through the reaction of {reactants}.",  
    "For the preparation of {product}, {reactants} can be used as starting materials.",  
    "{product} can be derived from {reactants} under suitable reaction conditions.",  
    "The production of {product} involves the use of {reactants} as raw materials.",  
    "The chemical formation of {product} can be planned starting from {reactants}.",  
    "The strategy for the synthesis of {product} involves the use of {reactants}.",  
    "The construction of {product} can be accomplished from {reactants}.",  
    "Through the process of retrosynthesis, {product} can be traced back to {reactants}.",  
    "{reactants} are the key components in the synthetic pathway leading to {product}.",  
    "The synthesis of {product} can be traced back to {reactants} through a series of chemical reactions.",  
    "{reactants} serve as the precursor compounds for the synthesis of {product}.",  
    "The retrosynthetic analysis suggests that {product} can be derived from {reactants}.",  
    "{product} can be synthesized by manipulating the chemical structure of {reactants}.",  
    "The retrosynthesis plan for {product} involves the use of {reactants} as starting materials.",  
    "The synthetic pathway to {product} involves the transformation of {reactants}.",  
    "{reactants} can be reassembled to produce {product} in a synthetic process.",  
    "Through the application of retrosynthetic principles, {product} can be synthesized from {reactants}."  
]  



In [3]:
# Sanity check: how many sentence templates are available for sampling.
len(txt_templates)

40

In [4]:
import random
# Seed the global `random` generator so that the template sampling and the
# shuffle performed in later cells give the same result on a fresh run.
random.seed(42)

In [5]:
raw_jsonl_path = '/blob/guoqing/pistachio_2023Q2_v2_o_smiles_preprocessed/train_augmentation_20.jsonl'

# Sub-sampling scheme: line numbers are grouped into cycles of
# AUGMENTATION_CYCLE consecutive lines and only the first AUGMENTATIONS_KEPT
# lines of every cycle are retained.
# NOTE(review): the file name suggests each cycle holds the 20 augmented
# variants of one reaction - confirm against the preprocessing script.
AUGMENTATION_CYCLE = 20
AUGMENTATIONS_KEPT = 3  # keep 3/20 = 15% of the data

# Load the retained JSONL records (one JSON object per line) into memory.
raw_data = []
with open(raw_jsonl_path, 'r') as f:
    # Iterate over the file handle itself rather than f.readlines(): the file
    # has tens of millions of lines and 85% of them are discarded, so there is
    # no reason to hold every raw line in a list (which would also stay alive
    # in the kernel namespace after the cell finishes).
    for lineno, line in tqdm(enumerate(f)):
        aug_id = lineno % AUGMENTATION_CYCLE
        if aug_id < AUGMENTATIONS_KEPT:
            raw_data.append(json.loads(line))

56784240it [00:34, 1633268.12it/s]


In [6]:
# Number of records retained after the sub-sampling above.
len(raw_data)

8517636

In [8]:
# Peek at the first record to see which keys a parsed line provides.
raw_data[0]

{'psmiles': 'c1c(-c2cnc(O[C@H]3C[C@H](C(OCc4ccccc4)=O)C3)cc2)cc(F)c(C=O)c1',
 'rsmiles': 'c1c(B(O)O)cc(F)c(C=O)c1.c1(Br)cnc(O[C@H]2C[C@H](C(OCc3ccccc3)=O)C2)cc1'}

In [9]:
def _render_reaction(idx, record):
    """Turn one reaction record into a single training string.

    With probability 0.05 the reaction is embedded in a randomly chosen
    sentence from ``txt_templates``. Otherwise the tagged product and tagged
    reactants are simply concatenated: product first for even ``idx``,
    reactants first for odd ``idx``.
    """
    # for 5% of the data, use text templates
    if random.random() < 0.05:
        return random.choice(txt_templates).format(
            reactants='<reactants>' + record['rsmiles'] + '</reactants>',
            product='<product>' + record['psmiles'] + '</product>',
        )

    # otherwise, just concatenate reactants and product, alternating the order
    product_part = f'<product>{record["psmiles"]}</product>'
    reactants_part = f'<reactants>{record["rsmiles"]}</reactants>'
    if idx % 2 == 0:
        return product_part + reactants_part
    return reactants_part + product_part


result = [_render_reaction(idx, record) for idx, record in enumerate(tqdm(raw_data))]

100%|██████████| 8517636/8517636 [00:05<00:00, 1423258.45it/s]


In [10]:

# Shuffle the generated strings in place so that templated sentences and the
# two plain orderings are mixed randomly in the final file.
# NOTE: this mutates `result`; re-running the cell shuffles the already
# shuffled list again, so the order then differs from a fresh run.
random.shuffle(result)

In [11]:
# print some examples
# A slice is used instead of indexing result[i] for i in range(30) so that a
# result list with fewer than 30 entries prints what it has instead of raising
# IndexError.
for example in result[:30]:
    print(example)

<reactants>CC(=O)N1CC2CC2(c2ccc([N+]([O-])=O)cc2)C1</reactants><product>CC(=O)N1CC2CC2(c2ccc(N)cc2)C1</product>
<reactants>c1c(NCCOCCOCCOCCN)c2c(cc1)C(=O)N(C1C(=O)NC(=O)CC1)C2=O.C(CN1CCN(c2ccc(-c3c4c([nH]n3)cnc(-c3c(OC)cccc3F)n4)cc2)CC1)(=O)O</reactants><product>c1c(NCCOCCOCCOCCNC(CN2CCN(c3ccc(-c4c5c([nH]n4)cnc(-c4c(OC)cccc4F)n5)cc3)CC2)=O)c2c(cc1)C(=O)N(C1C(=O)NC(=O)CC1)C2=O</product>
<reactants>n1c(N)c2c(nc1)n([C@@H]1O[C@H](COP(OP(O)(=O)OCC(C)(C)[C@@H](O)C(NCCC(NCCSC(=O)/C=C/CC)=O)=O)(=O)O)[C@@H](OP(O)(=O)O)[C@H]1O)cn2.N[C@H](C(O)=O)CC(C)C</reactants><product>N(C)c1nc(NC(C)C)nc(NC)n1</product>
<reactants>n1n([C@@H]2CC[C@@H](CO)CC2)cc2cc(NC(OC(C)(C)C)=O)c(OC)cc12</reactants><product>n1n([C@@H]2CC[C@@H](C=O)CC2)cc2cc(NC(OC(C)(C)C)=O)c(OC)cc12</product>
<reactants>O=C(O)/C=C\C(O)=O.[Ca]1OC(=O)O1</reactants><product>O1C(=O)C=CC(=O)O[Ca]1</product>
The combination of <reactants>[C@@H](N[C@@H](C)c1ccccc1)(C1CC1)C</reactants> can result in the generation of <product>[C@@H](N)(C1CC1)C</produ

In [12]:
# Write the shuffled training strings to disk, one example per line.
save_path = '/blob/shufxi/data/scigpt/pistachio_2023Q2_v2_o_smiles/train.txt'

# make sure the destination folder exists before the file is opened
import os
os.makedirs(os.path.dirname(save_path), exist_ok=True)

with open(save_path, 'w') as out_file:
    # writelines does not add separators, so the newline is appended explicitly
    out_file.writelines(example + '\n' for example in tqdm(result))

100%|██████████| 8517636/8517636 [00:19<00:00, 438080.93it/s]
