In [None]:
# If the database is not empty and you want to erase it, uncomment this cell (it drops all tables with their data)
#from CGRdb.database import db
#from pony.orm import db_session
#from CGRdb.database.config import Config
#db.bind(provider='postgres', user='postgres', host='localhost', password="example", database='test',
#        port=5432)
#db.generate_mapping(check_tables=False, create_tables=True)
#db.drop_all_tables(with_all_data=True)
#db.commit()
#db.disconnect()
#db.unbind()

In [None]:
# Connect to database and create tables
from CGRdb.database import db
import zipfile
# Connection settings for the PostgreSQL instance, gathered in one mapping.
pg_settings = {
    'provider': 'postgres',
    'user': 'postgres',
    'host': 'localhost',
    'password': "example",
    'database': 'test',
    'port': 5432,
}
db.bind(**pg_settings)
# create_tables=True asks the mapping step to create the tables as well.
db.generate_mapping(create_tables=True)

In [None]:
# DB settings for fingerprints (linear fingerprints) and LSH (please refer to the datasketch library)
from CGRdb.database.config import Config
# Enable the intarray extension if it is not installed yet.
db.execute("Create extension if not exists intarray;")

# Settings written to the Config table, in insertion order:
# fingerprint parameters first, then the LSH parameters for structures and CGRs.
config_entries = {
    "fingerprint": {"min_radius": 1, "max_radius": 6, "length": 2048,
                    "number_active_bits": 2, "number_bit_pairs": 4},
    "lsh_num_permute": 64,
    "lsh_threshold": 0.7,
    "cgr_lsh_num_permute": 64,
    "cgr_lsh_threshold": 0.7,
}
for entry_key, entry_value in config_entries.items():
    Config(key=entry_key, value=entry_value)

# Commit the new entries, then release the connection and the binding.
db.commit()
db.disconnect()
db.unbind()

In [None]:
from CGRdb.database import db
import zipfile
from pony.orm import db_session
from CGRtools import smiles
from CGRdb.database import Reaction
from multiprocess import Process, Queue
from tqdm import tqdm
from CGRtools.exceptions import InvalidAromaticRing, IncorrectSmiles, ValenceError, MappingError

In [None]:
# Worker that loads reactions into the database, with a data-cleaning step
def worker(q):
    """Consume reaction SMILES from ``q`` and store them in the database.

    The worker binds its own database connection, then reads items from the
    queue until it receives a ``None`` sentinel. Each item is parsed with
    ``smiles`` and canonicalized; records that fail this cleaning step are
    printed and skipped. Every remaining reaction is inserted as a
    ``Reaction`` entity, with up to 10 attempts per record.

    :param q: queue yielding SMILES strings and terminated by ``None``.
    """
    db.bind(provider='postgres', user='postgres', host='localhost', password="example", database='test',
        port=5432)
    db.generate_mapping()
    while True:
        data = q.get()
        if data is None:
            break

        # Parsing happens inside the try block as well, so that a parser error
        # skips the record instead of terminating the whole worker process.
        reaction = None
        try:
            reaction = smiles(data)
            if reaction is not None:
                reaction.canonicalize()
        except (InvalidAromaticRing, IncorrectSmiles, ValenceError, MappingError):
            # Report the parsed reaction when there is one, otherwise the raw input.
            print(data if reaction is None else reaction)
            continue
        if reaction is None:
            # NOTE(review): guards against a parser that returns None for
            # unparsable input instead of raising - confirm against CGRtools.
            print(data)
            continue

        # putting reaction into the DB in 10 attempts due to parallel loading
        last_error = None
        for _ in range(10):
            try:
                with db_session():
                    # keep_cgr is the switch the original author flagged for
                    # controlling storage of CGRs in the DB.
                    Reaction(reaction, keep_cgr=True)
                break
            except Exception as error:
                # The name bound by "except ... as" is unbound when the clause
                # ends, so keep a separate reference for the final report.
                last_error = error
        else:
            print(f" upload failed with error \n {last_error}")
    print("finished")

In [None]:
# define number of workers
num_workers = 10
# The queue is bounded to twice the number of workers.
q = Queue(maxsize=num_workers * 2)
# One process per worker, all reading from the same queue.
pr = [Process(target=worker, args=(q,)) for _ in range(num_workers)]
# Plain loop instead of a list comprehension: start() is called only for its
# side effect, and the cell no longer displays a list of None values.
for p in pr:
    p.start()

In [None]:
# put reactions into the queue for uploading
# Unpack the archive, then feed the extracted file to the workers line by line.
with zipfile.ZipFile("../dataset/USPTO.smi.zip", 'r') as zip_ref:
    zip_ref.extractall("")
with open("USPTO.smi") as f:
    for line in tqdm(f):
        record = line.strip("\n")
        if record:  # blank lines carry no reaction, so they are not queued
            q.put(record)
# One None sentinel per worker tells each of them to stop.
for _ in range(num_workers):
    q.put(None)
# Wait until every worker has exited, so that later cells (index creation)
# only run once all reactions have been uploaded.
for p in pr:
    p.join()

In [None]:
# generate indexes for database

# Bind the database in this process again before building the indexes.
index_db_settings = dict(provider='postgres', user='postgres', host='localhost', password="example",
                         database='test', port=5432)
db.bind(**index_db_settings)
db.generate_mapping()
# Build the indexes one after another: fingerprint, similarity, CGR similarity.
for index_builder in ("create_fing_index", "create_sim_index", "create_cgr_sim_index"):
    getattr(db, index_builder)()