In [None]:
from pony.orm import db_session
from CGRdb.database import db
from CGRtools import smiles
from CGRdb.database import Substance
from multiprocessing import Process, Queue
from tqdm import tqdm
import zipfile

In [None]:
# Bind the shared `db` object to the local PostgreSQL server and map the
# entities, creating the tables on first run (create_tables=True).
# NOTE(review): credentials are hard-coded for the demo setup; consider reading
# them from the environment before pointing this at a real server.
db.bind(
    provider='postgres',
    host='localhost',
    port=5432,
    user='postgres',
    password="example",
    database='test',
)
db.generate_mapping(create_tables=True)

In [None]:
# Initialize the parameters for fingerprints and LSH
from CGRdb.database.config import Config

# The intarray extension must exist in the target database
db.execute("Create extension if not exists intarray;")

# Settings are stored as key/value Config records
Config(
    key="fingerprint",
    value=dict(
        min_radius=1,
        max_radius=4,
        length=2048,
        number_active_bits=2,
        number_bit_pairs=4,
        include_hydrogens=False,
    ),
)
Config(key="lsh_num_permute", value=64)
Config(key="lsh_threshold", value=0.7)
db.commit()

# Drop the connection and the provider binding; presumably this lets each
# worker process call db.bind() on its own later on (verify against Pony docs).
db.disconnect()
db.unbind()

In [None]:
def worker(q):
    """Consume batches of records from ``q`` and store each one as a Substance.

    Meant to run as the target of a ``multiprocessing.Process``: it binds its own
    database connection, then keeps reading batches until it receives ``None``.

    :param q: queue yielding lists of ``(smiles_string, label)`` pairs, or
        ``None`` as the stop signal.

    Records that cannot be parsed, or that still fail after all insert
    attempts, are reported on stdout and skipped.
    """
    db.bind(provider='postgres', user='postgres', host='localhost', password="example",
            database='test', port=5432)
    db.generate_mapping()
    max_attempts = 10
    while True:
        data = q.get()
        if data is None:
            break
        for smi, label in data:
            # A malformed record must not kill the worker: a dead consumer stops
            # draining the bounded queue and the producer may block forever.
            try:
                mol = smiles(smi)
                subs = [(x, None) for x in mol.split()]
            except Exception as e:
                print(f"record {label}: cannot parse {smi!r}: {e!r}")
                continue
            last_error = None
            for _ in range(max_attempts):
                try:
                    with db_session():
                        Substance(subs)
                    # Reached only when the session exited (committed) cleanly
                    break
                except Exception as e:
                    # Keep retrying, but remember why the attempt failed
                    last_error = e
            else:
                # Every attempt failed: report instead of dropping silently
                print(f"record {label}: not stored after {max_attempts} attempts: {last_error!r}")
    print("finished")

In [None]:
# Bounded queue: q.put() blocks once 30 batches are waiting, so the reader
# cannot run arbitrarily far ahead of the workers.
q = Queue(maxsize=30)
num_workers = 20
pr = [Process(target=worker, args=(q,)) for _ in range(num_workers)]
# Plain loop instead of a comprehension: starting a process is a side effect,
# and the comprehension only produced a list of None as cell output.
for p in pr:
    p.start()

In [None]:
# Unpack the dataset next to the notebook (relative to the working directory)
with zipfile.ZipFile("../dataset/Chembl28_cleaned_06_07_2021.smi.zip", 'r') as zip_ref:
    zip_ref.extractall("")

# Read the file line by line and hand the records to the workers in batches
BATCH_SIZE = 1000
batch = []
with open("Chembl28_cleaned_06_07_2021.smi") as f:
    for n, row in tqdm(enumerate(f)):
        columns = row.rstrip('\n').lstrip().split(" ")
        smi = columns[0]
        if not smi:
            # Blank line: nothing to parse
            continue
        idx = None
        if len(columns) > 1:
            if columns[1].startswith("|"):
                # Second column is a "|...|" block: it is appended to the
                # SMILES string and the identifier, if any, is the third column
                smi += columns[1]
                if len(columns) > 2:
                    idx = columns[2]
            else:
                idx = columns[1]
        if not idx:
            # No identifier in the row: fall back to the line number
            idx = n
        batch.append((smi, idx))
        if len(batch) == BATCH_SIZE:
            q.put(batch)
            batch = []

# Flush the last, partially filled batch
if batch:
    q.put(batch)

# One stop signal per worker
for _ in range(num_workers):
    q.put(None)

# Wait until every worker has drained the queue and exited, so that later
# cells (index creation) do not run while rows are still being inserted.
for p in pr:
    p.join()

In [None]:
# The parent process dropped its binding earlier, so connect again here
db.bind(
    provider='postgres',
    host='localhost',
    port=5432,
    user='postgres',
    password="example",
    database='test',
)
db.generate_mapping()

# Build the indexes once the data is loaded -- presumably the fingerprint index
# and the similarity (LSH) index; confirm against the CGRdb documentation.
db.create_fing_index()
db.create_sim_index()