In [9]:
from random import shuffle
from io import StringIO
from itertools import chain, islice
from subprocess import run
from collections import defaultdict
from time import time

from CGRtools.files import SDFRead, SDFWrite, SMILESRead
from CGRtools.exceptions import InvalidAromaticRing
from multiprocessing import Pool

import logging
import pickle
import warnings
import os
import csv
from tqdm import tqdm

In [10]:
names_transfrom = {'PUBCHEM_COMPOUND_CID': 'pubchem_id',
'PUBCHEM_OPENEYE_CAN_SMILES' : 'pubchem_smiles',
'PUBCHEM_IUPAC_OPENEYE_NAME' : 'iupac_name',
'PUBCHEM_CACTVS_ROTATABLE_BOND' : 'rot_bonds',
'PUBCHEM_XLOGP3_AA': 'logp',
'PUBCHEM_CACTVS_HBOND_ACCEPTOR': 'h_acc',
'PUBCHEM_CACTVS_HBOND_DONOR': 'h_don',
'PUBCHEM_HEAVY_ATOM_COUNT': 'heavy_atoms',
}
module_headers = ['ring_count', 'mol_mass', 'bonds_count']

In [11]:
def std_molecule(molecule):
    id_mol = int(molecule.meta['PUBCHEM_COMPOUND_CID'])

    if len(molecule) > 150 or len(molecule) < 2:
        return id_mol
    try:
        molecule.kekule()
    except InvalidAromaticRing:
        return id_molLast

    molecule.standardize(fix_stereo=False)
    if molecule.is_radical:
        return id_mol

    if molecule.check_valence():
        return id_mol

    molecule.implicify_hydrogens(fix_stereo=False)
    molecule.clean_stereo()

    try:
        molecule.thiele()
    except InvalidAromaticRing:
        return id_mol

    conn_comp = molecule.connected_components_count
    if conn_comp > 1:
        return id_mol

    metadata = []
    for key in names_transfrom.keys():
        if key in molecule.meta.keys():
            metadata.append(molecule.meta[key])
        else:
            metadata.append('nan')

    ring_count = molecule.rings_count
    metadata.append(ring_count)
    mol_mass = molecule.molecular_mass
    metadata.append(mol_mass)
    b_count = molecule.bonds_count
    metadata.append(b_count)

    return ([str(molecule)] + metadata)

In [12]:
def run_std(readed_reactions):
    with Pool(7) as p:
        filtered_reactions = p.map(std_molecule, readed_reactions)
    return filtered_reactions

In [13]:
def handle_writing(readed_molecules, out, mistakes):
    filtered_molecules = run_std(readed_molecules)
    del readed_molecules
    for new_mol in filtered_molecules:
        if isinstance(new_mol, int):
            mistakes.writerow([new_mol])
        else:
            out.writerow(new_mol)
    del filtered_molecules

In [26]:
def read_std_pubchem(file_name, path, out, mistakes, names_transfrom, size):
    if file_name[-2:] == 'gz':
        file = path + file_name
        name = file[:-3]
        print(name)
        run(["gunzip", "-k", file])
        readed_molecules = []
        with SDFRead(name, indexable=True) as inp:
            inp.reset_index()
            for n, molecule in tqdm(enumerate(inp), total=len(inp)):
                readed_molecules.append(molecule)
                if len(readed_molecules) == size:
                    handle_writing(readed_molecules, out, mistakes)
                    readed_molecules = []
            if readed_molecules:
                handle_writing(readed_molecules, out, mistakes)
        os.remove(name)

In [27]:
with open('pubchem_test.csv', 'w', newline='\n') as out, \
open('pubchem_test_mistakes.csv', 'w', newline='\n') as mistakes_out:
    smi_writer = csv.writer(out, delimiter=',')
    mistakes_writer = csv.writer(mistakes_out, delimiter=',')
    fields_set = set()
    fields_names = ['smiles'] + list(names_transfrom.values()) + module_headers
    smi_writer.writerow(fields_names)
    mistakes_writer.writerow(['pubchem_id'])
    read_std_pubchem('Compound_000000001_000500000.sdf.gz', '../data/', smi_writer, mistakes_writer, names_transfrom, 10000)

../data/Compound_000000001_000500000.sdf



gzip: ../data/Compound_000000001_000500000.sdf.gz: unexpected end of file


FileNotFoundError: [Errno 2] No such file or directory: '../data/Compound_000000001_000500000.sdf'