In [None]:
import pandas as pd
from core import app
from models import *
from app import *

def populate_chemical_table(file_path_lexicon, file_path_subset):
    df_lexicon = pd.read_csv(file_path_lexicon, sep='\t')
    df_subset = pd.read_csv(file_path_subset, sep='\t')

    common_name_dict = dict(zip(df_subset['pubchem_id'], df_subset['common_name']))

    with app.app_context():
        print("Clearing existing Chemical entries...")
        db.session.query(Chemical).delete()
        db.session.commit()

        print("Adding new Chemical entries with Common Name from subset...")

        for _, row in df_lexicon.iterrows():
            try:
                common_name = common_name_dict.get(row['pubchem_id'], None)  
                print(f"Processing PubChem ID: {row['pubchem_id']} - Common Name: {common_name}")

                if Chemical.query.get(row['pubchem_id']):
                    print(f"PubChem ID {row['pubchem_id']} already exists. Skipping.")
                    continue

                # Create a new Chemical instance
                chem = Chemical(
                    pubchem_id=row.get('pubchem_id'),
                    common_name=common_name,
                    synonyms = row.get('synonyms', ''),
                    smiles=row.get('canonical_smiles'),
                    iupac_name=row.get('iupac_name'),
                    isomeric_smiles=row.get('isomeric_smiles'),
                    molecular_formula=row.get('molecular_formula'),
                    functional_group_idx=row.get('functional_group_idx'),
                    functional_group=row.get('functional_group'),
                )

                db.session.add(chem)

            except Exception as e:
                print(f"Error processing row with PubChem ID {row.get('pubchem_id')}: {e}")
                db.session.rollback()
        db.session.commit()
        print("Chemical table population complete.")

if __name__ == "__main__":
    # Provide the paths to your TSV files
    populate_chemical_table('./data/version-3/chemical-lexicon.tsv', './data/version-3/chemical-lexicon-subset.tsv')


Clearing existing Chemical entries...
Adding new Chemical entries with Common Name from subset...
Processing PubChem ID: 101679106 - Common Name: nan
Processing PubChem ID: 15826952 - Common Name: Schembl15010960
Processing PubChem ID: 19 - Common Name: 2,3-Dihydroxybenzoic Acid
Processing PubChem ID: 101285910 - Common Name: Alkaloid Po-5
Processing PubChem ID: 98331 - Common Name: Canambrin
Processing PubChem ID: 65575 - Common Name: Cedrol
Processing PubChem ID: 65576 - Common Name: Tomatidine
Processing PubChem ID: 45 - Common Name: Tartronic Acid
Processing PubChem ID: 51 - Common Name: 2-Ketoglutaric Acid
Processing PubChem ID: 6455362 - Common Name: 66762-19-4
Processing PubChem ID: 70 - Common Name: 4-Methyl-2-Oxovaleric Acid
Processing PubChem ID: 72 - Common Name: 3,4-Dihydroxybenzoic Acid
Processing PubChem ID: 65609 - Common Name: Ac1L23Lq
Processing PubChem ID: 426059 - Common Name: Tomatoside A
Processing PubChem ID: 5275725 - Common Name: 5-Hydroxy-1-(4-Hydroxy-3-Methoxy

  if Chemical.query.get(row['pubchem_id']):


Processing PubChem ID: 10879399 - Common Name: Phellodenol C
Processing PubChem ID: 428 - Common Name: 1,3-Diaminopropane
Processing PubChem ID: 3080632 - Common Name: Stigmast-7-Enol
Processing PubChem ID: 44237242 - Common Name: nan
Processing PubChem ID: 449 - Common Name: Dl-Mevalonic Acid
Processing PubChem ID: 450 - Common Name: 1,3,5(10)-Estratriene-3,17 Beta-Diol
Processing PubChem ID: 21201348 - Common Name: N-Gamma-Glutamyl-S-Propylcysteine
Processing PubChem ID: 453 - Common Name: Hexitol
Processing PubChem ID: 454 - Common Name: Octanal
Processing PubChem ID: 456 - Common Name: Oxaluric Acid
Processing PubChem ID: 44237258 - Common Name: Glucohirsutin
Processing PubChem ID: 458 - Common Name: S-Methylmethionine
Processing PubChem ID: 460 - Common Name: Guaiacol
Processing PubChem ID: 464 - Common Name: Hippuric Acid
Processing PubChem ID: 623060 - Common Name: 9-Methoxy-6A,11A-Dihydro-6H-[1]Benzofuro[3,2-C]Chromen-3-Ol
Processing PubChem ID: 469 - Common Name: 2-Aminohexane

In [22]:
import pandas as pd
import pubchempy as pcp
from openbabel import pybel  # Use pybel.readstring and pybel.readfile
import requests
import time
# from pychem.pychem import * 
import certifi
from rdkit import Chem
from io import StringIO
from glob import glob
from subprocess import check_output, CalledProcessError

DATA = './data/version-3/'


In [23]:
chemicals = pd.read_csv(DATA+'food-chemical.tsv', sep='\t')
print("Unique chemicals: ", len(set(chemicals['pubchem-id'])))

Unique chemicals:  6992


### Get PubChem SDF Files

In [24]:
sdf_files = list()
completed = set()

In [25]:
# while len(sdf_files) != len(set(chemicals['pubchem-id'])):
#     try:
#         for i, pubchem in enumerate(set(chemicals['pubchem-id'])):
#             if i not in completed:
#                 # print(f"[INFO] Attempting CID {pubchem} at index {i}")
#                 url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{pubchem}/SDF'
#                 # print(f"[INFO] Fetching URL: {url}")
#                 r = requests.get(url, verify=certifi.where())
#                 # print(r.text)
#                 sdf_files.append(pybel.readstring('sdf', r.text))
#                 completed.add(i)

#             if i % 100 == 0:
#                 print("Completed: %i" % len(sdf_files))
#     except KeyboardInterrupt:
#         break
#     except:
#         print("Sleeping for a lil time.")
#         time.sleep(30)

In [26]:
import os
output_dir = './data/version-3/sdf-files/'

for f in sdf_files:
    filepath = os.path.join(output_dir, f.title + '.sdf')
    if not os.path.exists(filepath):
        f.write('sdf', filename=filepath)
    else:
        print(f"[INFO] Skipping existing file: {filepath}")

In [27]:
for f in sdf_files:
    f.write('sdf', filename='./data/version-3/sdf-files/' + f.title + '.sdf', overwrite=True)

### Generate ChemoPy properties

In [28]:
from rdkit import Chem
from mordred import Calculator
from mordred import (
    Constitutional, KappaShapeIndex, BCUT, EState, Autocorrelation,
    MoeType, MoRSE, GeometricalIndex, MolecularId,
    InformationContent, FragmentComplexity, BertzCT,
    PathCount, WalkCount, TopologicalIndex, WienerIndex
)

# Updated calculator
calc = Calculator([
    Constitutional,
    KappaShapeIndex,
    BCUT,
    EState,
    Autocorrelation,
    MoeType,
    MoRSE,
    GeometricalIndex,
    MolecularId,
    InformationContent,
    FragmentComplexity,
    BertzCT,
    PathCount,
    WalkCount,
    TopologicalIndex,
    WienerIndex
], ignore_3D=True)

def generate_properties(mol):
    props = {}
    try:
        props.update(calc(mol).asdict())
    except Exception as e:
        print(f"[ERROR] descriptor calculation failed: {e}")
        pass
    return props


In [29]:
for f in glob(DATA + 'sdf-files/*.sdf'):
    mol = next(pybel.readfile('sdf', f))
    sdf_files.append(mol)
properties = list()
details = list()
covered = set()

In [30]:
# for i, mol in enumerate(sdf_files):
#     if i not in covered:
#         molecule = Chem.MolFromMolBlock(mol.write('sdf'))
#         properties.append(generate_properties(molecule))
#         details.append(mol.data)

#     if i % 100 == 0:
#         print("Completed: %i" % i)

# # Save pubchem info
# pubchem_info = pd.DataFrame([dict(d) for d in details])
# pubchem_info.to_csv(DATA+'pubchem_details.tsv', sep='\t', encoding='utf-8', index=None)

# # Save chemical properties
# properties = pd.DataFrame(properties)
# properties.to_csv(DATA+'properties.tsv', sep='\t', encoding='utf-8', index=None)

### Get common names from PubChem

In [31]:
pubchem_info = pd.read_csv(DATA+'pubchem_details.tsv', sep='\t', encoding='utf-8')

In [32]:
pchem2synonyms = dict()
pchems = list(set(pubchem_info[u'PUBCHEM_COMPOUND_CID']) - {0})
completed = set()

In [34]:
for i in range(0, len(pchems), 100):

    if i not in completed:
        subs_pchems = pchems[i: i+100] 

        for c in pcp.get_synonyms(subs_pchems):
            try:
                pchem2synonyms[c['CID']] = '|'.join(c['Synonym'])
            except KeyError:
                continue
                
        completed.add(i)
    else:
        continue

    print("Completed: ", len(pchem2synonyms))

Completed:  2628
Completed:  2723
Completed:  2822
Completed:  2916
Completed:  3010
Completed:  3104
Completed:  3199
Completed:  3287
Completed:  3381
Completed:  3476
Completed:  3567
Completed:  3659
Completed:  3747
Completed:  3840
Completed:  3934
Completed:  4024
Completed:  4114
Completed:  4204
Completed:  4299
Completed:  4385
Completed:  4472
Completed:  4565
Completed:  4655
Completed:  4748
Completed:  4844
Completed:  4939
Completed:  5018
Completed:  5110
Completed:  5205
Completed:  5302
Completed:  5391
Completed:  5484
Completed:  5575
Completed:  5672
Completed:  5766
Completed:  5863
Completed:  5958
Completed:  6049
Completed:  6137
Completed:  6230
Completed:  6322
Completed:  6415
Completed:  6500


In [35]:
synonyms = list()

for pchem in pubchem_info['PUBCHEM_COMPOUND_CID']:
    try: synonyms.append(pchem2synonyms[pchem])
    except KeyError: synonyms.append('')
        
pubchem_info['synonyms'] = synonyms

### Save subset of all properties

In [36]:
import os
print(os.path.getsize(DATA + 'properties.tsv'))

190343014


In [37]:
# if not properties.empty:
#     properties.to_csv(DATA + 'properties.tsv', sep='\t', encoding='utf-8', index=False)
#     print("Saved properties.tsv successfully.")
# else:
#     print("[ERROR] DataFrame is empty — not writing file.")

In [38]:
properties = pd.read_csv(DATA+'properties.tsv', sep='\t', encoding='utf-8')

  properties = pd.read_csv(DATA+'properties.tsv', sep='\t', encoding='utf-8')


In [39]:
# Define the valid Mordred descriptor columns to keep
props_to_keep = [
    'MW',
    'RingCount.R',
    'RotatableBond.Count',
    'Lipinski.NumHDonors',
    'Lipinski.NumHAcceptors',
    'AMR',              # Approx. hydrophobicity
    'SLogP',            # Mordred's LogP
    'HeteroatomCount'   # Approx. nhev
]

# Filter only available ones from properties DataFrame
props_to_keep = [col for col in props_to_keep if col in properties.columns]

# Concatenate with selected PubChem metadata
chem_info = pd.concat([
    pubchem_info[[
        'PUBCHEM_COMPOUND_CID', 'synonyms', 'PUBCHEM_OPENEYE_CAN_SMILES',
        'PUBCHEM_IUPAC_INCHI', 'PUBCHEM_IUPAC_INCHIKEY',
        'PUBCHEM_IUPAC_NAME', 'PUBCHEM_OPENEYE_ISO_SMILES',
        'PUBCHEM_MOLECULAR_FORMULA'
    ]],
    properties[props_to_keep]
], axis=1)


In [40]:
print("Available columns:", properties.columns.tolist())
missing = [col for col in props_to_keep if col not in properties.columns]
print("Missing columns:", missing)


Available columns: ['SZ', 'Sm', 'Sv', 'Sse', 'Spe', 'Sare', 'Sp', 'Si', 'MZ', 'Mm', 'Mv', 'Mse', 'Mpe', 'Mare', 'Mp', 'Mi', 'Kier1', 'Kier2', 'Kier3', 'BCUTc-1h', 'BCUTc-1l', 'BCUTdv-1h', 'BCUTdv-1l', 'BCUTd-1h', 'BCUTd-1l', 'BCUTs-1h', 'BCUTs-1l', 'BCUTZ-1h', 'BCUTZ-1l', 'BCUTm-1h', 'BCUTm-1l', 'BCUTv-1h', 'BCUTv-1l', 'BCUTse-1h', 'BCUTse-1l', 'BCUTpe-1h', 'BCUTpe-1l', 'BCUTare-1h', 'BCUTare-1l', 'BCUTp-1h', 'BCUTp-1l', 'BCUTi-1h', 'BCUTi-1l', 'NsLi', 'NssBe', 'NssssBe', 'NssBH', 'NsssB', 'NssssB', 'NsCH3', 'NdCH2', 'NssCH2', 'NtCH', 'NdsCH', 'NaaCH', 'NsssCH', 'NddC', 'NtsC', 'NdssC', 'NaasC', 'NaaaC', 'NssssC', 'NsNH3', 'NsNH2', 'NssNH2', 'NdNH', 'NssNH', 'NaaNH', 'NtN', 'NsssNH', 'NdsN', 'NaaN', 'NsssN', 'NddsN', 'NaasN', 'NssssN', 'NsOH', 'NdO', 'NssO', 'NaaO', 'NsF', 'NsSiH3', 'NssSiH2', 'NsssSiH', 'NssssSi', 'NsPH2', 'NssPH', 'NsssP', 'NdsssP', 'NsssssP', 'NsSH', 'NdS', 'NssS', 'NaaS', 'NdssS', 'NddssS', 'NsCl', 'NsGeH3', 'NssGeH2', 'NsssGeH', 'NssssGe', 'NsAsH2', 'NssAsH', 'Nss

### Functional groups

In [41]:
def generate_fg(smiles):
    mol = pybel.readstring('smi', smiles)
    mol.write('mol', 'temp.mol', overwrite=True)
    out = check_output(['C:/Users/Sneha/Downloads/checkmol/checkmol.exe', '-p', '-e', 'temp.mol'])
    out = out.decode('utf-8') 

    # Process  and return output
    return ','.join([o.split(':')[0][1:] for o in out.split('\n') if o])

fgs = list()

In [42]:
for i, smi in enumerate(chem_info['PUBCHEM_OPENEYE_ISO_SMILES']):
    try:
        fg = generate_fg(smi)
        fgs.append([smi, fg])
    except CalledProcessError:
        fgs.append([smi, ''])
        
    if i % 100 == 0:
        print(i)
        
fgs_df = pd.DataFrame(fgs, columns=['smiles', 'functional_groups'])

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800


In [43]:
# Add to main dataframe
chem_info['functional_group_idx'] = fgs_df['functional_groups'].fillna('').tolist()

In [44]:
print(chem_info['functional_group_idx'].head(10))


0                001,003,005,026,199,202
1                                025,199
2                027,034,035,075,076,201
3    001,019,027,028,029,037,039,201,202
4        027,028,031,075,078,079,199,202
5                            027,028,031
6                    020,027,028,030,202
7                027,028,030,075,076,204
8                        003,005,075,076
9                    027,034,106,201,202
Name: functional_group_idx, dtype: object


In [45]:
# Add to main dataframe
chem_info['functional_group_idx'] = fgs_df['functional_groups'].fillna('').tolist()
# Read mapping from functional group idx to name
fgidx_map = pd.read_csv('./data/version-3/haider_fg_list.tsv',
                        sep='\t', encoding='utf-8', dtype=str)
fgidx_map = fgidx_map.set_index('id').to_dict()['functional_group']

chem_info['functional_group'] = chem_info['functional_group_idx'].map(
    lambda s: ', '.join([fgidx_map[fidx].capitalize() for fidx in s.split(',')]) if (s != '') else '')

In [46]:
print(chem_info[['functional_group_idx', 'functional_group']].head(10))

                  functional_group_idx  \
0              001,003,005,026,199,202   
1                              025,199   
2              027,034,035,075,076,201   
3  001,019,027,028,029,037,039,201,202   
4      027,028,031,075,078,079,199,202   
5                          027,028,031   
6                  020,027,028,030,202   
7              027,028,030,075,076,204   
8                      003,005,075,076   
9                  027,034,106,201,202   

                                    functional_group  
0  Cation, Carbonyl compound, Ketone, Enolether, ...  
1                                       Enol, Alkene  
2  Hydroxy compound, Phenol, 1,2-diphenol, Carbox...  
3  Cation, Acetal, Hydroxy compound, Alcohol, Pri...  
4  Hydroxy compound, Alcohol, Tert. alcohol, Carb...  
5           Hydroxy compound, Alcohol, Tert. alcohol  
6  Hemiaminal, Hydroxy compound, Alcohol, Sec. al...  
7  Hydroxy compound, Alcohol, Sec. alcohol, Carbo...  
8  Carbonyl compound, Ketone, Carboxylic a

In [47]:
import pickle as pkl

# Save list of functional groups present in db (for autocomplete)
unique_fgs = [fg for fglst in chem_info['functional_group'] .map(lambda s: s.split(', ')) for fg in fglst]
unique_fgs = list(set(unique_fgs))

pkl.dump(unique_fgs, open('../DietRX/static/unique_fgs.p', 'wb'))
pkl.dump(fgidx_map, open('../DietRX/static/fgid2name.p', 'wb'))
idxfg_map = {fg:id for id, fg in fgidx_map.items()}
pkl.dump(idxfg_map, open('../DietRX/static/fgname2id.p', 'wb'))

### Save chemical lexicon

In [None]:
# chem_info['common_name'] = chem_info['synonyms'].fillna('').map(
#     lambda s: str(s.split('|')[0]) if s else ''
# ).apply(str.title)


In [49]:
# Change column names to match with db column names
chem_info.rename(columns={
    'PUBCHEM_COMPOUND_CID': 'pubchem_id',
    'Weight':'molecular_weight',
    'nhyd':'num_hydrogen_atoms',
    'nring':'num_rings',
    'nrot':'num_rotatablebonds',
    'ndonr':'hbd_count',
    'naccr':'hba_count',
    'nta':'num_atoms',
    'naro':'number_of_aromatic_bonds',
    'nhev':'num_heavy_atoms',
    'Hy':'hyrophilic_index',
    'LogP':'alogp',
    'PUBCHEM_OPENEYE_CAN_SMILES': 'canonical_smiles',
    'PUBCHEM_IUPAC_INCHI': 'inchi',
    'PUBCHEM_IUPAC_INCHIKEY': 'inchikey',
    'PUBCHEM_IUPAC_NAME': 'iupac_name',
    'PUBCHEM_OPENEYE_ISO_SMILES': 'isomeric_smiles',
    'PUBCHEM_MOLECULAR_FORMULA': 'molecular_formula'
}, inplace=True)

In [None]:
# Save to disk
#chem_info.to_csv(DATA+'chemical-lexicon.tsv', sep='\t', index=None)

### Generate molecule images

In [51]:
from rdkit.Chem import MolFromSmiles, Draw
from rdkit.Chem.AllChem import Compute2DCoords
from shutil import copy

In [52]:
# !rm {DATA/'images/'}
# !mkdir {DATA+'images/'}

In [53]:
for i, f in enumerate(sdf_files):
    try:
        m = Chem.MolFromMolBlock(f.write('mol'))
        tmp = Compute2DCoords(m)
        Draw.MolToFile(m, DATA+'images/' + f.title + '.png')
    except KeyboardInterrupt:
        break
    except:
        print("Error encountered.")
        copy('./DietRX/static/images/no-image.png', 
             DATA+'images/' + sdf_files[0].title + '.png')
        
    if i % 100 == 0:
        print("Completed: %i" % i)

Completed: 0


Completed: 100
Completed: 200
Completed: 300
Completed: 400
Completed: 500
Completed: 600
Completed: 700




Completed: 800
Completed: 900
Completed: 1000
Completed: 1100
Completed: 1200
Completed: 1300
Completed: 1400
Completed: 1500
Completed: 1600
Completed: 1700
Completed: 1800
Completed: 1900
Completed: 2000
Completed: 2100
Completed: 2200
Completed: 2300
Completed: 2400




Completed: 2500




Completed: 2600




Completed: 2700
Completed: 2800
Completed: 2900
Completed: 3000
Completed: 3100
Completed: 3200
Completed: 3300
Completed: 3400
Completed: 3500
Completed: 3600
Completed: 3700
Completed: 3800
Completed: 3900
Completed: 4000
Completed: 4100
Completed: 4200
Completed: 4300
Completed: 4400
Completed: 4500
Completed: 4600
Completed: 4700


[14:44:06] unsupported number of radical electrons 4


Completed: 4800
Completed: 4900
Completed: 5000
Completed: 5100
Completed: 5200
Completed: 5300
Completed: 5400


[14:44:11] unsupported number of radical electrons 4


Completed: 5500
Completed: 5600
Completed: 5700
Completed: 5800
Completed: 5900
Completed: 6000




Completed: 6100
Completed: 6200
Completed: 6300
Completed: 6400
Completed: 6500
Completed: 6600
Completed: 6700




Completed: 6800
Completed: 6900


### Similarity Search

In [54]:
#from pybel import Outputfile, readstring

In [55]:
chem_info = pd.read_csv(DATA+'chemical-lexicon.tsv', sep='\t')

In [56]:
largeSDfile = pybel.Outputfile("sdf", "../dietrx/static/allmol.sdf", overwrite=True)
for i, row in chem_info.iterrows():
    m = pybel.readstring("smi", row['isomeric_smiles'])
    m.title = str(row['pubchem_id'])
    m.data['pubchem_id'] = row['pubchem_id']
    
    largeSDfile.write(m)
    
largeSDfile.close()

In [57]:
%%bash
cd ../dietrx/static/
source activate bittersweetpy2
babel allmol.sdf -ofs

Couldn't find program: 'bash'
