In [1]:
import glob
import json
import os
import re
import time
import wget
import urllib.parse
import argparse


import numpy as np
import pandas as pd
import pubchempy as pcp


from pybatchclassyfire import *
from pandas import json_normalize
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import rdFMCS
from rdkit.Chem import PandasTools

INFO:rdkit:Enabling RDKit 2021.09.4 jupyter extensions


In [2]:
def isNaN(string):
    return string != string

# suspectlist generation for SIRIUS and MetFrag

In [26]:
def slist_sirius(input_dir, slist_csv, name, substring=None):

    """slist_sirius is used to create a tsv file that contains a list of
    SMILES. The function also runs the sirius command custom db to create
    fingerprints for each SMILES in a folder that we by default name as
    SL_Frag/. This fingerprints folder is later used by SIRIUS to use
    these compounds as a another small list of compounds to match against
    the input spectra fingerprints.
    Since SIRIUS doesn't take disconnected structure, Multiply charged,
    Incorrect syntax, wild card(*) in smiles; this function removes all
    such SMILES from the Suspect List.

    Parameters:
    input_dir (str): This is the input directory where all the .mzML
    files and their respective result directories are stored. For this
    function this directory must contain a csv file that has a column
    named "SMILES".

    slist_csv (str): This is the csv file that contains a column of
    "SMILES". Additionally this file can contain other information
    about the compounds, but for this function, column of "SMILES",
    named as "SMILES" is necessary.

    substring (list): provide a list of strings of SMILES that
    shouldn't be considered, provide a list even if there is one string
    that shouldnt be considered. e.g: "[Fe+2]".

    Returns:
    tsv: a tsv file of list of SMILES, named as SL_Sirius.tsv, is stored
    in input_dir
    directory: a directory with compound fragmentations will be created
    in a folder named SL_Frag/ within the same input_dir


    Usage:
    slist_sirius("/user/project/", "suspectlist.csv",
    substring = None)

    """

    sl = pd.read_csv(slist_csv)

    # define function to neutralize the charged SMILES
    def neutralize_atoms(mol):

        pattern = Chem.MolFromSmarts(
            "[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]"
        )
        at_matches = mol.GetSubstructMatches(pattern)
        at_matches_list = [y[0] for y in at_matches]
        if len(at_matches_list) > 0:
            for at_idx in at_matches_list:
                atom = mol.GetAtomWithIdx(at_idx)
                chg = atom.GetFormalCharge()
                hcount = atom.GetTotalNumHs()
                atom.SetFormalCharge(0)
                atom.SetNumExplicitHs(hcount - chg)
                atom.UpdatePropertyCache()
        return mol

    for i, row in sl.iterrows():
        # remove SMILES with wild card
        if "*" in sl["SMILES"][i]:
            sl = sl.drop(labels=i, axis=0)
    for i, row in sl.iterrows():
        # remove SMILES with any string present in the substring
        if substring:
            if bool([ele for ele in substring if (ele in sl["SMILES"][i])]):
                sl = sl.drop(labels=i, axis=0)
    for i, row in sl.iterrows():
        if "." in sl["SMILES"][i]:
            sl.loc[i, "SMILES"] = sl["SMILES"][i].split(".")[0]
    # Neutralize the charged SMILES
    for i, row in sl.iterrows():
        if "+" in sl["SMILES"][i] or "-" in sl["SMILES"][i]:
            mol = Chem.MolFromSmiles(sl["SMILES"][i])
            neutralize_atoms(mol)
            sl.loc[i, "SMILES"] = Chem.MolToSmiles(mol)

            # Remove multiple charged SMILES
            if "+" in sl["SMILES"][i] or "-" in sl["SMILES"][i]:
                pos = sl["SMILES"][i].count("+")
                neg = sl["SMILES"][i].count("-")
                charge = pos + neg
                if charge > 1:
                    sl = sl.drop(labels=i, axis=0)

    slsirius = pd.DataFrame({"smiles": sl["SMILES"]})
    slsirius.to_csv(input_dir + "SL_Sirius.tsv", sep="\t", header=False, index=False)
    print(
        "sirius --input "
        + input_dir
        + "SL_Sirius.tsv"
        + " --name "
        + name
        + " custom-db --output "
        + input_dir
         )
    os.system(
        "sirius --input "
        + input_dir
        + "SL_Sirius.tsv"
        + " --name"
        + name
        + " custom-db --output "
        + input_dir
    )


# In[ ]:
def slist_metfrag(input_dir, slist_csv, name):
    """slist_metfrag is used to create a txt file that contains a list of
    InChIKeys. This list is later used by MetFrag to use these compounds
    as a Suspect List.

    Parameters:
    input_dir (str): This is the input directory where all the .mzML
    files and their respective result directories are stored. For this
    function this directory must contain a csv file that has a column
    named "SMILES".

    slist_csv (str): This is the csv file that contains a column of
    "SMILES". Additionally this file can contain other information
    about the compounds, but for this function, column of "SMILES",
    named as "SMILES" is necessary.

    Returns:
    list: list of InChIKeys
    txt: a txt file of list of InChIKeys, is stored in input_dir

    Usage:
    slist_metfrag(input_dir = "/user/project/", slist_csv =
    "suspectlist.csv")

    """

    sl = pd.read_csv(slist_csv)
    sl_mtfrag = []
    for i, rows in sl.iterrows():
        if i is not None:
            mols = Chem.MolFromSmiles(sl["SMILES"][i])
            try:
                sl.loc[i, "InChIKey"] = Chem.inchi.MolToInchiKey(mols)
                sl_mtfrag.append(sl["InChIKey"][i])
            except Exception as e:
                print(e)

    with open((input_dir + "/SL_" + name + ".txt"), "w") as filehandle:
        for listitem in sl_mtfrag:
            filehandle.write("%s\n" % listitem)

    return sl_mtfrag

In [27]:
#slist_metfrag(input_dir = "/Users/mahnoorzulfiqar/OneDriveUNI/MAW-Diatom", slist_csv = "/Users/mahnoorzulfiqar/OneDriveUNI/SuspectList/SkeletonemaSuspectListV1.csv", name ="Smarinoi_MetFrag")

In [28]:
slist_sirius(input_dir = "/Users/mahnoorzulfiqar/OneDriveUNI/MAW-Diatom/", slist_csv = "/Users/mahnoorzulfiqar/OneDriveUNI/SuspectList/SkeletonemaSuspectListV1.csv", name = "SL_Smarinoi_Sirius", substring="[Fe+2]")

sirius --input /Users/mahnoorzulfiqar/OneDriveUNI/MAW-Diatom/SL_Sirius.tsv --name SL_Smarinoi_Sirius custom-db --output /Users/mahnoorzulfiqar/OneDriveUNI/MAW-Diatom/


Mar 07, 2023 12:58:32 PM org.apache.commons.beanutils.FluentPropertyBeanIntrospector introspect
INFO: Error when creating PropertyDescriptor for public final void org.apache.commons.configuration2.AbstractConfiguration.setProperty(java.lang.String,java.lang.Object)! Ignoring this property.
INFO    12:58:32 - Sirius Workspace Successfull initialized at: /Users/mahnoorzulfiqar/.sirius-5.6
INFO    12:58:32 - You run SIRIUS 5.6.3
INFO    12:58:32 - You run SIRIUS in 'CLI' mode.
INFO    12:58:33 - Sirius was compiled with the following ILP solvers: GLPK-v1.7.0 (included), Gurobi-v9.1.1, CPLEX-v12.7.1, COIN-OR-v1.17.3
INFO    12:58:33 - Treebuilder priorities loaded from 'sirius.properties' are: [CLP, GUROBI, CPLEX]
INFO    12:58:33 - CPU check done. 5 cores that handle 10 threads were found.
INFO    12:58:35 - Web API initialized.
INFO    12:58:35 - Running with following arguments: [--input, /Users/mahnoorzulfiqar/OneDriveUNI/MAW-Diatom/SL_Sirius.tsv, --nameSL_Smarinoi_Sirius, custom-db, -

os.system("sirius --input /Users/mahnoorzulfiqar/OneDriveUNI/MAW-Diatom/SL_Sirius.tsv --name SL_Smarinoi_Sirius custom-db")

# MetFrag-coconut

In [31]:
coco_db = PandasTools.LoadSDF("/Users/mahnoorzulfiqar/Downloads/COCONUT_DB.sdf")
#                     (input_dir + "/structures.sdf"),
#                     idName="HMDB_ID",
#                     smilesName="SMILES",
#                     molColName="Molecule",
#                     includeFingerprints=False)

RDKit ERROR: [13:46:53] Explicit valence for atom # 20 C, 6, is greater than permitted
[13:46:53] Explicit valence for atom # 20 C, 6, is greater than permitted
RDKit ERROR: [13:46:53] ERROR: Could not sanitize molecule ending on line 53
[13:46:53] ERROR: Could not sanitize molecule ending on line 53
RDKit ERROR: [13:46:53] ERROR: Explicit valence for atom # 20 C, 6, is greater than permitted
[13:46:53] ERROR: Explicit valence for atom # 20 C, 6, is greater than permitted
[13:46:58] Explicit valence for atom # 3 B, 4, is greater than permitted
RDKit ERROR: [13:46:58] Explicit valence for atom # 3 B, 4, is greater than permitted
RDKit ERROR: [13:46:58] ERROR: Could not sanitize molecule ending on line 329955
[13:46:58] ERROR: Could not sanitize molecule ending on line 329955
RDKit ERROR: [13:46:58] ERROR: Explicit valence for atom # 3 B, 4, is greater than permitted
[13:46:58] ERROR: Explicit valence for atom # 3 B, 4, is greater than permitted
[13:47:29] Explicit valence for atom # 1 N

RDKit ERROR: [13:47:31] ERROR: Explicit valence for atom # 1 N, 4, is greater than permitted
RDKit ERROR: [13:47:31] Explicit valence for atom # 1 N, 4, is greater than permitted
RDKit ERROR: [13:47:31] ERROR: Could not sanitize molecule ending on line 2368682
[13:47:31] ERROR: Explicit valence for atom # 1 N, 4, is greater than permitted
[13:47:31] Explicit valence for atom # 1 N, 4, is greater than permitted
[13:47:31] ERROR: Could not sanitize molecule ending on line 2368682
RDKit ERROR: [13:47:31] ERROR: Explicit valence for atom # 1 N, 4, is greater than permitted
[13:47:31] ERROR: Explicit valence for atom # 1 N, 4, is greater than permitted
[13:47:32] Explicit valence for atom # 1 N, 4, is greater than permitted
RDKit ERROR: [13:47:32] Explicit valence for atom # 1 N, 4, is greater than permitted
RDKit ERROR: [13:47:32] ERROR: Could not sanitize molecule ending on line 2379854
[13:47:32] ERROR: Could not sanitize molecule ending on line 2379854
RDKit ERROR: [13:47:32] ERROR: Exp

RDKit ERROR: [13:48:38] ERROR: Explicit valence for atom # 1 S, 8, is greater than permitted
[13:48:38] ERROR: Explicit valence for atom # 1 S, 8, is greater than permitted
[13:48:40] Explicit valence for atom # 1 C, 5, is greater than permitted
RDKit ERROR: [13:48:40] Explicit valence for atom # 1 C, 5, is greater than permitted
RDKit ERROR: [13:48:40] ERROR: Could not sanitize molecule ending on line 6462417
[13:48:40] ERROR: Could not sanitize molecule ending on line 6462417
RDKit ERROR: [13:48:40] ERROR: Explicit valence for atom # 1 C, 5, is greater than permitted
[13:48:40] ERROR: Explicit valence for atom # 1 C, 5, is greater than permitted
[13:48:40] Explicit valence for atom # 1 C, 5, is greater than permitted
RDKit ERROR: [13:48:40] Explicit valence for atom # 1 C, 5, is greater than permitted
RDKit ERROR: [13:48:40] ERROR: Could not sanitize molecule ending on line 6464060
[13:48:40] ERROR: Could not sanitize molecule ending on line 6464060
RDKit ERROR: [13:48:40] ERROR: Exp

RDKit ERROR: [13:49:41] ERROR: moving to the beginning of the next molecule
[13:49:41] ERROR: moving to the beginning of the next molecule
[13:50:01] 

****
Pre-condition Violation
bad dir
Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
****

[13:50:01] Unexpected error hit on line 11893311
RDKit ERROR: [13:50:01] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bad dir
RDKit ERROR: Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
RDKit ERROR: Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [13:50:01] Unexpected error hit on line 11893311
R

RDKit ERROR: [13:50:57] ERROR: moving to the beginning of the next molecule
[13:50:57] ERROR: moving to the beginning of the next molecule
[13:51:04] 

****
Pre-condition Violation
bad dir
Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
****

[13:51:04] Unexpected error hit on line 15899577
RDKit ERROR: [13:51:04] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bad dir
RDKit ERROR: Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
RDKit ERROR: Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [13:51:04] Unexpected error hit on line 15899577
R

RDKit ERROR: [13:52:40] ERROR: moving to the beginning of the next molecule
[13:52:40] ERROR: moving to the beginning of the next molecule
[13:52:49] 

****
Pre-condition Violation
bad dir
Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
****

[13:52:49] Unexpected error hit on line 22569486
RDKit ERROR: [13:52:49] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bad dir
RDKit ERROR: Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
RDKit ERROR: Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [13:52:49] Unexpected error hit on line 22569486
R

RDKit ERROR: [13:53:46] ERROR: moving to the beginning of the next molecule
[13:53:46] ERROR: moving to the beginning of the next molecule
[13:53:47] Explicit valence for atom # 18 O, 3, is greater than permitted
RDKit ERROR: [13:53:47] Explicit valence for atom # 18 O, 3, is greater than permitted
RDKit ERROR: [13:53:47] ERROR: Could not sanitize molecule ending on line 26226250
[13:53:47] ERROR: Could not sanitize molecule ending on line 26226250
RDKit ERROR: [13:53:47] ERROR: Explicit valence for atom # 18 O, 3, is greater than permitted
[13:53:47] ERROR: Explicit valence for atom # 18 O, 3, is greater than permitted
[13:53:52] Explicit valence for atom # 19 H, 2, is greater than permitted
RDKit ERROR: [13:53:52] Explicit valence for atom # 19 H, 2, is greater than permitted
RDKit ERROR: [13:53:52] ERROR: Could not sanitize molecule ending on line 26579803
[13:53:52] ERROR: Could not sanitize molecule ending on line 26579803
RDKit ERROR: [13:53:52] ERROR: Explicit valence for atom #

RDKit ERROR: [13:54:54] ERROR: Explicit valence for atom # 8 N, 4, is greater than permitted
[13:54:54] ERROR: Explicit valence for atom # 8 N, 4, is greater than permitted
[13:54:56] 

****
Pre-condition Violation
bad dir
Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
****

[13:54:56] Unexpected error hit on line 30659884
RDKit ERROR: [13:54:56] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bad dir
RDKit ERROR: Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
RDKit ERROR: Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [13:54:56] Unexp

RDKit ERROR: [13:55:48] ERROR: moving to the beginning of the next molecule
[13:55:48] ERROR: moving to the beginning of the next molecule
[13:55:49] 

****
Pre-condition Violation
bad dir
Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
****

[13:55:49] Unexpected error hit on line 34068982
RDKit ERROR: [13:55:49] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bad dir
RDKit ERROR: Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
RDKit ERROR: Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [13:55:49] Unexpected error hit on line 34068982
R

RDKit ERROR: [13:56:51] ERROR: moving to the beginning of the next molecule
[13:56:51] ERROR: moving to the beginning of the next molecule
[13:56:52] Explicit valence for atom # 20 H, 2, is greater than permitted
RDKit ERROR: [13:56:52] Explicit valence for atom # 20 H, 2, is greater than permitted
RDKit ERROR: [13:56:52] ERROR: Could not sanitize molecule ending on line 38107898
[13:56:52] ERROR: Could not sanitize molecule ending on line 38107898
RDKit ERROR: [13:56:52] ERROR: Explicit valence for atom # 20 H, 2, is greater than permitted
[13:56:52] ERROR: Explicit valence for atom # 20 H, 2, is greater than permitted
[13:56:57] Explicit valence for atom # 9 N, 4, is greater than permitted
RDKit ERROR: [13:56:57] Explicit valence for atom # 9 N, 4, is greater than permitted
RDKit ERROR: [13:56:57] ERROR: Could not sanitize molecule ending on line 38378324
[13:56:57] ERROR: Could not sanitize molecule ending on line 38378324
RDKit ERROR: [13:56:57] ERROR: Explicit valence for atom # 9

RDKit ERROR: [13:57:24] ERROR: Explicit valence for atom # 6 H, 2, is greater than permitted
[13:57:24] ERROR: Explicit valence for atom # 6 H, 2, is greater than permitted
[13:57:25] 

****
Pre-condition Violation
bad dir
Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
****

[13:57:25] Unexpected error hit on line 40079925
RDKit ERROR: [13:57:25] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bad dir
RDKit ERROR: Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
RDKit ERROR: Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [13:57:25] Unexp

RDKit ERROR: [13:57:46] ERROR: moving to the beginning of the next molecule
[13:57:46] ERROR: moving to the beginning of the next molecule
[13:57:53] 

****
Pre-condition Violation
bad dir
Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
****

[13:57:53] Unexpected error hit on line 41909698
RDKit ERROR: [13:57:53] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bad dir
RDKit ERROR: Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
RDKit ERROR: Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [13:57:53] Unexpected error hit on line 41909698
R

RDKit ERROR: [13:58:23] ERROR: moving to the beginning of the next molecule
[13:58:23] ERROR: moving to the beginning of the next molecule
[13:58:23] 

****
Pre-condition Violation
bad dir
Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
****

[13:58:23] Unexpected error hit on line 43801584
RDKit ERROR: [13:58:23] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bad dir
RDKit ERROR: Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
RDKit ERROR: Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [13:58:23] Unexpected error hit on line 43801584
R

RDKit ERROR: [13:58:26] ERROR: moving to the beginning of the next molecule
[13:58:26] ERROR: moving to the beginning of the next molecule
[13:58:26] 

****
Pre-condition Violation
bad dir
Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
****

[13:58:26] Unexpected error hit on line 43986719
RDKit ERROR: [13:58:26] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bad dir
RDKit ERROR: Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
RDKit ERROR: Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [13:58:26] Unexpected error hit on line 43986719
R

RDKit ERROR: [13:58:29] ERROR: moving to the beginning of the next molecule
[13:58:29] ERROR: moving to the beginning of the next molecule
[13:58:29] 

****
Pre-condition Violation
bad dir
Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
****

[13:58:29] Unexpected error hit on line 44158464
RDKit ERROR: [13:58:29] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bad dir
RDKit ERROR: Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
RDKit ERROR: Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [13:58:29] Unexpected error hit on line 44158464
R

RDKit ERROR: [13:58:39] ERROR: moving to the beginning of the next molecule
[13:58:39] ERROR: moving to the beginning of the next molecule
[13:58:39] 

****
Pre-condition Violation
bad dir
Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
****

[13:58:39] Unexpected error hit on line 44775632
RDKit ERROR: [13:58:39] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bad dir
RDKit ERROR: Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
RDKit ERROR: Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [13:58:39] Unexpected error hit on line 44775632
R

RDKit ERROR: [13:58:41] ERROR: moving to the beginning of the next molecule
[13:58:41] ERROR: moving to the beginning of the next molecule
[13:58:48] 

****
Pre-condition Violation
bad dir
Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
****

[13:58:48] Unexpected error hit on line 45295797
RDKit ERROR: [13:58:48] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bad dir
RDKit ERROR: Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
RDKit ERROR: Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [13:58:48] Unexpected error hit on line 45295797
R

[13:58:54] ERROR: moving to the beginning of the next molecule
[13:58:54] 

****
Pre-condition Violation
bad dir
Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
****

[13:58:54] Unexpected error hit on line 45715317
RDKit ERROR: [13:58:54] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bad dir
RDKit ERROR: Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
RDKit ERROR: Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [13:58:54] Unexpected error hit on line 45715317
RDKit ERROR: [13:58:54] ERROR: moving to the beginning of the next molecule
[

RDKit ERROR: [13:59:37] ERROR: moving to the beginning of the next molecule
[13:59:37] ERROR: moving to the beginning of the next molecule
[13:59:48] 

****
Pre-condition Violation
bad dir
Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
****

[13:59:48] Unexpected error hit on line 49028996
RDKit ERROR: [13:59:48] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: bad dir
RDKit ERROR: Violation occurred on line 386 in file /private/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/pip-req-build-sxoc5k67/build/temp.macosx-10.9-x86_64-3.10/rdkit/Code/GraphMol/Chirality.cpp
RDKit ERROR: Failed Expression: dir == Bond::ENDUPRIGHT || dir == Bond::ENDDOWNRIGHT
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [13:59:48] Unexpected error hit on line 49028996
R

RDKit ERROR: [14:00:51] ERROR: Explicit valence for atom # 11 N, 4, is greater than permitted
RDKit ERROR: [14:00:51] Explicit valence for atom # 13 N, 4, is greater than permitted
RDKit ERROR: [14:00:51] ERROR: Could not sanitize molecule ending on line 53121680
[14:00:51] ERROR: Explicit valence for atom # 11 N, 4, is greater than permitted
[14:00:51] Explicit valence for atom # 13 N, 4, is greater than permitted
[14:00:51] ERROR: Could not sanitize molecule ending on line 53121680
RDKit ERROR: [14:00:51] ERROR: Explicit valence for atom # 13 N, 4, is greater than permitted
RDKit ERROR: [14:00:51] Explicit valence for atom # 6 N, 4, is greater than permitted
RDKit ERROR: [14:00:51] ERROR: Could not sanitize molecule ending on line 53122357
[14:00:51] ERROR: Explicit valence for atom # 13 N, 4, is greater than permitted
[14:00:51] Explicit valence for atom # 6 N, 4, is greater than permitted
[14:00:51] ERROR: Could not sanitize molecule ending on line 53122357
RDKit ERROR: [14:00:51] 

RDKit ERROR: [14:00:56] ERROR: Explicit valence for atom # 8 N, 4, is greater than permitted
[14:00:56] ERROR: Explicit valence for atom # 8 N, 4, is greater than permitted
[14:00:57] Explicit valence for atom # 22 N, 4, is greater than permitted
RDKit ERROR: [14:00:57] Explicit valence for atom # 22 N, 4, is greater than permitted
RDKit ERROR: [14:00:57] ERROR: Could not sanitize molecule ending on line 53475405
[14:00:57] ERROR: Could not sanitize molecule ending on line 53475405
RDKit ERROR: [14:00:57] ERROR: Explicit valence for atom # 22 N, 4, is greater than permitted
[14:00:57] ERROR: Explicit valence for atom # 22 N, 4, is greater than permitted
[14:00:57] Explicit valence for atom # 18 N, 4, is greater than permitted
RDKit ERROR: [14:00:57] Explicit valence for atom # 18 N, 4, is greater than permitted
RDKit ERROR: [14:00:57] ERROR: Could not sanitize molecule ending on line 53481129
[14:00:57] ERROR: Could not sanitize molecule ending on line 53481129
RDKit ERROR: [14:00:57] 

RDKit ERROR: [14:01:04] ERROR: Explicit valence for atom # 18 H, 2, is greater than permitted
[14:01:04] ERROR: Explicit valence for atom # 18 H, 2, is greater than permitted
[14:01:05] Explicit valence for atom # 17 N, 4, is greater than permitted
RDKit ERROR: [14:01:05] Explicit valence for atom # 17 N, 4, is greater than permitted
RDKit ERROR: [14:01:05] ERROR: Could not sanitize molecule ending on line 53994033
[14:01:05] ERROR: Could not sanitize molecule ending on line 53994033
RDKit ERROR: [14:01:05] ERROR: Explicit valence for atom # 17 N, 4, is greater than permitted
[14:01:05] ERROR: Explicit valence for atom # 17 N, 4, is greater than permitted
[14:01:05] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [14:01:05] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [14:01:05] ERROR: Could not sanitize molecule ending on line 54021123
[14:01:05] ERROR: Could not sanitize molecule ending on line 54021123
RDKit ERROR: [14:01:05

RDKit ERROR: [14:01:10] ERROR: Explicit valence for atom # 23 N, 4, is greater than permitted
[14:01:10] ERROR: Explicit valence for atom # 23 N, 4, is greater than permitted
[14:01:10] Explicit valence for atom # 4 N, 4, is greater than permitted
RDKit ERROR: [14:01:10] Explicit valence for atom # 4 N, 4, is greater than permitted
RDKit ERROR: [14:01:10] ERROR: Could not sanitize molecule ending on line 54352232
[14:01:10] ERROR: Could not sanitize molecule ending on line 54352232
RDKit ERROR: [14:01:10] ERROR: Explicit valence for atom # 4 N, 4, is greater than permitted
RDKit ERROR: [14:01:11] Explicit valence for atom # 8 N, 4, is greater than permitted
RDKit ERROR: [14:01:11] ERROR: Could not sanitize molecule ending on line 54392943
[14:01:10] ERROR: Explicit valence for atom # 4 N, 4, is greater than permitted
[14:01:11] Explicit valence for atom # 8 N, 4, is greater than permitted
[14:01:11] ERROR: Could not sanitize molecule ending on line 54392943
RDKit ERROR: [14:01:11] ERRO

RDKit ERROR: [14:01:16] ERROR: Explicit valence for atom # 13 N, 4, is greater than permitted
[14:01:16] ERROR: Explicit valence for atom # 13 N, 4, is greater than permitted
[14:01:16] Explicit valence for atom # 9 N, 4, is greater than permitted
RDKit ERROR: [14:01:16] Explicit valence for atom # 9 N, 4, is greater than permitted
RDKit ERROR: [14:01:16] ERROR: Could not sanitize molecule ending on line 54746806
[14:01:16] ERROR: Could not sanitize molecule ending on line 54746806
RDKit ERROR: [14:01:16] ERROR: Explicit valence for atom # 9 N, 4, is greater than permitted
[14:01:16] ERROR: Explicit valence for atom # 9 N, 4, is greater than permitted
[14:01:16] Explicit valence for atom # 20 N, 4, is greater than permitted
RDKit ERROR: [14:01:16] Explicit valence for atom # 20 N, 4, is greater than permitted
RDKit ERROR: [14:01:16] ERROR: Could not sanitize molecule ending on line 54766602
RDKit ERROR: [14:01:16] ERROR: Explicit valence for atom # 20 N, 4, is greater than permitted
RD

RDKit ERROR: [14:01:23] Explicit valence for atom # 15 N, 4, is greater than permitted
RDKit ERROR: [14:01:23] ERROR: Could not sanitize molecule ending on line 55227559
[14:01:22] ERROR: Explicit valence for atom # 11 N, 4, is greater than permitted
[14:01:23] Explicit valence for atom # 15 N, 4, is greater than permitted
[14:01:23] ERROR: Could not sanitize molecule ending on line 55227559
RDKit ERROR: [14:01:23] ERROR: Explicit valence for atom # 15 N, 4, is greater than permitted
[14:01:23] ERROR: Explicit valence for atom # 15 N, 4, is greater than permitted
[14:01:23] Explicit valence for atom # 37 N, 4, is greater than permitted
RDKit ERROR: [14:01:23] Explicit valence for atom # 37 N, 4, is greater than permitted
RDKit ERROR: [14:01:23] ERROR: Could not sanitize molecule ending on line 55245499
RDKit ERROR: [14:01:23] ERROR: Explicit valence for atom # 37 N, 4, is greater than permitted
[14:01:23] ERROR: Could not sanitize molecule ending on line 55245499
[14:01:23] ERROR: Expl

RDKit ERROR: [14:01:30] ERROR: Explicit valence for atom # 18 N, 4, is greater than permitted
[14:01:30] ERROR: Explicit valence for atom # 18 N, 4, is greater than permitted
[14:01:31] Explicit valence for atom # 28 O, 3, is greater than permitted
RDKit ERROR: [14:01:31] Explicit valence for atom # 28 O, 3, is greater than permitted
RDKit ERROR: [14:01:31] ERROR: Could not sanitize molecule ending on line 55728782
[14:01:31] ERROR: Could not sanitize molecule ending on line 55728782
RDKit ERROR: [14:01:31] ERROR: Explicit valence for atom # 28 O, 3, is greater than permitted
[14:01:31] ERROR: Explicit valence for atom # 28 O, 3, is greater than permitted
[14:01:32] Explicit valence for atom # 9 N, 4, is greater than permitted
RDKit ERROR: [14:01:32] Explicit valence for atom # 9 N, 4, is greater than permitted
RDKit ERROR: [14:01:32] ERROR: Could not sanitize molecule ending on line 55780103
[14:01:32] ERROR: Could not sanitize molecule ending on line 55780103
RDKit ERROR: [14:01:32] 

RDKit ERROR: [14:01:35] ERROR: Explicit valence for atom # 25 N, 4, is greater than permitted
[14:01:35] ERROR: Explicit valence for atom # 25 N, 4, is greater than permitted
[14:01:36] Explicit valence for atom # 18 N, 4, is greater than permitted
RDKit ERROR: [14:01:36] Explicit valence for atom # 18 N, 4, is greater than permitted
RDKit ERROR: [14:01:36] ERROR: Could not sanitize molecule ending on line 56062115
[14:01:36] ERROR: Could not sanitize molecule ending on line 56062115
RDKit ERROR: [14:01:36] ERROR: Explicit valence for atom # 18 N, 4, is greater than permitted
[14:01:36] ERROR: Explicit valence for atom # 18 N, 4, is greater than permitted
[14:01:36] Explicit valence for atom # 34 N, 4, is greater than permitted
RDKit ERROR: [14:01:36] Explicit valence for atom # 34 N, 4, is greater than permitted
RDKit ERROR: [14:01:36] ERROR: Could not sanitize molecule ending on line 56085314
[14:01:36] ERROR: Could not sanitize molecule ending on line 56085314
RDKit ERROR: [14:01:36

RDKit ERROR: [14:01:41] ERROR: Explicit valence for atom # 22 N, 4, is greater than permitted
[14:01:41] ERROR: Explicit valence for atom # 22 N, 4, is greater than permitted
[14:01:41] Explicit valence for atom # 6 N, 4, is greater than permitted
RDKit ERROR: [14:01:41] Explicit valence for atom # 6 N, 4, is greater than permitted
RDKit ERROR: [14:01:41] ERROR: Could not sanitize molecule ending on line 56424172
[14:01:41] ERROR: Could not sanitize molecule ending on line 56424172
RDKit ERROR: [14:01:41] ERROR: Explicit valence for atom # 6 N, 4, is greater than permitted
[14:01:41] ERROR: Explicit valence for atom # 6 N, 4, is greater than permitted
[14:01:42] Explicit valence for atom # 35 N, 4, is greater than permitted
RDKit ERROR: [14:01:42] Explicit valence for atom # 35 N, 4, is greater than permitted
RDKit ERROR: [14:01:42] ERROR: Could not sanitize molecule ending on line 56484378
[14:01:42] ERROR: Could not sanitize molecule ending on line 56484378
RDKit ERROR: [14:01:42] ER

RDKit ERROR: [14:01:47] ERROR: Explicit valence for atom # 11 N, 4, is greater than permitted
[14:01:47] ERROR: Explicit valence for atom # 11 N, 4, is greater than permitted
[14:01:48] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [14:01:48] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [14:01:48] ERROR: Could not sanitize molecule ending on line 56865335
[14:01:48] ERROR: Could not sanitize molecule ending on line 56865335
RDKit ERROR: [14:01:48] ERROR: Explicit valence for atom # 12 N, 4, is greater than permitted
[14:01:48] ERROR: Explicit valence for atom # 12 N, 4, is greater than permitted
[14:01:49] Explicit valence for atom # 3 N, 4, is greater than permitted
RDKit ERROR: [14:01:49] Explicit valence for atom # 3 N, 4, is greater than permitted
RDKit ERROR: [14:01:49] ERROR: Could not sanitize molecule ending on line 56932333
[14:01:49] ERROR: Could not sanitize molecule ending on line 56932333
RDKit ERROR: [14:01:49] 

RDKit ERROR: [14:01:55] ERROR: Explicit valence for atom # 17 N, 4, is greater than permitted
[14:01:55] ERROR: Explicit valence for atom # 17 N, 4, is greater than permitted
[14:01:55] Explicit valence for atom # 10 N, 4, is greater than permitted
RDKit ERROR: [14:01:55] Explicit valence for atom # 10 N, 4, is greater than permitted
RDKit ERROR: [14:01:55] ERROR: Could not sanitize molecule ending on line 57363453
[14:01:55] ERROR: Could not sanitize molecule ending on line 57363453
RDKit ERROR: [14:01:55] ERROR: Explicit valence for atom # 10 N, 4, is greater than permitted
[14:01:55] ERROR: Explicit valence for atom # 10 N, 4, is greater than permitted
[14:01:55] Explicit valence for atom # 18 N, 4, is greater than permitted
RDKit ERROR: [14:01:55] Explicit valence for atom # 18 N, 4, is greater than permitted
RDKit ERROR: [14:01:55] ERROR: Could not sanitize molecule ending on line 57371344
[14:01:55] ERROR: Could not sanitize molecule ending on line 57371344
RDKit ERROR: [14:01:55

RDKit ERROR: [14:02:00] ERROR: Explicit valence for atom # 4 N, 4, is greater than permitted
[14:02:00] ERROR: Explicit valence for atom # 4 N, 4, is greater than permitted
[14:02:01] Explicit valence for atom # 9 N, 4, is greater than permitted
RDKit ERROR: [14:02:01] Explicit valence for atom # 9 N, 4, is greater than permitted
RDKit ERROR: [14:02:01] ERROR: Could not sanitize molecule ending on line 57745862
[14:02:01] ERROR: Could not sanitize molecule ending on line 57745862
RDKit ERROR: [14:02:01] ERROR: Explicit valence for atom # 9 N, 4, is greater than permitted
[14:02:01] ERROR: Explicit valence for atom # 9 N, 4, is greater than permitted
[14:02:01] Explicit valence for atom # 24 N, 4, is greater than permitted
RDKit ERROR: [14:02:01] Explicit valence for atom # 24 N, 4, is greater than permitted
RDKit ERROR: [14:02:01] ERROR: Could not sanitize molecule ending on line 57760239
[14:02:01] ERROR: Could not sanitize molecule ending on line 57760239
RDKit ERROR: [14:02:01] ERRO

RDKit ERROR: [14:02:05] ERROR: Explicit valence for atom # 8 N, 5, is greater than permitted
[14:02:05] ERROR: Explicit valence for atom # 8 N, 5, is greater than permitted
[14:02:06] Explicit valence for atom # 63 N, 4, is greater than permitted
RDKit ERROR: [14:02:06] Explicit valence for atom # 63 N, 4, is greater than permitted
RDKit ERROR: [14:02:06] ERROR: Could not sanitize molecule ending on line 58029322
[14:02:06] ERROR: Could not sanitize molecule ending on line 58029322
RDKit ERROR: [14:02:06] ERROR: Explicit valence for atom # 63 N, 4, is greater than permitted
[14:02:06] ERROR: Explicit valence for atom # 63 N, 4, is greater than permitted
[14:02:06] Explicit valence for atom # 8 N, 4, is greater than permitted
RDKit ERROR: [14:02:06] Explicit valence for atom # 8 N, 4, is greater than permitted
RDKit ERROR: [14:02:06] ERROR: Could not sanitize molecule ending on line 58087367
[14:02:06] ERROR: Could not sanitize molecule ending on line 58087367
RDKit ERROR: [14:02:06] ER

[14:02:10] ERROR: Could not sanitize molecule ending on line 58344731
RDKit ERROR: [14:02:10] ERROR: Explicit valence for atom # 22 N, 4, is greater than permitted
[14:02:10] ERROR: Explicit valence for atom # 22 N, 4, is greater than permitted
[14:02:11] Explicit valence for atom # 8 N, 5, is greater than permitted
RDKit ERROR: [14:02:11] Explicit valence for atom # 8 N, 5, is greater than permitted
RDKit ERROR: [14:02:11] ERROR: Could not sanitize molecule ending on line 58366666
[14:02:11] ERROR: Could not sanitize molecule ending on line 58366666
RDKit ERROR: [14:02:11] ERROR: Explicit valence for atom # 8 N, 5, is greater than permitted
[14:02:11] ERROR: Explicit valence for atom # 8 N, 5, is greater than permitted
[14:02:11] Explicit valence for atom # 35 N, 4, is greater than permitted
RDKit ERROR: [14:02:11] Explicit valence for atom # 35 N, 4, is greater than permitted
RDKit ERROR: [14:02:11] ERROR: Could not sanitize molecule ending on line 58392121
[14:02:11] ERROR: Could no

RDKit ERROR: [14:02:17] ERROR: Explicit valence for atom # 10 N, 4, is greater than permitted
[14:02:17] ERROR: Explicit valence for atom # 10 N, 4, is greater than permitted
[14:02:18] Explicit valence for atom # 23 N, 4, is greater than permitted
RDKit ERROR: [14:02:18] Explicit valence for atom # 23 N, 4, is greater than permitted
RDKit ERROR: [14:02:18] ERROR: Could not sanitize molecule ending on line 58823287
[14:02:18] ERROR: Could not sanitize molecule ending on line 58823287
RDKit ERROR: [14:02:18] ERROR: Explicit valence for atom # 23 N, 4, is greater than permitted
[14:02:18] ERROR: Explicit valence for atom # 23 N, 4, is greater than permitted
[14:02:18] Explicit valence for atom # 1 N, 4, is greater than permitted
RDKit ERROR: [14:02:18] Explicit valence for atom # 1 N, 4, is greater than permitted
RDKit ERROR: [14:02:18] ERROR: Could not sanitize molecule ending on line 58835996
[14:02:18] ERROR: Could not sanitize molecule ending on line 58835996
RDKit ERROR: [14:02:18] 

RDKit ERROR: [14:02:24] ERROR: Explicit valence for atom # 15 N, 4, is greater than permitted
[14:02:24] ERROR: Explicit valence for atom # 15 N, 4, is greater than permitted
[14:02:24] Explicit valence for atom # 4 N, 4, is greater than permitted
RDKit ERROR: [14:02:24] Explicit valence for atom # 4 N, 4, is greater than permitted
RDKit ERROR: [14:02:24] ERROR: Could not sanitize molecule ending on line 59241088
[14:02:24] ERROR: Could not sanitize molecule ending on line 59241088
RDKit ERROR: [14:02:24] ERROR: Explicit valence for atom # 4 N, 4, is greater than permitted
[14:02:24] ERROR: Explicit valence for atom # 4 N, 4, is greater than permitted
[14:02:25] Explicit valence for atom # 20 N, 4, is greater than permitted
RDKit ERROR: [14:02:25] Explicit valence for atom # 20 N, 4, is greater than permitted
RDKit ERROR: [14:02:25] ERROR: Could not sanitize molecule ending on line 59285328
[14:02:25] ERROR: Could not sanitize molecule ending on line 59285328
RDKit ERROR: [14:02:25] ER

In [33]:
coco_db.columns

Index(['coconut_id', 'inchi', 'inchikey', 'SMILES', 'sugar_free_smiles',
       'molecular_formula', 'molecular_weight', 'citationDOI', 'textTaxa',
       'name', 'synonyms', 'NPL_score', 'number_of_carbons',
       'number_of_nitrogens', 'number_of_oxygens', 'number_of_rings',
       'total_atom_number', 'bond_count', 'found_in_databases',
       'murko_framework', 'alogp', 'apol', 'topoPSA', 'ID', 'ROMol'],
      dtype='object')

In [40]:
coco_db2 = coco_db[['coconut_id', 'inchi', 'inchikey', 'SMILES', 'molecular_formula', 'molecular_weight', 'textTaxa',
       'name', 'synonyms', 'NPL_score']]

In [47]:
coco_db2.columns = coco_db2.columns.str.replace('identifier', 'Identifier')
coco_db2.columns = coco_db2.columns.str.replace("inchi", "InChI")
coco_db2.columns = coco_db2.columns.str.replace("inchikey", "InChIKey")
coco_db2.columns = coco_db2.columns.str.replace("molecular_formula", "MolecularFormula")
coco_db2.columns = coco_db2.columns.str.replace("name", "CompoundName")
coco_db2.columns = coco_db2.columns.str.replace("synonym", "Synonym")

In [61]:
coco_db2.to_csv("/Users/mahnoorzulfiqar/OneDriveUNI/GitHub-Repos/COCONUT_Jan2022.csv")

In [64]:
slist_csv = pd.read_csv("/Users/mahnoorzulfiqar/OneDriveUNI/SuspectList/SkeletonemaSuspectListV1.csv")
del slist_csv["Unnamed: 0.3"]
del slist_csv["Unnamed: 0.2"]
del slist_csv["Unnamed: 0.1"]
del slist_csv["Unnamed: 0"]

In [66]:
slist_csv.columns

Index(['Name', 'Formula', 'Species', 'SMILES', 'InChI', 'Monoisotopic_mass',
       'ChEBIid', 'KEGGid', 'PubChemId', 'source_database', 'Source',
       'nonIsomeric_SMILES_byRDKit', 'iupac', 'synonyms', 'PubChemPY',
       'correct_Name', 'Molecular mass', 'subclass', 'class', 'superclass',
       'Enzymes'],
      dtype='object')

In [67]:
#slist_csv.columns = slist_csv.columns.str.replace('identifier', 'Identifier')
slist_csv.columns = slist_csv.columns.str.replace("Monoisotopic_mass", "MonoisotopicMass")
slist_csv.columns = slist_csv.columns.str.replace("Formula", "MolecularFormula")
slist_csv.columns = slist_csv.columns.str.replace("Name", "CompoundName")
slist_csv.columns = slist_csv.columns.str.replace("synonyms", "Synonym")

In [68]:
slist_csv.columns

Index(['CompoundName', 'MolecularFormula', 'Species', 'SMILES', 'InChI',
       'MonoisotopicMass', 'ChEBIid', 'KEGGid', 'PubChemId', 'source_database',
       'Source', 'nonIsomeric_SMILES_byRDKit', 'iupac', 'Synonym', 'PubChemPY',
       'correct_CompoundName', 'Molecular mass', 'subclass', 'class',
       'superclass', 'Enzymes'],
      dtype='object')

In [72]:
for i, rows in slist_csv.iterrows():
    if i is not None:
        mols = Chem.MolFromSmiles(slist_csv["SMILES"][i])
        try:
            slist_csv.loc[i, "InChIKey"] = Chem.inchi.MolToInchiKey(mols)
        except Exception as e:
            print(e)

RDKit ERROR: [14:02:26] ERROR: Explicit valence for atom # 8 N, 4, is greater than permitted
[14:02:26] ERROR: Explicit valence for atom # 8 N, 4, is greater than permitted
[15:09:12] Invalid InChI prefix in generating InChI Key
RDKit ERROR: [15:09:12] Invalid InChI prefix in generating InChI Key
RDKit ERROR: [15:09:12] Invalid InChI prefix in generating InChI Key
[15:09:12] Invalid InChI prefix in generating InChI Key


In [76]:
for i, rows in slist_csv.iterrows():
    slist_csv.loc[i, "Identifier"] = "SL_"+str(i)

In [75]:
slist_csv.to_csv("/Users/mahnoorzulfiqar/OneDriveUNI/GitHub-Repos/SkeletonemaSuspectListV1_correctedColumns.csv")

In [78]:
slist_csv.columns

Index(['CompoundName', 'MolecularFormula', 'Species', 'SMILES', 'InChI',
       'MonoisotopicMass', 'ChEBIid', 'KEGGid', 'PubChemId', 'source_database',
       'Source', 'nonIsomeric_SMILES_byRDKit', 'iupac', 'Synonym', 'PubChemPY',
       'correct_CompoundName', 'Molecular mass', 'subclass', 'class',
       'superclass', 'Enzymes', 'InChIKey', 'Identifier'],
      dtype='object')

In [79]:
slist_csv = slist_csv[['Identifier', 'CompoundName', 'MolecularFormula', 'Species', 'SMILES', 'InChI',
       'MonoisotopicMass', 'ChEBIid', 'KEGGid', 'PubChemId', 'source_database',
       'Source', 'nonIsomeric_SMILES_byRDKit', 'iupac', 'Synonym', 'PubChemPY',
       'correct_CompoundName', 'Molecular mass', 'subclass', 'class',
       'superclass', 'Enzymes', 'InChIKey']]

In [80]:
slist_csv

Unnamed: 0,Identifier,CompoundName,MolecularFormula,Species,SMILES,InChI,MonoisotopicMass,ChEBIid,KEGGid,PubChemId,...,iupac,Synonym,PubChemPY,correct_CompoundName,Molecular mass,subclass,class,superclass,Enzymes,InChIKey
0,SL_0,N-Acetyl-L-glutamic acid,C7H11NO5,S. marinoi,CC(=O)N[C@@H](CCC(=O)O)C(=O)O,InChI=1S/C7H11NO5/c1-4(9)8-5(7(12)13)2-3-6(10)...,189.063723,CHEBI:44337,C00624,70914.0,...,(2S)-2-acetamidopentanedioic acid,"['N-Acetyl-L-glutamic acid', '1188-37-0', 'ace...",checked; NEW name,,189.17,"Amino acids, peptides, and analogues",Carboxylic acids and derivatives,Organic acids and derivatives,,RFMMMVDNIPUKGG-YFKPBYRVSA-N
1,SL_1,Threonic acid,C4H8O5,S. marinoi,C(C(C(C(=O)O)O)O)O,"InChI=1S/C4H8O5/c5-1-2(6)3(7)4(8)9/h2-3,5-7H,1...",136.037173,CHEBI:26984,,439535.0,...,"2,3,4-trihydroxybutanoic acid",,checked; structure added,,136.10,Carbohydrates and carbohydrate conjugates,Organooxygen compounds,Organic oxygen compounds,,JPIJQSOTBSSVTP-UHFFFAOYSA-N
2,SL_2,L-citrulline,C6H13N3O3,S. costatum,C(C[C@@H](C(=O)O)N)CNC(=O)N,InChI=1S/C6H13N3O3/c7-4(5(10)11)2-1-3-9-6(8)12...,175.095691,CHEBI:16349,C00327,9750.0,...,(2S)-2-amino-5-(carbamoylamino)pentanoic acid,"['L-citrulline', 'citrulline', '372-75-8', 'H-...",checked; NEW name,,175.19,"Amino acids, peptides, and analogues",Carboxylic acids and derivatives,Organic acids and derivatives,,RHGKLRLOHDJJDR-BYPYZUCNSA-N
3,SL_3,DL-HISTIDINE,C6H9N3O2,S. marinoi and S. costatum,NC(Cc1c[nH]cn1)C(O)=O,InChI=1S/C6H9N3O2/c7-5(6(10)11)1-4-2-8-3-9-4/h...,155.069477,CHEBI:27570,,773.0,...,2-amino-3-(1H-imidazol-5-yl)propanoic acid,"['DL-HISTIDINE', '4998-57-6', 'H-DL-His-OH', '...",checked; NEW name,,155.15,"Amino acids, peptides, and analogues",Carboxylic acids and derivatives,Organic acids and derivatives,,HNDVDQJCIGZPNO-UHFFFAOYSA-N
4,SL_4,DL-Leucine,C6H13NO2,S. marinoi and S. costatum,CC(C)CC(N)C(O)=O,"InChI=1S/C6H13NO2/c1-4(2)3-5(7)6(8)9/h4-5H,3,7...",131.094629,CHEBI:25017,,857.0,...,2-amino-4-methylpentanoic acid,"['DL-Leucine', '328-39-2', 'H-DL-Leu-OH', '2-A...",checked; NEW name,,131.17,"Amino acids, peptides, and analogues",Carboxylic acids and derivatives,Organic acids and derivatives,,ROHFNLRQFUQHCH-UHFFFAOYSA-N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
881,SL_881,Phosphatidylethanolamine,C9H18NO8P,S.marinoi,CC(=O)OC[C@H](COP(=O)(O)OCCN)OC(=O)C,InChI=1S/C9H18NO8P/c1-7(11)15-5-9(18-8(2)12)6-...,299.077004,,,5327011.0,...,[(2R)-2-acetyloxy-3-[2-aminoethoxy(hydroxy)pho...,"['UNII-7CMB6B4449', '7CMB6B4449', '[(2R)-2-ace...",,,299.21,Glycerophosphoethanolamines,Glycerophospholipids,Lipids and lipid-like molecules,,CFWRDBDJAOHXSH-SECBINFHSA-N
882,SL_882,cholesterol sulphate,C27H46O4S,S.marinoi,C[C@H](CCCC(C)C)[C@H]1CC[C@@H]2[C@@]1(CC[C@H]3...,InChI=1S/C27H46O4S/c1-18(2)7-6-8-19(3)23-11-12...,466.311681,,,65076.0,...,"[(3S,8S,9S,10R,13R,14S,17R)-10,13-dimethyl-17-...","['Cholesterol sulfate', 'Cholesteryl sulfate',...",,,466.70,Cholestane steroids,Steroids and steroid derivatives,Lipids and lipid-like molecules,,BHYOQNUELFTYRT-DPAQBDIFSA-N
883,SL_883,EHETE,C20H30O4,S.marinoi,C(CCCCCC=CC=CC=CC1=C(O1)O)CCCCCC(=O)O,InChI=1S/C20H30O4/c21-19(22)17-15-13-11-9-7-5-...,334.214409,,,139267243.0,...,"18-(3-hydroxyoxiren-2-yl)octadeca-13,15,17-tri...",['epoxy-hydroxy-eicosatetraenoic acid'],,,334.40,,,,,UKMMYLXMVSMRAX-UHFFFAOYSA-N
884,SL_884,HHTrE,C16H26O3,S.marinoi,CCCCCCCCCC=CC=CC=C(C(=O)O)O,InChI=1S/C16H26O3/c1-2-3-4-5-6-7-8-9-10-11-12-...,266.188195,,,129650825.0,...,"2-hydroxyhexadeca-2,4,6-trienoic acid",['hydroxy hexadecatrienoic acid'],,,266.38,,,,,NERZKBYSVCSUHD-UHFFFAOYSA-N


In [81]:
slist_csv.to_csv("/Users/mahnoorzulfiqar/OneDriveUNI/GitHub-Repos/SkeletonemaSuspectListV1_correctedColumns.csv")

In [None]:
adducts <-c() ##adducts from final_adduct
    mz_pre <- c() ##m/z from first_list
    pc <- c() ##pc from final_adduct
    peak_list <- c() ##peak lists from first_file
    result_name <- c()##result names from first_file
    isotope_list <- c()
    rt_list <- c()##rt from first_file
    into_list <- c()##intensity of m/z from final_adduct
    idX<- c()
    for (d in first_list[,"precursor_list"]) {
        for (j in 1:length(final_adduct[,"mz"])) {
            if (d<=final_adduct[j, "mzmax"] && final_adduct[j, "mzmin"] <=d){
                if (first_list[which(first_list[,"precursor_list"] == d), "rt_list"]<=final_adduct[j, "rtmax"] && final_adduct[j, "rtmin"] <=first_list[which(first_list[,"precursor_list"] == d), "rt_list"]){
                    #mz_pre
                    pre_mzzz <- final_adduct[j, "mz"]
                    mz_pre <- c(mz_pre, pre_mzzz)

# MAW-PY using old data

In [112]:
matches = [
        "M+",
        "[M",
        "M-",
        "2M",
        "M*", 
        "20.0",
        "50.0",
        "30.0",
        "40.0",
        "60.0",
        "70.0",
        "eV",
        "Massbank",
        "Spectral",
        "Match",
        "to",
        "from",
        "NIST14",
        "MoNA",
        "[IIN-based:",
        "[IIN-based",
        "on:",
        "CCMSLIB00003136269]",
        "CollisionEnergy:"
    ]


In [113]:
# Define scoring for all DBs
def HMDB_Scoring(db, i):
    if (
        db["HMDBintScore"][i] >= 0.50
        and db["HMDBmzScore"][i] >= 0.50
        and db["HQMatchingPeaks"][i] / db["hQueryTotalPeaks"][i] >= 0.50
    ):
        return True
    else:
        return False

def GNPS_Scoring(db, i):
    if (
        db["GNPSintScore"][i] >= 0.50
        and db["GNPSmzScore"][i] >= 0.50
        and db["GQMatchingPeaks"][i] / db["gQueryTotalPeaks"][i] >= 0.50
    ):
        return True
    else:
        return False

def MB_Scoring(db, i):
    if (
        db["MBintScore"][i] >= 0.50
        and db["MBmzScore"][i] >= 0.50
        and db["MQMatchingPeaks"][i] / db["mQueryTotalPeaks"][i] >= 0.50
    ):
        return True
    else:
        return False

In [114]:
entry = "/Users/mahnoorzulfiqar/OneDriveUNI/MAW-Diatom/SmarinoiRun1"

In [115]:
msp_file = glob.glob(
        entry + "/spectral_dereplication/DS_201124_SC_full_PRM_neg_01" + "/*.csv"
    )
msp_file

[]

In [16]:
glob.glob("/Users/mahnoorzulfiqar/OneDriveUNI/MAW-Diatom/SmarinoiRun1/DS_201124_SC_full_PRM_neg_01/spectral_dereplication" + "/*.csv")

['/Users/mahnoorzulfiqar/OneDriveUNI/MAW-Diatom/SmarinoiRun1/DS_201124_SC_full_PRM_neg_01/spectral_dereplication/spectral_results_for_file_1.csv']

In [3]:
def spec_postproc(entry, Source="all"):
    # currently only these subsets are removed from the names from GNPS
    matches = [
        "M+",
        "[M",
        "M-",
        "2M",
        "M*", 
        "20.0",
        "50.0",
        "30.0",
        "40.0",
        "60.0",
        "70.0",
        "eV",
        "Massbank",
        "Spectral",
        "Match",
        "to",
        "from",
        "NIST14",
        "MoNA",
        "[IIN-based:",
        "[IIN-based",
        "on:",
        "CCMSLIB00003136269]",
        "CollisionEnergy:"
    ]


    # Define scoring for all DBs
    def HMDB_Scoring(db, i):
        if (
            db["HMDBintScore"][i] >= 0.50
            and db["HMDBmzScore"][i] >= 0.50
            and db["HQMatchingPeaks"][i] / db["hQueryTotalPeaks"][i] >= 0.50
        ):
            return True
        else:
            return False

    def GNPS_Scoring(db, i):
        if (
            db["GNPSintScore"][i] >= 0.50
            and db["GNPSmzScore"][i] >= 0.50
            and db["GQMatchingPeaks"][i] / db["gQueryTotalPeaks"][i] >= 0.50
        ):
            return True
        else:
            return False

    def MB_Scoring(db, i):
        if (
            db["MBintScore"][i] >= 0.50
            and db["MBmzScore"][i] >= 0.50
            and db["MQMatchingPeaks"][i] / db["mQueryTotalPeaks"][i] >= 0.50
        ):
            return True
        else:
            return False
    # in case if we need HMDB later
    if os.path.exists(entry.split("/DS")[0] + "/hmdb_dframe_str.csv"):
        extract_smiles = pd.read_csv(entry.split("/DS")[0] + "/hmdb_dframe_str.csv", low_memory=False)
   

    msp_file = glob.glob(
        entry + "/spectral_dereplication" + "/*.csv"
    )
    if len(msp_file) > 0:

        if os.path.exists(msp_file[0]):

            msp = pd.read_csv(msp_file[0])
            # enter the directory with /spectral_dereplication/ results

            # enter the directory with /spectral_dereplication/ results
            # GNPS Results
            if Source == "gnps" or Source == "all":
                msp["gnps_results_csv"] = np.nan

                # print(entry)
                # enter the directory with /spectral_dereplication/ results
                sub_dir = (
                    entry + "/spectral_dereplication/GNPS/"
                )

                if os.path.exists(sub_dir):
                    files = glob.glob(sub_dir + "/*.csv")
                    # print(files)
                    files = [item for item in files if 'proc' not in item]

                    for mz, row in msp.iterrows():
                        for fls_g in files:

                            if msp["id_X"][mz] in fls_g:
                                
                                gnps_df = pd.read_csv(fls_g)
                                if len(gnps_df) > 0:

                                    for i, row in gnps_df.iterrows():
                                        # if compound name is present

                                        if GNPS_Scoring(gnps_df, i):
                                            
                                            if not isNaN(
                                                gnps_df["GNPScompound_name"][i]
                                            ):
                                                # split if there is a gap in the names

                                                string_chng = gnps_df[
                                                    "GNPScompound_name"
                                                ][i].split(" ")

                                                # create an empty list
                                                newstr = []

                                                # for each part of the string in the names
                                                chng = []

                                                for j in range(
                                                    len(string_chng)
                                                ):
                                                    # check if the substrings are present in the matches and no - is present

                                                    if not any(
                                                        x in string_chng[j]
                                                        for x in matches
                                                    ):  # and not '-' == string_chng[j]:

                                                        # IF | and ! not in the substring
                                                        if (
                                                            "|"
                                                            not in string_chng[
                                                                j
                                                            ]
                                                            or "!"
                                                            not in string_chng[
                                                                j
                                                            ]
                                                        ):

                                                            newstr.append(
                                                                string_chng[j]
                                                            )
                                                        # if | present in the substring
                                                        elif (
                                                            "|"
                                                            in string_chng[j]
                                                        ):

                                                            # split the string
                                                            jlen = string_chng[
                                                                j
                                                            ].split("|")
                                                            # how many substrings are left now
                                                            lst = len(jlen) - 1
                                                            # append this to chng
                                                            chng.append(
                                                                jlen[lst]
                                                            )
                                                            break

                                                            # now append chng to newstr
                                                chng.append(" ".join(newstr))

                                                # save this as the correct name
                                                gnps_df.loc[
                                                    i, "corr_names"
                                                ] = chng[0]

                                                if not isNaN(
                                                    gnps_df["GNPSSMILES"][i]
                                                ):
                                                    if chng == "":
                                                        break
                                                    elif gnps_df["GNPSSMILES"][
                                                        i
                                                    ].isalpha():
                                                        s = pcp.get_compounds(
                                                            chng[0], "name"
                                                        )
                                                        if s:
                                                            for comp in s:
                                                                gnps_df[
                                                                    "GNPSSMILES"
                                                                ][
                                                                    i
                                                                ] = (
                                                                    comp.isomeric_smiles
                                                                )
                                                        else:
                                                            gnps_df[
                                                                "GNPSSMILES"
                                                            ][i] = ""
                                            else:
                                                gnps_df["GNPSSMILES"][i] = ""
                                        else:
                                            gnps_df.drop(
                                                [i], axis=0, inplace=True
                                            )
                                    gnps_df = gnps_df.drop_duplicates(
                                        subset=["GNPSSMILES"]
                                    )
                                    for k, row in gnps_df.iterrows():

                                        if isNaN(gnps_df["GNPSSMILES"][k]):

                                            if (
                                                "["
                                                in gnps_df["GNPScompound_name"][
                                                    k
                                                ].split(" ")[-1]
                                            ):
                                                string_chng = gnps_df[
                                                    "GNPScompound_name"
                                                ][k].split("[")
                                                # print(gnps_df['GNPScompound_name'][i])

                                                # keep_names = []
                                                for j in range(
                                                    len(string_chng) - 1
                                                ):
                                                    gnps_df.loc[
                                                        k, "corr_names"
                                                    ] == string_chng[j]
                                                    s = pcp.get_compounds(
                                                        string_chng[j], "name"
                                                    )

                                                    if s:
                                                        for comp in s:
                                                            gnps_df[
                                                                "GNPSSMILES"
                                                            ][
                                                                k
                                                            ] = (
                                                                comp.isomeric_smiles
                                                            )
                                                            gnps_df.loc[
                                                                k, "GNPSformula"
                                                            ] = (
                                                                comp.molecular_formula
                                                            )
                                                            gnps_df.loc[
                                                                k, "GNPSinchi"
                                                            ] = Chem.MolToInchi(
                                                                Chem.MolFromSmiles(
                                                                    comp.isomeric_smiles
                                                                )
                                                            )

                                                    else:
                                                        gnps_df["GNPSSMILES"][
                                                            k
                                                        ] = ""
                                                        gnps_df.loc[
                                                            k, "GNPSformula"
                                                        ] = ""
                                                        gnps_df.loc[
                                                            k, "GNPSinchi"
                                                        ] = ""
                                        if not isNaN(gnps_df["GNPSSMILES"][k]):
                                            try:
                                                sx = pcp.get_compounds(
                                                    gnps_df["GNPSSMILES"][k],
                                                    "smiles",
                                                )
                                                gnps_df.loc[
                                                    k, "GNPSinchi"
                                                ] = Chem.MolToInchi(
                                                    Chem.MolFromSmiles(
                                                        comp.isomeric_smiles
                                                    )
                                                )
                                                if sx:
                                                    sx = str(sx)
                                                    comp = pcp.Compound.from_cid(
                                                        [
                                                            int(x)
                                                            for x in re.findall(
                                                                r"\b\d+\b", sx
                                                            )
                                                        ]
                                                    )
                                                    gnps_df.loc[
                                                        k, "GNPSformula"
                                                    ] = comp.molecular_formula

                                            except Exception:
                                                gnps_df.loc[
                                                    k, "GNPSformula"
                                                ] = ""
                                                gnps_df.loc[k, "GNPSinchi"] = ""

                                gnps_df = gnps_df.dropna(axis=0, how="all")
                                csvname = (
                                    (os.path.splitext(fls_g)[0])
                                    + "proc"
                                    + ".csv"
                                )

                                msp.loc[
                                    mz, "gnps_results_csv"
                                ] = csvname
                                
                                if not os.path.exists(csvname):
                                    #print("this is wrong?")
                                    #print(csvname)
                                    #print(os.path.splitext(fls_g)[0])
                                    gnps_df.to_csv(csvname)


            msp.to_csv(msp_file[0])
            # HMDB Results
            if Source == "hmdb" or Source == "all":
                sub_dir = (
                    entry + "/spectral_dereplication/HMDB/"
                )
                if os.path.exists(sub_dir):
                    files = glob.glob(sub_dir + "/*.csv")
                    files = [item for item in files if 'proc' not in item]
                    if os.path.exists(sub_dir):
                        # print(files)
                        for mz, row in msp.iterrows():
                            #print(mz)
                            # print(msp["id_X"][mz])
                            for fls_h in files:
                                 if msp["id_X"][mz] in fls_h:
                                        hmdb_df = pd.read_csv(fls_h)

                                        if len(hmdb_df) > 0:
                                            if "HMDBSMILES" in hmdb_df.columns:
                                                #print(hmdb_df)
                                                for i, row in hmdb_df.iterrows():
                                                # if compound name is present
                                                    if not HMDB_Scoring(hmdb_df, i):
                                                        hmdb_df.drop(i, inplace=True)
                                                hmdb_df = hmdb_df.drop_duplicates(
                                                    subset=["HMDBSMILES"]
                                                )


                                                csvname = (
                                                    (os.path.splitext(fls_h)[0])
                                                    + "proc"
                                                    + ".csv"
                                                )  
                                                msp.loc[
                                                    mz, "hmdb_results_csv"
                                                ] = csvname

                                                if not os.path.exists(csvname):
                                                    hmdb_df.to_csv(csvname) 
                                            else:    
                                                # merge on basis of id, frame and hmdb result files
                                                SmilesHM = pd.merge(
                                                    hmdb_df,
                                                    extract_smiles,
                                                    left_on=hmdb_df.HMDBcompoundID,
                                                    right_on=extract_smiles.DATABASE_ID,
                                                )
                                                hmdb_df["HMDBcompoundID"] = np.nan
                                                hmdb_df["HMDBSMILES"] = np.nan
                                                hmdb_df["HMDBformula"] = np.nan
                                                hmdb_df["HMDBcompound_name"] = np.nan
                                                for i, row in hmdb_df.iterrows():
                                                    #print(i)
                                                    # if compound name is present
                                                    if HMDB_Scoring(hmdb_df, i):

                                                        #hmdb_df.drop(i, inplace=True)
                                                        for j, row in SmilesHM.iterrows():
                                                            #print("SmilesHM")
                                                            # where index for both match, add the name and SMILES
                                                            if (
                                                                hmdb_df["HMDBcompoundID"][i]
                                                                == SmilesHM[
                                                                    "HMDBcompoundID"
                                                                ][j]
                                                            ):
                                                                hmdb_df.loc[
                                                                    i, "HMDBSMILES"
                                                                ] = SmilesHM["SMILES"][
                                                                    j
                                                                ]  # add SMILES
                                                                hmdb_df.loc[
                                                                    i, "HMDBcompound_name"
                                                                ] = SmilesHM[
                                                                    "GENERIC_NAME"
                                                                ][
                                                                    j
                                                                ]  # add name
                                                                hmdb_df.loc[
                                                                    i, "HMDBformula"
                                                                ] = SmilesHM["FORMULA"][
                                                                    j

                                                                ]
                                                                #print(hmdb_df["HMDBSMILES"][i])
                #                             
                                            #print(hmdb_df)
#                                         hmdb_df = hmdb_df.drop_duplicates(
#                                              subset=["HMDBSMILES"]
#                                          )
                                        csvname = (
                                            (os.path.splitext(fls_h)[0])
                                            + "proc"
                                            + ".csv"
                                        )  
                                        msp.loc[
                                            mz, "hmdb_results_csv"
                                        ] = csvname

                                        if not os.path.exists(csvname):
                                            hmdb_df.to_csv(csvname) 
            msp.to_csv(msp_file[0])
            # MASSBANK Results

            # enter the directory with /spectral_dereplication/ results
            if Source == "mbank" or Source == "all":

                sub_dir = (
                    
                    entry
                    + "/spectral_dereplication/MassBank/"
                )
                if os.path.exists(sub_dir):
                    files = glob.glob(sub_dir + "/*.csv")
                    files = [item for item in files if 'proc' not in item]
                    for mz, row in msp.iterrows():
                        # print(msp["id_X"][mz])
                        for fls_m in files:
                            if msp["id_X"][mz] in fls_m:
                                mbank_df = pd.read_csv(fls_m)
                                if len(mbank_df) > 0:

                                    for i, row in mbank_df.iterrows():
                                        # if compound name is present
                                         if not MB_Scoring(mbank_df, i):
                                            mbank_df.drop(i, inplace=True)
                                mbank_df = mbank_df.drop_duplicates(
                                    subset=["MBSMILES"]
                                )
                                csvname = (
                                    (os.path.splitext(fls_m)[0])
                                    + "proc"
                                    + ".csv"
                                )

                                msp.loc[
                                    mz, "mbank_results_csv"
                                ] = csvname

                                if not os.path.exists(csvname):

                                    mbank_df.to_csv(csvname)
            msp.to_csv(msp_file[0])


In [10]:
entry = path + "/" + 'DS_201124_SC_full_PRM_neg_08'

In [11]:
spec_postproc(entry, Source="all")

DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=OC%3D1C%3DCC%28%3DCC1C2%3DCC%28%3DCC%3DC2O%29CC%3DC%29CC%3DC'
DEBUG:pubchempy:Created Compound(72300)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=C1%5BC%40H%5D%28%5BC%40H%5D%28OC2%3DC1C%28%3DCC%28%3DC2%5BC%40%40H%5D3%5BC%40H%5D%28%5BC%40H%5D%28OC4%3DCC%28%3DCC%28%3DC34%29O%29O%29C5%3DCC%28%3DC%28C%3DC5%29O%29O%29O%29O%29O%29C6%3DCC%28%3DC%28C%3DC6%29O%29O%29O'
DEBUG:pubchempy:Created Compound(122738)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=OC%5BC%40H%5D1O%5BC%40%40H%5D%28OC2%3DC%28O%29C%28O%29%3DC%28C%3DC2%29C%28%3DO%29%5CC%3DC%5CC2%3DCC%28O%29%3DC%28O%29C%3DC2%29%5BC%40H%5D%28O%29%5BC%40%40H%5D%28O%29%5BC%40%40H%5D1O'
DEBUG:pubchempy:Created Compound(6441269)
DEBUG:pubchempy:Reques

DEBUG:pubchempy:Request data: b'smiles=CC1%3DC%28O%29C%28C%29%3DC2CCC%28C%29%28COC3%3DCC%3DC%28CC4SC%28%3DO%29NC4%3DO%29C%3DC3%29OC2%3DC1C'
DEBUG:pubchempy:Created Compound(5591)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CCCCCCCCCCCCCCOS%28O%29%28%3DO%29%3DO'
DEBUG:pubchempy:Created Compound(5248)


In [205]:
for i in folders2:
    entry = path + "/" +i
    print(i)
    spec_postproc(entry, Source="all")

DS_201124_SC_full_PRM_pos_03


DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CCCCCCCCCCCCCCCCCC%28%3DO%29OC%5BC%40H%5D%28COP%28%3DO%29%28%5BO-%5D%29OCC%5BN%2B%5D%28C%29%28C%29C%29OC%28%3DO%29CCCCCCC%2FC%3DC%5CC%2FC%3DC%5CCCCCC'
DEBUG:pubchempy:Created Compound(6441487)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CCCCCCCCCCCCCCCC%28%3DO%29OC%5BC%40H%5D%28COP%28%3DO%29%28%5BO-%5D%29OCC%5BN%2B%5D%28C%29%28C%29C%29OC%28%3DO%29CCCCCCCCCCCCCCC'
DEBUG:pubchempy:Created Compound(452110)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CCCCCCCCCCCCCC%28%3DO%29O%5BC%40H%5D%28COC%28%3DO%29CCCCCCC%2FC%3DC%5CCCCCCCCC%29COP%28%3DO%29%28%5BO-%5D%29OCC%5BN%2B%5D%28C%29%28C%29C'
DEBUG:pubchempy:Created Compound(24778931)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/r

DS_201124_SC_full_PRM_pos_04


DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CSCC1OC%28C%28O%29C1O%29%5BN%5D2C%3DNC3%3DC2N%3DCN%3DC3N'
DEBUG:pubchempy:Created Compound(149)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=OCC%28%3DCCNC1%3DNC%3DNC2%3DC1N%3DCN2C3OC%28CO%29C%28O%29C3O%29C'
DEBUG:pubchempy:Created Compound(5529)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=C1%3DNC2%3DC%28C%28%3DN1%29N%29N%3DCN2C3C%28C%28C%28O3%29CO%29O%29O'
DEBUG:pubchempy:Created Compound(191)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=OC%5BC%40%40H%5D%28O1%29%5BC%40%40H%5D%28O%29%5BC%40%40H%5D%28O%29%5BC%40%40H%5D1n%28c3%29c%28n2%29c%28n3%29c%28N%29nc2'
DEBUG:pubchempy:Created Compound(60961)
DEBUG:p

DEBUG:pubchempy:Created Compound(213144)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=c1ccc%28Cn2nnnc2SCc2nc3ccccc3%5BnH%5D2%29cc1'
DEBUG:pubchempy:Created Compound(45923866)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=+'
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=O%3DC%28O%29%5BC%40%40H%5D%28N%29CCC%2FN%3DC%28%5CN%29N'
DEBUG:pubchempy:Created Compound(6322)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=C%28CC%28C%28%3DO%29O%29N%29CN%3DC%28N%29N'
DEBUG:pubchempy:Created Compound(232)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=C%28C%5BC%40%40H%5D%28C%28%3DO%29O

DS200309_Scost_QC_70k_neg_PRM


DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=OC%3D1C%3DCC2%3DC%28OC3%3DC2N%3DC4C%28C%3DNN4C%28C%29C%29%3DC3C%3D5C%3DCC%3DCC5OC%29C1'
DEBUG:pubchempy:Created Compound(71827319)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=C1C%28C%28OC2%3DCC%28%3DCC%28%3DC21%29O%29O%29C3%3DCC%28%3DC%28C%3DC3%29O%29O%29O'
DEBUG:pubchempy:Created Compound(1203)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=O%3DC1C%3DC%28OC2%3DCC%3DC%28O%29C%3DC12%29C%3D3C%3DCC%28OC%29%3DCC3'
DEBUG:pubchempy:Created Compound(688679)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=C1%5BC%40H%5D%28%5BC%40H%5D%28%5BC%40H%5D%28C%28O1%29%28CO%29O%29O%29O%29O'
DEBUG:pubchempy:Created Compound(441

DEBUG:pubchempy:Created Compound(163166904)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=C%5BC%40H%5D%28CCC%28N1CCCC1C%28O%29%3DO%29%3DO%29%5BC%40H%5D2CC%5BC%40%40%5D3%28%5BH%5D%29%5BC%40%5D4%28%5BH%5D%29CC%5BC%40%5D5%28%5BH%5D%29C%5BC%40H%5D%28O%29CC%5BC%40%5D5%28C%29%5BC%40H%5D4C%5BC%40H%5D%28O%29%5BC%40%40%5D32C'
DEBUG:pubchempy:Created Compound(163120360)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=C%5BC%40H%5D%28CCC%28N1C%28C%28O%29%3DO%29CCC1%29%3DO%29%5BC%40H%5D2CC%5BC%40%40%5D3%28%5BH%5D%29%5BC%40%5D4%28%5BH%5D%29C%5BC%40H%5D%28O%29%5BC%40%5D5%28%5BH%5D%29C%5BC%40H%5D%28O%29CC%5BC%40%5D5%28C%29%5BC%40H%5D4CC%5BC%40%40%5D32C'
DEBUG:pubchempy:Created Compound(163122711)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=C%5BC%40H%5

DS_201124_SC_full_PRM_pos_05


DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CCCCCCCCCCCCC%2FC%3DC%2F%5BC%40H%5D%28%5BC%40H%5D%28COP%28%3DO%29%28%5BO-%5D%29OCC%5BN%2B%5D%28C%29%28C%29C%29NC%28%3DO%29CCCCCCCCCCC%29O'
DEBUG:pubchempy:Created Compound(44260123)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CCCCCCCCCCCCCCCCCC%28%3DO%29OC%5BC%40H%5D%28COP%28%3DO%29%28%5BO-%5D%29OCC%5BN%2B%5D%28C%29%28C%29C%29OC%28%3DO%29CCCCCCC%2FC%3DC%5CC%2FC%3DC%5CCCCCC'
DEBUG:pubchempy:Created Compound(6441487)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CCCCCCCCCCCCCCCCCC%28%3DO%29O%5BC%40H%5D%28COC%28%3DO%29CCCCCCCCCCCCC%29COP%28%3DO%29%28%5BO-%5D%29OCC%5BN%2B%5D%28C%29%28C%29C'
DEBUG:pubchempy:Created Compound(131150)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/

DS_201124_SC_full_PRM_pos_02


DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CCCCCC%2FC%3DC%5CCCCCCCCC%28%3DO%29OCC%28CO%29O'
DEBUG:pubchempy:Created Compound(9883914)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=NC%28%3DO%29C1%3DCC%3DCN%3DC1'
DEBUG:pubchempy:Created Compound(936)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=c1cc%28cnc1%29C%28%3DN%29O'
DEBUG:pubchempy:Created Compound(936)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=C1%3DCC%28%3DCN%3DC1%29C%28%3DO%29N'
DEBUG:pubchempy:Created Compound(936)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CCCCCCCC%28%3DO%29OCC%28COP%28%3DO%29%28%5BO

DS_201124_SC_full_PRM_neg_03


DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=COCCOC%28%3DO%29c1c%28C%29n%28CCOc2cccc%28Cl%29c2Cl%29c2c1cc%28O%29c1ccccc12'
DEBUG:pubchempy:Created Compound(17138706)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=C1%3DCC%3DC2C%28%3DC1%29N%3DNN2Cl'
DEBUG:pubchempy:Created Compound(88761)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CCCCCCCCCCCCOS%28%3DO%29%28%3DO%29O'
DEBUG:pubchempy:Created Compound(8778)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=ClC1%3DC%28Cl%29C2%28Cl%29C3COS%28%3DO%29%28%3DO%29OCC3C1%28Cl%29C2%28Cl%29Cl'
DEBUG:pubchempy:Created Compound(13940)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smile

DS_201124_SC_full_PRM_neg_04


DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CO%5BC%40H%5D1%5CC%3DC%5CC%3DC%28C%29%5CC%5BC%40H%5D%28C%29%5BC%40H%5D%28O%29%5BC%40H%5D%28C%29%5CC%3DC%28C%29%5CC%3DC%28OC%29%5CC%28%3DO%29O%5BC%40%40H%5D1%5BC%40%40H%5D%28C%29%5BC%40%40H%5D%28O%29%5BC%40H%5D%28C%29C%28%3DO%29%5CC%3DC%5C%5BC%40H%5D%28C%29%5BC%40H%5D%28O%29C%28C%29C'
DEBUG:pubchempy:Created Compound(23757106)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CCCCCCCCCCCCOS%28%3DO%29%28%3DO%29O'
DEBUG:pubchempy:Created Compound(8778)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=ClC1%3DC%28Cl%29C2%28Cl%29C3COS%28%3DO%29%28%3DO%29OCC3C1%28Cl%29C2%28Cl%29Cl'
DEBUG:pubchempy:Created Compound(13940)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smil

DEBUG:pubchempy:Created Compound(142768)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=O%3DC1OC%3D2C%3DC%28OC%29C%28OC%29%3DCC2C%3DC1'
DEBUG:pubchempy:Created Compound(8417)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=O%3DC%28O%29C12CCC%28C%29%28C%29CC2C3%3DCCC4C5%28C%29CC%28O%29C%28O%29C%28C%29%28C%29C5CCC4%28C%29C3%28C%29CC1'
DEBUG:pubchempy:Created Compound(3694932)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=OC1CCC2%28C%29C%28CCC3%28C%29C2CC%3DC4C5C%28C%29C%28C%29CCC5%28C%29CCC43C%29C1%28C%29C'
DEBUG:pubchempy:Created Compound(225688)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=O%3DC1OC%3D2C%3DC%28O%29C%28%3DCC2C%3DC1%29CC%28O%29C%28O%29%28C%

DS200309_Scost_QC_70k_pos_PRM


DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CCCCCCCCCCCCCCCCCC%28%3DO%29OC%5BC%40H%5D%28COP%28%3DO%29%28%5BO-%5D%29OCC%5BN%2B%5D%28C%29%28C%29C%29OC%28%3DO%29CCC%2FC%3DC%5CC%2FC%3DC%5CC%2FC%3DC%5CC%2FC%3DC%5CCCCCC'
DEBUG:pubchempy:Created Compound(16219824)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CCCCCCCCCCCCCCCC%28%3DO%29OC%5BC%40H%5D%28COP%28%3DO%29%28%5BO-%5D%29OCC%5BN%2B%5D%28C%29%28C%29C%29OC%28%3DO%29CCCCCCC%2FC%3DC%5CCCCCCCCC'
DEBUG:pubchempy:Created Compound(5497103)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CCCCC%2FC%3DC%5CC%2FC%3DC%5CCCCCCCCC%28%3DO%29OC%5BC%40H%5D%28COP%28%3DO%29%28%5BO-%5D%29OCC%5BN%2B%5D%28C%29%28C%29C%29OC%28%3DO%29CCCCCCC%2FC%3DC%5CC%2FC%3DC%5CCCCCC'
DEBUG:pubchempy:Created Compound(5288075)


DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=O%3DC%28NC%28C%28O%29%3DO%29CC1%3DCC%3DC%28O%29C%3DC1%29CCCCCCCCCCCC%3DCCC%3DCCC%3DCCC'
DEBUG:pubchempy:Created Compound()
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CCCCCCCCCCCCCCCC%28%3DO%29OC%5BC%40H%5D%28COP%28%3DO%29%28%5BO-%5D%29OCC%5BN%2B%5D%28C%29%28C%29C%29SC%28%3DO%29CCCCCCCCCCCCCCC'
DEBUG:pubchempy:Created Compound(195048)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CCCCCCCCCCCCCCCC%28%3DO%29OC%5BC%40H%5D%28COP%28%3DO%29%28%5BO-%5D%29OCC%5BN%2B%5D%28C%29%28C%29C%29OC%28%3DO%29CCCCCCC%2FC%3DC%5CCCCCCCCC'
DEBUG:pubchempy:Created Compound(5497103)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=C

DEBUG:pubchempy:Created Compound(442530)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CC%28C%29CCCCCCCCCCCCC%28%3DO%29NC%28CC1%3DCC%3DCC%3DC1%29C%28O%29%3DO'
DEBUG:pubchempy:Created Compound(102571592)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=c1%28C%5BC%40%40H%5D%28C%28%3DO%29O%29N%29ccccc1'
DEBUG:pubchempy:Created Compound(6140)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=O%3DC1O%2FC%28%3DC%28%2FC%28%3DO%29N%5BC%40H%5D%28C%28%3DO%29O%29Cc2ccccc2%29%5Cc2ccccc2%29%2FC%28%3DC1c1ccccc1%29O'
DEBUG:pubchempy:Created Compound(139292154)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=+'
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest

DEBUG:pubchempy:Request data: b'smiles=NC1%3DNC2%3DC%28N%3DCN2%5BC%40%40H%5D2O%5BC%40H%5D%28CO%29%5BC%40%40H%5D%28O%29%5BC%40H%5D2O%29C%28%3DO%29N1'
DEBUG:pubchempy:Created Compound(135398635)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CCCC%2FC%3DC%5CCCCCCCCC%28%3DO%29OC%5BC%40H%5D%28COP%28%3DO%29%28%5BO-%5D%29OCC%5BN%2B%5D%28C%29%28C%29C%29OC%28%3DO%29CCCCCCC%2FC%3DC%5CCCCC'
DEBUG:pubchempy:Created Compound(24778648)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CCCCCCCCCCCCCCCC%28%3DO%29OCC%28COP%28%3DO%29%28%5BO-%5D%29OCC%5BN%2B%5D%28C%29%28C%29C%29OC%28%3DO%29CCCCCCCC%28%3DO%29C%3DCC%28%3DO%29O'
DEBUG:pubchempy:Created Compound(53394014)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=OC%3D1C%3DC%28O%29C%3D2C%3DC%28O%29C%28%3D%5BO

DEBUG:pubchempy:Created Compound(94206)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CCCC%5BC%40%40H%5D%28C%28%3DO%29O%29N'
DEBUG:pubchempy:Created Compound(21236)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CC%5BC%40%40H%5D%28C%29%5BC%40%40H%5D%28C%28%3DO%29O%29N'
DEBUG:pubchempy:Created Compound(99288)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CC%28C%29CC%28NCc1c%28O%29ccc2c%28-c3ccccc3%29cc%28%3DO%29oc12%29C%28%3DO%29O'
DEBUG:pubchempy:Created Compound(5796946)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CC%5BC%40H%5D%28C%29%5BC%40%40H%5D%28C%28%3DO%29N%5BC%40%40H%5D%28CC%28C%29C%29C%28%3DO%29O%29N'
DEBUG:pubchempy:Created Compound(7019083)

DEBUG:pubchempy:Created Compound(119895)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CN1CC%28C%28O%29%3DNC2%28C%29OC3%28O%29C4CCCN4C%28%3DO%29C%28Cc4ccccc4%29N3C2%3DO%29CC2c3cccc4%5BnH%5Dcc%28c34%29CC21'
DEBUG:pubchempy:Created Compound(3066)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=Cc1cc2c%28cc1S%28%3DO%29%28%3DO%29N1CCCC1C%28%3DO%29O%29N%3DC%28O%29CO2'
DEBUG:pubchempy:Created Compound(17175025)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=C1%3DCC%28%3DCC%3DC1C%5BC%40%40H%5D%28C%28%3DO%29O%29N%29O'
DEBUG:pubchempy:Created Compound(6057)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=N%5BC%40%40H%5D%28CC1%3DCC%3DC%28O%29C%3DC1%29C%28O%29%3DO'
DE

DEBUG:pubchempy:Created Compound(142984)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CCC%28%3DO%29OC%28CC%28%3DO%29%5BO-%5D%29C%5BN%2B%5D%28C%29%28C%29C'
DEBUG:pubchempy:Created Compound(107738)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CCC%28%3DO%29O%5BC%40H%5D%28CC%28O%29%3DO%29C%5BN%2B%5D%28C%29%28C%29C'
DEBUG:pubchempy:Created Compound(157837)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=C%28%5BC%40%40H%5D1%5BC%40H%5D%28%5BC%40%40H%5D%28%5BC%40H%5D%28%5BC%40H%5D%28O1%29O%5BC%40%40H%5D2%5BC%40H%5D%28O%5BC%40%40H%5D%28%5BC%40%40H%5D%28%5BC%40H%5D2O%29O%29O%5BC%40%40H%5D3%5BC%40H%5D%28OC%28%5BC%40%40H%5D%28%5BC%40H%5D3O%29O%29O%29CO%29CO%29O%29O%29O%29O'
DEBUG:pubchempy:Created Compound(439586)
DEBUG:pubchempy:Request URL: http

DEBUG:pubchempy:Created Compound(3496897)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=O%3DC1C%3DC%28OC%3D2C%28OC%29%3DC%28OC%29C%3DC%28OC%29C12%29C%3D3C%3DCC%28OC%29%3DCC3'
DEBUG:pubchempy:Created Compound(629964)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=O%3DC1C%3DC%28OC%3D2C%3DC%28OC%29C%28OC%29%3DC%28OC%29C12%29C%3D3C%3DCC%28OC%29%3DCC3'
DEBUG:pubchempy:Created Compound(96118)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=O%3DC%28O%29C1%3DCOC%28OC2OC%28CO%29C%28O%29C%28O%29C2O%29C3C1C%28O%29CC3%28O%29C'
DEBUG:pubchempy:Created Compound(14378605)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=O%3DC1C%3D2C%28O%29%3DCC%28O%29%3DCC2OC%28C%3D3C%3DCC

DEBUG:pubchempy:Created Compound(34059)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=OC1C%3D2C%3DC%28OC%29C%28OC%29%3DC%28OC%29C2C3%3DC%28OC%29C%3D4OCOC4C%3DC3CC%28C%29C1C'
DEBUG:pubchempy:Created Compound(634472)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=C%28CC%28C%28%3DO%29O%29N%29CN%3DC%28N%29N'
DEBUG:pubchempy:Created Compound(232)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=C%28CC%28C%28%3DO%29O%29N%29CN'
DEBUG:pubchempy:Created Compound(389)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=C1CC%28NC1%29C%28%3DO%29O'
DEBUG:pubchempy:Created Compound(614)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSO

DS_201124_SC_full_PRM_neg_05


DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CCCCCC%28O%29CC%28%3DO%29CCc1ccc%28O%29c%28OC%29c1'
DEBUG:pubchempy:Created Compound(3473)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=C%5BC%40H%5D%28CCC%28%3DO%29NCCS%28%3DO%29%28%3DO%29O%29%5BC%40H%5D1CC%5BC%40%40H%5D2%5BC%40%40%5D1%28%5BC%40H%5D%28C%5BC%40H%5D3%5BC%40H%5D2CC%5BC%40H%5D4%5BC%40%40%5D3%28CC%5BC%40H%5D%28C4%29O%29C%29O%29C'
DEBUG:pubchempy:Created Compound(2733768)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=C%5BC%40H%5D%28CCC%28%3DO%29NCCS%28%3DO%29%28%3DO%29%5BO-%5D%29%5BC%40H%5D1CC%5BC%40%40H%5D2%5BC%40%40%5D1%28CC%5BC%40H%5D3%5BC%40H%5D2%5BC%40%40H%5D%28C%5BC%40H%5D4%5BC%40%40%5D3%28CC%5BC%40H%5D%28C4%29O%29C%29O%29C.%5BNa%2B%5D'
DEBUG:pubchempy:Created Compound(236

DEBUG:pubchempy:Request data: b'smiles=C1C%28C%28C%28CC1%28C%28%3DO%29O%29O%29OC%28%3DO%29C%3DCC2%3DCC%28%3DC%28C%3DC2%29O%29O%29O%29O'
DEBUG:pubchempy:Created Compound(348159)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=O%3DC%28C%3DCc1ccc%28O%29c%28O%29c1%29OC1CC%28O%29%28C%28%3DO%29O%29CC%28O%29C1O'
DEBUG:pubchempy:Created Compound(348159)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=O%3DC%28O%29C1%28O%29CC%28O%29C%28O%29C%28OC%28%3DO%29C%3DCC2%3DCC%3DC%28O%29C%28O%29%3DC2%29C1'
DEBUG:pubchempy:Created Compound(348159)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CC%28C%28O%29CCC%28C%29%28C%29O%29C1CCC2%28O%29C3%3DCC%28%3DO%29C4CC%28O%29C%28O%29CC4%28C%29C3CCC12C'
DEBUG:pubchempy:Created Compound(4233007)
DEBUG:pubchempy:Request 

DS_201124_SC_full_PRM_neg_02


DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CCCCCCCCCCCCOS%28%3DO%29%28%3DO%29O'
DEBUG:pubchempy:Created Compound(8778)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CC%28CCC%28%3DO%29O%29C1CCC2C1%28C%28CC3C2CCC4C3%28CCC%28C4%29OS%28%3DO%29%28%3DO%29O%29C%29O%29C'
DEBUG:pubchempy:Created Compound(53462088)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CC%28CCC%28%3DO%29O%29C1CCC2C1%28CCC3C2CCC4C3%28CCC%28C4%29OS%28%3DO%29%28%3DO%29O%29C%29C'
DEBUG:pubchempy:Created Compound(4388803)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CC%28CCC%28%3DO%29O%29C1CCC2C1%28CCC3C2C%28CC4C3%28CCC%28C4%29OS%28%3DO%29%28%3DO%29O%29C%29O%29C'
DEBUG:pubchempy:Created C

DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=CC%28C%29%28C%29c1ccc%28-c2oc3ccccc3c%28%3DO%29c2OCC%23N%29cc1'
DEBUG:pubchempy:Created Compound(1696095)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=OC1CC%28%3DC%28C%3DCC%28%3DCC%3DCC%28%3DCC%3DCC%3DC%28C%3DCC%3DC%28C%3DCC2%3DC%28C%29CC%28O%29CC2%28C%29C%29C%29C%29C%29C%29C%28C%29%28C%29C1%29C'
DEBUG:pubchempy:Created Compound(534748)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=O%3DC1OC2%3DC%28C%3DC1%29C%3DCC%3D3OC%28C%29%28C%29C%28OC%28%3DO%29C%28%3DCC%29C%29CC32'
DEBUG:pubchempy:Created Compound(511786)
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON
DEBUG:pubchempy:Request data: b'smiles=O%3DC1OC%3D2C%3DC3OC%28C%3DCC3%3DCC2C%3DC1%29%28C%29

In [166]:
entry = "/Users/mahnoorzulfiqar/OneDriveUNI/MAW-Diatom/SmarinoiRun1/DS_201124_SC_full_PRM_pos_03"

In [168]:
if os.path.exists(entry.split("/DS")[0] + "/hmdb_dframe_str.csv"):
    extract_smiles = pd.read_csv(entry.split("/DS")[0] + "/hmdb_dframe_str.csv", low_memory=False)


In [170]:
msp_file = glob.glob(
    entry + "/spectral_dereplication" + "/*.csv"
)

In [184]:
msp = pd.read_csv(msp_file[0])
sub_dir = (
    entry + "/spectral_dereplication/HMDB/"
)
files = glob.glob(sub_dir + "/*.csv")
files = [item for item in files if 'proc' not in item]
for mz, row in msp.iterrows():
    for fls_h in files:
        if msp["id_X"][mz] in fls_h:
            print(mz)
            hmdb_df = pd.read_csv(fls_h)
            print(hmdb_df)
            print("______")

0
Empty DataFrame
Columns: [Unnamed: 0]
Index: []
______
1
Empty DataFrame
Columns: [Unnamed: 0]
Index: []
______
2
Empty DataFrame
Columns: [Unnamed: 0]
Index: []
______
3
Empty DataFrame
Columns: [Unnamed: 0]
Index: []
______
4
Empty DataFrame
Columns: [Unnamed: 0]
Index: []
______
5
Empty DataFrame
Columns: [Unnamed: 0]
Index: []
______
6
Empty DataFrame
Columns: [Unnamed: 0]
Index: []
______
7
Empty DataFrame
Columns: [Unnamed: 0]
Index: []
______
8
Empty DataFrame
Columns: [Unnamed: 0]
Index: []
______
9
Empty DataFrame
Columns: [Unnamed: 0]
Index: []
______
10
Empty DataFrame
Columns: [Unnamed: 0]
Index: []
______
11
Empty DataFrame
Columns: [Unnamed: 0]
Index: []
______
12
Empty DataFrame
Columns: [Unnamed: 0]
Index: []
______
13
Empty DataFrame
Columns: [Unnamed: 0]
Index: []
______
14
Empty DataFrame
Columns: [Unnamed: 0]
Index: []
______
15
Empty DataFrame
Columns: [Unnamed: 0]
Index: []
______
16
Empty DataFrame
Columns: [Unnamed: 0]
Index: []
______
17
Empty DataFrame
Colum

In [185]:
mz = 51

In [190]:
for fls_h in files:
    if msp["id_X"][mz] in fls_h:
        print(mz)
        hmdb_df = pd.read_csv(fls_h)
        #print(hmdb_df)
        print("______")
        if len(hmdb_df) > 0:
            SmilesHM = pd.merge(
                hmdb_df,
                extract_smiles,
                left_on=hmdb_df.HMDBcompoundID,
                right_on=extract_smiles.DATABASE_ID,
            )
            for i, row in hmdb_df.iterrows():
                if HMDB_Scoring(hmdb_df, i):
                    for j, row in SmilesHM.iterrows():
                        print("SmilesHM")

51
______
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM
SmilesHM


In [189]:
HMDB_Scoring(hmdb_df, i)

True

In [3]:
path = "/Users/mahnoorzulfiqar/OneDriveUNI/MAW-Diatom/SmarinoiRun1"
file = os.listdir("/Users/mahnoorzulfiqar/OneDriveUNI/MAW-Diatom/SmarinoiRun1")
import os
folders = [x for x in os.listdir("/Users/mahnoorzulfiqar/OneDriveUNI/MAW-Diatom/SmarinoiRun1") if x.startswith('DS')]
folders2 = [x for x in folders if not '.mzML' in x]
folders2

['DS_201124_SC_full_PRM_pos_03',
 'DS_201124_SC_full_PRM_pos_04',
 'DS200309_Scost_QC_70k_neg_PRM',
 'DS_201124_SC_full_PRM_pos_05',
 'DS_201124_SC_full_PRM_pos_02',
 'DS_201124_SC_full_PRM_neg_03',
 'DS_201124_SC_full_PRM_neg_04',
 'DS200309_Scost_QC_70k_pos_PRM',
 'DS_201124_SC_full_PRM_neg_05',
 'DS_201124_SC_full_PRM_neg_02',
 'DS_201124_SC_full_PRM_neg_10',
 'DS_201124_SC_full_PRM_pos_10',
 'DS_201124_SC_full_PRM_pos_07',
 'DS_201124_SC_full_PRM_pos_09',
 'DS_201124_SC_full_PRM_pos_08',
 'DS_201124_SC_full_PRM_pos_01',
 'DS_201124_SC_full_PRM_pos_06',
 'DS_201124_SC_full_PRM_neg_07',
 'DS_201124_SC_full_PRM_neg_09',
 'DS_201124_SC_full_PRM_neg_08',
 'DS_201124_SC_full_PRM_neg_01',
 'DS_201124_SC_full_PRM_neg_06']

In [4]:
SuspectListPath = "/Users/mahnoorzulfiqar/OneDriveUNI/SuspectList/SkeletonemaSuspectListV1_correctedColumns.csv"

In [5]:
SuspectList = pd.read_csv(SuspectListPath)

In [6]:
SuspectList.columns

Index(['Unnamed: 0', 'CompoundName', 'MolecularFormula', 'Species', 'SMILES',
       'InChI', 'MonoisotopicMass', 'ChEBIid', 'KEGGid', 'PubChemId',
       'source_database', 'Source', 'nonIsomeric_SMILES_byRDKit', 'iupac',
       'Synonym', 'PubChemPY', 'correct_CompoundName', 'Molecular mass',
       'subclass', 'class', 'superclass', 'Enzymes', 'InChIKey'],
      dtype='object')

In [116]:
SuspectList

Unnamed: 0.1,Unnamed: 0,CompoundName,MolecularFormula,Species,SMILES,InChI,MonoisotopicMass,ChEBIid,KEGGid,PubChemId,...,iupac,Synonym,PubChemPY,correct_CompoundName,Molecular mass,subclass,class,superclass,Enzymes,InChIKey
0,0,N-Acetyl-L-glutamic acid,C7H11NO5,S. marinoi,CC(=O)N[C@@H](CCC(=O)O)C(=O)O,InChI=1S/C7H11NO5/c1-4(9)8-5(7(12)13)2-3-6(10)...,189.063723,CHEBI:44337,C00624,70914.0,...,(2S)-2-acetamidopentanedioic acid,"['N-Acetyl-L-glutamic acid', '1188-37-0', 'ace...",checked; NEW name,,189.17,"Amino acids, peptides, and analogues",Carboxylic acids and derivatives,Organic acids and derivatives,,RFMMMVDNIPUKGG-YFKPBYRVSA-N
1,1,Threonic acid,C4H8O5,S. marinoi,C(C(C(C(=O)O)O)O)O,"InChI=1S/C4H8O5/c5-1-2(6)3(7)4(8)9/h2-3,5-7H,1...",136.037173,CHEBI:26984,,439535.0,...,"2,3,4-trihydroxybutanoic acid",,checked; structure added,,136.10,Carbohydrates and carbohydrate conjugates,Organooxygen compounds,Organic oxygen compounds,,JPIJQSOTBSSVTP-UHFFFAOYSA-N
2,2,L-citrulline,C6H13N3O3,S. costatum,C(C[C@@H](C(=O)O)N)CNC(=O)N,InChI=1S/C6H13N3O3/c7-4(5(10)11)2-1-3-9-6(8)12...,175.095691,CHEBI:16349,C00327,9750.0,...,(2S)-2-amino-5-(carbamoylamino)pentanoic acid,"['L-citrulline', 'citrulline', '372-75-8', 'H-...",checked; NEW name,,175.19,"Amino acids, peptides, and analogues",Carboxylic acids and derivatives,Organic acids and derivatives,,RHGKLRLOHDJJDR-BYPYZUCNSA-N
3,3,DL-HISTIDINE,C6H9N3O2,S. marinoi and S. costatum,NC(Cc1c[nH]cn1)C(O)=O,InChI=1S/C6H9N3O2/c7-5(6(10)11)1-4-2-8-3-9-4/h...,155.069477,CHEBI:27570,,773.0,...,2-amino-3-(1H-imidazol-5-yl)propanoic acid,"['DL-HISTIDINE', '4998-57-6', 'H-DL-His-OH', '...",checked; NEW name,,155.15,"Amino acids, peptides, and analogues",Carboxylic acids and derivatives,Organic acids and derivatives,,HNDVDQJCIGZPNO-UHFFFAOYSA-N
4,4,DL-Leucine,C6H13NO2,S. marinoi and S. costatum,CC(C)CC(N)C(O)=O,"InChI=1S/C6H13NO2/c1-4(2)3-5(7)6(8)9/h4-5H,3,7...",131.094629,CHEBI:25017,,857.0,...,2-amino-4-methylpentanoic acid,"['DL-Leucine', '328-39-2', 'H-DL-Leu-OH', '2-A...",checked; NEW name,,131.17,"Amino acids, peptides, and analogues",Carboxylic acids and derivatives,Organic acids and derivatives,,ROHFNLRQFUQHCH-UHFFFAOYSA-N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
881,881,Phosphatidylethanolamine,C9H18NO8P,S.marinoi,CC(=O)OC[C@H](COP(=O)(O)OCCN)OC(=O)C,InChI=1S/C9H18NO8P/c1-7(11)15-5-9(18-8(2)12)6-...,299.077004,,,5327011.0,...,[(2R)-2-acetyloxy-3-[2-aminoethoxy(hydroxy)pho...,"['UNII-7CMB6B4449', '7CMB6B4449', '[(2R)-2-ace...",,,299.21,Glycerophosphoethanolamines,Glycerophospholipids,Lipids and lipid-like molecules,,CFWRDBDJAOHXSH-SECBINFHSA-N
882,882,cholesterol sulphate,C27H46O4S,S.marinoi,C[C@H](CCCC(C)C)[C@H]1CC[C@@H]2[C@@]1(CC[C@H]3...,InChI=1S/C27H46O4S/c1-18(2)7-6-8-19(3)23-11-12...,466.311681,,,65076.0,...,"[(3S,8S,9S,10R,13R,14S,17R)-10,13-dimethyl-17-...","['Cholesterol sulfate', 'Cholesteryl sulfate',...",,,466.70,Cholestane steroids,Steroids and steroid derivatives,Lipids and lipid-like molecules,,BHYOQNUELFTYRT-DPAQBDIFSA-N
883,883,EHETE,C20H30O4,S.marinoi,C(CCCCCC=CC=CC=CC1=C(O1)O)CCCCCC(=O)O,InChI=1S/C20H30O4/c21-19(22)17-15-13-11-9-7-5-...,334.214409,,,139267243.0,...,"18-(3-hydroxyoxiren-2-yl)octadeca-13,15,17-tri...",['epoxy-hydroxy-eicosatetraenoic acid'],,,334.40,,,,,UKMMYLXMVSMRAX-UHFFFAOYSA-N
884,884,HHTrE,C16H26O3,S.marinoi,CCCCCCCCCC=CC=CC=C(C(=O)O)O,InChI=1S/C16H26O3/c1-2-3-4-5-6-7-8-9-10-11-12-...,266.188195,,,129650825.0,...,"2-hydroxyhexadeca-2,4,6-trienoic acid",['hydroxy hexadecatrienoic acid'],,,266.38,,,,,NERZKBYSVCSUHD-UHFFFAOYSA-N


DS_201124_SC_full_PRM_pos_03
DS_201124_SC_full_PRM_pos_04
DS200309_Scost_QC_70k_neg_PRM
DS_201124_SC_full_PRM_pos_05
DS_201124_SC_full_PRM_pos_02
DS_201124_SC_full_PRM_neg_03
DS_201124_SC_full_PRM_neg_04
DS200309_Scost_QC_70k_pos_PRM
DS_201124_SC_full_PRM_neg_05
DS_201124_SC_full_PRM_neg_02
DS_201124_SC_full_PRM_neg_10
DS_201124_SC_full_PRM_pos_10
DS_201124_SC_full_PRM_pos_07
DS_201124_SC_full_PRM_pos_09
DS_201124_SC_full_PRM_pos_08
DS_201124_SC_full_PRM_pos_01
DS_201124_SC_full_PRM_pos_06
DS_201124_SC_full_PRM_neg_07
DS_201124_SC_full_PRM_neg_09
DS_201124_SC_full_PRM_neg_08
DS_201124_SC_full_PRM_neg_01
DS_201124_SC_full_PRM_neg_06


In [8]:
#DS_201124_SC_full_PRM_pos_09
# DS200309_Scost_QC_70k_pos_PRM

In [114]:
entry = path + "/" + "DS_201124_SC_full_PRM_neg_01"

In [115]:
sub_dir = "/" + entry + "/spectral_dereplication/MassBank/"
if os.path.exists(sub_dir):
    mbank_files = glob.glob(sub_dir + "/*proc.csv")
    for file in mbank_files:

        mbankproc = pd.read_csv(file)

        if len(mbankproc) > 0:
            for m, row in mbankproc.iterrows():
                for s, row in SuspectList.iterrows():
                    if (
                        not isNaN(mbankproc["MBSMILES"][m])
                        and mbankproc["MBSMILES"][m] != " "
                    ):
                        if (
                            not isNaN(SuspectList["SMILES"][s])
                            and SuspectList["SMILES"][s] != " "
                        ):
                            try:
                                ms = Chem.MolFromSmiles(mbankproc["MBSMILES"][m], sanitize=False)
                                if ms is None:
                                    mbankproc["MBSMILES"][m] = "invalid_SMILES"
                                else:
                                    try:
                                        Chem.SanitizeMol(ms)
                                    except Exception:
                                        mbankproc["MBSMILES"][m] = "invalid_chemistry"
                                if ms is not "invalid_SMILES" or m is not "invalid_chemistry":
                                    LHms2 = [
                                        Chem.MolFromSmiles(
                                            mbankproc["MBSMILES"][m]
                                        ),
                                        Chem.MolFromSmiles(
                                            SuspectList["SMILES"][s]
                                        ),
                                    ]
                                    LHfps2 = [
                                        AllChem.GetMorganFingerprintAsBitVect(
                                            x2, 2, nBits=2048
                                        )
                                        for x2 in LHms2
                                    ]
                                    LHtn2 = DataStructs.FingerprintSimilarity(
                                        LHfps2[0], LHfps2[1]
                                    )
                                    if LHtn2 >= tanimoto:
                                        mbankproc.loc[
                                            m, "SLMsmiles"
                                        ] = SuspectList["SMILES"][s]
                                        mbankproc.loc[
                                            m, "SLMname"
                                        ] = SuspectList['CompoundName'][s]
                                        mbankproc.loc[m, "SLMtanimoto"] = LHtn2
                            except:
                                pass

        mbankproc.to_csv(file)

  if ms is not "invalid_SMILES" or m is not "invalid_chemistry":
  if ms is not "invalid_SMILES" or m is not "invalid_chemistry":


In [72]:
sub_dir = entry + "/spectral_dereplication/HMDB/"
if os.path.exists(sub_dir):
    hmdb_files = glob.glob(sub_dir + "/*proc.csv")
    for file in hmdb_files:

        hmdbproc = pd.read_csv(file)
        if len(hmdbproc) > 0:
            for h, row in hmdbproc.iterrows():
                for s, row in SuspectList.iterrows():
                    if (
                        not isNaN(hmdbproc["HMDBSMILES"][h])
                        and hmdbproc["HMDBSMILES"][h] != " "
                    ):
                        if (
                            not isNaN(SuspectList["SMILES"][s])
                            and SuspectList["SMILES"][s] != " "
                        ):
                            try:
                                m = Chem.MolFromSmiles(hmdbproc["HMDBSMILES"][h], sanitize=False)
                                if m is None:
                                    hmdbproc["HMDBSMILES"][h] = "invalid_SMILES"
                                else:
                                    try:
                                        Chem.SanitizeMol(m)
                                    except Exception:
                                        hmdbproc["HMDBSMILES"][h] = "invalid_chemistry"
                                if m is not "invalid_SMILES" or m is not "invalid_chemistry":
                                    LHms2 = [
                                        Chem.MolFromSmiles(
                                            hmdbproc["HMDBSMILES"][h]
                                        ),
                                        Chem.MolFromSmiles(
                                            SuspectList["SMILES"][s]
                                        ),
                                    ]
                                    LHfps2 = [
                                        AllChem.GetMorganFingerprintAsBitVect(
                                            x2, 2, nBits=2048
                                        )
                                        for x2 in LHms2
                                    ]
                                    LHtn2 = DataStructs.FingerprintSimilarity(
                                        LHfps2[0], LHfps2[1]
                                    )
                                    if LHtn2 >= 0.99:
                                        hmdbproc.loc[
                                            h, "SLHsmiles"
                                        ] = SuspectList["SMILES"][s]
                                        hmdbproc.loc[
                                            h, "SLHname"
                                        ] = SuspectList['CompoundName'][s]
                                        hmdbproc.loc[h, "SLHtanimoto"] = LHtn2
                            except:
                                pass

        hmdbproc.to_csv(file)

  if m is not "invalid_SMILES" or m is not "invalid_chemistry":
  if m is not "invalid_SMILES" or m is not "invalid_chemistry":


In [20]:
sub_dir = entry + "/spectral_dereplication/GNPS/"
if os.path.exists(sub_dir):
    gnps_files = glob.glob(sub_dir + "/*proc.csv")
    for file in gnps_files:
        gnpsproc = pd.read_csv(file)
        if len(gnpsproc) > 0:
            for g, row in gnpsproc.iterrows():
                for s, row in SuspectList.iterrows():
                    if (
                        not isNaN(gnpsproc["GNPSSMILES"][g])
                        and gnpsproc["GNPSSMILES"][g] != " "
                    ):
                        if (
                            not isNaN(SuspectList["SMILES"][s])
                            and SuspectList["SMILES"][s] != " "
                        ):
                            try:
                                m = Chem.MolFromSmiles(gnpsproc["GNPSSMILES"][g], sanitize=False)
                                if m is None:
                                    gnpsproc["GNPSSMILES"][g] = "invalid_SMILES"
                                else:
                                    try:
                                        Chem.SanitizeMol(m)
                                    except Exception:
                                        gnpsproc["GNPSSMILES"][g] = "invalid_chemistry"
                                if m is not "invalid_SMILES" or m is not "invalid_chemistry":
                                    LHms2 = [
                                        Chem.MolFromSmiles(
                                            gnpsproc["GNPSSMILES"][g]
                                        ),
                                        Chem.MolFromSmiles(
                                            SuspectList["SMILES"][s]
                                        ),
                                    ]
                                    LHfps2 = [
                                        AllChem.GetMorganFingerprintAsBitVect(
                                            x2, 2, nBits=2048
                                        )
                                        for x2 in LHms2
                                    ]
                                    LHtn2 = DataStructs.FingerprintSimilarity(
                                        LHfps2[0], LHfps2[1]
                                    )
                                    if LHtn2 >= 0.99:
                                        gnpsproc.loc[
                                            g, "SLGsmiles"
                                        ] = SuspectList["SMILES"][s]
                                        gnpsproc.loc[
                                            g, "SLGname"
                                        ] = SuspectList['CompoundName'][s]
                                        gnpsproc.loc[g, "SLGtanimoto"] = LHtn2
                            except:
                                pass
        gnpsproc.to_csv(file)

  if m is not "invalid_SMILES" or m is not "invalid_chemistry":
  if m is not "invalid_SMILES" or m is not "invalid_chemistry":
RDKit ERROR: [11:04:50] SMILES Parse Error: syntax error while parsing: N/ACCCCCCCC/C=C\\CCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCC[NH3+])OC(=O)CCCCCCC/C=C\\CCCCCCCC
[11:04:50] SMILES Parse Error: syntax error while parsing: N/ACCCCCCCC/C=C\\CCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCC[NH3+])OC(=O)CCCCCCC/C=C\\CCCCCCCC
RDKit ERROR: [11:04:50] SMILES Parse Error: Failed parsing SMILES 'N/ACCCCCCCC/C=C\\CCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCC[NH3+])OC(=O)CCCCCCC/C=C\\CCCCCCCC' for input: 'N/ACCCCCCCC/C=C\\CCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCC[NH3+])OC(=O)CCCCCCC/C=C\\CCCCCCCC'
RDKit ERROR: [11:04:50] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERROR: [11:04:50] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:50] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERROR: [11:04:50]

RDKit ERROR: [11:04:50] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:50] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERROR: [11:04:50] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:50] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERROR: [11:04:50] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:50] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERROR: [11:04:50] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:50] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERROR: [11:04:50] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:50] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERRO

d_SMILES'
[11:04:50] SMILES Parse Error: syntax error while parsing: invalid_SMILES
[11:04:50] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
[11:04:50] SMILES Parse Error: syntax error while parsing: invalid_SMILES
[11:04:50] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
[11:04:50] SMILES Parse Error: syntax error while parsing: invalid_SMILES
[11:04:50] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
[11:04:50] SMILES Parse Error: syntax error while parsing: invalid_SMILES
[11:04:50] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
[11:04:50] SMILES Parse Error: syntax error while parsing: invalid_SMILES
[11:04:50] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
[11:04:50] SMILES Parse Error: syntax error while parsing: invalid_SMILES
[11:04:50] SMILES Parse Error: Failed parsing SMILES 'in

RDKit ERROR: [11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:51] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERROR: [11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:51] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERROR: [11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:51] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERROR: [11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:51] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERROR: [11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:51] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERRO

MILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
[11:04:51] SMILES Parse Error: syntax error while parsing: invalid_SMILES
[11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
[11:04:51] SMILES Parse Error: syntax error while parsing: invalid_SMILES
[11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
[11:04:51] SMILES Parse Error: syntax error while parsing: invalid_SMILES
[11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
[11:04:51] SMILES Parse Error: syntax error while parsing: invalid_SMILES
[11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
[11:04:51] SMILES Parse Error: syntax error while parsing: invalid_SMILES
[11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
[11:04:51] SMILES Parse Error: syntax error while pars

RDKit ERROR: [11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:51] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERROR: [11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:51] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERROR: [11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:51] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERROR: [11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
valid_SMILES'
[11:04:51] SMILES Parse Error: syntax error while parsing: invalid_SMILES
[11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
[11:04:51] SMILES Parse Error: syntax error while parsing: invalid_SMILES
[11:04:51] SMILES Parse Error: Fail

RDKit ERROR: [11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:51] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERROR: [11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:51] SMILES Parse Error: syntax error while parsing: invalid_SMILES
g: invalid_SMILES
[11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
[11:04:51] SMILES Parse Error: syntax error while parsing: invalid_SMILES
[11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
[11:04:51] SMILES Parse Error: syntax error while parsing: invalid_SMILES
[11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
[11:04:51] SMILES Parse Error: syntax error while parsing: invalid_SMILES
[11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' 

RDKit ERROR: [11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:51] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERROR: [11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:51] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERROR: [11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:51] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERROR: [11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:51] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERROR: [11:04:51] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:51] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERRO

RDKit ERROR: [11:04:52] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:52] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERROR: [11:04:52] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:52] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERROR: [11:04:52] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:52] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERROR: [11:04:52] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:52] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERROR: [11:04:52] SMILES Parse Error: Failed parsing SMILES 'invalid_SMILES' for input: 'invalid_SMILES'
RDKit ERROR: [11:04:52] SMILES Parse Error: syntax error while parsing: invalid_SMILES
RDKit ERRO

In [None]:
def SuspectListScreening(input_dir, SuspectListPath, tanimoto, Source):
    def isNaN(string):
        return string != string

    SuspectList = pd.read_csv(SuspectListPath)

    for entry in os.listdir(input_dir):

        if os.path.isdir(os.path.join(input_dir, entry)):

            if Source == "gnps" or Source == "specdb" or Source == "all":

                sub_dir = input_dir + "/" + entry + "/spectral_dereplication/GNPS/"
                if os.path.exists(sub_dir):
                    gnps_files = glob.glob(sub_dir + "/*proc.csv")
                    for file in gnps_files:
                        gnpsproc = pd.read_csv(file)
                        if len(gnpsproc) > 0:
                            for g, row in gnpsproc.iterrows():
                                for s, row in SuspectList.iterrows():
                                    if (
                                        not isNaN(gnpsproc["GNPSSMILES"][g])
                                        and gnpsproc["GNPSSMILES"][g] != " "
                                    ):
                                        if (
                                            not isNaN(SuspectList["SMILES"][s])
                                            and SuspectList["SMILES"][s] != " "
                                        ):
                                            LHms2 = [
                                                Chem.MolFromSmiles(
                                                    gnpsproc["GNPSSMILES"][g]
                                                ),
                                                Chem.MolFromSmiles(
                                                    SuspectList["SMILES"][s]
                                                ),
                                            ]
                                            LHfps2 = [
                                                AllChem.GetMorganFingerprintAsBitVect(
                                                    x2, 2, nBits=2048
                                                )
                                                for x2 in LHms2
                                            ]
                                            LHtn2 = DataStructs.FingerprintSimilarity(
                                                LHfps2[0], LHfps2[1]
                                            )
                                            if LHtn2 >= tanimoto:
                                                gnpsproc.loc[
                                                    g, "SLGsmiles"
                                                ] = SuspectList["SMILES"][s]
                                                gnpsproc.loc[
                                                    g, "SLGname"
                                                ] = SuspectList['CompoundName'][s]
                                                gnpsproc.loc[g, "SLGtanimoto"] = LHtn2
                        gnpsproc.to_csv(file)
                        return gnpsproc
            if Source == "hmdb" or Source == "specdb" or Source == "all":

                sub_dir = input_dir + "/" + entry + "/spectral_dereplication/HMDB/"
                if os.path.exists(sub_dir):
                    hmdb_files = glob.glob(sub_dir + "/*proc.csv")
                    for file in hmdb_files:

                        hmdbproc = pd.read_csv(file)
                        if len(hmdbproc) > 0:
                            for h, row in hmdbproc.iterrows():
                                for s, row in SuspectList.iterrows():
                                    if (
                                        not isNaN(hmdbproc["HMDBSMILES"][h])
                                        and hmdbproc["HMDBSMILES"][h] != " "
                                    ):
                                        if (
                                            not isNaN(SuspectList["SMILES"][s])
                                            and SuspectList["SMILES"][s] != " "
                                        ):
                                            LHms2 = [
                                                Chem.MolFromSmiles(
                                                    hmdbproc["HMDBSMILES"][h]
                                                ),
                                                Chem.MolFromSmiles(
                                                    SuspectList["SMILES"][s]
                                                ),
                                            ]
                                            LHfps2 = [
                                                AllChem.GetMorganFingerprintAsBitVect(
                                                    x2, 2, nBits=2048
                                                )
                                                for x2 in LHms2
                                            ]
                                            LHtn2 = DataStructs.FingerprintSimilarity(
                                                LHfps2[0], LHfps2[1]
                                            )
                                            if LHtn2 >= tanimoto:
                                                hmdbproc.loc[
                                                    h, "SLHsmiles"
                                                ] = SuspectList["SMILES"][s]
                                                hmdbproc.loc[
                                                    h, "SLHname"
                                                ] = SuspectList['CompoundName'][s]
                                                hmdbproc.loc[h, "SLHtanimoto"] = LHtn2

                        hmdbproc.to_csv(file)
                        return hmdbproc

            if Source == "mbank" or Source == "specdb" or Source == "all":

                sub_dir = input_dir + "/" + entry + "/spectral_dereplication/MassBank/"
                if os.path.exists(sub_dir):
                    mbank_files = glob.glob(sub_dir + "/*proc.csv")
                    for file in mbank_files:

                        mbankproc = pd.read_csv(file)

                        if len(mbankproc) > 0:
                            for m, row in mbankproc.iterrows():
                                for s, row in SuspectList.iterrows():
                                    if (
                                        not isNaN(mbankproc["MBSMILES"][m])
                                        and mbankproc["MBSMILES"][m] != " "
                                    ):
                                        if (
                                            not isNaN(SuspectList["SMILES"][s])
                                            and SuspectList["SMILES"][s] != " "
                                        ):
                                            LHms2 = [
                                                Chem.MolFromSmiles(
                                                    mbankproc["MBSMILES"][m]
                                                ),
                                                Chem.MolFromSmiles(
                                                    SuspectList["SMILES"][s]
                                                ),
                                            ]
                                            LHfps2 = [
                                                AllChem.GetMorganFingerprintAsBitVect(
                                                    x2, 2, nBits=2048
                                                )
                                                for x2 in LHms2
                                            ]
                                            LHtn2 = DataStructs.FingerprintSimilarity(
                                                LHfps2[0], LHfps2[1]
                                            )
                                            if LHtn2 >= tanimoto:
                                                mbankproc.loc[
                                                    m, "SLMsmiles"
                                                ] = SuspectList["SMILES"][s]
                                                mbankproc.loc[
                                                    m, "SLMname"
                                                ] = SuspectList['CompoundName'][s]
                                                mbankproc.loc[m, "SLMtanimoto"] = LHtn2

                        mbankproc.to_csv(file)
                        return mbankproc