In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from rdkit import Chem
from tqdm import tqdm

In [2]:
# Register tqdm's `progress_apply` on pandas objects; it is used further down
# for the SMILES standardisation and InChIKey steps.
tqdm.pandas()


In [3]:
# Load the PubChem data table export for AID 589039 (path is relative to the
# notebook's working directory).
df = pd.read_csv("./PubChem_AID_589039/AID_589039_datatable_all.csv")

In [4]:
# Inspect the raw table. Note that the first two rows (RESULT_TYPE and
# RESULT_DESCR) describe the columns rather than compounds.
df

Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Standard Type,Activity Comment
0,RESULT_TYPE,,,,,,,,STRING,STRING
1,RESULT_DESCR,,,,,,,,Standardized activity type (e.g. IC50 rather t...,Additional comments
2,1,103164548.0,5284616.0,C[C@@H]1CC[C@H]2C[C@@H](/C(=C/C=C/C=C/[C@H](C[...,Unspecified,,,Potential missing data,Activity,Substrate
3,2,103164549.0,445643.0,C[C@@H]1C[C@@H]([C@@H]2[C@H](C[C@H]([C@@](O2)(...,Unspecified,,,Potential missing data,Activity,Substrate
4,3,103166970.0,2477.0,C1CCC2(C1)CC(=O)N(C(=O)C2)CCCCN3CCN(CC3)C4=NC=...,Unspecified,,,Potential missing data,Activity,Substrate
...,...,...,...,...,...,...,...,...,...,...
116,115,160677133.0,5978.0,CC[C@@]1(C[C@@H]2C[C@@](C3=C(CCN(C2)C1)C4=CC=C...,Unspecified,,,Potential missing data,Activity,Substrate
117,116,160681707.0,443869.0,CC1=C([C@@H](C(=C(N1)C)C(=O)O[C@H]2CCN(C2)CC3=...,Unspecified,,,Potential missing data,Activity,Substrate
118,117,163312394.0,5311497.0,CCC1=C[C@H]2C[C@@](C3=C(CN(C2)C1)C4=CC=CC=C4N3...,Unspecified,,,Potential missing data,Activity,Substrate
119,118,194134279.0,3793.0,CCC(C)N1C(=O)N(C=N1)C2=CC=C(C=C2)N3CCN(CC3)C4=...,Unspecified,,,Potential missing data,Activity,Substrate


In [5]:
# Show only the two leading metadata rows that will be removed next.
df.head(2)

Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Standard Type,Activity Comment
0,RESULT_TYPE,,,,,,,,STRING,STRING
1,RESULT_DESCR,,,,,,,,Standardized activity type (e.g. IC50 rather t...,Additional comments


In [6]:
    # Remove the two leading metadata rows (index labels 0 and 1) and renumber
    # the remaining rows from 0.
    df = df.drop(index=[0, 1]).reset_index(drop=True)

In [7]:
# Confirm the metadata rows are gone and the index restarts at 0.
df

Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Standard Type,Activity Comment
0,1,103164548.0,5284616.0,C[C@@H]1CC[C@H]2C[C@@H](/C(=C/C=C/C=C/[C@H](C[...,Unspecified,,,Potential missing data,Activity,Substrate
1,2,103164549.0,445643.0,C[C@@H]1C[C@@H]([C@@H]2[C@H](C[C@H]([C@@](O2)(...,Unspecified,,,Potential missing data,Activity,Substrate
2,3,103166970.0,2477.0,C1CCC2(C1)CC(=O)N(C(=O)C2)CCCCN3CCN(CC3)C4=NC=...,Unspecified,,,Potential missing data,Activity,Substrate
3,4,103169851.0,123619.0,CC1=NC=C(C=C1)C2=C(C=C(C=N2)Cl)C3=CC=C(C=C3)S(...,Unspecified,,,Potential missing data,Activity,Substrate
4,5,103172458.0,53232.0,CC[C@H](C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([...,Unspecified,,,Potential missing data,Activity,Substrate
...,...,...,...,...,...,...,...,...,...,...
114,115,160677133.0,5978.0,CC[C@@]1(C[C@@H]2C[C@@](C3=C(CCN(C2)C1)C4=CC=C...,Unspecified,,,Potential missing data,Activity,Substrate
115,116,160681707.0,443869.0,CC1=C([C@@H](C(=C(N1)C)C(=O)O[C@H]2CCN(C2)CC3=...,Unspecified,,,Potential missing data,Activity,Substrate
116,117,163312394.0,5311497.0,CCC1=C[C@H]2C[C@@](C3=C(CN(C2)C1)C4=CC=CC=C4N3...,Unspecified,,,Potential missing data,Activity,Substrate
117,118,194134279.0,3793.0,CCC(C)N1C(=O)N(C=N1)C2=CC=C(C=C2)N3CCN(CC3)C4=...,Unspecified,,,Potential missing data,Activity,Substrate


In [8]:
# Tally the assay outcome labels present in the table.
df["PUBCHEM_ACTIVITY_OUTCOME"].value_counts()

PUBCHEM_ACTIVITY_OUTCOME
Unspecified    119
Name: count, dtype: int64

In [9]:
# Tally the activity scores; missing values are excluded from the counts.
df["PUBCHEM_ACTIVITY_SCORE"].value_counts()

Series([], Name: count, dtype: int64)

In [10]:
# Check which activity annotations occur in the "Activity Comment" column.
df["Activity Comment"].value_counts()

Activity Comment
Substrate    119
Name: count, dtype: int64

In [11]:
# standardisation of SMILES
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.rdBase import BlockLogs
import mols2grid

In [12]:

# from Pat Walters method
def standardize_smiles(smiles):
    """Return a standardised canonical SMILES for ``smiles``, or ``pd.NA`` on failure.

    The pipeline is: parse -> cleanup -> fragment parent -> uncharge ->
    canonical tautomer -> canonical SMILES.

    Parameters
    ----------
    smiles : str
        Input SMILES string. Non-string values (e.g. ``None``, ``NaN`` or
        ``pd.NA`` coming from an empty cell) are treated as unparseable.

    Returns
    -------
    str or pd.NA
        The canonical SMILES of the standardised molecule, or ``pd.NA`` when
        the input is missing, cannot be parsed, or a standardisation step
        raises.
    """
    # Missing cells arrive as None / NaN / pd.NA rather than strings; report
    # them as missing without involving RDKit at all.
    if not isinstance(smiles, str):
        return pd.NA

    try:
        # follows the steps in
        # https://github.com/greglandrum/RSC_OpenScience_Standardization_202104/blob/main/MolStandardize%20pieces.ipynb
        # as described **excellently** (by Greg) in
        # https://www.youtube.com/watch?v=eWTApNX8dJQ
        mol = Chem.MolFromSmiles(smiles)

        # an unparseable SMILES yields None; handle it explicitly instead of
        # relying on the next call to raise
        if mol is None:
            return pd.NA

        # removeHs, disconnect metal atoms, normalize the molecule, reionize the molecule
        clean_mol = rdMolStandardize.Cleanup(mol)

        # if many fragments, get the "parent" (the actual mol we are interested in)
        parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)

        # try to neutralize molecule
        uncharger = rdMolStandardize.Uncharger()  # annoying, but necessary as no convenience method exists
        uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol)

        # note that no attempt is made at reionization at this step
        # nor at ionization at some pH (rdkit has no pKa calculator)
        # the main aim to represent all molecules from different sources
        # in a (single) standard way, for use in ML, catalogue, etc.

        te = rdMolStandardize.TautomerEnumerator()  # idem
        taut_uncharged_parent_clean_mol = te.Canonicalize(uncharged_parent_clean_mol)

        return Chem.MolToSmiles(taut_uncharged_parent_clean_mol)
    except Exception:
        # Best effort: one problematic structure must not abort the whole
        # column. `Exception` (rather than a bare `except:`) keeps
        # KeyboardInterrupt / SystemExit propagating, so a long run can still
        # be interrupted.
        return pd.NA

In [13]:

# Standardise every source SMILES, with RDKit log output blocked for the
# duration of the `with` block.
with BlockLogs():
    df["CANONICAL_SMILES"] = df["PUBCHEM_EXT_DATASOURCE_SMILES"].progress_apply(standardize_smiles)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 119/119 [00:02<00:00, 52.65it/s]


In [14]:
def smi_to_inchikey(smiles):
    """Return the InChIKey for ``smiles``, or ``pd.NA`` when none can be produced.

    Parameters
    ----------
    smiles : str
        SMILES string to convert. Non-string values (e.g. the ``pd.NA``
        produced by a failed standardisation) are treated as missing.

    Returns
    -------
    str or pd.NA
        The InChIKey, or ``pd.NA`` if the input is missing, cannot be parsed,
        the conversion raises, or the resulting key is empty.
    """
    # Failed upstream rows carry pd.NA / NaN / None rather than a string.
    if not isinstance(smiles, str):
        return pd.NA

    try:
        mol = Chem.MolFromSmiles(smiles)
        # an unparseable SMILES yields None; handle it explicitly instead of
        # relying on the next call to raise
        if mol is None:
            return pd.NA

        inchikey = Chem.MolToInchiKey(mol)
    except Exception:
        # `Exception` (rather than a bare `except:`) keeps KeyboardInterrupt /
        # SystemExit propagating so the run can still be interrupted.
        return pd.NA

    # Report an empty key as missing so failures use a single marker
    # (presumably RDKit returns "" when InChI generation fails -- TODO confirm).
    return inchikey if inchikey else pd.NA

In [15]:
# Derive an InChIKey from each standardised SMILES, with RDKit log output
# blocked for the duration of the `with` block.
with BlockLogs():
    df["INCHIKEY"] = df["CANONICAL_SMILES"].progress_apply(smi_to_inchikey)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 119/119 [00:00<00:00, 3166.92it/s]


In [16]:
# not much more to do really
# Tag every row with constant provenance / annotation columns.
df = df.assign(
    dataset="aid_589039_clinically_relevant",
    data_type="assay",  # going to assume assay
    action_type="substrate",
)

In [17]:
# Write the annotated table without the index column. The output folder is
# created first so the export also works when `processed/` does not exist yet
# (e.g. on a fresh checkout).
output_path = Path("processed/aid_589039_clinically_relevant.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)