In [1]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit import RDLogger
#IPythonConsole.ipython_useSVG = True
import numpy as np
import pandas as pd
from tqdm import tqdm
from openbabel import pybel
RDLogger.DisableLog('rdApp.*')
from collections import defaultdict

In [2]:

def clean_and_standardize(smiles,ph=7.4,iso=False):
    """Parse a SMILES string, keep its largest fragment and return two cleaned SMILES.

    Parameters
    ----------
    smiles : str
        Input SMILES string.
    ph : float, default 7.4
        pH value handed to Open Babel's ``AddHydrogens`` call.
    iso : bool, default False
        Forwarded to RDKit as ``isomericSmiles`` for every SMILES written by RDKit here.

    Returns
    -------
    tuple
        ``(adjusted_smiles, uncharge_smiles)``.
        ``adjusted_smiles`` is the Open Babel SMILES of the largest fragment after
        ``AddHydrogens(False, True, ph)``; ``uncharge_smiles`` is the RDKit canonical
        SMILES of the same fragment after ``rdMolStandardize.Uncharger``.
        ``(None, None)`` is returned when the SMILES cannot be parsed, when no fragment
        is found, or when any step raises (the error is printed, not re-raised).
    """
    try:
        parsed = Chem.MolFromSmiles(smiles)
        if parsed is None:
            # RDKit could not parse the input.
            return None,None

        # Split into disconnected fragments (salts, counter-ions, ...) and keep the
        # one with the most atoms; on a tie the first such fragment is kept.
        pieces = Chem.GetMolFrags(parsed, asMols=True)
        if not pieces:
            return None,None
        parent = max(pieces, key=lambda frag: frag.GetNumAtoms())

        # Second return value: uncharged form, written as canonical SMILES by RDKit.
        neutral = rdMolStandardize.Uncharger().uncharge(parent)
        uncharge_smiles = Chem.MolToSmiles(neutral, isomericSmiles=iso, canonical=True)

        # First return value: the (not uncharged) fragment is passed to Open Babel,
        # hydrogens are added with pH correction at `ph`, and the result is written
        # back as SMILES. strip() removes the trailing whitespace of Open Babel's output.
        parent_smiles = Chem.MolToSmiles(parent, isomericSmiles=iso, canonical=True)
        ob_mol = pybel.readstring("smi", parent_smiles)
        ob_mol.OBMol.AddHydrogens(False, True, ph)
        adjusted_smiles = ob_mol.write("smi").strip()
    except Exception as e:
        # Best-effort cleaning: report the failure and let the caller see (None, None).
        print(f"Error processing SMILES {smiles}: {e}")
        return None,None

    return adjusted_smiles, uncharge_smiles

In [3]:
def sort_dict(x):
    """Return a new dict holding the items of ``x`` ordered by value, largest first.

    Items with equal values keep the relative order they had in ``x``.
    """
    ordered_items = sorted(x.items(), key=lambda item: item[1], reverse=True)
    return dict(ordered_items)

def count_place(inh, parameter):
    """Count how often each distinct value occurs in column ``parameter`` of ``inh``.

    Parameters
    ----------
    inh : pandas.DataFrame
        Table to inspect.
    parameter : str
        Name of the column whose values are counted.

    Returns
    -------
    dict
        ``{value: count}`` ordered by count, largest first (via ``sort_dict``).
        Counting is done with ``==``, so a NaN entry, which never compares equal
        to itself, is listed with a count of 0.
    """
    column = inh[parameter]
    # (column == s).sum() counts matches inside pandas instead of iterating the
    # boolean Series element by element with the builtin sum().
    count = {s: int((column == s).sum()) for s in column.unique()}
    return sort_dict(count)

In [4]:
# Load the raw activity table: a tab-separated, UTF-8 encoded text file.
# NOTE(review): the column names (CMPD_CHEMBLID, ASSAY_CHEMBLID, ...) suggest a ChEMBL
# export — confirm the provenance and record it here.
df = pd.read_csv('./source/substrates_chem.csv',encoding='utf-8',sep='\t')

In [5]:
df.columns

Index(['CMPD_CHEMBLID', 'MOLREGNO', 'PARENT_CMPD_CHEMBLID', 'PARENT_MOLREGNO',
       'MOL_PREF_NAME', 'COMPOUND_KEY', 'MOLWEIGHT', 'ALOGP', 'PSA',
       'NUM_RO5_VIOLATIONS', 'CANONICAL_SMILES', 'ACTIVITY_ID',
       'STANDARD_TYPE', 'RELATION', 'STANDARD_VALUE', 'STANDARD_UNITS',
       'PCHEMBL_VALUE', 'PUBLISHED_TYPE', 'PUBLISHED_RELATION',
       'PUBLISHED_VALUE', 'PUBLISHED_UNITS', 'ACTIVITY_COMMENT',
       'DATA_VALIDITY_COMMENT', 'POTENTIAL_DUPLICATE', 'BAO_ENDPOINT',
       'UO_UNITS', 'QUDT_UNITS', 'ASSAY_ID', 'ASSAY_CHEMBLID', 'ASSAY_TYPE',
       'DESCRIPTION', 'ASSAY_SRC_ID', 'ASSAY_SRC_DESCRIPTION',
       'ASSAY_ORGANISM', 'ASSAY_STRAIN', 'ASSAY_TAX_ID', 'CURATED_BY',
       'BAO_FORMAT', 'TID', 'TARGET_CHEMBLID', 'TARGET_TYPE',
       'PROTEIN_ACCESSION', 'PREF_NAME', 'ORGANISM', 'CONFIDENCE_SCORE',
       'TARGET_MAPPING', 'APD_NAME', 'APD_CONFIDENCE', 'DOC_ID',
       'DOC_CHEMBLID', 'PUBMED_ID', 'JOURNAL', 'YEAR', 'VOLUME', 'ISSUE',
       'FIRST_PAGE', 'CELL_ID',

In [6]:
# Keep six columns and drop every row that lacks a value in any of them except
# MOL_PREF_NAME (a missing compound name is tolerated).
df = (
    df[['MOL_PREF_NAME','CANONICAL_SMILES','RELATION','STANDARD_VALUE','DESCRIPTION','PUBMED_ID']]
    .dropna(subset=['CANONICAL_SMILES','RELATION','STANDARD_VALUE','DESCRIPTION','PUBMED_ID'])
)

In [7]:
# Structure key used later for duplicate detection: element [1] of the tuple returned by
# clean_and_standardize, i.e. the uncharged RDKit SMILES (None when processing failed).
# NOTE(review): the column name says "pH", but the pH-adjusted SMILES is element [0] —
# confirm which of the two is intended before renaming or switching.
df['clean_smiles_pH'] = [ clean_and_standardize(s)[1] for s in df['CANONICAL_SMILES'] ]

In [8]:
# Second name for the same DataFrame object (no copy is made). Later cells rebind `df`
# to filtered frames, so `source` keeps pointing at the frame as it is at this point.
source=df

In [9]:
len(df),len(df.drop_duplicates(subset=['clean_smiles_pH']))

(2520, 1236)

In [10]:
# Assay selection by substring tests on the free-text DESCRIPTION column.
# Each mask holds one boolean per row of df; a row is kept only if all three agree.

# bool0: reject descriptions containing any of these substrings
# (presumably co-treatment / inhibitor / pH 5.5 conditions — TODO confirm intent).
bool0 = np.array([
    not any(term in exp for term in ('presence', 'treat', '5.5', 'inh'))
    for exp in df['DESCRIPTION']
])

# bool1: the description mentions efflux (either capitalisation) and never 'Influx'.
bool1 = np.array([
    'Influx' not in exp and any(term in exp for term in ('Efflux', 'efflux'))
    for exp in df['DESCRIPTION']
])

# bool2: the description contains 'Caco2', 'cell' and 'human', and does not contain 'MDCK'.
bool2 = np.array([
    'MDCK' not in exp and all(term in exp for term in ('Caco2', 'cell', 'human'))
    for exp in df['DESCRIPTION']
])

df = df[bool0 & bool1 & bool2]
print(len(df))

1204


In [11]:
count_place(df,'DESCRIPTION')

{'Efflux ratio of permeability from apical to basolateral over basolateral to apical side in human Caco2 cells': 138,
 'Efflux ratio of permeability in human Caco2 cells': 133,
 'Efflux ratio of permeability from basolateral to apical side over apical to basolateral side in human Caco2 cells': 94,
 'Efflux ratio of permeability from apical to basolateral side over basolateral to apical side of human Caco2 cells at 10 uM up to 120 mins by HPLC-MC analysis': 63,
 'Efflux ratio of permeability from apical to basolateral side over basolateral to apical side in human Caco2 cells': 56,
 'Efflux ratio of permeability from apical to basolateral side over basolateral to apical side in human Caco2 cells at 100 uM up to 90 mins': 46,
 'Efflux ratio of apparent permeability from basal to apical to apical to basal side of human Caco2 cells at 10 uM after 60 mins by LC/MS analysis': 40,
 'Efflux ratio of apparent permeability from basolateral to apical side over apical to basolateral side in human C

In [12]:
import copy

# Independent copy, so the edits below do not touch df.
df_describ = copy.deepcopy(df)

# Integer code for RELATION: 0 when it is exactly '=', 1 when it contains '>',
# 2 for every other relation string.
df_describ['RELATION'] = [
    0 if s == '=' else (1 if '>' in s else 2)
    for s in df_describ['RELATION']
]

# Float column 'rev', initialised to 0.0 in every row.
df_describ['rev'] = 0.0

In [13]:
count_place(df_describ[df_describ['STANDARD_VALUE']<1],'STANDARD_VALUE')

{0.7: 35,
 0.8: 18,
 0.9: 18,
 0.6: 16,
 0.5: 15,
 0.1: 15,
 0.2: 15,
 0.3: 13,
 0.84: 7,
 0.48: 7,
 0.4: 7,
 0.78: 5,
 0.55: 5,
 0.42: 4,
 0.91: 4,
 0.74: 4,
 0.61: 4,
 0.36: 4,
 0.41: 4,
 0.29: 4,
 0.25: 3,
 0.47: 3,
 0.46: 3,
 0.57: 3,
 0.23: 3,
 0.26: 3,
 0.067: 2,
 0.28: 2,
 0.0: 2,
 0.092: 2,
 0.88: 2,
 0.82: 2,
 0.63: 2,
 0.89: 2,
 0.53: 2,
 0.66: 2,
 0.22: 2,
 0.49: 2,
 0.31: 2,
 0.11: 2,
 0.54: 2,
 0.37: 2,
 0.34: 2,
 0.35: 2,
 0.32: 2,
 0.97: 1,
 0.59: 1,
 0.43: 1,
 0.79: 1,
 0.94: 1,
 0.87: 1,
 0.51: 1,
 0.71: 1,
 0.95: 1,
 0.45: 1,
 0.67: 1,
 0.72: 1,
 0.33: 1,
 0.21: 1,
 0.39: 1,
 0.77: 1,
 0.73: 1}

In [14]:
# Pull RELATION and STANDARD_VALUE out into plain Python lists and write them straight
# back. No value is modified here; rel_list and val_list remain defined afterwards.
rel_list = df_describ['RELATION'].tolist()
val_list = df_describ['STANDARD_VALUE'].tolist()

df_describ['RELATION']=rel_list
df_describ['STANDARD_VALUE']=val_list

In [15]:
# Thresholds applied to STANDARD_VALUE: below tho1 -> label 0, at or above tho2 -> label 2,
# anything in [tho1, tho2) -> label 1. While tho1 == tho2 that middle band is empty.
tho1=2.0
tho2=2.0

data_list = []
for val, rel in zip(df_describ['STANDARD_VALUE'], df_describ['RELATION']):
    if rel == 1 or rel == 2:
        # RELATION code 1 or 2 means the value is not an exact '=' measurement.
        # Such a row is labelled only when the condition below holds for its code;
        # otherwise it gets None and is removed by the dropna at the end.
        if rel == 2 and val <= tho1:
            data_list.append(0)
        elif rel == 1 and val >= tho2:
            data_list.append(2)
        else:
            data_list.append(None)
    elif val < tho1:
        data_list.append(0)
    elif val < tho2:
        data_list.append(1)
    else:
        data_list.append(2)

df_describ['label']=data_list
# Distinct structure keys vs. number of rows, reported before unlabeled rows are removed.
print(len(set(df_describ['clean_smiles_pH'])),len(df_describ))
df_describ = df_describ.dropna(subset=['label'])


727 1204


In [16]:
count_place(df_describ,'label')

{2.0: 711, 0.0: 476}

In [17]:
# df_describ['label']=[ True if s==2 else (None if s==1 else False) for s in df_describ['label'].tolist()]
# df_describ=df_describ.dropna(subset=['label'])

In [18]:
count_place(df_describ,'label')

{2.0: 711, 0.0: 476}

In [19]:
def test_v1(data_dict,dropna=True,strict=True):
    from scipy.stats import chisquare,multinomial
    val_list = []
    for smi in data_dict.keys():
        
        temp = np.array(data_dict[smi])
        total=len(temp)
        
        results=np.zeros(3)
        for l in range(3):
            results[l]=sum(temp==l)
        
        observed=np.zeros(2)
        observed[0]=sum(temp<=0)
        observed[1]=sum(temp>=1)
        
        if strict:
            if results[results.argmax()]==total:
                val_list.append(results.argmax())
            else:
                val_list.append(None)
        else:
            if observed[observed.argmax()]==total:
                if results.argmax()==0 and results[0]==results[1]:
                        val_list.append(1)
                else:
                    val_list.append(results.argmax())
            else:
                val_list.append(None)
    if dropna:
        return [(k,v) for k,v in zip(data_dict.keys(),val_list) if v is not None]
    else:
        return [(k,v) for k,v in zip(data_dict.keys(),val_list) ]
    

In [20]:
# Structure key per row: element [1] of clean_and_standardize's result, i.e. the
# uncharged RDKit SMILES (element [0] would be the pH-adjusted one).
df_describ['clean_smiles_pH'] = [ clean_and_standardize(s)[1] for s in df_describ['CANONICAL_SMILES'] ]

# Collect, per structure key, the row positions (smiles_dict) and the labels (data_dict).
data_dict = defaultdict(list)
smiles_dict = defaultdict(list)
for i, (smiles, label) in enumerate(zip(df_describ['clean_smiles_pH'], df_describ['label'])):
    smiles_dict[smiles].append(i)
    data_dict[smiles].append(label)

# One consensus label per key in non-strict mode; keys without consensus carry None.
val_list = [v for _, v in test_v1(data_dict,dropna=False,strict=False)]

# Keys without a consensus label (and a None key, if any) are removed by dropna().
smiles = list(data_dict)
df_process = pd.DataFrame({'smiles':smiles,'label':val_list}).dropna()

In [21]:
len(df_process),count_place(df_process,'label')

(708, {2.0: 415, 0.0: 293})

In [22]:
# Turn the consensus class into a boolean target: class 2 -> True, class 1 -> None
# (dropped below), anything else -> False.
# Values that are already booleans are passed through unchanged. Without that guard a
# second run of this cell would read True as class 1 (True == 1 in Python), turn every
# positive label into None, and the dropna below would silently delete those rows.
df_process['label'] = [
    s if isinstance(s, (bool, np.bool_)) else (True if s == 2 else (None if s == 1 else False))
    for s in df_process['label'].tolist()
]
df_process=df_process.dropna(subset=['label'])

In [23]:
len(df_process),count_place(df_process,'label')

(708, {True: 415, False: 293})

In [24]:
df_process.to_csv('./substrates_refine.csv',index=False)

In [45]:
from copy import deepcopy
def validation(df_process, reference_csv='../data_extra/competitive.csv'):
    """Return the rows of ``df_process`` whose SMILES appears in a reference table.

    Parameters
    ----------
    df_process : pandas.DataFrame
        Needs the columns ``'smiles'`` and ``'label'``. It is not modified.
    reference_csv : str, default '../data_extra/competitive.csv'
        CSV file with the columns ``'smiles'`` and ``'drug'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``['drug', 'smiles', 'label']`` for the matching rows, with the
        original index of ``df_process`` preserved. Matching is exact string
        equality on the SMILES.
    """
    inh_com = pd.read_csv(reference_csv)

    # SMILES -> drug name. If a SMILES occurs more than once, the last row wins.
    drug_dict = dict(zip(inh_com['smiles'], inh_com['drug']))

    # Dict membership keeps the row filter linear in the size of df_process.
    flag = [smi in drug_dict for smi in df_process['smiles']]

    # Copy before adding a column so the caller's frame stays untouched.
    df = deepcopy(df_process[flag])
    df['drug'] = [drug_dict[smi] for smi in df['smiles']]
    return df[['drug','smiles','label']]

In [46]:
validation(df_process)

Unnamed: 0,drug,smiles,label
248,Tariquidar,COc1cc2c(cc1OC)CN(CCc1ccc(NC(=O)c3cc(OC)c(OC)c...,True
302,Paclitaxel,CC(=O)OC1C(=O)C2(C)C(O)CC3OCC3(OC(C)=O)C2C(OC(...,True
667,Daunorubicin,COc1cccc2c1C(=O)c1c(O)c3c(c(O)c1C2=O)CC(O)(C(C...,True
675,Nicardipine,COC(=O)C1=C(C)NC(C)=C(C(=O)OCCN(C)Cc2ccccc2)C1...,False
704,Vinblastine,CCC1(O)CC2CN(CCc3c([nH]c4ccccc34)C(C(=O)OC)(c3...,True
705,Colchicine,COc1cc2c(c(OC)c1OC)-c1ccc(OC)c(=O)cc1C(NC(C)=O...,True


In [47]:
validation(pd.read_csv('../data_classes/substrates_classes.csv'))

Unnamed: 0,drug,smiles,label
16,Rhodamine_123,COC(=O)c1ccccc1-c1c2ccc(=N)cc-2oc2cc(N)ccc12,True
85,Paclitaxel,CC(=O)OC1C(=O)C2(C)C(O)CC3OCC3(OC(C)=O)C2C(OC(...,True
115,Colchicine,COc1cc2c(c(OC)c1OC)-c1ccc(OC)c(=O)cc1C(NC(C)=O...,True
326,Doxorubicin,COc1cccc2c1C(=O)c1c(O)c3c(c(O)c1C2=O)CC(O)(C(=...,True
487,Vinblastine,CCC1(O)CC2CN(CCc3c([nH]c4ccccc34)C(C(=O)OC)(c3...,True
537,Morphine,CN1CCC23c4c5ccc(O)c4OC2C(O)C=CC3C1C5,True
563,Nicardipine,COC(=O)C1=C(C)NC(C)=C(C(=O)OCCN(C)Cc2ccccc2)C1...,False
614,Daunorubicin,COc1cccc2c1C(=O)c1c(O)c3c(c(O)c1C2=O)CC(O)(C(C...,True
749,Elacridar,COc1cc2c(cc1OC)CN(CCc1ccc(NC(=O)c3cccc4c(=O)c5...,False
962,Verapamil,COc1ccc(CCN(C)CCCC(C#N)(c2ccc(OC)c(OC)c2)C(C)C...,True
