In [1]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
#IPythonConsole.ipython_useSVG = True
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit import RDLogger
#IPythonConsole.ipython_useSVG = True
import numpy as np
import pandas as pd
from tqdm import tqdm
from openbabel import pybel
# Turn off every RDKit log channel matching 'rdApp.*' (presumably so that parse
# warnings for problematic SMILES do not flood the cell output — confirm).
RDLogger.DisableLog('rdApp.*')
def clean_and_standardize(smiles, ph=7.4, iso=False):
    """Standardise one SMILES string and return two text forms of its parent.

    The input is parsed with RDKit and reduced to its largest fragment (by
    atom count). That fragment is then written out twice:

    * a form produced by Open Babel after ``AddHydrogens`` is called with
      ``ph``;
    * a neutralised form produced by RDKit's ``Uncharger``.

    Parameters
    ----------
    smiles : str
        Input SMILES.
    ph : float, default 7.4
        pH value handed to Open Babel's ``AddHydrogens``.
    iso : bool, default False
        Forwarded as ``isomericSmiles`` to every ``Chem.MolToSmiles`` call.

    Returns
    -------
    tuple
        ``(adjusted_smiles, uncharge_smiles)``. Both entries are ``None`` when
        the SMILES cannot be parsed, when it yields no fragments, or when any
        step raises (the error is printed, not re-raised). Because the whole
        body shares one ``try`` block, a failure in the Open Babel step also
        discards the already computed RDKit result.
    """
    failed = (None, None)
    try:
        parsed = Chem.MolFromSmiles(smiles)
        if parsed is None:
            # RDKit returns None for a SMILES it cannot parse.
            return failed

        # Strip salts / counter-ions: keep only the fragment with the most atoms.
        pieces = Chem.GetMolFrags(parsed, asMols=True)
        if not pieces:
            return failed
        parent = max(pieces, key=lambda frag: frag.GetNumAtoms())

        neutral = rdMolStandardize.Uncharger().uncharge(parent)
        uncharge_smiles = Chem.MolToSmiles(neutral, isomericSmiles=iso, canonical=True)

        # Open Babel starts from the parent fragment as parsed, not from the
        # neutralised copy created above.
        parent_smiles = Chem.MolToSmiles(parent, isomericSmiles=iso, canonical=True)
        ob_mol = pybel.readstring("smi", parent_smiles)
        ob_mol.OBMol.AddHydrogens(False, True, ph)

        # write() output carries trailing whitespace; strip it.
        adjusted_smiles = ob_mol.write("smi").strip()
    except Exception as e:
        print(f"Error processing SMILES {smiles}: {e}")
        return failed

    return adjusted_smiles, uncharge_smiles

In [2]:
def sort_dict(x):
    """Return a new dict holding the items of ``x`` ordered by value, largest first.

    Items with equal values keep their original relative order.
    """
    ranked = sorted(x.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

def count_place(inh, parameter):
    """Tally how often each distinct value occurs in column ``parameter`` of ``inh``.

    Returns a dict mapping each unique value to its number of occurrences,
    ordered from most to least frequent (via ``sort_dict``).
    """
    column = inh[parameter]
    tallies = {value: sum(column == value) for value in column.unique()}
    return sort_dict(tallies)

In [10]:
# --- OCHEM P-gp substrate records: label, standardise, merge duplicates ---
df = pd.read_csv('./source/substrates_ochem_class.csv',encoding='utf-8')

# Remove rows lacking a structure or an annotation BEFORE deriving the boolean
# label: `== 'yes'` turns a missing annotation into False, so a dropna() placed
# after the comparison can never filter unlabeled rows out.
df = df.dropna(subset=['SMILES', 'PgP substrate'])
df = pd.DataFrame({
    'smiles': df['SMILES'],
    'label': df['PgP substrate'] == 'yes',
})

# Element [1] of the result is the RDKit-neutralised SMILES; the column keeps
# its historical name even though it does not hold the pH-adjusted form.
df['clean_smiles_pH'] = [ clean_and_standardize(s)[1] for s in df['smiles'] ]

smiles_dict = defaultdict(list)   # standardised SMILES -> row positions in df
data_dict = defaultdict(list)     # standardised SMILES -> every label reported for it

# Walk the two columns directly instead of building one row Series per record
# with df.iloc[i].
for i, (smi, label) in enumerate(zip(df['clean_smiles_pH'], df['label'])):
    if pd.isna(smi):
        # Standardisation failed for this record; skip it rather than pooling
        # unrelated failures under a single missing-value key.
        continue
    smiles_dict[smi].append(i)
    data_dict[smi].append(label)

val_list = defaultdict(bool)
for smi, reported in data_dict.items():
    temp = np.array(reported)
    if temp.all() != temp.any():
        # Contradictory annotations for the same structure: report and exclude.
        print(smi,temp)
        continue
    val_list[smi] = temp.any()

smiles = list(val_list.keys())
labels = list(val_list.values())
df_process = pd.DataFrame({'smiles':smiles,'label':labels}).dropna()

Error processing SMILES CCP(CC)(CC)[Au+]1[S]=C2NC=NC3=C2[N]1=CN3: Failed to convert 'CC[PH](CC)(CC)[Au+]1[S]=c2[nH]cnc3[nH]cn->1c23' to format 'smi'
CC1CC2C3CCC4=CC(=O)C=CC4(C)C3(F)C(O)CC2(C)C1(O)C(=O)CO [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True False  True  True  True  True  True  True]
CCCCOc1cc(C(=O)NCCN(CC)CC)c2ccccc2n1 [ True  True  True  True  True  True  True  True  True False  True False]
CC1OC(OC2C(O)CC(OC3C(O)CC(OC4CCC5(C)C(CCC6C5CC(O)C5(C)C(C7=CC(=O)OC7)CCC65O)C4)OC3C)OC2C)CC(O)C1O [ True  True  True  True  True  True  True  True  True  True  True False
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True]
COc1ccc(C2Sc3ccccc3N(CCN(C)C)C(=O)C2OC(C)=O)cc1 [ True  True  True  True False  True  True  True  True False  True False
  True  True  True  True  True  True  True  True  True  True False]
CC12CCC(=O)C=C1CCC1C2C(O)CC2(C)C1CCC2(O)C(=O)CO [ True  True  True 

In [11]:
# Row count of df_process together with the per-label tally.
(len(df_process), count_place(df_process, 'label'))

(2451, {True: 1567, False: 884})

In [5]:
# Write df_process to the working directory, omitting the index column.
df_process.to_csv('substrates_ochem_class.csv', index=False)

In [6]:
# --- Merge the literature substrate set with the processed OCHEM set ---
df_raw = pd.read_csv('./data_paper/substrates_origin.csv')
print(len(df_raw),count_place(df_raw,'label'))
df_ochem = pd.read_csv('./substrates_ochem_class.csv')
print(len(df_ochem),count_place(df_ochem,'label'))
# ignore_index avoids duplicate index labels in the concatenated frame.
df_cat_d = pd.concat([df_raw,df_ochem], ignore_index=True)

# Element [1] of the result is the RDKit-neutralised SMILES.
df_cat_d['smiles'] = [ clean_and_standardize(s)[1] for s in df_cat_d['smiles'] ]
# Only these two columns are used below, so only they decide whether a row is
# dropped; an unrelated column with gaps must not remove records.
df_cat_d = df_cat_d.dropna(subset=['smiles','label'])

# Collect every label reported for each standardised structure. Iterating the
# columns avoids building one row Series per record with df_cat_d.iloc[i].
data_list = defaultdict(list)
for smi, label in zip(df_cat_d['smiles'], df_cat_d['label']):
    data_list[smi].append(label)

# Number of unique structures before conflicting ones are removed (same report
# as the inhibitor merge cell prints).
print(len(data_list))

val_list = defaultdict(bool)
for smi, reported in data_list.items():
    temp = np.array(reported)
    if temp.all() != temp.any():
        # Contradictory annotations for the same structure: report and exclude.
        print(smi,temp)
        continue
    val_list[smi] = temp.any()

smiles = list(val_list.keys())
labels = list(val_list.values())
df_process = pd.DataFrame({'smiles':smiles,'label':labels}).dropna()

print(len(df_process),count_place(df_process,'label'))

1171 {True: 753, False: 418}
2451 {True: 1567, False: 884}
3108
COc1ccc2nc(S(=O)Cc3ncc(C)c(OC)c3C)[nH]c2c1 [ True False]
CNCCC(Oc1ccccc1C)c1ccccc1 [ True False]
CCC(CO)Nc1nc(NCc2ccccc2)c2ncn(C(C)C)c2n1 [ True False]
CC(NC(C)(C)C)C(=O)c1cccc(Cl)c1 [ True False]
Cc1ccc(C(=CCN2CCCC2)c2ccccn2)cc1 [ True False]
3103 {True: 1820, False: 1283}


In [7]:
# Row count of df_process together with the per-label tally.
(len(df_process), count_place(df_process, 'label'))

(3103, {True: 1820, False: 1283})

In [8]:
# Load substrates_classes.csv from the working directory.
# NOTE(review): the execution counts in this notebook are out of order, and the
# row count recorded for this file (3105) differs from the one recorded for
# df_process (3103) — the file read here may stem from an earlier run; confirm
# after a Restart & Run All.
df = pd.read_csv('substrates_classes.csv')

In [9]:
# Row count of the reloaded table together with its per-label tally.
(len(df), count_place(df, 'label'))

(3105, {True: 1822, False: 1283})

In [82]:
# Write df_process to the working directory, omitting the index column.
df_process.to_csv('substrates_classes.csv', index=False)

In [9]:
# --- OCHEM P-gp inhibitor records: label, standardise, merge duplicates ---
df = pd.read_csv('./source/inhibitors_ochem_class.csv',encoding='utf-8')

# Remove rows lacking a structure or an annotation BEFORE deriving the boolean
# label: `== 'yes'` would otherwise turn a missing annotation into False and
# count the record as a non-inhibitor.
df = df.dropna(subset=['SMILES', 'PgP inhibitor'])
df = pd.DataFrame({
    'smiles': df['SMILES'],
    'label': df['PgP inhibitor'] == 'yes',
})

# Element [1] of the result is the RDKit-neutralised SMILES; the column keeps
# its historical name even though it does not hold the pH-adjusted form.
df['clean_smiles_pH'] = [ clean_and_standardize(s)[1] for s in df['smiles'] ]

smiles_dict = defaultdict(list)   # standardised SMILES -> row positions in df
data_dict = defaultdict(list)     # standardised SMILES -> every label reported for it

# Walk the two columns directly instead of building one row Series per record
# with df.iloc[i].
for i, (smi, label) in enumerate(zip(df['clean_smiles_pH'], df['label'])):
    if pd.isna(smi):
        # Standardisation failed for this record; skip it rather than pooling
        # unrelated failures under a single missing-value key.
        continue
    smiles_dict[smi].append(i)
    data_dict[smi].append(label)

val_list = defaultdict(bool)
for smi, reported in data_dict.items():
    temp = np.array(reported)
    if temp.all() != temp.any():
        # Contradictory annotations for the same structure: report and exclude.
        print(smi,temp)
        continue
    val_list[smi] = temp.any()

smiles = list(val_list.keys())
labels = list(val_list.values())
df_process = pd.DataFrame({'smiles':smiles,'label':labels}).dropna()

df_process.to_csv('inhibitors_ochem_class.csv',index=False)

CC12CCC(=O)C=C1CCC1C2C(O)CC2(C=O)C(C(=O)CO)CCC12 [ True  True False]
CC1CC2C3CCC4=CC(=O)C=CC4(C)C3(F)C(O)CC2(C)C1(O)C(=O)CO [ True  True False False False  True False False False False]
CC12CCC(=O)C=C1CCC1C2C(O)CC2(C)C1CCC2(O)C(=O)CO [ True  True  True False  True False  True False]
CC12CCC(=O)C=C1CCC1C2C(O)CC2(C)C(C(=O)CO)CCC12 [ True  True False False False  True False]
CC12CCC(=O)C=C1CCC1C2CCC2(C)C(O)CCC12 [ True  True  True False  True  True]
COc1ccccc1OCCNCC(O)COc1cccc2[nH]c3c(OC4OC(C(=O)O)C(O)C(O)C4O)cccc3c12 [ True False]
CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(C(C)=O)c1 [ True False  True False False]
Oc1ccccc1OCCNCC(O)COc1cccc2[nH]c3ccccc3c12 [ True False False  True]
CC(=O)Nc1ccc(OCC(O)CNC(C)C)c(C(C)=O)c1 [ True False False  True False False]
CC(CCc1ccccc1)NCC(O)c1ccc(O)c(C(N)=O)c1 [ True  True False  True False False False]
CC(C)NCC(O)COc1cccc2ccccc12 [ True  True  True  True False  True False]
CC(C)(C)NCC(O)COc1ccc(NC(=O)NC2CCCCC2)cc1 [ True False False False False  True False False

In [10]:
# Print total rows, the number of True labels, and the remainder.
print(
    len(df_process),
    sum(df_process['label']),
    len(df_process) - sum(df_process['label']),
)

1748 1588 160


In [11]:
# --- Merge the literature inhibitor set with the processed OCHEM set ---
df_raw = pd.read_csv('./data_paper/inhibitors_origin.csv')
df_ochem = pd.read_csv('./inhibitors_ochem_class.csv')
# ignore_index avoids duplicate index labels in the concatenated frame.
df_cat_d = pd.concat([df_raw,df_ochem], ignore_index=True)

# Element [1] of the result is the RDKit-neutralised SMILES.
df_cat_d['smiles'] = [ clean_and_standardize(s)[1] for s in df_cat_d['smiles'] ]
# Only these two columns are used below, so only they decide whether a row is
# dropped; an unrelated column with gaps must not remove records.
df_cat_d = df_cat_d.dropna(subset=['smiles','label'])

# Collect every label reported for each standardised structure. Iterating the
# columns avoids building one row Series per record with df_cat_d.iloc[i].
data_list = defaultdict(list)
for smi, label in zip(df_cat_d['smiles'], df_cat_d['label']):
    data_list[smi].append(label)

# Number of unique structures before conflicting ones are removed.
print(len(data_list))

val_list = defaultdict(bool)
for smi, reported in data_list.items():
    temp = np.array(reported)
    if temp.all() != temp.any():
        # Contradictory annotations for the same structure: report and exclude.
        print(smi,temp)
        continue
    val_list[smi] = temp.any()

smiles = list(val_list.keys())
labels = list(val_list.values())
df_process = pd.DataFrame({'smiles':smiles,'label':labels}).dropna()

print(len(df_process),sum(df_process['label']),len(df_process)-sum(df_process['label']))

2712
COc1ccc2c(c1)C(=CCCN1CCc3cc(OC)c(OC)cc3C1)CCC2 [ True False]
CCCNC(=C1C(=O)N(C)N=C1C)c1ccccc1 [False  True]
CNCCC=C1c2ccccc2CCc2ccccc21 [ True False]
COc1cccc2c1CCCC2=CCCN1CCCCC1 [ True False]
COc1cc2c(cc1OC)CN(CCC=C1CCCCC1)CC2 [ True False]
COc1cc2c(cc1OC)CN(CCCC1CCCc3cc(OC)c(OC)cc31)CC2 [ True False]
COc1cc2c(cc1OC)CN(CCCC1CCCc3c(OC)cccc31)CC2 [ True False]
COc1ccc2c(c1)CCCC2CCCN1CCc2cc(OC)c(OC)cc2C1 [ True False]
COc1ccc2c(c1)C(CCCN1CCc3cc(OC)c(OC)cc3C1)CCC2 [ True False]
COc1cccc2c1CCCC2CCCN1CCN(C2CCCCC2)CC1 [ True False]
2702 1726 976


In [90]:
# Write df_process to the working directory, omitting the index column.
df_process.to_csv('inhibitors_classes.csv', index=False)