In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [2]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit import RDLogger
#IPythonConsole.ipython_useSVG = True
import numpy as np
import pandas as pd
from tqdm import tqdm
from openbabel import pybel
# Turn off every RDKit log channel matching 'rdApp.*' so RDKit messages stay out of the cell output.
RDLogger.DisableLog('rdApp.*')
def clean_and_standardize(smiles,ph=7.4,iso=False):
    """Standardize a SMILES string and derive two SMILES for its largest fragment.

    The input is parsed with RDKit and only the fragment with the most atoms is
    kept (the first such fragment when several tie). Two strings are then built
    from that fragment:

    * ``adjusted_smiles`` - the fragment's canonical SMILES read by Open Babel,
      passed through ``OBMol.AddHydrogens(False, True, ph)`` and written back
      out with surrounding whitespace stripped.
    * ``uncharge_smiles`` - canonical RDKit SMILES of the fragment after
      ``rdMolStandardize.Uncharger().uncharge``.

    Args:
        smiles: SMILES string to process.
        ph: pH value forwarded to Open Babel's ``AddHydrogens``.
        iso: forwarded as ``isomericSmiles`` to ``Chem.MolToSmiles``.

    Returns:
        ``(adjusted_smiles, uncharge_smiles)``. ``(None, None)`` is returned
        when RDKit cannot parse the input, when no fragment is found, or when
        any step raises (the error is printed). Because all steps share one
        ``try`` block, an Open Babel failure also discards the uncharged SMILES.
    """
    failure = (None, None)
    try:
        parsed = Chem.MolFromSmiles(smiles)
        if parsed is None:
            # RDKit signals an unparsable SMILES by returning None.
            return failure

        # Salts and other small fragments are discarded: keep the largest piece only.
        pieces = Chem.GetMolFrags(parsed, asMols=True)
        if not pieces:
            return failure
        parent = max(pieces, key=lambda frag: frag.GetNumAtoms())

        # RDKit route: neutralize, then canonicalize.
        neutral = rdMolStandardize.Uncharger().uncharge(parent)
        uncharge_smiles = Chem.MolToSmiles(neutral, isomericSmiles=iso, canonical=True)

        # Open Babel route: starts from the fragment *before* uncharging.
        parent_smiles = Chem.MolToSmiles(parent, isomericSmiles=iso, canonical=True)
        ob_mol = pybel.readstring("smi", parent_smiles)
        ob_mol.OBMol.AddHydrogens(False, True, ph)
        adjusted_smiles = ob_mol.write("smi").strip()

        return adjusted_smiles, uncharge_smiles

    except Exception as err:
        print(f"Error processing SMILES {smiles}: {err}")
        return failure

In [3]:
def sort_dict(x):
    """Return a new dict holding the items of `x` ordered by value, largest first."""
    ranked = sorted(x.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

def count_place(inh, parameter):
    """Count how often each distinct value occurs in one column of a DataFrame.

    Args:
        inh: DataFrame that holds the column to tally.
        parameter: name of the column.

    Returns:
        dict mapping each distinct value to its number of rows, ordered by
        descending count. Missing values (NaN/None) are tallied under their
        own key instead of being reported with a count of 0.
    """
    # value_counts tallies everything in one pass. dropna=False keeps missing
    # values: an `== NaN` comparison is never true, so counting them by
    # equality would always give 0.
    return inh[parameter].value_counts(dropna=False).to_dict()

In [4]:
# Load the first source table; the following cells use its Name, Smiles and Class columns.
df = pd.read_csv('./source/S1.csv')

In [5]:
# Keep only the three columns used below and drop every row that is missing any of them.
df = df[['Name','Smiles','Class']].dropna()

In [6]:
# Show how many rows fall into each Class value, most frequent first.
count_place(df,'Class')

{'Pgp_Inhibitor': 1178,
 'Pgp_nonInhibitor': 787,
 'substrate': 477,
 'non_substrate': 70}

In [7]:
# Standardize every SMILES. Index [1] selects the second value returned by
# clean_and_standardize, i.e. the RDKit-uncharged SMILES - not the pH-adjusted
# Open Babel SMILES at index [0]. Entries are None where standardization failed.
# NOTE(review): the column is named "clean_smiles_pH" but holds the uncharged form - confirm this is intended.
df['clean_smiles_pH'] = [ clean_and_standardize(smi)[1] for smi in df['Smiles'] ]

In [8]:
# Group the Class annotations by standardized SMILES. Rows with one of the two
# inhibition classes go to inh_dict; every other row goes to sub_dict.
# The *_ndx lists record the positional row index of each routed row.
inh_dict = defaultdict(list)
sub_dict = defaultdict(list)
inh_ndx=[]
sub_ndx=[]
inhibition_classes = ('Pgp_Inhibitor','Pgp_nonInhibitor')
for row_pos, (smi, cls) in enumerate(zip(df['clean_smiles_pH'], df['Class'])):
    if cls in inhibition_classes:
        bucket, positions = inh_dict, inh_ndx
    else:
        bucket, positions = sub_dict, sub_ndx
    bucket[smi].append(cls)
    positions.append(row_pos)

In [9]:
# Number of rows routed to the inhibition group and to the remaining (substrate) group.
len(inh_ndx),len(sub_ndx)

(1965, 547)

In [10]:
from collections import defaultdict
# Map each standardized SMILES to the set of names it appears under in df.
name_dict = defaultdict(set)
for name, smiles in zip(df['Name'], df['clean_smiles_pH']):
    name_dict[smiles].add(name)

In [11]:
# Reduce the annotations of each structure to one label: True/False when all
# annotations agree, None when they conflict (all() and any() then differ).
inh = []
sub = []
for annotations in inh_dict.values():
    is_inhibitor = np.array(annotations)=='Pgp_Inhibitor'
    unanimous = is_inhibitor.all() == is_inhibitor.any()
    inh.append(is_inhibitor.all() if unanimous else None)

for annotations in sub_dict.values():
    is_substrate = np.array(annotations)=='substrate'
    unanimous = is_substrate.all() == is_substrate.any()
    sub.append(is_substrate.all() if unanimous else None)

In [12]:
# One row per standardized structure with its consensus inhibitor label.
# A structure can be listed under several names; min(..., key=str) picks one
# deterministically. Taking the "first" element of a set of strings is not
# reproducible, because string hashing is randomized per interpreter start.
# Rows without a SMILES or without a consensus label are dropped.
df_inh = pd.DataFrame({
    'drug':[ min(name_dict[s], key=str) for s in inh_dict.keys() ],
    'smiles':list(inh_dict.keys()),
    'label':inh,
}).dropna(subset=['smiles','label'])

In [13]:
# One row per standardized structure with its consensus substrate label.
# A structure can be listed under several names; min(..., key=str) picks one
# deterministically. Taking the "first" element of a set of strings is not
# reproducible, because string hashing is randomized per interpreter start.
# Rows without a SMILES or without a consensus label are dropped.
df_sub = pd.DataFrame({
    'drug':[ min(name_dict[s], key=str) for s in sub_dict.keys() ],
    'smiles':list(sub_dict.keys()),
    'label':sub,
}).dropna(subset=['smiles','label'])

In [14]:
# Size of the inhibitor table and its True/False label balance.
len(df_inh),count_place(df_inh,'label')

(1918, {True: 1169, False: 749})

In [15]:
# Size of the substrate table and its True/False label balance.
len(df_sub),count_place(df_sub,'label')

(521, {True: 451, False: 70})

In [16]:
# Persist the inhibitor table (columns drug, smiles, label) without the index column.
df_inh.to_csv('inhibitors_multiclass.csv',index=False)

In [17]:
# Persist the substrate table (columns drug, smiles, label) without the index column.
df_sub.to_csv('substrates_multiclass.csv',index=False)

In [18]:
# Build a lookup from compound id to the set of SMILES listed for it in export.csv.
# Note that name_dict is rebound here: it previously mapped SMILES to names.
name_dict = defaultdict(set)
df = pd.read_csv('./source/export.csv')
for did, smiles in zip(df['cmpd_id'], df['smiles']):
    name_dict[did].add(smiles)

In [19]:
# Load the second export file; from here on df no longer refers to export.csv.
df = pd.read_csv('./source/export_1.csv')

In [20]:
# Keep only the rows recorded for protein_id 'mbtp000001'.
# NOTE(review): presumably the target protein of this study - confirm against the data source.
df = df[df['protein_id']=='mbtp000001']

In [21]:
# Keep only rows whose action_type is one of the four labels handled below.
wanted_actions = ['substrate','inhibitor','non-inhibitor','non-substrate']
df = df[df['action_type'].isin(wanted_actions)]

In [22]:
# Attach one SMILES per compound id using the lookup built from export.csv.
# Among the strings listed for an id the smallest one is taken, so the choice
# is reproducible. Ids with no usable SMILES - absent from the lookup, or only
# listed with a missing value - map to NaN/None and are removed by dropna
# instead of raising an IndexError.
smiles_by_id = {
    cid: min((s for s in candidates if isinstance(s, str)), default=None)
    for cid, candidates in name_dict.items()
}
df = df.assign(smiles=df['cmpd_id'].map(smiles_by_id)).dropna(subset=['smiles'])
# Index [1] of clean_and_standardize is the RDKit-uncharged SMILES (None on failure).
df = df.assign(clean_smiles_pH=[ clean_and_standardize(smi)[1] for smi in df['smiles'] ])
df = df.dropna(subset=['smiles','clean_smiles_pH'])

Error processing SMILES OC(=O)C[N]12CC[N]3(CC(O)=O)CC[N]4(CC(O)=O)CC[N](CC(O)=O)(CC1)[Eu]234: Failed to convert 'O=C(O)CN12->[Eu]34<-N(CC(=O)O)(CC1)CCN->3(CC(=O)O)CCN->4(CC(=O)O)CC2' to format 'smi'
Error processing SMILES NC(=O)C[N]12CC[N]3(CC(N)=O)CC[N]4(CC(N)=O)CC[N](CC(N)=O)(CC1)[Eu]234: Failed to convert 'NC(=O)CN12->[Eu]34<-N(CC(N)=O)(CC1)CCN->3(CC(N)=O)CCN->4(CC(N)=O)CC2' to format 'smi'
Error processing SMILES OC(=C)C[N]1(CC(O)=O)CC[N]2(CC(O)=O)CC[N](CC(O)=O)(CC(O)=O)[Eu]12: Failed to convert 'C=C(O)CN1(CCN2(CCN(CC(=O)O)(CC(=O)O)->[Eu]<-2<-1)CC(=O)O)CC(=O)O' to format 'smi'
Error processing SMILES CNC(=O)C[N]1(CC(O)=C)CC[N]2(CC(O)=O)CC[N](CC(O)=O)(CC(=O)NC)[Eu]12: Failed to convert 'C=C(O)CN1(CCN2(CCN(CC(=O)O)(CC(=O)NC)->[Eu]<-2<-1)CC(=O)O)CC(=O)NC' to format 'smi'
Error processing SMILES CCCCNC(=O)C[N]1(CC(O)=C)CC[N]2(CC(O)=O)CC[N](CC(O)=O)(CC(=O)NCCCC)[Eu]12: Failed to convert 'C=C(O)CN1(CCN2(CCN(CC(=O)O)(CC(=O)NCCCC)->[Eu]<-2<-1)CC(=O)O)CC(=O)NCCCC' to format 'smi'


*** Open Babel Error  in ParseSimple
  SMILES string contains a character '<' which is invalid
*** Open Babel Error  in ParseSimple
  SMILES string contains a character '<' which is invalid
*** Open Babel Error  in ParseSimple
  SMILES string contains a character '<' which is invalid
*** Open Babel Error  in ParseSimple
  SMILES string contains a character '<' which is invalid
*** Open Babel Error  in ParseSimple
  SMILES string contains a character '<' which is invalid


In [23]:
# Group action_type annotations by standardized SMILES. Rows labelled
# 'inhibitor' or 'non-inhibitor' go to inh_dict; every other row goes to sub_dict.
# key_set collects every structure seen, and the *_ndx lists record the
# positional row index of each routed row.
inh_dict = defaultdict(list)
sub_dict = defaultdict(list)
key_set = set()
inh_ndx=[]
sub_ndx=[]
inhibition_actions = ('inhibitor','non-inhibitor')
for row_pos, (smi, action) in enumerate(zip(df['clean_smiles_pH'], df['action_type'])):
    key_set.add(smi)
    if action in inhibition_actions:
        positions, bucket = inh_ndx, inh_dict
    else:
        positions, bucket = sub_ndx, sub_dict
    positions.append(row_pos)
    bucket[smi].append(action)

In [24]:
# Number of rows routed to the inhibition group and to the remaining (substrate) group.
len(inh_ndx),len(sub_ndx)

(875, 1452)

In [25]:
def _agreed_label(labels, positive):
    """Return True/False when every entry of `labels` agrees, otherwise None.

    True means all entries equal `positive`, False means none do. None is
    returned for an empty list (no annotation for this endpoint) and for
    conflicting annotations.
    """
    if not labels:
        return None
    hits = [label == positive for label in labels]
    if all(hits):
        return True
    if not any(hits):
        return False
    return None


# Sorted so the row order of the resulting tables is the same on every run;
# iterating the set directly gives an order that can change between kernel starts.
drugs = sorted(key_set)
drug_list = []
inh = []
sub = []
for drug in drugs:
    drug_list.append(drug)
    # .get() is used instead of indexing so that looking up a structure that
    # has no annotation for an endpoint does not insert an empty entry into
    # the defaultdict.
    inh.append(_agreed_label(inh_dict.get(drug, []), 'inhibitor'))
    sub.append(_agreed_label(sub_dict.get(drug, []), 'substrate'))

In [26]:
# Build one table per endpoint from the shared structure list; dropna()
# removes structures that have no consensus label for that endpoint.
df_inh, df_sub = (
    pd.DataFrame({'smiles':drug_list, 'label':endpoint_labels}).dropna()
    for endpoint_labels in (inh, sub)
)

In [27]:
# Size of the inhibitor table and its True/False label balance.
len(df_inh),count_place(df_inh,'label')

(333, {True: 259, False: 74})

In [28]:
# Size of the substrate table and its True/False label balance.
len(df_sub),count_place(df_sub,'label')

(899, {True: 489, False: 410})

In [29]:
# Persist the inhibitor table (columns smiles, label) without the index column.
df_inh.to_csv('inhibitors_metra.csv',index=False)

In [30]:
# Persist the substrate table (columns smiles, label) without the index column.
df_sub.to_csv('substrates_metra.csv',index=False)

In [31]:
# Load the third source table; the following cells use its SMILES and Class columns.
df = pd.read_csv('./source/inhibitor_main.csv')

In [32]:
# Standardize every SMILES. Index [1] selects the RDKit-uncharged SMILES returned
# by clean_and_standardize (None on failure), not the pH-adjusted one at index [0].
# NOTE(review): the column is named "clean_smiles_pH" but holds the uncharged form - confirm this is intended.
df['clean_smiles_pH'] = [ clean_and_standardize(smi)[1] for smi in df['SMILES'] ]

In [33]:
# Collect all Class annotations recorded for each standardized SMILES.
inh_dict = defaultdict(list)
for smi, cls in zip(df['clean_smiles_pH'], df['Class']):
    inh_dict[smi].append(cls)

In [34]:
# Reduce the Class annotations of each structure to one label:
#   True  - every annotation is 'inhibitor'
#   False - no annotation is 'inhibitor'
#   None  - annotations conflict, or every annotation is missing
# The "all missing" case is handled explicitly so the result does not depend
# on how NumPy compares a non-string array with a string, and no blanket
# `except` is needed that would also hide genuine errors.
drug_list = []
inh = []
for drug, labels in inh_dict.items():
    if all(pd.isna(label) for label in labels):
        inh_label = None
    else:
        hits = [label == 'inhibitor' for label in labels]
        if all(hits):
            inh_label = True
        elif not any(hits):
            inh_label = False
        else:
            inh_label = None

    drug_list.append(drug)
    inh.append(inh_label)

# dropna() removes structures without a SMILES (failed standardization) or
# without a consensus label.
df_inh = pd.DataFrame({
    'smiles':drug_list,
    'label':inh,
})
df_inh=df_inh.dropna()

In [35]:
# Size of the inhibitor table and its True/False label balance.
len(df_inh),count_place(df_inh,'label')

(2247, {True: 1389, False: 858})

In [36]:
# Persist the inhibitor table (columns smiles, label) without the index column.
df_inh.to_csv('inhibitors_main.csv',index=False)

In [37]:
# Reload the three per-source inhibitor tables written by earlier cells.
df_main = pd.read_csv('inhibitors_main.csv')
df_metra = pd.read_csv('inhibitors_metra.csv')
df_multiclass = pd.read_csv('inhibitors_multiclass.csv')

In [38]:
# Compare the column sets of the three tables before stacking them.
df_main.columns,df_metra.columns,df_multiclass.columns

(Index(['smiles', 'label'], dtype='object'),
 Index(['smiles', 'label'], dtype='object'),
 Index(['drug', 'smiles', 'label'], dtype='object'))

In [39]:
# Stack the three tables row-wise and keep only the two columns they all share.
df = pd.concat([df_main,df_metra,df_multiclass],axis=0)[['smiles','label']]

In [40]:
# Collect the labels that the stacked tables report for each SMILES.
inh_dict = defaultdict(list)
for smi, flag in zip(df['smiles'], df['label']):
    inh_dict[smi].append(flag)

In [41]:
# Combine the labels collected per structure: keep the shared value when every
# source agrees, use None when they conflict (all() and any() then differ).
# Every key of inh_dict has at least one label, so no exception handling is
# needed; a blanket `except` here would only hide genuine errors.
inh = []
for labels in inh_dict.values():
    flags = np.array(labels)
    consensus = flags.all()
    inh.append(consensus if consensus == flags.any() else None)

df_inh = pd.DataFrame({
    'smiles':list(inh_dict.keys()),
    'label':inh,
})

In [42]:
# Remove rows with a missing value, i.e. structures whose label was set to None.
df_inh = df_inh.dropna()

In [43]:
# Size of the merged inhibitor table and its True/False label balance.
len(df_inh),count_place(df_inh,'label')

(2416, {True: 1538, False: 878})

In [44]:
# Persist the merged inhibitor table (columns smiles, label) without the index column.
df_inh.to_csv('inhibitors_origin.csv',index=False)

In [45]:
# Reload the two per-source substrate tables written by earlier cells.
# df_metra and df_multiclass are rebound: they held the inhibitor tables before.
df_metra = pd.read_csv('substrates_metra.csv')
df_multiclass = pd.read_csv('substrates_multiclass.csv')

In [46]:
# Compare the column sets of the two tables before stacking them.
df_metra.columns,df_multiclass.columns

(Index(['smiles', 'label'], dtype='object'),
 Index(['drug', 'smiles', 'label'], dtype='object'))

In [47]:
# Stack the two substrate tables, run the SMILES through clean_and_standardize
# again (index [1] = RDKit-uncharged form) and drop rows with a missing value.
merged = pd.concat([df_metra,df_multiclass],axis=0)[['smiles','label']]
df = merged.assign(
    smiles=[ clean_and_standardize(s)[1] for s in merged['smiles'] ]
).dropna()

In [48]:
# Collect the labels reported for each SMILES. The variable keeps the name
# inh_dict, but df holds the substrate tables at this point.
inh_dict = defaultdict(list)
for smi, flag in zip(df['smiles'], df['label']):
    inh_dict[smi].append(flag)

In [49]:
# Combine the labels collected per structure: keep the shared value when every
# source agrees, use None when they conflict (all() and any() then differ).
# The names inh / df_inh are kept for the cells below, although inh_dict was
# filled from the substrate tables here.
# Every key of inh_dict has at least one label, so no exception handling is
# needed; a blanket `except` here would only hide genuine errors.
inh = []
for labels in inh_dict.values():
    flags = np.array(labels)
    consensus = flags.all()
    inh.append(consensus if consensus == flags.any() else None)

df_inh = pd.DataFrame({
    'smiles':list(inh_dict.keys()),
    'label':inh,
})

In [50]:
# Remove rows with a missing value, i.e. structures whose label was set to None.
df_inh = df_inh.dropna()

In [51]:
# Size of the merged table and its True/False label balance.
len(df_inh),count_place(df_inh,'label')

(1171, {True: 753, False: 418})

In [52]:
# Persist the merged table without the index column. Despite its name, df_inh
# was built from the substrate tables at this point, hence the substrates_* file name.
df_inh.to_csv('substrates_origin.csv',index=False)