In [1]:
from molmap import loadmap

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit import RDLogger
#IPythonConsole.ipython_useSVG = True
import numpy as np
import pandas as pd
from tqdm import tqdm
from openbabel import pybel
RDLogger.DisableLog('rdApp.*')

In [2]:
# Load the saved map object from ./test.mp. The functions defined in later
# cells read it through the module-level name `mp1`.
mp1 = loadmap('./test.mp')

In [3]:
def clean_and_standardize(smiles,ph=7.4,iso=False):
    """Return two cleaned SMILES strings for the largest fragment of a molecule.

    Args:
        smiles: Input SMILES string.
        ph: pH value forwarded to Open Babel's ``AddHydrogens``.
        iso: Passed as ``isomericSmiles`` to every RDKit ``MolToSmiles`` call.

    Returns:
        ``(adjusted_smiles, uncharge_smiles)`` where ``adjusted_smiles`` is the
        SMILES written by Open Babel after ``AddHydrogens(False, True, ph)`` and
        ``uncharge_smiles`` is the RDKit SMILES of the fragment after
        ``Uncharger.uncharge``. ``(None, None)`` is returned when the input does
        not parse, has no fragments, or any exception is raised (the exception
        is printed, not re-raised).
    """
    try:
        parsed = Chem.MolFromSmiles(smiles)
        if parsed is None:
            # RDKit could not parse the input.
            return None,None

        # Salt/solvent stripping: keep only the fragment with the most atoms.
        pieces = Chem.GetMolFrags(parsed, asMols=True)
        if not pieces:
            return None,None
        main_piece = max(pieces, key=lambda frag: frag.GetNumAtoms())

        # Neutral form, written by RDKit.
        neutral_mol = rdMolStandardize.Uncharger().uncharge(main_piece)
        uncharge_smiles = Chem.MolToSmiles(neutral_mol, isomericSmiles=iso, canonical=True)

        # pH-dependent form: hand the (still charged) fragment to Open Babel.
        piece_smiles = Chem.MolToSmiles(main_piece, isomericSmiles=iso, canonical=True)
        ob_mol = pybel.readstring("smi", piece_smiles)
        ob_mol.OBMol.AddHydrogens(False, True, ph)
        adjusted_smiles = ob_mol.write("smi").strip()
    except Exception as e:
        print(f"Error processing SMILES {smiles}: {e}")
        return None,None

    return adjusted_smiles, uncharge_smiles
    
# def clean_csv(input_name,output_name):
#     inh = pd.read_csv(input_name,encoding='utf-8')
#     smiles_list = inh['smiles'].tolist()
#     clean = []
#     neu = []
#     for smi in smiles_list:
#         std_smi,neu_smi = clean_and_standardize(smi)
#         clean.append(std_smi)
#         neu.append(neu_smi)
#     inh['clean_smiles_pH'] = clean
#     inh['clean_smiles_neu'] = neu
#     inh.to_csv(output_name,index=False)

In [4]:
def clean_csv(input_name,output_name):
    """Add cleaned SMILES columns to a CSV and write the result to a new file.

    Reads ``input_name`` (UTF-8), runs ``clean_and_standardize`` on every entry
    of its ``smiles`` column, stores the first element of each result in
    ``clean_smiles_pH`` and the second in ``clean_smiles_neu``, and writes the
    table to ``output_name`` without the index.
    """
    table = pd.read_csv(input_name, encoding='utf-8')
    # One (pH-adjusted, neutral) pair per input row, in row order.
    pairs = [clean_and_standardize(raw) for raw in table['smiles'].tolist()]
    table['clean_smiles_pH'] = [ph_form for ph_form, _ in pairs]
    table['clean_smiles_neu'] = [neutral_form for _, neutral_form in pairs]
    table.to_csv(output_name, index=False)

In [5]:
# NOTE(review): this repeats the load of './test.mp' done in the earlier cell;
# on a top-to-bottom run it only rebinds `mp1` to a freshly loaded copy.
mp1 = loadmap('./test.mp')
def generate_MolDs(input_name,output_name,columns):
    """Compute per-molecule features and flag rows whose features contain NaN.

    Args:
        input_name: CSV to read. It is OVERWRITTEN in place with one extra
            column (see below).
        output_name: CSV that receives the feature rows free of NaN.
        columns: Name of a single column holding the SMILES strings (a string,
            despite the plural name; it is concatenated with ``'_valid'``).

    Side effects:
        * ``input_name`` gains the column ``columns + '_valid'``. Note the
          inverted sense: the value is 1 when the feature row contains a NaN
          and 0 when the row is usable. ``generate_map`` selects rows with 0.
        * ``output_name`` contains only the feature rows flagged 0, so its
          rows align with the rows of ``input_name`` flagged 0.

    Relies on the module-level ``mp1`` object.
    """
    source = pd.read_csv(input_name, encoding='utf-8')
    feature_ids = mp1.extract.bitsinfo.IDs

    per_molecule = []
    for s in tqdm(source[columns]):
        # transform() output is turned into a single-row frame, as before.
        row = pd.DataFrame(mp1.extract.transform(s)).T
        row.columns = feature_ids
        per_molecule.append(row)
    # ignore_index gives the feature frame a unique 0..n-1 index instead of
    # n copies of index 0; the written CSV is unaffected (index=False).
    feat = pd.concat(per_molecule, ignore_index=True)

    has_nan = feat.isna().any(axis=1).to_numpy()
    # Keep the input frame under its own name so the flag can be attached
    # without reading the CSV a second time.
    source[columns+'_valid'] = has_nan.astype(np.int_)

    feat = feat[~has_nan]

    source.to_csv(input_name,index=False)
    feat.to_csv(output_name,index=False)

In [6]:
def generate_map(input_name,output_name,columns):
    """Transform the usable SMILES of a CSV with ``mp1`` and save the result.

    Rows of ``input_name`` whose ``columns + '_valid'`` value equals 0 are
    selected, their ``columns`` entries are passed as a list to
    ``mp1.batch_transform``, and the returned object is stored with
    ``np.savez`` under the key ``x`` in ``output_name``.
    """
    table = pd.read_csv(input_name, encoding='utf-8')
    usable = table[columns+'_valid'] == 0
    smiles_batch = table.loc[usable, columns].tolist()
    feature_maps = mp1.batch_transform(smiles_batch)
    np.savez(output_name, x=feature_maps)

In [7]:
def scaling_min_max(df):
    """Apply ``mp1.MinMaxScaleClip`` to ``df`` and keep the ``mp1.flist`` columns.

    The bounds passed to ``MinMaxScaleClip`` are the ``'min'`` and ``'max'``
    entries of ``mp1.scale_info``. The result is then indexed with
    ``mp1.flist`` and returned.
    """
    lower_bound = mp1.scale_info['min']
    upper_bound = mp1.scale_info['max']
    scaled = mp1.MinMaxScaleClip(df, lower_bound, upper_bound)
    return scaled[mp1.flist]

def scaling_csv(name):
    """Scale a feature CSV and write it next to the input with a ``scaled_`` prefix.

    Args:
        name: Path of the CSV to read and pass through ``scaling_min_max``.

    The output file sits in the same directory as ``name`` and is called
    ``'scaled_' + <file name>``. For a bare file name this is exactly
    ``'scaled_' + name``; for a path such as ``data/x.csv`` the prefix is
    applied to the file name (``data/scaled_x.csv``) rather than to the
    directory part.
    """
    import os

    folder, filename = os.path.split(name)
    target = os.path.join(folder, 'scaled_' + filename)
    scaling_min_max(pd.read_csv(name)).to_csv(target, index=False)

In [8]:
def data_prepare(name,addition):
    """Run the cleaning / descriptor / map pipeline for one dataset and column.

    Args:
        name: Dataset stem; the raw table is ``name + '.csv'`` and the cleaned
            table is ``'clean_' + name + '.csv'``.
        addition: Stem used for the output files
            (``clean_<addition>_MolDs.csv`` and ``clean_<addition>.npz``).

    When ``name == addition`` the raw CSV is cleaned first, de-duplicated on
    ``clean_smiles_pH``, and the ``clean_smiles_pH`` column is processed.
    Otherwise the cleaned CSV is expected to exist already and the
    ``clean_smiles_neu`` column is processed.
    """
    origin = name+'.csv'
    clean = 'clean_'+name+'.csv'
    clean_MolDs = 'clean_'+addition+'_MolDs.csv'
    clean_npz = 'clean_'+addition+'.npz'

    if name == addition:
        # First pass for this dataset: build and de-duplicate the cleaned table.
        clean_csv(origin, clean)
        deduplicated = pd.read_csv(clean).drop_duplicates(subset='clean_smiles_pH')
        deduplicated.to_csv(clean, index=False)
        smiles_column = 'clean_smiles_pH'
    else:
        smiles_column = 'clean_smiles_neu'

    generate_MolDs(clean, clean_MolDs, smiles_column)
    scaling_csv(clean_MolDs)
    generate_map(clean, clean_npz, smiles_column)

In [18]:
# Order matters: the first call (name == addition) creates clean_<name>.csv and
# processes the pH-adjusted SMILES; the second call reuses that file for the
# neutral SMILES and writes its outputs under the '<name>_neu' stem.
for name in ['refine_inhibitors']:
    data_prepare(name,name)
    data_prepare(name,name+'_neu')

100%|██████████| 174/174 [00:28<00:00,  6.12it/s]
100%|##########| 174/174 [00:07<00:00, 24.21it/s]
100%|██████████| 174/174 [00:28<00:00,  6.08it/s]
100%|##########| 174/174 [00:07<00:00, 24.06it/s]


In [None]:
# Order matters: the first call (name == addition) creates clean_<name>.csv and
# processes the pH-adjusted SMILES; the second call reuses that file for the
# neutral SMILES and writes its outputs under the '<name>_neu' stem.
for name in ['refine_substrates']:
    data_prepare(name,name)
    data_prepare(name,name+'_neu')

 16%|█▌        | 21/130 [00:03<00:16,  6.42it/s]

In [10]:
# Load the cleaned inhibitor table (the file data_prepare writes for 'refine_inhibitors').
df = pd.read_csv('clean_refine_inhibitors.csv')

In [13]:
# Sum of 'labels' and the remaining row count; reads as (positives, negatives)
# if 'labels' is a 0/1 column — TODO confirm.
df['labels'].sum(), len(df)-df['labels'].sum()

(101, 115)

In [14]:
# Load the cleaned substrate table. NOTE(review): this rebinds `df`, which
# previously held the inhibitor table.
df = pd.read_csv('clean_refine_substrates.csv')

In [15]:
# Sum of 'labels' and the remaining row count; reads as (positives, negatives)
# if 'labels' is a 0/1 column — TODO confirm.
df['labels'].sum(), len(df)-df['labels'].sum()

(193, 191)