In [1]:
import deepchem as dc
from rdkit import Chem
from rdkit.Chem.MolStandardize.rdMolStandardize import StandardizeSmiles
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

import numpy as np
import pandas as pd
import os

In [50]:
def neutralize_atoms(mol):
    """Zero the formal charge of every atom matched by the charge SMARTS.

    The query selects atoms that are either +1 with a non-zero implicit
    hydrogen count and no negatively charged neighbour, or -1 with no
    positively charged neighbour.  For each match the formal charge is set to
    0 and the explicit hydrogen count is set to
    ``total hydrogens - original charge``.

    Args:
        mol: RDKit molecule to modify.  It is edited in place.

    Returns:
        The same ``mol`` object that was passed in.
    """
    charged_atom_query = Chem.MolFromSmarts(
        "[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]"
    )
    # Each match is a tuple of atom indices; the query describes a single
    # atom, so only the first index of every match is relevant.
    for match in mol.GetSubstructMatches(charged_atom_query):
        charged_atom = mol.GetAtomWithIdx(match[0])
        original_charge = charged_atom.GetFormalCharge()
        total_hydrogens = charged_atom.GetTotalNumHs()
        charged_atom.SetFormalCharge(0)
        # Removing a positive charge drops one H, removing a negative adds one.
        charged_atom.SetNumExplicitHs(total_hydrogens - original_charge)
        charged_atom.UpdatePropertyCache()
    return mol

def try_standardize(smi):
    """Neutralise and standardise one SMILES string, best effort.

    The input is parsed with RDKit, passed through ``neutralize_atoms``,
    written back to SMILES and finally run through ``StandardizeSmiles``.

    Args:
        smi: SMILES string to clean up.

    Returns:
        The standardised SMILES, or the sentinel string ``'fail'`` when the
        input cannot be parsed or any processing step raises.
    """
    try:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # RDKit signals an unparsable SMILES by returning None.
            return 'fail'
        neutral_smiles = Chem.MolToSmiles(neutralize_atoms(mol))
        return StandardizeSmiles(neutral_smiles)
    except Exception:
        # Deliberately best-effort, but only for ordinary errors: a bare
        # ``except`` would also swallow KeyboardInterrupt/SystemExit and make
        # a long ``.map`` over the dataset impossible to interrupt.
        return 'fail'

def clean_up_smiles(df):
    """Return a copy of ``df`` with standardised SMILES and failures removed.

    Every value of the ``smiles`` column is passed through
    ``try_standardize``; rows for which it returns the sentinel ``'fail'`` are
    dropped.  The input frame is left untouched, so the raw data stays
    available to the caller and re-running the call gives the same result.

    Args:
        df: DataFrame with a ``smiles`` column.

    Returns:
        A new DataFrame with the same columns and the original index labels
        of the surviving rows.
    """
    # ``assign`` builds a new frame instead of overwriting the caller's column.
    cleaned = df.assign(smiles=df['smiles'].map(try_standardize))
    return cleaned[cleaned['smiles'] != 'fail']

def df_to_ds(df):
    """Wrap a SMILES/label DataFrame in a DeepChem ``DiskDataset``.

    Args:
        df: DataFrame exposing ``smiles`` and ``label`` columns.

    Returns:
        The object returned by ``dc.data.DiskDataset.from_numpy`` with an
        all-zero ``(len(df), 1)`` feature matrix, the stacked labels as ``y``
        and the ``smiles`` column as ``ids``.
    """
    n_rows = len(df)
    # No real features are computed here; X is only a zero placeholder.
    # NOTE(review): presumably featurisation happens downstream -- confirm.
    placeholder_features = np.zeros(shape=(n_rows, 1))
    stacked_labels = np.vstack(df.label.to_numpy())
    return dc.data.DiskDataset.from_numpy(
        X=placeholder_features,
        y=stacked_labels,
        ids=df.smiles,
    )

In [51]:
# --- Run configuration ------------------------------------------------------
path_to_pickle = 'datasets/small_dataset2_CHEMBL2114881.pkl'
# Split strategy by name; the selection cell below recognises 'random',
# 'fingerprint' and 'scaffold' and rebinds `splitter` to a splitter object.
splitter = 'random'
# Placeholders that later cells overwrite once the split strategy is resolved.
split_type = ''
out_dir_name = ''

# --- Load, clean, wrap ------------------------------------------------------
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
df = clean_up_smiles(pd.read_pickle(path_to_pickle))
ds = df_to_ds(df)

In [52]:
# Inspect the cleaned frame; a bare last expression is rendered as the cell output.
df

Unnamed: 0,smiles,label
0,CN(C)c1ccc(NC2=C(Cl)C(=O)c3ccccc3C2=O)cc1,3.913711
1,Oc1ccc(Cl)cc1/N=C/c1c[nH]nc1O,4.522156
2,COc1cc(CN2CCC(C(=O)N3CCN(c4ccccc4F)CC3)CC2)ccc...,3.958528
3,Oc1ccccc1CNc1ccc2c(c1)OCCO2,4.022551
4,O=C(COc1ccc(Nc2ccccc2)cc1)OCC(=O)N1CCCCC1,4.007756
...,...,...
2049,N#CC1=C(S)N(c2ccc(F)cc2)C(c2ccco2)NC1=O,4.073503
2050,COc1ccc(NCc2cc3ccccc3nc2O)cc1,4.023558
2051,COc1cc(/C=C2/C(=O)NN(c3ccc(Cl)cc3)C2=O)cc(Br)c1O,3.985774
2052,COc1ccc(Br)cc1CN(C)CC(=O)NCCc1ccccc1.O=C(O)C(=O)O,4.019815


In [4]:
# Resolve the configured splitter name into a DeepChem splitter object and the
# short tag that is used in the output directory name.
# The isinstance guard keeps this cell safe to re-run: once `splitter` has been
# rebound to an object there is nothing left to resolve.
if isinstance(splitter, str):
    if splitter == 'random':
        splitter = dc.splits.RandomSplitter()
        split_type = 'random'
    elif splitter == 'fingerprint':
        splitter = dc.splits.FingerprintSplitter()
        split_type = 'finger'
    elif splitter == 'scaffold':
        splitter = dc.splits.ScaffoldSplitter()
        split_type = 'scafld'
    else:
        # Fail here rather than several cells later with an unrelated error
        # (`splitter` would still be a str and `split_type` would be empty).
        raise ValueError(
            f"Unknown splitter {splitter!r}; expected 'random', 'fingerprint' or 'scaffold'"
        )

In [5]:
# Build the 5 cross-validation folds with the splitter selected earlier.
# (The name `random` is never defined in this notebook, so calling it raised
# NameError on a fresh kernel.)  Each element is later indexed as
# [0] = train dataset, [1] = test dataset.
train_test_splits = splitter.k_fold_split(dataset=ds, k=5)

In [7]:
# Directory that receives the per-fold pickles; named after the split strategy.
out_dir_name = f'CHEMBL2114881_{split_type}_split'
# exist_ok=True keeps the cell re-runnable: os.mkdir raised FileExistsError as
# soon as the directory was left over from a previous run.
os.makedirs(out_dir_name, exist_ok=True)

In [30]:
# Write every fold to disk as two pickled DataFrames, `train<i>` and `test<i>`,
# each holding only the `smiles` and `label` columns.
for i, tt in enumerate(train_test_splits):
    # Train half of the fold: rename dataset columns, drop features/weights.
    train_df = (
        tt[0]
        .to_dataframe()
        .rename({'ids': 'smiles', 'y': 'label'}, axis='columns')
        .drop(['X', 'w'], axis='columns')
    )
    train_df.to_pickle(f'{out_dir_name}/train{i}')

    # Test half of the fold, processed the same way.
    test_df = (
        tt[1]
        .to_dataframe()
        .rename({'ids': 'smiles', 'y': 'label'}, axis='columns')
        .drop(['X', 'w'], axis='columns')
    )
    test_df.to_pickle(f'{out_dir_name}/test{i}')

In [27]:
# Read the exported folds back as a list of (train_frame, test_frame) pairs.
# range(5) mirrors the k=5 used when the folds were created.
test = []
for i in range(5):
    train_fold = pd.read_pickle(f'{out_dir_name}/train{i}')
    test_fold = pd.read_pickle(f'{out_dir_name}/test{i}')
    test.append((train_fold, test_fold))