# This code and these splits are based on [Wu et al.'s MoleculeNet](https://pubs.rsc.org/en/content/articlelanding/2018/sc/c7sc02664a#!divAbstract) work:

 - Their code:
     * https://github.com/deepchem/deepchem/blob/master/examples/benchmark.py
     * http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/Hyperparameter_MoleculeNetv3.tar.gz
     
 - req:
    * deepchem: 2.4.0
    * tensorflow: 2.0.0
    * sklearn: 2.3

In [2]:
import deepchem as dc
import numpy as np
import pandas as pd
import os

# Use ./temp as the DeepChem data directory unless one is already configured
# in the environment.
if 'DEEPCHEM_DATA_DIR' not in os.environ:
    os.environ['DEEPCHEM_DATA_DIR'] = './temp'
# Last expression of the cell: displays the installed DeepChem version.
dc.__version__

'2.4.0-rc1.dev'

## Download the files from:
 - full: http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/full_smiles_labels.csv
 - core: http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/core_smiles_labels.csv
 - refined: http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/refined_smiles_labels.csv
 
 then put these files in the `DEEPCHEM_DATA_DIR`.

In [2]:
# PDBbind SMILES/label files (full, core and refined subsets) to download.
urls = ['http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/full_smiles_labels.csv',
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/core_smiles_labels.csv',
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/refined_smiles_labels.csv']

# Download into the directory named by DEEPCHEM_DATA_DIR rather than a
# hard-coded './temp': the variable is only defaulted to './temp' when it is
# unset, so a pre-existing value would otherwise leave the files in the wrong
# place. './temp' remains the fallback when the variable is missing.
data_dir = os.environ.get('DEEPCHEM_DATA_DIR', './temp')
# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(data_dir, exist_ok=True)

for url in urls:
    dc.utils.download_url(url, dest_dir=data_dir)

## 01_time split PDBbind datasets

In [3]:
# PDBbind subsets to export, one output directory per subset.
flags = ['full', 'core', 'refined']
# Names for the three parts of a split, by position (currently not written
# to the output; kept for reference).
marker = ['train', 'valid', 'test']

for flag in flags:
    # Load the subset with featurizer='Raw' and DeepChem's 'time' split.
    tasks, train_valid_test, transformer = dc.molnet.load_pdbbind_grid(
        split='time', featurizer='Raw', subset=flag)

    all_data = []
    indexs = []
    idx = 0  # row offset of the current part inside the concatenated table
    for subset in train_valid_test:
        dfx = pd.DataFrame(subset.ids, columns=['smiles'])
        dfy = pd.DataFrame(subset.y, columns=tasks)
        dfa = dfx.join(dfy)
        # Positional row numbers this part occupies once all parts are
        # concatenated in order.
        indexs.append(list(range(idx, idx + len(dfa))))
        idx += len(dfa)
        all_data.append(dfa)

    # Single table holding every part, keyed by the 'smiles' column.
    df = pd.concat(all_data).set_index('smiles')

    dirs = os.path.join('./data_and_index', 'PDBbind-%s' % flag)
    # exist_ok replaces the race-prone "check, then create" pattern.
    os.makedirs(dirs, exist_ok=True)

    df.to_csv(os.path.join(dirs, 'PDBbind-%s.csv.gz' % flag), compression='gzip')
    # One list of row positions per part, stored as a pickled pandas Series.
    ind = pd.Series(indexs)
    ind.to_pickle(os.path.join(dirs, 'PDBbind-%s.timesplit.ind.pkl' % flag))

## 02_random and scaffold split datasets

In [3]:
random_seeds = [122, 123, 124] # the original random seeds used in the MoleculeNet paper <Wu et al.>

In [4]:
# Base URL of the dataset bucket
# (from https://www.deepchem.io/_modules/index.html).
s3_bucket = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/'

# dataset name -> [load_function, s3_file_name, smiles_field]
loading_functions = {
    'ESOL': [dc.molnet.load_delaney, 'delaney-processed.csv', 'smiles'],
    'FreeSolv': [dc.molnet.load_sampl, 'SAMPL.csv', 'smiles'],
    'Lipop': [dc.molnet.load_lipo, 'Lipophilicity.csv', 'smiles'],
    'SIDER': [dc.molnet.load_sider, 'sider.csv.gz', 'smiles'],
    'Tox21': [dc.molnet.load_tox21, 'tox21.csv.gz', 'smiles'],
    # NOTE: problems merging on SMILES; the dataset changes after the merge.
    'ToxCast': [dc.molnet.load_toxcast, 'toxcast_data.csv.gz', 'smiles'],
    'ClinTox': [dc.molnet.load_clintox, 'clintox.csv.gz', 'smiles'],
    # NOTE: problems merging on SMILES; the dataset changes after the merge.
    'HIV': [dc.molnet.load_hiv, 'HIV.csv', 'smiles'],
    'BBBP': [dc.molnet.load_bbbp, 'BBBP.csv', 'smiles'],
    'BACE': [dc.molnet.load_bace_classification, 'bace.csv', 'smiles'],
    'PCBA': [dc.molnet.load_pcba, 'pcba.csv.gz', 'smiles'],
    'MUV': [dc.molnet.load_muv, 'muv.csv.gz', 'smiles'],
}

# Datasets that get a scaffold split; every other dataset is split randomly.
scaffold_split_dataset = ['HIV', 'BBBP', 'BACE']

# Splitter instances keyed by split name.
splitters = {
    'random': dc.splits.RandomSplitter(),
    'scaffold': dc.splits.ScaffoldSplitter(),
}

In [6]:
# For every dataset: write the task table as <dataset>.csv.gz and, for each
# seed, a pickled Series holding the CSV row indices of the three split parts.
for dataset, (load_data, s3_file, smiles_col) in loading_functions.items():

    # One output directory per dataset.
    dirs = os.path.join('./data_and_index2', dataset)
    os.makedirs(dirs, exist_ok=True)

    # Load through DeepChem (featurizer='Raw', split='index') and merge the
    # returned parts back into a single dataset before re-splitting below.
    tasks, all_dataset, transformers = load_data(featurizer='Raw', split='index', verbose=0)
    all_dataset = dc.data.DiskDataset.merge(all_dataset)

    # Table view of the merged DeepChem dataset; only its length is reported.
    dfX = pd.DataFrame(all_dataset.ids, columns=['smiles'])
    dfY = pd.DataFrame(all_dataset.y, columns=tasks)
    dfO = dfX.join(dfY)

    # Read the source CSV straight from the bucket. The URL is assembled with
    # plain string operations (os.path.join is a filesystem helper), and gzip
    # is selected by file suffix rather than by a substring match.
    compression = 'gzip' if s3_file.endswith('.gz') else None
    csv_url = s3_bucket.rstrip('/') + '/' + s3_file
    dfAll = pd.read_csv(csv_url, compression=compression)
    # The two tables are not required to have the same number of rows; only
    # the presence of every task column in the CSV is checked.
    assert set(tasks).issubset(dfAll.columns)

    # The split type depends only on the dataset, so resolve it once.
    split = 'scaffold' if dataset in scaffold_split_dataset else 'random'
    splitter = splitters[split]

    for seed in random_seeds:
        np.random.seed(seed)
        train_valid_test = splitter.train_valid_test_split(all_dataset,
                                                           seed=seed,
                                                           frac_train=0.8,
                                                           frac_valid=0.1,
                                                           frac_test=0.1)

        # Map each part back to row labels of the CSV by SMILES string.
        # NOTE(review): matching is by value, so a SMILES that occurs in more
        # than one CSV row is assigned to every part containing that SMILES;
        # compare `sall` with len(dfAll) in the printed line to spot this.
        indexs = [dfAll[dfAll[smiles_col].isin(subset.ids)].index.tolist()
                  for subset in train_valid_test]
        sall = sum(len(part) for part in indexs)
        print(dataset, seed, len(dfO), len(dfAll), sall, len(indexs[0]), len(indexs[1]), len(indexs[2]))

        # One list of CSV row indices per part, stored as a pickled Series.
        ind = pd.Series(indexs)
        ind.to_pickle(os.path.join(dirs, '%s.%s%s.ind.pkl' % (dataset, split, seed)))

    # Keep only the task columns, indexed by SMILES, and write the table.
    dfAll = dfAll.set_index(smiles_col)[tasks]
    dfAll.index.name = 'smiles'

    dfAll.to_csv(os.path.join(dirs, '%s.csv.gz' % dataset), compression='gzip')



ESOL 122 1128 1128 1128 902 113 113
ESOL 123 1128 1128 1128 902 113 113
ESOL 124 1128 1128 1128 902 113 113
FreeSolv 122 642 642 642 513 64 65
FreeSolv 123 642 642 642 513 64 65
FreeSolv 124 642 642 642 513 64 65
Lipop 122 4200 4200 4200 3360 420 420
Lipop 123 4200 4200 4200 3360 420 420
Lipop 124 4200 4200 4200 3360 420 420
SIDER 122 1427 1427 1427 1141 143 143
SIDER 123 1427 1427 1427 1141 143 143
SIDER 124 1427 1427 1427 1141 143 143
Tox21 122 7831 7831 7831 6264 783 784
Tox21 123 7831 7831 7831 6264 783 784
Tox21 124 7831 7831 7831 6264 783 784
ToxCast 122 8597 8576 8576 6860 858 858
ToxCast 123 8597 8576 8576 6860 858 858
ToxCast 124 8597 8576 8576 6860 858 858
ClinTox 122 1484 1478 1478 1182 148 148
ClinTox 123 1484 1478 1478 1182 148 148
ClinTox 124 1484 1478 1478 1182 148 148




HIV 122 41127 41127 41127 32901 4113 4113




HIV 123 41127 41127 41127 32901 4113 4113




HIV 124 41127 41127 41127 32901 4113 4113




BBBP 122 2050 2039 2039 1631 204 204




BBBP 123 2050 2039 2039 1631 204 204




BBBP 124 2050 2039 2039 1631 204 204
BACE 122 1513 1513 1513 1210 151 152
BACE 123 1513 1513 1513 1210 151 152


smiles_field is deprecated and will be removed in a future version of DeepChem.Use feature_field instead.


BACE 124 1513 1513 1513 1210 151 152




PCBA 122 437929 437929 437929 350343 43793 43793
PCBA 123 437929 437929 437929 350343 43793 43793
PCBA 124 437929 437929 437929 350343 43793 43793
MUV 122 93087 93087 93087 74469 9309 9309
MUV 123 93087 93087 93087 74469 9309 9309
MUV 124 93087 93087 93087 74469 9309 9309


# 03 deal with ChEMBL 

A benchmark dataset with 1,310 assays and 4,743,712 assay measurements of 456,331 compounds, scaffold split
---
* paper: Mayr et al: https://pubs.rsc.org/en/Content/ArticleLanding/2018/SC/c8sc00148k#!divAbstract;
* paper: Yang et al: https://pubs.acs.org/doi/pdf/10.1021/acs.jcim.9b00237

* dataset prep. details : http://www.rsc.org/suppdata/c8/sc/c8sc00148k/c8sc00148k1.pdf

----

 * the original ChEMBL dataset is from https://github.com/swansonk14/chemprop, compressed to a gzip file
 * the original scaffold split indices are from https://github.com/swansonk14/chemprop/blob/master/splits.tar.gz; according to Yang et al.'s paper, only three splits are used for the ChEMBL dataset, and the scaffold splits 10, 11 and 12 are taken from that archive
