# Notebook to perform data splitting for train/test & val
We use a single combined train/test set because our models are trained with cross-validation, which should give an initial idea of how well each model generalizes. A separate validation set is held out in this notebook.

In [120]:
# TODO: get this to loop over all the datasets (the final section of this notebook applies the split to every dataset).

In [121]:
import os

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import train_test_split
from tqdm import tqdm
tqdm.pandas()
np.random.seed(0)

In [122]:
# Input and output locations, relative to the notebook's working directory.
path='data_cleaned'
out_path='data_split_cleaned'
# Create the output directory up front so the later to_csv calls cannot fail on a missing folder.
os.makedirs(out_path,exist_ok=True)

files=os.listdir(path)
# Per-dataset column names, keyed by file name:
#   'target'    -> label column (rows without a value are dropped before splitting)
#   'structure' -> column holding the structure string passed to generate_fingerprint
data_map={
    'HIV.csv': {'target':'HIV_active','structure':'smiles'},
    'bace.csv':{'target':'active','structure':'mol'},
    'tox21.csv':{'target':'NR-AhR','structure':'smiles'},
    'clintox.csv':{'target':'CT_TOX','structure':'smiles'},
    'sol_del.csv':{'target':'binned_sol','structure':'smiles'},
    'deepchem_Lipophilicity.csv':{'target':'drug_like','structure':'smiles'}
}

# Fraction of each dataset held out for validation (used by both the random and the clustered split).
validation_size=0.15

In [123]:
# Worked example: list the cleaned-data directory and load the BACE dataset.
files = os.listdir(path)

bace_file = os.path.join(path, 'bace.csv')
df = pd.read_csv(bace_file)

In [124]:
def generate_fingerprint(smiles,radius,bits):
    """Return the Morgan fingerprint of a structure string as a numpy array.

    Parameters
    ----------
    smiles : structure string handed to ``Chem.MolFromSmiles``.
    radius : Morgan fingerprint radius.
    bits : length of the fingerprint bit vector.

    Returns
    -------
    numpy array built from the bit vector, or ``np.nan`` when the structure
    cannot be converted (a message naming the structure is printed), so that
    failed rows can be removed afterwards with ``dropna``.
    """
    try:
        mol=Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None rather than raising.
            raise ValueError('structure could not be parsed')
        fp=AllChem.GetMorganFingerprintAsBitVect(mol,radius,bits)
        return np.array(fp)
    except Exception:
        # Catch Exception only, so KeyboardInterrupt / SystemExit still stop a long run.
        print(f'{smiles} failed in RDkit')
        return np.nan

In [125]:
# Build Morgan fingerprints for BACE, then discard rows that cannot be used:
# structures that failed conversion (their 'fp' is NaN) and rows with no 'active' label.
radius, bits = 2, 1024
df['fp'] = df['mol'].apply(lambda structure: generate_fingerprint(structure, radius, bits))
df.dropna(subset=['fp', 'active'], inplace=True)

# Random split:

In [126]:
# Hold out `validation_size` of the rows at random; the fixed seed keeps the split reproducible.
df_train, df_validate = train_test_split(
    df,
    test_size=validation_size,
    random_state=0,
)

In [127]:
# Size and class balance of the random training split.
print(df_train.shape, df_train['active'].value_counts(), sep='\n')
#df_train.head(2)

(1286, 598)
1    856
0    430
Name: active, dtype: int64


In [128]:
# Size and class balance of the random validation split.
print(df_validate.shape, df_validate['active'].value_counts(), sep='\n')
#df_validate.head(2)

(227, 598)
1    156
0     71
Name: active, dtype: int64


In [129]:
# Save the random BACE split to `out_path`.
# NOTE(review): the 'fp' column holds numpy arrays, which CSV can only store as text;
# confirm downstream code recomputes fingerprints rather than reading this column back.
dataset='bace'
train_csv=os.path.join(out_path,f'{dataset}_train.csv')
validate_csv=os.path.join(out_path,f'{dataset}_validate.csv')
df_train.to_csv(train_csv)
df_validate.to_csv(validate_csv)

# Clustered Split
To make the split more realistic, and to better understand how well the model will generalize to new data, let's also look at a split based on clusters of the data. The goal is to minimize the overlap in chemical structure between the two data sets. This makes the task much harder for a model that, for instance, just memorizes certain features of the potent scaffolds.<br>
This code was inspired by the chemprop scaffold based split: https://chemprop.readthedocs.io/en/latest/_modules/chemprop/data/scaffold.html

In [130]:
# Roughly one cluster per 30 molecules, but never fewer than one so that
# MiniBatchKMeans always receives a valid n_clusters.
clusters=max(1,df.shape[0]//30)
# Each element of the fit input is one molecule's fingerprint array.
kmeans = MiniBatchKMeans(n_clusters=clusters,random_state=0,batch_size=100).fit(df['fp'].to_list())

In [131]:
# Store each molecule's KMeans cluster id alongside its row.
cluster_labels=kmeans.labels_
df['cluster']=cluster_labels

In [132]:
# Number of molecules in each cluster.
cluster_sizes=df['cluster'].value_counts()
cluster_sizes

9     157
25     82
7      79
8      75
12     67
36     67
39     62
3      57
27     54
34     47
35     45
18     37
16     36
49     36
48     35
10     34
5      33
30     33
33     33
17     30
19     29
6      28
32     28
22     27
38     25
26     24
13     21
40     20
14     18
28     18
46     16
43     15
42     15
23     15
2      14
20     12
15     11
37     10
4      10
45      9
31      8
44      8
1       8
47      7
0       4
29      4
21      4
24      2
41      2
11      2
Name: cluster, dtype: int64

In [133]:
# Assign whole clusters to either train or validation so that no cluster is shared between them.
val_size=validation_size*len(df)  # target number of validation rows
train_parts=[]
val_parts=[]
val_rows=0
for _, dataframe in df.groupby('cluster'):
    # A cluster goes to validation only when it is no larger than half the validation
    # budget and still fits in what is left of that budget; otherwise it goes to train.
    if len(dataframe) <= val_size / 2 and val_rows+len(dataframe) <= val_size:
        val_parts.append(dataframe)
        val_rows+=len(dataframe)
    else:
        train_parts.append(dataframe)
# Concatenate once instead of re-copying a growing DataFrame on every iteration.
# If one side received no clusters, use an empty frame that still has df's columns.
df_cluster_train=pd.concat(train_parts) if train_parts else df.iloc[:0]
df_cluster_val=pd.concat(val_parts) if val_parts else df.iloc[:0]
        

In [134]:
# Size and class balance of the clustered training split.
print(df_cluster_train.shape, df_cluster_train['active'].value_counts(), sep='\n')

(1287, 599)
1    845
0    442
Name: active, dtype: int64


In [135]:
# Size and class balance of the clustered validation split.
print(df_cluster_val.shape, df_cluster_val['active'].value_counts(), sep='\n')

(226, 599)
1    167
0     59
Name: active, dtype: int64


In [136]:
# The class balance differs slightly between the two sets; however, the benefit of not sharing
# scaffolds between train and validation should outweigh this small imbalance.

In [137]:
# Fraction of actives in the clustered training set, computed from the data so it
# stays correct if the split changes (rather than retyping the printed counts).
float((df_cluster_train['active']==1).mean())

0.6565656565656566

In [138]:
# Fraction of actives in the clustered validation set, computed from the data so it
# stays correct if the split changes (rather than retyping the printed counts).
float((df_cluster_val['active']==1).mean())

0.7389380530973452

In [139]:
# Save the clustered BACE split. It is written to `out_path`, the same directory the
# all-datasets loop uses for its cluster files; `out_path_cluster` is not defined
# anywhere in this notebook, so referencing it fails on a fresh kernel.
dataset='bace'
df_cluster_train.to_csv(os.path.join(out_path,f'{dataset}_cluster_train.csv'))
df_cluster_val.to_csv(os.path.join(out_path,f'{dataset}_cluster_validate.csv'))

# Apply this splitting to all of the datasets:

In [141]:
# setup loop to split datasets

def _cluster_split(df,validation_size):
    """Split `df` into train and validation frames without sharing any cluster.

    Clusters (values of the 'cluster' column) are visited in the order produced by
    ``df.groupby('cluster')``. A cluster is placed in validation when it is no larger
    than half the validation budget (``validation_size*len(df)`` rows) and still fits
    in what is left of that budget; every other cluster goes to train.

    Parameters
    ----------
    df : DataFrame with a 'cluster' column.
    validation_size : fraction of rows targeted for validation.

    Returns
    -------
    (train, validation) DataFrames. A side that received no clusters is an empty
    frame with the same columns as `df`.
    """
    val_size=validation_size*len(df)
    train_parts,val_parts,val_rows=[],[],0
    for _,dataframe in df.groupby('cluster'):
        if len(dataframe) <= val_size/2 and val_rows+len(dataframe) <= val_size:
            val_parts.append(dataframe)
            val_rows+=len(dataframe)
        else:
            train_parts.append(dataframe)
    # Concatenate once rather than re-copying a growing frame on every iteration.
    empty=df.iloc[:0]
    train=pd.concat(train_parts) if train_parts else empty
    validation=pd.concat(val_parts) if val_parts else empty
    return train,validation


# Morgan fingerprint settings shared by every dataset.
radius=2
bits=1024

for file in files:
    if not file.endswith('.csv'):
        continue
    if file not in data_map:
        # Without a data_map entry the target/structure columns are unknown.
        print(f'{file} skipped: no entry in data_map')
        continue
    print(file)
    dataset_name=os.path.splitext(file)[0]
    df=pd.read_csv(os.path.join(path,file))
    # extract the dataset features:
    target=data_map[file]['target']
    structure_col=data_map[file]['structure']

    # add the features, dropping rows whose structure failed to convert or that lack a target value:
    df['fp'] = df[structure_col].apply(lambda x: generate_fingerprint(x,radius,bits))
    df.dropna(subset=['fp',target],inplace=True)

    # perform a random split:
    # NOTE(review): the 'fp' column holds numpy arrays, which CSV can only store as text;
    # confirm downstream code recomputes fingerprints rather than reading this column back.
    df_train, df_validate = train_test_split(df,test_size=validation_size,random_state=0)
    df_train.to_csv(os.path.join(out_path,f'{dataset_name}-random-train.csv'))
    df_validate.to_csv(os.path.join(out_path,f'{dataset_name}-random-validate.csv'))

    # perform the clustered split:
    # roughly one cluster per 30 molecules, never fewer than one
    clusters=max(1,df.shape[0]//30)
    kmeans = MiniBatchKMeans(n_clusters=clusters,random_state=0,batch_size=100).fit(df['fp'].to_list())
    df['cluster']=kmeans.labels_
    df_cluster_train,df_cluster_val=_cluster_split(df,validation_size)
    # write the clustered sets to a file:
    df_cluster_train.to_csv(os.path.join(out_path,f'{dataset_name}-cluster-train.csv'))
    df_cluster_val.to_csv(os.path.join(out_path,f'{dataset_name}-cluster-validate.csv'))

deepchem_Lipophilicity.csv
sol_del.csv
HIV.csv




clintox.csv
[NH4][Pt]([NH4])(Cl)Cl failed in RDkit


[15:04:40] Explicit valence for atom # 0 N, 5, is greater than permitted
[15:04:40] Can't kekulize mol.  Unkekulized atoms: 9


c1ccc(cc1)n2c(=O)c(c(=O)n2c3ccccc3)CCS(=O)c4ccccc4 failed in RDkit


[15:04:41] Explicit valence for atom # 10 N, 4, is greater than permitted
[15:04:41] Explicit valence for atom # 10 N, 4, is greater than permitted


Cc1cc2c(cc1C)N3C=N2[Co+]456(N7=C8[C@H](C(C7=CC9=N4C(=C(C1=N5[C@@]([C@@H]2N6C(=C8C)[C@@]([C@H]2CC(=O)N)(CCC(=O)NC[C@H](OP(=O)(O[C@@H]2[C@H](O[C@H]3[C@@H]2O)CO)[O-])C)C)([C@@]([C@@H]1CCC(=O)N)(C)CC(=O)N)C)C)[C@@]([C@@H]9CCC(=O)N)(C)CC(=O)N)(C)C)CCC(=O)N)O failed in RDkit
Cc1cc2c(cc1C)N3C=N2[Co]456(N7=C8[C@H](C(C7=CC9=N4C(=C(C1=N5[C@@]([C@@H]2N6C(=C8C)[C@@]([C@H]2CC(=O)N)(CCC(=O)NC[C@H](OP(=O)(O[C@@H]2[C@H](O[C@H]3[C@@H]2O)CO)O)C)C)([C@@]([C@@H]1CCC(=O)N)(C)CC(=O)N)C)C)[C@@]([C@@H]9CCC(=O)N)(C)CC(=O)N)(C)C)CCC(=O)N)C#N failed in RDkit


[15:04:41] Can't kekulize mol.  Unkekulized atoms: 4
[15:04:41] Can't kekulize mol.  Unkekulized atoms: 4


CCCCc1c(=O)n(n(c1=O)c2ccc(cc2)O)c3ccccc3 failed in RDkit
CCCCc1c(=O)n(n(c1=O)c2ccccc2)c3ccccc3 failed in RDkit
bace.csv
tox21.csv


