# Notebook to perform data splitting for train/test & val
We plan to use a single combined train/test data set, because our models will be trained with cross-validation, which should give an initial idea of how well each model generalizes. A separate validation set is held out from that data in this notebook.

In [19]:
import pandas as pd
from rdkit import Chem
from sklearn.model_selection import train_test_split

from rdkit.Chem import AllChem
import numpy as np
from tqdm import tqdm
import os
# Hook tqdm into pandas (tqdm.pandas() enables the progress_* variants such as progress_apply)
tqdm.pandas()
# Seed NumPy's global random number generator so NumPy-based randomness is repeatable
np.random.seed(0)

In [60]:
# Directory holding the input CSV files
path = 'data_cleaned'
# Output directory for the random train/validation split
out_path = 'data_split_rand'
# Output directory for the cluster-based train/validation split
out_path_cluster = 'data_split_cluster'
files = os.listdir(path)

# For each data file: the name of the target column and the name of the
# column that holds the chemical structure
data_map = {
    'HIV.csv': {
        'target': 'HIV_active',
        'structure': 'smiles',
    },
    'bace.csv': {
        'target': 'active',
        'structure': 'mol',
    },
    'tox21.csv': {
        'target': 'NR-AhR',
        'structure': 'smiles',
    },
    'clintox.csv': {
        'target': 'CT_TOX',
        'structure': 'smiles',
    },
    'sol_del.csv': {
        'target': 'binned_sol',
        'structure': 'smiles',
    },
    'deepchem_Lipophilicity.csv': {
        'target': 'drug_like',
        'structure': 'smiles',
    },
}

# Fraction of the rows that is held out as the validation set
validation_size = 0.15

In [30]:
# Load bace.csv from the input directory; this frame is the example data set
# that the rest of the notebook splits
df=pd.read_csv(os.path.join(path,'bace.csv'))

In [23]:
def generate_fingerprint(smiles,radius,bits):
    """Convert a SMILES string into a Morgan fingerprint bit array.

    Parameters
    ----------
    smiles : str
        Structure to convert, written as SMILES.
    radius : int
        Radius passed to the Morgan fingerprint algorithm.
    bits : int
        Length of the fingerprint bit vector.

    Returns
    -------
    numpy.ndarray or float
        The fingerprint as a NumPy array, or ``np.nan`` when the structure
        could not be converted (a message is printed in that case), so that
        failed rows can be removed with ``dropna``.
    """
    try:
        mol=Chem.MolFromSmiles(smiles)
        # RDKit reports an unparsable SMILES by returning None rather than
        # raising, so check for it explicitly instead of relying on the
        # fingerprint call to blow up on a None molecule.
        fp=None if mol is None else AllChem.GetMorganFingerprintAsBitVect(mol,radius,nBits=bits)
    except Exception:
        # Only swallow ordinary errors (e.g. a non-string input); a bare
        # ``except:`` would also trap KeyboardInterrupt/SystemExit and make a
        # long-running apply impossible to interrupt.
        fp=None
    if fp is None:
        print(f'{smiles} failed in RDkit')
        return np.nan
    return np.array(fp)

In [32]:
# Compute a Morgan fingerprint for every structure in the 'mol' column, then
# discard the rows whose structure could not be converted or whose target
# value is missing
radius=2
bits=1024
df['fp'] = df['mol'].apply(generate_fingerprint, args=(radius, bits))
df.dropna(subset=['fp','active'], inplace=True)

# Random split:

In [49]:
# Randomly hold out `validation_size` of the rows; the fixed random_state
# makes the split repeatable
df_train, df_validate = train_test_split(
    df,
    random_state=0,
    test_size=validation_size,
)

In [53]:
# Random-split training set: (rows, columns), then the count of each target class
print(df_train.shape, df_train['active'].value_counts(), sep='\n')

(1286, 598)
1    856
0    430
Name: active, dtype: int64


In [52]:
# Random-split validation set: (rows, columns), then the count of each target class
print(df_validate.shape, df_validate['active'].value_counts(), sep='\n')

(227, 598)
1    156
0     71
Name: active, dtype: int64


In [62]:
# Write the random train/validation split to the random-split output directory
dataset='bace'
# to_csv does not create missing directories, so make sure the output
# directory exists before writing (needed on a fresh checkout)
os.makedirs(out_path,exist_ok=True)
# NOTE(review): the 'fp' column holds NumPy arrays, which to_csv writes as
# their string form; NumPy abbreviates long arrays with '...', so the saved
# fingerprints are probably not recoverable and may need regenerating after
# loading - confirm against the downstream notebooks.
df_train.to_csv(os.path.join(out_path,f'{dataset}_train.csv'))
df_validate.to_csv(os.path.join(out_path,f'{dataset}_validate.csv'))

# Clustered Split
To make the split more realistic and to better understand how well the model will generalize to new data, let's also look at a split based on clusters of the data. The goal is to minimize the overlap in chemical structure between the two data sets. This makes the task much harder for a model that merely memorizes, for instance, certain features of the potent scaffolds.

In [71]:
from sklearn.cluster import MiniBatchKMeans
# Aim for roughly 30 rows per cluster, but never request fewer than one
# cluster: int(n/30) is 0 for data sets with under 30 rows, which k-means rejects.
clusters=max(1,len(df)//30)
# n_init is pinned to 3 (the long-standing MiniBatchKMeans default) so the
# clustering does not change when the library's default changes; random_state
# keeps the run repeatable.
kmeans = MiniBatchKMeans(n_clusters=clusters,random_state=0,batch_size=100,n_init=3).fit(df['fp'].to_list())

In [73]:
# Attach each row's k-means cluster label as a new 'cluster' column
df['cluster']=kmeans.labels_

In [74]:
# Number of rows assigned to each cluster, largest clusters first
df['cluster'].value_counts()

9     157
25     82
7      79
8      75
12     67
36     67
39     62
3      57
27     54
34     47
35     45
18     37
16     36
49     36
48     35
10     34
5      33
30     33
33     33
17     30
19     29
6      28
32     28
22     27
38     25
26     24
13     21
40     20
14     18
28     18
46     16
43     15
42     15
23     15
2      14
20     12
15     11
37     10
4      10
45      9
31      8
44      8
1       8
47      7
0       4
29      4
21      4
24      2
41      2
11      2
Name: cluster, dtype: int64

In [81]:
# add these clusters to two groups, train and val:
val_size=validation_size*len(df)
df_cluster_train=pd.DataFrame()
df_cluster_val=pd.DataFrame()
for group, dataframe in df.groupby('cluster'):
    if dataframe.shape[0] > val_size / 2:
        df_cluster_train=pd.concat([df_cluster_train,dataframe])
    elif len(df_cluster_val)+len(dataframe) <= val_size:
        df_cluster_val=pd.concat([df_cluster_val,dataframe])
    else:
        df_cluster_train=pd.concat([df_cluster_train,dataframe])
        

In [90]:
# Cluster-split training set: (rows, columns), then the count of each target class
print(df_cluster_train.shape, df_cluster_train['active'].value_counts(), sep='\n')

(1287, 599)
1    845
0    442
Name: active, dtype: int64


In [91]:
# Cluster-split validation set: (rows, columns), then the count of each target class
print(df_cluster_val.shape, df_cluster_val['active'].value_counts(), sep='\n')

(226, 599)
1    167
0     59
Name: active, dtype: int64


In [None]:
# The class balance differs somewhat between the cluster-based train and validation sets (compare the two ratios below) - TBD how much this matters:

In [92]:
# Fraction of rows with active == 1 in the cluster-based training set,
# computed from the data rather than from hand-copied counts so it stays
# correct whenever the split changes
(df_cluster_train['active']==1).mean()

0.6565656565656566

In [93]:
# Fraction of rows with active == 1 in the cluster-based validation set,
# computed from the data rather than from hand-copied counts so it stays
# correct whenever the split changes
(df_cluster_val['active']==1).mean()

0.7389380530973452

In [95]:
# Write the cluster-based train/validation split to the cluster-split output directory
dataset='bace'
# to_csv does not create missing directories, so make sure the output
# directory exists before writing (needed on a fresh checkout)
os.makedirs(out_path_cluster,exist_ok=True)
# NOTE(review): the 'fp' column holds NumPy arrays, which to_csv writes as
# their string form; NumPy abbreviates long arrays with '...', so the saved
# fingerprints are probably not recoverable and may need regenerating after
# loading - confirm against the downstream notebooks.
df_cluster_train.to_csv(os.path.join(out_path_cluster,f'{dataset}_cluster_train.csv'))
df_cluster_val.to_csv(os.path.join(out_path_cluster,f'{dataset}_cluster_validate.csv'))