In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os
from rdkit import Chem
from tqdm.auto import tqdm
import warnings
from helper import scaffold_split

Enable Pandas progress apply

In [13]:
tqdm.pandas()

Read the data 

In [14]:
df = pd.read_csv("MTL_2_input_BSEP_herg_BBB_PDK_HIV.csv")

Add an RDKit molecule to the dataframe.  Not that 9 molecules can't be parsed.

In [15]:
df['mol'] = df.SMILES.progress_apply(Chem.MolFromSmiles)

  0%|          | 0/8128 [00:00<?, ?it/s]

[21:49:15] Explicit valence for atom # 0 C, 5, is greater than permitted
[21:49:15] Explicit valence for atom # 10 N, 4, is greater than permitted
[21:49:15] Explicit valence for atom # 2 N, 4, is greater than permitted
[21:49:15] Explicit valence for atom # 2 N, 4, is greater than permitted
[21:49:15] Explicit valence for atom # 2 N, 4, is greater than permitted
[21:49:15] Explicit valence for atom # 14 N, 4, is greater than permitted
[21:49:15] Explicit valence for atom # 2 N, 4, is greater than permitted
[21:49:15] Explicit valence for atom # 2 N, 4, is greater than permitted
[21:49:15] Explicit valence for atom # 2 N, 4, is greater than permitted


Delete rows where the SMILES couldn't be parsed

In [16]:
df_ok = df.dropna(subset='mol').copy()

Make sure we don't have duplicate molecules

In [17]:
df_ok['cansmi'] = df_ok.mol.apply(Chem.MolToSmiles)

In [18]:
df_ok.cansmi.value_counts()

cansmi
CCOC(=O)N1CCC(=C2c3ccc(Cl)cc3CCc3cccnc32)CC1                        1
NC(=O)Nc1ccc2c(c1)/C(=C(\c1cccc(N)c1)c1ccc[nH]1)C(=O)N2             1
NC(=O)Nc1ccc2c(c1)/C(=C/c1ccc[nH]1)C(=O)N2                          1
NC(=O)Nc1ccc2c(c1)/C(=C/c1cc(-c3cccnc3)c[nH]1)C(=O)N2               1
NC(=O)Nc1ccc2c(c1)/C(=C/c1cc(-c3ccccc3)c[nH]1)C(=O)N2               1
                                                                   ..
CN1C(=O)CN=C(C2=CCCCC2)c2cc([N+](=O)[O-])ccc21                      1
CN1C(=O)CN2CCc3ccccc3C2c2cc(Cl)ccc21                                1
CN1C(=O)CN(C(N)=O)C(c2ccccc2)c2cc(Cl)ccc21                          1
CC1=CC(=O)N2CC(=O)N(C)c3ccc(Cl)cc3C2(c2ccccc2)O1                    1
OCc1ccc(COC(COCc2ccccc2)C(O)C(O)C(COCc2ccccc2)OCc2ccc(CO)cc2)cc1    1
Name: count, Length: 8119, dtype: int64

Drop the molecule column

In [19]:
df_ok = df_ok.drop(["mol","cansmi"],axis=1)

Double check the sizes of the dataframes

In [20]:
len(df),len(df_ok),len(df)-len(df_ok)

(8128, 8119, 9)

Create two dataframes, one with the molecules that have BSEP data and one with the molecules that don't. For single task, we'll only use the molecules with BSEP data.  For multitask, we'll use all the data. 

In [21]:
df_bsep = df_ok.dropna(subset="BSEP").copy()
df_rest = df_ok[df_ok.BSEP.isna()].copy()

In [22]:
df_bsep

Unnamed: 0,Name,SMILES,BSEP,HERG,BBB,PDK1,HIVPRO
0,50775606,CCOC(N1CCC(=C2c(nccc3)c3CCc(cc(Cl)cc4)c24)CC1)=O,1.0,1.0,1.0,,
1,50753667,CSc1cc(N(CCC2N(C)CCCC2)c(cccc3)c3S4)c4cc1,1.0,1.0,1.0,,
2,50753625,OC1(c2ccc(Cl)cc2)CCN(CCCC(c3ccc(F)cc3)=O)CC1,1.0,1.0,1.0,,
3,50739573,CC(N1CCN(c2ccc(OCC3OC(c4c(Cl)cc(Cl)cc4)(Cn5cnc...,1.0,1.0,0.0,,
4,50754140,C(C1NCCCC1)C(C2CCCCC2)C3CCCCC3,1.0,1.0,,,
...,...,...,...,...,...,...,...
920,50772245,OCCN1[C@H](CO)[C@@H](O)[C@H](O)[C@@H](O)C1,0.0,,,,
921,50776041,ONC(CCCCCCC(Nc1ccccc1)=O)=O,0.0,,,,
922,50775841,OS(Cc1c(cccc2)c2on1)(=O)=N,0.0,,,,
923,50772666,OS(CCS)(=O)=O,0.0,,,,


Write the splits to disk. Supress warnings because we're writing floats and NAs in a column. 

In [24]:
warnings.simplefilter(action = "ignore", category = RuntimeWarning)
dataset = {}

if not os.path.exists("data"):
        os.mkdir("data")
base_name = "data/BSEP"
for i in tqdm(range(0,10)):
    dir_name = f"{base_name}{i:03d}"
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
    
    # random split
    rnd_train, rnd_val_test = train_test_split(df_bsep,test_size=0.2,stratify=df_bsep.BSEP,random_state=i)
    rnd_val, rnd_test = train_test_split(rnd_val_test, test_size=0.5,stratify-random_state=i)
    dataset["random"] = [rnd_train, rnd_val, rnd_test]
    # scaffold split
    scaf_train, scaf_val_test = scaffold_split(df_bsep.SMILES,sizes=(0.8,0.2),seed=i)
    df_scaf_train = df_bsep.iloc[scaf_train]
    df_scaf_val_test = df_bsep.iloc[scaf_val_test]
    scaf_val, scaf_test = scaffold_split(df_scaf_val_test.SMILES,sizes=(0.5,0.5),seed=i)
    df_scaf_val = df_scaf_val_test.iloc[scaf_val]
    df_scaf_test = df_scaf_val_test.iloc[scaf_test]    
    dataset["scaffold"] = [df_scaf_train, df_scaf_val, df_scaf_test]
    
    for prefix in ["random", "scaffold"]:
        train, val, test = dataset[prefix]
    
        st_cols = ["SMILES","Name","BSEP"]
        train[st_cols].to_csv(f"{dir_name}/{prefix}_train_{i:03d}.csv",index=False)
        val[st_cols].to_csv(f"{dir_name}/{prefix}_val_{i:03d}.csv",index=False)
        test[st_cols].to_csv(f"{dir_name}/{prefix}_test_{i:03d}.csv",index=False)
    
        mt_cols = ["SMILES","Name","BSEP","HERG","BBB","PDK1","HIVPRO"]
        rest_train, rest_val = train_test_split(df_rest,test_size=0.2,random_state=i)
        train = pd.concat([train, rest_train]).sample(frac=1.0)
        train[mt_cols].to_csv(f"{dir_name}/{prefix}_mt_train_{i:03d}.csv",index=False)
        val = pd.concat([val, rest_val]).sample(frac=1.0)
        val[mt_cols].to_csv(f"{dir_name}/{prefix}_mt_val_{i:03d}.csv",index=False)
        test[mt_cols].to_csv(f"{dir_name}/{prefix}_mt_test_{i:03d}.csv",index=False)

  0%|          | 0/10 [00:00<?, ?it/s]

TypeError: Singleton array array('BSEP', dtype='<U4') cannot be considered a valid collection.