In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os
from rdkit import Chem
from tqdm.auto import tqdm
import warnings
from helper import scaffold_split
import useful_rdkit_utils as uru
from sklearn.decomposition import PCA
import numpy as np
from sklearn.manifold import TSNE
import seaborn as sns
import mols2grid

Enable Pandas progress apply

In [2]:
# Patch pandas with tqdm so Series/DataFrame gain `progress_apply` (used when parsing SMILES).
tqdm.pandas()

Read the data 

In [4]:
# Load the combined multitask table; the path is relative to the notebook's working directory.
df = pd.read_csv("MTL_2_input_BSEP_herg_BBB_PDK_HIV.csv")

Add an RDKit molecule to the dataframe.  Note that 9 molecules can't be parsed.

In [5]:
# Parse every SMILES into an RDKit Mol, with a progress bar; strings RDKit
# cannot parse end up as missing values and are dropped in a later cell.
df["mol"] = df["SMILES"].progress_apply(Chem.MolFromSmiles)

  0%|          | 0/8128 [00:00<?, ?it/s]

[20:35:48] Explicit valence for atom # 0 C, 5, is greater than permitted
[20:35:48] Explicit valence for atom # 10 N, 4, is greater than permitted
[20:35:48] Explicit valence for atom # 2 N, 4, is greater than permitted
[20:35:48] Explicit valence for atom # 2 N, 4, is greater than permitted
[20:35:48] Explicit valence for atom # 2 N, 4, is greater than permitted
[20:35:48] Explicit valence for atom # 14 N, 4, is greater than permitted
[20:35:48] Explicit valence for atom # 2 N, 4, is greater than permitted
[20:35:48] Explicit valence for atom # 2 N, 4, is greater than permitted
[20:35:48] Explicit valence for atom # 2 N, 4, is greater than permitted


Delete rows where the SMILES couldn't be parsed

In [6]:
# Keep only the rows whose SMILES parsed; copy so later column writes never touch `df`.
df_ok = df.loc[df["mol"].notna()].copy()

In [7]:
# Re-serialize each molecule with RDKit so every structure has one normalized SMILES string.
df_ok["cansmi"] = df_ok["mol"].map(Chem.MolToSmiles)

In [8]:
# Count occurrences of each normalized SMILES; any count above 1 would flag a duplicate structure.
df_ok["cansmi"].value_counts()

CCOC(=O)N1CCC(=C2c3ccc(Cl)cc3CCc3cccnc32)CC1                        1
NC(=O)Nc1ccc2c(c1)/C(=C(\c1cccc(N)c1)c1ccc[nH]1)C(=O)N2             1
NC(=O)Nc1ccc2c(c1)/C(=C/c1ccc[nH]1)C(=O)N2                          1
NC(=O)Nc1ccc2c(c1)/C(=C/c1cc(-c3cccnc3)c[nH]1)C(=O)N2               1
NC(=O)Nc1ccc2c(c1)/C(=C/c1cc(-c3ccccc3)c[nH]1)C(=O)N2               1
                                                                   ..
CN1C(=O)CN=C(C2=CCCCC2)c2cc([N+](=O)[O-])ccc21                      1
CN1C(=O)CN2CCc3ccccc3C2c2cc(Cl)ccc21                                1
CN1C(=O)CN(C(N)=O)C(c2ccccc2)c2cc(Cl)ccc21                          1
CC1=CC(=O)N2CC(=O)N(C)c3ccc(Cl)cc3C2(c2ccccc2)O1                    1
OCc1ccc(COC(COCc2ccccc2)C(O)C(O)C(COCc2ccccc2)OCc2ccc(CO)cc2)cc1    1
Name: cansmi, Length: 8119, dtype: int64

Drop the molecule column

In [9]:
# The Mol objects are no longer needed once the normalized SMILES column exists.
df_ok = df_ok.drop(columns="mol")

Double check the sizes of the dataframes

In [10]:
# (rows read, rows kept after parsing, rows dropped)
(len(df), len(df_ok), len(df) - len(df_ok))

(8128, 8119, 9)

Create two dataframes, one with the molecules that have BSEP data and one with the molecules that don't. For single task, we'll only use the molecules with BSEP data.  For multitask, we'll use all the data. 

In [11]:
# One mask, used directly and inverted, so the two frames partition df_ok exactly.
has_bsep = df_ok["BSEP"].notna()
df_bsep = df_ok.loc[has_bsep].copy()   # molecules with a BSEP label
df_rest = df_ok.loc[~has_bsep].copy()  # molecules with no BSEP label

In [12]:
# Show the BSEP-labelled subset (pandas truncates the rendered table).
df_bsep

Unnamed: 0,Name,SMILES,BSEP,HERG,BBB,PDK1,HIVPRO,cansmi
0,50775606,CCOC(N1CCC(=C2c(nccc3)c3CCc(cc(Cl)cc4)c24)CC1)=O,1.0,1.0,1.0,,,CCOC(=O)N1CCC(=C2c3ccc(Cl)cc3CCc3cccnc32)CC1
1,50753667,CSc1cc(N(CCC2N(C)CCCC2)c(cccc3)c3S4)c4cc1,1.0,1.0,1.0,,,CSc1ccc2c(c1)N(CCC1CCCCN1C)c1ccccc1S2
2,50753625,OC1(c2ccc(Cl)cc2)CCN(CCCC(c3ccc(F)cc3)=O)CC1,1.0,1.0,1.0,,,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1
3,50739573,CC(N1CCN(c2ccc(OCC3OC(c4c(Cl)cc(Cl)cc4)(Cn5cnc...,1.0,1.0,0.0,,,CC(=O)N1CCN(c2ccc(OCC3COC(Cn4ccnc4)(c4ccc(Cl)c...
4,50754140,C(C1NCCCC1)C(C2CCCCC2)C3CCCCC3,1.0,1.0,,,,C1CCC(C(CC2CCCCN2)C2CCCCC2)CC1
...,...,...,...,...,...,...,...,...
920,50772245,OCCN1[C@H](CO)[C@@H](O)[C@H](O)[C@@H](O)C1,0.0,,,,,OCCN1C[C@H](O)[C@@H](O)[C@H](O)[C@H]1CO
921,50776041,ONC(CCCCCCC(Nc1ccccc1)=O)=O,0.0,,,,,O=C(CCCCCCC(=O)Nc1ccccc1)NO
922,50775841,OS(Cc1c(cccc2)c2on1)(=O)=N,0.0,,,,,N=S(=O)(O)Cc1noc2ccccc12
923,50772666,OS(CCS)(=O)=O,0.0,,,,,O=S(=O)(O)CCS


Write the splits to disk. Suppress warnings because we're writing floats and NAs in a column. 

In [17]:
# Write 10 seeded train/val/test folds (random and scaffold) to data/BSEP###/.
#
# For every fold and split type, two families of CSV files are produced:
#   * single-task  -> {prefix}_{train,val,test}_###.csv with SMILES, Name and BSEP only
#   * multitask    -> {prefix}_mt_{train,val,test}_###.csv with every column; train and
#                     val are augmented with molecules that have no BSEP label, while
#                     the test set stays restricted to BSEP-labelled molecules.

# RuntimeWarnings are silenced because the written columns mix floats and NAs.
warnings.simplefilter(action="ignore", category=RuntimeWarning)
dataset = {}
# Columns kept in the single-task files.
st_cols = ["SMILES", "Name", "BSEP"]
for i in tqdm(range(0, 10)):
    dir_name = f"data/BSEP{i:03d}"
    # makedirs creates the parent "data" directory as well and tolerates re-runs.
    # The previous pre-loop check read `dir_name` before it was assigned, which
    # raised NameError on a fresh kernel.
    os.makedirs(dir_name, exist_ok=True)

    # random split: 80% train, then the held-out 20% is halved into val and test
    rnd_train, rnd_val_test = train_test_split(df_bsep, test_size=0.2, random_state=i)
    rnd_val, rnd_test = train_test_split(rnd_val_test, test_size=0.5, random_state=i)
    dataset["random"] = [rnd_train, rnd_val, rnd_test]

    # scaffold split: same 80/10/10 proportions; the arrays returned by
    # scaffold_split are applied positionally through .iloc
    scaf_train, scaf_val_test = scaffold_split(df_bsep.SMILES, sizes=(0.8, 0.2), seed=i)
    df_scaf_train = df_bsep.iloc[scaf_train]
    df_scaf_val_test = df_bsep.iloc[scaf_val_test]
    scaf_val, scaf_test = scaffold_split(df_scaf_val_test.SMILES, sizes=(0.5, 0.5), seed=i)
    df_scaf_val = df_scaf_val_test.iloc[scaf_val]
    df_scaf_test = df_scaf_val_test.iloc[scaf_test]
    dataset["scaffold"] = [df_scaf_train, df_scaf_val, df_scaf_test]

    # The split of BSEP-unlabelled molecules depends only on the seed, not on
    # the split type, so it is computed once per fold.
    rest_train, rest_val = train_test_split(df_rest, test_size=0.2, random_state=i)

    for prefix in ["random", "scaffold"]:
        train, val, test = dataset[prefix]

        # single-task files
        train[st_cols].to_csv(f"{dir_name}/{prefix}_train_{i:03d}.csv", index=False)
        val[st_cols].to_csv(f"{dir_name}/{prefix}_val_{i:03d}.csv", index=False)
        test[st_cols].to_csv(f"{dir_name}/{prefix}_test_{i:03d}.csv", index=False)

        # multitask files: shuffle with the fold seed so that the labelled and
        # unlabelled rows are interleaved in a reproducible order
        mt_train = pd.concat([train, rest_train]).sample(frac=1.0, random_state=i)
        mt_val = pd.concat([val, rest_val]).sample(frac=1.0, random_state=i)
        mt_train.to_csv(f"{dir_name}/{prefix}_mt_train_{i:03d}.csv", index=False)
        mt_val.to_csv(f"{dir_name}/{prefix}_mt_val_{i:03d}.csv", index=False)
        test.to_csv(f"{dir_name}/{prefix}_mt_test_{i:03d}.csv", index=False)

  0%|          | 0/10 [00:00<?, ?it/s]