Generate a consistent set of training and test sets for comparisons. Sets are generated using both random and scaffold splits.

In [5]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
from glob import glob
import pandas as pd
from sklearn.model_selection import ShuffleSplit
from helper import scaffold_split
from rdkit import Chem
from tqdm.notebook import tqdm

Add columns to a dataframe corresponding to cross-validation folds. Generate two columns for each fold: one with a random split and one with a scaffold split.

In [7]:
def _split_labels(n_rows, test_idx):
    """Return a list of ``n_rows`` "train" labels with ``test_idx`` positions set to "test"."""
    labels = ["train"] * n_rows
    for pos in test_idx:
        labels[pos] = "test"
    return labels


def process_dataframe(df,splits,random_state=0):
    """Add random and scaffold train/test label columns to ``df``.

    For every fold ``i`` in ``range(splits)`` two string columns are added,
    each holding "train" or "test" for every row:

    * ``RND_<i>``  - a random split drawn by ``ShuffleSplit(test_size=0.25)``.
    * ``SCAF_<i>`` - the split returned by
      ``scaffold_split(mol_list, sizes=(0.75,0.25), seed=i)``.

    Fold numbers are zero-padded to two digits (``RND_00``, ``SCAF_00``, ...).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``SMILES`` column. The frame is modified in place.
    splits : int
        Number of folds, i.e. number of RND/SCAF column pairs to add.
    random_state : int, RandomState instance or None, default 0
        Forwarded to ``ShuffleSplit`` so the random splits are reproducible
        across runs (the scaffold splits are already seeded per fold).
        Pass ``None`` to get a different random split on every call.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns attached.
    """
    n_rows = len(df)

    # Random splits: ShuffleSplit yields positional indices, which line up with
    # the positional label list assigned to the column below.
    rs = ShuffleSplit(n_splits=splits, test_size=0.25, train_size=None,
                      random_state=random_state)
    for idx,(_, test) in enumerate(rs.split(range(n_rows))):
        df[f"RND_{idx:02d}"] = _split_labels(n_rows, test)

    # Scaffold splits: parse every SMILES once and reuse the molecules per fold.
    # NOTE(review): Chem.MolFromSmiles returns None for unparsable SMILES;
    # whether scaffold_split tolerates None entries is not visible here - confirm.
    mol_list = [Chem.MolFromSmiles(x) for x in df.SMILES]
    for idx in range(splits):
        # Assumes scaffold_split returns positional indices into mol_list
        # (they are used to index the label list) - verify against helper.py.
        _, test = scaffold_split(mol_list,sizes=(0.75,0.25),seed=idx)
        df[f"SCAF_{idx:02d}"] = _split_labels(n_rows, test)
    return df

Loop over the SMILES (`*.smi`) files, one per dataset, and add each to a single dataframe that will be used to define the training and test sets.

In [None]:
num_splits = 10
# Collect the per-dataset frames and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration.
dataset_frames = []
for filename in tqdm(sorted(glob("*.smi"))):
    # Space-separated file; the three columns are named here.
    df = pd.read_csv(filename,sep=" ",names=['SMILES','Name','pIC50'])
    # Dataset name = file name without its extension. Strip only the final
    # ".smi" so that names containing ".smi" elsewhere are left intact.
    df['Dataset'] = filename.rsplit(".",1)[0]
    dataset_frames.append(process_dataframe(df, num_splits))
# pd.concat raises on an empty list, so fall back to an empty frame when no
# *.smi files are found. Row indices of the individual files are kept as-is.
cv_df = pd.concat(dataset_frames) if dataset_frames else pd.DataFrame()

HBox(children=(FloatProgress(value=0.0, max=24.0), HTML(value='')))

In [None]:
# Show the combined dataframe as this cell's output.
cv_df

A quick sanity check: print the label counts for each split column.

In [None]:
# For every fold, print the label counts of the random column and then of the
# scaffold column. Counts are printed without their labels, in the order
# value_counts() returns them.
for split_idx in range(num_splits):
    for prefix in ("RND", "SCAF"):
        column = f"{prefix}_{split_idx:02}"
        label_counts = cv_df[column].value_counts().values.tolist()
        print(f"{column:8s}", label_counts)

Write the dataframe to disk. 

In [None]:
# Save all split assignments to cv_splits.csv; index=False omits the row index.
cv_df.to_csv("cv_splits.csv",index=False)