In [2]:
from glob import glob
from lightgbm import LGBMClassifier
import numpy as np
import pandas as pd
import useful_rdkit_utils as uru
from sklearn.metrics import precision_recall_curve, auc, roc_auc_score, matthews_corrcoef

In [3]:
def read_input_data(dirname, prefix):
    """Load one BSEP split from disk and attach fingerprint features.

    Reads ``{prefix}_train_NNN.csv``, ``{prefix}_val_NNN.csv`` and
    ``{prefix}_test_NNN.csv`` from ``dirname``, where ``NNN`` is the
    zero-padded number that follows "BSEP" in the directory name. The
    validation and test files are merged into one evaluation set.

    Parameters
    ----------
    dirname : str
        Directory holding the split files, e.g. "BSEP003". A parent path
        ("data/BSEP003") or a trailing separator ("BSEP003/") is accepted.
    prefix : str
        File-name prefix of the split to load, e.g. "random" or "scaffold".

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Rows whose "BSEP" value is missing are dropped, and an "fp" column
        holds the result of ``uru.smi2numpy_fp`` for each SMILES entry.

    Raises
    ------
    ValueError
        If the final path component is not of the form "BSEP<number>".
    """
    import os  # function-scope import so this cell does not depend on the import cell

    # Parse the fold number from the last path component only, so the
    # directory may be passed with a parent path or a trailing separator.
    base = os.path.basename(os.path.normpath(dirname))
    idx = int(base.replace("BSEP", ""))

    def _load(split):
        # One CSV per split, named <prefix>_<split>_<NNN>.csv
        return pd.read_csv(os.path.join(dirname, f"{prefix}_{split}_{idx:03d}.csv"))

    def _prepare(frame):
        # dropna returns a new frame; .assign builds the "fp" column on a
        # fresh copy instead of setting a column on that result in place.
        labelled = frame.dropna(subset=["BSEP"])
        return labelled.assign(fp=labelled.SMILES.apply(uru.smi2numpy_fp))

    train = _load("train")
    # Validation and test rows are evaluated together; ignore_index gives the
    # merged frame unique row labels instead of two overlapping ranges.
    test = pd.concat([_load("val"), _load("test")], ignore_index=True)

    return _prepare(train), _prepare(test)

In [4]:
def build_model(train, test):
    """Fit a default LightGBM classifier on ``train`` and score it on ``test``.

    Both frames must provide an "fp" column (one feature vector per row,
    stacked into a 2-D array) and a "BSEP" label column.

    Returns
    -------
    (roc_auc, pr_auc, mcc) : tuple of float
        ROC AUC and area under the precision-recall curve, both computed
        from the second probability column of ``predict_proba``, and the
        Matthews correlation coefficient of the hard class predictions.
    """
    classifier = LGBMClassifier(verbose=-1)
    classifier.fit(np.stack(train.fp), train.BSEP)

    # Stack the evaluation features once and reuse them for both calls.
    x_test = np.stack(test.fp)
    y_true = test.BSEP
    positive_score = classifier.predict_proba(x_test)[:, 1]
    hard_labels = classifier.predict(x_test)

    roc = roc_auc_score(y_true, positive_score)
    # The thresholds returned by the curve are not needed for the area.
    precision, recall, _ = precision_recall_curve(y_true, positive_score)
    return roc, auc(recall, precision), matthews_corrcoef(y_true, hard_labels)

In [19]:
# Evaluate every dataset directory matching "BSEP0*" under both split prefixes
# and collect one metrics row per (prefix, directory) pair.
result_list = []
for dirname in sorted(glob("BSEP0*")):
    # NOTE(review): idx is not referenced again in this cell; read_input_data
    # derives the same number from dirname itself.
    idx = int(dirname.replace("BSEP",""))
    for prefix in ["random","scaffold"]:
        train, test = read_input_data(dirname,prefix)
        roc_auc, pr_auc, mcc = build_model(train, test)
        # Progress line: metrics shown with two decimals; the space flag on
        # mcc reserves a sign column so negative values stay aligned.
        print(f"{prefix} {dirname} {roc_auc:.2f} {pr_auc:.2f} {mcc: .2f}")
        # "ST" is the constant third field of every row.
        # NOTE(review): presumably "single task" — confirm.
        result_list.append([prefix, dirname,"ST",roc_auc, pr_auc, mcc])

random BSEP000 0.86 0.64  0.51
scaffold BSEP000 0.90 0.66  0.52
random BSEP001 0.91 0.72  0.57
scaffold BSEP001 0.81 0.52  0.40
random BSEP002 0.84 0.66  0.54
scaffold BSEP002 0.75 0.56  0.32
random BSEP003 0.83 0.73  0.57
scaffold BSEP003 0.81 0.63  0.56
random BSEP004 0.90 0.67  0.50
scaffold BSEP004 0.87 0.63  0.55
random BSEP005 0.87 0.64  0.55
scaffold BSEP005 0.77 0.51  0.47
random BSEP006 0.87 0.60  0.55
scaffold BSEP006 0.84 0.65  0.46
random BSEP007 0.89 0.64  0.45
scaffold BSEP007 0.83 0.63  0.31
random BSEP008 0.89 0.65  0.47
scaffold BSEP008 0.85 0.65  0.44
random BSEP009 0.89 0.72  0.60
scaffold BSEP009 0.83 0.64  0.43


In [20]:
# Turn the collected metric rows into a table; the column names are given in
# the same order as the fields of each row in result_list.
result_df = pd.DataFrame(result_list,columns=["split","dataset","task","roc_auc","pr_auc","mcc"])

In [21]:
# Show the metrics table as this cell's output.
result_df

Unnamed: 0,split,dataset,task,roc_auc,pr_auc,mcc
0,random,BSEP000,ST,0.85575,0.636497,0.509922
1,scaffold,BSEP000,ST,0.89881,0.663441,0.523241
2,random,BSEP001,ST,0.905172,0.720933,0.568089
3,scaffold,BSEP001,ST,0.811005,0.524939,0.400543
4,random,BSEP002,ST,0.842458,0.657107,0.537111
5,scaffold,BSEP002,ST,0.749427,0.555806,0.322722
6,random,BSEP003,ST,0.826308,0.726316,0.56703
7,scaffold,BSEP003,ST,0.805821,0.633459,0.559241
8,random,BSEP004,ST,0.901501,0.66725,0.497179
9,scaffold,BSEP004,ST,0.871777,0.631353,0.550296


In [22]:
# Save the metrics table to "lgbm_result.csv" without writing the row index.
result_df.to_csv("lgbm_result.csv",index=False)