In [1]:
import re
from glob import glob

import numpy as np
import pandas as pd
import useful_rdkit_utils as uru
from lightgbm import LGBMClassifier
from sklearn.metrics import precision_recall_curve, auc, roc_auc_score, matthews_corrcoef

Read the training and test data

In [2]:
def read_input_data(dirname, prefix):
    """Load one cross-validation fold and attach a fingerprint column.

    Reads ``{prefix}_train_NNN.csv``, ``{prefix}_val_NNN.csv`` and
    ``{prefix}_test_NNN.csv`` from ``dirname``, where ``NNN`` is the
    zero-padded fold number taken from the trailing digits of ``dirname``.
    The validation rows are appended to the training rows, rows without a
    ``BSEP`` label are dropped from both sets, and an ``fp`` column is added
    by applying ``uru.smi2numpy_fp`` to every ``SMILES`` string.

    Parameters
    ----------
    dirname : str
        Fold directory, e.g. ``"data/BSEP003"``. Only the trailing digits are
        interpreted, so a trailing path separator, a different parent
        directory, or backslash separators are all accepted.
    prefix : str
        File-name prefix selecting the split (the notebook passes
        ``"random"`` and ``"scaffold"``).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``; ``train`` already includes the validation rows.

    Raises
    ------
    ValueError
        If ``dirname`` does not end in a number.
    """
    # Parse the fold number from the end of the path instead of stripping a
    # hard-coded "data/BSEP" prefix, which breaks as soon as the path is
    # spelled differently (e.g. "data\\BSEP003" from glob on Windows).
    idx_match = re.search(r"(\d+)[\\/]*$", dirname)
    if idx_match is None:
        raise ValueError(f"cannot find a fold number at the end of {dirname!r}")
    idx = int(idx_match.group(1))
    # read the datafiles
    train = pd.read_csv(f"{dirname}/{prefix}_train_{idx:03d}.csv")
    val = pd.read_csv(f"{dirname}/{prefix}_val_{idx:03d}.csv")
    test = pd.read_csv(f"{dirname}/{prefix}_test_{idx:03d}.csv")
    # Fold the validation rows into the training set; ignore_index avoids
    # carrying duplicate row labels from the two source files.
    train = pd.concat([train, val], ignore_index=True)
    train = train.dropna(subset=["BSEP"])
    test = test.dropna(subset=["BSEP"])
    # The first number is the combined train+val size after dropping
    # unlabeled rows; the second is the raw validation file size.
    print(len(train),len(val),len(test))
    train['fp'] = train.SMILES.apply(uru.smi2numpy_fp)
    test['fp'] = test.SMILES.apply(uru.smi2numpy_fp)
    return train, test

Build the LightGBM model

In [3]:
def build_model(train, test):
    """Fit a LightGBM classifier on ``train`` and evaluate it on ``test``.

    Both frames must provide an ``fp`` column (stacked into the feature
    matrix with ``np.stack``) and a ``BSEP`` label column.

    Returns
    -------
    tuple
        ``(roc_auc, pr_auc, mcc, prob, pred)`` where ``prob`` is the full
        ``predict_proba`` output and ``pred`` is the ``predict`` output for
        the test set. ROC AUC and the precision-recall AUC are computed from
        the second column of ``prob``; MCC is computed from ``pred``.
    """
    model = LGBMClassifier(verbose=-1)
    model.fit(np.stack(train.fp), train.BSEP)

    # Build the test matrix once and reuse it for both prediction calls.
    x_test = np.stack(test.fp)
    prob = model.predict_proba(x_test)
    pred = model.predict(x_test)

    y_true = test.BSEP
    second_class_prob = prob[:, 1]

    roc_auc = roc_auc_score(y_true, second_class_prob)
    # The thresholds returned by the curve are not needed for the area.
    precision, recall, _ = precision_recall_curve(y_true, second_class_prob)
    pr_auc = auc(recall, precision)
    mcc = matthews_corrcoef(y_true, pred)
    return roc_auc, pr_auc, mcc, prob, pred

Build models and store the data

In [4]:
# Per-fold test frames annotated with predictions, and one summary row of
# metrics per (split, fold). Both are rebuilt from scratch on every run of
# this cell, so re-running it does not duplicate entries.
df_list = []
result_list = []
for dirname in sorted(glob("data/BSEP0*")):
    # Take the fold number from the trailing digits of the path rather than
    # stripping a literal "data/BSEP" prefix; glob may return the path with
    # a different separator (e.g. backslashes on Windows).
    idx_match = re.search(r"(\d+)[\\/]*$", dirname)
    if idx_match is None:
        raise ValueError(f"cannot find a fold number at the end of {dirname!r}")
    idx = int(idx_match.group(1))
    for prefix in ["random","scaffold"]:
        train, test = read_input_data(dirname,prefix)
        roc_auc, pr_auc, mcc, prob, pred = build_model(train, test)
        # Item assignment is the unambiguous way to replace a column.
        test['BSEP'] = test['BSEP'].astype(int)
        test['method'] = 'lightGBM'
        # Column 1 of predict_proba is the score used for the AUC metrics.
        test['BSEP_prob'] = prob[:,1]
        test['BSEP_pred'] = pred.astype(int)
        test['cv_cycle'] = idx
        test['split'] = prefix
        df_list.append(test)
        print(f"{prefix} {dirname} {roc_auc:.2f} {pr_auc:.2f} {mcc: .2f}")
        result_list.append([prefix, dirname,"ST",roc_auc, pr_auc, mcc])

832 92 93
random data/BSEP000 0.85 0.68  0.55
832 92 93
scaffold data/BSEP000 0.90 0.56  0.48
832 92 93
random data/BSEP001 0.84 0.62  0.55
832 92 93
scaffold data/BSEP001 0.77 0.56  0.39
832 92 93
random data/BSEP002 0.86 0.76  0.60
832 92 93
scaffold data/BSEP002 0.69 0.43  0.22
832 92 93
random data/BSEP003 0.88 0.54  0.45
832 92 93
scaffold data/BSEP003 0.76 0.61  0.57
832 92 93
random data/BSEP004 0.90 0.62  0.41
832 92 93
scaffold data/BSEP004 0.95 0.86  0.68
832 92 93
random data/BSEP005 0.96 0.89  0.84
832 92 93
scaffold data/BSEP005 0.78 0.51  0.56
832 92 93
random data/BSEP006 0.84 0.67  0.45
832 92 93
scaffold data/BSEP006 0.88 0.68  0.49
832 92 93
random data/BSEP007 0.85 0.65  0.54
832 92 93
scaffold data/BSEP007 0.88 0.73  0.53
832 92 93
random data/BSEP008 0.81 0.59  0.45
832 92 93
scaffold data/BSEP008 0.80 0.50  0.34
832 92 93
random data/BSEP009 0.87 0.66  0.48
832 92 93
scaffold data/BSEP009 0.78 0.58  0.42


Select the output columns and preview the predictions for the last fold processed

In [14]:
# Columns kept for display here and for the per-compound predictions file.
cols = [
    'cv_cycle', 'split', 'method',
    'SMILES', 'Name',
    'BSEP', 'BSEP_prob', 'BSEP_pred',
]
# `test` is the frame left over from the final iteration of the
# model-building loop, so this previews a single fold/split only.
test.loc[:, cols]

Unnamed: 0,cv_cycle,split,method,SMILES,Name,BSEP,BSEP_prob,BSEP_pred
0,9,scaffold,lightGBM,OC(C(=O)C(O)=C(O)C1=O)=C1O,50754325,0,0.000015,0
1,9,scaffold,lightGBM,CC[C@@H]1[C@](O)(C)[C@@H](O)[C@@H](C)C(=O)[C@H...,50754320,0,0.020274,0
2,9,scaffold,lightGBM,CC[C@@H]1[C@](O)(C)[C@H](O)[C@@H](C)C(=O)[C@H]...,50774876,1,0.020274,0
3,9,scaffold,lightGBM,CC[C@@H]1[C@](O)(C)[C@H](O)[C@@H](C)C(=O)[C@H]...,50375468,1,0.057594,0
4,9,scaffold,lightGBM,CC[C@@H]1[C@](O)(C)[C@H](O)[C@@H](C)C(=O)[C@H]...,50772771,0,0.032712,0
...,...,...,...,...,...,...,...,...
88,9,scaffold,lightGBM,CC([C@@H]1[C@@](C)(CC[C@H]([C@@](C)(CC[C@H](O)...,50772847,0,0.183872,0
89,9,scaffold,lightGBM,CC([C@@H]1[C@@](C)(CC[C@H]([C@@](C)(CCC(O)C2)C...,50739371,0,0.183872,0
90,9,scaffold,lightGBM,Nc1oc(ccc(Cl)c2)c2n1,50754122,0,0.004766,0
91,9,scaffold,lightGBM,COc1ccc(CN(c2ncccc2)CCN(C)C)cc1,50753682,0,0.008502,0


Write the individual predictions to disk

In [19]:
# Stack every per-fold test frame and write the reporting columns to disk.
# NOTE(review): the file name spells "classification" as "classifciation";
# it is kept unchanged because other code may read this exact path -
# confirm before renaming.
(
    pd.concat(df_list)
    .loc[:, cols]
    .to_csv("lightgbm_classifciation_results.csv", index=False)
)

Write the summary statistics to disk

In [20]:
# One row per (split, fold) with the metrics collected in the loop; the
# column order matches the order in which each row was appended.
result_df = pd.DataFrame(
    data=result_list,
    columns=["split", "dataset", "task", "roc_auc", "pr_auc", "mcc"],
)

In [None]:
# Show the summary metrics table as this cell's output.
result_df

In [18]:
# Persist the summary metrics without the row index.
result_df.to_csv(path_or_buf="lgbm_result.csv", index=False)