In [1]:
! git clone https://github.com/jimmyjbling/SGC-DEL-ML-WDR91.git
! git clone https://github.com/NikSchap-2107/Target2035_Aircheck_Utils.git
! pip install rdkit-pypi
! pip install rdkit

fatal: destination path 'SGC-DEL-ML-WDR91' already exists and is not an empty directory.
fatal: destination path 'Target2035_Aircheck_Utils' already exists and is not an empty directory.


In [2]:
import sys
sys.path.append("/content/Target2035_Aircheck_Utils/ReadingParquetFiles")
sys.path.append("/content/SGC-DEL-ML-WDR91/scripts")
from read_parquet_utils import read_parquet_file, process_column_to_array
from sklearn.model_selection import train_test_split
import numpy as np
from rdkit.SimDivFilters import rdSimDivPickers
from rdkit import DataStructs
from tqdm import tqdm
from collections import defaultdict
from functools import partial
import pandas as pd
from copy import deepcopy
from time import time
from typing import Dict, Union, Optional
import pickle
import numpy.typing as npt
from lightgbm import LGBMClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score, balanced_accuracy_score, \
    average_precision_score
from sklearn.model_selection import StratifiedGroupKFold, StratifiedShuffleSplit

data_file = "/content/WDR91.parquet"
df = read_parquet_file(data_file, columns = ['ECFP6', 'LABEL']) #bECFP6 for initial model ; bECFP6-ECFP6-atompairs for advanced model ; available: ECFP4, ECFP6, FCFP4, FCFP6, TOPTOR, MACCS, RDK, AVALON
tqdm = partial(tqdm, position=0, leave=True)

In [3]:
df.head()

Unnamed: 0,ECFP6,LABEL
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
3,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
4,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",0


In [4]:
df.value_counts('LABEL')

Unnamed: 0_level_0,count
LABEL,Unnamed: 1_level_1
0,346817
1,28778


In [5]:
fps = process_column_to_array(df, 'ECFP6')
labels = np.array(df['LABEL'].tolist())
print(fps)
print(labels)
print(fps.max())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]]
[0 0 0 ... 0 0 0]
24


In [6]:
def numpy_2_fp(array):
    fp = DataStructs.cDataStructs.UIntSparseIntVect(len(array))
    for ix, value in enumerate(array):
        fp[ix] = int(value)
    return fp

In [7]:
fps_l = [numpy_2_fp(fp) for fp in tqdm(fps, total=len(fps))]

100%|██████████| 375595/375595 [09:25<00:00, 663.65it/s]


In [8]:
def distij(i, j, fps=fps_l):
  return 1 - DataStructs.DiceSimilarity(fps[i], fps[j])

def assignPointsToClusters(picks, fps):
    clusters = defaultdict(list)
    for i, idx in enumerate(picks):
        clusters[i].append(idx)
    sims = np.zeros((len(picks), len(fps)))
    for i in tqdm(range(len(picks))):
        pick = picks[i]
        sims[i, :] = DataStructs.BulkTanimotoSimilarity(fps[pick], fps)
        sims[i, i] = 0
    best = np.argmax(sims, axis=0)
    for i, idx in enumerate(best):
        if i not in picks:
            clusters[idx].append(i)
    return clusters

In [10]:
lp = rdSimDivPickers.LeaderPicker()
thresh = 0.65  # <- minimum distance between cluster centroids
picks = lp.LazyPick(distij, len(fps), thresh)
clusters = assignPointsToClusters(picks, fps_l)
cluster_ids = np.zeros(len(fps))
for key, val in clusters.items():
    cluster_ids[val] = key

100%|██████████| 337/337 [08:36<00:00,  1.53s/it]


In [11]:
np.save("/content/cluster_ids.npy", cluster_ids)

In [None]:
class Model:
    def __init__(self):
        self._models = [[]]
        self._train_preds = []
        self._bayes = None
        self._fit = False
        self._ensemble = 0
        self._fp_func = []

    def fit(
            self,
            train_data: Dict[str, Union[npt.NDArray, str]],
            binary_labels: Union[npt.NDArray, str],
            clusters: Optional[Union[npt.NDArray, str]] = None,
            ensemble: int = 1
    ):
        """
        Fit the model
        :param train_data:
            should be a a dictionary where the key is the fingerprint type
            (see the `FPS_FUNCS` dict for names) and the val the path to a pickle
            or the loaded numpy array of the fingerprints

            Will make a separate model for each fingerprint type and mate. So if you set ensemble to 5
            use 4 different FPs, you will have 5 x 4 = 20 models
        :param binary_labels:
            the path to a pickle or the loaded numpy array of the binary labels
        :param clusters:
            the path to a pickle or the loaded numpy array of the cluster IDs
            not used if ensemble is <= 1
        :param ensemble:
            number of ensembles mates to use. Default is 1 (no ensemble)
        :return:
        """
        # load in pickles in needed
        for key, val in train_data.items():
            if isinstance(val, str):
                train_data[key] = pickle.load(open(val, "rb"))

        if isinstance(binary_labels, str):
            y = pickle.load(open(binary_labels, "rb"))
        else:
            y = binary_labels

        # save the fingerprints used for later
        self._fp_func = list(train_data.keys())

        if ensemble > 1:
            mates = []

            # load in cluster data
            if isinstance(clusters, str):
                clusters = pickle.load(open(clusters, "rb"))

            s = StratifiedGroupKFold(n_splits=ensemble, shuffle=True)
            for i, (train_idx, test_idx) in tqdm(enumerate(s.split(train_data[self._fp_func[0]], y, clusters)), desc="Doing Folds"):
                y_train = y[train_idx]
                models = []
                for _, x_train in train_data.items():
                    clf = LGBMClassifier(n_estimators=150, n_jobs=-1)
                    clf.fit(x_train, y_train)
                    models.append(deepcopy(clf))
                mates.append(models)
            self._models = deepcopy(mates)
        else:
            for _, x_train in train_data.items():
                clf = LGBMClassifier(n_estimators=150, n_jobs=-1)
                clf.fit(x_train, y)
                self._models[0].append(deepcopy(clf))

        self._fit = True
        self._ensemble = ensemble

    def screen(self, filepath, outpath: Optional[str] = None):
        """
        Screening a file of SMILES and returns predictions
        pred value will the be probability the model thinks something is active
        conf value is the confidence the model has in its predicted probability

        assumes you are using a smi file, so its tab delimited and first column is SMILES second is Name/ID

        :param filepath: path the .smi file to screen
        :param outpath: name of output file
        """

        if outpath is None:
            outpath = os.path.abspath(filepath).split('.')[0] + ".PREDS"

        with open(outpath, "w") as outfile:
            outfile.write("ID\tSMILES\tPRED\tCONF\n")

        with open(filepath, "r") as f:
            names = []
            smiles = []

            for i, line in tqdm(enumerate(f)):
                splits = line.split("\t")
                smiles.append(splits[0].strip())
                if len(splits) > 1:
                    names.append(splits[1].strip())
                else:
                    names.append(i)

                if ((i+1) % 100) == 0:
                    preds, confs = self.screen_smiles(smiles)
                    with open(outpath, "a") as f2:
                        for n, s, p, c in zip(names, smiles, preds, confs):
                            f2.write(f"{n}\t{s}\t{round(float(p), 4)}\t{round(float(c), 4)}\n")
                        names = []
                        smiles = []

            # catch the last batch
            if len(smiles) != 0:
                preds, confs = self.screen_smiles(smiles)
                with open(outpath, "a") as f2:
                    for n, s, p, c in zip(names, smiles, preds, confs):
                        f2.write(f"{n}\t{s}\t{round(float(p), 4)}\t{round(float(c), 4)}\n")

    def screen_smiles(self, smis: list[str]):
        """
        Screens a list of smiles and returns predictions and confidences
        :param smis:
        :return:
        """
        fps = []
        for _fp in self._fp_func:
            fps.append(list(FPS_FUNCS[_fp](smis)))
        test_preds = []
        for i_model in range(self._ensemble):
            for clf, fp in zip(self._models[i_model], fps):
                test_preds.append(clf.predict_proba(fp)[:, 1])
        test_preds = np.array(test_preds).T
        preds = test_preds.mean(axis=1)
        confs = test_preds.std(axis=1)
        return preds, confs

    def cv(
            self,
            train_data: Dict[str, Union[npt.NDArray, str]],
            binary_labels: Union[npt.NDArray, str],
            clusters: Union[npt.NDArray, str],
            ensemble: int = 1,
    ):
        """
        Fit the model
        :param train_data:
            should be a a dictionary where the key is the fingerprint type
            (see the `FPS_FUNCS` dict for names) and the val the path to a pickle
            or the loaded numpy array of the fingerprints

            Will make a separate model for each fingerprint type and mate. So if you set ensemble to 5
            use 4 different FPs, you will have 5 x 4 = 20 models
        :param binary_labels:
            the path to a pickle or the loaded numpy array of the binary labels
        :param clusters:
            the path to a pickle or the loaded numpy array of the cluster IDs
            not used if ensemble is <= 1
        :param ensemble:
            number of ensembles mates to use. Default is 1 (no ensemble)
        :return:
        """
        # load in pickles in needed
        for key, val in train_data.items():
            if isinstance(val, str):
                train_data[key] = pickle.load(open(val, "rb"))

        if isinstance(binary_labels, str):
            y = np.array(pickle.load(open(binary_labels, "rb")))
        else:
            y = np.array(binary_labels)

        # load in cluster data
        if isinstance(clusters, str):
            clusters = pickle.load(open(clusters, "rb"))

        overall_res_ensemble = {
            "fit_time": [],
            "pred_time": [],
            "precision": [],
            "recall": [],
            "balanced_accuracy": [],
            "AUC_PR": [],
            "AUC_ROC": [],
            "PlatePPV": [],
            "DivPlatePPV": []
        }

        s = StratifiedShuffleSplit(test_size=0.2)

        for i, (train_idx, test_idx) in tqdm(enumerate(s.split(list(train_data.values())[0], y, clusters)), desc="Doing Folds"):
            y_train = y[train_idx]
            y_test = y[test_idx]

            train_clusters = clusters[train_idx]

            mates = []
            all_train_preds = []

            t0 = time()
            for _, x_train_ in train_data.items():
                x_train = x_train_[train_idx]
                if ensemble > 1:
                    # this is the ensemble builder
                    # should have done this so I could have reused the fit func but too late lol
                    s2 = StratifiedGroupKFold(n_splits=ensemble, shuffle=True)
                    models = []
                    train_preds = []

                    for ii, (train_idx2, test_idx2) in tqdm(enumerate(s2.split(x_train, y_train, train_clusters)), desc="Doing ensemble"):
                        clf = LGBMClassifier(n_estimators=150, n_jobs=-1)
                        x_train2 = x_train[train_idx2]
                        y_train2 = y_train[train_idx2]
                        clf.fit(x_train2, y_train2)
                        models.append(deepcopy(clf))
                        train_preds.append(clf.predict_proba(x_train)[:, 1])
                    mates.append(models)
                    all_train_preds.append(train_preds)

                else:
                    clf = LGBMClassifier(n_estimators=150, n_jobs=-1)
                    clf.fit(x_train, y_train)
                    mates.append([deepcopy(clf)])
                    all_train_preds.append([clf.predict_proba(x_train)[:, 1]])
            fit_time = time() - t0

            t0 = time()
            test_preds = []
            for clf_group, (_, x_test) in zip(mates, train_data.items()):
                x_test = x_test[test_idx]
                for clf in clf_group:
                    clf.predict_proba(x_test)
                    test_preds.append(clf.predict_proba(x_test)[:, 1])
            test_preds = np.array(test_preds).T
            pred_time = time() - t0

            preds = test_preds.mean(axis=1)
            discrete_preds = (preds > 0.3).astype(int)

            ppv = precision_score(y_test, discrete_preds)
            recall = recall_score(y_test, discrete_preds)
            auc_roc = roc_auc_score(y_test, preds)
            ba = balanced_accuracy_score(y_test, discrete_preds)
            auc_pr = average_precision_score(y_test, preds)
            p_ppv = plate_ppv(y_test, preds, top_n=128)
            dp_ppv = diverse_plate_ppv(y_test, preds, clusters=clusters[test_idx].tolist())

            overall_res_ensemble["fit_time"].append(fit_time)
            overall_res_ensemble["pred_time"].append(pred_time)
            overall_res_ensemble["precision"].append(ppv)
            overall_res_ensemble["recall"].append(recall)
            overall_res_ensemble["balanced_accuracy"].append(ba)
            overall_res_ensemble["AUC_ROC"].append(auc_roc)
            overall_res_ensemble["AUC_PR"].append(auc_pr)
            overall_res_ensemble["PlatePPV"].append(p_ppv)
            overall_res_ensemble["DivPlatePPV"].append(dp_ppv)

            print("ensemble", overall_res_ensemble)
        return pd.DataFrame(overall_res_ensemble)