In [1]:
import warnings
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings('ignore', category=UndefinedMetricWarning)
warnings.filterwarnings('ignore', category=UserWarning)

In [2]:
import numpy as np
import pandas as pd
from rdkit import Chem
from skfp.fingerprints import (
    PubChemFingerprint, LaggnerFingerprint, AvalonFingerprint,
    AtomPairFingerprint, ECFPFingerprint, MACCSFingerprint
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from skmultilearn.adapt import MLkNN
#from sklearn.model_selection import KFold
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    precision_score, recall_score
)
from tqdm import tqdm

import deepchem as dc
from rdkit import Chem


No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
No normalization for NumAmideBonds. Feature removed!
No normalization for NumAtomStereoCenters. Feature removed!
No normalization for NumBridgeheadAtoms. Feature removed!
No normalization for NumHeterocycles. Feature removed!
No normalization for NumSpiroAtoms. Feature removed!
No normalization for NumUnspecifiedAtomStereoCenters. Feature removed!
No normalization for Phi. Feature removed!
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading some PyTorch models, missing a dependency. No module named 'transformers'
No module named 'transformers'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'transformers'
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'
Skipped 

In [3]:
smiles_df = pd.read_csv('smiles.csv', header=None)
smiles_list = smiles_df.iloc[:, 0].tolist()
Y_lltpt = pd.read_csv('/home/maciej/studia/praktyki_ibb/lltpt_binary.csv').to_numpy()
Y_hlt = pd.read_csv('/home/maciej/studia/praktyki_ibb/hlt_binary.csv').to_numpy()
Y_hlgt = pd.read_csv('/home/maciej/studia/praktyki_ibb/hlgt_binary.csv').to_numpy()


In [4]:
smiles_list_clean = []
idx_clean = []
mols_clean = []
for i, smi in enumerate(smiles_list):
    mol = Chem.MolFromSmiles(smi)
    if mol is not None:
        smiles_list_clean.append(smi)
        mols_clean.append(mol)
        idx_clean.append(i)
Y_lltpt_clean = Y_lltpt[idx_clean, :]
Y_hlt_clean = Y_hlt[idx_clean, :]
Y_hlgt_clean = Y_hlgt[idx_clean, :]

In [5]:
countEnabled = True

In [6]:
fingerprints = {
    'PubChem': PubChemFingerprint(n_jobs=-1, sparse=False, count=countEnabled),
    'Laggner': LaggnerFingerprint(n_jobs=-1, sparse=False, count=countEnabled),
    'Avalon': AvalonFingerprint(n_jobs=-1, sparse=False, count=countEnabled),
    'AtomPair': AtomPairFingerprint(n_jobs=-1, sparse=False, count=countEnabled),
    'ECFP': ECFPFingerprint(n_jobs=-1, sparse=False, count=countEnabled),
    'MACCS': MACCSFingerprint(n_jobs=-1, sparse=False, count=countEnabled),
}
fp_dims = {
    'PubChem': 881,
    'Laggner': 307,
    'Avalon': 512,
    'AtomPair': 2048,
    'ECFP': 2048,
    'MACCS': 166,
}

In [7]:
models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'KNN': KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    'MLkNN': MLkNN(k=10, s=1.0)
}
levels = [
    ('LLT/PT', Y_lltpt_clean),
    ('HLT', Y_hlt_clean),
    ('HLGT', Y_hlgt_clean)
]

In [8]:
def get_proba_multilabel(model, X, Y):
    if hasattr(model, "predict_proba"):
        Y_pred = model.predict_proba(X)
        if isinstance(Y_pred, list) or isinstance(Y_pred, tuple):
            proba = np.vstack([p[:, 1] if p.shape[1] == 2 else np.zeros(p.shape[0]) for p in Y_pred]).T
        else:
            proba = Y_pred.toarray() if hasattr(Y_pred, "toarray") else Y_pred
        return proba
    else:
        return model.predict_proba(X)


In [9]:
def run_w_scaffold(model, X_train, Y_train, X_test, Y_test, multilabel=False):
    model.fit(X_train, Y_train)
    if multilabel:
        Y_pred_proba = get_proba_multilabel(model, X_test, Y_test)
    else:
        Y_pred = model.predict_proba(X_test)
        proba_list = []
        for i, p in enumerate(Y_pred):
            if p.shape[1] == 2:
                proba_list.append(p[:,1])
            else:
                present_class = model.classes_[i][0]
                if present_class == 1:
                    proba_list.append(p[:, 0])
                else:
                    proba_list.append(np.zeros_like(p[:, 0]))
        Y_pred_proba = np.vstack(proba_list).T

    
    auc_micro = roc_auc_score(Y_test.ravel(), Y_pred_proba.ravel()) * 100
    aupr_micro = average_precision_score(Y_test.ravel(), Y_pred_proba.ravel()) * 100
    try:
        auc_macro = roc_auc_score(Y_test, Y_pred_proba, average='macro') * 100
    except Exception:
        auc_macro = float('nan')
    try:
        aupr_macro = average_precision_score(Y_test, Y_pred_proba, average='macro') * 100
    except Exception:
        aupr_macro = float('nan')
    Y_pred_bin = (Y_pred_proba >= 0.5).astype(int)
    precision_micro = precision_score(Y_test, Y_pred_bin, average='micro', zero_division=0) * 100
    recall_micro = recall_score(Y_test, Y_pred_bin, average='micro', zero_division=0) * 100

    return {
        'auc_micro': auc_micro,
        'aupr_micro': aupr_micro,
        'auc_macro': auc_macro,
        'aupr_macro': aupr_macro,
        'precision_micro': precision_micro,
        'recall_micro': recall_micro
    }

In [10]:
results = []
X_cache = {}

for fp_name, fp in tqdm(fingerprints.items(), desc="Fingerprints"):
    print(f"=== Fingerprint: {fp_name} ===")
    if fp_name not in X_cache:
        X = fp.fit_transform(smiles_list_clean)
        if hasattr(X, "toarray"):
            X = X.toarray()
        X_cache[fp_name] = X
    else:
        X = X_cache[fp_name]

    
    dataset = dc.data.NumpyDataset(X, y=None, ids=smiles_list_clean)
    splitter = dc.splits.ScaffoldSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, frac_train=0.8, frac_valid=0.0, frac_test=0.2
    )
    train_idx = [np.where(dataset.ids == id)[0][0] for id in train_dataset.ids]
    test_idx = [np.where(dataset.ids == id)[0][0] for id in test_dataset.ids]


    for level_name, Y in levels:
        X_train, X_test = X[train_idx], X[test_idx]
        Y_train, Y_test = Y[train_idx], Y[test_idx]
        for model_name, model in models.items():
            multilabel = (model_name == 'MLkNN')
            print(f" - {model_name} on {level_name}")
            res = run_w_scaffold(model, X_train, Y_train, X_test, Y_test, multilabel=multilabel)
            results.append({
                'fingerprint': fp_name,
                'model': model_name,
                'level': level_name,
                'auc_micro': res['auc_micro'],
                'aupr_micro': res['aupr_micro'],
                'auc_macro': res['auc_macro'],
                'aupr_macro': res['aupr_macro'],
                'precision_micro': res['precision_micro'],
                'recall_micro': res['recall_micro'],
                'dim': fp_dims[fp_name]
            })


Fingerprints:   0%|          | 0/6 [00:00<?, ?it/s]

=== Fingerprint: PubChem ===
 - RandomForest on LLT/PT
 - KNN on LLT/PT
 - MLkNN on LLT/PT
 - RandomForest on HLT
 - KNN on HLT
 - MLkNN on HLT
 - RandomForest on HLGT
 - KNN on HLGT
 - MLkNN on HLGT


Fingerprints:  17%|█▋        | 1/6 [01:10<05:54, 70.91s/it]

=== Fingerprint: Laggner ===
 - RandomForest on LLT/PT
 - KNN on LLT/PT
 - MLkNN on LLT/PT
 - RandomForest on HLT
 - KNN on HLT
 - MLkNN on HLT
 - RandomForest on HLGT
 - KNN on HLGT
 - MLkNN on HLGT


Fingerprints:  33%|███▎      | 2/6 [02:09<04:14, 63.63s/it]

=== Fingerprint: Avalon ===
 - RandomForest on LLT/PT
 - KNN on LLT/PT
 - MLkNN on LLT/PT
 - RandomForest on HLT
 - KNN on HLT
 - MLkNN on HLT
 - RandomForest on HLGT
 - KNN on HLGT
 - MLkNN on HLGT


Fingerprints:  50%|█████     | 3/6 [03:31<03:36, 72.23s/it]

=== Fingerprint: AtomPair ===
 - RandomForest on LLT/PT
 - KNN on LLT/PT
 - MLkNN on LLT/PT
 - RandomForest on HLT
 - KNN on HLT
 - MLkNN on HLT
 - RandomForest on HLGT
 - KNN on HLGT
 - MLkNN on HLGT


Fingerprints:  67%|██████▋   | 4/6 [04:37<02:19, 69.73s/it]

=== Fingerprint: ECFP ===
 - RandomForest on LLT/PT
 - KNN on LLT/PT
 - MLkNN on LLT/PT
 - RandomForest on HLT
 - KNN on HLT
 - MLkNN on HLT
 - RandomForest on HLGT
 - KNN on HLGT
 - MLkNN on HLGT


Fingerprints:  83%|████████▎ | 5/6 [05:40<01:07, 67.27s/it]

=== Fingerprint: MACCS ===
 - RandomForest on LLT/PT
 - KNN on LLT/PT
 - MLkNN on LLT/PT
 - RandomForest on HLT
 - KNN on HLT
 - MLkNN on HLT
 - RandomForest on HLGT
 - KNN on HLGT
 - MLkNN on HLGT


Fingerprints: 100%|██████████| 6/6 [06:36<00:00, 66.02s/it]


In [18]:
filename = "results_all_scaffold.txt"
filepath = f"results/{filename}"

with open(filepath, "w") as f:
    for r in results:
        f.write(
            f"{r['fingerprint']}/{r['model']}/{r['level']}/"
            f"AUC_micro:{r['auc_micro']:.2f} "
            f"AUPR_micro:{r['aupr_micro']:.2f} "
            f"AUC_macro:{r['auc_macro']:.2f} "
            f"AUPR_macro:{r['aupr_macro']:.2f} "
            f"Precision_micro:{r['precision_micro']:.2f} "
            f"Recall_micro:{r['recall_micro']:.2f} "
            f"Dim:{r['dim']}\n"
        )
print(f"saved as {filename}")


saved as results_all_scaffold.txt


In [19]:
from make_table import parse_metrics, generate_text_table, generate_latex_table


with open(filepath, 'r') as f:
    file_content = f.read()

metrics_df = parse_metrics(file_content)

metrics_df


Unnamed: 0,dataset,model,level,extra,AUC_micro,AUPR_micro,AUC_macro,AUPR_macro,Precision_micro,Recall_micro,Dim
0,PubChem,RandomForest,LLT,PT,81.59,40.18,,17.22,61.97,20.11,881
1,PubChem,KNN,LLT,PT,73.86,27.08,,12.51,46.75,23.57,881
2,PubChem,MLkNN,LLT,PT,79.71,34.21,,9.91,0.0,0.0,881
3,PubChem,RandomForest,HLT,,83.25,48.75,,22.37,61.75,28.21,881
4,PubChem,KNN,HLT,,76.24,35.47,,17.59,49.5,31.77,881
5,PubChem,MLkNN,HLT,,80.26,42.34,,14.41,0.0,0.0,881
6,PubChem,RandomForest,HLGT,,84.39,60.69,,28.45,65.73,42.75,881
7,PubChem,KNN,HLGT,,78.84,48.18,,24.04,56.53,45.1,881
8,PubChem,MLkNN,HLGT,,81.03,54.72,,20.92,0.0,0.0,881
9,Laggner,RandomForest,LLT,PT,82.43,41.79,,18.09,65.62,19.51,307


In [20]:

print(generate_latex_table(metrics_df))


\begin{tabular}{llllrrlrrrr}
\toprule
dataset & model & level & extra & AUC_micro & AUPR_micro & AUC_macro & AUPR_macro & Precision_micro & Recall_micro & Dim \\
\midrule
PubChem & RandomForest & LLT & PT & 81.59 & 40.18 & NaN & 17.22 & 61.97 & 20.11 & 881 \\
PubChem & KNN & LLT & PT & 73.86 & 27.08 & NaN & 12.51 & 46.75 & 23.57 & 881 \\
PubChem & MLkNN & LLT & PT & 79.71 & 34.21 & NaN & 9.91 & 0.00 & 0.00 & 881 \\
PubChem & RandomForest & HLT & NaN & 83.25 & 48.75 & NaN & 22.37 & 61.75 & 28.21 & 881 \\
PubChem & KNN & HLT & NaN & 76.24 & 35.47 & NaN & 17.59 & 49.50 & 31.77 & 881 \\
PubChem & MLkNN & HLT & NaN & 80.26 & 42.34 & NaN & 14.41 & 0.00 & 0.00 & 881 \\
PubChem & RandomForest & HLGT & NaN & 84.39 & 60.69 & NaN & 28.45 & 65.73 & 42.75 & 881 \\
PubChem & KNN & HLGT & NaN & 78.84 & 48.18 & NaN & 24.04 & 56.53 & 45.10 & 881 \\
PubChem & MLkNN & HLGT & NaN & 81.03 & 54.72 & NaN & 20.92 & 0.00 & 0.00 & 881 \\
Laggner & RandomForest & LLT & PT & 82.43 & 41.79 & NaN & 18.09 & 65.62 & 