In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from rdkit import Chem
from skfp.fingerprints import (
    PubChemFingerprint, LaggnerFingerprint, AvalonFingerprint,
    AtomPairFingerprint, ECFPFingerprint, MACCSFingerprint
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    precision_score, recall_score
)
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from skmultilearn.adapt import MLkNN
from tqdm import tqdm

In [2]:
# Directory holding the binary label matrices. Kept in one place so the
# notebook can be pointed at another copy of the data with a single edit.
DATA_DIR = '/home/maciej/studia/praktyki_ibb'

# header=None: the first line of smiles.csv is treated as data, and the
# SMILES strings are taken from the first column.
smiles_df = pd.read_csv('smiles.csv', header=None)
smiles_list = smiles_df.iloc[:, 0].tolist()

# One label matrix per level (read with pandas' default header handling).
Y_lltpt = pd.read_csv(f'{DATA_DIR}/lltpt_binary.csv').to_numpy()
Y_hlt = pd.read_csv(f'{DATA_DIR}/hlt_binary.csv').to_numpy()
Y_hlgt = pd.read_csv(f'{DATA_DIR}/hlgt_binary.csv').to_numpy()

# The cleaning cell selects label rows by SMILES position, so a row-count
# mismatch would silently pair molecules with the wrong labels.
_label_rows = {
    'lltpt': Y_lltpt.shape[0],
    'hlt': Y_hlt.shape[0],
    'hlgt': Y_hlgt.shape[0],
}
assert all(n_rows == len(smiles_list) for n_rows in _label_rows.values()), (
    f"label matrices must have one row per SMILES ({len(smiles_list)}), "
    f"got {_label_rows}"
)


In [3]:
# Keep only the SMILES that RDKit can parse, remembering their original
# positions so the label matrices can be filtered to the same rows.
idx_clean, smiles_list_clean = [], []
for i, smi in enumerate(smiles_list):
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        continue  # unparsable SMILES: drop the molecule and its label row
    idx_clean.append(i)
    smiles_list_clean.append(smi)

# Apply the same row selection to every label matrix.
Y_lltpt_clean, Y_hlt_clean, Y_hlgt_clean = (
    labels[idx_clean, :] for labels in (Y_lltpt, Y_hlt, Y_hlgt)
)

In [4]:
# Forwarded as `count=` to every fingerprint constructor in `fingerprints`.
# NOTE(review): False presumably selects binary (bit) fingerprints rather than
# count vectors -- confirm against the scikit-fingerprints documentation.
countEnabled = False

In [6]:
# Fingerprint name -> configured transformer, in benchmarking order.
# Every class is built with identical settings: n_jobs=-1, dense output, and
# the notebook-wide `countEnabled` switch.
fingerprints = {
    label: fp_class(n_jobs=-1, sparse=False, count=countEnabled)
    for label, fp_class in (
        ('PubChem', PubChemFingerprint),
        ('Laggner', LaggnerFingerprint),
        ('Avalon', AvalonFingerprint),
        ('AtomPair', AtomPairFingerprint),
        ('ECFP', ECFPFingerprint),
        ('MACCS', MACCSFingerprint),
    )
}

# Hard-coded sizes reported as `dim` in the results table.
# NOTE(review): presumably the output length of each fingerprint with its
# default settings -- confirm against X.shape[1].
fp_dims = dict(
    PubChem=881,
    Laggner=307,
    Avalon=512,
    AtomPair=2048,
    ECFP=2048,
    MACCS=166,
)

In [7]:
# Classifier name -> estimator instance. The same instances are reused (and
# refitted) for every fingerprint / label-level combination.
models = dict(
    RandomForest=RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    KNN=KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    MLkNN=MLkNN(k=10, s=1.0),
)

# (level name, cleaned label matrix) pairs, evaluated in this order.
levels = list(zip(
    ('LLT/PT', 'HLT', 'HLGT'),
    (Y_lltpt_clean, Y_hlt_clean, Y_hlgt_clean),
))

In [None]:
def _single_class_is_positive(model, label_idx):
    """Return True when label `label_idx` was fitted with class 1 only."""
    classes = getattr(model, "classes_", None)
    if classes is None:
        return False
    try:
        return bool(classes[label_idx][0] == 1)
    except (IndexError, TypeError):
        # classes_ is not organised per label; nothing can be inferred.
        return False


def get_proba_multilabel(model, X, Y):
    """
    Compute positive-class probability estimates for a multilabel classifier.

    Two `predict_proba` output conventions are supported:

    * a list/tuple holding one (n_samples, n_classes) array per label; the
      probability of class 1 is extracted from each array;
    * a single dense or sparse (n_samples, n_labels) matrix, as produced by
      scikit-multilearn models; sparse results are converted to dense.

    Parameters
    ----------
    model : fitted estimator exposing `predict_proba`
    X : feature matrix to score
    Y : unused; kept so existing callers keep working

    Returns
    -------
    proba : probability estimates, shape (n_samples, n_labels)

    Raises
    ------
    AttributeError
        If `model` has no `predict_proba` method.
    """
    if not hasattr(model, "predict_proba"):
        # Fail with an explicit message instead of calling the missing method.
        raise AttributeError(
            f"{type(model).__name__} has no predict_proba; "
            "probability estimates cannot be computed"
        )

    Y_pred = model.predict_proba(X)
    if isinstance(Y_pred, (list, tuple)):
        columns = []
        for label_idx, p in enumerate(Y_pred):
            if p.shape[1] == 2:
                columns.append(p[:, 1])
            elif p.shape[1] == 1 and _single_class_is_positive(model, label_idx):
                # Only class 1 was seen during fit, so the single column
                # already is P(label == 1).
                columns.append(p[:, 0])
            else:
                # Only class 0 was seen (or layout unknown): P(label == 1) = 0.
                columns.append(np.zeros(p.shape[0]))
        return np.vstack(columns).T

    return Y_pred.toarray() if hasattr(Y_pred, "toarray") else Y_pred


In [9]:
import warnings
from sklearn.exceptions import UndefinedMetricWarning

# Silence only scikit-learn's "metric is undefined" warnings, which are
# expected when a fold has no positives for some label. Every other warning
# (deprecations, numerical problems, ...) stays visible.
warnings.filterwarnings('ignore', category=UndefinedMetricWarning)

In [None]:
def _positive_class_proba(model, per_label_proba):
    """Stack the P(label == 1) column of every per-label predict_proba array."""
    columns = []
    for i, p in enumerate(per_label_proba):
        if p.shape[1] == 2:
            columns.append(p[:, 1])
        elif model.classes_[i][0] == 1:
            # Only class 1 was present in training: the single column is P(1).
            columns.append(p[:, 0])
        else:
            # Only class 0 was present in training: P(1) is zero.
            columns.append(np.zeros_like(p[:, 0]))
    return np.vstack(columns).T


def _macro_auc_defined_labels(Y_true, Y_score):
    """Mean per-label ROC AUC over the labels with both classes in Y_true."""
    has_both_classes = Y_true.min(axis=0) != Y_true.max(axis=0)
    if not has_both_classes.any():
        return np.nan
    per_label = [
        roc_auc_score(Y_true[:, j], Y_score[:, j])
        for j in np.flatnonzero(has_both_classes)
    ]
    return float(np.mean(per_label))


def _nanmean_percent(values):
    """NaN-ignoring mean scaled to percent; NaN (without warning) if all NaN."""
    arr = np.asarray(values, dtype=float)
    if np.isnan(arr).all():
        return np.nan
    return np.nanmean(arr) * 100


def run_(model, X, Y, multilabel=False, n_splits=5, random_state=42, threshold=0.5):
    """
    Perform shuffled K-fold cross-validation and average the metrics.

    For every fold the model is refitted on the training part and scored on
    the held-out part.

    Parameters
    ----------
    model : estimator with `fit` and `predict_proba`
    X : feature matrix, indexable by integer arrays along axis 0
    Y : binary label matrix, shape (n_samples, n_labels)
    multilabel : if True, probabilities are obtained via
        `get_proba_multilabel`; otherwise `predict_proba` is expected to
        return one array per label
    n_splits : number of folds
    random_state : seed for the fold shuffling
    threshold : probability cut-off used for precision / recall

    Returns
    -------
    dict with keys `auc_micro`, `aupr_micro`, `auc_macro`, `aupr_macro`,
    `precision_micro`, `recall_micro`; fold averages expressed in percent.

    Notes
    -----
    Macro ROC AUC is averaged over the labels that have both classes in the
    test fold. ROC AUC is undefined for the remaining labels, and including
    them would make the whole fold undefined.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    auc_micro, aupr_micro, auc_macro, aupr_macro = [], [], [], []
    prec_micro, rec_micro = [], []

    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        Y_train, Y_test = Y[train_idx], Y[test_idx]

        model.fit(X_train, Y_train)
        if multilabel:
            Y_pred_proba = get_proba_multilabel(model, X_test, Y_test)
        else:
            Y_pred_proba = _positive_class_proba(model, model.predict_proba(X_test))

        # Micro averages: pool every (sample, label) decision into one problem.
        auc_micro.append(roc_auc_score(Y_test.ravel(), Y_pred_proba.ravel()))
        aupr_micro.append(average_precision_score(Y_test.ravel(), Y_pred_proba.ravel()))

        auc_macro.append(_macro_auc_defined_labels(Y_test, Y_pred_proba))
        try:
            aupr_macro.append(average_precision_score(Y_test, Y_pred_proba, average='macro'))
        except ValueError:
            # Metric undefined for this fold; excluded from the fold average.
            aupr_macro.append(np.nan)

        Y_pred_bin = (Y_pred_proba >= threshold).astype(int)
        prec_micro.append(precision_score(Y_test, Y_pred_bin, average='micro', zero_division=0))
        rec_micro.append(recall_score(Y_test, Y_pred_bin, average='micro', zero_division=0))

    return {
        'auc_micro': np.mean(auc_micro) * 100,
        'aupr_micro': np.mean(aupr_micro) * 100,
        'auc_macro': _nanmean_percent(auc_macro),
        'aupr_macro': _nanmean_percent(aupr_macro),
        'precision_micro': np.mean(prec_micro) * 100,
        'recall_micro': np.mean(rec_micro) * 100
    }


In [11]:
# Evaluate every fingerprint x label level x model combination and collect
# one flat record per combination.
results = []
for fp_name, fp in fingerprints.items():
    print(f"=== Fingerprint: {fp_name} ===")
    # Features are computed once per fingerprint and reused for all levels.
    X = fp.fit_transform(smiles_list_clean)
    X = X.toarray() if hasattr(X, "toarray") else X
    for level_name, Y in levels:
        for model_name, model in models.items():
            # MLkNN is the only model routed through get_proba_multilabel.
            multilabel = model_name == 'MLkNN'
            print(f"  - {model_name} on {level_name}")
            res = run_(model, X, Y, multilabel=multilabel)

            metric_names = (
                'auc_micro', 'aupr_micro', 'auc_macro',
                'aupr_macro', 'precision_micro', 'recall_micro',
            )
            row = {'fingerprint': fp_name, 'model': model_name, 'level': level_name}
            row.update((metric, res[metric]) for metric in metric_names)
            row['dim'] = fp_dims[fp_name]
            results.append(row)

=== Fingerprint: PubChem ===
  - RandomForest on LLT/PT
  - KNN on LLT/PT
  - MLkNN on LLT/PT
  - RandomForest on HLT
  - KNN on HLT
  - MLkNN on HLT
  - RandomForest on HLGT
  - KNN on HLGT
  - MLkNN on HLGT
=== Fingerprint: Laggner ===
  - RandomForest on LLT/PT
  - KNN on LLT/PT
  - MLkNN on LLT/PT
  - RandomForest on HLT
  - KNN on HLT
  - MLkNN on HLT
  - RandomForest on HLGT
  - KNN on HLGT
  - MLkNN on HLGT
=== Fingerprint: Avalon ===
  - RandomForest on LLT/PT
  - KNN on LLT/PT
  - MLkNN on LLT/PT
  - RandomForest on HLT
  - KNN on HLT
  - MLkNN on HLT
  - RandomForest on HLGT
  - KNN on HLGT
  - MLkNN on HLGT
=== Fingerprint: AtomPair ===
  - RandomForest on LLT/PT
  - KNN on LLT/PT
  - MLkNN on LLT/PT
  - RandomForest on HLT
  - KNN on HLT
  - MLkNN on HLT
  - RandomForest on HLGT
  - KNN on HLGT
  - MLkNN on HLGT
=== Fingerprint: ECFP ===
  - RandomForest on LLT/PT
  - KNN on LLT/PT
  - MLkNN on LLT/PT
  - RandomForest on HLT
  - KNN on HLT
  - MLkNN on HLT
  - RandomForest 

In [None]:
filename = "results_all_bin.txt"
filepath = f"results/{filename}"

# Create the output directory first: without it open() raises
# FileNotFoundError and the results of the benchmark run are not saved.
Path(filepath).parent.mkdir(parents=True, exist_ok=True)

# One line per result: "<fingerprint>/<model>/<level>/" followed by
# space-separated "Name:value" pairs.
# NOTE(review): the level name 'LLT/PT' itself contains the '/' separator --
# confirm that the downstream parser handles it as intended.
with open(filepath, "w") as f:
    for r in results:
        f.write(
            f"{r['fingerprint']}/{r['model']}/{r['level']}/"
            f"AUC_micro:{r['auc_micro']:.2f} "
            f"AUPR_micro:{r['aupr_micro']:.2f} "
            f"AUC_macro:{r['auc_macro']:.2f} "
            f"AUPR_macro:{r['aupr_macro']:.2f} "
            f"Precision_micro:{r['precision_micro']:.2f} "
            f"Recall_micro:{r['recall_micro']:.2f} "
            f"Dim:{r['dim']}\n"
        )
print(f"saved as {filename}")


saved as results_all.txt


In [5]:
from make_table import parse_metrics, generate_text_table, generate_latex_table

# Read a saved metrics file and parse it into a table.
# NOTE(review): this is results_all_quant_fps.txt, not the results_all_bin.txt
# written by the save cell -- presumably the output of a separate run; confirm.
results_path = 'results/results_all_quant_fps.txt'
with open(results_path, mode='r') as f:
    file_content = f.read()

metrics_df = parse_metrics(file_content)

# Bare last expression so the notebook renders the table.
metrics_df


Unnamed: 0,dataset,model,level,AUC_micro,AUPR_micro,AUC_macro,AUPR_macro,Precision_micro,Recall_micro,Dim
0,PubChem,RandomForest,LLT,83.56,40.94,,23.5,57.02,26.1,881
1,PubChem,KNN,LLT,76.03,28.62,,15.2,49.79,25.26,881
2,PubChem,MLkNN,LLT,80.54,32.66,,9.95,0.0,0.0,881
3,PubChem,RandomForest,HLT,84.66,49.48,67.75,28.98,59.52,33.94,881
4,PubChem,KNN,HLT,78.18,36.82,61.79,20.29,52.04,33.03,881
5,PubChem,MLkNN,HLT,80.65,40.44,58.12,14.23,0.0,0.0,881
6,PubChem,RandomForest,HLGT,85.48,60.94,66.48,35.33,64.8,46.37,881
7,PubChem,KNN,HLGT,80.25,48.93,61.24,26.72,58.4,44.94,881
8,PubChem,MLkNN,HLGT,80.83,52.46,56.45,20.61,0.0,0.0,881
9,Laggner,RandomForest,LLT,83.8,40.91,,23.08,59.13,24.01,307


In [7]:

# Render the parsed metrics as LaTeX source and print it for copy-pasting.
latex_table = generate_latex_table(metrics_df)
print(latex_table)


\begin{tabular}{lllrrrrrrr}
\toprule
dataset & model & level & AUC_micro & AUPR_micro & AUC_macro & AUPR_macro & Precision_micro & Recall_micro & Dim \\
\midrule
PubChem & RandomForest & LLT & 83.56 & 40.94 & NaN & 23.50 & 57.02 & 26.10 & 881 \\
PubChem & KNN & LLT & 76.03 & 28.62 & NaN & 15.20 & 49.79 & 25.26 & 881 \\
PubChem & MLkNN & LLT & 80.54 & 32.66 & NaN & 9.95 & 0.00 & 0.00 & 881 \\
PubChem & RandomForest & HLT & 84.66 & 49.48 & 67.75 & 28.98 & 59.52 & 33.94 & 881 \\
PubChem & KNN & HLT & 78.18 & 36.82 & 61.79 & 20.29 & 52.04 & 33.03 & 881 \\
PubChem & MLkNN & HLT & 80.65 & 40.44 & 58.12 & 14.23 & 0.00 & 0.00 & 881 \\
PubChem & RandomForest & HLGT & 85.48 & 60.94 & 66.48 & 35.33 & 64.80 & 46.37 & 881 \\
PubChem & KNN & HLGT & 80.25 & 48.93 & 61.24 & 26.72 & 58.40 & 44.94 & 881 \\
PubChem & MLkNN & HLGT & 80.83 & 52.46 & 56.45 & 20.61 & 0.00 & 0.00 & 881 \\
Laggner & RandomForest & LLT & 83.80 & 40.91 & NaN & 23.08 & 59.13 & 24.01 & 307 \\
Laggner & KNN & LLT & 76.21 & 29.21 & 