In [1]:
import warnings
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings('ignore', category=UndefinedMetricWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

In [2]:
import numpy as np
import pandas as pd
from rdkit import Chem
from skfp.fingerprints import (
    PubChemFingerprint, LaggnerFingerprint, AvalonFingerprint,
    AtomPairFingerprint, ECFPFingerprint, MACCSFingerprint
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from skmultilearn.adapt import MLkNN
#from sklearn.model_selection import KFold
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    precision_score, recall_score
)
from tqdm import tqdm

import deepchem as dc
from rdkit import Chem

import umap
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering


No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
No normalization for NumAmideBonds. Feature removed!
No normalization for NumAtomStereoCenters. Feature removed!
No normalization for NumBridgeheadAtoms. Feature removed!
No normalization for NumHeterocycles. Feature removed!
No normalization for NumSpiroAtoms. Feature removed!
No normalization for NumUnspecifiedAtomStereoCenters. Feature removed!
No normalization for Phi. Feature removed!
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading some PyTorch models, missing a dependency. No module named 'transformers'
No module named 'transformers'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'transformers'
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'
Skipped 

In [3]:
smiles_df = pd.read_csv('smiles.csv', header=None)
smiles_list = smiles_df.iloc[:, 0].tolist()
Y_lltpt = pd.read_csv('/home/maciej/studia/praktyki_ibb/lltpt_binary.csv').to_numpy()
Y_hlt = pd.read_csv('/home/maciej/studia/praktyki_ibb/hlt_binary.csv').to_numpy()
Y_hlgt = pd.read_csv('/home/maciej/studia/praktyki_ibb/hlgt_binary.csv').to_numpy()


In [4]:
smiles_list_clean = []
idx_clean = []
mols_clean = []
for i, smi in enumerate(smiles_list):
    mol = Chem.MolFromSmiles(smi)
    if mol is not None:
        smiles_list_clean.append(smi)
        mols_clean.append(mol)
        idx_clean.append(i)
Y_lltpt_clean = Y_lltpt[idx_clean, :]
Y_hlt_clean = Y_hlt[idx_clean, :]
Y_hlgt_clean = Y_hlgt[idx_clean, :]

In [5]:
countEnabled = True

In [6]:
fingerprints = {
    'PubChem': PubChemFingerprint(n_jobs=-1, sparse=False, count=countEnabled),
    'Laggner': LaggnerFingerprint(n_jobs=-1, sparse=False, count=countEnabled),
    'Avalon': AvalonFingerprint(n_jobs=-1, sparse=False, count=countEnabled),
    'AtomPair': AtomPairFingerprint(n_jobs=-1, sparse=False, count=countEnabled),
    'ECFP': ECFPFingerprint(n_jobs=-1, sparse=False, count=countEnabled),
    'MACCS': MACCSFingerprint(n_jobs=-1, sparse=False, count=countEnabled),
}
fp_dims = {
    'PubChem': 881,
    'Laggner': 307,
    'Avalon': 512,
    'AtomPair': 2048,
    'ECFP': 2048,
    'MACCS': 166,
}

In [7]:
models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'KNN': KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    'MLkNN': MLkNN(k=10, s=1.0)
}
levels = [
    ('LLT/PT', Y_lltpt_clean),
    ('HLT', Y_hlt_clean),
    ('HLGT', Y_hlgt_clean)
]

In [None]:
def get_proba_multilabel(model, X, Y):
    """
    compute probability estimates for multilabel classifiers

    returns
        probability predictions, shape (n_samples, n_labels)
    """
    if hasattr(model, "predict_proba"):
        Y_pred = model.predict_proba(X)
        if isinstance(Y_pred, list) or isinstance(Y_pred, tuple):
            proba = np.vstack([p[:, 1] if p.shape[1] == 2 else np.zeros(p.shape[0]) for p in Y_pred]).T
        else:
            proba = Y_pred.toarray() if hasattr(Y_pred, "toarray") else Y_pred
        return proba
    else:
        return model.predict_proba(X)


In [9]:
def run_w_scaffold(model, X_train, Y_train, X_test, Y_test, multilabel=False):
    model.fit(X_train, Y_train)
    if multilabel:
        Y_pred_proba = get_proba_multilabel(model, X_test, Y_test)
    else:
        Y_pred = model.predict_proba(X_test)
        proba_list = []
        for i, p in enumerate(Y_pred):
            if p.shape[1] == 2:
                proba_list.append(p[:,1])
            else:
                present_class = model.classes_[i][0]
                if present_class == 1:
                    proba_list.append(p[:, 0])
                else:
                    proba_list.append(np.zeros_like(p[:, 0]))
        Y_pred_proba = np.vstack(proba_list).T

    
    auc_micro = roc_auc_score(Y_test.ravel(), Y_pred_proba.ravel()) * 100
    aupr_micro = average_precision_score(Y_test.ravel(), Y_pred_proba.ravel()) * 100
    try:
        auc_macro = roc_auc_score(Y_test, Y_pred_proba, average='macro') * 100
    except Exception:
        auc_macro = float('nan')
    try:
        aupr_macro = average_precision_score(Y_test, Y_pred_proba, average='macro') * 100
    except Exception:
        aupr_macro = float('nan')
    Y_pred_bin = (Y_pred_proba >= 0.5).astype(int)
    precision_micro = precision_score(Y_test, Y_pred_bin, average='micro', zero_division=0) * 100
    recall_micro = recall_score(Y_test, Y_pred_bin, average='micro', zero_division=0) * 100

    return {
        'auc_micro': auc_micro,
        'aupr_micro': aupr_micro,
        'auc_macro': auc_macro,
        'aupr_macro': aupr_macro,
        'precision_micro': precision_micro,
        'recall_micro': recall_micro
    }

In [None]:

clusters_to_validate_dict = {
    'PubChem':  [1],            
    'Laggner':  [2, 2],                 
    'Avalon':   [1, 2],            
    'AtomPair': [1, 2],                 
    'ECFP':     [1, 2],             
    'MACCS':    [2, 5]           
}


param_lookup = {
    'PubChem':  {'n_components': 10, 'n_clusters': 5},
    'Laggner':  {'n_components': 18, 'n_clusters': 3},
    'Avalon':   {'n_components': 5, 'n_clusters': 3},
    'AtomPair': {'n_components': 5, 'n_clusters': 3},
    'ECFP':     {'n_components': 13, 'n_clusters': 3},
    'MACCS':    {'n_components': 20, 'n_clusters': 9},
}

results = []
X_cache = {}

for fp_name, fp in tqdm(fingerprints.items(), desc="Fingerprints"):
    print(f"=== Fingerprint: {fp_name} ===")
    if fp_name not in X_cache:
        X = fp.fit_transform(smiles_list_clean)
        if hasattr(X, "toarray"):
            X = X.toarray()
        X_cache[fp_name] = X
    else:
        X = X_cache[fp_name]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)


    n_components = param_lookup[fp_name]['n_components']
    n_clusters   = param_lookup[fp_name]['n_clusters']
    umap_reducer = umap.UMAP(n_components=n_components, metric='jaccard', random_state=42, n_jobs=5)
    X_umap = umap_reducer.fit_transform(X_scaled)

    clustering = AgglomerativeClustering(n_clusters=n_clusters)
    labels = clustering.fit_predict(X_umap)


    clusters_to_validate = clusters_to_validate_dict[fp_name]
    test_idx = [i for i, lbl in enumerate(labels) if lbl in clusters_to_validate]
    train_idx = [i for i, lbl in enumerate(labels) if lbl not in clusters_to_validate]

    for level_name, Y in levels:
        X_train, X_test = X[train_idx], X[test_idx]
        Y_train, Y_test = Y[train_idx], Y[test_idx]
        for model_name, model in models.items():
            multilabel = (model_name == 'MLkNN')
            res = run_w_scaffold(model, X_train, Y_train, X_test, Y_test, multilabel=multilabel)
            results.append({
                'fingerprint': fp_name,
                'model': model_name,
                'level': level_name,
                'auc_micro': res['auc_micro'],
                'aupr_micro': res['aupr_micro'],
                'auc_macro': res['auc_macro'],
                'aupr_macro': res['aupr_macro'],
                'precision_micro': res['precision_micro'],
                'recall_micro': res['recall_micro'],
                'dim': fp_dims[fp_name],
                'n_test': len(test_idx),
                'n_train': len(train_idx),
                'val_clusters': clusters_to_validate
            })


Fingerprints:   0%|          | 0/6 [00:00<?, ?it/s]

=== Fingerprint: PubChem ===


Fingerprints:  17%|█▋        | 1/6 [01:08<05:41, 68.26s/it]

=== Fingerprint: Laggner ===


Fingerprints:  33%|███▎      | 2/6 [02:09<04:15, 63.98s/it]

=== Fingerprint: Avalon ===


Fingerprints:  50%|█████     | 3/6 [03:31<03:37, 72.33s/it]

=== Fingerprint: AtomPair ===


Fingerprints:  67%|██████▋   | 4/6 [04:55<02:33, 76.88s/it]

=== Fingerprint: ECFP ===


Fingerprints:  83%|████████▎ | 5/6 [06:03<01:13, 73.63s/it]

=== Fingerprint: MACCS ===


Fingerprints: 100%|██████████| 6/6 [06:54<00:00, 69.15s/it]


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
import umap
import numpy as np
import pandas as pd
from tqdm import tqdm


X_cache = {}


n_components_range = range(2, 21)   
n_clusters_range = range(3, 10)     

clustering_results = []

for fp_name, fp in tqdm(fingerprints.items(), desc='fingerprints'):
    if fp_name not in X_cache:
        X = fp.fit_transform(smiles_list_clean)
        if hasattr(X, "toarray"):
            X = X.toarray()
        X_cache[fp_name] = X
    else:
        X = X_cache[fp_name]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    best_silhouette = -1
    best_n_components = None
    best_n_clusters = None
    best_labels = None

    for n_components in n_components_range:
        umap_reducer = umap.UMAP(n_components=n_components, metric='jaccard', random_state=42, n_jobs=5)
        X_umap = umap_reducer.fit_transform(X_scaled)
        for n_clusters in n_clusters_range:
            clustering = AgglomerativeClustering(n_clusters=n_clusters)
            labels = clustering.fit_predict(X_umap)
            silhouette = silhouette_score(X_umap, labels)
            if silhouette > best_silhouette:
                best_silhouette = silhouette
                best_n_components = n_components
                best_n_clusters = n_clusters
                best_labels = labels

    counts = np.bincount(best_labels)
    clustering_results.append({
        'fingerprint': fp_name,
        'silhouette': best_silhouette,
        'cluster_sizes': counts.tolist(),
        'n_clusters': best_n_clusters,
        'n_components': best_n_components
    })

clustering_df = pd.DataFrame(clustering_results)


fingerprints: 100%|██████████| 6/6 [08:59<00:00, 89.94s/it]


In [23]:
clustering_df['size_sum'] = clustering_df['cluster_sizes'].apply(lambda x: sum(x))

In [25]:
clustering_df['validate_set_size'] = clustering_df['size_sum']*0.2

In [26]:
clustering_df

Unnamed: 0,fingerprint,silhouette,cluster_sizes,n_clusters,n_components,size_sum,validate_set_size
0,PubChem,0.511185,"[877, 335, 114, 27, 33]",5,10,1386,277.2
1,Laggner,0.507569,"[1308, 10, 68]",3,18,1386,277.2
2,Avalon,0.791479,"[1195, 153, 38]",3,5,1386,277.2
3,AtomPair,0.566422,"[1212, 28, 146]",3,5,1386,277.2
4,ECFP,0.459035,"[1280, 45, 61]",3,13,1386,277.2
5,MACCS,0.568609,"[869, 53, 136, 39, 66, 147, 58, 8, 10]",9,20,1386,277.2


In [29]:
filename = "results_all_umap_best_params.txt"
filepath = f"results/{filename}"

with open(filepath, "w") as f:
    for r in results:
        f.write(
            f"{r['fingerprint']}/{r['model']}/{r['level']}/"
            f"AUC_micro:{r['auc_micro']:.2f} "
            f"AUPR_micro:{r['aupr_micro']:.2f} "
            f"AUC_macro:{r['auc_macro']:.2f} "
            f"AUPR_macro:{r['aupr_macro']:.2f} "
            f"Precision_micro:{r['precision_micro']:.2f} "
            f"Recall_micro:{r['recall_micro']:.2f} "
            f"Dim:{r['dim']}\n"
        )
print(f"saved as {filename}")


saved as results_all_umap_best_params.txt


In [30]:
from make_table import parse_metrics, generate_text_table, generate_latex_table


with open(filepath, 'r') as f:
    file_content = f.read()

metrics_df = parse_metrics(file_content)

metrics_df


Unnamed: 0,dataset,model,level,AUC_micro,AUPR_micro,AUC_macro,AUPR_macro,Precision_micro,Recall_micro,Dim
0,PubChem,RandomForest,LLT,79.73,30.06,,9.12,52.03,15.16,881
1,PubChem,KNN,LLT,72.72,20.54,,7.64,37.3,23.03,881
2,PubChem,MLkNN,LLT,78.49,27.12,,6.82,0.0,0.0,881
3,PubChem,RandomForest,HLT,81.01,37.05,,13.26,52.2,19.72,881
4,PubChem,KNN,HLT,75.24,27.18,,11.5,39.09,30.83,881
5,PubChem,MLkNN,HLT,79.19,33.87,,10.54,0.0,0.0,881
6,PubChem,RandomForest,HLGT,82.31,49.4,,19.5,57.35,32.7,881
7,PubChem,KNN,HLGT,77.56,38.35,,17.1,45.61,43.4,881
8,PubChem,MLkNN,HLGT,79.94,45.48,,16.33,0.0,0.0,881
9,Laggner,RandomForest,LLT,83.59,38.16,,13.31,51.05,26.69,307


In [31]:

print(generate_latex_table(metrics_df))


\begin{tabular}{lllrrlrrrr}
\toprule
dataset & model & level & AUC_micro & AUPR_micro & AUC_macro & AUPR_macro & Precision_micro & Recall_micro & Dim \\
\midrule
PubChem & RandomForest & LLT & 79.73 & 30.06 & NaN & 9.12 & 52.03 & 15.16 & 881 \\
PubChem & KNN & LLT & 72.72 & 20.54 & NaN & 7.64 & 37.30 & 23.03 & 881 \\
PubChem & MLkNN & LLT & 78.49 & 27.12 & NaN & 6.82 & 0.00 & 0.00 & 881 \\
PubChem & RandomForest & HLT & 81.01 & 37.05 & NaN & 13.26 & 52.20 & 19.72 & 881 \\
PubChem & KNN & HLT & 75.24 & 27.18 & NaN & 11.50 & 39.09 & 30.83 & 881 \\
PubChem & MLkNN & HLT & 79.19 & 33.87 & NaN & 10.54 & 0.00 & 0.00 & 881 \\
PubChem & RandomForest & HLGT & 82.31 & 49.40 & NaN & 19.50 & 57.35 & 32.70 & 881 \\
PubChem & KNN & HLGT & 77.56 & 38.35 & NaN & 17.10 & 45.61 & 43.40 & 881 \\
PubChem & MLkNN & HLGT & 79.94 & 45.48 & NaN & 16.33 & 0.00 & 0.00 & 881 \\
Laggner & RandomForest & LLT & 83.59 & 38.16 & NaN & 13.31 & 51.05 & 26.69 & 307 \\
Laggner & KNN & LLT & 74.73 & 27.31 & NaN & 9.01 & 5