In [1]:
# ===========================
# Core Python & Utilities
# ===========================
import numpy as np
import pandas as pd
import GEOparse

# ===========================
# Scikit-learn preprocessing & model selection
# ===========================
from sklearn.model_selection import train_test_split, GroupKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline

# ===========================
# Scikit-learn models
# ===========================
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA

# ===========================
# Metrics
# ===========================
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    matthews_corrcoef,
    make_scorer,
    euclidean_distances
)


In [2]:
gse = GEOparse.get_GEO("GSE68951", destdir="../data")
print(gse)

05-Feb-2026 19:58:10 DEBUG utils - Directory ../data already exists. Skipping.
05-Feb-2026 19:58:10 INFO GEOparse - File already exist: using local version.
05-Feb-2026 19:58:10 INFO GEOparse - Parsing ../data/GSE68951_family.soft.gz: 
05-Feb-2026 19:58:10 DEBUG GEOparse - DATABASE: GeoMiame
05-Feb-2026 19:58:10 DEBUG GEOparse - SERIES: GSE68951
05-Feb-2026 19:58:10 DEBUG GEOparse - PLATFORM: GPL16770
05-Feb-2026 19:58:10 DEBUG GEOparse - SAMPLE: GSM1688368
05-Feb-2026 19:58:10 DEBUG GEOparse - SAMPLE: GSM1688369
05-Feb-2026 19:58:10 DEBUG GEOparse - SAMPLE: GSM1688370
05-Feb-2026 19:58:10 DEBUG GEOparse - SAMPLE: GSM1688371
05-Feb-2026 19:58:10 DEBUG GEOparse - SAMPLE: GSM1688372
05-Feb-2026 19:58:10 DEBUG GEOparse - SAMPLE: GSM1688373
05-Feb-2026 19:58:10 DEBUG GEOparse - SAMPLE: GSM1688374
05-Feb-2026 19:58:10 DEBUG GEOparse - SAMPLE: GSM1688375
05-Feb-2026 19:58:10 DEBUG GEOparse - SAMPLE: GSM1688376
05-Feb-2026 19:58:10 DEBUG GEOparse - SAMPLE: GSM1688377
05-Feb-2026 19:58:10 DEBU

<SERIES: GSE68951 - 215 SAMPLES, 1 d(s)>


In [3]:
sample = gse.gsms["GSM1688406"]
sample.table.tail()


Unnamed: 0,ID_REF,VALUE
1200,hsa-miR-100,1.936057
1201,hsa-miR-101,4.114362
1202,hsa-miR-1289,1.735477
1203,hsa-miR-1288,2.192123
1204,hsa-miR-105,1.285812


In [4]:
def extract_full_info(gse):

    data = [] 
    for gsm_id, sample in gse.gsms.items():

        meta = sample.metadata["characteristics_ch1"]

        patient_id, timepoint, disease = None, None, None
        
        for item in meta:
            if item.startswith("patient id"):
                patient_id = item.split(":")[1].strip()
            if item.startswith("timepoint"):
                timepoint = int(item.split(":")[1].strip())
            if item.startswith("disease"):
                disease = item.split(":")[1].strip()
        if patient_id is None or timepoint is None:
            continue

        mirnas = sample.table["ID_REF"].values
        expr   = sample.table["VALUE"].values.astype(float)

        data.append({
            "gsm": gsm_id,
            "patient": patient_id,
            "timepoint": timepoint,
            "mirna_names": mirnas,
            "expression": expr
        })

    return pd.DataFrame(data)

In [5]:
df_all = extract_full_info(gse)
df_all.head()



Unnamed: 0,gsm,patient,timepoint,mirna_names,expression
0,GSM1688368,A,1,"[hsa-miR-507, hsa-miR-548d-5p, hsa-miR-1976, h...","[1.854979099, 2.184182338, 2.532296403, 1.5180..."
1,GSM1688369,A,2,"[hsa-miR-507, hsa-miR-548d-5p, hsa-miR-1976, h...","[1.469763084, 2.320244044, 2.111889095, 1.6172..."
2,GSM1688370,A,3,"[hsa-miR-507, hsa-miR-548d-5p, hsa-miR-1976, h...","[1.44937518, 2.510284729, 2.083508284, 1.23588..."
3,GSM1688371,A,4,"[hsa-miR-507, hsa-miR-548d-5p, hsa-miR-1976, h...","[1.777522456, 2.724678628, 3.012721615, 1.3907..."
4,GSM1688372,A,5,"[hsa-miR-507, hsa-miR-548d-5p, hsa-miR-1976, h...","[1.538218176, 2.030509973, 2.470374907, 1.5635..."


In [6]:
# Feature matrix
X = np.vstack(df_all["expression"].values)

# Labels = patient identity
y = df_all["patient"].astype("category").cat.codes.values

# Groups = patient (CV kontrolü için)
groups = df_all["patient"].values


In [7]:
df_all.groupby("patient").size()


patient
A              8
B              8
C              8
D              8
E              8
F              8
G              8
H              7
I              8
J              8
K              8
L              8
M              8
N              8
O              8
P              8
Q              8
R              8
S              7
T              8
U              8
V              8
W              8
X              7
Y              8
Z              6
ZZ_control    12
dtype: int64

In [8]:
groups

array(['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'E', 'E', 'E', 'E', 'E', 'E', 'E',
       'E', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'G', 'G', 'G', 'G',
       'G', 'G', 'G', 'G', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'I', 'I',
       'I', 'I', 'I', 'I', 'I', 'I', 'J', 'J', 'J', 'J', 'J', 'J', 'J',
       'J', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'L', 'L', 'L', 'L',
       'L', 'L', 'L', 'L', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'N',
       'N', 'N', 'N', 'N', 'N', 'N', 'N', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
       'S', 'S', 'S', 'S', 'S', 'S', 'S', 'T', 'T', 'T', 'T', 'T', 'T',
       'T', 'T', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'V', 'V', 'V',
       'V', 'V', 'V', 'V', 'V', 'W', 'W', 'W', 'W', 'W', 'W', 'W

In [9]:
scaler_classifier = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=5000)
)



In [10]:
def evaluate_model_logreg(X, y, classifier=None):  # NAIVE METHOD 
    if classifier is None:
        classifier = LogisticRegression(max_iter=5000)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=45, stratify=y
    )

    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    probs = classifier.predict_proba(X_test)  # Proba for top-5

    print(classification_report(y_test, y_pred, zero_division=0))

    # Top-1/5 hesapla
    rank1 = 0
    rank5 = 0
    total = len(y_test)
    for i in range(total):
        true_label = y_test[i]
        top_idx = np.argsort(-probs[i])[:5]
        top_labels = classifier.classes_[top_idx]
        if true_label == y_pred[i]:  # Top-1
            rank1 += 1
        if true_label in top_labels:  # Top-5
            rank5 += 1

    print(f"Naive LogReg Rank-1: {rank1 / total:.3f} ({rank1}/{total})")
    print(f"Naive LogReg Rank-5: {rank5 / total:.3f} ({rank5}/{total})")

    return classifier

In [11]:
cfg = evaluate_model_logreg(X, y, scaler_classifier)


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         2
           2       0.25      0.50      0.33         2
           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00         2
           5       1.00      0.50      0.67         2
           6       0.00      0.00      0.00         2
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00         2
          11       0.00      0.00      0.00         1
          12       1.00      1.00      1.00         1
          13       0.00      0.00      0.00         2
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         2
          16       0.00      0.00      0.00         2
          17       0.00    

In [12]:
groups

array(['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'E', 'E', 'E', 'E', 'E', 'E', 'E',
       'E', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'G', 'G', 'G', 'G',
       'G', 'G', 'G', 'G', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'I', 'I',
       'I', 'I', 'I', 'I', 'I', 'I', 'J', 'J', 'J', 'J', 'J', 'J', 'J',
       'J', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'L', 'L', 'L', 'L',
       'L', 'L', 'L', 'L', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'N',
       'N', 'N', 'N', 'N', 'N', 'N', 'N', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
       'S', 'S', 'S', 'S', 'S', 'S', 'S', 'T', 'T', 'T', 'T', 'T', 'T',
       'T', 'T', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'V', 'V', 'V',
       'V', 'V', 'V', 'V', 'V', 'W', 'W', 'W', 'W', 'W', 'W', 'W

In [13]:
def evaluate_model_logreg_groupKFold(X, y, groups): #Actually does not suit our problem at hand at all cause KFold is an open-set classification

    gkf = GroupKFold(n_splits=5)
    accuracies = []
    f1s = []
    fold = 1

    clf = make_pipeline(
        StandardScaler(),
        LogisticRegression(max_iter=5000)
    )

    for train_idx, test_idx in gkf.split(X, y, groups):

        print(f"\n====== Fold {fold} ======")

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)

        print("Accuracy:", acc)
        print("F1 macro:", f1)
        print(classification_report(y_test, y_pred, zero_division=0))

        accuracies.append(acc)
        f1s.append(f1)
        fold += 1

    print("\n=== FINAL LOGREG ===")
    print("Mean Accuracy:", np.mean(accuracies))
    print("Mean F1 Macro:", np.mean(f1s))


In [14]:
evaluate_model_logreg_groupKFold(X, y, groups)
(X, y, groups)


Accuracy: 0.0
F1 macro: 0.0
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       0.0
           2       0.00      0.00      0.00       0.0
           4       0.00      0.00      0.00       0.0
           7       0.00      0.00      0.00       7.0
           9       0.00      0.00      0.00       8.0
          11       0.00      0.00      0.00       0.0
          12       0.00      0.00      0.00       0.0
          13       0.00      0.00      0.00       0.0
          15       0.00      0.00      0.00       8.0
          16       0.00      0.00      0.00       0.0
          17       0.00      0.00      0.00       0.0
          18       0.00      0.00      0.00       0.0
          21       0.00      0.00      0.00       8.0
          22       0.00      0.00      0.00       0.0
          24       0.00      0.00      0.00       0.0
          25       0.00      0.00      0.00       0.

(array([[1.8549791 , 2.18418234, 2.5322964 , ..., 1.41000193, 3.72745529,
         1.37007761],
        [1.46976308, 2.32024404, 2.11188909, ..., 1.4279172 , 2.45035886,
         1.35776645],
        [1.44937518, 2.51028473, 2.08350828, ..., 1.12366167, 2.571076  ,
         1.06683767],
        ...,
        [1.55135439, 2.09630509, 1.84061179, ..., 1.46840623, 4.03715339,
         0.933782  ],
        [1.58819243, 2.44854081, 2.43935147, ..., 1.26001547, 3.43739097,
         1.32423004],
        [1.96891928, 2.30610466, 2.02487855, ..., 1.21281883, 3.98844345,
         0.86618795]]),
 array([ 0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  2,
         2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3,  4,  4,
         4,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,
         6,  6,  6,  6,  6,  7,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,
         8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10,
        10, 10, 11, 11, 11, 1

In [15]:

def run_basic_svm_baseline(X, y, test_size=0.2, random_state=42):
    """
    Naive SVM baseline (NOT attack-consistent).
    Purpose: sanity check that SVM can learn from the data.
    """

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        stratify=y,
        random_state=random_state
    )

    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=0.95)),
        ("svm", SVC(
            kernel="rbf",
            C=1.0,
            gamma="scale"
        ))
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    print("Naive SVM baseline (random split)")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred, zero_division=0))

    return pipeline



In [16]:
run_basic_svm_baseline(X,y)

Naive SVM baseline (random split)
Accuracy: 0.18604651162790697
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         2
           2       1.00      0.50      0.67         2
           3       0.33      0.50      0.40         2
           4       0.00      0.00      0.00         2
           5       0.50      0.50      0.50         2
           6       0.00      0.00      0.00         2
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         2
           9       0.33      1.00      0.50         1
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         2
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         2
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         2
          16     

In [17]:
def build_svm_pca_pipeline():
    return Pipeline([
        ("scaler", StandardScaler()),
        ("pca", PCA()),
        ("svm", SVC(kernel="rbf"))
    ])


In [18]:
def tune_hyperparameters(pipeline, X, y, groups):

    param_grid = {
        "pca__n_components": [5, 10, 15, 20],
        "svm__C": [0.1, 1, 10],
        "svm__gamma": ["scale", 0.01, 0.001]
    }

    mcc_scorer = make_scorer(matthews_corrcoef)
    gkf = GroupKFold(n_splits=5)

    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring=mcc_scorer,
        cv=gkf.split(X, y, groups),
        n_jobs=-1,
        verbose=0
    )

    grid.fit(X, y)

    print("Best Params:", grid.best_params_)
    print("Best MCC:", grid.best_score_)

    return grid.best_estimator_


In [19]:
def evaluate_model(model, X, y, groups):

    gkf = GroupKFold(n_splits=5)
    accuracies = []
    f1s = []
    fold = 1

    for train_idx, test_idx in gkf.split(X, y, groups):

        print(f"\n====== Fold {fold} ======")

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)

        print("Accuracy:", acc)
        print("F1:", f1)
        print(classification_report(y_test, y_pred, zero_division=0))

        accuracies.append(acc)
        f1s.append(f1)
        fold += 1

    print("\nFINAL RESULTS")
    print("Mean Accuracy:", np.mean(accuracies))
    print("Mean F1 Macro:", np.mean(f1s))


In [20]:
def run_svm_pca_experiment(X, y, groups):

    print("Building model")
    pipeline = build_svm_pca_pipeline()

    print("Tuning hyperparameters")
    best_model = tune_hyperparameters(pipeline, X, y, groups)

    print("Evaluating final model")
    evaluate_model(best_model, X, y, groups)

    return best_model


In [21]:
best_model = run_svm_pca_experiment(X, y, groups)


Building model
Tuning hyperparameters
Best Params: {'pca__n_components': 5, 'svm__C': 0.1, 'svm__gamma': 'scale'}
Best MCC: 0.0
Evaluating final model

Accuracy: 0.0
F1: 0.0
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       0.0
           2       0.00      0.00      0.00       0.0
           4       0.00      0.00      0.00       0.0
           7       0.00      0.00      0.00       7.0
           8       0.00      0.00      0.00       0.0
           9       0.00      0.00      0.00       8.0
          10       0.00      0.00      0.00       0.0
          12       0.00      0.00      0.00       0.0
          13       0.00      0.00      0.00       0.0
          14       0.00      0.00      0.00       0.0
          15       0.00      0.00      0.00       8.0
          17       0.00      0.00      0.00       0.0
          19       0.00      0.00      0.00       0.0
          21   

In [22]:
g = df_all.groupby("patient")
first = next(iter(g))
type(first), len(first)

(tuple, 2)

In [23]:
def reidentification_cv(df): #patient-aware LOOCV
    for patient, df_p in df.groupby("patient"):
        for test_idx in df_p.index:
            train_idx = df.index.difference([test_idx]) #Assume that one timepoint for each patient is an unknown profile (test data).
            yield train_idx.to_numpy(), np.array([test_idx]) 


In [24]:

clf = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=0.9)),  # %90 variance
    ("logreg", LogisticRegression(max_iter=5000))
])

y_true_all = []
y_pred_all = []
rank1_logreg = 0
rank5_logreg = 0
total_tests = 0

for fold, (train_idx, test_idx) in enumerate(reidentification_cv(df_all), 1):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    probs = clf.predict_proba(X_test)[0]  # Proba for top-5

    y_true_all.extend(y_test)
    y_pred_all.extend(y_pred)

    true_label = y_test[0]
    top_idx = np.argsort(-probs)[:5]  # Top-5 indices (descending)
    top_labels = clf.classes_[top_idx]

    if true_label == y_pred[0]:  # Top-1 (direct predict)
        rank1_logreg += 1
    if true_label in top_labels:  # Top-5
        rank5_logreg += 1
    total_tests += 1

report = classification_report(
    y_true_all,
    y_pred_all,
    output_dict=True,
    zero_division=0
)

report_df = pd.DataFrame(report).transpose()
print(report_df)
print(f"LogReg Rank-1: {rank1_logreg / total_tests:.3f} ({rank1_logreg}/{total_tests})")
print(f"LogReg Rank-5: {rank5_logreg / total_tests:.3f} ({rank5_logreg}/{total_tests})")

              precision    recall  f1-score     support
0              0.000000  0.000000  0.000000    8.000000
1              0.200000  0.375000  0.260870    8.000000
2              0.400000  0.750000  0.521739    8.000000
3              0.750000  0.375000  0.500000    8.000000
4              0.400000  0.250000  0.307692    8.000000
5              0.666667  0.750000  0.705882    8.000000
6              0.428571  0.375000  0.400000    8.000000
7              0.600000  0.428571  0.500000    7.000000
8              1.000000  0.375000  0.545455    8.000000
9              0.714286  0.625000  0.666667    8.000000
10             0.500000  0.125000  0.200000    8.000000
11             0.300000  0.375000  0.333333    8.000000
12             1.000000  0.375000  0.545455    8.000000
13             0.400000  0.500000  0.444444    8.000000
14             1.000000  0.750000  0.857143    8.000000
15             0.428571  0.375000  0.400000    8.000000
16             0.666667  0.500000  0.571429    8

In [None]:

clf_svm = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=0.9)),
    ("svm", SVC(
        kernel="rbf",
        C=10,
        gamma="scale",
        class_weight="balanced",
        probability=True  # Proba için ekledim
    ))
])

y_true_all_svm = []
y_pred_all_svm = []
rank1_svm = 0
rank5_svm = 0
total_tests = 0

for fold, (train_idx, test_idx) in enumerate(reidentification_cv(df_all), 1):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    clf_svm.fit(X_train, y_train)
    y_pred = clf_svm.predict(X_test)
    probs = clf_svm.predict_proba(X_test)[0]  # Proba ekledim

    y_true_all_svm.extend(y_test)
    y_pred_all_svm.extend(y_pred)

    true_label = y_test[0]
    top_idx = np.argsort(-probs)[:5]
    top_labels = clf_svm.classes_[top_idx]

    if true_label == y_pred[0]:
        rank1_svm += 1
    if true_label in top_labels:
        rank5_svm += 1
    total_tests += 1

report_svm = classification_report(
    y_true_all_svm,
    y_pred_all_svm,
    output_dict=True,
    zero_division=0
)

report_svm_df = pd.DataFrame(report_svm).transpose()
print(report_svm_df)
print(f"SVM Rank-1: {rank1_svm / total_tests:.3f} ({rank1_svm}/{total_tests})")
print(f"SVM Rank-5: {rank5_svm / total_tests:.3f} ({rank5_svm}/{total_tests})")

In [None]:

clf_nb = Pipeline([
    ("scaler", StandardScaler()),
    ("gnb", GaussianNB())
])

y_true_all_nb = []
y_pred_all_nb = []
rank1_nb = 0
rank5_nb = 0
total_tests = 0

for fold, (train_idx, test_idx) in enumerate(reidentification_cv(df_all), 1):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    clf_nb.fit(X_train, y_train)
    y_pred = clf_nb.predict(X_test)
    probs = clf_nb.predict_proba(X_test)[0]

    y_true_all_nb.extend(y_test)
    y_pred_all_nb.extend(y_pred)

    true_label = y_test[0]
    top_idx = np.argsort(-probs)[:5]
    top_labels = clf_nb.classes_[top_idx]

    if true_label == y_pred[0]:
        rank1_nb += 1
    if true_label in top_labels:
        rank5_nb += 1
    total_tests += 1

report_nb = classification_report(
    y_true_all_nb,
    y_pred_all_nb,
    output_dict=True,
    zero_division=0
)

report_nb_df = pd.DataFrame(report_nb).transpose()
print(report_nb_df)
print(f"Naive Bayes Rank-1: {rank1_nb / total_tests:.3f} ({rank1_nb}/{total_tests})")
print(f"Naive Bayes Rank-5: {rank5_nb / total_tests:.3f} ({rank5_nb}/{total_tests})")

In [None]:
def naive_bayes_reidentification_cv(X, y, cv_iterator, top_k=5):
    y_true = []
    y_pred = []
    y_topk = []

    for train_idx, test_idx in cv_iterator:
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = Pipeline([
            ("scaler", StandardScaler()),
            ("gnb", GaussianNB())
        ])

        model.fit(X_train, y_train)

        # Top-1
        y_hat = model.predict(X_test)[0]
        y_true.append(y_test[0])
        y_pred.append(y_hat)

        # Top-k (re-identification relevant)
        proba = model.predict_proba(X_test)[0]
        classes = model.named_steps["gnb"].classes_
        topk = classes[np.argsort(proba)[::-1][:top_k]]
        y_topk.append(y_test[0] in topk)

    acc = accuracy_score(y_true, y_pred)
    topk_acc = np.mean(y_topk)

    return acc, topk_acc


In [None]:
acc, top5 = naive_bayes_reidentification_cv(
    X,
    y,
    reidentification_cv(df_all),
    top_k=5
)

print(f"Naive Bayes Accuracy: {acc:.3f}")
print(f"Naive Bayes Top-5 Accuracy: {top5:.3f}")


In [None]:
comparison_df = pd.DataFrame({
    "Model": ["LogReg", "SVM", "Bayes", "Siamese"],
    "Precision (macro)": [
        report_df.loc["macro avg", "precision"],
        report_svm_df.loc["macro avg", "precision"],
        report_nb_df.loc["macro avg", "precision"],
        np.nan  # Gonna add manually
    ],
    "Recall (macro)": [
        report_df.loc["macro avg", "recall"],
        report_svm_df.loc["macro avg", "recall"],
        report_nb_df.loc["macro avg", "recall"],
        np.nan
    ],
    "F1 (macro)": [
        report_df.loc["macro avg", "f1-score"],
        report_svm_df.loc["macro avg", "f1-score"],
        report_nb_df.loc["macro avg", "f1-score"],
        np.nan
    ],
    "Rank-1": [
        rank1_logreg / total_tests,
        rank1_svm / total_tests,
        rank1_nb / total_tests,
        0.484  # Siamese res
    ],
    "Rank-5": [
        rank5_logreg / total_tests,
        rank5_svm / total_tests,
        rank5_nb / total_tests,
        0.600  # Siamese res
    ]
})

print(comparison_df)