In [2]:
import GEOparse

In [3]:
# Load GEO series GSE68951 through GEOparse, using ../data as the destination
# directory (the log below shows an already-present local file being reused).
# NOTE(review): the call emits one DEBUG log line per sample, which floods the
# cell output — consider lowering the log verbosity before sharing.
gse = GEOparse.get_GEO("GSE68951", destdir="../data")
# Print the one-line series summary.
print(gse)

15-Dec-2025 17:07:23 DEBUG utils - Directory ../data already exists. Skipping.
15-Dec-2025 17:07:23 INFO GEOparse - File already exist: using local version.
15-Dec-2025 17:07:23 INFO GEOparse - Parsing ../data/GSE68951_family.soft.gz: 
15-Dec-2025 17:07:24 DEBUG GEOparse - DATABASE: GeoMiame
15-Dec-2025 17:07:24 DEBUG GEOparse - SERIES: GSE68951
15-Dec-2025 17:07:24 DEBUG GEOparse - PLATFORM: GPL16770
15-Dec-2025 17:07:24 DEBUG GEOparse - SAMPLE: GSM1688368
15-Dec-2025 17:07:24 DEBUG GEOparse - SAMPLE: GSM1688369
15-Dec-2025 17:07:24 DEBUG GEOparse - SAMPLE: GSM1688370
15-Dec-2025 17:07:24 DEBUG GEOparse - SAMPLE: GSM1688371
15-Dec-2025 17:07:24 DEBUG GEOparse - SAMPLE: GSM1688372
15-Dec-2025 17:07:24 DEBUG GEOparse - SAMPLE: GSM1688373
15-Dec-2025 17:07:24 DEBUG GEOparse - SAMPLE: GSM1688374
15-Dec-2025 17:07:24 DEBUG GEOparse - SAMPLE: GSM1688375
15-Dec-2025 17:07:24 DEBUG GEOparse - SAMPLE: GSM1688376
15-Dec-2025 17:07:24 DEBUG GEOparse - SAMPLE: GSM1688377
15-Dec-2025 17:07:24 DEBU

<SERIES: GSE68951 - 215 SAMPLES, 1 d(s)>


In [4]:
# Pick a single sample by its GSM accession and preview the last rows of its
# table (the output below shows ID_REF / VALUE columns).
sample = gse.gsms["GSM1688406"]
sample.table.tail()


Unnamed: 0,ID_REF,VALUE
1200,hsa-miR-100,1.936057
1201,hsa-miR-101,4.114362
1202,hsa-miR-1289,1.735477
1203,hsa-miR-1288,2.192123
1204,hsa-miR-105,1.285812


In [5]:
import numpy as np
import pandas as pd

In [6]:
def extract_full_info(gse):
    """Collect per-sample metadata and expression values into one DataFrame.

    For every sample in ``gse.gsms`` the ``characteristics_ch1`` metadata
    entries are scanned for ``"patient id: ..."`` and ``"timepoint: ..."``
    items. Samples lacking either field are skipped.

    Parameters
    ----------
    gse : object
        Series object exposing ``gsms``, a mapping of GSM id -> sample, where
        each sample has a ``metadata`` dict and a ``table`` DataFrame with
        ``ID_REF`` and ``VALUE`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per retained sample with columns ``gsm``, ``patient``,
        ``timepoint`` (int), ``mirna_names`` (array of ``ID_REF`` values) and
        ``expression`` (float array of ``VALUE``). The columns are present
        even when no sample qualifies.

    Raises
    ------
    ValueError
        If a ``timepoint`` entry does not hold an integer.
    """
    # Fixed schema so an empty result still has the expected columns and
    # downstream code such as groupby("patient") does not fail with KeyError.
    columns = ["gsm", "patient", "timepoint", "mirna_names", "expression"]

    records = []
    for gsm_id, sample in gse.gsms.items():
        patient_id, timepoint = None, None

        # A sample without characteristics simply has no patient/timepoint
        # and is skipped below instead of aborting the whole extraction.
        for item in sample.metadata.get("characteristics_ch1", []):
            # Split on the FIRST colon only, so values that themselves contain
            # a colon (e.g. "patient id: P:7") are not truncated.
            value = item.partition(":")[2].strip()
            if item.startswith("patient id"):
                patient_id = value or None
            elif item.startswith("timepoint"):
                timepoint = int(value)

        if patient_id is None or timepoint is None:
            continue

        records.append({
            "gsm": gsm_id,
            "patient": patient_id,
            "timepoint": timepoint,
            "mirna_names": sample.table["ID_REF"].values,
            "expression": sample.table["VALUE"].values.astype(float),
        })

    return pd.DataFrame(records, columns=columns)

In [7]:
# Build the per-sample table for the whole series (one row per GSM sample that
# has both a patient id and a timepoint) and preview the first rows.
df_all = extract_full_info(gse)
df_all.head()

Unnamed: 0,gsm,patient,timepoint,mirna_names,expression
0,GSM1688368,A,1,"[hsa-miR-507, hsa-miR-548d-5p, hsa-miR-1976, h...","[1.854979099, 2.184182338, 2.532296403, 1.5180..."
1,GSM1688369,A,2,"[hsa-miR-507, hsa-miR-548d-5p, hsa-miR-1976, h...","[1.469763084, 2.320244044, 2.111889095, 1.6172..."
2,GSM1688370,A,3,"[hsa-miR-507, hsa-miR-548d-5p, hsa-miR-1976, h...","[1.44937518, 2.510284729, 2.083508284, 1.23588..."
3,GSM1688371,A,4,"[hsa-miR-507, hsa-miR-548d-5p, hsa-miR-1976, h...","[1.777522456, 2.724678628, 3.012721615, 1.3907..."
4,GSM1688372,A,5,"[hsa-miR-507, hsa-miR-548d-5p, hsa-miR-1976, h...","[1.538218176, 2.030509973, 2.470374907, 1.5635..."


In [10]:
# Number of retained samples per patient label (the output below shows the
# groups are not all the same size).
df_all.groupby("patient").size()


patient
A              8
B              8
C              8
D              8
E              8
F              8
G              8
H              7
I              8
J              8
K              8
L              8
M              8
N              8
O              8
P              8
Q              8
R              8
S              7
T              8
U              8
V              8
W              8
X              7
Y              8
Z              6
ZZ_control    12
dtype: int64

In [11]:
# Baseline pipeline: StandardScaler followed by LogisticRegression(max_iter=5000).
# NOTE(review): in document order this cell comes before the sklearn imports and
# before the definition of evaluate_model_logreg, and X / y are not defined in
# any visible cell — it only works with out-of-order execution. Confirm where
# X and y are built and move this cell below its dependencies.
scaler_classifier = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=5000)
)

# evaluate_model_logreg returns the fitted classifier, so `cfg` holds the
# fitted pipeline rather than a configuration object.
cfg = evaluate_model_logreg(X, y, scaler_classifier)


              precision    recall  f1-score   support

           A       0.00      0.00      0.00         2
           B       0.00      0.00      0.00         2
           C       0.67      1.00      0.80         2
           D       0.00      0.00      0.00         3
           E       0.00      0.00      0.00         3
           F       0.33      0.50      0.40         2
           G       0.00      0.00      0.00         3
           H       0.00      0.00      0.00         2
           I       0.00      0.00      0.00         3
           J       0.33      0.50      0.40         2
           K       0.00      0.00      0.00         2
           L       0.00      0.00      0.00         3
           M       0.00      0.00      0.00         2
           N       0.00      0.00      0.00         2
           O       0.50      0.50      0.50         2
           P       0.00      0.00      0.00         3
           Q       0.50      0.50      0.50         2
           R       0.00    

In [10]:
def evaluate_model_logreg(X, y, classifier=None):
    """Fit a classifier on a stratified hold-out split and print its report.

    Parameters
    ----------
    X, y
        Features and labels, passed straight to ``train_test_split``.
    classifier : estimator, optional
        Object with ``fit`` / ``predict``. When omitted, a
        ``LogisticRegression(max_iter=5000)`` is created.

    Returns
    -------
    The classifier after fitting on the training part of the split.
    """
    model = LogisticRegression(max_iter=5000) if classifier is None else classifier

    # 70/30 split, stratified on the labels, with a fixed seed.
    split_parts = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )
    features_fit, features_eval, labels_fit, labels_eval = split_parts

    model.fit(features_fit, labels_fit)
    predictions = model.predict(features_eval)

    # zero_division=0 reports 0 instead of warning for classes never predicted.
    print(classification_report(labels_eval, predictions, zero_division=0))

    return model


In [12]:
def evaluate_model_logreg_groupKFold(X, y, groups):
    """Evaluate a scaled logistic regression with 5-fold GroupKFold.

    A ``StandardScaler -> LogisticRegression(max_iter=5000)`` pipeline is
    refit on every fold. Per-fold accuracy, macro F1 and the classification
    report are printed, followed by the mean accuracy and mean macro F1.

    Parameters
    ----------
    X, y
        Features and labels; both are indexed with the integer index arrays
        produced by ``GroupKFold.split``.
    groups
        Group label per sample, forwarded to ``GroupKFold.split``.

    Returns
    -------
    None
    """
    pipeline = make_pipeline(
        StandardScaler(),
        LogisticRegression(max_iter=5000),
    )
    splitter = GroupKFold(n_splits=5)
    acc_per_fold, f1_per_fold = [], []

    for fold, (fit_rows, eval_rows) in enumerate(splitter.split(X, y, groups), start=1):
        print(f"\n====== Fold {fold} ======")

        pipeline.fit(X[fit_rows], y[fit_rows])
        truth = y[eval_rows]
        guess = pipeline.predict(X[eval_rows])

        fold_acc = accuracy_score(truth, guess)
        fold_f1 = f1_score(truth, guess, average="macro", zero_division=0)

        print("Accuracy:", fold_acc)
        print("F1 macro:", fold_f1)
        print(classification_report(truth, guess, zero_division=0))

        acc_per_fold.append(fold_acc)
        f1_per_fold.append(fold_f1)

    print("\n=== FINAL LOGREG ===")
    print("Mean Accuracy:", np.mean(acc_per_fold))
    print("Mean F1 Macro:", np.mean(f1_per_fold))


In [13]:
# Run the grouped-CV logistic-regression evaluation.
# NOTE(review): the output below reports Accuracy 0.0 / F1 0.0. Presumably y and
# groups are both the patient label, in which case every held-out patient is
# absent from the training folds and cannot be predicted — confirm how y and
# groups are built (neither is defined in a visible cell).
evaluate_model_logreg_groupKFold(X, y, groups)
# Displays the raw (X, y, groups) tuple, which dumps the full arrays into the
# cell output.
(X, y, groups)


Accuracy: 0.0
F1 macro: 0.0
              precision    recall  f1-score   support

           A       0.00      0.00      0.00       0.0
           B       0.00      0.00      0.00       0.0
           C       0.00      0.00      0.00       0.0
           E       0.00      0.00      0.00       0.0
           H       0.00      0.00      0.00       7.0
           J       0.00      0.00      0.00       8.0
           L       0.00      0.00      0.00       0.0
           M       0.00      0.00      0.00       0.0
           N       0.00      0.00      0.00       0.0
           P       0.00      0.00      0.00       8.0
           Q       0.00      0.00      0.00       0.0
           R       0.00      0.00      0.00       0.0
           S       0.00      0.00      0.00       0.0
           V       0.00      0.00      0.00       8.0
           W       0.00      0.00      0.00       0.0
           Y       0.00      0.00      0.00       0.0
           Z       0.00      0.00      0.00       0.

(array([[1.8549791 , 2.18418234, 2.5322964 , ..., 1.41000193, 3.72745529,
         1.37007761],
        [1.46976308, 2.32024404, 2.11188909, ..., 1.4279172 , 2.45035886,
         1.35776645],
        [1.44937518, 2.51028473, 2.08350828, ..., 1.12366167, 2.571076  ,
         1.06683767],
        ...,
        [1.55135439, 2.09630509, 1.84061179, ..., 1.46840623, 4.03715339,
         0.933782  ],
        [1.58819243, 2.44854081, 2.43935147, ..., 1.26001547, 3.43739097,
         1.32423004],
        [1.96891928, 2.30610466, 2.02487855, ..., 1.21281883, 3.98844345,
         0.86618795]]),
 array(['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B',
        'B', 'B', 'B', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'D', 'D',
        'D', 'D', 'D', 'D', 'D', 'D', 'E', 'E', 'E', 'E', 'E', 'E', 'E',
        'E', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'G', 'G', 'G', 'G',
        'G', 'G', 'G', 'G', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'I', 'I',
        'I', 'I', 'I', 'I', 'I', 'I', 'J', '

In [14]:
def build_svm_pca_pipeline():
    """Return an unfitted scaler -> PCA -> RBF-kernel SVC pipeline.

    The step names ("scaler", "pca", "svm") are the prefixes used when
    addressing step parameters, e.g. ``pca__n_components``.
    """
    steps = [
        ("scaler", StandardScaler()),
        ("pca", PCA()),
        ("svm", SVC(kernel="rbf")),
    ]
    return Pipeline(steps)


In [15]:
def tune_hyperparameters(pipeline, X, y, groups):
    """Grid-search PCA/SVM settings with group-aware CV, scored by MCC.

    Parameters
    ----------
    pipeline
        Estimator exposing ``pca__n_components``, ``svm__C`` and
        ``svm__gamma`` parameters.
    X, y
        Features and labels used for the search.
    groups
        Group label per sample, used to build the 5-fold GroupKFold splits.

    Returns
    -------
    The ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    search_space = {
        "pca__n_components": [5, 10, 15, 20],
        "svm__C": [0.1, 1, 10],
        "svm__gamma": ["scale", 0.01, 0.001],
    }

    # The splits are generated up front from the groups and handed to the
    # search as an iterable of (train, test) index pairs.
    group_folds = GroupKFold(n_splits=5).split(X, y, groups)

    search = GridSearchCV(
        estimator=pipeline,
        param_grid=search_space,
        scoring=make_scorer(matthews_corrcoef),
        cv=group_folds,
        n_jobs=-1,
        verbose=2,
    )
    search.fit(X, y)

    print("Best Params:", search.best_params_)
    print("Best MCC:", search.best_score_)

    return search.best_estimator_


In [16]:
def evaluate_model(model, X, y, groups):
    """Evaluate an estimator with 5-fold GroupKFold and print the metrics.

    Each fold is fitted on a fresh clone of ``model``, so the estimator passed
    in is left untouched. Fitting ``model`` itself would overwrite an already
    trained estimator (for example one refit on the full data by a grid
    search) with a fit on only the last fold's training subset.

    Parameters
    ----------
    model
        Estimator with ``fit`` / ``predict``. It is copied per fold and never
        fitted in place.
    X, y
        Features and labels; both are indexed with the integer index arrays
        produced by ``GroupKFold.split``.
    groups
        Group label per sample, forwarded to ``GroupKFold.split``.

    Returns
    -------
    None
        Per-fold accuracy, macro F1 and classification report are printed,
        followed by the mean accuracy and mean macro F1.
    """
    # Imported locally so the function is self-contained regardless of which
    # import cell has been executed.
    from sklearn.base import clone

    gkf = GroupKFold(n_splits=5)
    accuracies = []
    f1s = []

    for fold, (train_idx, test_idx) in enumerate(gkf.split(X, y, groups), start=1):

        print(f"\n====== Fold {fold} ======")

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # safe=False falls back to a deep copy for objects that are not
        # scikit-learn estimators, so duck-typed models still work.
        fold_model = clone(model, safe=False)
        fold_model.fit(X_train, y_train)
        y_pred = fold_model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)

        print("Accuracy:", acc)
        print("F1:", f1)
        print(classification_report(y_test, y_pred, zero_division=0))

        accuracies.append(acc)
        f1s.append(f1)

    print("\nFINAL RESULTS")
    print("Mean Accuracy:", np.mean(accuracies))
    print("Mean F1 Macro:", np.mean(f1s))


In [17]:
def run_svm_pca_experiment(X, y, groups):
    """Build, tune and evaluate the SVM+PCA pipeline in one call.

    Runs ``build_svm_pca_pipeline``, ``tune_hyperparameters`` and
    ``evaluate_model`` in that order, printing a progress line before each
    stage.

    Returns
    -------
    The estimator returned by ``tune_hyperparameters`` (the same object that
    was handed to ``evaluate_model``).
    """
    print("Building model")
    candidate = build_svm_pca_pipeline()

    print("Tuning hyperparameters")
    tuned = tune_hyperparameters(candidate, X, y, groups)

    print("Evaluating final model")
    evaluate_model(tuned, X, y, groups)

    return tuned


In [18]:
# Run the full SVM+PCA experiment (build -> grid search -> grouped CV).
# NOTE(review): the output below reports Best MCC 0.0 and Accuracy 0.0;
# presumably the class labels coincide with the CV groups, so held-out classes
# are never seen in training — confirm how y and groups are defined.
best_model = run_svm_pca_experiment(X, y, groups)


Building model
Tuning hyperparameters
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Params: {'pca__n_components': 5, 'svm__C': 0.1, 'svm__gamma': 'scale'}
Best MCC: 0.0
Evaluating final model

Accuracy: 0.0
F1: 0.0
              precision    recall  f1-score   support

           A       0.00      0.00      0.00       0.0
           B       0.00      0.00      0.00       0.0
           C       0.00      0.00      0.00       0.0
           E       0.00      0.00      0.00       0.0
           H       0.00      0.00      0.00       7.0
           I       0.00      0.00      0.00       0.0
           J       0.00      0.00      0.00       8.0
           K       0.00      0.00      0.00       0.0
           M       0.00      0.00      0.00       0.0
           N       0.00      0.00      0.00       0.0
           O       0.00      0.00      0.00       0.0
           P       0.00      0.00      0.00       8.0
           R       0.00      0.00      0.00       0.0
       

In [8]:
import numpy as np
import pandas as pd

import GEOparse

from sklearn.model_selection import train_test_split, GroupKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.decomposition import PCA

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    matthews_corrcoef,
    make_scorer
)


In [19]:
def reidentification_cv(df):
    """Yield leave-one-sample-out splits, visiting samples patient by patient.

    Every row of ``df`` is used as the test set exactly once, with all other
    rows as the training set. Rows are visited grouped by the ``patient``
    column (groups in sorted key order, rows in their original order within a
    group).

    The yielded arrays contain row POSITIONS (0 .. len(df)-1), not index
    labels, so they can be used directly to index NumPy arrays aligned with
    ``df`` even when ``df`` does not have a default integer index. For a
    default ``RangeIndex`` the positions equal the labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``patient`` column.

    Yields
    ------
    (train_idx, test_idx) : tuple of numpy.ndarray
        ``train_idx`` holds every position except the held-out one, in
        ascending order; ``test_idx`` is a one-element array.
    """
    positions = np.arange(len(df))

    # Group positions (not labels) by patient so the result is independent of
    # whatever index df happens to carry.
    by_patient = pd.Series(positions).groupby(df["patient"].to_numpy())

    for _, patient_positions in by_patient:
        for test_pos in patient_positions.to_numpy():
            yield np.delete(positions, test_pos), np.array([test_pos])


In [22]:
from sklearn.metrics import classification_report

# Re-identification experiment with logistic regression:
# standardize -> PCA -> LogisticRegression, evaluated by holding out one sample
# at a time (see reidentification_cv) and pooling all predictions.
clf = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=0.9)),  # keep enough components to explain 90% of the variance
    ("logreg", LogisticRegression(max_iter=5000))
])


# Pooled ground-truth labels and predictions across all folds.
y_true_all = []
y_pred_all = []

# NOTE(review): `fold` is never used inside the loop.
for fold, (train_idx, test_idx) in enumerate(reidentification_cv(df_all), 1):

    # The indices come from df_all's index but are applied to X / y —
    # assumes X and y rows are aligned with df_all and that df_all has a
    # default integer index; TODO confirm.
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # The pipeline is refit from scratch for every held-out sample.
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    y_true_all.extend(y_test)
    y_pred_all.extend(y_pred)

# One report over the pooled predictions; output_dict=True returns a nested
# dict so it can be turned into a DataFrame below.
report = classification_report(
    y_true_all,
    y_pred_all,
    output_dict=True,
    zero_division=0
)

# Transpose so each row is a class / summary entry and each column a metric.
report_df = pd.DataFrame(report).transpose()
report_df


Unnamed: 0,precision,recall,f1-score,support
A,0.0,0.0,0.0,8.0
B,0.214286,0.375,0.272727,8.0
C,0.375,0.75,0.5,8.0
D,0.75,0.375,0.5,8.0
E,0.4,0.25,0.307692,8.0
F,0.6,0.75,0.666667,8.0
G,0.5,0.375,0.428571,8.0
H,0.6,0.428571,0.5,7.0
I,0.75,0.375,0.5,8.0
J,0.714286,0.625,0.666667,8.0


In [23]:
from sklearn.metrics import classification_report

# Re-identification experiment with an RBF-kernel SVM:
# standardize -> PCA (n_components=0.9) -> SVC(C=10, gamma="scale",
# class_weight="balanced"), evaluated by holding out one sample at a time
# (see reidentification_cv) and pooling all predictions.
# NOTE(review): apart from the estimator and variable names, this cell repeats
# the logistic-regression cell; a shared helper taking the pipeline as an
# argument would remove the duplication.
clf_svm = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=0.9)),
    ("svm", SVC(
        kernel="rbf",
        C=10,
        gamma="scale",
        class_weight="balanced"
    ))
])

# Pooled ground-truth labels and predictions across all folds.
y_true_all_svm = []
y_pred_all_svm = []

# NOTE(review): `fold` is never used inside the loop.
for fold, (train_idx, test_idx) in enumerate(reidentification_cv(df_all), 1):

    # The indices come from df_all's index but are applied to X / y —
    # assumes X and y rows are aligned with df_all and that df_all has a
    # default integer index; TODO confirm.
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # The pipeline is refit from scratch for every held-out sample.
    clf_svm.fit(X_train, y_train)
    y_pred = clf_svm.predict(X_test)

    y_true_all_svm.extend(y_test)
    y_pred_all_svm.extend(y_pred)

# One report over the pooled predictions, as a nested dict.
report_svm = classification_report(
    y_true_all_svm,
    y_pred_all_svm,
    output_dict=True,
    zero_division=0
)

# Transpose so each row is a class / summary entry and each column a metric.
report_svm_df = pd.DataFrame(report_svm).transpose()
report_svm_df


Unnamed: 0,precision,recall,f1-score,support
A,0.05,0.125,0.071429,8.0
B,0.235294,0.5,0.32,8.0
C,0.384615,0.625,0.47619,8.0
D,0.6,0.375,0.461538,8.0
E,1.0,0.125,0.222222,8.0
F,0.666667,0.75,0.705882,8.0
G,0.0,0.0,0.0,8.0
H,0.4,0.285714,0.333333,7.0
I,0.0,0.0,0.0,8.0
J,0.428571,0.375,0.4,8.0


In [24]:
# Side-by-side macro-averaged precision / recall / F1 of the two
# re-identification models, taken from the "macro avg" row of each report.
logreg_macro = report_df.loc["macro avg"]
svm_macro = report_svm_df.loc["macro avg"]

comparison_df = pd.DataFrame(
    {
        "LogReg_precision": logreg_macro["precision"],
        "LogReg_recall": logreg_macro["recall"],
        "LogReg_f1": logreg_macro["f1-score"],
        "SVM_precision": svm_macro["precision"],
        "SVM_recall": svm_macro["recall"],
        "SVM_f1": svm_macro["f1-score"],
    },
    index=["macro avg"],
)

comparison_df


Unnamed: 0,LogReg_precision,LogReg_recall,LogReg_f1,SVM_precision,SVM_recall,SVM_f1
macro avg,0.587437,0.486111,0.511068,0.468396,0.405864,0.400535
