In [106]:
import GEOparse

In [107]:
gse = GEOparse.get_GEO("GSE68951", destdir=".")
print(gse)

04-Dec-2025 13:08:35 DEBUG utils - Directory . already exists. Skipping.
04-Dec-2025 13:08:35 INFO GEOparse - File already exist: using local version.
04-Dec-2025 13:08:35 INFO GEOparse - Parsing ./GSE68951_family.soft.gz: 
04-Dec-2025 13:08:35 DEBUG GEOparse - DATABASE: GeoMiame
04-Dec-2025 13:08:35 DEBUG GEOparse - SERIES: GSE68951
04-Dec-2025 13:08:35 DEBUG GEOparse - PLATFORM: GPL16770
04-Dec-2025 13:08:35 DEBUG GEOparse - SAMPLE: GSM1688368
04-Dec-2025 13:08:35 DEBUG GEOparse - SAMPLE: GSM1688369
04-Dec-2025 13:08:35 DEBUG GEOparse - SAMPLE: GSM1688370
04-Dec-2025 13:08:35 DEBUG GEOparse - SAMPLE: GSM1688371
04-Dec-2025 13:08:35 DEBUG GEOparse - SAMPLE: GSM1688372
04-Dec-2025 13:08:35 DEBUG GEOparse - SAMPLE: GSM1688373
04-Dec-2025 13:08:35 DEBUG GEOparse - SAMPLE: GSM1688374
04-Dec-2025 13:08:35 DEBUG GEOparse - SAMPLE: GSM1688375
04-Dec-2025 13:08:35 DEBUG GEOparse - SAMPLE: GSM1688376
04-Dec-2025 13:08:35 DEBUG GEOparse - SAMPLE: GSM1688377
04-Dec-2025 13:08:35 DEBUG GEOparse -

<SERIES: GSE68951 - 215 SAMPLES, 1 d(s)>


In [108]:
sample = gse.gsms["GSM1688406"]
sample.table.head()


Unnamed: 0,ID_REF,VALUE
0,hsa-miR-507,1.6973
1,hsa-miR-548d-5p,2.528559
2,hsa-miR-1976,2.317002
3,hsa-miR-429,0.702423
4,hsa-miR-1973,1.225079


In [109]:
import numpy as np
import pandas as pd

In [110]:
def extract_full_info(gse):

    data = [] 
    for gsm_id, sample in gse.gsms.items():

        meta = sample.metadata["characteristics_ch1"]

        patient_id, timepoint, disease = None, None, None
        
        for item in meta:
            if item.startswith("patient id"):
                patient_id = item.split(":")[1].strip()
            if item.startswith("timepoint"):
                timepoint = int(item.split(":")[1].strip())
            if item.startswith("disease"):
                disease = item.split(":")[1].strip()
        if patient_id is None or timepoint is None:
            continue

        mirnas = sample.table["ID_REF"].values
        expr   = sample.table["VALUE"].values.astype(float)

        data.append({
            "gsm": gsm_id,
            "patient": patient_id,
            "timepoint": timepoint,
            "mirna_names": mirnas,
            "expression": expr
        })

    return pd.DataFrame(data)

In [148]:
df_all = extract_full_info(gse)
df_all.head()

Unnamed: 0,gsm,patient,timepoint,mirna_names,expression
0,GSM1688368,A,1,"[hsa-miR-507, hsa-miR-548d-5p, hsa-miR-1976, h...","[1.854979099, 2.184182338, 2.532296403, 1.5180..."
1,GSM1688369,A,2,"[hsa-miR-507, hsa-miR-548d-5p, hsa-miR-1976, h...","[1.469763084, 2.320244044, 2.111889095, 1.6172..."
2,GSM1688370,A,3,"[hsa-miR-507, hsa-miR-548d-5p, hsa-miR-1976, h...","[1.44937518, 2.510284729, 2.083508284, 1.23588..."
3,GSM1688371,A,4,"[hsa-miR-507, hsa-miR-548d-5p, hsa-miR-1976, h...","[1.777522456, 2.724678628, 3.012721615, 1.3907..."
4,GSM1688372,A,5,"[hsa-miR-507, hsa-miR-548d-5p, hsa-miR-1976, h...","[1.538218176, 2.030509973, 2.470374907, 1.5635..."


In [29]:
df_all.groupby("patient").size()


patient
A              8
B              8
C              8
D              8
E              8
F              8
G              8
H              7
I              8
J              8
K              8
L              8
M              8
N              8
O              8
P              8
Q              8
R              8
S              7
T              8
U              8
V              8
W              8
X              7
Y              8
Z              6
ZZ_control    12
dtype: int64

In [32]:
df_all.groupby("patient")["timepoint"].apply(list)

patient
A                            [1, 2, 3, 4, 5, 6, 7, 8]
B                            [1, 2, 3, 4, 5, 6, 7, 8]
C                            [1, 2, 3, 4, 5, 6, 7, 8]
D                            [1, 2, 3, 4, 5, 6, 7, 8]
E                            [1, 2, 3, 4, 5, 6, 7, 8]
F                            [1, 2, 3, 4, 5, 6, 7, 8]
G                            [1, 2, 3, 4, 5, 6, 7, 8]
H                               [1, 2, 3, 4, 5, 6, 7]
I                            [1, 2, 3, 4, 5, 6, 7, 8]
J                            [1, 2, 3, 4, 5, 6, 7, 8]
K                            [1, 2, 3, 4, 5, 6, 7, 8]
L                            [1, 2, 3, 4, 5, 6, 7, 8]
M                            [1, 2, 3, 4, 5, 6, 7, 8]
N                            [1, 2, 3, 4, 5, 6, 7, 8]
O                            [1, 2, 3, 4, 5, 6, 7, 8]
P                            [1, 2, 3, 4, 5, 6, 7, 8]
Q                            [1, 2, 3, 4, 5, 6, 7, 8]
R                            [1, 2, 3, 4, 5, 6, 7, 8]
S                   

In [112]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

scaler_classifier = make_pipeline(StandardScaler(), LogisticRegression(max_iter = 5000))
y = df_all["patient"].values
X = np.vstack(df_all["expression"].values)


In [113]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

def evaluate_model_logreg(X, y, classifier=None):

    if classifier is None:
        classifier = LogisticRegression(max_iter=5000)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)

    print(classification_report(y_test, y_pred, zero_division=0))

    return classifier


In [114]:
cfg = evaluate_model_logreg(X, y, scaler_classifier)
cfg

              precision    recall  f1-score   support

           A       0.00      0.00      0.00         2
           B       0.00      0.00      0.00         2
           C       0.67      1.00      0.80         2
           D       0.00      0.00      0.00         3
           E       0.00      0.00      0.00         3
           F       0.33      0.50      0.40         2
           G       0.00      0.00      0.00         3
           H       0.00      0.00      0.00         2
           I       0.00      0.00      0.00         3
           J       0.33      0.50      0.40         2
           K       0.00      0.00      0.00         2
           L       0.00      0.00      0.00         3
           M       0.00      0.00      0.00         2
           N       0.00      0.00      0.00         2
           O       0.50      0.50      0.50         2
           P       0.00      0.00      0.00         3
           Q       0.50      0.50      0.50         2
           R       0.00    

0,1,2
,steps,"[('standardscaler', ...), ('logisticregression', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,5000


In [128]:
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report, matthews_corrcoef, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
def evaluate_model_logreg_groupKFold(X, y, groups):
    gkf = GroupKFold(n_splits=5)
    accuracies = []
    f1s = []
    fold = 1

    clf = make_pipeline(StandardScaler(),LogisticRegression(max_iter=5000))
    

    for train_idx, test_idx in gkf.split(X, y, groups):
        print(f"\n====== Fold {fold} ======")
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="macro")

        print("Accuracy:", acc)
        print("F1 macro:", f1)
        print(classification_report(y_test, y_pred, zero_division=0))

        accuracies.append(acc)
        f1s.append(f1)
        fold += 1

    print("\n=== FINAL LOGREG ===")
    print("Mean Accuracy:", np.mean(accuracies))
    print("Mean F1 Macro:", np.mean(f1s))

In [125]:
evaluate_model_logreg_groupKFold(X, y, groups)


Accuracy: 0.0
F1 macro: 0.0
              precision    recall  f1-score   support

           A       0.00      0.00      0.00       0.0
           B       0.00      0.00      0.00       0.0
           C       0.00      0.00      0.00       0.0
           E       0.00      0.00      0.00       0.0
           H       0.00      0.00      0.00       7.0
           J       0.00      0.00      0.00       8.0
           L       0.00      0.00      0.00       0.0
           M       0.00      0.00      0.00       0.0
           N       0.00      0.00      0.00       0.0
           P       0.00      0.00      0.00       8.0
           Q       0.00      0.00      0.00       0.0
           R       0.00      0.00      0.00       0.0
           S       0.00      0.00      0.00       0.0
           V       0.00      0.00      0.00       8.0
           W       0.00      0.00      0.00       0.0
           Y       0.00      0.00      0.00       0.0
           Z       0.00      0.00      0.00       0.

In [139]:
def build_svm_pca_pipeline():
    return Pipeline([
        ("scaler", StandardScaler()),
        ("pca", PCA()),
        ("svm", SVC(kernel="rbf"))
    ])

In [144]:
def tune_hyperparameters(pipeline, X, y, groups):
    param_grid = {
        "pca__n_components": [5, 10, 15, 20],
        "svm__C": [0.1, 1, 10],
        "svm__gamma": ["scale", 0.01, 0.001]
    }

    mcc_scorer = make_scorer(matthews_corrcoef)
    gkf = GroupKFold(n_splits=5)

    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring=mcc_scorer,
        cv=gkf.split(X, y, groups),
        n_jobs=-1,
        verbose=2
    )

    grid.fit(X, y)

    print("Best Params:", grid.best_params_)
    print("Best MCC:", grid.best_score_)

    return grid.best_estimator_


In [145]:
def evaluate_model(model, X, y, groups):
    gkf = GroupKFold(n_splits=5)
    accuracies = []
    f1s = []
    fold = 1

    for train_idx, test_idx in gkf.split(X, y, groups):
        print(f"\n====== Fold {fold} ======")

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="macro")

        print("Accuracy:", acc)
        print("F1:", f1)
        print(classification_report(y_test, y_pred, zero_division=0))

        accuracies.append(acc)
        f1s.append(f1)
        fold += 1

    print("\nFINAL RESULTS")
    print("Mean Accuracy:", np.mean(accuracies))
    print("Mean F1 Macro:", np.mean(f1s))


In [146]:
def run_svm_pca_experiment(X, y, groups):
    print("Building model")
    pipeline = build_svm_pca_pipeline()

    print("Tuning hyperparameters")
    best_model = tune_hyperparameters(pipeline, X, y, groups)

    print("Evaluating final model")
    evaluate_model(best_model, X, y, groups)

    return best_model


In [147]:
best_model = run_svm_pca_experiment(X, y, groups)


üîß Building model...
üîç Tuning hyperparameters...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Params: {'pca__n_components': 5, 'svm__C': 0.1, 'svm__gamma': 'scale'}
Best MCC: 0.0

üìä Evaluating final model...

Accuracy: 0.0
F1: 0.0
              precision    recall  f1-score   support

           A       0.00      0.00      0.00       0.0
           B       0.00      0.00      0.00       0.0
           C       0.00      0.00      0.00       0.0
           E       0.00      0.00      0.00       0.0
           H       0.00      0.00      0.00       7.0
           I       0.00      0.00      0.00       0.0
           J       0.00      0.00      0.00       8.0
           K       0.00      0.00      0.00       0.0
           M       0.00      0.00      0.00       0.0
           N       0.00      0.00      0.00       0.0
           O       0.00      0.00      0.00       0.0
           P       0.00      0.00      0.00       8.0
           R       0.00      0.00   