### ***Step 3: Prediction on the full-feature dataset***

In this step, the dataset needs to be split into training and testing sets.

In [2]:
#load data again
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import roc_auc_score, balanced_accuracy_score, f1_score, average_precision_score

# One seed for NumPy's global RNG and for every seeded estimator / split below.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Project root is taken as the parent of the current working directory;
# the CSV inputs are read from its "data" sub-folder.
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT / "data"

# Two input tables, both carrying a "patient_id" column used as the join key.
df_gex = pd.read_csv(DATA_DIR / "mammacarcinoma_gex.csv")
df_pat = pd.read_csv(DATA_DIR / "mammacarcinoma_pat.csv")

# Inner join: only patients present in both tables are kept.
df_full = pd.merge(df_pat, df_gex, on="patient_id", how="inner")

# Every column of the expression table except the join key is a model feature.
gene_cols = df_gex.columns.drop("patient_id").tolist()

# Full-feature models (no PCA)
def make_preprocessor():
    """Return a fresh, unfitted preprocessing pipeline.

    Steps: median imputation of missing values followed by standardization.
    A new instance is built on every call because scikit-learn pipelines fit
    their steps in place (no cloning); sharing one instance between several
    model pipelines would let each fit overwrite the imputer/scaler state
    the other models rely on.
    """
    return Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])


# Kept so that any cell referring to `pre` directly keeps working.
pre = make_preprocessor()

# Each model owns an independent preprocessor; step names ("pre", "model")
# are unchanged, so `named_steps["model"]` lookups still work.
models = {
    "LogReg": Pipeline([
        ("pre", make_preprocessor()),
        ("model", LogisticRegression(
            max_iter=5000, class_weight="balanced", random_state=RANDOM_STATE
        ))
    ]),
    "RF": Pipeline([
        ("pre", make_preprocessor()),
        ("model", RandomForestClassifier(
            n_estimators=500, class_weight="balanced_subsample",
            random_state=RANDOM_STATE, n_jobs=-1
        ))
    ]),
    "kNN": Pipeline([
        ("pre", make_preprocessor()),
        ("model", KNeighborsClassifier(n_neighbors=11, weights="distance"))
    ]),
}


In [3]:
# Evaluate every model on every target using one stratified 80/20 hold-out split
# per target; one metrics row is collected per (target, model) pair.
targets = ["er", "node", "relapse"]
all_rows = []

for target in targets:
    if target not in df_full.columns:
        print(f" Target '{target}' fehlt in df_full -> übersprungen.")
        continue

    # Rows without a label for this target are dropped, so N differs per target.
    df_t = df_full.dropna(subset=[target]).copy()
    df_t[target] = df_t[target].astype(int)

    X = df_t[gene_cols]
    y = df_t[target]

    # A stratified split needs at least 2 samples in every class, and
    # ROC-AUC / average precision need both classes to be present.
    class_counts = y.value_counts()
    if len(class_counts) < 2 or class_counts.min() < 2:
        print(f" Target '{target}': too few samples per class for a stratified split -> skipped.")
        continue

    # stratified split per target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.20,
        random_state=RANDOM_STATE,
        stratify=y
    )

    # PosRate is the mean of the integer labels (assumes 0/1 coding -- TODO confirm).
    print(f"\nTarget={target} | N={len(y)} | PosRate={y.mean():.3f} | Train={len(y_train)} Test={len(y_test)}")

    for model_name, est in models.items():
        est.fit(X_train, y_train)

        # Continuous scores for the ranking metrics (ROC-AUC / AP); fall back
        # to hard predictions only if the estimator offers nothing better.
        if hasattr(est, "predict_proba"):
            y_score = est.predict_proba(X_test)[:, 1]
        elif hasattr(est, "decision_function"):
            y_score = est.decision_function(X_test)
        else:
            y_score = est.predict(X_test)

        y_pred = est.predict(X_test)

        all_rows.append({
            "target": target,
            "model": model_name,
            "roc_auc": roc_auc_score(y_test, y_score),
            "balanced_acc": balanced_accuracy_score(y_test, y_pred),
            # zero_division=0: a model that predicts no positives gets F1 = 0.0
            # explicitly instead of triggering an undefined-metric warning.
            "f1": f1_score(y_test, y_pred, zero_division=0),
            "avg_precision": average_precision_score(y_test, y_score),
        })

df_metrics_full = pd.DataFrame(all_rows).sort_values(["target", "model"]).reset_index(drop=True)
df_metrics_full



Target=er | N=308 | PosRate=0.851 | Train=246 Test=62

Target=node | N=247 | PosRate=0.223 | Train=197 Test=50

Target=relapse | N=218 | PosRate=0.413 | Train=174 Test=44


Unnamed: 0,target,model,roc_auc,balanced_acc,f1,avg_precision
0,er,LogReg,0.951782,0.833333,0.972477,0.991041
1,er,RF,0.966457,0.666667,0.946429,0.993941
2,er,kNN,0.951782,0.611111,0.938053,0.991186
3,node,LogReg,0.762238,0.597902,0.352941,0.513052
4,node,RF,0.801865,0.5,0.0,0.526094
5,node,kNN,0.724942,0.5,0.0,0.493294
6,relapse,LogReg,0.690171,0.655983,0.518519,0.697632
7,relapse,RF,0.614316,0.508547,0.1,0.569952
8,relapse,kNN,0.559829,0.544872,0.26087,0.554315


In [4]:
# Quick look at how the gene columns are named: total count plus both ends of the list.
head_genes = gene_cols[:10]
tail_genes = gene_cols[-10:]

print("Number of gene columns:", len(gene_cols))
print("First 10 genes:", head_genes)
print("Last 10 genes:", tail_genes)


Number of gene columns: 6384
First 10 genes: ['DDR1', 'RFC2', 'HSPA6', 'PAX8', 'GUCA1A', 'THRA', 'PTPN21', 'EPHB3', 'ESRRA', 'CYP2A6']
Last 10 genes: ['CASP8AP2', 'POLM', 'KLK5', 'SH3BP4', 'SPO11', 'TRDN', 'DKFZP564C196', 'YDD19', 'TAZ', 'GMEB2']


In [5]:
# Refit LogReg on an explicitly named target and inspect its coefficient matrix.
# The training split is rebuilt here instead of reusing loop variables left over
# from another cell, so the result does not depend on execution order.
COEF_TARGET = "relapse"  # last entry of the evaluation loop's target list

df_coef = df_full.dropna(subset=[COEF_TARGET])
X_coef = df_coef[gene_cols]
y_coef = df_coef[COEF_TARGET].astype(int)

# Same split parameters as the evaluation loop -> identical training rows.
X_train_coef, X_test_coef, y_train_coef, y_test_coef = train_test_split(
    X_coef, y_coef,
    test_size=0.20,
    random_state=RANDOM_STATE,
    stratify=y_coef
)

est = models["LogReg"]
est.fit(X_train_coef, y_train_coef)

coefs = est.named_steps["model"].coef_
# One coefficient per input feature is expected.
assert coefs.shape[1] == len(gene_cols), "coefficient count does not match number of gene columns"
print("coef_ shape:", coefs.shape)  # expected: (1, n_features)
print("n_features from coef_:", coefs.shape[1])


coef_ shape: (1, 6384)
n_features from coef_: 6384
