In [1]:
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from scipy.stats import ttest_rel
from sklearn.svm import SVC
import xgboost as xgb
import numpy as np
import pandas as pd

In [10]:
# Step 1: Load and transpose
# If your data is in an Excel file, you can use:
# df = pd.read_excel('your_file.xlsx')
df = pd.read_csv(r"C:\Users\Paolo\OneDrive\Desktop\Thesis\Mycos_old_data (1)\Mycos_old_data\20250613_Cuc_Mycos_RGB\ready_for_testing_2025.xlsx")
df_T = df.T  # Now rows = samples, columns = features
df_T_clean = df_T.dropna(subset=[df_T.columns[-1]])

In [11]:
# Assume the last column contains labels (adjust if needed)
X = df_T_clean.iloc[:, :-1].values  # All features
y = df_T_clean.iloc[:, -1].values   # Class labels (assumes last column is labels)

y_encoded = y
label_names = np.array([0, 1])

In [12]:
# 🔹 Define Stratified CV
n_samples = len(y_encoded)
n_splits = 5 if n_samples >= 10 else n_samples
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [13]:
def styled_confusion_matrix(y_true, y_pred, labels):
    cm = confusion_matrix(y_true, y_pred, labels=range(len(labels)))
    cm_df = pd.DataFrame(cm, index=[f"{label}_true" for label in labels],
                             columns=[f"{label}_pred" for label in labels])
    cm_percent = cm.astype('float') / cm.sum(axis=1, keepdims=True)
    cm_formatted = cm_df.copy()

    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            count = cm[i, j]
            percent = cm_percent[i, j]
            cm_formatted.iloc[i, j] = f"{count} ({percent:.0%})"

    cm_formatted['Total'] = cm_df.sum(axis=1)
    totals_row = cm_df.sum(axis=0)
    totals_row['Total'] = cm_df.values.sum()
    cm_formatted.loc['Total'] = totals_row.astype(str)

    return cm_formatted

In [14]:
# LDA
lda_pipeline = make_pipeline(
    StandardScaler(),
    LDA(solver='lsqr', shrinkage='auto')
)

# XGBoost + GridSearchCV
xgb_pipeline = make_pipeline(
    StandardScaler(),
    xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=0)
)
xgb_params = {
    'xgbclassifier__n_estimators': [50, 100],
    'xgbclassifier__max_depth': [3, 4],
    'xgbclassifier__learning_rate': [0.05, 0.1]
}
xgb_grid = GridSearchCV(xgb_pipeline, xgb_params, cv=cv, scoring='accuracy', n_jobs=-1)

# Random Forest
rf_pipeline = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(n_estimators=100, max_depth=None, random_state=0)
)

# SVM + GridSearchCV
svm_pipeline = make_pipeline(
    StandardScaler(),
    SVC(kernel='rbf', random_state=0)
)
svm_params = {
    'svc__C': [0.1, 1.0, 10],
    'svc__gamma': ['scale', 0.01, 0.001]
}
svm_grid = GridSearchCV(svm_pipeline, svm_params, cv=cv, scoring='accuracy', n_jobs=-1)

In [15]:
models = {
    "LDA": lda_pipeline,
    "XGBoost": xgb_grid,
    "RandomForest": rf_pipeline,
    "SVM": svm_grid
}

In [16]:
# Encode labels: if not already encoded
label_names = sorted(set(y))  # or provide manually
n_labels = len(label_names)

print("📊 Confusion Matrices (Stratified 5-Fold CV):\n")

for name, model in models.items():
    print(f"\n🔹 {name}")
    y_pred = cross_val_predict(model, X, y_encoded, cv=cv)
    cm = styled_confusion_matrix(y_encoded, y_pred, label_names)
    print(cm)

📊 Confusion Matrices (Stratified 5-Fold CV):


🔹 LDA


  cm_formatted.iloc[i, j] = f"{count} ({percent:.0%})"
  cm_formatted.iloc[i, j] = f"{count} ({percent:.0%})"


          0.0_pred   1.0_pred Total
0.0_true  95 (99%)     1 (1%)    96
1.0_true    2 (1%)  142 (99%)   144
Total           97        143   240

🔹 XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
  cm_formatted.iloc[i, j] = f"{count} ({percent:.0%})"
  cm_formatted.iloc[i, j] = f"{count} ({percent:.0%})"


          0.0_pred   1.0_pred Total
0.0_true  93 (97%)     3 (3%)    96
1.0_true    2 (1%)  142 (99%)   144
Total           95        145   240

🔹 RandomForest


  cm_formatted.iloc[i, j] = f"{count} ({percent:.0%})"
  cm_formatted.iloc[i, j] = f"{count} ({percent:.0%})"


          0.0_pred   1.0_pred Total
0.0_true  93 (97%)     3 (3%)    96
1.0_true    2 (1%)  142 (99%)   144
Total           95        145   240

🔹 SVM
          0.0_pred   1.0_pred Total
0.0_true  94 (98%)     2 (2%)    96
1.0_true    3 (2%)  141 (98%)   144
Total           97        143   240


  cm_formatted.iloc[i, j] = f"{count} ({percent:.0%})"
  cm_formatted.iloc[i, j] = f"{count} ({percent:.0%})"
