In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.utils.class_weight import compute_class_weight
from sklearn.base import clone
import xgboost as xgb

In [None]:
def prepare_data(df):
    """Split a raw, header-less dataset frame into features and labels.

    The code treats the frame as one sample per column: the first row
    supplies the column (sample) names, the last row supplies the class
    labels, and every row in between is one feature. Samples whose label
    cell is missing are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame as read with ``header=None``.

    Returns
    -------
    X : pandas.DataFrame
        Float feature matrix with one row per labeled sample.
    y : pandas.Series
        Integer class labels, in the same order as the rows of ``X``.

    Raises
    ------
    ValueError
        If a feature or label cell of a labeled sample is not numeric.
    """
    df_clean = df.iloc[1:].copy()
    df_clean.columns = df.iloc[0]  # First row as column names

    labels = df_clean.iloc[-1]
    features = df_clean.iloc[:-1]

    # Select labeled samples by position, not by name: a name-based lookup
    # returns every column sharing a name once per occurrence, so repeated
    # sample names in the header row would silently duplicate samples.
    is_labeled = labels.notna().to_numpy()

    X = features.loc[:, is_labeled].astype(float).T
    # Cast through float first so labels stored as text such as "1.0"
    # convert cleanly instead of raising on the direct int conversion.
    y = labels[is_labeled].astype(float).astype(int)

    return X, y

In [None]:
def compute_dataset_weights(datasets):
    """Return one weight per dataset, inversely proportional to its size.

    A dataset's size is the number of labels returned by ``prepare_data``.
    Each weight is ``total_size / dataset_size``, so datasets with fewer
    labeled samples receive larger weights.
    """
    # Only the label vector is needed; its length is the dataset's size.
    labeled_counts = [len(prepare_data(frame)[1]) for frame in datasets]
    grand_total = sum(labeled_counts)
    return [grand_total / count for count in labeled_counts]

In [None]:
def load_dataset_with_row_labels(filepath):
    """Read a dataset file into a DataFrame without treating any row as a header.

    Parameters
    ----------
    filepath : str or os.PathLike
        Path to the file. Paths ending in ``.csv`` (in any letter case) are
        read with ``pandas.read_csv``; everything else is handed to
        ``pandas.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The file contents with default integer column labels.
    """
    # str() accepts pathlib.Path inputs, and lower() keeps files named
    # e.g. "DATA.CSV" from being sent to the Excel reader.
    if str(filepath).lower().endswith('.csv'):
        return pd.read_csv(filepath, header=None)
    return pd.read_excel(filepath, header=None)

In [None]:
def get_class_weight_dict(y):
    """Map each distinct label in ``y`` to its weight from ``compute_class_weight``.

    The weights are requested with ``class_weight='balanced'`` over the
    unique labels found in ``y``.
    """
    distinct_labels = np.unique(y)
    balanced = compute_class_weight(class_weight='balanced', classes=distinct_labels, y=y)
    return {label: weight for label, weight in zip(distinct_labels, balanced)}

In [None]:
def _with_scaler(step_name, estimator):
    """Build a two-step pipeline: a fresh StandardScaler followed by ``estimator``."""
    return Pipeline([('scaler', StandardScaler()), (step_name, estimator)])


# Linear discriminant analysis with the 'lsqr' solver and automatic shrinkage.
lda_pipeline = _with_scaler('lda', LDA(solver='lsqr', shrinkage='auto'))

# XGBoost classifier and the hyper-parameter grid searched for it.
xgb_pipeline = _with_scaler(
    'xgb',
    xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=0),
)
xgb_params = {
    'xgb__n_estimators': [50, 100],
    'xgb__max_depth': [3, 4],
    'xgb__learning_rate': [0.05, 0.1],
}

# Random forest with fixed settings (no grid search).
rf_pipeline = _with_scaler(
    'rf',
    RandomForestClassifier(n_estimators=100, max_depth=None, random_state=0),
)

# RBF-kernel SVM and the hyper-parameter grid searched for it.
svm_pipeline = _with_scaler('svc', SVC(kernel='rbf', random_state=0))
svm_params = {
    'svc__C': [0.1, 1.0, 10],
    'svc__gamma': ['scale', 0.01, 0.001],
}

In [None]:
def plot_confusion_matrix_with_totals(y_true, y_pred, labels):
    """Render a confusion matrix with row/column totals as a matplotlib table.

    Each class-vs-class cell shows the count followed by its percentage of
    the true-class row; the extra "Total" row and column show plain sums.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    labels : sequence
        Class labels to include, in display order.
    """
    true_names = [f"{l}_true" for l in labels]
    pred_names = [f"{l}_pred" for l in labels]
    counts = pd.DataFrame(
        confusion_matrix(y_true, y_pred, labels=labels),
        index=true_names,
        columns=pred_names,
    )

    # Append the margins: per-row totals first, then per-column totals
    # (which therefore also include the grand total in the corner).
    counts["Total"] = counts.sum(axis=1)
    column_totals = counts.sum(axis=0).rename("Total")
    counts = pd.concat([counts, pd.DataFrame([column_totals])])

    # Start from plain string counts and overwrite only the class cells.
    cell_text = counts.astype(str)
    n_classes = len(labels)
    for r in range(n_classes):
        # Everything except the trailing "Total" column belongs to this row's classes.
        row_total = counts.iloc[r, :-1].sum()
        for c in range(n_classes):
            hits = counts.iat[r, c]
            if row_total > 0:
                share = hits / row_total * 100
            else:
                share = 0
            cell_text.iat[r, c] = f"{hits} ({share:.0f}%)"

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.axis("off")
    rendered = ax.table(
        cellText=cell_text.values,
        rowLabels=cell_text.index,
        colLabels=cell_text.columns,
        loc="center",
        cellLoc="center",
    )
    rendered.auto_set_font_size(False)
    rendered.set_fontsize(10)
    rendered.scale(1.2, 1.2)
    ax.set_title("Confusion Matrix with Totals", pad=20)
    plt.show()

In [None]:
# Raw dataset files, keyed by the short name used when reporting results.
dataset_paths = {
    'ds2014': r"C:\Users\Paolo\OneDrive\Desktop\Thesis\Mycos_old_data (1)\width_extraction\result\merged_14_20_with_avarage.xlsx",
    'ds2005': r"C:\Users\Paolo\OneDrive\Desktop\Thesis\Mycos_old_data (1)\width_extraction\result\merged_widths_200_Ok_SAM_2005.csv",
    'ds2025': r"C:\Users\Paolo\OneDrive\Desktop\Thesis\Mycos_old_data (1)\Mycos_old_data\20250613_Cuc_Mycos_RGB\ready_for_testing_2025.xlsx",
}

# Load the files in the order they are listed above.
ds2014, ds2005, ds2025 = [
    load_dataset_with_row_labels(path) for path in dataset_paths.values()
]

datasets = [ds2014, ds2005, ds2025]
names = list(dataset_paths)

In [None]:
def _apply_imbalance_handling(model_name, model, class_weight_dict, y_train):
    """Configure a freshly cloned model in place to compensate for class imbalance.

    RandomForest and SVM receive explicit per-class weights. XGBoost receives
    ``scale_pos_weight = n_neg / n_pos``, counting label 1 as positive and
    label 0 as negative. Any other model (e.g. LDA) is left unchanged.
    """
    if model_name == "RandomForest":
        model.set_params(rf__class_weight=class_weight_dict)
    elif model_name == "SVM":
        # The SVM pipeline is wrapped in GridSearchCV, so set it on `.estimator`.
        model.estimator.set_params(svc__class_weight=class_weight_dict)
    elif model_name == "XGBoost":
        n_pos = int((y_train == 1).sum())
        n_neg = int((y_train == 0).sum())
        if n_pos > 0:
            model.estimator.set_params(xgb__scale_pos_weight=n_neg / n_pos)


def run_all_combinations(datasets, names):
    """Run a leave-one-dataset-out evaluation of every model.

    For each dataset in turn (last one first), all remaining datasets are
    concatenated into a training set and the held-out dataset is used as the
    test set. Every model is cross-validated on the training set, refitted on
    all of it, and scored on the test set; the cross-validation accuracy, the
    test accuracy and a confusion-matrix figure are emitted per model.

    Parameters
    ----------
    datasets : sequence of pandas.DataFrame
        Raw frames accepted by ``prepare_data``.
    names : sequence of str
        Display name for each dataset, in the same order as ``datasets``.

    Raises
    ------
    ValueError
        If ``datasets`` and ``names`` differ in length, fewer than two
        datasets are given, or a training set has no labeled samples.
    """
    if len(datasets) != len(names):
        raise ValueError(
            f"Got {len(datasets)} datasets but {len(names)} names; they must match."
        )
    if len(datasets) < 2:
        raise ValueError("At least two datasets are required (one to train, one to test).")

    # Parse every dataset once; each one is reused across several combinations.
    prepared = [prepare_data(df) for df in datasets]
    all_idxs = range(len(prepared))

    # Holding out the last dataset first reproduces the order
    # (0+1 -> 2), (0+2 -> 1), (1+2 -> 0) when three datasets are given.
    for test_idx in reversed(all_idxs):
        train_idxs = [idx for idx in all_idxs if idx != test_idx]
        train_name = " + ".join(names[idx] for idx in train_idxs)
        test_name = names[test_idx]

        # Prepare train data
        X_train = pd.concat([prepared[idx][0] for idx in train_idxs]).reset_index(drop=True)
        y_train = pd.concat([prepared[idx][1] for idx in train_idxs]).reset_index(drop=True)
        if y_train.empty:
            raise ValueError(f"No labeled training samples in: {train_name}")

        # Prepare test data
        X_test, y_test = prepared[test_idx]

        # Dynamic CV: a stratified split cannot usefully have more folds than
        # the smallest class has members, and needs at least two folds.
        smallest_class = int(y_train.value_counts().min())
        n_splits = max(2, min(5, smallest_class))
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

        # Class weights
        class_weight_dict = get_class_weight_dict(y_train)

        # Models
        model_defs = {
            "LDA": lda_pipeline,
            "XGBoost": GridSearchCV(xgb_pipeline, xgb_params, cv=cv, scoring='accuracy', n_jobs=-1),
            "RandomForest": rf_pipeline,
            "SVM": GridSearchCV(svm_pipeline, svm_params, cv=cv, scoring='accuracy', n_jobs=-1)
        }

        for model_name, model_template in model_defs.items():
            print(f"\n▶️ Model: {model_name} | Train: {train_name} | Test: {test_name}")

            # Work on a clone so the shared module-level pipelines stay untouched.
            model = clone(model_template)
            _apply_imbalance_handling(model_name, model, class_weight_dict, y_train)

            # CV fit: report the scores instead of discarding them.
            cv_scores = cross_val_score(model, X_train, y_train, cv=cv)
            print(f"CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

            # Final train
            model.fit(X_train, y_train)

            # Predictions
            predictions = model.predict(X_test)
            acc = accuracy_score(y_test, predictions)
            print(f"Test Accuracy: {acc:.4f}")

            # Include classes that occur only in the test set so their samples
            # are not dropped from the matrix while still counting in accuracy.
            plot_labels = np.union1d(model.classes_, y_test)
            plot_confusion_matrix_with_totals(y_test, predictions, labels=plot_labels)


In [None]:
# Evaluate every model on every train/test combination of the loaded datasets.
run_all_combinations(datasets=datasets, names=names)