In [1]:
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold, StratifiedKFold, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
def train_and_evaluate_lda(dataframe, k=5, random_state=42):
    """Cross-validate a StandardScaler + LDA pipeline and report averaged metrics.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input table, addressed purely by position: the first row is dropped,
        the column at position 1 is used as the class label, and every column
        from position 2 onward is used as a feature. Column 0 is not used.
        NOTE(review): the first row is presumably a header/units row that was
        read in as data -- confirm against the loader.
    k : int, default 5
        Number of cross-validation folds. Must be an integer >= 2.
    random_state : int, default 42
        Seed for the shuffled fold assignment, so repeated runs are reproducible.

    Returns
    -------
    dict
        Keys 'accuracy', 'precision', 'recall' and 'f1_score', each mapped to
        the mean of the per-fold test score (precision/recall/F1 are
        macro-averaged). Values are the raw scorer outputs; only the printed
        report multiplies them by 100.

    Raises
    ------
    ValueError
        If ``k`` is not an integer >= 2.
    """
    # Fail fast with a clear message instead of deep inside the splitter.
    if not isinstance(k, (int, np.integer)) or k < 2:
        raise ValueError(f"k must be an integer >= 2, got {k!r}")

    # --- Data Preparation ---
    df = dataframe.iloc[1:].copy()  # Skip the first row
    X = df.iloc[:, 2:]
    y = df.iloc[:, 1]

    # --- Pipeline: StandardScaler + LDA ---
    # Keeping the scaler inside the pipeline means it is re-fitted on each
    # fold's training split only, so no test-fold statistics leak into training.
    lda_pipeline = make_pipeline(
        StandardScaler(),
        LinearDiscriminantAnalysis()
    )

    # --- Cross Validation Setup ---
    # Stratified folds keep the class proportions of y in every split; plain
    # KFold can leave a class under-represented (or absent) in a fold, which
    # destabilises the macro-averaged metrics below.
    cv = StratifiedKFold(n_splits=k, shuffle=True, random_state=random_state)
    scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']

    # --- Perform Cross Validation ---
    cv_results = cross_validate(lda_pipeline, X, y, cv=cv, scoring=scoring)

    # --- Calculate Average Metrics ---
    # Maps each reported metric name to the corresponding cross_validate key.
    result_keys = {
        'accuracy':  'test_accuracy',
        'precision': 'test_precision_macro',
        'recall':    'test_recall_macro',
        'f1_score':  'test_f1_macro',
    }
    metrics = {
        metric_name: np.mean(cv_results[result_key])
        for metric_name, result_key in result_keys.items()
    }

    # --- Print and Return Results ---
    print(f"LDA CV Metrics (averaged over {k} folds):")
    for metric_name, value in metrics.items():
        print(f"{metric_name.capitalize()}: {value*100:.2f}")

    return metrics