In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_validate
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [11]:
def train_and_evaluate(dataframe, k=5):
    """Cross-validate a StandardScaler + LDA pipeline and report mean scores.

    The first row of ``dataframe`` is dropped (the reason is not visible in
    this notebook -- confirm against the CSV layout). The column at
    position 1 is used as the target and every column from position 2
    onward as a feature.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input table; only positional column access is used.
    k : int, default 5
        Number of folds passed to ``KFold``.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1_score'``
        mapped to the mean test score across the ``k`` folds. Precision,
        recall and F1 are macro-averaged. Values are the raw means; they are
        multiplied by 100 only for the printed summary.
    """
    # Positional split: drop row 0, column 1 -> target, columns 2.. -> features.
    rows = dataframe.iloc[1:].copy()
    features = rows.iloc[:, 2:]
    target = rows.iloc[:, 1]

    # Features are standardized inside the pipeline, so the scaler is re-fit
    # on the training portion of every fold.
    model = make_pipeline(StandardScaler(), LinearDiscriminantAnalysis())

    # Shuffled folds with a fixed seed keep the splits reproducible.
    splitter = KFold(n_splits=k, shuffle=True, random_state=42)
    scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
    fold_scores = cross_validate(model, features, target, cv=splitter, scoring=scoring)

    # Reported metric name -> key under which cross_validate stores its
    # per-fold test scores.
    result_key_for = {
        'accuracy': 'test_accuracy',
        'precision': 'test_precision_macro',
        'recall': 'test_recall_macro',
        'f1_score': 'test_f1_macro',
    }
    metrics = {
        label: np.mean(fold_scores[result_key])
        for label, result_key in result_key_for.items()
    }

    # Human-readable summary: means scaled by 100 and shown with two decimals.
    print(f"LDA CV Metrics (averaged over {k} folds):")
    for metric_name, value in metrics.items():
        print(f"{metric_name.capitalize()}: {value*100:.2f}")

    return metrics

In [12]:
# Combined dataset: load the CSV and cross-validate the scaled-LDA pipeline
# with the default k=5 folds. The returned metrics dict is the cell's output.
# NOTE: `df` is rebound by every dataset cell in this notebook.
df = pd.read_csv('Datasets/combined_dataset.csv')
train_and_evaluate(df)

LDA CV Metrics (averaged over 5 folds):
Accuracy: 90.90
Precision: 84.49
Recall: 86.76
F1_score: 85.51


{'accuracy': np.float64(0.909018390496174),
 'precision': np.float64(0.8448515163188596),
 'recall': np.float64(0.8676326572938964),
 'f1_score': np.float64(0.8551237847111638)}

In [13]:
# Motor only dataset: load the CSV and cross-validate the scaled-LDA pipeline
# with the default k=5 folds. The returned metrics dict is the cell's output.
# NOTE: `df` is rebound by every dataset cell in this notebook.
df = pd.read_csv('Datasets/motor_only.csv')
train_and_evaluate(df)

LDA CV Metrics (averaged over 5 folds):
Accuracy: 90.20
Precision: 83.97
Recall: 86.13
F1_score: 84.90


{'accuracy': np.float64(0.9020280767659699),
 'precision': np.float64(0.8396749753124993),
 'recall': np.float64(0.861260267324643),
 'f1_score': np.float64(0.8490301871417258)}

In [14]:
# Non-motor only dataset: load the CSV and cross-validate the scaled-LDA
# pipeline with the default k=5 folds. The returned metrics dict is the
# cell's output.
# NOTE: `df` is rebound by every dataset cell in this notebook.
df = pd.read_csv('Datasets/non_motor_only.csv')
train_and_evaluate(df)

LDA CV Metrics (averaged over 5 folds):
Accuracy: 78.96
Precision: 68.42
Recall: 65.54
F1_score: 66.79


{'accuracy': np.float64(0.7895714370502189),
 'precision': np.float64(0.684155325517747),
 'recall': np.float64(0.6553544102096054),
 'f1_score': np.float64(0.6679492992519267)}

In [15]:
# Objective only dataset: load the CSV and cross-validate the scaled-LDA
# pipeline with the default k=5 folds. The returned metrics dict is the
# cell's output.
# NOTE: `df` is rebound by every dataset cell in this notebook.
df = pd.read_csv('Datasets/objective_only.csv')
train_and_evaluate(df)

LDA CV Metrics (averaged over 5 folds):
Accuracy: 89.37
Precision: 83.18
Recall: 85.70
F1_score: 84.26


{'accuracy': np.float64(0.8937382637332713),
 'precision': np.float64(0.831837941600638),
 'recall': np.float64(0.8570270895183343),
 'f1_score': np.float64(0.842568039411405)}

In [16]:
# Self Report only dataset: load the CSV and cross-validate the scaled-LDA
# pipeline with the default k=5 folds. The returned metrics dict is the
# cell's output.
# NOTE: `df` is rebound by every dataset cell in this notebook.
df = pd.read_csv('Datasets/self_report_only.csv')
train_and_evaluate(df)

LDA CV Metrics (averaged over 5 folds):
Accuracy: 84.52
Precision: 73.98
Recall: 73.56
F1_score: 73.67


{'accuracy': np.float64(0.8452002316206011),
 'precision': np.float64(0.7398370148022086),
 'recall': np.float64(0.7356360249853078),
 'f1_score': np.float64(0.7367441010558747)}