In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [14]:
def train_and_evaluate(dataframe, k=5, random_state=42, stratified=False):
    """Cross-validate a StandardScaler + XGBClassifier pipeline and report mean scores.

    The first row of ``dataframe`` is dropped. The column at position 1 is used
    as the target and every column from position 2 onward as a feature; the
    column at position 0 is not used.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input table with at least three columns, laid out as described above.
    k : int, default 5
        Number of cross-validation folds.
    random_state : int, default 42
        Seed used both for shuffling the folds and for ``XGBClassifier``.
    stratified : bool, default False
        When True, folds are built with ``StratifiedKFold`` so that each fold
        keeps the class proportions of the target. The default keeps the plain
        shuffled ``KFold`` split.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1_score'``,
        each mapped to the mean test score over the ``k`` folds. Precision,
        recall and F1 are macro-averaged. Values are fractions; the printed
        summary shows them multiplied by 100.

    Raises
    ------
    ValueError
        If ``dataframe`` has fewer than three columns.
    """
    n_columns = dataframe.shape[1]
    if n_columns < 3:
        raise ValueError(
            "dataframe needs at least 3 columns (target at position 1, "
            f"features from position 2 onward); got {n_columns}"
        )

    # --- Data Preparation ---
    # NOTE(review): pd.read_csv takes the column names from the first line by
    # default, so this drops the first *data* record -- confirm it is intended.
    df = dataframe.iloc[1:].copy()
    X = df.iloc[:, 2:]
    y = df.iloc[:, 1]

    # --- Pipeline: StandardScaler + XGBClassifier ---
    # Keeping the scaler inside the pipeline means cross_validate refits it on
    # the training part of every fold rather than on the full data set.
    xgb_pipeline = make_pipeline(
        StandardScaler(),
        XGBClassifier(
            n_estimators=100,
            eval_metric='logloss',
            random_state=random_state,
            n_jobs=-1,
        ),
    )

    # --- Cross Validation Setup ---
    splitter_cls = StratifiedKFold if stratified else KFold
    cv = splitter_cls(n_splits=k, shuffle=True, random_state=random_state)

    # Reported metric name -> scikit-learn scorer name.
    scorer_by_metric = {
        'accuracy': 'accuracy',
        'precision': 'precision_macro',
        'recall': 'recall_macro',
        'f1_score': 'f1_macro',
    }

    # --- Perform Cross Validation ---
    cv_results = cross_validate(
        xgb_pipeline, X, y, cv=cv, scoring=list(scorer_by_metric.values())
    )

    # --- Calculate Average Metrics ---
    # cross_validate stores each scorer's per-fold scores under 'test_<scorer>'.
    metrics = {
        metric_name: np.mean(cv_results[f'test_{scorer}'])
        for metric_name, scorer in scorer_by_metric.items()
    }

    # --- Print and Return Results ---
    print(f"XGBoost CV Metrics (averaged over {k} folds):")
    for metric_name, value in metrics.items():
        print(f"{metric_name.capitalize()}: {value*100:.2f}")

    return metrics


In [15]:
# Combined dataset: load the CSV and cross-validate the XGBoost pipeline on it.
# The call is the cell's last expression, so the returned metrics dict is displayed.
df = pd.read_csv(filepath_or_buffer='Datasets/combined_dataset.csv')
train_and_evaluate(dataframe=df)

XGBoost CV Metrics (averaged over 5 folds):
Accuracy: 93.80
Precision: 91.06
Recall: 87.04
F1_score: 88.84


{'accuracy': np.float64(0.9379806913007112),
 'precision': np.float64(0.9106131270998343),
 'recall': np.float64(0.8703597238167895),
 'f1_score': np.float64(0.8883756878001584)}

In [16]:
# Motor-only dataset: load the CSV and cross-validate the XGBoost pipeline on it.
# The call is the cell's last expression, so the returned metrics dict is displayed.
df = pd.read_csv(filepath_or_buffer='Datasets/motor_only.csv')
train_and_evaluate(dataframe=df)

XGBoost CV Metrics (averaged over 5 folds):
Accuracy: 93.27
Precision: 89.99
Recall: 87.11
F1_score: 88.45


{'accuracy': np.float64(0.9326882303916753),
 'precision': np.float64(0.8998662609179254),
 'recall': np.float64(0.8710617502848631),
 'f1_score': np.float64(0.8844517463965621)}

In [17]:
# Non-motor-only dataset: load the CSV and cross-validate the XGBoost pipeline on it.
# The call is the cell's last expression, so the returned metrics dict is displayed.
df = pd.read_csv(filepath_or_buffer='Datasets/non_motor_only.csv')
train_and_evaluate(dataframe=df)

XGBoost CV Metrics (averaged over 5 folds):
Accuracy: 86.09
Precision: 79.93
Recall: 73.41
F1_score: 76.02


{'accuracy': np.float64(0.8608799089078669),
 'precision': np.float64(0.7993200759233035),
 'recall': np.float64(0.7340822992819322),
 'f1_score': np.float64(0.7601504458397902)}

In [18]:
# Objective-only dataset: load the CSV and cross-validate the XGBoost pipeline on it.
# The call is the cell's last expression, so the returned metrics dict is displayed.
df = pd.read_csv(filepath_or_buffer='Datasets/objective_only.csv')
train_and_evaluate(dataframe=df)

XGBoost CV Metrics (averaged over 5 folds):
Accuracy: 92.79
Precision: 90.17
Recall: 86.02
F1_score: 87.88


{'accuracy': np.float64(0.9278938236002638),
 'precision': np.float64(0.9017277475665001),
 'recall': np.float64(0.8601601450405143),
 'f1_score': np.float64(0.8788179282894099)}

In [19]:
# Self-report-only dataset: load the CSV and cross-validate the XGBoost pipeline on it.
# The call is the cell's last expression, so the returned metrics dict is displayed.
df = pd.read_csv(filepath_or_buffer='Datasets/self_report_only.csv')
train_and_evaluate(dataframe=df)

XGBoost CV Metrics (averaged over 5 folds):
Accuracy: 88.93
Precision: 82.12
Recall: 76.85
F1_score: 79.12


{'accuracy': np.float64(0.8893431082148007),
 'precision': np.float64(0.8212001120079883),
 'recall': np.float64(0.7685201152014285),
 'f1_score': np.float64(0.791194423317308)}

In [None]:
"