In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_validate
from sklearn.ensemble import RandomForestClassifier

In [8]:
def train_and_evaluate(dataframe, k=5, n_estimators=100, skip_rows=1, random_state=42):
    """Cross-validate a random-forest classifier and report fold-averaged metrics.

    The frame is read positionally: column 1 is the target, columns 2 onward
    are the features, and column 0 is not used.  The first ``skip_rows`` rows
    are dropped before modelling.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input table with at least three columns (unused, target, features...).
        It is not modified; a copy of the retained rows is taken.
    k : int, default 5
        Number of shuffled K-fold splits; must be at least 2.
    n_estimators : int, default 100
        Number of trees passed to ``RandomForestClassifier``.
    skip_rows : int, default 1
        Number of leading rows to discard before modelling; must be >= 0.
    random_state : int or None, default 42
        Seed shared by the forest and the fold shuffling.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1_score'``;
        each value is the mean of the per-fold test score (precision, recall
        and F1 are macro-averaged), expressed as a fraction in [0, 1].

    Raises
    ------
    ValueError
        If ``k`` or ``skip_rows`` is invalid, if the frame has fewer than
        three columns, or if fewer than ``k`` rows remain after skipping.
    """
    # --- Input validation (fail early with a readable message) ---
    if not isinstance(k, (int, np.integer)) or k < 2:
        raise ValueError(f"k must be an integer >= 2, got {k!r}")
    if not isinstance(skip_rows, (int, np.integer)) or skip_rows < 0:
        raise ValueError(f"skip_rows must be a non-negative integer, got {skip_rows!r}")
    if dataframe.shape[1] < 3:
        raise ValueError(
            "dataframe needs at least 3 columns (column 1 = target, columns 2+ = features), "
            f"got {dataframe.shape[1]}"
        )

    # --- Data preparation ---
    # NOTE(review): pandas.read_csv consumes the header line by default, so with
    # the default skip_rows=1 the first *data* record is dropped. The default is
    # kept for backward compatibility -- confirm that this is intended.
    df = dataframe.iloc[skip_rows:].copy()
    if len(df) < k:
        raise ValueError(
            f"need at least k={k} rows after skipping {skip_rows}, got {len(df)}"
        )
    X = df.iloc[:, 2:]
    y = df.iloc[:, 1]

    # --- Model ---
    rf_model = RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state, n_jobs=-1
    )

    # --- Cross-validation setup ---
    # NOTE(review): KFold does not look at the labels when splitting; if the
    # classes are imbalanced a stratified splitter may give steadier estimates.
    cv = KFold(n_splits=k, shuffle=True, random_state=random_state)
    # Reported metric name -> scikit-learn scorer name.
    scorers = {
        'accuracy': 'accuracy',
        'precision': 'precision_macro',
        'recall': 'recall_macro',
        'f1_score': 'f1_macro',
    }

    # --- Perform cross-validation ---
    cv_results = cross_validate(rf_model, X, y, cv=cv, scoring=list(scorers.values()))

    # --- Average each metric over the folds ---
    # cross_validate stores per-fold test scores under 'test_<scorer name>'.
    metrics = {
        name: np.mean(cv_results[f'test_{scorer}'])
        for name, scorer in scorers.items()
    }

    # --- Report (values shown as percentages) and return ---
    print(f"Random Forest CV Metrics (averaged over {k} folds):")
    for metric_name, value in metrics.items():
        print(f"{metric_name.capitalize()}: {value*100:.2f}")

    return metrics

In [9]:
# Combined dataset: load the CSV, then run the cross-validated evaluation.
df = pd.read_csv(filepath_or_buffer='Datasets/combined_dataset.csv')
train_and_evaluate(dataframe=df)

Random Forest CV Metrics (averaged over 5 folds):
Accuracy: 92.97
Precision: 92.89
Recall: 81.52
F1_score: 85.76


{'accuracy': np.float64(0.9296915266460948),
 'precision': np.float64(0.9288821313357885),
 'recall': np.float64(0.8151532059234359),
 'f1_score': np.float64(0.8576059733227224)}

In [10]:
# Motor-only dataset: load the CSV, then run the cross-validated evaluation.
df = pd.read_csv(filepath_or_buffer='Datasets/motor_only.csv')
train_and_evaluate(dataframe=df)

Random Forest CV Metrics (averaged over 5 folds):
Accuracy: 92.96
Precision: 91.36
Recall: 84.52
F1_score: 87.41


{'accuracy': np.float64(0.9295917761719059),
 'precision': np.float64(0.9135658570271292),
 'recall': np.float64(0.8452193639484378),
 'f1_score': np.float64(0.8741159124707094)}

In [11]:
# Non-motor-only dataset: load the CSV, then run the cross-validated evaluation.
df = pd.read_csv(filepath_or_buffer='Datasets/non_motor_only.csv')
train_and_evaluate(dataframe=df)

Random Forest CV Metrics (averaged over 5 folds):
Accuracy: 84.87
Precision: 79.38
Recall: 69.76
F1_score: 73.30


{'accuracy': np.float64(0.8486961366142595),
 'precision': np.float64(0.7938040917611728),
 'recall': np.float64(0.6976359159302039),
 'f1_score': np.float64(0.7330464521549442)}

In [12]:
# Objective-only dataset: load the CSV, then run the cross-validated evaluation.
df = pd.read_csv(filepath_or_buffer='Datasets/objective_only.csv')
train_and_evaluate(dataframe=df)

Random Forest CV Metrics (averaged over 5 folds):
Accuracy: 92.45
Precision: 91.49
Recall: 83.34
F1_score: 86.70


{'accuracy': np.float64(0.9244979184569798),
 'precision': np.float64(0.9148741021122901),
 'recall': np.float64(0.8334280305134705),
 'f1_score': np.float64(0.8670407521237037)}

In [13]:
# Self-report-only dataset: load the CSV, then run the cross-validated evaluation.
df = pd.read_csv(filepath_or_buffer='Datasets/self_report_only.csv')
train_and_evaluate(dataframe=df)

Random Forest CV Metrics (averaged over 5 folds):
Accuracy: 87.96
Precision: 82.89
Recall: 73.29
F1_score: 76.97


{'accuracy': np.float64(0.8795566889426102),
 'precision': np.float64(0.8288936596961891),
 'recall': np.float64(0.7328678285720065),
 'f1_score': np.float64(0.7696508669738381)}