In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

In [44]:
#With oversampling
# Load the dataset
from collections import Counter

def oversampledRF(csv_file):
    data = pd.read_csv(csv_file)

    # Handle missing values
    data.fillna(data.mean(), inplace=True)

    # Separate features and labels
    X = data.iloc[:, 1:].values  # Features
    y = data.iloc[:, 0].values   # Labels

    # Standardize the features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)


    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    # Check label distribution after SMOTE
    print("Label distribution after SMOTE:")
    print(Counter(y_train_resampled))


    # Initialize and train the Random Forest Classifier
    rf_model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
    rf_model.fit(X_train_resampled, y_train_resampled)

    # Make predictions on the test set
    y_pred = rf_model.predict(X_test)

    # Evaluate the model
    print("Confusion Matrix (oversampled):")
    print(confusion_matrix(y_test, y_pred))

    print("\nClassification Report (oversampled):")
    print(classification_report(y_test, y_pred, target_names=["Class 0", "Class 1"]))
    
def originalRF(csv_file):
    data = pd.read_csv(csv_file)

    # Handle missing values
    data.fillna(data.mean(), inplace=True)

    # Separate features and labels
    X = data.iloc[:, 1:].values  # Features
    y = data.iloc[:, 0].values   # Labels

    # Standardize the features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)


    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # smote = SMOTE(random_state=42)
    # X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    # # Check label distribution after SMOTE
    # print("Label distribution after SMOTE:")
    print(Counter(y_train))


    # Initialize and train the Random Forest Classifier
    rf_model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
    rf_model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = rf_model.predict(X_test)

    # Evaluate the model
    print("Confusion Matrix (original):")
    print(confusion_matrix(y_test, y_pred))

    print("\nClassification Report (original):")
    print(classification_report(y_test, y_pred, target_names=["Class 0", "Class 1"]))

csv_file = "./data/merged_data/no_personality/filtered_merged_5seconds_middle.csv"  
oversampledRF(csv_file)
originalRF(csv_file)

Label distribution after SMOTE:
Counter({1: 1265, 0: 1265})
Confusion Matrix (oversampled):
[[300  19]
 [ 27  67]]

Classification Report (oversampled):
              precision    recall  f1-score   support

     Class 0       0.92      0.94      0.93       319
     Class 1       0.78      0.71      0.74        94

    accuracy                           0.89       413
   macro avg       0.85      0.83      0.84       413
weighted avg       0.89      0.89      0.89       413

Counter({0: 1265, 1: 383})
Confusion Matrix (original):
[[311   8]
 [ 35  59]]

Classification Report (original):
              precision    recall  f1-score   support

     Class 0       0.90      0.97      0.94       319
     Class 1       0.88      0.63      0.73        94

    accuracy                           0.90       413
   macro avg       0.89      0.80      0.83       413
weighted avg       0.89      0.90      0.89       413



In [10]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix, accuracy_score, matthews_corrcoef, f1_score
)
from imblearn.over_sampling import SMOTE
import os

def extract_metrics(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    return {
        'TP': tp,
        'TN': tn,
        'FP': fp,
        'FN': fn,
        'Accuracy': accuracy_score(y_true, y_pred),
        'MCC': matthews_corrcoef(y_true, y_pred),
        'F1_macro': f1_score(y_true, y_pred, average='macro'),
        'F1_weighted': f1_score(y_true, y_pred, average='weighted')
    }

def oversampledRF(csv_file, timecards, n_runs=30):
    data = pd.read_csv(csv_file)
    data.fillna(data.mean(), inplace=True)
    X = data.iloc[:, 1:].values
    y = data.iloc[:, 0].values
    X = StandardScaler().fit_transform(X)

    results = []

    for run in range(n_runs):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)

        try:
            X_train_resampled, y_train_resampled = SMOTE(random_state=42).fit_resample(X_train, y_train)
        except ValueError as e:
            print(f"Run {run}: SMOTE failed with: {e}")
            continue

        rf_model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
        rf_model.fit(X_train_resampled, y_train_resampled)
        y_pred = rf_model.predict(X_test)

        metrics = extract_metrics(y_test, y_pred)
        metrics['Run'] = run + 1
        results.append(metrics)

    df = pd.DataFrame(results)
    parent = os.path.basename(os.path.dirname(csv_file))
    # filename = os.path.splitext(os.path.basename(csv_file))[0]
    # tag = f"{parent}_{filename}"
    df.to_csv(f"results/{timecards}/rf/{parent}_oversampled_rf_metrics.csv", index=False)
    print(f"\n[Oversampled RF] Average accuracy over {n_runs} runs: {df['Accuracy'].mean():.4f}")

def originalRF(csv_file, timecards, n_runs=30):
    data = pd.read_csv(csv_file)
    data.fillna(data.mean(), inplace=True)
    X = data.iloc[:, 1:].values
    y = data.iloc[:, 0].values
    X = StandardScaler().fit_transform(X)

    results = []

    for run in range(n_runs):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)

        rf_model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
        rf_model.fit(X_train, y_train)
        y_pred = rf_model.predict(X_test)

        metrics = extract_metrics(y_test, y_pred)
        metrics['Run'] = run + 1
        results.append(metrics)

    df = pd.DataFrame(results)
    parent = os.path.basename(os.path.dirname(csv_file))
    # filename = os.path.splitext(os.path.basename(csv_file))[0]
    # tag = f"{parent}_{filename}"
    df.to_csv(f"results/{timecards}/rf/{parent}_original_rf_metrics.csv", index=False)
    print(f"\n[Original RF] Average accuracy over {n_runs} runs: {df['Accuracy'].mean():.4f}")
    
def runexperiments(file_extension, timecards):
    os.makedirs("results", exist_ok=True)

    all_data = "../data/merged_data/all/" + file_extension
    no_sensor = "../data/merged_data/no_sensor/" + file_extension
    no_fixation = "../data/merged_data/no_fixation/" + file_extension
    no_saccade = "../data/merged_data/no_saccade/" + file_extension
    no_velocity = "../data/merged_data/no_velocity/" + file_extension
    no_distance = "../data/merged_data/no_distance/" + file_extension
    no_personality = "../data/merged_data/no_personality/" + file_extension

    data_list = [ all_data]
    # data_list = [all_data, no_sensor, no_fixation, no_saccade, no_velocity, no_distance, no_personality]
    for data in data_list:
        oversampledRF(data,timecards)
        originalRF(data,timecards)

runexperiments("filtered_merged_5seconds.csv", "5_seconds")



[Oversampled RF] Average accuracy over 30 runs: 0.8937

[Original RF] Average accuracy over 30 runs: 0.8944
