In [1]:
import csv
import os

import math
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.metrics import roc_curve, auc

## Load Data

In [2]:
# load y data
participant_info_df = pd.read_csv("../Model_Input/participant_info.csv")
participant_info_df.sort_values('id', inplace=True) #make sure, its alphabetically sorted

participants = {}

participants = participant_info_df.set_index('id')['ASC'].to_dict()

# rename participant ids to their corresponding dataframe name
for k in list(participants.keys()):
    new_key = k + "_concat.csv"
    participants[new_key] = participants.pop(k)

In [3]:
# Load X Data
method_frames = {}

for (root, dirs, file) in os.walk("../Model_Input/whole_video"):
    for f in file:
        if ".csv" in f:
            path = root + "/" + f
            df = pd.read_csv(path)  
            # get rid of unknown participant
            df = df.drop(df.loc[df['id'] == 'pre-91-020_part_1_concat.csv'].index)
            
            method_frames[f] = df

print(method_frames.keys())

dict_keys(['libreface_stats_complete.csv', 'megraph_stats_complete.csv', 'openface_stats_complete.csv'])


### LOOCV

In [4]:
def loocv(data_features, labels):
    
    # create loocv procedure
    cv = LeaveOneOut()
    # create model instance
    bst = XGBClassifier()  #(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')
    # evaluate model
    base_scores = cross_val_score(bst, data_features, labels, scoring='accuracy', cv=cv, n_jobs=-1) # 'precision', 'recall'
    
    return base_scores

### Inner LOOCV

In [30]:
def inner_loocv(X, y):
    classifier = XGBClassifier()
    # Setup cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    stats = []
    models = []
    
    # Perform cross-validation
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Train the model
        classifier.fit(X_train, y_train)
        models.append(classifier)

        # Make predictions
        predictions = classifier.predict(X_test)

        # Calculate accuracy
        # TODO check that
        report = classification_report(y_test, predictions)
        stats.append(report)
    
    return stats

In [24]:
def get_data(method_name):
    df = method_frames[method_name]
    
    # make sure it's sorted by participant in alph. order
    df = df.sort_values('id')
    
    cleaned_data = df.drop(columns=["id", "Unnamed: 0"])
    return cleaned_data

In [25]:
# Test if participant labels are correct with data:
data = method_frames['openface_stats_complete.csv']
zipped = zip(list(data["id"]), list(participants.keys()))
for l1,l2 in zipped:
    if not l1 in l2:
        print(f"{l1} != {l2}")
print("If no line was printed, the data X and label y match")

If no line was printed, the data X and label y match


## Replicate Openface stats as in paper

In [26]:
X_of = get_data('openface_stats_complete.csv')
labels = np.array(list(participants.values()))

In [27]:
#X_of.columns

In [28]:
# report performance
of_loocv_scores = loocv(X_of, labels)
print('Accuracy for all data: %.3f (%.3f)' % (np.mean(of_loocv_scores), np.std(of_loocv_scores)))

Accuracy for all data: 0.640 (0.480)


In [31]:
stats = inner_loocv(X_of, labels)

for s in stats:
    print(s)

              precision    recall  f1-score   support

           0       0.64      0.56      0.60        16
           1       0.63      0.71      0.67        17

    accuracy                           0.64        33
   macro avg       0.64      0.63      0.63        33
weighted avg       0.64      0.64      0.63        33

              precision    recall  f1-score   support

           0       0.47      0.50      0.48        16
           1       0.50      0.47      0.48        17

    accuracy                           0.48        33
   macro avg       0.49      0.49      0.48        33
weighted avg       0.49      0.48      0.48        33

              precision    recall  f1-score   support

           0       0.59      0.62      0.61        16
           1       0.62      0.59      0.61        17

    accuracy                           0.61        33
   macro avg       0.61      0.61      0.61        33
weighted avg       0.61      0.61      0.61        33

              preci

## LibreFace

In [32]:
X_lf = get_data('libreface_stats_complete.csv')
labels = np.array(list(participants.values()))

In [33]:
#X_lf.columns

In [34]:
# report performance
lf_loocv_scores = loocv(X_lf, labels)
print('Accuracy for all data: %.3f (%.3f)' % (np.mean(lf_loocv_scores), np.std(lf_loocv_scores)))

Accuracy for all data: 0.537 (0.499)


In [39]:
inner_loocv(X_lf, labels)

SyntaxError: can't use starred expression here (2654832627.py, line 1)

## ME-Graph

In [36]:
X_me = get_data('megraph_stats_complete.csv')

In [37]:
me_loocv_scores = loocv(X_me, labels)
print(np.mean(me_loocv_scores))

0.6951219512195121


In [38]:
inner_loocv(X_me, labels)

['              precision    recall  f1-score   support\n\n           0       0.69      0.69      0.69        16\n           1       0.71      0.71      0.71        17\n\n    accuracy                           0.70        33\n   macro avg       0.70      0.70      0.70        33\nweighted avg       0.70      0.70      0.70        33\n',
 '              precision    recall  f1-score   support\n\n           0       0.56      0.56      0.56        16\n           1       0.59      0.59      0.59        17\n\n    accuracy                           0.58        33\n   macro avg       0.58      0.58      0.58        33\nweighted avg       0.58      0.58      0.58        33\n',
 '              precision    recall  f1-score   support\n\n           0       0.69      0.69      0.69        16\n           1       0.71      0.71      0.71        17\n\n    accuracy                           0.70        33\n   macro avg       0.70      0.70      0.70        33\nweighted avg       0.70      0.70      0.