# Gaze-based mind wandering detection model training

### Data Location
Data was prepared for 5 iterations of 3-fold cross-validation. Data should be located in the './Data' folder, in one subfolder per condition (e.g. './Data/With_Vergence'). The filenames systematically follow the naming pattern (*training/testing*)\_(*iteration index*)\_(*fold index*).

In each fold, nine randomly selected participants were left out, so that the data from 18 participants were used for training and 
the remaining data for testing. We used a leave-9-participants-out cross-validation because some of the participants had an unbalanced class distribution.

## Notebook setup

In [1]:
# Basic import
import numpy as np
import pandas as pd
import xgboost as xgb

from scipy.io import arff
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, precision_recall_fscore_support, f1_score
from sklearn import preprocessing

In [3]:
class ResultSummary:
    """Running totals of the per-fold evaluation metrics.

    Every total starts at zero as a class-level default; ``+=`` on an
    instance rebinds the attribute on that instance only, so separate
    instances accumulate independently.
    """

    # Sums of: ROC AUC, weighted F1, false-positive rate, precision, recall.
    sumAOC = sumF1 = sumFP = sumPrecision = sumRecall = 0

## 1. Model Training and Testing

### 1.1 XGBoost Training and Testing

In [5]:
def RunXGBoost(condition, iteration, fold, X_train, y_train, X_test, y_test, xgBoostSummary):
    """Train XGBoost on one fold, print its test metrics and accumulate them.

    Args:
        condition: Name of the data condition; only used in the printed line.
        iteration: Cross-validation iteration index; only used in the printed line.
        fold: Fold index; only used in the printed line.
        X_train: Training feature table handed to ``xgb.DMatrix``.
        y_train: Training class labels (two distinct classes are expected,
            matching ``num_class`` below).
        X_test: Test feature table handed to ``xgb.DMatrix``.
        y_test: Test class labels.
        xgBoostSummary: Accumulator object; its ``sumAOC``, ``sumF1``, ``sumFP``,
            ``sumPrecision`` and ``sumRecall`` attributes are incremented in place.

    Returns:
        None. Results are reported by printing and through ``xgBoostSummary``.
    """
    # Use ONE encoder, fitted on the labels of both splits, so that a class
    # name always maps to the same integer in train and test. Re-fitting on
    # the test labels alone would silently change the mapping whenever a
    # split does not contain every class.
    le = preprocessing.LabelEncoder()
    le.fit(np.concatenate([np.asarray(y_train), np.asarray(y_test)]))
    y_train = le.transform(y_train)
    y_test = le.transform(y_test)

    dtrain = xgb.DMatrix(data=X_train, label=y_train)
    dtest = xgb.DMatrix(data=X_test)

    params = {
        'learning_rate': 0.1,
        'max_depth': 10,
        'objective': 'multi:softmax',
        'num_class': 2,
    }

    # Number of boosting rounds is left at the xgb.train default.
    bst = xgb.train(params, dtrain)

    # 'multi:softmax' returns the predicted class index as a float; cast it so
    # the metrics below compare integer labels with integer labels.
    pred = bst.predict(dtest).astype(int)

    # Pin the label set so the matrix is always 2x2, even when the fold's
    # labels and predictions contain a single class.
    TN, FP, FN, TP = confusion_matrix(y_test, pred, labels=[0, 1]).ravel()

    # Fall out or false positive rate (undefined when the test fold has no
    # negative samples, i.e. FP + TN == 0).
    FPR = FP / (FP + TN)

    # NOTE: the AUC is computed from hard class predictions, not from scores.
    AOC = roc_auc_score(y_test, pred)
    # Precision/recall of the positive class (encoded label 1).
    Precision_MW, Recall_MW, _, _ = precision_recall_fscore_support(y_test, pred, average='binary')
    F1 = f1_score(y_test, pred, average='weighted')

    xgBoostSummary.sumAOC += AOC
    xgBoostSummary.sumF1 += F1
    xgBoostSummary.sumFP += FPR
    xgBoostSummary.sumPrecision += Precision_MW
    xgBoostSummary.sumRecall += Recall_MW

    output = "Condition: {} Iteration: {} fold: {} AOC: {} F1: {} FP: {} Pression: {} Recall: {}".format(
        condition,
        iteration,
        fold,
        round(AOC, 4),
        round(F1, 4),
        round(FPR, 4),
        round(Precision_MW, 4),
        round(Recall_MW, 4),
    )
    print(output)

In [7]:
#XGBoost
# Cross-validation layout of the prepared data files; the summary averages
# are derived from these so they cannot drift out of sync with the loops.
N_ITERATIONS = 5
N_FOLDS = 3
# Columns never used as features: two centroid-distance columns and the
# label column 'GT'.
NON_FEATURE_COLUMNS = ['Mean_Centroid_Dist_of_Gazes', 'SD_Centroid_Dist_of_Gazes', 'GT']


def _load_split(condition, split, iteration, fold):
    """Load one ARFF file and return ``(dataframe, features, labels)``.

    ``split`` is the filename prefix, 'training' or 'testing'.
    """
    raw = arff.loadarff('./Data/{}/{}_{}_{}.arff'.format(condition, split, iteration, fold))
    df = pd.DataFrame(raw[0])
    features = df[df.columns.difference(NON_FEATURE_COLUMNS)]
    # The label column is loaded as bytes; decode it to text.
    labels = df['GT'].str.decode('utf-8')
    return df, features, labels


conditions = ['With_Vergence', 'Without_Vergence']
for condition in conditions:
    # Fresh accumulator per condition.
    xgBoostSummary = ResultSummary()

    for iteration in range(N_ITERATIONS):
        for fold in range(N_FOLDS):
            test_df, X_test, y_test = _load_split(condition, 'testing', iteration, fold)
            train_df, X_train, y_train = _load_split(condition, 'training', iteration, fold)

            RunXGBoost(condition, iteration, fold, X_train, y_train, X_test, y_test, xgBoostSummary)

    # Average every accumulated metric over all evaluated folds.
    n_runs = N_ITERATIONS * N_FOLDS
    output = "\nSummary for condition: {} AOC: {} F1: {} FP: {} Pression: {} Recall: {}".format(
        condition,
        round(xgBoostSummary.sumAOC / n_runs, 4),
        round(xgBoostSummary.sumF1 / n_runs, 4),
        round(xgBoostSummary.sumFP / n_runs, 4),
        round(xgBoostSummary.sumPrecision / n_runs, 4),
        round(xgBoostSummary.sumRecall / n_runs, 4),
    )
    print(output)
    print("\n----------------------------\n")