In [1]:
# Scikit-learn models:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import graphviz

# Base packages:
from IPython.display import SVG
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
# Load data:
data = pd.read_csv("../data/Final_Results_Pupil_v2.csv", sep=";")

# Print first 5 rows:
data

Unnamed: 0,Participant,World,TrialName,Task,Duration,Milliseconds,Errors,Baseline,APCPS,MPD,...,MPDC_A,SD_A,PeakDilation_A,Latencytopeak_A,Changeposition_A,Attempts_A,ChangepositionAttemps_A,Errorschangeposition_A,ErrorsAttemps_A,Totalerrors_A
0,1,2,2_5,25,00:00:45,44785.0,0.0,307896.0,",0409",32049.0,...,29259,15489,60057.0,65.0,2.0,0.0,2.0,3.0,1.0,3.0
1,3,2,2_4,24,00:01:53,112464.0,1.0,181298.0,",0524",190801.0,...,23878,13852,49936.0,33.0,0.0,2.0,2.0,0.0,2.0,2.0
2,6,2,2_4,24,00:00:40,39458.0,0.0,270984.0,",0869",294543.0,...,22689,",8128",40133.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6,2,2_5,25,00:00:46,45055.0,0.0,293381.0,",0288",301824.0,...,13159,",8638",30021.0,38.0,1.0,0.0,1.0,1.0,0.0,1.0
4,7,2,2_4,24,00:02:28,147727.0,7.0,255965.0,",0839",27745.0,...,43299,10139,69429.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0
5,7,2,2_5,25,00:00:23,22825.0,0.0,275585.0,",0029",27639.0,...,32252,10105,5764.0,151.0,5.0,9.0,14.0,10.0,14.0,19.0
6,8,2,2_4,24,00:01:47,106656.0,4.0,191137.0,",0789",206213.0,...,30102,11218,58991.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0
7,9,2,2_4,24,00:00:36,35397.0,0.0,235228.0,",0344",243316.0,...,365,",9193",53706.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
8,9,2,2_5,25,00:00:32,31082.0,0.0,205479.0,",1786",242184.0,...,16062,",7782",35325.0,36.0,2.0,2.0,4.0,4.0,4.0,6.0
9,10,2,2_4,24,4.8596064814814812E-4,41987.0,1.0,183643.0,",197",219812.0,...,25888,11392,50655.0,80.0,6.0,0.0,6.0,6.0,0.0,6.0


In [3]:
# Create testing indices.
# Subjects: 3, 12, 20
test_idx = [1, 13, 14, 22, 23, 41, 42, 49]
print("Testing indices:", test_idx)

Testing indices: [1, 13, 14, 22, 23, 41, 42, 49]


In [4]:
def clean_data(data):
    """Utlity function to parse data."""
    for col in range(data.shape[1]):
        for row in range(data.shape[0]):
            try:
                data[row, col] = float(data[row, col].replace(',', '.'))
            except:
                pass

# Model v1

Data used as features:
- Totalerrors
- Milliseconds 
- MPDC
- PeakDilation

In [5]:
# Create features and labels (data):
features = data[['Totalerrors', 'Milliseconds', 'MPDC', 'PeakDilation']].values
labels = data[['World']].values
clean_data(features)

# Create features and labels (names):
feature_names = np.array(['Totalerrors', 'Milliseconds', 'MPDC', 'PeakDilation'])
label_names = np.array(['World 1', 'World 2'])

# Create training data:
train_data = np.delete(features, test_idx, axis=0)
train_lbls = np.delete(labels, test_idx)

# Create testing data:
test_data = features[test_idx]
test_lbls = labels[test_idx]

In [6]:
# Build classifier:
clf = RandomForestClassifier(n_estimators=10, max_depth=None, 
                             min_samples_split=2, random_state=0)
clf.fit(train_data, train_lbls)

# Predict data:
prediction = clf.predict(test_data)
print("Predictions: \t", prediction)

# Compare with labels:
print("Testing labels: ", test_lbls.flatten())
acc_sc = accuracy_score(test_lbls, prediction)
print("Accuracy score: {}%".format(acc_sc*100))

Predictions: 	 [2 2 2 4 2 4 2 4]
Testing labels:  [2 2 2 2 2 4 4 4]
Accuracy score: 75.0%


# Model v2

Data used as features:
- Milliseconds 
- Errors
- Changeposition
- Attempts
- ChangepositionAttemps
- Errorschangeposition
- ErrorsAttemps
- Totalerrors

In [7]:
# Create features and labels (data):
features2 = data[['Milliseconds', 'Errors', 'Changeposition', 'Attempts', 
                  'ChangepositionAttemps', 'Errorschangeposition', 
                  'ErrorsAttemps', 'Totalerrors']].values
labels2 = data[['World']].values
clean_data(features2)

# Create features and labels (names):
feature_names2 = np.array(['Milliseconds', 'Errors', 'Changeposition', 
                           'Attempts', 'ChangepositionAttemps', 
                           'Errorschangeposition',  'ErrorsAttemps', 
                           'Totalerrors'])
label_names2 = np.array(['World 1', 'World 2'])

# Create training data:
train_data2 = np.delete(features2, test_idx, axis=0)
train_lbls2 = np.delete(labels2, test_idx)

# Create testing data:
test_data2 = features2[test_idx]
test_lbls2 = labels2[test_idx]

In [8]:
# Build classifier:
clf2 = RandomForestClassifier(n_estimators=10, max_depth=None, 
                              min_samples_split=2, random_state=0)
clf2.fit(train_data2, train_lbls2)

# Predict data:
prediction2 = clf2.predict(test_data2)
print("Predictions: \t", prediction2)

# Compare with labels:
print("Testing labels: ", test_lbls2.flatten())
acc_sc2 = accuracy_score(test_lbls2, prediction2)
print("Accuracy score: {}%".format(acc_sc2*100))

Predictions: 	 [4 2 2 4 2 4 4 2]
Testing labels:  [2 2 2 2 2 4 4 4]
Accuracy score: 62.5%


# Model v3

Data used as features:
- Milliseconds
- Totalerrors

In [9]:
# Create features and labels (data):
features3 = data[['Milliseconds', 'Totalerrors']].values
labels3 = data[['World']].values
clean_data(features3)

# Create features and labels (names):
feature_names3 = np.array(['Milliseconds', 'Totalerrors'])
label_names3 = np.array(['World 1', 'World 2'])

# Create training data:
train_data3 = np.delete(features3, test_idx, axis=0)
train_lbls3 = np.delete(labels3, test_idx)

# Create testing data:
test_data3 = features3[test_idx]
test_lbls3 = labels3[test_idx]

In [10]:
# Build classifier:
clf3 = RandomForestClassifier(n_estimators=10, max_depth=None, 
                              min_samples_split=2, random_state=0)
clf3.fit(train_data3, train_lbls3)

# Predict data:
prediction3 = clf3.predict(test_data3)
print("Predictions: \t", prediction3)

# Compare with labels:
print("Testing labels: ", test_lbls3.flatten())
acc_sc3 = accuracy_score(test_lbls3, prediction3)
print("Accuracy score: {}%".format(acc_sc3*100))

Predictions: 	 [4 2 2 4 2 4 4 4]
Testing labels:  [2 2 2 2 2 4 4 4]
Accuracy score: 75.0%


# Model v4 (Cognitive Load)

Data used as features:
- MPDC
- PeakDilation

In [11]:
# Create features and labels (data):
features4 = data[['MPDC', 'PeakDilation']].values
labels4 = data[['World']].values
clean_data(features4)

# Create features and labels (names):
feature_names4 = np.array(['MPDC', 'PeakDilation'])
label_names4 = np.array(['World 1', 'World 2'])

# Create training data:
train_data4 = np.delete(features4, test_idx, axis=0)
train_lbls4 = np.delete(labels4, test_idx)

# Create testing data:
test_data4 = features4[test_idx]
test_lbls4 = labels4[test_idx]

In [12]:
# Build classifier:
clf4 = RandomForestClassifier(n_estimators=10, max_depth=None, 
                              min_samples_split=2, random_state=0)
clf4.fit(train_data4, train_lbls4)

# Predict data:
prediction4 = clf4.predict(test_data4)
print("Predictions: \t", prediction4)

# Compare with labels:
print("Testing labels: ", test_lbls4.flatten())
acc_sc4 = accuracy_score(test_lbls4, prediction4)
print("Accuracy score: {}%".format(acc_sc4*100))

Predictions: 	 [2 4 4 4 2 4 4 4]
Testing labels:  [2 2 2 2 2 4 4 4]
Accuracy score: 62.5%


# Model v5

Data used as features:
- PeakDilation
- Milliseconds
- Totalerrors

In [13]:
# Create features and labels (data):
features5 = data[['PeakDilation', 'Milliseconds', 'Totalerrors']].values
labels5 = data[['World']].values
clean_data(features5)

# Create features and labels (names):
feature_names5 = np.array(['PeakDilation', 'Milliseconds', 'Totalerrors'])
label_names5 = np.array(['World 1', 'World 2'])

# Create training data:
train_data5 = np.delete(features5, test_idx, axis=0)
train_lbls5 = np.delete(labels5, test_idx)

# Create testing data:
test_data5 = features5[test_idx]
test_lbls5 = labels5[test_idx]

In [14]:
# Build classifier:
clf5 = RandomForestClassifier(n_estimators=10, max_depth=None, 
                              min_samples_split=2, random_state=0)
clf5.fit(train_data5, train_lbls5)

# Predict data:
prediction5 = clf5.predict(test_data5)
print("Predictions: \t", prediction5)

# Compare with labels:
print("Testing labels: ", test_lbls5.flatten())
acc_sc5 = accuracy_score(test_lbls5, prediction5)
print("Accuracy score: {}%".format(acc_sc5*100))

Predictions: 	 [2 2 2 2 2 4 2 4]
Testing labels:  [2 2 2 2 2 4 4 4]
Accuracy score: 87.5%
