In [1]:
import datetime
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm 
import sys
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO
from tqdm import tqdm 
    
from IPython.display import HTML
def View(df):
    """Build an HTML display object that shows ``df`` in a pop-up window.

    The returned object carries a ``<script>`` that calls ``window.open`` and
    writes ``df.to_html()`` plus a small stylesheet into the new window's body.
    Whether the script actually runs depends on the notebook front end.

    Parameters
    ----------
    df : object with a ``to_html()`` method (e.g. a pandas DataFrame)

    Returns
    -------
    IPython.display.HTML
    """
    import json  # local import: only needed to build the JavaScript string literal

    css = """<style>
    table { border-collapse: collapse; border: 3px solid #eee; }
    table tr th:first-child { background-color: #eeeeee; color: #333; font-weight: bold }
    table thead th { background-color: #eee; color: #000; }
    tr, th, td { border: 1px solid #ccc; border-width: 1px 0 0 1px; border-collapse: collapse;
    padding: 3px; font-family: monospace; font-size: 10px }</style>
    """
    # json.dumps produces a valid JavaScript string literal, so apostrophes,
    # backslashes and newlines inside the table can no longer break the script.
    # "</" is escaped so the payload can never close the <script> element early.
    payload = json.dumps(df.to_html() + css).replace("</", "<\\/")

    s  = '<script type="text/Javascript">'
    s += 'var win = window.open("", "Title", "toolbar=no, location=no, directories=no, status=no, menubar=no, scrollbars=yes, resizable=yes, width=780, height=200, top="+(screen.height-400)+", left="+(screen.width-840));'
    s += 'win.document.body.innerHTML = ' + payload + ';'
    s += '</script>'
    return HTML(s + css)

# Switch to the project folder; later cells use paths relative to it.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
project_root = r'C:\Users\User\GitHub\WebET_Analysis'
os.chdir(project_root)
print("Current Working directory " , os.getcwd())

Current Working directory  C:\Users\User\GitHub\WebET_Analysis


# Read Data

In [2]:
# Load the three cleaned tables (gaze samples, trials, subjects).
data_et = pd.read_csv(r'C:/Users/User/GitHub/WebET_Analysis/data_jupyter/cleaned/data_et.csv')
data_trial = pd.read_csv(r'C:/Users/User/GitHub/WebET_Analysis/data_jupyter/cleaned/data_trial.csv')
data_subject = pd.read_csv(r'C:/Users/User/GitHub/WebET_Analysis/data_jupyter/cleaned/data_subject.csv')

# Sanity check: number of rows and number of distinct run_ids per table.
loaded_tables = {
    'data_et': data_et,
    'data_trial': data_trial,
    'data_subject': data_subject,
}
print(pd.DataFrame(
    [
        [len(table), len(table['run_id'].unique())]
        for table in loaded_tables.values()
    ],
    columns=['length', 'run'],
    index=list(loaded_tables.keys()))
)

               length  run
data_et       2464333  209
data_trial     111695  209
data_subject      209  209


# Create datasets

## data_trial

In [3]:
# Restrict the trial table to 'eyetracking-choice' trials and to the
# columns listed below.
choice_trial_columns = [
    'run_id', 'chinFirst',
    'task_nr',
    'trial_index', 'trial_type', 'withinTaskIndex',
    'choiceTask_amountLeftFirst',
    'option_topLeft', 'option_bottomLeft',
    'option_topRight', 'option_bottomRight',
    'key_press', 'trial_duration_exact',
    'window_width', 'window_height',
    'fps',
]
is_choice_trial = data_trial['trial_type'] == 'eyetracking-choice'
data_trial = data_trial.loc[is_choice_trial, choice_trial_columns]

## data_et

In [4]:
def add_var_to_data_et(data_et, source_data, varName):
    """Attach one column of ``source_data`` to ``data_et`` via a left merge.

    Rows are matched on ('run_id', 'trial_index'). If ``data_et`` already has
    a column called ``varName`` it is dropped first, so the merged values
    replace it. The input frames are not modified; a new frame is returned.
    """
    merge_keys = ['run_id', 'trial_index']
    if varName in data_et.columns:
        base = data_et.drop(columns=varName)
    else:
        base = data_et
    lookup = source_data.loc[:, merge_keys + [varName]]
    return base.merge(lookup, on=merge_keys, how='left')

# Pull the trial-level columns needed for filtering and indexing into the
# gaze table.
for trial_var in ('trial_type', 'withinTaskIndex'):
    data_et = add_var_to_data_et(data_et, data_trial, trial_var)

# Keep only gaze samples from choice trials, reduced to the analysis columns
# (trial_type is no longer needed once the filter is applied).
is_choice_sample = data_et['trial_type'] == 'eyetracking-choice'
data_et = data_et.loc[
    is_choice_sample,
    ['run_id', 'withinTaskIndex', 'x', 'y', 't_task']
]

print(data_et.columns)

Index(['run_id', 'withinTaskIndex', 'x', 'y', 't_task'], dtype='object')


# Screening

## Reaction time

## Reaction time / Trials too long

In [5]:
# Number of runs that contain at least one trial longer than 10000
# (the cell below labels this cutoff as 10 seconds).
is_too_long = data_trial['trial_duration_exact'] > 10000
print(len(data_trial.loc[is_too_long, 'run_id'].unique()))

23


In [6]:
# Mean and SD of the trial duration, once for all trials and once for the
# trials below the 10000 cutoff.
rt_all = data_trial['trial_duration_exact']
rt_below_cutoff = rt_all[rt_all < 10000]

for label, rt in (
    ('Average reaction time raw: ', rt_all),
    ('Average reaction time below 10 seconds: ', rt_below_cutoff),
):
    print(
        label +
        str(rt.mean()) +
        '\n SD=' +
        str(rt.std())
    )

Average reaction time raw: 2100.1107057416266
 SD=1273.1232237581721
Average reaction time below 10 seconds: 2069.610278244183
 SD=1103.5719166073122


## Not enough fps

In [7]:
# Number of non-missing gaze samples per (run, trial-within-task).
grouped = (
    data_et
    .groupby(['run_id', 'withinTaskIndex'])['x']
    .count()
    .reset_index()
    .rename(columns={'x': 'x_count'})
)

# Re-attach the counts; a stale x_count from an earlier execution of this
# cell is removed first so the merge does not create suffixed duplicates.
data_trial = (
    data_trial
    .drop(columns=['x_count'], errors='ignore')
    .merge(grouped, on=['run_id', 'withinTaskIndex'], how='left')
)

# Samples per second during the trial (trial duration is divided by 1000).
data_trial['fps_choice'] = (
    1000 * data_trial['x_count'] / data_trial['trial_duration_exact']
)
data_trial['fps_choice'].describe()

count    16714.000000
mean        17.294854
std          7.180669
min          1.112347
25%         12.186003
50%         16.840742
75%         22.076505
max         39.645366
Name: fps_choice, dtype: float64

In [8]:
fps_per_trial = data_trial['fps_choice']

# Runs with at least one trial below 3 samples per second.
print(data_trial.loc[fps_per_trial < 3, 'run_id'].unique())

# Per run: number of trials below 1 sample per second.
# NOTE(review): the print above uses a threshold of 3, the count below uses 1 -
# confirm which cutoff is intended before relying on subjects_lowFPS.
low_fps_trials = data_trial.loc[
    fps_per_trial < 1,
    ['run_id', 'trial_index', 'fps_choice']
]
grouped = (
    low_fps_trials
    .groupby(['run_id'])['trial_index']
    .count()
    .reset_index()
    .rename(columns={'trial_index': 'n_lowFPS'})
)
print(grouped)

# Runs with more than 10 such trials are marked for exclusion.
has_many_low_fps_trials = grouped['n_lowFPS'] > 10
subjects_lowFPS = grouped.loc[has_many_low_fps_trials, 'run_id']
subjects_lowFPS

[163 256 395   4 458]
Empty DataFrame
Columns: [run_id, n_lowFPS]
Index: []


Series([], Name: run_id, dtype: int64)

## Additional
Run 144 was found to have barely any variation in gaze transitions and is therefore flagged for exclusion manually.

In [9]:
# Runs flagged by hand rather than by one of the automatic criteria.
manually_flagged_runs = [144]
run_additional_flaws = np.array(manually_flagged_runs)
run_additional_flaws

array([144])

## Summary

In [10]:
# Union of all screening results: runs flagged for low frame rate plus the
# manually flagged runs.
excludedSubjects = list(
    set(subjects_lowFPS) |
    set(run_additional_flaws)
)

# How many runs each criterion contributes, and the size of the union.
output = pd.DataFrame(
    {
        'name': [
            'subjects_lowFPS',
            'additional_flaws',
            'total',
        ],
        'length': [
            len(subjects_lowFPS),
            len(run_additional_flaws),
            len(excludedSubjects),
        ],
    }
)

output

Unnamed: 0,name,length
0,subjects_lowFPS,0
1,assitional_flaws,1
2,total,1


# Cleaning

## data_subject

In [11]:
# Remove the subjects whose run was excluded during screening.
is_excluded_run = data_subject['run_id'].isin(excludedSubjects)
data_subject = data_subject.loc[~is_excluded_run, :]

## data_trial 

In [12]:
def cleanTrialData(data, excluded_runs=None, max_duration=10000):
    """Drop trials of excluded runs and trials that lasted too long.

    Prints the row count before and after filtering.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'run_id' and 'trial_duration_exact'.
    excluded_runs : list-like, optional
        run_id values to remove. When omitted, the notebook-level
        ``excludedSubjects`` is used, as before.
    max_duration : number, optional
        Exclusive upper bound for 'trial_duration_exact'. Rows whose
        duration is missing fail the comparison and are dropped as well.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that pass both filters; ``data`` is not modified.
    """
    if excluded_runs is None:
        # Fall back to the global so existing one-argument calls keep working.
        excluded_runs = excludedSubjects

    print('Raw: ' + str(len(data)))
    keep = (
        ~data['run_id'].isin(excluded_runs) &
        (data['trial_duration_exact'] < max_duration)
    )
    data = data.loc[keep, :]
    print('Cleaned: ' + str(len(data)))
    return data

# Apply the run and duration exclusions to the choice-trial table.
data_trial = cleanTrialData(data_trial)

Raw: 16720
Cleaned: 16596


## data_et

In [13]:
def cleanETData(data, excluded_runs=None, max_t_task=10000):
    """Drop gaze samples that are out of range, too late, or from excluded runs.

    Prints the row count before and after filtering.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'run_id', 'x', 'y' and 't_task'.
    excluded_runs : list-like, optional
        run_id values to remove. When omitted, the notebook-level
        ``excludedSubjects`` is used, as before.
    max_t_task : number, optional
        Exclusive upper bound for 't_task'.

    Returns
    -------
    pandas.DataFrame
        Rows with 0 < x < 1 and 0 < y < 1 (both bounds exclusive) that also
        pass the run and time filters; ``data`` is not modified.
    """
    if excluded_runs is None:
        # Fall back to the global so existing one-argument calls keep working.
        excluded_runs = excludedSubjects

    print('Raw: ' + str(len(data)))
    x_in_range = (data['x'] > 0) & (data['x'] < 1)
    y_in_range = (data['y'] > 0) & (data['y'] < 1)
    keep = (
        x_in_range & y_in_range &
        ~data['run_id'].isin(excluded_runs) &
        (data['t_task'] < max_t_task)
    )
    data = data.loc[keep, :]
    print('Cleaned: ' + str(len(data)))
    return data

# Apply the range, time and run exclusions to the gaze-sample table.
data_et = cleanETData(data_et)

Raw: 603489
Cleaned: 579618


# Export data

In [14]:
# Write the cleaned choice-task tables to data_jupyter/choice_task, resolved
# against the current working directory.
export_dir = 'data_jupyter/choice_task'

# exist_ok=True replaces the separate exists() check: no error if the folder
# is already there, and missing parent folders are created too.
os.makedirs(export_dir, exist_ok=True)

data_et.to_csv(
    export_dir + "/data_et.csv",
    index=False, header=True)
data_trial.to_csv(
    export_dir + "/data_trial.csv",
    index=False, header=True)
data_subject.to_csv(
    export_dir + "/data_subject.csv",
    index=False, header=True)

# Check dataset

In [15]:
# After cleaning, every table should contain the same number of runs.
cleaned_tables = {
    'data_et': data_et,
    'data_trial': data_trial,
    'data_subject': data_subject,
}
summary = pd.DataFrame(
    {
        'dataset': list(cleaned_tables.keys()),
        'runs': [
            len(table['run_id'].unique())
            for table in cleaned_tables.values()
        ],
    }
)
summary

Unnamed: 0,dataset,runs
0,data_et,208
1,data_trial,208
2,data_subject,208


# Feedback

In [16]:
# Completion marker printed by the last cell of the notebook.
print('Success! Script ran through')

Success! Script ran through
