In [1]:
# import sys
# !conda install --yes --prefix {sys.prefix} pingouin

In [12]:
import numpy as np
import math
import os
import pandas as pd
import pingouin as pg
import re
import seaborn as sns
import json
import statsmodels.api as sm 
import statsmodels.formula.api as smf
import statsmodels.graphics.api as smg
import sys

if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO
    
from IPython.display import HTML
def View(df):
    css = """<style>
    table { border-collapse: collapse; border: 3px solid #eee; }
    table tr th:first-child { background-color: #eeeeee; color: #333; font-weight: bold }
    table thead th { background-color: #eee; color: #000; }
    tr, th, td { border: 1px solid #ccc; border-width: 1px 0 0 1px; border-collapse: collapse;
    padding: 3px; font-family: monospace; font-size: 10px }</style>
    """
    s  = '<script type="text/Javascript">'
    s += 'var win = window.open("", "Title", "toolbar=no, location=no, directories=no, status=no, menubar=no, scrollbars=yes, resizable=yes, width=780, height=200, top="+(screen.height-400)+", left="+(screen.width-840));'
    s += 'win.document.body.innerHTML = \'' + (df.to_html() + css).replace("\n",'\\') + '\';'
    s += '</script>'
    return(HTML(s+css))    

os.chdir(r'C:\Users\User\GitHub\WebET_Analysis')
print("Current Working directory " , os.getcwd())

Current Working directory  C:\Users\User\GitHub\WebET_Analysis


In [3]:
# from IPython.display import HTML

# HTML('''<script>
# code_show=true; 
# function code_toggle() {
#  if (code_show){
#  $('div.input').hide();
#  } else {
#  $('div.input').show();
#  }
#  code_show = !code_show
# } 
# $( document ).ready(code_toggle);
# </script>
# <form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

# Raw Data

## Read CSV from String

In [14]:
def cleanhtml(raw_html):
    # https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
    cleanr = re.compile('<.*?>')
    cleanText = re.sub(cleanr, '', raw_html)
    return cleanText

def cleanETText(text):
    textWithinBrackets = re.findall(re.compile('\[.*?\]'), text)
    output = text
    for i in range(0,len(textWithinBrackets)):
        old = textWithinBrackets[i]
        new = re.sub(",", "$", old)
        output = output.replace(old, new)
    return output

def cleanSurveyText(text):
    output = text
    textWithinBrackets = re.findall(re.compile('\{.*?\}'), text)
    for i in range(0,len(textWithinBrackets)):
        old = textWithinBrackets[i]
        new = old.replace(',', '§')
        output = output.replace(old, new)
    return output

def compileData(path):
    subject_files = os.listdir(path)
    all_subjects = []
    for i in range(0, len(subject_files)):
        csv_thisSubject = open(path + "/" + subject_files[i]).read()
        csv_thisSubject = cleanhtml(csv_thisSubject)
        csv_thisSubject = cleanETText(csv_thisSubject)
        csv_thisSubject = cleanSurveyText(csv_thisSubject)
        all_subjects.append(pd.read_csv(StringIO(csv_thisSubject)))
    output = pd.concat(all_subjects).reset_index(drop=True)
    return output

data_yang = compileData("data_yang2020WG")
# Custom modifications
data_yang.loc[0:515, 'run_id'] = 0
data_yang = data_yang.loc[~data_yang['run_id'].isin([4, 18]), :]
data_yang['run_id'] = data_yang['run_id'] * 1000
data_yang = data_yang.rename(columns={'eyeData': 'et_data'})
data_cognition = compileData("data_cognition")

data_raw = data_yang.append(data_cognition)
data_raw['run_id'].unique()

array([0, 1000, 11000, 13000, 14000, 15000, 16000, 17000, 19000, 2000,
       20000, 21000, 3000, 7000, 8000, 9000, 22000, 24000, 2],
      dtype=object)

### Exclude empty studies

In [15]:
data_raw = data_raw.loc[data_raw['trial_index']>0, :]

### survey data

In [16]:
def cleanOptionalNote(text):
    optionalNoteText = re.findall(re.compile('optionalNote":.*?\}'), text)
    if len(optionalNoteText) < 1:
        output = text
    else:
        old = optionalNoteText[0]
        new = old.replace('§', ' ')
        output = text.replace(old, new)
    return output


def surveyStringToFrame(subject, string):
    string = cleanOptionalNote(string)
    string = re.sub("""{""", '', string)
    string = re.sub("""}""", '', string)
    string = re.sub('"', '', string)
    string = re.sub('§', '$', string)
    output = pd.read_csv(StringIO(string),
                         sep=":",
                         lineterminator="$",
                         header=None,
                         index_col=0
                         ).transpose()
    return output


def surveyData_thisSubject(data):
    df_thisSubject = data.loc[
        (pd.notna(data["responses"])) &
        (data["responses"] != '"'), :].reset_index()
    subject = df_thisSubject['run_id'].unique()[0]
    output = []
    for i in range(0, len(df_thisSubject)):
        output.append(
            surveyStringToFrame(subject,
                                df_thisSubject.loc[i, 'responses'])
        )

    output = pd.concat(output, axis=1)
    output['run_id'] = subject
    return output

def addSurveyData(data):
    surveyData_allSubjects = pd.DataFrame(columns=[
            'prolificID', 'age', 'gender', 'ethnic', 'sight', 
            'glasses', 'degree', 'eyeshadow', 'masquara', 'eyeliner', 
            'browliner', 'vertPosition', 'triedChin', 'keptHead', 
            'optionalNote', 'run_id'])

    for subject in data['run_id'].unique():
        surveyData_allSubjects = \
            surveyData_allSubjects.append(
                surveyData_thisSubject(
                    data.loc[data['run_id']==subject, ['run_id', 'responses']]
                )
            )
    # print(surveyData_allSubjects)
    
    data = data.merge(surveyData_allSubjects, on='run_id', how='left')
    data = data.drop(columns='responses')
    return(data)
    
data_raw = addSurveyData(data_raw)
print(data_raw['run_id'].unique())
print(data_raw.columns)

[0 1000 11000 14000 15000 16000 17000 19000 2000 20000 21000 3000 7000
 8000 9000 22000 24000 2]
Index(['run_id', 'condition', 'rt', 'stimulus', 'button_pressed',
       'window_width', 'window_height', 'trial_type', 'trial_index',
       'time_elapsed', 'internal_node_id', 'subject', 'chinFirst',
       'choiceTask_amountLeftFirst', 'webcam_label', 'webcam_Id', 'webcam_fps',
       'webcam_aspectRatio', 'webcam_height', 'webcam_width', 'key_press',
       'success', 'x_pos', 'y_pos', 'chin', 'task_nr', 'et_data',
       'trial_duration', 'option_topLeft', 'option_bottomLeft',
       'option_topRight', 'option_bottomRight', 'recorded_at', 'ip',
       'user_agent', 'device', 'browser', 'browser_version', 'platform',
       'platform_version', 'Unnamed: 2', 'fbclid', 'bonusAmount', 'bonusDelay',
       'chosenAmount', 'chosenDelay', 'prolificID', 'age', 'gender', 'ethnic',
       'sight', 'glasses', 'degree', 'eyeshadow', 'masquara', 'eyeliner',
       'browliner', 'vertPosition', 'trie

In [17]:
def convertToNumeric(data, columns):
    data[columns] = data[columns].apply(pd.to_numeric, errors='coerce')
    return data


data_raw = convertToNumeric(data_raw, ['age'])
data_raw['degree'] = data_raw['degree'].replace(
        [
            'College / Undergraduate / Bachelor',
            'High School',
            'Graduate / PhD / Master',
            'Middle School'
        ],
    [
            'college',
            'highSchool',
            'grad',
            'middle'
        ]
)
data_raw['run_id'].unique()

array([0, 1000, 11000, 14000, 15000, 16000, 17000, 19000, 2000, 20000,
       21000, 3000, 7000, 8000, 9000, 22000, 24000, 2], dtype=object)

## Clean raw data

### Failed Webgazer Setup

In [18]:
subjects_failedSetups = []
for subject in data_raw['run_id'].unique():
    maxTrialIndex = data_raw.loc[data_raw['run_id']==subject, ['trial_index']].max()
    if int(maxTrialIndex) < 15:
        subjects_failedSetups.append(subject)

failedSetups = data_raw.loc[
    data_raw['run_id'].isin(subjects_failedSetups), 
    [
        'run_id', 'prolificID', 'chinFirst', 'choiceTask_amountLeftFirst',
        'webcam_label', 'webcam_fps', 'webcam_aspectRatio',    
        'user_agent', 'device', 'browser', 'browser_version', 
        'platform', 'platform_version', 
        # 'stimulus', 'trial_type', 'trial_index', 'trial_duration', 
        # 'et_data'
    ]
].drop_duplicates()
    
data_raw = data_raw.loc[~data_raw['run_id'].isin(subjects_failedSetups), :]

print(data_raw['run_id'].unique())
failedSetups

[0 11000 21000 3000 8000 2]


Unnamed: 0,run_id,prolificID,chinFirst,choiceTask_amountLeftFirst,webcam_label,webcam_fps,webcam_aspectRatio,user_agent,device,browser,browser_version,platform,platform_version
514,1000,0,0.0,1.0,Logitech HD Webcam C310 (046d:081b),30.0,0.001042,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,WebKit,Chrome,87.0.4280.141,Windows,10
1040,14000,LP2202,0.0,0.0,Integrierte iSight-Kamera,30.00003,1.0,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6...,Macintosh,Safari,11.1.2,OS X,10_13_6
1051,15000,LP22022,1.0,1.0,Integrierte iSight-Kamera,30.00003,1.0,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6...,Macintosh,Safari,11.1.2,OS X,10_13_6
1057,16000,L345,0.0,0.0,Integrierte iSight-Kamera,30.00003,1.0,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6...,Macintosh,Safari,11.1.2,OS X,10_13_6
1063,17000,L345,1.0,1.0,Integrierte iSight-Kamera (05ac:8507),30.000031,0.002083,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6...,Macintosh,Chrome,87.0.4280.141,OS X,10_13_6
1065,19000,Liv2020,0.0,1.0,Integrierte iSight-Kamera (05ac:8507),30.000031,0.002083,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6...,Macintosh,Chrome,87.0.4280.141,OS X,10_13_6
1075,2000,0,1.0,1.0,Logitech HD Webcam C310 (046d:081b),30.0,0.001042,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,WebKit,Chrome,87.0.4280.141,Windows,10
1085,20000,2020Liv,1.0,0.0,Integrierte iSight-Kamera (05ac:8507),30.000031,0.002083,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6...,Macintosh,Chrome,87.0.4280.141,OS X,10_13_6
2126,7000,Ka61Kl61,0.0,0.0,HD User Facing (04f2:b64f),30.0,0.001389,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,WebKit,Chrome,87.0.4280.141,Windows,10
2653,9000,ka1501,1.0,1.0,HD User Facing (04f2:b64f),30.0,0.001389,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,WebKit,Chrome,87.0.4280.141,Windows,10


### No EyeTracking Data

In [19]:
subjects_noet_data = []
for subject in data_raw['run_id'].unique():
    if len(data_raw.loc[data_raw['run_id']==subject, 'et_data'].unique()) < 4:
        subjects_noet_data.append(subject)

noet_data = data_raw.loc[
    data_raw['run_id'].isin(subjects_failedSetups), 
    [
        'run_id', 'prolificID', 'chinFirst', 'choiceTask_amountLeftFirst',
        'webcam_label', 'webcam_fps', 'webcam_aspectRatio',    
        'user_agent', 'device', 'browser', 'browser_version', 
        'platform', 'platform_version', 
        # 'stimulus', 'trial_type', 'trial_index', 'trial_duration', 
        # 'et_data'
    ]
].drop_duplicates()
    
data_raw = data_raw.loc[~data_raw['run_id'].isin(subjects_noet_data), :]

noet_data

Unnamed: 0,run_id,prolificID,chinFirst,choiceTask_amountLeftFirst,webcam_label,webcam_fps,webcam_aspectRatio,user_agent,device,browser,browser_version,platform,platform_version


In [20]:
data_raw.run_id.unique()

array([0, 11000, 21000, 3000, 8000, 2], dtype=object)

### Empty ET trials

In [21]:
print('Values that probably represent empty values')
for cell in data_raw["et_data"].unique():
    if len(cell) < 50:
        print(cell)

data_raw["et_data"] = data_raw["et_data"].apply(str)

if np.invert('emptyETData' in globals()):
    emptyETData = data_raw.loc[data_raw['et_data'].str.contains("""\[]}"""), :] \
        .groupby(['run_id', 'chinFirst', 'task_nr', 'chin', 'trial_type']) \
            ['et_data'].count()

data_raw = data_raw.loc[~data_raw['et_data'].str.contains("""\[]}"""), :]

emptyETData

Values that probably represent empty values
"
{"history":[]}


run_id  chinFirst  task_nr  chin  trial_type            
3000    0.0        2        1     eyetracking-fix-object     1
                   3        1     eyetracking-fix-object    14
21000   0.0        2        1     eyetracking-fix-object     1
                   3        1     eyetracking-choice         2
                                  eyetracking-fix-object     1
Name: et_data, dtype: int64

### Duplicate Prolific ID's

In [22]:
data_raw.loc[data_raw['run_id']==2, 'prolificID'] = 'Tim2'


duplicates = data_raw.loc[:, ['prolificID', 'trial_index']].duplicated()
duplicateSubjects = data_raw.loc[duplicates, 'run_id'].unique()

if len(duplicateSubjects) > 0:
    print('! Attention: Duplicate subjects: Check out the following: \n')
    print(duplicateSubjects)
else:
    print('Success: No duplicate subjects found')

Success: No duplicate subjects found


## Add other variables

In [23]:
data_raw = convertToNumeric(data_raw, [
            'run_id', 'subject', 'chinFirst', 'chin', 'task_nr', 'trial_index', # Int
            'key_press', 
            'x_pos', 'y_pos', 'time_elapsed', 'trial_duration',
            'rt',
            'window_width', 'window_height', 
        ]
   )

In [24]:
def addWindowSize(data):
    output = data
    if "window_width_max" in data.columns:
        print('window width_max already added')
    else:
        grouped = data.groupby(["run_id", "subject"])["window_width", "window_height"].max().reset_index()
        grouped.columns = ["run_id", "subject", "window_width_max", "window_height_max"]
        grouped['window_diagonal_max'] = np.sqrt(grouped['window_width_max']**2 + grouped['window_height_max']**2)
        output = data.merge(grouped, 
                            on=['run_id', "subject"],
                            how='left')
    return output

data_raw = addWindowSize(data_raw)
data_raw['window_diagonal'] = np.sqrt(data_raw['window_width']**2 + 
                                      data_raw['window_height']**2)

  


## Take a look

In [25]:
data_raw.loc[:, ['prolificID', 'run_id']].drop_duplicates()

Unnamed: 0,prolificID,run_id
0,Tim,0
514,ka1501,11000
1028,Livaila22,21000
1540,0,3000
2040,Ka61Kl61,8000
2555,Tim2,2


# data_et

In [36]:
def reformatYang(text):
    text = text.replace('$', ',')
    text = text.replace("relative-x", "x")
    text = text.replace("relative-y", 'y')
    text = text.replace('elapse-time', 't')
    text = (text[11:len(text)-1])
    return(text)

for subject in [0, 11000, 21000, 3000, 8000]:
    for i in data.loc[(data['run_id']==subject) &
                      (data['et_data']!='"'), :].index:
        data.loc[i, 'et_data'] = reformatYang(data.loc[i, 'et_data'])

In [39]:
def textToDataframe(text):
    text = text.replace('$', ',')
    dataframe = pd.read_json(text, orient='records')
    return(dataframe)


def extractEyetrackingData(data):
    data_eyetracking = pd.DataFrame(columns=['x', 'y', 't'])
    data["et_data"] = data['et_data'].apply(str)

    for i in data.loc[data['et_data'] != '"', :].index:

        df = textToDataframe(data.loc[i, 'et_data'])        

        df["t_task"] = (df.loc[:, "t"] - df.loc[0, "t"])
        columnsToAdd = data.columns.drop('et_data')
        for col in columnsToAdd:
            df[col] = data.loc[i, col]

        data_eyetracking = data_eyetracking.append(
            pd.DataFrame(data=df), ignore_index=True)

    return(data_eyetracking)


data_et = extractEyetrackingData(data_raw)

data_et.groupby(['run_id', 'chinFirst', 'task_nr',
                 'chin', 'trial_type'])['x'].count()

run_id   chinFirst  task_nr  chin  trial_type             
0.0      0.0        0.0      0.0   eyetracking-calibration    3442
                    1.0      0.0   eyetracking-fix-object     1118
                             1.0   eyetracking-calibration    2452
                    2.0      1.0   eyetracking-fix-object     1039
                    3.0      1.0   eyetracking-choice         3639
                                   eyetracking-fix-object     2160
2.0      1.0        0.0      1.0   eyetracking-calibration      33
                    1.0      1.0   eyetracking-fix-object       96
                    2.0      0.0   eyetracking-calibration      22
                             1.0   eyetracking-choice           88
                                   eyetracking-fix-object       46
                    3.0      0.0   eyetracking-fix-object       95
3000.0   0.0        0.0      0.0   eyetracking-calibration    1855
                    1.0      0.0   eyetracking-fix-object      979
   

In [40]:
data_et = convertToNumeric(data_et,
                        ['x', 'y', 't', 't_task', # Float 
                         'x_pos', 'y_pos', 
                         'chinFirst', 'chin', 'key_press'])

In [41]:
def convertToFactor(data, columns):
    stacked = data[columns].stack()
    data[columns] = pd.Series(stacked.factorize()[0], index=stacked.index).unstack()
    return data

In [42]:
def addXCount(data):
    if 'count' in data.columns:
        print('Count already added!')
    else: 
        grouped = pd.DataFrame(
                data.groupby(["run_id", "trial_index"])["x"].count()
            ) \
            .reset_index() \
            .rename(columns={'x': 'count'})
        data = data.merge(grouped, 
                          on=["run_id", "trial_index"], 
                          how='left')
    return(data)

data_et = addXCount(data_et)

In [43]:
def addMeans(data):
    if ('x_mean' in data.columns):
        print('X_mean already added!')
        data_output = data
    elif ('y_mean' in data.columns):
        print('Y_mean already added!')
        data_output = data
    else: 
        grouped = data.loc[:, ['run_id', 'subject', 'trial_index', 'x', 'y']] \
                             .groupby(['run_id', 'subject', 'trial_index']).mean() \
                             .rename(columns={'x': 'x_mean', 
                                              'y': 'y_mean'})
        data_output = data.merge(grouped,
                                 on=['run_id', 'subject', 'trial_index'],
                                 how='left')
    return data_output

data_et = addMeans(data_et)

In [44]:
def euclideanDistance(x, x_target, y, y_target):
    x_diff = x - x_target
    y_diff = y - y_target
    euclideanDistance = np.sqrt(x_diff**2 + y_diff**2)
    return(euclideanDistance)

x_location_pixel = data_et['window_width_max'] * data_et['x_pos']
y_location_pixel = data_et['window_height_max'] * data_et['y_pos']
data_et["offset"] = euclideanDistance(data_et["x"], x_location_pixel, 
                                      data_et["y"], y_location_pixel)

In [45]:
def addPrecision(data):
    if 'precision' in data.columns: 
        print('precision already added!')
    else:
        data['deviationFromAVG'] = euclideanDistance(
                data['x'], data['x_mean'], data['y'], data['y_mean']
            )
        data['deviationFromAVG_square'] = np.power(data['deviationFromAVG'], 2)
        grouped = data.groupby(['run_id', 'trial_index']).mean() \
            .reset_index()
        grouped['precision'] = np.sqrt(grouped['deviationFromAVG_square'])
        data = data.merge(
                grouped.loc[:, ['run_id', 'trial_index', 'precision']], 
                on=['run_id', 'trial_index'],
                how='left'
            )
                                
    return data

data_et = addPrecision(data_et)

In [46]:
def withinTaskIndex(data): 
    allTrialIndices = []
    for subject in data["run_id"].unique():
        df_subj = data.loc[data['run_id']==subject, :]
        
        for trial_type in df_subj['trial_type'].unique():
            df_trial = df_subj.loc[df_subj['trial_type']==trial_type, :]
                
            for task_nr in df_trial["task_nr"].unique():
                df_thisTask = df_trial.loc[df_trial['task_nr']==task_nr, 
                         [
                             'run_id',
                             'trial_index'
                         ]
                    ] \
                    .drop_duplicates() \
                    .reset_index(drop=True)
        
                df_thisTask['withinTaskIndex'] = df_thisTask.index + 1
                allTrialIndices.append(df_thisTask)
    allTrialIndices = pd.concat(allTrialIndices).reset_index(drop=True)
    return allTrialIndices

def addWithinTaskIndex(data):
    if 'withinTaskIndex' in data.columns: 
        print('withinTaskIndex already added')
    else:
        newIndices = withinTaskIndex(data_et) \
            .reset_index(drop=True)
        data = data.merge(newIndices, 
                          on = ['run_id', 'trial_index'], 
                          how = 'left')
    return data

data_et = addWithinTaskIndex(data_et)

In [47]:
def multiply(x):
    return x*10

def positionIndex(data):
    allPositionIndices = []
    for subject in data["run_id"].unique():
        df_subj = data.loc[
            (
                (data['run_id']==subject) &
                (data['trial_type'].isin(
                        [
                            'eyetracking-calibration', 
                            'eyetracking-fix-object'
                        ]
                    )
                )
            ), :]
        
        for trial_type in df_subj['trial_type'].unique():
            df_trial = df_subj.loc[df_subj['trial_type']==trial_type, :]
                
            for task_nr in df_trial["task_nr"].unique():
                df_thisTask = df_trial.loc[
                        df_trial['task_nr']==task_nr, 
                        ['run_id', 'trial_index', 'x_pos', 'y_pos']
                    ] \
                    .drop_duplicates() \
                    .reset_index(drop=True)

                df_thisTask['positionIndex'] = df_thisTask.loc[:, ['x_pos', 'y_pos']] \
                    .apply(multiply) \
                    .astype(int) \
                    .astype(str) \
                    .apply(''.join, 1)
                df_thisTask['positionIndex'] = df_thisTask['positionIndex'].astype(int)
                df_thisTask['positionIndex'] = df_thisTask.loc[:, 'positionIndex'] \
                    .replace(
                            np.sort(df_thisTask['positionIndex'].unique()), 
                            range(0, len(df_thisTask['positionIndex'].unique()))
                        )        
                allPositionIndices.append(df_thisTask)
                
    allPositionIndices = pd.concat(allPositionIndices).reset_index(drop=True)
    return allPositionIndices

def addPositionIndex(data):
    if 'positionIndex' in data.columns: 
        print('positionIndex already added')
    else:
        newIndices = positionIndex(data_et) \
            .reset_index(drop=True)
        data = data.merge(newIndices, 
                          on = ['run_id', 'trial_index', 'x_pos', 'y_pos'], 
                          how = 'left')
    return data

data_et = addPositionIndex(data_et)

## data_et_calibration

In [48]:
data_et_calibration = data_et.loc[data_et["trial_type"]=="eyetracking-calibration", :]
data_et_calibration

Unnamed: 0,x,y,t,t_task,run_id,condition,rt,stimulus,button_pressed,window_width,...,window_diagonal,count,x_mean,y_mean,offset,deviationFromAVG,deviationFromAVG_square,precision,withinTaskIndex,positionIndex
0,0.508594,0.493056,92.105,0.000,0.0,1.0,,"""","""",1280.0,...,1468.604780,141,0.760882,0.489805,1084.794866,0.252309,0.063660,0.142425,1,11.0
1,0.541406,0.402778,139.420,47.315,0.0,1.0,,"""","""",1280.0,...,1468.604780,141,0.760882,0.489805,1084.793831,0.236100,0.055743,0.142425,1,11.0
2,0.542187,0.383333,164.025,71.920,0.0,1.0,,"""","""",1280.0,...,1468.604780,141,0.760882,0.489805,1084.799540,0.243236,0.059164,0.142425,1,11.0
3,0.533594,0.393056,187.480,95.375,0.0,1.0,,"""","""",1280.0,...,1468.604780,141,0.760882,0.489805,1084.804424,0.247023,0.061020,0.142425,1,11.0
4,0.536719,0.426389,211.360,119.255,0.0,1.0,,"""","""",1280.0,...,1468.604780,141,0.760882,0.489805,1084.790427,0.232961,0.054271,0.142425,1,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54047,0.564639,0.536437,156607.000,647.000,2.0,1.0,,"""","""",1280.0,...,1509.436981,9,0.565622,0.544454,753.955383,0.008077,0.000065,0.021140,3,1.0
54048,0.557609,0.536542,156791.000,831.000,2.0,1.0,,"""","""",1280.0,...,1509.436981,9,0.565622,0.544454,753.961289,0.011261,0.000127,0.021140,3,1.0
54049,0.551026,0.535883,156945.000,985.000,2.0,1.0,,"""","""",1280.0,...,1509.436981,9,0.565622,0.544454,753.967221,0.016926,0.000286,0.021140,3,1.0
54050,0.545294,0.534213,157113.000,1153.000,2.0,1.0,,"""","""",1280.0,...,1509.436981,9,0.565622,0.544454,753.972968,0.022762,0.000518,0.021140,3,1.0


## data_et_fixation

In [49]:
data_et_fixation = data_et.loc[(
           (data_et["trial_type"]=="eyetracking-fix-object") &
           (pd.notna(data_et['withinTaskIndex'])) 
        ), :
    ]
data_et_fixation

Unnamed: 0,x,y,t,t_task,run_id,condition,rt,stimulus,button_pressed,window_width,...,window_diagonal,count,x_mean,y_mean,offset,deviationFromAVG,deviationFromAVG_square,precision,withinTaskIndex,positionIndex
3442,0.533594,-0.143056,85.010,0.000,0.0,1.0,,"""","""",1280.0,...,1468.604780,29,0.579526,0.485824,733.907559,6.305545e-01,3.975989e-01,2.188666e-01,1,4.0
3443,0.554688,-0.081944,141.240,56.230,0.0,1.0,,"""","""",1280.0,...,1468.604780,29,0.579526,0.485824,733.859192,5.683112e-01,3.229777e-01,2.188666e-01,1,4.0
3444,0.694531,0.243056,193.530,108.520,0.0,1.0,,"""","""",1280.0,...,1468.604780,29,0.579526,0.485824,733.577904,2.686310e-01,7.216264e-02,2.188666e-01,1,4.0
3445,0.888281,0.380556,247.230,162.220,0.0,1.0,,"""","""",1280.0,...,1468.604780,29,0.579526,0.485824,733.341622,3.262074e-01,1.064113e-01,2.188666e-01,1,4.0
3446,0.923437,0.715278,299.055,214.045,0.0,1.0,,"""","""",1280.0,...,1468.604780,29,0.579526,0.485824,733.146891,4.134300e-01,1.709244e-01,2.188666e-01,1,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54142,0.488281,0.485000,179212.000,1712.000,2.0,1.0,,"""","""",1280.0,...,1509.436981,23,0.488281,0.485000,1206.878483,3.667008e-07,1.344695e-13,3.878471e-07,6,2.0
54143,0.488281,0.485000,179321.000,1821.000,2.0,1.0,,"""","""",1280.0,...,1509.436981,23,0.488281,0.485000,1206.878483,3.726170e-07,1.388435e-13,3.878471e-07,6,2.0
54144,0.488281,0.485000,179413.000,1913.000,2.0,1.0,,"""","""",1280.0,...,1509.436981,23,0.488281,0.485000,1206.878483,3.752934e-07,1.408451e-13,3.878471e-07,6,2.0
54145,0.488281,0.485000,179504.000,2004.000,2.0,1.0,,"""","""",1280.0,...,1509.436981,23,0.488281,0.485000,1206.878483,3.754704e-07,1.409780e-13,3.878471e-07,6,2.0


## data_et_choice

In [50]:
data_et_choice = data_et.loc[(data_et["trial_type"]=="eyetracking-choice"), :]

In [51]:
def lookDirections(data):
    data["look_left"] = (data["x"] < 0.5).astype(int)
    data["look_top"] = (data["y"] < 0.5).astype(int)
    return data

data_et_choice = lookDirections(data_et_choice)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [52]:
# Cleaning
data_et_choice.loc[
    : , 
        [
          'option_topLeft',
          'option_bottomLeft', 
          'option_topRight', 
          'option_bottomRight'
        ] 
    ] = data_et_choice.loc[
            : , 
            [
              'option_topLeft',
              'option_bottomLeft', 
              'option_topRight', 
              'option_bottomRight'
            ] 
        ] \
    .replace(['Today', 'Tomorrow', '7 days', '15 days', '30 days', '90 days', '180 days'], 
             [0, 1, 7, 15, 30, 90, 180]) \
    .replace({'\$':''}, regex = True) \
    .replace('50 cent', 0.5) \
    .astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [53]:
def addChoiceVariables(data):
    data.loc[
                (
                    (data['choiceTask_amountLeftFirst']==1) &
                    (data['withinTaskIndex'] <41)
                ), 'amountLeft'] = 1
    data.loc[
                (
                    (data['choiceTask_amountLeftFirst']==0) &
                    (data['withinTaskIndex'] >40)
                ), 'amountLeft'] = 0

    data.loc[data['amountLeft'] == 1, 'aSS'] = \
        data.loc[data['amountLeft'] == 1,["option_topLeft", "option_bottomLeft"]].values.min(1)
    data.loc[data['amountLeft'] == 0, 'aSS'] = \
        data.loc[data['amountLeft'] == 0,["option_topRight", "option_bottomRight"]].values.min(1)

    data.loc[data['amountLeft'] == 1, 'aLL'] = \
        data.loc[data['amountLeft'] == 1,["option_topLeft", "option_bottomLeft"]].values.max(1)
    data.loc[data['amountLeft'] == 0, 'aLL'] = \
        data.loc[data['amountLeft'] == 0,["option_topRight", "option_bottomRight"]].values.max(1)

    data.loc[:, "tSS"] = 0 

    data.loc[data['amountLeft'] == 1, 'tLL'] = \
        data.loc[data['amountLeft'] == 1,["option_topRight", "option_bottomRight"]].values.max(1)
    data.loc[data['amountLeft'] == 0, 'tLL'] = \
        data.loc[data['amountLeft'] == 0,["option_topLeft", "option_bottomLeft"]].values.max(1)

    data.loc[(data["key_press"]==38), "choseTop"] = 1
    data.loc[(data["key_press"]==40), "choseTop"] = 0

    data.loc[data['amountLeft'] == 1, 'tLL'] = \
        data.loc[data['amountLeft'] == 1,["option_topRight", "option_bottomRight"]].values.max(1)

    data['LL_top'] = (data["option_topLeft"] > data["option_bottomLeft"]).astype(int)

    data.loc[
        (
             (data["choseTop"]==1) & 
             (data["LL_top"] == 1)
        ), "choseLL"] = 1
    data.loc[
        (
             (data["choseTop"]==1) & 
             (data["LL_top"] == 0)
        ), "choseLL"] = 0

    return(data)

data_et_choice = addChoiceVariables(data_et_choice)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [54]:
def cleanETChoice(data):
    output = data.loc[(data_et["x"]>-1) & 
                      (data_et["y"]>-1) & 
                      (data_et["x"]<data_et['window_width_max']) & 
                      (data_et["y"]<data_et['window_height_max']), :]
    return output
# data_et_choice = cleanETChoice(data_et_choice)

In [55]:
def assign_aoi(data, subject, aoi):
    aoiCenters = pd.DataFrame([
                [(0.05+0.9*0.2), 0.25],
                [(0.05+0.9*0.8), 0.25],
                [(0.05+0.9*0.2), 0.75],
                [(0.05+0.9*0.8), 0.75]
            ], 
            columns = ['x', 'y'], 
            index=['TL', 'TR', 'BL', 'BR']
        )

    data.loc[(
            (data['run_id'] == subject) & 
            (data.loc[:, 'x'] > (aoiCenters.loc[aoi, 'x'] - 0.175)) & \
            (data.loc[:, 'x'] < (aoiCenters.loc[aoi, 'x'] + 0.175)) & \
            (data.loc[:, 'y'] > (aoiCenters.loc[aoi, 'y'] - 0.175)) & \
            (data.loc[:, 'y'] < (aoiCenters.loc[aoi, 'y'] + 0.175))
        ), 'aoi'] = aoi   
    
    return data

def addAOIs(data):
    for subject in data['run_id'].unique(): 
        for aoi in ['TL', 'TR', 'BL', 'BR']:
            data = assign_aoi(data, subject, aoi)
        
    return data

def createAOIColumns(data):
    # If amounts are on the left side
    # If the gaze point is in the top option
    data.loc[((data['amountLeft']==1) & (data['LL_top']==1) & (data['aoi']=='TL')), 
             'aoi_amount_LL'] = 1
    data.loc[((data['amountLeft']==1) & (data['LL_top']==1) & (data['aoi']=='TR')), 
             'aoi_delay_LL'] = 1
    data.loc[((data['amountLeft']==1) & (data['LL_top']==1) & (data['aoi']=='BL')), 
             'aoi_amount_SS'] = 1
    data.loc[((data['amountLeft']==1) & (data['LL_top']==1) & (data['aoi']=='BR')), 
             'aoi_delay_SS'] = 1
    
    data.loc[((data['amountLeft']==1) & (data['LL_top']==0) & (data['aoi']=='TL')), 
             'aoi_amount_SS'] = 1
    data.loc[((data['amountLeft']==1) & (data['LL_top']==0) & (data['aoi']=='TR')), 
             'aoi_delay_SS'] = 1
    data.loc[((data['amountLeft']==1) & (data['LL_top']==0) & (data['aoi']=='BL')), 
             'aoi_amount_LL'] = 1
    data.loc[((data['amountLeft']==1) & (data['LL_top']==0) & (data['aoi']=='BR')), 
             'aoi_delay_LL'] = 1
    
    # If amounts are on the right side
    # If the gaze point is in the top option
    data.loc[((data['amountLeft']==0) & (data['LL_top']==1) & (data['aoi']=='TL')), 
             'aoi_delay_LL'] = 1
    data.loc[((data['amountLeft']==0) & (data['LL_top']==1) & (data['aoi']=='TR')), 
             'aoi_amount_LL'] = 1
    data.loc[((data['amountLeft']==0) & (data['LL_top']==1) & (data['aoi']=='BL')), 
             'aoi_delay_SS'] = 1
    data.loc[((data['amountLeft']==0) & (data['LL_top']==1) & (data['aoi']=='BR')),
             'aoi_amount_SS'] = 1

    data.loc[((data['amountLeft']==0) & (data['LL_top']==0) & (data['aoi']=='TL')), 
             'aoi_delay_SS'] = 1
    data.loc[((data['amountLeft']==0) & (data['LL_top']==0) & (data['aoi']=='TR')), 
             'aoi_amount_SS'] = 1
    data.loc[((data['amountLeft']==0) & (data['LL_top']==0) & (data['aoi']=='BL')), 
             'aoi_delay_LL'] = 1
    data.loc[((data['amountLeft']==0) & (data['LL_top']==0) & (data['aoi']=='BR')), 
             'aoi_amount_LL'] = 1
    return data

data_et_choice = addAOIs(data_et_choice)
data_et_choice = createAOIColumns(data_et_choice)

In [56]:
def add_transition_type(data):
    data = data.loc[(pd.notna(data['aoi'])), :]
    data['newAOIIndex'] = 0
    data.loc[(data['aoi_amount_LL']==1), 'newAOIIndex'] = 1
    data.loc[(data['aoi_delay_LL']==1), 'newAOIIndex'] = 2
    data.loc[(data['aoi_amount_SS']==1), 'newAOIIndex'] = 4
    data.loc[(data['aoi_delay_SS']==1), 'newAOIIndex'] = 8
    data.sort_values(by=['run_id', 'withinTaskIndex'])
    # Add a 0 due to the way np.diff works
    data['transition_type'] = np.append(np.diff(data['newAOIIndex']), [0])
    data['transition_type'] = abs(data['transition_type']) 
    return(data)

def cleanTransitions(data):
    indices = []
    for subject in data['run_id'].unique():
        df_subj = data.loc[data['run_id']==subject, :]
        for trial in df_subj['withinTaskIndex'].unique():
            df_thisTrial = data.loc[(
                                    (data['run_id'] == subject) &
                                    (data['withinTaskIndex'] == trial)
                                ), 'transition_type']
            indices.append(df_thisTrial.index.max())
    # last gaze point of each trial
    data.loc[indices, 'transition_type'] = 0 
    return(data)

data_et_choice = add_transition_type(data_et_choice)

data_et_choice = cleanTransitions(data_et_choice)
# View(data_et_choice.tail(20))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


# data_trial
[Back to Navigation](#Navigation)

In [57]:
data_raw.columns

Index(['run_id', 'condition', 'rt', 'stimulus', 'button_pressed',
       'window_width', 'window_height', 'trial_type', 'trial_index',
       'time_elapsed', 'internal_node_id', 'subject', 'chinFirst',
       'choiceTask_amountLeftFirst', 'webcam_label', 'webcam_Id', 'webcam_fps',
       'webcam_aspectRatio', 'webcam_height', 'webcam_width', 'key_press',
       'success', 'x_pos', 'y_pos', 'chin', 'task_nr', 'et_data',
       'trial_duration', 'option_topLeft', 'option_bottomLeft',
       'option_topRight', 'option_bottomRight', 'recorded_at', 'ip',
       'user_agent', 'device', 'browser', 'browser_version', 'platform',
       'platform_version', 'Unnamed: 2', 'fbclid', 'bonusAmount', 'bonusDelay',
       'chosenAmount', 'chosenDelay', 'prolificID', 'age', 'gender', 'ethnic',
       'sight', 'glasses', 'degree', 'eyeshadow', 'masquara', 'eyeliner',
       'browliner', 'vertPosition', 'triedChin', 'keptHead', 'optionalNote',
       'window_width_max', 'window_height_max', 'window_diago

In [59]:
data_trial = data_raw.loc[
    :, 
    [
        'run_id', 'subject', 'chinFirst', 'choiceTask_amountLeftFirst', 
        'rt', 'stimulus',
        'window_width', 'window_height', 'trial_type', 'trial_index',
        'time_elapsed', 
        'webcam_label', 'webcam_Id', 'webcam_fps',
        'webcam_aspectRatio', 'webcam_height', 'webcam_width', 'key_press',
        'success', 'x_pos', 'y_pos', 'chin', 'task_nr',
        'trial_duration', 'option_topLeft', 'option_bottomLeft',
        'option_topRight', 'option_bottomRight', 'recorded_at',
        'user_agent', 'device', 'browser', 'browser_version', 'platform',
        'platform_version', 'bonusAmount', 'bonusDelay',
        'prolificID', 'age', 'gender', 'ethnic', 'sight', 'glasses', 'degree',
        'eyeshadow', 'masquara', 'eyeliner', 'browliner', 'vertPosition',
        'triedChin', 'keptHead', 'optionalNote', 'window_width_max',
        'window_height_max', 'window_diagonal_max', 'window_diagonal'
    ]
]

In [60]:
def checkTimeDeviation(data, column1, column2, maxTimeDiffAllowed):
    diff = data[column1] - data['trial_duration_exact']
    longtrials_runID = data.loc[diff[diff > maxTimeDiffAllowed].index, 'run_id']
    longtrials_previousrunID = pd.DataFrame(data.loc[diff[diff > maxTimeDiffAllowed].index-1, 'run_id']) \
        .rename(columns={'run_id':'previous_run_id'})
    longtrials_previousrunID.index = longtrials_runID.index
    compare_runIDs = pd.concat([longtrials_runID, longtrials_previousrunID], axis=1)

    if sum(compare_runIDs['run_id'] == compare_runIDs['previous_run_id']) > 0: 
        print(column1 + ' and ' + column2 + ' show a deviation of ' +
              '>' + str(maxTimeDiffAllowed) + 
              ' ms. Please check on the following indices: \n')
        print(compare_runIDs.loc[(compare_runIDs['run_id'] == compare_runIDs['previous_run_id']), :].index)

    else:
        print('Success! ' + column1 + ' and ' + column2 + ' do not deviate by ' +
              '>' + str(maxTimeDiffAllowed) + 'ms.')
        
def exactTrialDuration(data):
    output = data
    output["t_startTrial"] = pd.concat([pd.Series([0]), output["time_elapsed"]], ignore_index=True)
    output["trial_duration_exact"] = output.loc[:, ("time_elapsed")] - output.loc[:, ("t_startTrial")]
    output.drop(len(output)-1)
    
    checkTimeDeviation(data, 'rt', 'trial_duration_exact', 50)
    checkTimeDeviation(data, 'trial_duration', 'trial_duration_exact', 50)
    
    return output

data_trial = exactTrialDuration(data_trial)

Success! rt and trial_duration_exact do not deviate by >50ms.
Success! trial_duration and trial_duration_exact do not deviate by >50ms.


In [61]:
def mergeByTrialIndex(data, largeData, varName):
    if (varName in data_trial.columns):
        print(varName + ' already added!')
    else:
        grouped = largeData.loc[:, 
                               [
                                   'run_id', 'trial_index', varName
                               ]
                           ] \
                    .drop_duplicates()
        data = data.merge(grouped, on=['run_id', 'trial_index'], how='left') 
    return data

for column in ['x_mean', 'y_mean', 'count', 'offset', 'precision', 'withinTaskIndex']:
    data_trial = mergeByTrialIndex(data_trial, data_et, column)

In [62]:
data_trial["fps"] = 1000 * data_trial["count"] / data_trial["trial_duration_exact"]
data_trial

Unnamed: 0,run_id,subject,chinFirst,choiceTask_amountLeftFirst,rt,stimulus,window_width,window_height,trial_type,trial_index,...,window_diagonal,t_startTrial,trial_duration_exact,x_mean,y_mean,count,offset,precision,withinTaskIndex,fps
0,0,698.0,0.0,1.0,2694.890,"""",,,survey-html-form,1.0,...,,0.0,12576.0,,,,,,,
1,0,698.0,0.0,1.0,1521.740,"Please, use only one monitor for this study.If...",,,html-button-response,2.0,...,,12576.0,1524.0,,,,,,,
2,0,698.0,0.0,1.0,759.695,Please turn your mobile phone into Airplane Mo...,,,html-button-response,3.0,...,,14100.0,762.0,,,,,,,
3,0,698.0,0.0,1.0,,"""",,,html-keyboard-response,4.0,...,,14862.0,126.0,,,,,,,
4,0,698.0,0.0,1.0,,"""",,,html-keyboard-response,5.0,...,,14988.0,109.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43251,2,637.0,1.0,1.0,1416.000,"""",,,survey-html-form,108.0,...,,220638.0,1417.0,,,,,,,
43252,2,637.0,1.0,1.0,4485.000,"""",,,survey-html-form,109.0,...,,222055.0,4491.0,,,,,,,
43253,2,637.0,1.0,1.0,14185.000,"""",,,survey-html-form,110.0,...,,226546.0,14185.0,,,,,,,
43254,2,637.0,1.0,1.0,25005.000,"Bonus PaymentFor your Bonus Payment, we random...",,,html-keyboard-response,111.0,...,,240731.0,25011.0,,,,,,,


## data_trial_fixation

In [63]:
data_trial_fixation = data_trial.loc[
         (data_trial['trial_type'] == 'eyetracking-fix-object') & 
         (pd.notna(data_trial['x_pos'])) &
         (pd.notna(data_trial['y_pos'])) &
         np.invert(data_trial['task_nr'] == 3), 
    ] \
    .reset_index(drop=True) 
# View(data_trial_fixation.drop(columns=['et_data']).tail(20)) 

## data_trial_choice

In [64]:
data_trial_choice = data_trial.loc[
    data_trial["trial_type"] == "eyetracking-choice", 
    [
        'run_id', 'chinFirst',
        'trial_duration_exact',
        'window_width', 'window_height', 
        'window_width_max', 'window_height_max',
        'window_diagonal_max', 'window_diagonal', 
        
        'trial_index',  'withinTaskIndex',
        'task_nr',
        'time_elapsed',

        'choiceTask_amountLeftFirst', 
        'key_press', 
        'option_topLeft', 'option_bottomLeft',
        'option_topRight', 'option_bottomRight', 
        'x_mean', 'y_mean', 
        'count', 'fps'        
    ]
]

In [65]:
for column in [
                  'choseTop',
                  'LL_top',
                  'choseLL',
                  'aLL',
                  'aSS',
                  'tLL',
                  'tSS',
                  'amountLeft'
              ]:
    data_trial_choice = mergeByTrialIndex(data_trial_choice, data_et_choice, column)

In [66]:
def addOptionIndex(data):
    if "optionIndex" in data.columns:
        print("Option Index already added!")
        data_output = data
    else:
        grouped = data_et_choice.groupby(['run_id', 'trial_index']) \
            ['aoi', 'aoi_amount_SS', 'aoi_amount_LL', 'aoi_delay_SS', 'aoi_delay_LL'].count() \
            .reset_index() \
            .rename(columns={"aoi": "count"})
        grouped['gazePoints_immediate'] = (grouped['aoi_amount_SS'] + grouped['aoi_delay_SS'])
        grouped['gazePoints_delay'] = (grouped['aoi_amount_LL'] + grouped['aoi_delay_LL'])
        grouped['optionIndex'] = (grouped['gazePoints_immediate'] - grouped['gazePoints_delay']) / \
                                 (grouped['gazePoints_immediate'] + grouped['gazePoints_delay'])
        data_output = data.merge(grouped[['run_id', 'trial_index', 'optionIndex']], 
                                          on=['run_id', 'trial_index'])
    return(data_output)

data_trial_choice = addOptionIndex(data_trial_choice)

  import sys


In [67]:
def addAttributeIndex(data):
    if "attributeIndex" in data.columns:
        print('Attribute Index already added!')
        data_output = data
    else:
        grouped = data_et_choice.groupby(['run_id', 'trial_index']) \
            ['aoi', 'aoi_amount_SS', 'aoi_amount_LL', 'aoi_delay_SS', 'aoi_delay_LL'].count() \
            .reset_index() \
            .rename(columns={"aoi": "count"})
        grouped['gazePoints_amount'] = (grouped['aoi_amount_LL'] + grouped['aoi_amount_SS'])
        grouped['gazePoints_time'] = (grouped['aoi_delay_LL'] + grouped['aoi_delay_SS'])
        grouped['attributeIndex'] = (grouped['gazePoints_amount'] - grouped['gazePoints_time']) / \
                                 (grouped['gazePoints_amount'] + grouped['gazePoints_time'])

        data_output = data.merge(grouped[['run_id', 'trial_index', 'attributeIndex']], 
                                          on=['run_id', 'trial_index'])
    return(data_output)

data_trial_choice = addAttributeIndex(data_trial_choice)

  import sys


In [68]:
def addTransition_type(data_trial, data_et):
    transition_count = pd.pivot_table(data_et.loc[:, ['run_id', 'trial_index', 'transition_type']], 
                        index = ['run_id', 'trial_index'],
                        columns = ['transition_type'], 
                        aggfunc = len,
                        fill_value = 0
                        ).reset_index()
    transition_count.columns = ['run_id', 'trial_index', 
                    "trans_type_0",
                    "trans_type_1",
                    "trans_type_2",
                    "trans_type_3",
                    "trans_type_4",
                    "trans_type_6",
                    "trans_type_7"]

    if "trans_type_0" in data_trial:
        print("Transitions already added!")
        data_trial = data_trial
    else:
        data_trial = data_trial.merge(transition_count, on=['run_id', 'trial_index']) 
    return(data_trial)

data_trial_choice = addTransition_type(data_trial_choice, data_et_choice)

In [69]:
def addPayneIndex(data):
    if "payneIndex" in data.columns:
        print("PayneIndex already added!")
    else: 
        # Option-wise: amount_LL-delay_LL = 1; amount_SS - delay_SS = 4
        # Attribute-wise: amount_LL-amount_SS = 3; delay_LL - delay_SS = 6
        # Cross: amount_LL-delay_SS = 7; delay_LL - amount_SS = 2
        optionWise_transition = data.loc[:, 'trans_type_1'] + data.loc[:, 'trans_type_4']
        attributeWise_transition = data.loc[:, 'trans_type_3'] + data.loc[:, 'trans_type_6']  
        data['payneIndex'] = (optionWise_transition - attributeWise_transition) / \
            (optionWise_transition + attributeWise_transition)      
    return(data)

data_trial_choice = addPayneIndex(data_trial_choice)

In [70]:
def k(aLL, aSS, tLL):
    k = ((aLL / aSS) - 1) / tLL
    return k

data_trial_choice['k'] = k(data_trial_choice['aLL'], data_trial_choice['aSS'], data_trial_choice['tLL']) 

In [71]:
def cleanChoiceData(data):
    output = data.loc[data['trial_duration_exact'] < 10000, :]
    return output

data_trial_choice = cleanChoiceData(data_trial_choice)
# View(data_trial_choice.tail(20)) 

# data_subject

In [72]:
data_subject = data_raw.loc[: , 
          [
               'run_id', 'subject', 'chinFirst',
               'choiceTask_amountLeftFirst', 'webcam_label', 'webcam_Id', 'webcam_fps',
               'webcam_aspectRatio', 'webcam_height', 'webcam_width', 
               'user_agent', 'device', 'browser',
               'browser_version', 'platform', 'platform_version', 
               'bonusAmount', 'bonusDelay', 'prolificID', 'age',
               'gender', 'ethnic', 'sight', 'glasses', 'degree', 'eyeshadow',
               'masquara', 'eyeliner', 'browliner', 'vertPosition', 'triedChin',
               'keptHead', 'optionalNote', 'window_width_max', 'window_height_max',
               'window_diagonal_max'
          ]
     ].drop_duplicates()

In [73]:
def merge_group_means_by_subject(data, sourceData, varName):
    if np.invert(varName in sourceData.columns): 
        print(varName + ' not in source Data!')
    else:
        if varName in data.columns:
            print(varName + ' already added!')
        else:
            grouped = sourceData.groupby(['run_id']).mean() \
                .reset_index() \
                .loc[:, ['run_id', varName]]        
            data = data.merge(grouped.loc[:, ['run_id', varName]], on=['run_id'], how='left')
    return data

In [74]:
for column in [
    'trial_duration_exact', 
    't_startTrial', 
    'window_width', 
    'window_height',
    'fps'
               ]:
    data_subject = merge_group_means_by_subject(data_subject, data_trial, column)    

In [75]:
for column in [
    'choseLL'
               ]:
    data_subject = merge_group_means_by_subject(data_subject, data_trial_choice, column)    

In [76]:
for column in [

                   'offset',
                   'precision', 
               ]:
    data_subject = merge_group_means_by_subject(data_subject, data_et, column)    

In [77]:
df = data_et_fixation.rename(columns={
                                                'x_mean': 'x_mean_fixTask',
                                                'y_mean': 'y_mean_fixTask'
                                            })  
for column in ['x_mean_fixTask', 'y_mean_fixTask']:
    data_subject = merge_group_means_by_subject(data_subject, df, column)

In [78]:
columns = [
            'chinFirst',
            'eyeshadow', 
            'masquara',
            'eyeliner',
            'browliner',
            'triedChin', 
            'keptHead',
        ]

data_subject[columns] = data_subject[columns].replace({'no': 0, 'yes': 1}) 

In [79]:
# View(data_subject)

# Add data_subject variables to data_trial variables

In [80]:
def merge_uniques_by_subject(data, sourceData, varName):
    if np.invert(varName in sourceData.columns): 
        print(varName + ' not in source Data!')
    else:
        if varName in data.columns:
            print(varName + ' already added!')
        else:
            grouped = sourceData.loc[:, 
                [
                    'run_id', varName
                ]
            ] \
            .reset_index(drop=True) \
            .drop_duplicates()     
            data = data.merge(grouped.loc[:, ['run_id', varName]], on=['run_id'], how='left')
    return data

for column in ['age', 'gender', 'ethnic', 'degree']:
    data_trial_choice = merge_uniques_by_subject(data_trial_choice, data_subject, column)
data_trial_choice

Unnamed: 0,run_id,chinFirst,trial_duration_exact,window_width,window_height,window_width_max,window_height_max,window_diagonal_max,window_diagonal,trial_index,...,trans_type_3,trans_type_4,trans_type_6,trans_type_7,payneIndex,k,age,gender,ethnic,degree
0,0,0.0,1736.0,1280.0,720.0,1280.0,720.0,1468.604780,1468.604780,266.0,...,1,1,1,0,0.000000,0.001587,1995.0,male,caucasian,college
1,0,0.0,1083.0,1280.0,720.0,1280.0,720.0,1468.604780,1468.604780,269.0,...,1,0,0,0,0.000000,0.026667,1995.0,male,caucasian,college
2,0,0.0,2113.0,1280.0,720.0,1280.0,720.0,1468.604780,1468.604780,272.0,...,0,0,0,0,1.000000,0.125000,1995.0,male,caucasian,college
3,0,0.0,3694.0,1280.0,720.0,1280.0,720.0,1468.604780,1468.604780,275.0,...,3,1,1,0,-0.333333,0.035714,1995.0,male,caucasian,college
4,0,0.0,3035.0,1280.0,720.0,1280.0,720.0,1468.604780,1468.604780,278.0,...,2,1,3,0,-0.666667,0.111111,1995.0,male,caucasian,college
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,8000,1.0,1744.0,1536.0,864.0,1536.0,864.0,1762.325736,1762.325736,368.0,...,0,0,0,0,,,1961.0,male,caucasian,college
379,8000,1.0,1335.0,1536.0,864.0,1536.0,864.0,1762.325736,1762.325736,371.0,...,0,0,0,0,,,1961.0,male,caucasian,college
380,8000,1.0,1501.0,1536.0,864.0,1536.0,864.0,1762.325736,1762.325736,374.0,...,0,0,0,0,,,1961.0,male,caucasian,college
381,8000,1.0,1797.0,1536.0,864.0,1536.0,864.0,1762.325736,1762.325736,377.0,...,0,0,0,0,,,1961.0,male,caucasian,college


# Export data

In [81]:
if not os.path.exists('./data_jupyter'):
    os.mkdir('./data_jupyter')

data_et.to_csv("data_jupyter/data_et.csv", index=False, header=True)
data_et_fixation.to_csv("data_jupyter/data_et_fixation.csv", index=False, header=True)
data_et_choice.to_csv("data_jupyter/data_et_choice.csv", index=False, header=True)

data_trial.to_csv("data_jupyter/data_trial.csv", index=False, header=True)
data_trial_fixation.to_csv("data_jupyter/data_trial_fixation.csv", index=False, header=True)
data_trial_choice.to_csv("data_jupyter/data_trial_choice.csv", index=False, header=True)

data_subject.to_csv("data_jupyter/data_subject.csv", index=False, header=True)

MatLab input

In [82]:
if not os.path.exists('./amasino_dataPrep/data_source'):
    os.mkdir('./amasino_dataPrep/data_source')

data_et_choice['fixationCounter'] = 1
data_et_choice.loc[:, 
                       [
                           'run_id', 
                           'withinTaskIndex', 
                           'x', 
                           'y', 
                           't_task', 
                           'window_width', 
                           'window_height',
                       ]
                  ] \
   .to_csv("amasino_dataPrep/data_source/schneegansEtAl_ET.csv", index=False, header=False)

In [83]:
data_trial_choice.loc[:, 
                          [
                              'run_id', 
                              'aSS', 
                              'aLL', 
                              'tSS', 
                              'tLL', 
                              'choseLL', 
                              'trial_duration_exact', 
                              'LL_top',
                              'choseTop'
                          ]
                     ] \
    .to_csv("amasino_dataPrep/data_source/schneegansEtAl_behavior.csv", index=False, header=False)

In [84]:
data_trial_choice.loc[:, 
                       [
                           'run_id', 
                           'withinTaskIndex', 
                           'optionIndex', 
                           'attributeIndex', 
                           'payneIndex'
                       ]
                  ] \
    .fillna(0) \
    .to_csv("amasino_dataPrep/intermediateCSVs/ET_indices.csv", index=False, header=False)

In [85]:
data_subject.loc[:, ['run_id', 'choseLL']] \
    .to_csv("amasino_dataPrep/intermediateCSVs/percLeft.csv", index=False, header=False)