In [1]:
# import sys
# !conda install --yes --prefix {sys.prefix} pingouin

In [2]:
import numpy as np
import math
import matplotlib.pyplot as plt
import os
import pandas as pd
import pingouin as pg
import re
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm 
import statsmodels.formula.api as smf
import statsmodels.graphics.api as smg
import sys

if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO
    
from IPython.display import HTML
def View(df):
    css = """<style>
    table { border-collapse: collapse; border: 3px solid #eee; }
    table tr th:first-child { background-color: #eeeeee; color: #333; font-weight: bold }
    table thead th { background-color: #eee; color: #000; }
    tr, th, td { border: 1px solid #ccc; border-width: 1px 0 0 1px; border-collapse: collapse;
    padding: 3px; font-family: monospace; font-size: 10px }</style>
    """
    s  = '<script type="text/Javascript">'
    s += 'var win = window.open("", "Title", "toolbar=no, location=no, directories=no, status=no, menubar=no, scrollbars=yes, resizable=yes, width=780, height=200, top="+(screen.height-400)+", left="+(screen.width-840));'
    s += 'win.document.body.innerHTML = \'' + (df.to_html() + css).replace("\n",'\\') + '\';'
    s += '</script>'
    return(HTML(s+css))    
    
print("Current Working directory " , os.getcwd())

Current Working directory  C:\Users\User\Desktop\old_analysis


# data_raw

In [3]:
def compileData(path):
    subject_files = os.listdir(path)
    all_subjects = []
    for i in range(0, len(subject_files)):
        rawString = open(path + "/" + subject_files[i]).read()
        string_noHTML = cleanhtml(rawString)
        string_cleanSurvey = cleanSurveyForm(string_noHTML)
        all_subjects.append(pd.read_csv(StringIO(string_cleanSurvey)))
    output = pd.concat(all_subjects).reset_index(drop=True)
    return output

In [4]:
def cleanhtml(raw_html):
    # https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
    cleanr = re.compile('<.*?>')
    cleanText = re.sub(cleanr, '', raw_html)
    return cleanText

def cleanSurveyForm(string):
    SurveySection = re.findall(re.compile('{.*?}'), string)
    cleanText = string
    for i in range(0,len(SurveySection)):
        cleanSection = re.sub(",", ";", SurveySection[i])
        cleanText = re.sub(SurveySection[i], cleanSection, cleanText)
    return cleanText

In [5]:
def convertToNumeric(data, columns):
    data[columns] = data[columns].apply(pd.to_numeric, errors='coerce')
    return data

In [6]:
def addWindowSize(data):
    output = data
    if "window_width_max" in data.columns:
        print('window width_max already added')
    else:
        grouped = data.groupby(["run_id", "subject"])["window_width", "window_height"].max().reset_index()
        grouped.columns = ["run_id", "subject", "window_width_max", "window_height_max"]
        grouped['window_diagonal_max'] = np.sqrt(grouped['window_width_max']**2 + grouped['window_height_max']**2)
        output = data.merge(grouped, 
                            on=['run_id', "subject"],
                            how='left')
    return output

## init data_raw

In [7]:
data_raw = compileData("data")
View(data_raw.head(20))
data_raw = convertToNumeric(data_raw, 
                            [
                                'run_id', 'subject', # Subject info
                                'trial_index', 'chin', # Trial input
                                'key_press',
                                'option_topLeft', 'option_bottomLeft', 'option_topRight', 'option_bottomRight', # Eyetracking output
                                'task_nr', 'x_pos', 'y_pos', # Trial input
                                'rt', 'time_elapsed', 'trial_duration',
                                'window_width', 'window_height', 
#                                 'webcam_fps', 'webcam_aspectRatio', 'webcam_height', 'webcam_width', # Webcam
#                                 't_StartEyeTrackingData', 
                                'fps', 'devAvg', 'devPercAvg', # Eyetracking output
                                ]
                           )
data_raw = addWindowSize(data_raw)
data_raw['window_diagonal'] = np.sqrt(data_raw['window_width']**2 + data_raw['window_height']**2)

  


# data_et

In [8]:
def extractEyetrackingData(data):
    data_eyetracking = pd.DataFrame(columns=['type', 'index', 'x', 'y', 'px', 'py', 't', 'click']) 
    data["et_data"] = data["et_data"].apply(str)
    for i in range(0, len(data)):
        if ((data.et_data[i] != '"') & (data.et_data[i] != "nan")):
            df = pd.read_csv(StringIO(data.et_data[i]), sep="$") 
            df_noClick = df[(df["click"]==0)].reset_index()
            df["run_id"] = data.run_id[i] # Subject identifier
            df["subject"] = data.subject[i]
            df["chinFirst"] = data.chinFirst[i]
            df["task_nr"] = data.task_nr[i] # Task characteristics
            df['trial_index'] = data.trial_index[i]
            df["chin"] = data.chin[i]
            df["x_pos"] = data.x_pos[i]
            df["y_pos"] = data.y_pos[i]
            df['option_topLeft'] = data.option_topLeft[i]
            df['option_topRight'] = data.option_topRight[i]
            df['option_bottomLeft'] = data.option_bottomLeft[i]
            df['option_bottomRight'] = data.option_bottomRight[i]
            df["tTask"] = (df["t"] - df["t"][0]) # ET-coordinate time variable 
            df["key_press"] = data.key_press[i] # ET-Trial summary data
            df["fps"] = data.fps[i]
            df["devPercAvg"] = data.devPercAvg[i]
            df['rt'] = data.rt[i]
            df['window_width'] = data.window_width[i]
            df['window_height'] = data.window_height[i]
            df['window_diagonal'] = data.window_diagonal_max[i]
            df['window_diagonal_max'] = data.window_diagonal_max[i]
            
            data_eyetracking = data_eyetracking.append(pd.DataFrame(data = df), ignore_index=True)  
            
    data_eyetracking = data_eyetracking.rename(columns={"type": "trial_type", "tTask": "t_task", "t": "t_wg"})

    return(data_eyetracking)

In [9]:
def convertToFactor(data, columns):
    stacked = data[columns].stack()
    data[columns] = pd.Series(stacked.factorize()[0], index=stacked.index).unstack()
    return data

In [10]:
def mergeByTrial(data, sourceData, varName):
    output = data
    if varName in data.columns:
        print(varName + ' already added!')
    else:
        grouped = sourceData.groupby(['run_id', 'subject', 'trial_index']).mean() \
            .reset_index() \
            .loc[:, 
                    [
                        'run_id', 'subject', 'trial_index', varName
                    ]
                ]        
        output = output.merge(grouped, 
                            on=['run_id', 'subject', 'trial_index'], 
                            how='left')
    return output

In [11]:
def addXCount(data):
    if 'count' in data_et.columns:
        print('Count already added!')
        data_output = data
    else: 
        grouped = pd.DataFrame(data.groupby(["run_id", "subject", "trial_index"])["x"].count()) \
            .reset_index() \
            .rename(columns={'x': 'count'})
#         idx = pd.IndexSlice
#         descr = grouped[["run_id", "subject", "index"]]
#         count = grouped.loc[:, ["x"]].loc[:,idx[:,'count']]
#         time = grouped.loc[:, ["t_task"]].loc[:,idx[:,'max']]
#         grouped = pd.concat([descr.reset_index(drop=True), 
#                              count.reset_index(drop=True), 
#                              time.reset_index(drop=True)], axis=1)

        data_output = data.merge(grouped.loc[:, ["run_id", "subject", "trial_index", 'count']], 
                                 on=["run_id", "subject", "trial_index"], 
                                 how='left')
    return(data_output)

In [12]:
def addMeans(data):
    if ('x_mean' in data.columns):
        print('X_mean already added!')
        data_output = data
    elif ('y_mean' in data.columns):
        print('Y_mean already added!')
        data_output = data
    else: 
        grouped = data.loc[:, ['run_id', 'subject', 'trial_index', 'x', 'y']] \
                             .groupby(['run_id', 'subject', 'trial_index']).mean() \
                             .rename(columns={'x': 'x_mean', 
                                              'y': 'y_mean'})
        data_output = data.merge(grouped,
                                 on=['run_id', 'subject', 'trial_index'],
                                 how='left')
    return data_output

In [13]:
def euclideanDistance(x, x_target, y, y_target):
    x_diff = x - x_target
    y_diff = y - y_target
    euclideanDistance = np.sqrt(x_diff**2 + y_diff**2)
    return(euclideanDistance)

## init data_et
[Back to Navigation](#Navigation)

In [14]:
data_et = extractEyetrackingData(data_raw)

data_et = convertToNumeric(data_et,
                        ['x', 'y', 'px', 'py', 't_wg', 't_task', # Float 
                         'x_pos', 'y_pos', 'fps', 'devPercAvg', 
                         'run_id', 'subject', 'chinFirst', 'index', 
                         'task_nr','click', 'option_topLeft', 'option_topRight', # Int
                         'option_bottomLeft', 'option_bottomRight', 'chin', 'key_press'])
data_et = mergeByTrial(data_et, data_raw, 'window_width_max')
data_et = mergeByTrial(data_et, data_raw, 'window_height_max')
data_et = addXCount(data_et)
data_et = addMeans(data_et)

data_et["offset"] = euclideanDistance(data_et["x"], 
                                      data_et["px"], 
                                      data_et["y"], 
                                      data_et["py"])

data_et["offset_perc"] = 100 * data_et["offset"] / data_et['window_height_max'] 
data_et

Unnamed: 0,trial_type,index,x,y,px,py,t_wg,click,run_id,subject,...,window_height,window_diagonal,window_diagonal_max,window_width_max,window_height_max,count,x_mean,y_mean,offset,offset_perc
0,eyetracking-calibration,21,375,618,256.0,576.0,68310.775,1,16.0,473.0,...,720.0,1468.604780,1468.604780,1280.0,720.0,47,266.234043,590.446809,126.194295,17.526985
1,eyetracking-calibration,21,228,546,256.0,576.0,68464.775,1,16.0,473.0,...,720.0,1468.604780,1468.604780,1280.0,720.0,47,266.234043,590.446809,41.036569,5.699523
2,eyetracking-calibration,21,320,595,256.0,576.0,68601.775,1,16.0,473.0,...,720.0,1468.604780,1468.604780,1280.0,720.0,47,266.234043,590.446809,66.760767,9.272329
3,eyetracking-calibration,21,271,613,256.0,576.0,68791.775,1,16.0,473.0,...,720.0,1468.604780,1468.604780,1280.0,720.0,47,266.234043,590.446809,39.924930,5.545129
4,eyetracking-calibration,21,265,598,256.0,576.0,68893.775,1,16.0,473.0,...,720.0,1468.604780,1468.604780,1280.0,720.0,47,266.234043,590.446809,23.769729,3.301351
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58805,eyetracking-choice,495,572,393,,,1429213.000,0,20.0,168.0,...,1152.0,2349.767648,2349.767648,2048.0,1152.0,42,861.785714,371.095238,,
58806,eyetracking-choice,495,504,312,,,1429334.000,0,20.0,168.0,...,1152.0,2349.767648,2349.767648,2048.0,1152.0,42,861.785714,371.095238,,
58807,eyetracking-choice,495,480,256,,,1429375.000,0,20.0,168.0,...,1152.0,2349.767648,2349.767648,2048.0,1152.0,42,861.785714,371.095238,,
58808,eyetracking-choice,495,757,807,,,1429460.000,0,20.0,168.0,...,1152.0,2349.767648,2349.767648,2048.0,1152.0,42,861.785714,371.095238,,


## data_et_calibration
[Back to Navigation](#Navigation)

In [15]:
data_et_calibration = data_et.loc[(
                                (data_et["x"]>-1) & 
                                (data_et["y"]>-1) & 
                                (data_et["x"]<data_et['window_width_max']) & 
                                (data_et["y"]<data_et['window_height_max']) & 
                                (data_et["trial_type"]=="eyetracking-calibration")
                                                    ), 
                                [
                                    'run_id', 'subject', 'chinFirst', 'trial_index', 'index',
                                    'task_nr', 'chin', 
                                    'x_pos', 'y_pos', 'px', 'py', 'x', 'y', 
                                    'fps',
                                    'window_width_max', 'window_height_max', 'offset', 'offset_perc'
                                ]
]

## data_et_fixTask
[Back to Navigation](#Navigation)

## Filter

In [16]:
data_et_fixation = data_et.loc[(
                                (data_et["x"]>-1) & 
                                (data_et["y"]>-1) & 
                                (data_et["x"]<data_et['window_width_max']) & 
                                (data_et["y"]<data_et['window_height_max']) & 
                                (data_et["trial_type"]=="eyetracking-fix-object") &
                                (pd.notna(data_et['x_pos'])) &
                                (pd.notna(data_et['y_pos'])) &
                                ~(
                                    ((data_et["chinFirst"]==1) & (data_et["task_nr"]==2)) |
                                    ((data_et["chinFirst"]==0) & (data_et["task_nr"]==3))
                                 ) 
                                                    ), 
                                [
                                    'run_id', 'subject', 'chinFirst', 'trial_index', 'index',
                                    'task_nr', 'chin', 
                                    'x_pos', 'y_pos', 'px', 'py', 'x', 'y', 
                                    'fps',
                                    'window_width_max', 'window_height_max', 'offset', 'offset_perc'
                                ]
]

## data_et_choice

In [17]:
def filterChoiceData(data):
    output = data.loc[(data["trial_type"]=="eyetracking-choice"), 
    ['run_id', 'subject', 'chinFirst', 'index', 'trial_index', 'task_nr',
       'option_topLeft', 'option_topRight', 'option_bottomLeft', 'option_bottomRight', 
       'x', 'y', "window_width", "window_height", "window_width_max", "window_height_max",
       't_wg', 't_task', 'key_press']]
    return output

In [18]:
def lookDirections(data):
    data["look_left"] = (data["x"] < data["window_width"]*0.5).astype(int)
    data["look_top"] = (data["y"] < data["window_height"]*0.5).astype(int)
    return data

In [19]:
def addChoiceVariables(data):
    data['ssOption_top'] = 1
    data['ssOption_top'] = \
        (data['option_topLeft'] < data['option_bottomLeft']) \
        .astype(int)
    print('ssOption_top: ' + str(data['ssOption_top'].unique()))

    data['chose_top'] = 0
    data['chose_top'] = (data['key_press'] == 90).astype(int)
    print('chose_top: ' + str(data['chose_top'].unique()))

    data['chose_ssOption'] = 0
    data['chose_ssOption'] = (data['chose_top'] == 
                                       data['ssOption_top']).astype(int)
    print('chose_ssOption: ' + str(data['chose_ssOption'].unique()))
    return data

In [20]:
def addChoiceIndex(data, newIndexName): 
    if newIndexName in data.columns:
        print(newIndexName + ' already added!')
        data_output = data
    else:
        trial_indices = pd.DataFrame(
                data.loc[:, ["run_id", "subject", "trial_index"]].drop_duplicates()
            ).reset_index()

        newIndices = []
        for subject in trial_indices["run_id"].unique():
            trial_indices_thisSubject = trial_indices.loc[
                (trial_indices["run_id"]==subject), 
                ["run_id", "subject", "trial_index"]] \
                .reset_index()
            trial_indices_thisSubject[newIndexName] = trial_indices_thisSubject.index + 1
            trial_indices_thisSubject = trial_indices_thisSubject[["run_id", "subject", "trial_index", newIndexName]]
            newIndices.append(trial_indices_thisSubject)
        trial_indices = pd.concat(newIndices).reset_index()
        data_output = data.merge(trial_indices[["run_id", "subject", "trial_index", newIndexName]], 
                                                  on=["run_id", "subject", "trial_index"], how="left")
        data_output[newIndexName] = data_output[newIndexName].astype(int)
    return data_output

In [21]:
# w = screen_width, h = screen_height
def aois_centers(w, h):
    return pd.DataFrame([
            [((0.05+0.9*0.2) * w), (0.25 * h), w, h],
            [((0.05+0.9*0.8) * w), (0.25 * h), w, h],
            [((0.05+0.9*0.2) * w), (0.75 * h), w, h],
            [((0.05+0.9*0.8) * w), (0.75 * h), w, h]
        ], columns = ['x', 'y', 'width', 'height'], index=['TL', 'TR', 'BL', 'BR'])

def assign_aoi(data, subject, aoi_set, aoi):
        data.loc[(
                        (data['run_id'] == subject) & 
                        (data.loc[:, 'x'] > (aoi_set.loc[aoi, 'x'] - aoi_set.loc[aoi, 'width'] * 0.175)) & \
                        (data.loc[:, 'x'] < (aoi_set.loc[aoi, 'x'] + aoi_set.loc[aoi, 'width'] * 0.175)) & \
                        (data.loc[:, 'y'] > (aoi_set.loc[aoi, 'y'] - aoi_set.loc[aoi, 'height'] * 0.175)) & \
                        (data.loc[:, 'y'] < (aoi_set.loc[aoi, 'y'] + aoi_set.loc[aoi, 'height'] * 0.175))
                    ), 'aoi'] = aoi        
        return data

In [22]:
def addAOIs(data):

    for subject in data['run_id'].unique(): 
        w = data.loc[(data['run_id']==subject), 'window_width_max'].unique().max()
        h = data.loc[(data['run_id']==subject), 'window_height_max'].unique().max()
        aoi_set = aois_centers(w, h)

        for aoi in aoi_set.index:
            data = assign_aoi(data, subject, aoi_set, aoi)
        
    return data

In [23]:
def createAOIColumns(data):
# If the gaze point is in the top option
    data_output = data
    data_output.loc[(
                        (data_output['aoi']=='TL') &
                        data_output['ssOption_top']
                       ), 'aoi_amount_SS'] = 1
    data_output.loc[(
                        (data_output['aoi']=='TL') &
                        ~data_output['ssOption_top']
                       ), 'aoi_amount_LL'] = 1
    data_output.loc[(
                        (data_output['aoi']=='TR') &
                        data_output['ssOption_top']
                       ), 'aoi_delay_SS'] = 1
    data_output.loc[(
                        (data_output['aoi']=='TR') &
                        ~data_output['ssOption_top']
                       ), 'aoi_delay_LL'] = 1

    # If the gaze point is in the bottom option
    data_output.loc[(
                        (data_output['aoi']=='BL') &
                        data_output['ssOption_top']
                       ), 'aoi_amount_LL'] = 1
    data_output.loc[(
                        (data_output['aoi']=='BL') &
                        ~data_output['ssOption_top']
                       ), 'aoi_amount_SS'] = 1
    data_output.loc[(
                        (data_output['aoi']=='BR') &
                        data_output['ssOption_top']
                       ), 'aoi_delay_LL'] = 1
    data_output.loc[(
                        (data_output['aoi']=='BR') &
                        ~data_output['ssOption_top']
                       ), 'aoi_delay_SS'] = 1
    return data_output

In [24]:
def add_transition_type(data):
    data = data[(pd.notna(data['aoi']))]
    data['newAOIIndex'] = 0
    data.loc[(data['aoi_amount_LL']==1), 'newAOIIndex'] = 1
    data.loc[(data['aoi_delay_LL']==1), 'newAOIIndex'] = 2
    data.loc[(data['aoi_amount_SS']==1), 'newAOIIndex'] = 4
    data.loc[(data['aoi_delay_SS']==1), 'newAOIIndex'] = 8
    data.sort_values(by=['run_id', 'choiceNr'])
    # Add a 0 due to the way np.diff works
    data['transition_type'] = np.append(np.diff(data['newAOIIndex']), [0])
    data['transition_type'] = abs(data['transition_type']) 
    return(data)

def cleanTransitions(data):
    indices = []
    for subject in data['run_id'].unique():
        for trial in data['choiceNr'].unique():
            df_thisTrial = data.loc[(
                                    (data['run_id'] == subject) &
                                    (data['choiceNr'] == trial)
                                ), 'transition_type']
            indices.append(df_thisTrial.index.max())
    # last gaze point of each trial
    data.loc[indices, 'transition_type'] = 0 
    return(data)

## init data_et_choice
[Back to Navigation](#Navigation)

In [25]:
data_et_choice = filterChoiceData(data_et)
data_et_choice = lookDirections(data_et_choice)
data_et_choice = addChoiceVariables(data_et_choice)
data_et_choice = addChoiceIndex(data_et_choice, 'choiceNr')
data_et_choice = addAOIs(data_et_choice)
data_et_choice = createAOIColumns(data_et_choice)
data_et_choice = add_transition_type(data_et_choice)
data_et_choice = cleanTransitions(data_et_choice)

ssOption_top: [0 1]
chose_top: [1 0]
chose_ssOption: [0 1]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from 

## data_trial
[Back to Navigation](#Navigation)

In [26]:
def selectTrialData(data):
    output = data.loc[:, 
                          [
                              'run_id', 'subject', 'chinFirst',  # subject aspects
                              'chin', 'stimulus', 'x_pos', 'y_pos', # subject input
                              'task_nr', 'trial_type', 'trial_index', # trial characteristics
                              'time_elapsed', 'trial_duration', 'recorded_at', # trial time 
                              'rt', 'responses', 'key_press', # subject behavior measures
                              'devAvg', 'devPercAvg', # eyetracking measures
                              'option_topLeft', 'option_bottomLeft',
                              'option_topRight', 'option_bottomRight', 
                              'window_width', 'window_height', 'window_diagonal',
                              'window_width_max', 'window_height_max', 'window_diagonal_max' # other measures
                          ] 
                     ]
    return output

In [27]:
def checkDeviation(data, column1, column2, maxTimeDiffAllowed):
    diff = data[column1] - data['trial_duration_exact']
    longtrials_runID = data.loc[diff[diff > maxTimeDiffAllowed].index, 'run_id']
    longtrials_previousrunID = pd.DataFrame(data.loc[diff[diff > maxTimeDiffAllowed].index-1, 'run_id']) \
        .rename(columns={'run_id':'previous_run_id'})
    longtrials_previousrunID.index = longtrials_runID.index
    compare_runIDs = pd.concat([longtrials_runID, longtrials_previousrunID], axis=1)

    if sum(compare_runIDs['run_id'] == compare_runIDs['previous_run_id']) > 0: 
        print(column1 + ' and ' + column2 + ' show a deviation of ' +
              '>' + str(maxTimeDiffAllowed) + 
              ' ms. Please check on the following indices: \n')
        print(compare_runIDs.loc[(compare_runIDs['run_id'] == compare_runIDs['previous_run_id']), :].index)

    else:
        print(column1 + ' and ' + column2 + ' do not deviate by ' +
              '>' + str(maxTimeDiffAllowed) + 'ms.')

In [28]:
def exactTrialDuration(data):
    output = data
    output["t_startTrial"] = pd.concat([pd.Series([0]), output["time_elapsed"]], ignore_index=True)
    output["trial_duration_exact"] = output.loc[:, ("time_elapsed")] - output.loc[:, ("t_startTrial")]
    output.drop(len(output)-1)
    
    checkDeviation(data, 'rt', 'trial_duration_exact', 50)
    checkDeviation(data, 'trial_duration', 'trial_duration_exact', 50)
    
    return output

In [29]:
def addXYMean(data, data_et):
    if ('x_mean' in data_trial.columns) & ('y_mean' in data_trial.columns):
        print('x_mean and y_mean already in columns')
        output = data
    else:
        grouped = data_et.loc[:, 
                                 [
                                     'run_id', 'subject', 'trial_index', 
                                     'x', 'y', 'count'
                                 ]
                             ] \
                     .groupby(["run_id", "subject", 'trial_index']).mean() \
                     .rename(columns={'x': 'x_mean',
                                      'y': 'y_mean'}) \
   
        output = data.merge(grouped,
                            on=['run_id', 'subject', 'trial_index'], 
                            how='left') 
    return output 

In [30]:
def addFPS(data):
    if 'fps' in data.columns:
        print('fps already added!')
    else: 
#         grouped = pd.DataFrame(
#                             data.groupby(["run_id", "subject", "index"])["x", 'trial_duration_exact'].describe()
#                            ).reset_index()
#         idx = pd.IndexSlice
#         descr = grouped[["run_id", "subject", "index"]]
#         count = grouped.loc[:, ["x"]].loc[:,idx[:,'count']]
#         time = grouped.loc[:, ["t_task"]].loc[:,idx[:,'max']]
#         grouped = pd.concat([descr.reset_index(drop=True), count.reset_index(drop=True), time.reset_index(drop=True)], axis=1)
        data["fps"] = 1000 * data["count"] / data["trial_duration_exact"]
    return(data)                               

## init data_trial

In [31]:
data_trial = selectTrialData(data_raw)
data_trial = convertToNumeric(data_trial,
                              ['run_id', 'subject', 'chinFirst', 'chin', 'task_nr', 'trial_index', # Int
                               'key_press', 
                               'option_topLeft', 'option_bottomLeft', 'option_topRight', # Float
                               'x_pos', 'y_pos', 'time_elapsed', 'trial_duration',
                               'rt', 'devAvg', 'devPercAvg',  
                               'window_width', 'window_width_max', 'window_height', 'window_height_max'])
data_trial = exactTrialDuration(data_trial)
data_trial = addXYMean(data_trial, data_et)
data_trial = addFPS(data_trial)  
data_trial

rt and trial_duration_exact do not deviate by >50ms.
trial_duration and trial_duration_exact do not deviate by >50ms.


Unnamed: 0,run_id,subject,chinFirst,chin,stimulus,x_pos,y_pos,task_nr,trial_type,trial_index,...,window_diagonal,window_width_max,window_height_max,window_diagonal_max,t_startTrial,trial_duration_exact,x_mean,y_mean,count,fps
0,16,473,1,,"Welcome, dear participant! Thank you for your ...",,,,html-keyboard-response,0,...,1409.851411,1280.0,720.0,1468.604780,0,9406,,,,
1,16,473,1,,"""",,,,survey-html-form,1,...,,1280.0,720.0,1468.604780,9406,15688,,,,
2,16,473,1,,"""",,,,survey-html-form,2,...,,1280.0,720.0,1468.604780,25094,24656,,,,
3,16,473,1,,"""",,,,survey-html-form,3,...,,1280.0,720.0,1468.604780,49750,10752,,,,
4,16,473,1,,Please turn your mobile phone into Airplane Mo...,,,,html-button-response,4,...,,1280.0,720.0,1468.604780,60502,1522,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2007,20,168,0,,"""",,,,html-keyboard-response,498,...,,2048.0,1152.0,2349.767648,1798986,119,,,,
2008,20,168,0,,"""",,,,html-keyboard-response,499,...,,2048.0,1152.0,2349.767648,1799105,118,,,,
2009,20,168,0,,"""",,,,fullscreen,500,...,,2048.0,1152.0,2349.767648,1799223,1009,,,,
2010,20,168,0,,"""",,,,survey-html-form,501,...,,2048.0,1152.0,2349.767648,1800232,9782,,,,


## data_trial_fixation

In [32]:
def selectfixTaskData(data):
    output = data.loc[
                      (data_trial['trial_type'] == 'eyetracking-fix-object') & 
                      (pd.notna(data_trial['x_pos'])) &
                      (pd.notna(data_trial['y_pos'])) &
                      np.invert(data_trial['task_nr'] == 3), 
                      [
                          'run_id', 'subject', 'chinFirst', # Subject specific
                          'chin', 'x_pos', 'y_pos', 'task_nr', 
                          'trial_index', 'time_elapsed',
                          'window_width', 'window_height', 'window_width_max', 'window_height_max',  
                          'window_diagonal', 'window_diagonal_max',
                          't_startTrial', 'trial_duration_exact', 'x_mean', 'y_mean', 'fps'
                      ], 
                     ] \
        .reset_index(drop=True) 
    return output

In [33]:
def addFixationNr(data): 
    if 'fixation_nr' in data.columns:
        print('fixation_nr' + ' already added!')
        data_output = data
    else:
        trial_indices = pd.DataFrame(
                data.loc[:, ["run_id", "subject", "trial_index", 'chin']].drop_duplicates()
            ).reset_index()
        newIndices = []
        for subject in trial_indices["run_id"].unique():
            for chin in trial_indices['chin'].unique():
                theseIndices = trial_indices.loc[
                    (trial_indices["run_id"]==subject) &
                    (trial_indices["chin"]==chin), 
                    ["run_id", "subject", "trial_index", "chin"]] \
                    .reset_index()
                theseIndices['fixation_nr'] = theseIndices.index + 1
                theseIndices = theseIndices[["run_id", "subject", "trial_index", "chin", 'fixation_nr']]
                newIndices.append(theseIndices)
        trial_indices = pd.concat(newIndices).reset_index()
        data_output = data.merge(trial_indices[["run_id", "subject", "trial_index", 'chin', 'fixation_nr']], 
                                                  on=["run_id", "subject", "trial_index", 'chin'], how="left")
    return data_output


In [34]:
def multiply(x):
    return x*10

def addPositionIndex(data):
    output = data
        
    if 'positionIndex' in data.columns: 
        print('positionIndex already there!')
    else: 
        output['positionIndex'] = data.loc[:, ['x_pos', 'y_pos']] \
            .apply(multiply) \
            .astype(int) \
            .astype(str) \
            .apply(''.join, 1)
        output['positionIndex'] = output.replace(
            np.sort(output['positionIndex'].unique()), 
            range(0, len(output['positionIndex'].unique())))
        output['positionIndex'] = output['positionIndex'].astype(int)
    return output
    

### init data_trial_fixation
[Back to Navigation](#Navigation)

In [35]:
data_trial_fixation = selectfixTaskData(data_trial)
data_trial_fixation = addFixationNr(data_trial_fixation.loc[(data_trial_fixation['trial_duration_exact'] > 4500), :])
data_trial_fixation = addPositionIndex(data_trial_fixation)

### Data quality outcomes

In [36]:
def getVariableFromETData(data, data_et, varName):
    if varName in data.columns:
        print(varName + ' already added!')
        output = data
    else: 
        grouped = data_et.groupby(['run_id', 'subject', 'trial_index']).mean() \
            .reset_index() \
            .loc[:, 
                    [
                        'run_id', 'subject', 'trial_index', 
                        varName
                    ]
                ]
        output = data.merge(grouped, 
                            on=['run_id', 'subject', 'trial_index'], 
                            how='left')
    return output

In [37]:
def addPrecision(data_trial, data_et):
    if 'precision' in data_trial.columns: 
        print('Precision already added!')
        data_output = data_trial
    else:
        data_et['deviationFromAVG'] = euclideanDistance(data_et['x'],
                                                             data_et['x_mean'],
                                                             data_et['y'],
                                                             data_et['y_mean']
                                                       )
        data_et['deviationFromAVG_square'] = np.power(data_et['deviationFromAVG'], 2)

        
        grouped = data_et.groupby(['run_id', 'subject', 'trial_index']).mean() \
                         .reset_index()         
        grouped['precision'] = np.sqrt(grouped['deviationFromAVG_square'])
        
        data_output = data_trial.merge(grouped.loc[:, 
                                                      [
                                                          'run_id', 'subject', 'trial_index', 
                                                          'precision'
                                                      ]
                                                  ],
                                       on=['run_id', 'subject', 'trial_index'],
                                       how='left'
                                      )
                                
    return data_output

In [38]:
data_trial_fixation = getVariableFromETData(data_trial_fixation, data_et_fixTask, 'offset')
data_trial_fixation = getVariableFromETData(data_trial_fixation, data_et_fixTask, 'offset_perc')
data_trial_fixation = addPrecision(data_trial_fixation, data_et)
data_trial_fixation['precision_perc'] = 100 * data_trial_fixation['precision'] / data_trial_fixation['window_height']
data_trial_fixation.head(10)

NameError: name 'data_et_fixTask' is not defined

## data_trial_choice
[Back to Navigation](#Navigation)

In [None]:
def selectTrialChoiceData(data):
    output = data.loc[
        data_trial["trial_type"] == "eyetracking-choice", 
        ['run_id', 'subject', 'chinFirst', # Subject info  
         'chin', 'trial_index', # Trial condition
         'option_topLeft', 'option_bottomLeft', 'option_topRight', 'option_bottomRight', 
         'window_width', 'window_height', 'window_width_max', 'window_height_max', 
         'window_diagonal', 'window_diagonal_max',
         'time_elapsed', 'trial_duration_exact', 'recorded_at', 'rt', # Trial response
         'key_press', 'fps']]
    return output

In [None]:
def addChoiceVariables(data):
    data.loc[:, 'LL_top'] = (data["option_topLeft"] > data["option_bottomLeft"]).astype(int)
    data.loc[:, "aLL"] = data[["option_topLeft", "option_bottomLeft"]].values.max(1)
    data.loc[:, "aSS"] = data[["option_topLeft", "option_bottomLeft"]].values.min(1)
    data.loc[:, "tSS"] = 0 
    data.loc[:, "tLL"] = data[["option_topRight", "option_bottomRight"]].values.max(1)
    data.loc[(data["key_press"]==90), "choseTop"] = 1
    data.loc[(data["key_press"]==66), "choseTop"] = 0
    data.loc[: , "choseLL"] = 0
    data.loc[(
                             (data["choseTop"]==1) & 
                             (data["aLL"] == data["option_topLeft"])
                          ), "choseLL"] = 1
    return(data)

In [None]:
def addOptionIndex(data):
    if "optionIndex" in data.columns:
        print("Option Index already added!")
        data_output = data
    else:
        grouped = data_et_choice.groupby(['run_id', 'subject', 'choiceNr', 'trial_index']) \
            ['aoi', 'aoi_amount_SS', 'aoi_amount_LL', 'aoi_delay_SS', 'aoi_delay_LL'].count() \
            .reset_index() \
            .rename(columns={"aoi": "count"})
        grouped['gazePoints_immediate'] = (grouped['aoi_amount_SS'] + grouped['aoi_delay_SS'])
        grouped['gazePoints_delay'] = (grouped['aoi_amount_LL'] + grouped['aoi_delay_LL'])
        grouped['optionIndex'] = (grouped['gazePoints_immediate'] - grouped['gazePoints_delay']) / \
                                 (grouped['gazePoints_immediate'] + grouped['gazePoints_delay'])
        data_output = data.merge(grouped[['run_id', 'subject', 'choiceNr', 'trial_index', 'optionIndex']], 
                                          on=['run_id', 'subject', 'choiceNr', 'trial_index'])
    return(data_output)

def addAttributeIndex(data):
    if "attributeIndex" in data.columns:
        print('Attribute Index already added!')
        data_output = data
    else:
        grouped = data_et_choice.groupby(['run_id', 'subject', 'choiceNr', 'trial_index']) \
            ['aoi', 'aoi_amount_SS', 'aoi_amount_LL', 'aoi_delay_SS', 'aoi_delay_LL'].count() \
            .reset_index() \
            .rename(columns={"aoi": "count"})
        grouped['gazePoints_amount'] = (grouped['aoi_amount_LL'] + grouped['aoi_amount_SS'])
        grouped['gazePoints_time'] = (grouped['aoi_delay_LL'] + grouped['aoi_delay_SS'])
        grouped['attributeIndex'] = (grouped['gazePoints_amount'] - grouped['gazePoints_time']) / \
                                 (grouped['gazePoints_amount'] + grouped['gazePoints_time'])

        data_output = data.merge(grouped[['run_id', 'subject', 'choiceNr', 'trial_index', 'attributeIndex']], 
                                          on=['run_id', 'subject', 'choiceNr', 'trial_index'])
    return(data_output)

In [None]:
def addTransition_type(data_trial, data_et):
    transition_count = pd.pivot_table(data_et.loc[:, ['run_id', 'subject', 'choiceNr', 'transition_type']], 
                        index = ['run_id', 'subject', 'choiceNr'],
                        columns = ['transition_type'], 
                        aggfunc = len,
                        fill_value = 0
                        ).reset_index()
    transition_count.columns = ['run_id', 'subject', 'choiceNr', 
                    "trans_type_0",
                    "trans_type_1",
                    "trans_type_2",
                    "trans_type_3",
                    "trans_type_4",
                    "trans_type_6",
                    "trans_type_7"]

    if "trans_type_0" in data_trial:
        print("Transitions already added!")
        data_trial = data_trial
    else:
        data_trial = data_trial.merge(transition_count, on=['run_id', 'subject', 'choiceNr']) 
    return(data_trial)

In [None]:
def addPayneIndex(data):
    if "payneIndex" in data.columns:
        print("PayneIndex already added!")
    else: 
        # Option-wise: amount_LL-delay_LL = 1; amount_SS - delay_SS = 4
        # Attribute-wise: amount_LL-amount_SS = 3; delay_LL - delay_SS = 6
        # Cross: amount_LL-delay_SS = 7; delay_LL - amount_SS = 2
        optionWise_transition = data.loc[:, 'trans_type_1'] + data.loc[:, 'trans_type_4']
        attributeWise_transition = data.loc[:, 'trans_type_3'] + data.loc[:, 'trans_type_6']  
        data['payneIndex'] = (optionWise_transition - attributeWise_transition) / \
            (optionWise_transition + attributeWise_transition)      
    return(data)

In [None]:
def k(aLL, aSS, tLL):
    k = ((aLL / aSS) - 1) / tLL
    return k


###  init data_trial_choice 
[Back to Navigation](#Navigation)

In [None]:
data_trial_choice = selectTrialChoiceData(data_trial)
data_trial_choice = addChoiceVariables(data_trial_choice)
data_trial_choice = getVariableFromETData(data_trial_choice, data_et_choice, 'choiceNr')
data_trial_choice = addOptionIndex(data_trial_choice)
data_trial_choice = addAttributeIndex(data_trial_choice)
data_trial_choice = addTransition_type(data_trial_choice, data_et_choice)
data_trial_choice = addPayneIndex(data_trial_choice)
data_trial_choice['k'] = k(data_trial_choice['aLL'], data_trial_choice['aSS'], data_trial_choice['tLL']) 
    
data_trial_choice

In [None]:
len(data_trial)

# data_subject

In [None]:
def selectSubjectData(data):
    output = data.loc[: , 
                          [
                              'run_id', 'subject', 'chinFirst', 
#                               'webcam_label', 'webcam_Id', 'webcam_fps', 'webcam_aspectRatio', # Webcam info
#                               'webcam_height', 'webcam_width', 
#                               't_StartEyeTrackingData', # Eyetracking
                              'user_agent', 'device', 'browser', 'browser_version', 'platform', # Device info
                              'platform_version',
                              'responses'
                          ]
                     ]
    return output

In [None]:
def surveyStringToFrame(subject, string):
    string = re.sub("""{""", '', string)
    string = re.sub("""}""", '', string)
    string = re.sub('"', '', string)
    output = pd.read_csv(StringIO(string), 
                         sep=":", 
                         lineterminator = ";", 
                         header=None, 
                         index_col=0
                        ).transpose()
    return output

def surveyDataPerSubject(subject, data):
    df_thisSubject = data.loc[
                          (
                             (data['run_id']==subject) &
                             (pd.notna(data["responses"])) & 
                             (data["responses"] != '"')
                          ), :].reset_index()
    output = []
    for i in range(0, len(df_thisSubject)):
#         thisResponseFrame = surveyStringToFrame(subject, df_thisSubject.loc[i, 'responses'])
#         print(thisResponseFrame)
        output.append(
                         surveyStringToFrame(subject, df_thisSubject.loc[i, 'responses'])
                     )

    output = pd.concat(output, axis=1)
    output['run_id'] = subject    
    return output

def surveyData(data):
    output = []
    for subject in data['run_id'].unique():
        thisSubject_surveyData = surveyDataPerSubject(subject, data)
        output.append(thisSubject_surveyData)
    output = pd.concat(output)
    return output


def addSurveyData(data):
    if np.invert('responses' in data.columns):
        print('No response variable there! Probably already added')
        data_output = data
    else:
        data_output = data.merge(surveyData(data_subject),
                                 on=['run_id'], 
                                 how='left'
                                )
        data_output = data_output.drop(columns=['responses']) \
            .drop_duplicates() \
            .reset_index(drop=True)
            
    return data_output

In [None]:
def replaceDegreeValues(data):
    data['degree'] = data['degree'] \
        .replace(['College / Undergraduate / Bachelor', 
                  'High School',
                  'Graduate / PhD / Master',
                  'Middle School'
                 ], 
                 ['college', 
                  'highSchool',
                  'grad',
                  'middle'
                 ]
                )
    return data

In [None]:
def addTrialColumnToSubjectData(data, sourceData, varName):
    output = data
    if varName in data.columns:
        print(varName + ' already added!')
    else:
        grouped = sourceData.groupby(['run_id', 'subject']).mean() \
            .reset_index() \
            .loc[:, 
                    [
                        'run_id', 'subject', varName
                    ]
                ]        
        output = output.merge(grouped, 
                            on=['run_id', 'subject'], 
                            how='left')
    return output

In [None]:
def tranformBinaryToInt(data):
    output = data.replace({'no': 0, 'yes': 1}) 
    return output

## init data_subject

In [None]:
data_subject = selectSubjectData(data_raw)
data_subject = addSurveyData(data_subject)
data_subject = convertToNumeric(data_subject, ['age'])
data_subject = replaceDegreeValues(data_subject)

binaryPredictors = [
                        'chinFirst',
                        'eyeshadow', 
                        'masquara',
                        'eyeliner',
                        'browliner',
                        'triedChin', 
                        'keptHead',
                       ]
data_subject[binaryPredictors] = tranformBinaryToInt(data_subject[binaryPredictors])
    
columnsToAdd = [
                   'window_diagonal_max',
                   'trial_duration_exact',
                   'fps',
                   'LL_top', 
                   'choseTop', 
                   'choseLL', 
                   'optionIndex', 
                   'attributeIndex', 
                   'payneIndex',
               ]
for i in range(0, len(columnsToAdd)):
    data_subject = addTrialColumnToSubjectData(data_subject, data_trial_choice, columnsToAdd[i])
    
columnsToAdd = [
                   'offset', 'offset_perc', 
                    'precision', 'precision_perc', 
                   'x_mean', 'y_mean'
               ]
for i in range(0, len(columnsToAdd)):
    data_subject = addTrialColumnToSubjectData(data_subject, data_trial_fixation, columnsToAdd[i])
data_subject = data_subject.rename(columns={
                                                'x_mean': 'x_mean_fixTask',
                                                'y_mean': 'y_mean_fixTask'
                                            })
data_subject.columns

# Add data_subject variables to data_trial variables

In [None]:
def addSubjectColumnToTrialData(data, sourceData, varName):
    output = data
    if varName in data.columns:
        print(varName + ' already added!')
    else:
        grouped = sourceData.loc[:, 
                    [
                        'run_id', 'subject', varName
                    ]
                ] \
            .reset_index(drop=True) \
            .drop_duplicates()
       
        output = output.merge(grouped, 
                            on=['run_id', 'subject'], 
                            how='left')
    return output

In [None]:
columnsToAdd = [
                   'age', 
                   'gender', 'ethnic', 'degree',
               ]

for i in range(0, len(columnsToAdd)):
    data_trial_choice = addSubjectColumnToTrialData(data_trial_choice, data_subject, columnsToAdd[i])

In [None]:
columnsToAdd = [
                   'age', 
                   'device', 'browser', 'platform', # Categorical
                   'gender', 'ethnic', 'sight', 'glasses', 'degree', 
                   'eyeshadow', 'masquara', 'eyeliner', 'browliner', 'vertPosition', 
                   'triedChin', 'keptHead'
               ]

for i in range(0, len(columnsToAdd)):
    data_trial_fixation = addSubjectColumnToTrialData(data_trial_fixation, data_subject, columnsToAdd[i])

# Export Data

In [None]:
if not os.path.exists('./data_jupyter'):
    os.mkdir('./data_jupyter')
    
data_et.to_csv("data_jupyter/data_et.csv", index=False, header=True)
data_et_fixation.to_csv("data_jupyter/data_et_fixation.csv", index=False, header=True)
data_et_choice.to_csv("data_jupyter/data_et_choice.csv", index=False, header=True)    

data_trial.to_csv("data_jupyter/data_trial.csv", index=False, header=True)
data_trial_fixation.to_csv("data_jupyter/data_trial_fixation.csv", index=False, header=True)
data_trial_choice.to_csv("data_jupyter/data_trial_choice.csv", index=False, header=True)

data_trial_choice.to_csv("data_jupyter/data_subject.csv", index=False, header=True)