In [1]:
import datetime
import numpy as np
import os
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm 
import sys


if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO
    
from IPython.display import HTML
def View(df):
    css = """<style>
    table { border-collapse: collapse; border: 3px solid #eee; }
    table tr th:first-child { background-color: #eeeeee; color: #333; font-weight: bold }
    table thead th { background-color: #eee; color: #000; }
    tr, th, td { border: 1px solid #ccc; border-width: 1px 0 0 1px; border-collapse: collapse;
    padding: 3px; font-family: monospace; font-size: 10px }</style>
    """
    s  = '<script type="text/Javascript">'
    s += 'var win = window.open("", "Title", "toolbar=no, location=no, directories=no, status=no, menubar=no, scrollbars=yes, resizable=yes, width=780, height=200, top="+(screen.height-400)+", left="+(screen.width-840));'
    s += 'win.document.body.innerHTML = \'' + (df.to_html() + css).replace("\n",'\\') + '\';'
    s += '</script>'
    return(HTML(s+css))    

os.chdir(r'C:\Users\User\GitHub\WebET_Analysis')
print("Current Working directory " , os.getcwd())

Current Working directory  C:\Users\User\GitHub\WebET_Analysis


In [2]:
# from IPython.display import HTML

# HTML('''<script>
# code_show=true; 
# function code_toggle() {
#  if (code_show){
#  $('div.input').hide();
#  } else {
#  $('div.input').show();
#  }
#  code_show = !code_show
# } 
# $( document ).ready(code_toggle);
# </script>
# <form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

# Read Data

In [3]:
data_et = pd.read_csv(r'C:/Users/User/GitHub/WebET_Analysis/data_jupyter/data_et.csv')
data_trial = pd.read_csv(r'C:/Users/User/GitHub/WebET_Analysis/data_jupyter/data_trial.csv')
data_subject = pd.read_csv(r'C:/Users/User/GitHub/WebET_Analysis/data_jupyter/data_subject.csv')

print(pd.DataFrame([[len(data_et)], [len(data_trial)], [len(data_subject)]], 
                   columns=['length'],
                   index=['data_et', 'data_trial', 'data_subject'])
     )

              length
data_et       618566
data_trial     30362
data_subject      84


# Create datasets

## data_trial_choice

In [4]:
data_trial_choice = data_trial.loc[
    data_trial['trial_type']=='eyetracking-choice', 
    [
        'run_id', 'chinFirst', 
        'task_nr', 
        'trial_index', 'trial_type', 'withinTaskIndex', 
        'choiceTask_amountLeftFirst', 
        'option_topLeft', 'option_bottomLeft',
        'option_topRight', 'option_bottomRight',
        'key_press', 'trial_duration_exact',
        'window_width', 'window_height',
        'fps'
    ]
]

## data_et_choice

In [5]:
def add_var_to_data_et(data_et, source_data, varName):
    if varName in data_et.columns: data_et=data_et.drop(columns=varName)
    data_et = data_et.merge(
        source_data.loc[:, ['run_id', 'trial_index', varName]], 
        on=['run_id', 'trial_index'], how='left')
    return data_et

data_et = add_var_to_data_et(data_et, data_trial, 'trial_type')
data_et = add_var_to_data_et(data_et, data_trial, 'withinTaskIndex')

data_et_choice = data_et \
    .loc[data_et['trial_type']=='eyetracking-choice', :] \
    .drop(columns=['trial_type'])
data_et_choice

Unnamed: 0,x,y,t,t_task,run_id,trial_index,withinTaskIndex
607,0.627605,0.429164,509546.610,0.000,1.0,145.0,1.0
608,0.604833,0.425830,510013.380,466.770,1.0,145.0,1.0
609,0.622740,0.387974,510486.345,939.735,1.0,145.0,1.0
610,0.592737,0.415015,510958.065,1411.455,1.0,145.0,1.0
611,0.498632,0.300594,511430.640,1884.030,1.0,145.0,1.0
...,...,...,...,...,...,...,...
618561,0.288021,0.317593,1438291.000,1212.000,2009.0,507.0,80.0
618562,0.288021,0.317593,1438365.000,1286.000,2009.0,507.0,80.0
618563,0.288021,0.317593,1438437.000,1358.000,2009.0,507.0,80.0
618564,0.288021,0.317593,1438512.000,1433.000,2009.0,507.0,80.0


# Screening

## Not enough trials

In [6]:
def merge_max_var_by_index(data, data_subject, varName, newName):
    grouped = data \
        .groupby(['run_id'])[varName].max() \
        .reset_index() \
        .rename(columns={varName: newName})
    if newName in data_subject.columns:
        data_subject = data_subject.drop(columns=[newName])
    data_subject = data_subject.merge(
        grouped, 
        on='run_id',
        how='left'
    )
    return data_subject

data_subject = merge_max_var_by_index(
    data_trial_choice, 
    data_subject, 
    'withinTaskIndex', 
    'max_choiceIndex'
)

data_subject = merge_max_var_by_index(
    data_trial, 
    data_subject, 
    'trial_index', 
    'max_trial_index'
)
data_subject.loc[
    data_subject['max_choiceIndex']!=80,
    ['run_id', 'max_trial_index', 'max_choiceIndex']
].sort_values(by='run_id')

Unnamed: 0,run_id,max_trial_index,max_choiceIndex
13,14,,
30,49,,
32,54,,
37,61,271.0,42.0
43,72,,
48,83,,
50,88,,
60,99,,
2,106,,
3,108,,


In [7]:
subjects_not_enough_trials = data_subject.loc[
    pd.isna(data_subject['max_choiceIndex']) |
    (data_subject['max_choiceIndex']<40),
    'run_id'
]
subjects_not_enough_trials

2      106
3      108
6      124
7      125
13      14
30      49
32      54
43      72
48      83
50      88
60      99
62    1001
64    1014
65    1015
66    1016
67    1017
68    1019
69    1002
70    1020
73    1007
75    1009
76    1022
77    1024
78    2011
81    2002
Name: run_id, dtype: int64

## Did not keep head still

In [8]:
data_subject.loc[
    data_subject['keptHead']==0, 
    ['run_id', 'prolificID', 'keptHead']
]

Unnamed: 0,run_id,prolificID,keptHead
81,2002,Tim2,0.0
82,2008,Studie1970,0.0


In [9]:
subjects_not_kept_head = data_subject.loc[
    data_subject['keptHead']==0, 'run_id']

## Not approved on Prolific

In [10]:
data_subject.loc[
    ~data_subject['status'].isin(['APPROVED', 'NOTPROLIFIC']),
    :
]

Unnamed: 0,run_id,birthyear,browliner,browser,browser_version,degree,device,ethnic,eyeliner,eyeshadow,...,fps,choseLL,choseTop,LL_top,attributeIndex,optionIndex,payneIndex,choice_rt,max_choiceIndex,max_trial_index
7,125,,,Chrome,87.0.4280.88,,Macintosh,,,,...,,,,,,,,,,15.0


In [11]:
subjects_notApproved = data_subject.loc[
    ~data_subject['status'].isin(['APPROVED', 'NOTPROLIFIC']),
    'run_id'
]
subjects_notApproved

7    125
Name: run_id, dtype: int64

## Trials too long

In [12]:
data_trial_choice.loc[data_trial_choice['trial_duration_exact']>10000, :]

Unnamed: 0,run_id,chinFirst,task_nr,trial_index,trial_type,withinTaskIndex,choiceTask_amountLeftFirst,option_topLeft,option_bottomLeft,option_topRight,option_bottomRight,key_press,trial_duration_exact,window_width,window_height,fps
787,103,0.0,3.0,270.0,eyetracking-choice,1.0,0.0,180 days,Today,$4.5,$3,40.0,12336.0,1536.0,864.0,21.400778
6013,25,0.0,3.0,303.0,eyetracking-choice,12.0,1.0,$4.5,$5,Today,7 days,38.0,10705.0,1536.0,864.0,14.666044
9192,38,1.0,2.0,374.0,eyetracking-choice,77.0,1.0,Today,Tomorrow,$3,$4,40.0,12120.0,1920.0,1080.0,
9728,4,0.0,3.0,391.0,eyetracking-choice,41.0,1.0,90 days,Today,$4,$2.5,40.0,10375.0,1920.0,1080.0,10.698795
9740,4,0.0,3.0,403.0,eyetracking-choice,45.0,1.0,Today,30 days,$2.5,$5,40.0,18030.0,1920.0,1007.0,10.704382
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29558,2008,1.0,2.0,231.0,eyetracking-choice,30.0,1.0,$5,$1.5,30 days,Today,40.0,26293.0,1920.0,1080.0,12.893165
29561,2008,1.0,2.0,234.0,eyetracking-choice,31.0,1.0,$4.5,$2,180 days,Today,40.0,14377.0,1920.0,1080.0,13.006886
29592,2008,1.0,2.0,265.0,eyetracking-choice,41.0,1.0,30 days,Today,$4.5,$4,40.0,12630.0,1920.0,1080.0,12.905780
29658,2008,1.0,2.0,331.0,eyetracking-choice,63.0,1.0,Today,90 days,$2.5,$5,38.0,12765.0,1920.0,1080.0,12.769291


## Not enough fps

In [13]:
grouped = data_et_choice \
    .groupby(['run_id', 'trial_index'])['x'].count() \
    .reset_index() \
    .rename(columns={'x': 'x_count'})
if 'x_count' in data_trial_choice.columns:
    data_trial_choice = data_trial_choice.drop(columns=['x_count'])
data_trial_choice = data_trial_choice \
    .merge(grouped, on=['run_id', 'trial_index'], how='left')
print(data_trial_choice.columns)
data_trial_choice['fps_choice']= \
    1000 * data_trial_choice['x_count'] / \
    data_trial_choice['trial_duration_exact']
data_trial_choice['fps_choice'].describe()

Index(['run_id', 'chinFirst', 'task_nr', 'trial_index', 'trial_type',
       'withinTaskIndex', 'choiceTask_amountLeftFirst', 'option_topLeft',
       'option_bottomLeft', 'option_topRight', 'option_bottomRight',
       'key_press', 'trial_duration_exact', 'window_width', 'window_height',
       'fps', 'x_count'],
      dtype='object')


count    4270.000000
mean       16.329448
std         8.727203
min         0.072031
25%        10.660565
50%        16.960651
75%        21.266541
max        35.980991
Name: fps_choice, dtype: float64

In [14]:
print(data_trial_choice.loc[
    data_trial_choice['fps_choice']<3, 
    'run_id'].unique())

grouped = data_trial_choice \
    .loc[
        data_trial_choice['fps_choice']<1, 
        ['run_id', 'trial_index', 'fps_choice']
    ] \
    .groupby(['run_id'])['trial_index'].count() \
    .reset_index() \
    .rename(columns={'trial_index': 'n_lowFPS'})
print(grouped)
subjects_lowFPS = grouped.loc[
    grouped['n_lowFPS'] > 10, 
    'run_id'
]
subjects_lowFPS

[   1    4   70   80    9   94   97 1021 1003 2012]
   run_id  n_lowFPS
0       9        12
1      70        80
2      94         3
3    1003         2
4    1021         3


0     9
1    70
Name: run_id, dtype: int64

# Cleaning

In [16]:
excludedSubjects = list(
    set(subjects_not_enough_trials) | 
    set(subjects_not_kept_head) |
    set(subjects_notApproved) |
    set(subjects_lowFPS)
)

if not os.path.exists('./data_jupyter'):
    os.mkdir('./data_jupyter')
pd.DataFrame(excludedSubjects) \
    .to_csv("data_jupyter/excludeSubjects_choice.csv", index=False, header=False)

excludedSubjects

[1024,
 70,
 72,
 9,
 1020,
 14,
 2002,
 83,
 88,
 2008,
 2011,
 99,
 1001,
 106,
 1002,
 108,
 1007,
 49,
 1009,
 1015,
 54,
 1014,
 1016,
 1017,
 1019,
 124,
 125,
 1022]

## data_trial_choice

In [None]:
def cleanTrialData(data):
    print('Raw: ' + str(len(data)))
    data = data.loc[
        ~(data['run_id'].isin(excludedSubjects)) &
        (data['trial_duration_exact']<10000), 
        :]
    print('Cleaned: ' + str(len(data)))
    return data

data_trial_choice = cleanTrialData(data_trial_choice)

## data_et_choice

In [None]:
def cleanETData(data):
    print('Raw: ' + str(len(data)))
    data = data.loc[
        (data['x'] > 0) & (data['x'] < 1) &
        (data['y'] > 0) & (data['y'] < 1) &
        ~(data['run_id'].isin(excludedSubjects)) &
        (data['t_task']<10000), 
        :]
    print('Cleaned: ' + str(len(data)))
    return data

data_et_choice = cleanETData(data_et_choice)

# Choice options

## Choice attributes

In [None]:
def identify_amount_left(data):
    data['amountLeft'] = 0 
    data.loc[
        (data['option_topLeft'].str.contains("\$", regex=True)) |
        (data['option_topLeft'].str.contains("cent", regex=True)), 
        'amountLeft'] = 1
    data['amountLeft'].unique()
    return data

data_trial_choice = identify_amount_left(data_trial_choice)
data_trial_choice.loc[:, ['amountLeft', 'option_topLeft']]

In [None]:
data_et_choice = add_var_to_data_et(
    data_et_choice, 
    data_trial_choice, 
    'amountLeft'
)

In [None]:
def choice_options_to_numeric(data, varName): 
    if data[varName].dtypes == 'float64':
        print(varName + ' is float64. Probably already converted?')
    else:
        data.loc[: , [varName]] = \
            data.loc[:, [varName]] \
                .replace(['Today', 'Tomorrow', '7 days', 
                          '15 days', '30 days', '90 days', 
                          '180 days'], 
                         [0, 1, 7, 15, 30, 90, 180]) \
                .replace({'\$':''}, regex = True) \
                .replace('50 cent', 0.5) \
                .astype(float)
    return data

variables = [
    'option_topLeft',
    'option_bottomLeft', 
    'option_topRight', 
    'option_bottomRight'
]
for var in variables:
    data_trial_choice = choice_options_to_numeric(
        data_trial_choice, var)

data_trial_choice.loc[
    : , 
    [
        'option_topLeft',
        'option_bottomLeft', 
        'option_topRight', 
        'option_bottomRight'
    ] 
]

In [None]:
def reformatAttributes(data):
    data['aSS'] = 0 
    data.loc[data['amountLeft']==1, 'aSS']= \
        data.loc[
            data['amountLeft']==1, 
            ["option_topLeft", "option_bottomLeft"]
        ].values.min(1)
    data.loc[data['amountLeft']==0, 'aSS']= \
        data.loc[
            data['amountLeft']==0, 
            ["option_topRight", "option_bottomRight"]
        ].values.min(1)

    data['aLL'] = 0 
    data.loc[data['amountLeft']==1, 'aLL']= \
        data.loc[
            data['amountLeft']==1, 
            ["option_topLeft", "option_bottomLeft"]
        ].values.max(1)
    data.loc[data['amountLeft']==0, 'aLL']= \
        data.loc[
            data['amountLeft']==0, 
            ["option_topRight", "option_bottomRight"]
        ].values.max(1)
    
    data.loc[:, "tSS"] = 0 
    
    data['tLL'] = 0 
    data.loc[data['amountLeft']==1, 'tLL']= \
        data.loc[
            data['amountLeft']==1, 
            ["option_topRight", "option_bottomRight"]
        ].values.max(1)
    data.loc[data['amountLeft']==0, 'tLL']= \
        data.loc[
            data['amountLeft']==0, 
            ["option_topLeft", "option_bottomLeft"]
        ].values.max(1)
    
    data['LL_top'] = \
        (data["option_topLeft"] > data["option_bottomLeft"]) \
        .astype(int)

    print('aLL values: ' + str(np.sort(data['aLL'].unique())))
    print('aSS values: ' + str(np.sort(data['aSS'].unique())))
    print('tLL values: ' + str(np.sort(data['tLL'].unique())))
    print('tSS values: ' + str(np.sort(data['tSS'].unique())))
    
    return data

data_trial_choice = reformatAttributes(data_trial_choice)
data_trial_choice.loc[
    : , 
    [
        'amountLeft', 
        'option_topLeft',
        'option_bottomLeft', 
        'option_topRight', 
        'option_bottomRight', 
        'aLL', 
        'aSS', 
        'tLL', 
        'tSS',
        'LL_top'
    ] 
]

In [None]:
data_et_choice = add_var_to_data_et(
    data_et_choice, 
    data_trial_choice, 
    'LL_top'
)

## Behavioral response variables

In [None]:
def choice_response_variables(data):
        
    data["choseTop"] = 0
    data.loc[(data["key_press"]==38), "choseTop"] = 1

    data["choseLL"] = 0
    data.loc[(data["choseTop"]==1) & (data["LL_top"] == 1), "choseLL"] = 1
    
    return(data)

data_trial_choice = choice_response_variables(data_trial_choice)
data_trial_choice.loc[
    :, 
    [
        'option_topLeft',
        'option_bottomLeft', 
        'option_topRight', 
        'option_bottomRight', 
        'key_press',
        'choseTop',
        'choseLL', 
        ]
]

## Aggregate on subject level

In [None]:
def merge_by_subject(data, large_data, varName):
    if varName in data.columns: data = data.drop(columns=[varName])
    grouped = large_data.groupby(['run_id'])[varName].mean() \
        .reset_index()        
    data = data.merge(grouped, on=['run_id'], how='left')
    return data
print(data_subject.columns)
for var in ['choseLL', 'choseTop', 'LL_top']:
    data_subject = merge_by_subject(data_subject, data_trial_choice, var)
data_subject.loc[
    ~data_subject['run_id'].isin(excludedSubjects),
    ['run_id', 'choseLL', 'choseTop', 'LL_top']
]

# k

In [None]:
def k(aLL, aSS, tLL):
    k = ((aLL / aSS) - 1) / tLL
    return k

data_trial_choice['k'] = k(data_trial_choice['aLL'], data_trial_choice['aSS'], data_trial_choice['tLL']) 
data_trial_choice

# Look direction

In [None]:
def lookDirections(data):
    data["look_left"] = (data["x"] < 0.5).astype(int)
    data["look_top"] = (data["y"] < 0.5).astype(int)
    return data

data_et_choice = lookDirections(data_et_choice)

# AOIs

In [None]:
def addAOI(data): 
    aoiCenters = pd.DataFrame(
        [
            [(0.05+0.9*0.2), 0.25],
            [(0.05+0.9*0.8), 0.25],
            [(0.05+0.9*0.2), 0.75],
            [(0.05+0.9*0.8), 0.75]
        ], 
        columns = ['x', 'y'],
        index = ['TL', 'TR', 'BL', 'BR']
    )
    
    data['aoi'] = 0 
    for aoi in aoiCenters.index:
        data.loc[
            (
                (data['x'] > (aoiCenters.loc[aoi, 'x'] - 0.175)) & \
                (data['x'] < (aoiCenters.loc[aoi, 'x'] + 0.175)) & \
                (data['y'] > (aoiCenters.loc[aoi, 'y'] - 0.175)) & \
                (data['y'] < (aoiCenters.loc[aoi, 'y'] + 0.175))
             ), 'aoi'] = aoi
    return data 

data_et_choice = addAOI(data_et_choice)
data_et_choice['aoi'].unique()

In [None]:
def createAOIColumns(data):
    data['aoi_aLL'] = 0
    data['aoi_tLL'] = 0 
    data['aoi_aSS'] = 0 
    data['aoi_tSS'] = 0
    
    # If amounts are on the left side
    # If the gaze point is in the top option
    data.loc[((data['amountLeft']==1) & (data['LL_top']==1) & (data['aoi']=='TL')), 
             'aoi_aLL'] = 1
    data.loc[((data['amountLeft']==1) & (data['LL_top']==1) & (data['aoi']=='TR')), 
             'aoi_tLL'] = 1
    data.loc[((data['amountLeft']==1) & (data['LL_top']==1) & (data['aoi']=='BL')), 
             'aoi_aSS'] = 1
    data.loc[((data['amountLeft']==1) & (data['LL_top']==1) & (data['aoi']=='BR')), 
             'aoi_tSS'] = 1
    
    data.loc[((data['amountLeft']==1) & (data['LL_top']==0) & (data['aoi']=='TL')), 
             'aoi_aSS'] = 1
    data.loc[((data['amountLeft']==1) & (data['LL_top']==0) & (data['aoi']=='TR')), 
             'aoi_tSS'] = 1
    data.loc[((data['amountLeft']==1) & (data['LL_top']==0) & (data['aoi']=='BL')), 
             'aoi_aLL'] = 1
    data.loc[((data['amountLeft']==1) & (data['LL_top']==0) & (data['aoi']=='BR')), 
             'aoi_tLL'] = 1
    
    # If amounts are on the right side
    # If the gaze point is in the top option
    data.loc[((data['amountLeft']==0) & (data['LL_top']==1) & (data['aoi']=='TL')), 
             'aoi_tLL'] = 1
    data.loc[((data['amountLeft']==0) & (data['LL_top']==1) & (data['aoi']=='TR')), 
             'aoi_aLL'] = 1
    data.loc[((data['amountLeft']==0) & (data['LL_top']==1) & (data['aoi']=='BL')), 
             'aoi_tSS'] = 1
    data.loc[((data['amountLeft']==0) & (data['LL_top']==1) & (data['aoi']=='BR')),
             'aoi_aSS'] = 1

    data.loc[((data['amountLeft']==0) & (data['LL_top']==0) & (data['aoi']=='TL')), 
             'aoi_tSS'] = 1
    data.loc[((data['amountLeft']==0) & (data['LL_top']==0) & (data['aoi']=='TR')), 
             'aoi_aSS'] = 1
    data.loc[((data['amountLeft']==0) & (data['LL_top']==0) & (data['aoi']=='BL')), 
             'aoi_tLL'] = 1
    data.loc[((data['amountLeft']==0) & (data['LL_top']==0) & (data['aoi']=='BR')), 
             'aoi_aLL'] = 1
    return data

data_et_choice = createAOIColumns(data_et_choice)

# Eye-Tracking indices

##  Option Index

In [None]:
def addOptionIndex(data, data_et_choice):

    grouped = data_et_choice.groupby(['run_id', 'trial_index']) \
        ['aoi_aSS', 'aoi_aLL', 'aoi_tSS', 'aoi_tLL'].sum() \
        .reset_index() 
    
    grouped['gazePoints_immediate'] = (grouped['aoi_aSS'] + grouped['aoi_tSS'])
    grouped['gazePoints_delay'] = (grouped['aoi_aLL'] + grouped['aoi_tLL'])
    grouped['optionIndex'] = (grouped['gazePoints_immediate'] - grouped['gazePoints_delay']) / \
                             (grouped['gazePoints_immediate'] + grouped['gazePoints_delay'])

    if "optionIndex" in data.columns: data = data.drop(columns=['optionIndex'])

    data = data.merge(grouped[['run_id', 'trial_index', 'optionIndex']], 
                             on=['run_id', 'trial_index'])
    return(data)

data_trial_choice = addOptionIndex(data_trial_choice, data_et_choice)
data_trial_choice['optionIndex'].describe()

## Attribute Index

In [None]:
def addAttributeIndex(data, data_et_choice):

    grouped = data_et_choice.groupby(['run_id', 'trial_index']) \
        ['aoi', 'aoi_aSS', 'aoi_aLL', 'aoi_tSS', 'aoi_tLL'].sum() \
        .reset_index()
    grouped['gazePoints_amount'] = (grouped['aoi_aLL'] + grouped['aoi_aSS'])
    grouped['gazePoints_time'] = (grouped['aoi_tLL'] + grouped['aoi_tSS'])
    grouped['attributeIndex'] = \
        (grouped['gazePoints_amount'] - grouped['gazePoints_time']) / \
        (grouped['gazePoints_amount'] + grouped['gazePoints_time'])

    if "attributeIndex" in data.columns: data = data.drop(columns=['attributeIndex'])
    data_output = data.merge(grouped[['run_id', 'trial_index', 'attributeIndex']], 
                             on=['run_id', 'trial_index'])
    return(data_output)

data_trial_choice = addAttributeIndex(data_trial_choice, data_et_choice)
data_trial_choice['attributeIndex'].describe()

## Payne Index

### Transitions between AOIs

In [None]:
def et_data_transition_type(data):
    data = data.loc[
        pd.notna(data['aoi']) &
        (data['aoi']!=0), :]
    data['newAOIIndex'] = 0
    data.loc[(data['aoi_aLL']==1), 'newAOIIndex'] = 1
    data.loc[(data['aoi_tLL']==1), 'newAOIIndex'] = 2
    data.loc[(data['aoi_aSS']==1), 'newAOIIndex'] = 4
    data.loc[(data['aoi_tSS']==1), 'newAOIIndex'] = 8
    data.sort_values(by=['run_id', 'withinTaskIndex'])
    # Add a 0 due to the way np.diff works
    data['transition_type'] = np.append([0], np.diff(data['newAOIIndex']))
    data['transition_type'] = abs(data['transition_type']) 

    data.loc[data['t_task']==0, 'transition_type'] = 0

    return data.loc[:, ['run_id', 'trial_index', 't_task', 'transition_type']]


In [None]:
def addTransition_type(data, data_et):
    data_et = et_data_transition_type(data_et)
    data_et.loc[:, 'transition_type'] = data_et.loc[:, 'transition_type']
    
    transition_count = pd.pivot_table(
        data_et.loc[:, ['run_id', 'trial_index', 'transition_type']], 
        index = ['run_id', 'trial_index'],
        columns = ['transition_type'], 
        aggfunc = len,
        fill_value = 0) \
        .reset_index() \
        .rename(columns={
        0: "trans_type_0",
        1: "trans_type_aLLtLL",
        2: "trans_type_tLLaSS",
        3: "trans_type_aLLaSS",
        4: "trans_type_aSStSS",
        6: "trans_type_tLLtSS",
        7: "trans_type_aLLtSS"
    })

    if "trans_type_0" in data: data = data.drop(columns=[
        "trans_type_0", "trans_type_aLLtLL", "trans_type_tLLaSS", "trans_type_aLLaSS",
        "trans_type_aSStSS", "trans_type_tLLtSS", "trans_type_aLLtSS"])
        
    data = data.merge(transition_count, on=['run_id', 'trial_index']) 
    return(data)

data_trial_choice = addTransition_type(data_trial_choice, data_et_choice)
data_trial_choice.loc[
    data_trial_choice['fps']>15, 
    [
        'run_id', 'trial_index', 'trans_type_0', 'trans_type_aLLtLL', 'trans_type_tLLaSS',
        'trans_type_aLLaSS', 'trans_type_aSStSS', 'trans_type_tLLtSS',
        'trans_type_aLLtSS'
    ]
]

In [None]:
def addPayneIndex(data):
    if "payneIndex" in data.columns: data = data.drop(columns='payneIndex')
    optionWise_transition = data.loc[:, 'trans_type_aLLtLL'] + data.loc[:, 'trans_type_aSStSS']
    attributeWise_transition = data.loc[:, 'trans_type_aLLaSS'] + data.loc[:, 'trans_type_tLLtSS']  
    data['payneIndex'] = \
        (optionWise_transition - attributeWise_transition) / \
        (optionWise_transition + attributeWise_transition) 
    data['payneIndex'] = data['payneIndex'].fillna(0)
    return(data)

data_trial_choice = addPayneIndex(data_trial_choice)

print(data_trial_choice.columns)
print(data_trial_choice['payneIndex'].describe())

data_trial_choice.loc[
    data_trial_choice['fps']>15, 
    ['run_id', 'trial_duration_exact',
     'trans_type_aLLtLL', 'trans_type_aSStSS', 'trans_type_aLLaSS', 'trans_type_tLLtSS',
     'payneIndex'
    ]
]

## Aggregate on subject-level

In [None]:
data_subject = merge_by_subject(data_subject, data_trial_choice, 'attributeIndex')
data_subject = merge_by_subject(data_subject, data_trial_choice, 'optionIndex')
data_subject = merge_by_subject(data_subject, data_trial_choice, 'payneIndex')

# Reaction time on subject-level

In [None]:
grouped = data_trial_choice.groupby(['run_id'])['trial_duration_exact'].mean() \
    .reset_index() \
    .rename(columns={'trial_duration_exact': 'choice_rt'})

if 'choice_rt' in data_subject.columns: data_subject = data_subject.drop(columns=['choice_rt'])
data_subject = data_subject.merge(grouped, on='run_id', how='left')
data_subject['choice_rt'].describe()

# Clusters

In [None]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data_trial_choice.loc[
    :, 
    [
       'trans_type_0',
       'trans_type_aLLtLL', 'trans_type_tLLaSS', 'trans_type_aLLaSS',
       'trans_type_aSStSS', 'trans_type_tLLtSS']
    ]
)

In [None]:
def clusters(n_clusters):
    kmeans = KMeans(
        init="random",
        n_clusters=n_clusters,
        n_init=10,
        max_iter=300,
        random_state=42
    )
    kmeans.fit(scaled_features)
            
#     https://realpython.com/k-means-clustering-python/
#     print(kmeans.inertia_)
#     print(kmeans.cluster_centers_)
#     print(kmeans.n_iter_)

    return kmeans.labels_

In [None]:
output = []
for n_cluster in range(2, 5):
    data_trial_choice['cluster' + str(n_cluster)] = clusters(n_cluster)
    X = data_trial_choice[["run_id", "withinTaskIndex", 'cluster' + str(n_cluster)]]
    X_ = sm.add_constant(X)
    y = 1-data_trial_choice[["choseLL"]]  
    log_reg = sm.Logit(y, X_).fit() 
    output.append([n_cluster, log_reg.bic, log_reg.aic]) 

output = pd.DataFrame(output, columns = ['n_clusters', 'BIC', 'AIC']) \
    .set_index('n_clusters')
output

# Export data

In [None]:
if not os.path.exists('./data_jupyter'):
    os.mkdir('./data_jupyter')

data_et_choice.to_csv("data_jupyter/data_et_choice.csv", index=False, header=True)
data_trial_choice.to_csv("data_jupyter/data_trial_choice.csv", index=False, header=True)
data_subject.to_csv("data_jupyter/data_subject.csv", index=False, header=True)

MatLab input

In [None]:
if not os.path.exists('./amasino_dataPrep/data_source'):
    os.mkdir('./amasino_dataPrep/data_source')

data_et_choice['fixationCounter'] = 1
data_et_choice.loc[:, 
                       [
                           'run_id', 
                           'withinTaskIndex', 
                           'x', 
                           'y', 
                           't_task'
                       ]
                  ] \
   .to_csv("amasino_dataPrep/data_source/schneegansEtAl_ET.csv", index=False, header=False)

In [None]:
data_trial_choice.loc[:, 
                       [
                           'run_id', 
                           'withinTaskIndex', 
                           'optionIndex', 
                           'attributeIndex', 
                           'payneIndex'
                       ]
                  ] \
    .fillna(0) \
    .to_csv("amasino_dataPrep/intermediateCSVs/ET_indices.csv", index=False, header=False)

In [None]:
data_trial_choice.loc[:, 
                          [
                              'run_id', 
                              'aSS', 
                              'aLL', 
                              'tSS', 
                              'tLL', 
                              'choseLL', 
                              'trial_duration_exact', 
                              'LL_top',
                              'choseTop'
                          ]
                     ] \
    .to_csv("amasino_dataPrep/data_source/schneegansEtAl_behavior.csv", index=False, header=False)

In [None]:
data_subject.loc[:, ['run_id', 'choseLL']] \
    .to_csv("amasino_dataPrep/intermediateCSVs/percLeft.csv", index=False, header=False)

# Feedback

In [None]:
print('Success! Script ran through')