In [1]:
import datetime
import numpy as np
import math
import os
import pandas as pd
import re
import seaborn as sns
import json
import statsmodels.api as sm 
import statsmodels.formula.api as smf
import statsmodels.graphics.api as smg
import sys
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO    
from tqdm import tqdm 

    
from IPython.display import HTML
def View(df):
    css = """<style>
    table { border-collapse: collapse; border: 3px solid #eee; }
    table tr th:first-child { background-color: #eeeeee; color: #333; font-weight: bold }
    table thead th { background-color: #eee; color: #000; }
    tr, th, td { border: 1px solid #ccc; border-width: 1px 0 0 1px; border-collapse: collapse;
    padding: 3px; font-family: monospace; font-size: 10px }</style>
    """
    s  = '<script type="text/Javascript">'
    s += 'var win = window.open("", "Title", "toolbar=no, location=no, directories=no, status=no, menubar=no, scrollbars=yes, resizable=yes, width=780, height=200, top="+(screen.height-400)+", left="+(screen.width-840));'
    s += 'win.document.body.innerHTML = \'' + (df.to_html() + css).replace("\n",'\\') + '\';'
    s += '</script>'
    return(HTML(s+css))    

os.chdir(r'C:\Users\User\GitHub\WebET_Analysis')
print("Current Working directory " , os.getcwd())

Current Working directory  C:\Users\User\GitHub\WebET_Analysis


# Raw Data

## Search for specific subjects

In [2]:
path = 'data_prolific'
subject_files = os.listdir(path)
for i in range(0, len(subject_files)):
    thisSubject_txt = open(path + "/" + subject_files[i]).read()
    if thisSubject_txt.find('Unnamed') > (-1):
        print(subject_files[i])

## Read CSV from String

In [3]:
def cleanhtml(raw_html):
    # https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
    cleanr = re.compile('<.*?>')
    cleanText = re.sub(cleanr, '', raw_html)
    return cleanText

def cleanETText(text):
    textWithinBrackets = re.findall(re.compile('\[.*?\]'), text)
    output = text
    for i in range(0,len(textWithinBrackets)):
        old = textWithinBrackets[i]
        new = re.sub(",", "$", old)
        output = output.replace(old, new)
    return output

def cleanSurveyText(text):
    output = text
    textWithinBrackets = re.findall(re.compile('\{.*?\}'), text)
    for i in range(0,len(textWithinBrackets)):
        old = textWithinBrackets[i]
        new = old.replace(',', '§')
        output = output.replace(old, new)
    return output

def compileData(path):
    subject_files = os.listdir(path)
    all_subjects = []
    for i in tqdm(range(0, len(subject_files))):
        csv_thisSubject = open(path + "/" + subject_files[i]).read()
        csv_thisSubject = cleanhtml(csv_thisSubject)
        csv_thisSubject = cleanETText(csv_thisSubject)
        csv_thisSubject = cleanSurveyText(csv_thisSubject)
        df_thisSubject = pd.read_csv(StringIO(csv_thisSubject))
        if len(df_thisSubject) > 0: 
            all_subjects.append(df_thisSubject)
    output = pd.concat(all_subjects).reset_index(drop=True)
    return output

data_raw = compileData("data_prolific")

100%|████████████████████████████████████████████████████████████████████████████████| 467/467 [00:49<00:00,  9.38it/s]


## survey data (to merge with data_subject)

In [4]:
def cleanOptionalNote(text):
    optionalNoteText = re.findall(re.compile('optionalNote":.*?\}'), text)
    if len(optionalNoteText) < 1:
        output = text
    else:
        old = optionalNoteText[0]
        new = old.replace('§', ' ') \
            .replace(':', ' ') \
            .replace('(', ' ') \
            .replace(')', ' ') \
            .replace('optionalNote" ', 'optionalNote":')
        
        output = text.replace(old, new)
    return output


def surveyStringToFrame(subject, string):
    string = cleanOptionalNote(string)
    string = re.sub("""{""", '', string)
    string = re.sub("""}""", '', string)
    string = re.sub('"', '', string)
    string = re.sub('§', '$', string)
    output = pd.read_csv(StringIO(string),
                         sep=":",
                         lineterminator="$",
                         header=None,
                         index_col=0
                         ).transpose()
    return output


def surveyData_thisSubject(data):
    
    subject = data['run_id'].unique()[0]
    output = []
    for i in range(0, len(data)):
        output.append(
            surveyStringToFrame(subject,
                                data.loc[i, 'responses'])
        )

    output = pd.concat(output, axis=1)
    output['run_id'] = subject

    return output


def surveyData(data):
    surveyData = pd.DataFrame(columns=[
            'prolificID', 'age', 'gender', 'ethnic', 'sight', 
            'glasses', 'degree', 'eyeshadow', 'masquara', 'eyeliner', 
            'browliner', 'vertPosition', 'triedChin', 'keptHead', 
            'optionalNote', 'run_id'])

    for subject in tqdm(data['run_id'].unique()):
        
        df_thisSubject = data.loc[
            (data['run_id']==subject) & 
            (pd.notna(data["responses"])) &
            (data["responses"] != '"'), 
            ['run_id', 'responses']
        ] \
            .reset_index()
        
        if len(df_thisSubject)>0:
        
            surveyData = \
                surveyData.append(
                    surveyData_thisSubject(
                        df_thisSubject
                    )
                )
    
    return(surveyData)
    
surveyData = surveyData(data_raw)
surveyData

100%|████████████████████████████████████████████████████████████████████████████████| 316/316 [00:08<00:00, 36.36it/s]


Unnamed: 0,prolificID,age,gender,ethnic,sight,glasses,degree,eyeshadow,masquara,eyeliner,browliner,vertPosition,triedChin,keptHead,optionalNote,run_id
1,5fccc8ac636416a4288a9f3d,1995,male,caucasian,glasses,shortSight,High School,no,no,no,no,a,yes,yes,,1
1,600063f2943eab0acc812ed8,2001,male,caucasian,perfectSight,noCorrection,College / Undergraduate / Bachelor,no,no,no,no,a,,,,103
1,5d485e8415055400194b707f,,,,,,,,,,,,,,,106
1,55b237e6fdf99b19ea79d2f7,,,,,,,,,,,,,,,108
1,5c5684ef9d244c0001b29f1e,1991,male,asian,glasses,shortSight,College / Undergraduate / Bachelor,no,no,no,no,a,yes,yes,,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,5ecfc227f036c902457fc44c,1998,female,black,perfectSight,noCorrection,Graduate / PhD / Master,no,no,no,no,a,yes,yes,,94
1,5eb9940830ab6c098bd1943d,1997,male,caucasian,perfectSight,noCorrection,College / Undergraduate / Bachelor,no,no,no,no,a,yes,yes,,96
1,5f68fe5be5cc370c0a0911de,1988,female,caucasian,perfectSight,noCorrection,,no,no,no,no,b,yes,yes,,97
1,5f779405dacc842fc44bc767,1996,male,caucasian,glasses,longSight,2,no,no,no,no,b,yes,yes,,98


### Cleaning

In [5]:
surveyData['degree'] = surveyData['degree'].replace(
    ['-3.0', '0', '1.25', '1.5', '2', '4.25'], np.nan)


surveyData['degree'] = surveyData['degree'].replace({
    'Middle School': 'middle',
    'High School': 'high',
    'high school': 'high',
    'Associate ': 'high',
    'masters degree': 'grad',
    'College / Undergraduate / Bachelor': 'college',
    'Bachelor': 'college',
    'Grad': 'grad',
    'Graduate / PhD / Master': 'grad',
    'masters degree': 'grad'
})
surveyData['degree'].unique()

array(['high', 'college', nan, 'grad', 'Associates Degree', 'highSchool',
       'graduate', 'middleSchool', 'middle'], dtype=object)

In [6]:
def convertToNumeric(data, columns):
    data[columns] = data[columns].apply(pd.to_numeric, errors='coerce')
    return data

surveyData = convertToNumeric(surveyData, ['age'])

### Optional Notes

In [7]:
pd.set_option('max_colwidth', 200)

data_optionalNotes = surveyData.loc[
        pd.notna(surveyData['optionalNote']), 
        ['run_id', 'optionalNote']
    ]
data_optionalNotes

Unnamed: 0,run_id,optionalNote
1,208,no further comments.
1,227,So much white text on a black background made my eyes tired so I'm sorry if that interfered with anything!
1,238,It was hard! Sorry my cat came along. Also I am being tested for multiple sclerosis right now but it has not been doing anything to my vision. So hopefully you can still use mine - I haven't bee...
1,256,Of course you can hear my family suddenly and randomly ask me questions in the background but I swear to you I kept my eyes on the center of the monitor and I clicked everything.
1,266,Na
1,270,I think I managed to stay pretty still but my heart raced after awhile.
1,347,I noticed my head had moved from the first picture to the last picture while my head was resting on my hand even though I had tried to hold still
1,361,
1,380,i am so sorry if you can hear my wife in this and for me getting up quite a bit. i hope i'm still eligible but i'm just being honest tried my best!
1,429,I had allergies so my eyes were watering


In [8]:
pd.set_option('max_colwidth', 40)

In [9]:
for subject in data_optionalNotes['run_id'].unique():
    print(f'run_id: {subject}')
    print(surveyData.loc[
        surveyData['run_id']==subject, 
        'optionalNote'].values[0])
    print('\n')

run_id: 208
no further comments.


run_id: 227
So much white text on a black background made my eyes tired so I'm sorry if that interfered with anything!


run_id: 238
It was hard! Sorry my cat came along. Also  I am being tested for multiple sclerosis right now  but it has not been doing anything to my vision. So hopefully you can still use mine - I haven't been diagnosed with anything yet.


run_id: 256
Of course you can hear my family suddenly and randomly ask me questions in the background but I swear to you I kept my eyes on the center of the monitor and I clicked everything. 


run_id: 266
Na


run_id: 270
I think I managed to stay pretty still  but my heart raced after awhile. 


run_id: 347
I noticed my head had moved from the first picture to the last picture while my head was resting on my hand even though I had tried to hold still


run_id: 361
  


run_id: 380
i am so sorry if you can hear my wife in this and for me getting up quite a bit. i hope i'm still eligible but i'm 

## Add to data_raw

In [10]:
if 'prolificID' in data_raw.columns:
    data_raw = data_raw.drop(columns='prolificID')
    
data_raw = data_raw \
    .merge(
        surveyData.loc[:, ['run_id', 'prolificID']],
        on='run_id',
        how='left'
)

## Trial type new

Most of these subjects reloaded when the eye-tracking initialized as well as during the first calibration

In [None]:
def add_trial_type_new(data):
    
    data['trial_type_new'] = 0
    
    data.loc[
        (data['run_id']<144) & (data['chinFirst']==0), 
        'trial_type_new'
    ] = pd.cut(
        data.loc[
            (data['run_id']<144) & (data['chinFirst']==0), 
            'trial_index'], 
        np.array([
            0,
            1.5, # start_page
            2.5, # prolific_id
            4.5, # pre_et_init
            5.5, # et_init
            11, # et_adjustment
            17.5, # calibration_1_briefing
            18.5, # first_cal
            97, # calibration_1
            134, # fixation_1
            221, # calibration_2
            259, # fixation_2
            514, # choice
            520 # end
        ]),
        labels=np.array([
            'start_page',
            'prolific_id',
            'pre_et_init',
            'et_init',
            'et_adjustment',
            'calibration_1_briefing',
            'first_cal', 
            'calibration_1',
            'fixation_1',
            'calibration_2',
            'fixation_2',
            'choice',
            'end'
        ]),
        include_lowest=True, 
    )

    data.loc[
        (data['run_id']<144) & (data['chinFirst']==1), 
        'trial_type_new'
    ] = pd.cut(
        data.loc[
            (data['run_id']<144) & (data['chinFirst']==1), 
            'trial_index'], 
        np.array([
            0, # 
            1.5, # start_page
            2.5, # prolific_id
            4.5, # pre_et_init
            5.5, # et_init
            11, # et_adjustment
            17.5, # calibration_1_briefing
            18.5, # first_cal
            97, # calibration_1
            134, # fixation_1
            387, # choice
            473, # calibration_2
            513, # fixation_2
            520 # end
        ]),
        labels=np.array([
            'start_page',
            'prolific_id',
            'pre_et_init',
            'et_init',
            'et_adjustment',
            'calibration_1_briefing',
            'first_cal', 
            'calibration_1',
            'fixation_1',
            'choice',        
            'calibration_2',
            'fixation_2',
            'end'
        ]),
        include_lowest=True, 
    )

    data.loc[
        (data['run_id']>143) & (data['chinFirst']==0), 
        'trial_type_new'
    ] = pd.cut(
        data.loc[
            (data['run_id']>143) & (data['chinFirst']==0), 
            'trial_index'], 
        np.array([
            0,
            1.5, # start_page
            2.5, # prolific_id
            4.5, # pre_et_init
            5.5, # et_init
            11, # et_adjustment
            17.5, # calibration_1_briefing
            97, # calibration_1
            143, # fixation_1
            231, # calibration_2
            277, # fixation_2
            532, # choice
            540 # end
        ]),
        labels=np.array([
            'start_page',
            'prolific_id',
            'pre_et_init',
            'et_init',
            'et_adjustment',
            'calibration_1_briefing',
            'calibration_1',
            'fixation_1',
            'calibration_2',
            'fixation_2',
            'choice',
            'end'
        ]),
        include_lowest=True, 
    )

    data.loc[
        (data['run_id']>143) & (data['chinFirst']==1), 
        'trial_type_new'
    ] = pd.cut(
        data.loc[
            (data['run_id']>143) & (data['chinFirst']==1), 
            'trial_index'], 
        np.array([
            0, # 
            1.5, # start_page
            2.5, # prolific_id
            4.5, # pre_et_init
            5.5, # et_init
            11, # et_adjustment
            17.5, # calibration_1_briefing
            97, # calibration_1
            143, # fixation_1
            397, # choice
            483, # calibration_2
            532, # fixation_2
            540 # end
        ]),
        labels=np.array([
            'start_page',
            'prolific_id',
            'pre_et_init',
            'et_init',
            'et_adjustment',
            'calibration_1_briefing',
            'calibration_1',
            'fixation_1',
            'choice',        
            'calibration_2',
            'fixation_2',
            'end'
        ]),
        include_lowest=True, 
    )
    
    return data

data_raw = add_trial_type_new(data_raw)

data_raw.loc[
    (data_raw['run_id']>143) & (data_raw['chinFirst']==1), 
    ['trial_index', 'trial_type_new', 'trial_type', 'stimulus', 'task_nr']
].iloc[530:540]

In [14]:
order_trial_types = pd.DataFrame(
    {'trial_type_new': [
                'start_page',
                'prolific_id',
                'pre_et_init',
                'et_init',
                'et_adjustment',
                'calibration_1_briefing',
                'calibration_1',
                'fixation_1',     
                'calibration_2',
                'fixation_2',
                'choice',   
                'end'
            ],
     'trial_type_nr': np.arange(12)
    }
)

if 'trial_type_nr' in data_raw.columns: 
    data_raw = data_raw.drop(columns='trial_type_nr')
    
data_raw = data_raw \
    .merge(order_trial_types, on='trial_type_new', how='left') 

## Dropouts (on run level)

In [15]:
def add_next_trial(data):
    full_trials_no_chin = data_raw.loc[
        data_raw['run_id']==421, 
        ['trial_index', 'trial_type', 'trial_type_new']
    ].reset_index(drop=True)

    next_trials_no_chin = full_trials_no_chin
    next_trials_no_chin['trial_index'] -= 1

    full_trials_chin = data_raw.loc[
        data_raw['run_id']==270, 
        ['trial_index', 'trial_type', 'trial_type_new']
    ].reset_index(drop=True)

    next_trials_chin = full_trials_chin
    next_trials_chin['trial_index'] -= 1
    
    data = data.copy()
    data['next_trial_type']=0
    data['next_trial_type_new']=0

    for i in data.index:
        this_trial = data.loc[i, 'trial_index']

        if data.loc[i, 'trial_type_new'] != 'end':
            next_trials = next_trials_chin if data.loc[i, 'chinFirst']>0 else next_trials_no_chin
            
            next_trial = next_trials.loc[
                next_trials['trial_index']==this_trial,
                'trial_type'
            ].values[0]
            data.loc[i, 'next_trial_type']=next_trial

            next_trial = next_trials.loc[
                next_trials['trial_index']==this_trial,
                'trial_type_new'
            ].values[0]
            data.loc[i, 'next_trial_type_new']=next_trial
            
        else:
            data.loc[i, 'next_trial_type']='end'
            data.loc[i, 'next_trial_type_new']='end'
            
    return data

Most runs drop out at the beginning (8.8% chinFirst +7.2% no chinFirst), in detail, during the initialization of Webgazer (4.4%+3.2%), during the calibration instruction (0.6%+2.5%) and the calibration itself (1.2+0.3%). Moreover, some dropouts during the tasks (0.6+1.8%).

In [16]:
def dropout(data):
    grouped_trial_type_new = data \
        .loc[: , ['run_id', 'chinFirst', 'trial_index', 'trial_type_new']] \
        .drop_duplicates()

    last_trial_for_each_subject = data \
        .groupby(['run_id'])['trial_index'].max() \
        .reset_index() \
        .merge(grouped_trial_type_new, on=['run_id', 'trial_index'], how='left')
    
    last_trial_for_each_subject = add_next_trial(last_trial_for_each_subject)
    
    dropout_by_type = last_trial_for_each_subject \
        .groupby(['trial_type_new', 'next_trial_type']).count() \
        .reset_index() \
        .rename(columns={'run_id': 'n_run_id'}) \
        .loc[
            :, 
            [
               'trial_type_new', 'next_trial_type', 'next_trial_type_new', 'n_run_id', 
            ]
        ] \
        .sort_values(by='n_run_id')
    
    
    
    
    dropout_by_type['perc'] = round(
        100 * dropout_by_type['n_run_id'] / len(data_raw['run_id'].unique()), 
        1
    )
    
    summary = pd.DataFrame([
        [
            'beginning', 'beginning', 'beginning', 
            sum(dropout_by_type.loc[
                dropout_by_type['next_trial_type'].isin([
                    'pre_et_init',
                    'et_init', 
                    'et_adjustment',
                    'calibration_1_briefing',
                    'calibration_1',
                ]),
                'n_run_id']),
            sum(dropout_by_type.loc[
                dropout_by_type['next_trial_type'].isin([
                    'pre_et_init',
                    'et_init', 
                    'et_adjustment',
                    'calibration_1_briefing',
                    'calibration_1',
                ]),
                'perc'])
        ], 
                [
            'et_tasks', 'et_tasks', 'et_tasks', 
            sum(dropout_by_type.loc[
                dropout_by_type['next_trial_type'].isin([
                    'fixation_1',
                    'fixation_2',
                    'calibration_2',
                    'choice',
                ]),
                'n_run_id']),
            sum(dropout_by_type.loc[
                dropout_by_type['next_trial_type'].isin([
                    'fixation_1',
                    'fixation_2',
                    'calibration_2',
                    'choice',
                ]),
                'perc'])
        ],
        [
            'total', 'total', 'total', 
            sum(dropout_by_type['n_run_id']),
            sum(dropout_by_type['perc'])
        ]
    
    ], columns=dropout_by_type.columns)
    
    dropout_by_type = dropout_by_type.append(summary)

    return dropout_by_type

print('chinFirst==0')
print(dropout(data_raw.loc[data_raw['chinFirst']==0, :]))
print('\n')
print('chinFirst==1')
print(dropout(data_raw.loc[data_raw['chinFirst']==1, :]))

chinFirst==0
   trial_type_new          next_trial_type next_trial_type_new  n_run_id  perc
0   calibration_1  eyetracking-calibration                   1         1   0.3
3   calibration_2  eyetracking-calibration                   1         1   0.3
4          choice   eyetracking-fix-object                   1         1   0.3
6   et_adjustment     html-button-response                   1         1   0.3
7   et_adjustment   html-keyboard-response                   1         1   0.3
..            ...                      ...                 ...       ...   ...
8     pre_et_init         eyetracking-init                  14        14   4.4
5             end                      end                 124       124  39.2
0       beginning                beginning           beginning         0   0.0
1        et_tasks                 et_tasks            et_tasks         0   0.0
2           total                    total               total       159  50.2

[14 rows x 5 columns]


chinFirst==1
 

### Inspect calibration

The last trial during calibration varies. There no one 'first calibration' or similar, when the subjects drop out 

In [18]:
data = data_raw

grouped_trial_type_new = data \
    .loc[: , ['run_id', 'chinFirst', 'trial_index', 'trial_type_new']] \
    .drop_duplicates()

last_trial_for_each_subject = data \
    .groupby(['run_id'])['trial_index'].max() \
    .reset_index() \
    .merge(grouped_trial_type_new, on=['run_id', 'trial_index'], how='left')

last_trial_for_each_subject = add_next_trial(last_trial_for_each_subject)
last_trial_for_each_subject.loc[
    last_trial_for_each_subject['trial_type_new']=='calibration_1'
]

Unnamed: 0,run_id,trial_index,chinFirst,trial_type_new,next_trial_type,next_trial_type_new
68,141,19,0,calibration_1,eyetracking-calibration,calibration_1
127,221,19,1,calibration_1,eyetracking-calibration,calibration_1
135,229,90,0,calibration_1,html-keyboard-response,calibration_1
191,300,28,0,calibration_1,html-keyboard-response,calibration_1
212,332,26,0,calibration_1,html-keyboard-response,calibration_1
245,374,18,0,calibration_1,html-keyboard-response,calibration_1
299,443,70,0,calibration_1,html-keyboard-response,calibration_1


## Duplicate Prolific ID's - Multiple participations

In [20]:
duplicates = data_raw.loc[
    pd.notna(data_raw['prolificID']), 
    ['prolificID', 'trial_index']
].duplicated()

duplicate_subjects = data_raw.loc[
    pd.notna(data_raw['prolificID']) & 
    duplicates, 
    'prolificID'
].unique()

if len(duplicate_subjects) > 0:
    print(f'! Attention: Duplicate prolific IDs ! \n')
    print(f' - Number of subjects who had to try multiple times: {len(duplicate_subjects)}')
    print(f' - Sum of duplicate entries in data_raw (prolificID & trial_index): {sum(duplicates)}')

else:
    print('Success: No duplicate subjects found')

! Attention: Duplicate prolific IDs ! 

 - Number of subjects who had to try multiple times: 19
 - Sum of duplicate entries in data_raw (prolificID & trial_index): 214


Most subjects, who had to redo, previously dropped out during calibration briefing (n=7) and the initialization (n=11) multiple times

In [21]:
runs_max_trial = data_raw.loc[data_raw['prolificID'].isin(duplicate_subjects), :] \
    .groupby(['prolificID', 'run_id'], as_index=False)['trial_index'].max() \
    .merge(
        data_raw.loc[
            :, 
            ['run_id', 'chinFirst', 'trial_index', 'trial_type', 'trial_type_nr', 'trial_type_new']
        ], 
        on=['run_id', 'trial_index'], 
        how='left')
runs_max_trial = runs_max_trial \
    .loc[runs_max_trial['trial_type_new']!='end', :]

runs_max_trial= add_next_trial(runs_max_trial)

runs_max_trial.head(10)

Unnamed: 0,prolificID,run_id,trial_index,chinFirst,trial_type,trial_type_nr,trial_type_new,next_trial_type,next_trial_type_new
0,56b7a271e77ebe000bbeff49,124,4,1,html-button-response,2.0,pre_et_init,eyetracking-init,et_init
1,56b7a271e77ebe000bbeff49,125,15,1,html-keyboard-response,5.0,calibration_1_briefing,html-keyboard-response,calibration_1_briefing
2,5c5219d7823a35000128d460,351,285,1,html-keyboard-response,10.0,choice,eyetracking-fix-object,choice
3,5c5219d7823a35000128d460,353,8,0,html-button-response,4.0,et_adjustment,html-keyboard-response,et_adjustment
4,5c95970cd676900016e1a940,168,14,1,html-keyboard-response,5.0,calibration_1_briefing,html-keyboard-response,calibration_1_briefing
5,5c95970cd676900016e1a940,178,14,1,html-keyboard-response,5.0,calibration_1_briefing,html-keyboard-response,calibration_1_briefing
6,5c95970cd676900016e1a940,181,14,1,html-keyboard-response,5.0,calibration_1_briefing,html-keyboard-response,calibration_1_briefing
8,5cbe04b4f429ff00159de30e,388,14,0,html-keyboard-response,5.0,calibration_1_briefing,html-keyboard-response,calibration_1_briefing
10,5d754a0ae8351c0001f3ffc8,147,4,0,html-button-response,2.0,pre_et_init,eyetracking-init,et_init
11,5d754a0ae8351c0001f3ffc8,148,4,0,html-button-response,2.0,pre_et_init,eyetracking-init,et_init


In [22]:
runs_max_trial \
    .groupby(['next_trial_type']).nunique()['prolificID'] \
    .reset_index() \
    .sort_values(by='prolificID')

Unnamed: 0,next_trial_type,prolificID
0,eyetracking-calibration,1
1,eyetracking-fix-object,1
3,html-button-response,2
4,html-keyboard-response,8
2,eyetracking-init,11


## Binary data

In [23]:
data_raw['chinFirst'] = data_raw['chinFirst'].replace({'no': 0, 'yes': 1}) 

## Numerics

In [24]:
columns = [
            'eyeshadow', 
            'masquara',
            'eyeliner',
            'browliner',
            'triedChin', 
            'keptHead'
        ]

for col in columns:
    surveyData[col] = surveyData[col].replace({'no': 0, 'yes': 1}) 

# data_trial

In [25]:
data_trial = data_raw.loc[
    :,
    [
        'run_id', 'subject', 'chinFirst', 
        'trial_index',
        'trial_type', 'task_nr', 
        'rt', 'stimulus', 'key_press', 
        'time_elapsed', 'trial_duration', 'recorded_at',
        'window_width', 'window_height', 'success', 

        'chin', 'x_pos', 'y_pos', 
        
        'choiceTask_amountLeftFirst', 
        'option_topLeft', 'option_bottomLeft',
        'option_topRight', 'option_bottomRight', 
        'chosenAmount', 'chosenDelay',
    ]
]

### Numerics

In [26]:
data_trial = convertToNumeric(data_trial, 
    [
        'run_id', 'subject', 'chinFirst', 'chin', 'task_nr', 'trial_index', # Int
        'key_press', 
        'x_pos', 'y_pos', 'time_elapsed', 'trial_duration',
        'rt',
        'window_width', 'window_height'
    ]
)

# data_et

## Compile

In [27]:
def reformatYang(text):
    text = text.replace('$', ',')
    text = text.replace("relative-x", "x")
    text = text.replace("relative-y", 'y')
    text = text.replace('elapse-time', 't')
    text = (text[11:len(text)-1])
    return(text)

#for subject in data_yang['run_id'].unique():
#    for i in data_raw.loc[
#        (data_raw['run_id']==subject) & 
#        (pd.notna(data_raw['et_data'])) &
#        ~(data_raw['et_data'].isin(['"', 'nan'])), 
#       :].index:
#       print('Reformat index: ' + str(i))
#       data_raw.loc[i, 'et_data'] = reformatYang(data_raw.loc[i, 'et_data'])

In [28]:
len(data_raw.loc[
        (pd.notna(data_raw['et_data'])) & 
        ~(data_raw['et_data'].isin(['"', '[]', 'nan'])), 
        'et_data'].index)

66224

In [29]:
def textToDataframe(text):
    text = text.replace('$', ',')
    dataframe = pd.read_json(text, orient='records')
    return(dataframe)

def extractEyetrackingData(data):
    data_eyetracking = pd.DataFrame(columns=['x', 'y', 't'])
    data["et_data"] = data['et_data'].apply(str)

    et_indices = data.loc[
        (pd.notna(data['et_data'])) & 
        ~(data['et_data'].isin(['"', '[]', 'nan'])), 
        :].index
    index_max = et_indices.max()
    
    for i in tqdm(et_indices):
        
        df = textToDataframe(data.loc[i, 'et_data'])        

        df["t_task"] = (df.loc[:, "t"] - df.loc[0, "t"])
        df['run_id'] = data.loc[i, 'run_id']
        df['trial_index'] = data.loc[i, 'trial_index']

        data_eyetracking = data_eyetracking.append(pd.DataFrame(data=df), ignore_index=True)
    return(data_eyetracking)

data_et = extractEyetrackingData(data_raw.loc[data_raw['run_id'].isin(data_trial['run_id'].unique()), :])

100%|██████████████████████████████████████████████████████████████████████████| 66224/66224 [1:52:41<00:00,  9.79it/s]


In [30]:
data_et = convertToNumeric(data_et, ['x', 'y', 't', 't_task'])

In [31]:
data_et.groupby(['run_id', 'trial_index'])['x', 'y'].count()

  data_et.groupby(['run_id', 'trial_index'])['x', 'y'].count()


Unnamed: 0_level_0,Unnamed: 1_level_0,x,y
run_id,trial_index,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,18.0,6,6
1.0,20.0,11,11
1.0,22.0,11,11
1.0,24.0,11,11
1.0,26.0,10,10
...,...,...,...
462.0,518.0,44,44
462.0,520.0,13,13
462.0,522.0,44,44
462.0,524.0,13,13


# data_subject

In [36]:
df1 = data_raw.loc[
    :, 
    [
        'run_id', 'browser', 'browser_version', 'device',
        'platform', 'platform_version', 'user_agent'
    ]
].drop_duplicates()
    
df2 = data_trial.loc[
    : ,
    [
        'chinFirst', 'choiceTask_amountLeftFirst', 'run_id'
    ]
].drop_duplicates()
    
data_subject = df1.merge(df2, on='run_id', how='left') \
    .rename(columns={'age': 'birthyear'})

## survey data

In [37]:
data_subject = data_subject \
    .merge(
        surveyData,
        on='run_id', 
        how='left'
)

# Var recorded_at

In [38]:
def add_completed_date(data, data_trial):
    output = []

    for subject in tqdm(data_trial['run_id'].unique()):
        thisSubject = data_trial.loc[data_trial['run_id'] == subject] \
            .reset_index(drop=True)
        date_time_obj = datetime.datetime.strptime(
            thisSubject.loc[0, 'recorded_at'], '%Y-%m-%d %H:%M:%S')

        output.append([thisSubject.loc[0, 'run_id'], date_time_obj.date()])

    output = pd.DataFrame(output, columns=['run_id', 'recorded_date'])

    if 'recorded_date' in data.columns: data = data.drop(columns=['date'])
    data = data.merge(output, on='run_id', how='left')
    return data

data_subject = add_completed_date(data_subject, data_trial)
data_subject['recorded_date']

100%|███████████████████████████████████████████████████████████████████████████████| 316/316 [00:00<00:00, 741.07it/s]


0      2021-01-17
1      2021-01-19
2      2021-01-19
3      2021-01-19
4      2021-01-19
          ...    
311    2021-01-19
312    2021-01-19
313    2021-01-19
314    2021-01-19
315    2021-01-19
Name: recorded_date, Length: 316, dtype: object

## Add prolific data

In [39]:
data_prolific_int = pd.read_csv(r'C:/Users/User/GitHub/WebET_Analysis/prolific/prolific_export_int.csv') \
    .rename(columns={'participant_id': 'prolificID'}) 

data_prolific_us = pd.read_csv(r'C:/Users/User/GitHub/WebET_Analysis/prolific/prolific_export_us.csv') \
    .rename(columns={'participant_id': 'prolificID'}) \

data_prolific = data_prolific_int \
    .append(data_prolific_us)
print(len(
    data_prolific.loc[data_prolific['status']=='APPROVED', :]))
print(len(data_subject.prolificID.unique()))

250
278


In [40]:
data_subject = data_subject.merge(data_prolific, on='prolificID', how='left')
data_subject['status'] = data_subject['status'].fillna('NOTPROLIFIC')
data_subject.loc[:, ['run_id', 'prolificID']]

Unnamed: 0,run_id,prolificID
0,1,5fccc8ac636416a4288a9f3d
1,103,600063f2943eab0acc812ed8
2,105,
3,106,5d485e8415055400194b707f
4,108,55b237e6fdf99b19ea79d2f7
...,...,...
311,94,5ecfc227f036c902457fc44c
312,96,5eb9940830ab6c098bd1943d
313,97,5f68fe5be5cc370c0a0911de
314,98,5f779405dacc842fc44bc767


# data_prolific
For participant management

In [41]:
temp = data_subject.rename(columns={
    'chosenAmount': 'bonus_USD',
    'chosenDelay': 'bonus_delay'
    }
)

data_prolific = data_prolific.merge(
    temp.loc[:, 
        np.append(['prolificID'], 
        temp.columns.difference(data_prolific.columns))], 
    on='prolificID', 
    how='left')

# Export data

In [42]:
if not os.path.exists('./data_jupyter'):
    os.mkdir('./data_jupyter')

#data_et.to_csv("data_jupyter/data_et.csv", index=False, header=True)
data_trial.to_csv("data_jupyter/data_trial.csv", index=False, header=True)
data_subject.to_csv("data_jupyter/data_subject_raw.csv", index=False, header=True)
data_prolific.to_csv("data_jupyter/data_prolific.csv", index=False, header=True)

# Feedback

In [43]:
print('Success! Script ran through')

Success! Script ran through
