In [1]:
# import sys
# !conda install --yes --prefix {sys.prefix} pingouin

In [2]:
import datetime
import numpy as np
import math
import os
import pandas as pd
import pingouin as pg
import re
import seaborn as sns
import json
import statsmodels.api as sm 
import statsmodels.formula.api as smf
import statsmodels.graphics.api as smg
import sys

if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO
    
from IPython.display import HTML
def View(df):
    """Show `df` as a styled HTML table in a small pop-up browser window.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to render; only its ``to_html()`` output is used.

    Returns
    -------
    IPython.display.HTML
        A ``<script>`` element that opens the pop-up window and writes the
        table plus the CSS into it, followed by the same CSS block.
    """
    css = """<style>
    table { border-collapse: collapse; border: 3px solid #eee; }
    table tr th:first-child { background-color: #eeeeee; color: #333; font-weight: bold }
    table thead th { background-color: #eee; color: #000; }
    tr, th, td { border: 1px solid #ccc; border-width: 1px 0 0 1px; border-collapse: collapse;
    padding: 3px; font-family: monospace; font-size: 10px }</style>
    """
    # json.dumps produces a valid JavaScript string literal, so quotes,
    # backslashes and newlines inside the table can no longer break the
    # script. Escaping "</" stops a literal "</script>" in a cell from
    # closing the surrounding <script> element early.
    markup = json.dumps(df.to_html() + css).replace('</', '<\\/')
    s  = '<script type="text/Javascript">'
    s += 'var win = window.open("", "Title", "toolbar=no, location=no, directories=no, status=no, menubar=no, scrollbars=yes, resizable=yes, width=780, height=200, top="+(screen.height-400)+", left="+(screen.width-840));'
    s += 'win.document.body.innerHTML = ' + markup + ';'
    s += '</script>'
    return HTML(s + css)

# Project root. Defaults to the original hard-coded checkout location, but can
# be overridden with the WEBET_ANALYSIS_DIR environment variable so that the
# notebook does not have to be edited to run on another machine.
PROJECT_DIR = os.environ.get('WEBET_ANALYSIS_DIR', r'C:\Users\User\GitHub\WebET_Analysis')
os.chdir(PROJECT_DIR)
print("Current Working directory " , os.getcwd())

Current Working directory  C:\Users\User\GitHub\WebET_Analysis


In [3]:
# from IPython.display import HTML

# HTML('''<script>
# code_show=true; 
# function code_toggle() {
#  if (code_show){
#  $('div.input').hide();
#  } else {
#  $('div.input').show();
#  }
#  code_show = !code_show
# } 
# $( document ).ready(code_toggle);
# </script>
# <form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

# Raw Data

## Search for specific subjects

In [4]:
#path = 'data_cognition'
#subject_files = os.listdir(path)
#for i in range(0, len(subject_files)):
#    thisSubject = open(path + "/" + subject_files[i]).read()
#    if thisSubject.find('5c0e4e3d87e876000151cfec') > (-1):
#        print(subject_files[i])

## Read CSV from String

In [5]:
def clean_html(raw_html):
    """Return `raw_html` with every ``<...>`` tag removed.

    A tag is the shortest run from ``<`` to the next ``>``; because ``.`` does
    not match a newline, a ``<`` whose ``>`` is on a later line is kept.
    """
    # https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
    return re.sub(r'<.*?>', '', raw_html)

def clean_et_text(text):
    """Replace every comma inside ``[...]`` groups with ``$``.

    Commas outside square brackets are left alone. A group is the shortest
    run from ``[`` to the next ``]`` on the same line.

    Parameters
    ----------
    text : str
        Raw text of one subject file.

    Returns
    -------
    str
        `text` with the commas inside bracket groups replaced by ``$``.
    """
    # One pass with a callback instead of one full-text str.replace per match;
    # the raw string avoids the invalid "\[" escape of the original pattern.
    return re.sub(
        r'\[.*?\]',
        lambda match: match.group(0).replace(',', '$'),
        text,
    )

def cleanSurveyText(text):
    """Replace every comma inside ``{...}`` groups with ``§``.

    Commas outside curly braces are left alone. A group is the shortest run
    from ``{`` to the next ``}`` on the same line.

    Parameters
    ----------
    text : str
        Raw text of one subject file.

    Returns
    -------
    str
        `text` with the commas inside brace groups replaced by ``§``.
    """
    # One pass with a callback instead of one full-text str.replace per match;
    # the raw string avoids the invalid "\{" escape of the original pattern.
    return re.sub(
        r'\{.*?\}',
        lambda match: match.group(0).replace(',', '§'),
        text,
    )

def compileData(path):
    """Read every file in `path` as CSV and stack them into one frame.

    Each file is read as text, cleaned with `clean_html`, `clean_et_text` and
    `cleanSurveyText`, and only then parsed with ``pd.read_csv``.

    Parameters
    ----------
    path : str
        Directory holding one CSV file per entry; every directory entry is
        read, in ``os.listdir`` order.

    Returns
    -------
    pandas.DataFrame
        All files concatenated row-wise with a fresh 0..n-1 index.
    """
    all_subjects = []
    for file_name in os.listdir(path):
        # The context manager closes the handle right after reading; the
        # original left every file open until garbage collection.
        with open(os.path.join(path, file_name)) as subject_file:
            csv_thisSubject = subject_file.read()
        csv_thisSubject = clean_html(csv_thisSubject)
        csv_thisSubject = clean_et_text(csv_thisSubject)
        csv_thisSubject = cleanSurveyText(csv_thisSubject)
        all_subjects.append(pd.read_csv(StringIO(csv_thisSubject)))
    return pd.concat(all_subjects).reset_index(drop=True)

# Load all subject files from the "data_prolific" folder into one frame
data_raw = compileData("data_prolific")

# No modifications are applied here; just list the run IDs that were loaded
data_raw['run_id'].unique()

array([1, 103, 105, 106, 108, 11, 117, 12, 121, 124, 125, 126, 128, 13,
       130, 131, 14, 19, 24, 25, 28, 3, 30, 32, 36, 37, 38, 4, 41, 42, 43,
       45, 47, 48, 49, 5, 50, 51, 54, 56, 57, 58, 59, 6, 61, 63, 66, 67,
       7, 70, 72, 74, 75, 80, 81, 83, 85, 88, 9, 90, 91, 92, 93, 94, 96,
       97, 98, 99], dtype=object)

In [6]:
data_yang = compileData("data_yang2020WG")
# Custom modifications
# Rows labelled 0 to 515 (inclusive) are assigned run_id 0
data_yang.loc[0:515, 'run_id'] = 0
# Runs 4 and 18 are dropped
data_yang = data_yang.loc[~data_yang['run_id'].isin([4, 18]), :]
data_yang = data_yang.rename(columns={'eyeData': 'et_data'})
# Shift the IDs by 1000 (presumably to keep them apart from the Prolific run IDs)
data_yang['run_id'] = data_yang['run_id'] + 1000
print(data_yang['run_id'].unique())
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
# NOTE: re-running this cell adds the rows to data_raw a second time.
data_raw = pd.concat([data_raw, data_yang.reset_index(drop=True)])

[1000 1001 1011 1013 1014 1015 1016 1017 1019 1002 1020 1021 1003 1007
 1008 1009 1022 1024]


In [7]:
data_milieu = compileData("data_milieu")
# Shift the IDs by 2000 (presumably to keep them apart from the other data sets)
data_milieu['run_id'] = data_milieu['run_id'] + 2000
print(data_milieu['run_id'].unique())
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
# NOTE: re-running this cell adds the rows to data_raw a second time.
data_raw = pd.concat([data_raw, data_milieu.reset_index(drop=True)])

[2011 2012 2013 2002 2008 2009]


### Exclude empty studies

In [8]:
# Keep only rows whose trial_index is greater than 0
has_trials = data_raw['trial_index'] > 0
data_raw = data_raw.loc[has_trials, :]

### Survey data

In [9]:
def cleanOptionalNote(text):
    """Turn ``§`` back into spaces inside the ``optionalNote`` entry.

    Only the first span from ``optionalNote":`` up to the next ``}`` is
    considered; if there is none, `text` is returned unchanged.

    Parameters
    ----------
    text : str
        One survey response string.

    Returns
    -------
    str
        `text` with the ``§`` characters of the optional note replaced by
        spaces.
    """
    # Raw string: the original non-raw pattern held the invalid escape "\}".
    match = re.search(r'optionalNote":.*?\}', text)
    if match is None:
        return text
    old = match.group(0)
    return text.replace(old, old.replace('§', ' '))


def surveyStringToFrame(subject, string):
    """Parse one survey response string into a one-row, wide data frame.

    Braces and double quotes are stripped and ``§`` becomes ``$``; the result
    is read as CSV with ``:`` as separator and ``$`` as line terminator, then
    transposed so that the first field of each record becomes a column.

    `subject` is accepted but not used.
    """
    cleaned = cleanOptionalNote(string)
    for unwanted in ('{', '}', '"'):
        cleaned = cleaned.replace(unwanted, '')
    cleaned = cleaned.replace('§', '$')

    long_format = pd.read_csv(
        StringIO(cleaned),
        sep=":",
        lineterminator="$",
        header=None,
        index_col=0,
    )
    return long_format.transpose()


def surveyData_thisSubject(data):
    """Collect all survey answers of one subject into a single wide frame.

    Rows whose ``responses`` value is missing or equal to ``"`` are skipped.
    Each remaining response is parsed with `surveyStringToFrame`, the pieces
    are joined column-wise, and a ``run_id`` column holding the first run ID
    found in those rows is added.
    """
    has_response = pd.notna(data["responses"]) & (data["responses"] != '"')
    df_thisSubject = data.loc[has_response, :].reset_index()
    subject = df_thisSubject['run_id'].unique()[0]

    answer_frames = [
        surveyStringToFrame(subject, response)
        for response in df_thisSubject['responses']
    ]

    output = pd.concat(answer_frames, axis=1)
    output['run_id'] = subject
    return output

def addSurveyData(data):
    """Attach each subject's survey answers to every row of that subject.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``run_id`` and ``responses``.

    Returns
    -------
    pandas.DataFrame
        `data` left-merged on ``run_id`` with the survey columns, with the
        ``responses`` column dropped.
    """
    survey_columns = [
        'prolificID', 'age', 'gender', 'ethnic', 'sight',
        'glasses', 'degree', 'eyeshadow', 'masquara', 'eyeliner',
        'browliner', 'vertPosition', 'triedChin', 'keptHead',
        'optionalNote', 'run_id']

    # Gather the per-subject frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every iteration.
    # The empty template comes first so the expected columns exist, in this
    # order, even if a subject did not answer every question.
    survey_frames = [pd.DataFrame(columns=survey_columns)]
    for subject in data['run_id'].unique():
        survey_frames.append(
            surveyData_thisSubject(
                data.loc[data['run_id']==subject, ['run_id', 'responses']]
            )
        )
    surveyData_allSubjects = pd.concat(survey_frames)

    data = data.merge(surveyData_allSubjects, on='run_id', how='left')
    data = data.drop(columns='responses')
    return data
    
# Merge the survey answers into the raw data (this drops the 'responses' column)
data_raw = addSurveyData(data_raw)
# Show which runs remain and which columns are now available
print(data_raw['run_id'].unique())
print(data_raw.columns)

[1 103 106 108 11 12 124 125 126 128 13 130 131 14 19 24 25 28 30 32 36 37
 38 4 41 42 43 45 47 48 49 5 54 56 58 59 6 61 63 66 67 7 70 72 74 75 80 81
 83 85 88 9 90 91 92 93 94 96 97 98 99 1000 1001 1011 1014 1015 1016 1017
 1019 1002 1020 1021 1003 1007 1008 1009 1022 1024 2011 2012 2013 2002
 2008 2009]
Index(['run_id', 'condition', 'rt', 'stimulus', 'key_press', 'trial_type',
       'trial_index', 'time_elapsed', 'internal_node_id', 'subject',
       'chinFirst', 'choiceTask_amountLeftFirst', 'webcam_label', 'webcam_fps',
       'webcam_height', 'webcam_width', 'button_pressed', 'window_width',
       'window_height', 'chin', 'success', 'x_pos', 'y_pos', 'task_nr',
       'et_data', 'trial_duration', 'option_topLeft', 'option_bottomLeft',
       'option_topRight', 'option_bottomRight', 'recorded_at', 'ip',
       'user_agent', 'device', 'browser', 'browser_version', 'platform',
       'platform_version', 'Unnamed: 2', 'chosenAmount', 'chosenDelay',
       'webcam_Id', 'webcam_aspect

In [10]:
def convertToNumeric(data, columns):
    """Convert `columns` of `data` to numeric dtype, in place.

    Values that cannot be parsed become NaN (``errors='coerce'``). The same
    `data` object is modified and returned.
    """
    coerced = data[columns].apply(
        lambda column: pd.to_numeric(column, errors='coerce')
    )
    data[columns] = coerced
    return data


data_raw = convertToNumeric(data_raw, ['age'])

# Shorten the education labels
degree_labels = {
    'College / Undergraduate / Bachelor': 'college',
    'High School': 'highSchool',
    'Graduate / PhD / Master': 'grad',
    'Middle School': 'middle',
}
data_raw['degree'] = data_raw['degree'].replace(degree_labels)

# Yes/no columns are recoded to 1/0
columns = [
    'chinFirst',
    'eyeshadow',
    'masquara',
    'eyeliner',
    'browliner',
    'triedChin',
    'keptHead',
]
yes_no_codes = {'no': 0, 'yes': 1}
data_raw[columns] = data_raw[columns].replace(yes_no_codes)

data_raw['run_id'].unique()

array([1, 103, 106, 108, 11, 12, 124, 125, 126, 128, 13, 130, 131, 14, 19,
       24, 25, 28, 30, 32, 36, 37, 38, 4, 41, 42, 43, 45, 47, 48, 49, 5,
       54, 56, 58, 59, 6, 61, 63, 66, 67, 7, 70, 72, 74, 75, 80, 81, 83,
       85, 88, 9, 90, 91, 92, 93, 94, 96, 97, 98, 99, 1000, 1001, 1011,
       1014, 1015, 1016, 1017, 1019, 1002, 1020, 1021, 1003, 1007, 1008,
       1009, 1022, 1024, 2011, 2012, 2013, 2002, 2008, 2009], dtype=object)

# data_trial

In [11]:
# Columns carried over from the raw data into the trial-level frame
trial_columns = [
    # identification
    'run_id', 'subject', 'prolificID', 'chinFirst',
    'trial_index',
    'trial_type', 'task_nr',
    # timing and responses
    'rt', 'stimulus', 'key_press',
    'time_elapsed', 'trial_duration', 'recorded_at',
    'window_width', 'window_height', 'success',
    # position columns
    'chin', 'x_pos', 'y_pos',
    # choice task
    'choiceTask_amountLeftFirst',
    'option_topLeft', 'option_bottomLeft',
    'option_topRight', 'option_bottomRight',
    'chosenAmount', 'chosenDelay',
]

data_trial = data_raw.loc[:, trial_columns]

## Clean

### Failed Webgazer Setup

In [12]:
# A run counts as a failed setup when its highest trial_index is below 15.
# The mask is built from data_trial itself (the original used data_raw and
# relied on both frames sharing an index), and .max() is taken on a single
# column so int() receives a scalar rather than a one-element Series.
subjects_failedSetups = [
    subject
    for subject in data_trial['run_id'].unique()
    if int(
        data_trial.loc[data_trial['run_id']==subject, 'trial_index'].max()
    ) < 15
]

# Show the affected runs together with their Prolific IDs
data_trial.loc[
    data_trial['run_id'].isin(subjects_failedSetups),
    ['run_id', 'prolificID']
].drop_duplicates()

Unnamed: 0,run_id,prolificID
1036,106,5d485e8415055400194b707f
1038,108,55b237e6fdf99b19ea79d2f7
2079,124,56b7a271e77ebe000bbeff49
4684,14,5ec2cdfd1a17930ddf6f1443
12984,49,5ecfc227f036c902457fc44c
13507,54,5f94d146f65f56336a61cc16
18338,72,600324757b530e39903effc0
20414,83,600324757b530e39903effc0
20935,88,5ecfc227f036c902457fc44c
25608,99,55b237e6fdf99b19ea79d2f


In [13]:
# Drop every run that was flagged as a failed setup
passed_setup = ~data_trial['run_id'].isin(subjects_failedSetups)
data_trial = data_trial.loc[passed_setup, :]
print(data_trial['run_id'].unique())

[1 103 11 12 125 126 128 13 130 131 19 24 25 28 30 32 36 37 38 4 41 42 43
 45 47 48 5 56 58 59 6 61 63 66 67 7 70 74 75 80 81 85 9 90 91 92 93 94 96
 97 98 1000 1011 1021 1003 1008 2011 2012 2013 2002 2008 2009]


### Duplicate Prolific IDs

In [24]:
# Run 2002 is given its own ID so it does not collide with another run
is_run_2002 = data_trial['run_id']==2002
data_trial.loc[is_run_2002, 'prolificID'] = 'Tim2'

# The same (prolificID, trial_index) pair occurring twice means the same
# Prolific ID took part in more than one run
duplicates = data_trial[['prolificID', 'trial_index']].duplicated()
duplicateSubjects = data_trial.loc[duplicates, ['run_id', 'prolificID', 'trial_index']]

if duplicateSubjects.empty:
    print('Success: No duplicate subjects found')
else:
    print('! Attention: Duplicate subjects: Check out the following: \n')
    print(duplicateSubjects)

Success: No duplicate subjects found


### Choice data

In [25]:
# Cleaning
data_trial.loc[: , 
       [
           'option_topLeft',
           'option_bottomLeft', 
           'option_topRight', 
           'option_bottomRight', 
           'chosenAmount', 
           'chosenDelay'
       ] 
    ] = data_trial.loc[:, 
       [
           'option_topLeft',
           'option_bottomLeft', 
           'option_topRight', 
           'option_bottomRight', 
           'chosenAmount', 
           'chosenDelay'
       ] 
    ].replace(['Today', 'Tomorrow', '7 days', '15 days', '30 days', '90 days', '180 days'], 
             [0, 1, 7, 15, 30, 90, 180]) \
    .replace({'\$':''}, regex = True) \
    .replace('50 cent', 0.5) 

TypeError: Cannot compare types 'ndarray(dtype=float64)' and 'str'

### Numerics

In [None]:
# Columns of data_trial that are converted to numbers (unparseable -> NaN)
numeric_trial_columns = [
    'run_id', 'subject', 'chinFirst', 'chin', 'task_nr', 'trial_index', # Int
    'key_press',
    'x_pos', 'y_pos', 'time_elapsed', 'trial_duration',
    'rt',
    'window_width', 'window_height',
    'option_topLeft',
    'option_bottomLeft',
    'option_topRight',
    'option_bottomRight',
    'chosenAmount',
    'chosenDelay',
]

data_trial = convertToNumeric(data_trial, numeric_trial_columns)

# data_et

## Compile

In [None]:
def reformatYang(text):
    """Rewrite one eye-tracking string of the Yang data set.

    ``$`` becomes ``,`` and the keys ``relative-x`` / ``relative-y`` /
    ``elapse-time`` are renamed to ``x`` / ``y`` / ``t``. The first 11
    characters and the last character are then cut off.
    """
    substitutions = (
        ('$', ','),
        ("relative-x", "x"),
        ("relative-y", 'y'),
        ('elapse-time', 't'),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    return text[11:-1]

# Rows that carry eye-tracking text (not missing, not '"' and not 'nan')
has_et_text = (
    pd.notna(data_raw['et_data']) &
    ~data_raw['et_data'].isin(['"', 'nan'])
)

# Rewrite the eye-tracking strings of every Yang run, one row at a time
for subject in data_yang['run_id'].unique():
    rows_thisSubject = data_raw.loc[
        (data_raw['run_id']==subject) & has_et_text,
        :].index
    for i in rows_thisSubject:
        print('Reformat index: ' + str(i))
        data_raw.loc[i, 'et_data'] = reformatYang(data_raw.loc[i, 'et_data'])

In [None]:
def text_to_data_frame(text):
    """Parse one eye-tracking string into a data frame.

    Parameters
    ----------
    text : str
        JSON list of records in which ``$`` stands in for ``,``.

    Returns
    -------
    pandas.DataFrame
        One row per record, one column per key.
    """
    text = text.replace('$', ',')
    # Passing a literal JSON string to pd.read_json is deprecated; wrapping
    # it in StringIO works on old and new pandas versions alike.
    return pd.read_json(StringIO(text), orient='records')

def extractEyetrackingData(data):
    """Expand the per-trial ``et_data`` strings into one long data frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``et_data``, ``run_id`` and ``trial_index``. Rows whose
        ``et_data`` (as a string) is ``"``, ``[]`` or ``nan`` are skipped.
        The frame is no longer modified (the original overwrote its
        ``et_data`` column with strings).

    Returns
    -------
    pandas.DataFrame
        One row per decoded sample with at least the columns ``x``, ``y``,
        ``t``, plus ``t_task`` (``t`` minus the first ``t`` of the trial),
        ``run_id`` and ``trial_index``.
    """
    # Work on a string copy of the column instead of writing into `data`.
    # After str() a missing value reads 'nan', which the filter excludes.
    et_text = data['et_data'].apply(str)
    decodable = ~et_text.isin(['"', '[]', 'nan'])

    # Collect the pieces and concatenate once: DataFrame.append was removed
    # in pandas 2.0 and re-copied the growing frame on every iteration.
    # The empty template keeps x, y, t as the leading columns.
    frames = [pd.DataFrame(columns=['x', 'y', 't'])]
    for i in data.index[decodable.to_numpy()]:
        print('Start decoding Index ' + str(i))
        df = text_to_data_frame(et_text.loc[i])

        df["t_task"] = (df.loc[:, "t"] - df.loc[0, "t"])
        df['run_id'] = data.loc[i, 'run_id']
        df['trial_index'] = data.loc[i, 'trial_index']

        frames.append(df)
        print('Index ' + str(i) + ' extracted')
    return pd.concat(frames, ignore_index=True)

# Only decode eye-tracking data of runs that are still part of data_trial
in_data_trial = data_raw['run_id'].isin(data_trial['run_id'].unique())
data_et = extractEyetrackingData(data_raw.loc[in_data_trial, :])

In [None]:
# Convert the sample columns to numbers; values that cannot be parsed become NaN
data_et = convertToNumeric(data_et, ['x', 'y', 't', 't_task'])

In [None]:
# Number of non-missing x and y samples per run and trial. Several columns
# must be selected with a list: the tuple form ['x', 'y'] was removed in
# pandas 2.0.
data_et.groupby(['run_id', 'trial_index'])[['x', 'y']].count()

# data_subject

In [None]:
# Both sides are de-duplicated BEFORE the merge. Merging the full trial-level
# frames on run_id alone pairs every trial row with every raw row of the same
# run and only then drops the duplicates; de-duplicating first yields the
# same set of rows without that intermediate blow-up. Only the index labels
# differ, and they are not exported.
trial_part = data_trial.loc[
    : ,
    [
        'chinFirst', 'choiceTask_amountLeftFirst', 'chosenAmount',
        'chosenDelay', 'run_id', 'prolificID'
    ]
].drop_duplicates()

raw_part = data_raw.loc[:, ['run_id',
                            'age', 'browliner', 'browser', 'browser_version', 'degree', 'device',
                            'ethnic', 'eyeliner', 'eyeshadow', 'gender', 'glasses', 'keptHead',
                            'masquara', 'optionalNote', 'platform', 'platform_version', 'sight',
                            'triedChin', 'user_agent', 'vertPosition', 'webcam_fps',
                            'webcam_height', 'webcam_label', 'webcam_width']].drop_duplicates()

data_subject = trial_part \
    .merge(raw_part,
           on='run_id',
           how='left') \
    .drop_duplicates()
data_subject.columns

# Export data

In [None]:
# Create the output folder if needed; exist_ok replaces the separate
# exists() check and is safe when the folder is already there.
os.makedirs('./data_jupyter', exist_ok=True)

# Write the three data sets without the index column
data_et.to_csv("data_jupyter/data_et.csv", index=False, header=True)
data_trial.to_csv("data_jupyter/data_trial.csv", index=False, header=True)
data_subject.to_csv("data_jupyter/data_subject.csv", index=False, header=True)

# Feedback

In [None]:
# Reaching this line means every cell above ran without raising an exception
print('Success! Script ran through')