In [8]:
import datetime
import numpy as np
import math
import os
import pandas as pd
import re
import seaborn as sns
import json
import statsmodels.api as sm 
import statsmodels.formula.api as smf
import statsmodels.graphics.api as smg
import sys
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO    
from tqdm import tqdm 

    
from IPython.display import HTML
def View(df):
    """Show a DataFrame as an HTML table in a separate browser pop-up window.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to render; it is converted with ``df.to_html()``.

    Returns
    -------
    IPython.display.HTML
        A display object holding a ``<script>`` that opens the pop-up and
        writes the table into it, followed by the table stylesheet.
    """
    css = """<style>
    table { border-collapse: collapse; border: 3px solid #eee; }
    table tr th:first-child { background-color: #eeeeee; color: #333; font-weight: bold }
    table thead th { background-color: #eee; color: #000; }
    tr, th, td { border: 1px solid #ccc; border-width: 1px 0 0 1px; border-collapse: collapse;
    padding: 3px; font-family: monospace; font-size: 10px }</style>
    """
    # json.dumps produces a valid JavaScript string literal: quotes, backslashes
    # and newlines inside the table are escaped, so free-text cells (e.g. notes
    # containing apostrophes) can no longer break the generated script.
    # '</' is additionally escaped so the markup cannot close the <script> tag.
    js_literal = json.dumps(df.to_html() + css).replace('</', '<\\/')

    s  = '<script type="text/Javascript">'
    s += 'var win = window.open("", "Title", "toolbar=no, location=no, directories=no, status=no, menubar=no, scrollbars=yes, resizable=yes, width=780, height=200, top="+(screen.height-400)+", left="+(screen.width-840));'
    s += 'win.document.body.innerHTML = ' + js_literal + ';'
    s += '</script>'
    return(HTML(s+css))    

# All relative paths used in this notebook are resolved against the project root.
# The root defaults to the original author's checkout; set the environment
# variable WEBET_ANALYSIS_DIR to run the notebook from a different location.
os.chdir(os.environ.get('WEBET_ANALYSIS_DIR', r'C:\Users\User\GitHub\WebET_Analysis'))
print("Current Working directory " , os.getcwd())

Current Working directory  C:\Users\User\GitHub\WebET_Analysis


# Raw Data

## Search for specific subjects

In [9]:
# Print the name of every file in `path` whose raw text contains 'Unnamed'.
path = 'data_prolific'
subject_files = os.listdir(path)
for file_name in subject_files:
    # The context manager closes each file again; the previous version left
    # one open handle per subject file behind.
    with open(os.path.join(path, file_name)) as subject_file:
        thisSubject_txt = subject_file.read()
    if 'Unnamed' in thisSubject_txt:
        print(file_name)

## Read CSV from String

In [10]:
def clean_html(raw_html):
    """Return `raw_html` with every ``<...>`` tag removed.

    A tag is the shortest run from ``<`` to the next ``>`` on the same line
    (``.`` does not match newlines), so a ``<`` without a closing ``>`` on
    that line is kept as-is.
    """
    # https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
    return re.sub('<.*?>', '', raw_html)

def clean_et_text(text):
    """Replace every comma inside a ``[...]`` span with ``$``.

    A span is the shortest run from ``[`` to the next ``]`` on the same line.
    Commas outside such spans are left untouched, so the surrounding text can
    still be parsed as comma-separated values.

    Parameters
    ----------
    text : str
        Raw text of one subject file.

    Returns
    -------
    str
        The text with bracketed commas replaced by ``$``.
    """
    # One regex pass rewrites each match in place. This avoids re-scanning the
    # whole text once per match and also handles a match whose text was already
    # altered by an earlier replacement.
    return re.sub(r'\[.*?\]', lambda match: match.group(0).replace(',', '$'), text)

def cleanSurveyText(text):
    """Replace every comma inside a ``{...}`` span with ``§``.

    A span is the shortest run from ``{`` to the next ``}`` on the same line.
    Commas outside such spans are left untouched, so the surrounding text can
    still be parsed as comma-separated values.

    Parameters
    ----------
    text : str
        Raw text of one subject file.

    Returns
    -------
    str
        The text with commas inside braces replaced by ``§``.
    """
    # Single regex pass instead of findall + repeated whole-text str.replace.
    return re.sub(r'\{.*?\}', lambda match: match.group(0).replace(',', '§'), text)

def compileData(path):
    """Read every file in `path` as CSV and stack the results into one frame.

    Each file's text is passed through ``clean_html``, ``clean_et_text`` and
    ``cleanSurveyText`` before being parsed with ``pd.read_csv``. Files that
    parse to zero rows are skipped.

    Parameters
    ----------
    path : str
        Directory containing one CSV file per subject.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all non-empty subject frames with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If no file in `path` produced any rows.
    """
    all_subjects = []
    for file_name in tqdm(os.listdir(path)):
        # Close each file right after reading instead of leaking the handle.
        with open(os.path.join(path, file_name)) as subject_file:
            csv_thisSubject = subject_file.read()
        csv_thisSubject = clean_html(csv_thisSubject)
        csv_thisSubject = clean_et_text(csv_thisSubject)
        csv_thisSubject = cleanSurveyText(csv_thisSubject)
        df_thisSubject = pd.read_csv(StringIO(csv_thisSubject))
        if len(df_thisSubject) > 0:
            all_subjects.append(df_thisSubject)
    if not all_subjects:
        # pd.concat([]) would raise a ValueError as well, but without naming
        # the directory that turned out to be empty.
        raise ValueError(f'No non-empty subject files found in {path!r}')
    return pd.concat(all_subjects).reset_index(drop=True)

# Load and stack all subject files from the 'data_prolific' directory
# (relative to the current working directory).
data_raw = compileData("data_prolific")

100%|████████████████████████████████████████████████████████████████████████████████| 467/467 [00:59<00:00,  7.81it/s]


## survey data (to merge with data_subject)

In [11]:
def cleanOptionalNote(text):
    """Neutralise separator characters inside the ``optionalNote`` answer.

    The first span running from ``optionalNote":`` to the next ``}`` is
    located. Within it the characters ``§``, ``:``, ``(``, ``)`` and ``$`` are
    replaced by spaces, then the colon directly after the key is restored.
    Every occurrence of the original span in `text` is replaced by the
    cleaned version.

    Parameters
    ----------
    text : str
        One survey response string.

    Returns
    -------
    str
        `text` unchanged if it has no ``optionalNote`` span, otherwise `text`
        with the cleaned span.
    """
    match = re.search(r'optionalNote":.*?\}', text)
    if match is None:
        return text

    old = match.group(0)
    # '$' is blanked too: surveyStringToFrame uses it as the line terminator,
    # so a '$' left inside the free-text note would split the note into
    # spurious records.
    blank_out = str.maketrans({'§': ' ', ':': ' ', '(': ' ', ')': ' ', '$': ' '})
    # The translation also blanks the key/value colon; put that one back.
    new = old.translate(blank_out).replace('optionalNote" ', 'optionalNote":')
    return text.replace(old, new)


def surveyStringToFrame(subject, string):
    """Parse one survey response string into a wide DataFrame.

    The string is cleaned with ``cleanOptionalNote``, stripped of ``{``, ``}``
    and ``"`` characters, and its ``§`` separators are turned into ``$``. The
    result is read as CSV with ``:`` as field separator and ``$`` as line
    terminator, using the first field as index, and then transposed so the
    keys become columns.

    Parameters
    ----------
    subject
        Not used by this function.
    string : str
        One survey response string.

    Returns
    -------
    pandas.DataFrame
        Transposed parse result (keys as columns).
    """
    cleaned = cleanOptionalNote(string)
    for unwanted in ('{', '}', '"'):
        cleaned = cleaned.replace(unwanted, '')
    cleaned = cleaned.replace('§', '$')

    parsed = pd.read_csv(
        StringIO(cleaned),
        sep=":",
        lineterminator="$",
        header=None,
        index_col=0,
    )
    return parsed.transpose()


def surveyData_thisSubject(data):
    """Combine all survey responses of one subject into a single frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows of one subject with at least the columns ``run_id`` and
        ``responses``. Must contain at least one row.

    Returns
    -------
    pandas.DataFrame
        The frames returned by ``surveyStringToFrame`` for each response,
        concatenated column-wise, plus a ``run_id`` column holding the first
        ``run_id`` found in `data`.
    """
    subject = data['run_id'].unique()[0]

    # Iterate over the values themselves rather than data.loc[i, ...] with a
    # counter, so the function no longer requires a 0..n-1 index on `data`.
    frames = [
        surveyStringToFrame(subject, response)
        for response in data['responses']
    ]

    output = pd.concat(frames, axis=1)
    output['run_id'] = subject

    return output


def surveyData(data):
    """Collect the survey answers of every subject into one frame.

    For each ``run_id`` in `data`, the rows whose ``responses`` value is
    neither missing nor the single character ``"`` are passed to
    ``surveyData_thisSubject``. Subjects without such rows are skipped.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw data with at least the columns ``run_id`` and ``responses``.

    Returns
    -------
    pandas.DataFrame
        All subject frames stacked row-wise. The expected survey columns come
        first (in a fixed order), followed by any additional columns found in
        the responses; the column ``age`` is renamed to ``birthyear``.
    """
    survey_columns = [
        'prolificID', 'age', 'gender', 'ethnic', 'sight',
        'glasses', 'degree', 'eyeshadow', 'masquara', 'eyeliner',
        'browliner', 'vertPosition', 'triedChin', 'keptHead',
        'optionalNote', 'run_id']

    # The response filter does not depend on the subject; compute it once.
    has_response = pd.notna(data["responses"]) & (data["responses"] != '"')

    # Collect the per-subject frames and concatenate once at the end.
    # DataFrame.append was removed in pandas 2.0 and copied the whole
    # accumulated frame on every iteration.
    subject_frames = []
    for subject in tqdm(data['run_id'].unique()):
        df_thisSubject = data.loc[
            has_response & (data['run_id'] == subject),
            ['run_id', 'responses']
        ].reset_index(drop=True)

        if len(df_thisSubject) > 0:
            subject_frames.append(surveyData_thisSubject(df_thisSubject))

    if subject_frames:
        combined = pd.concat(subject_frames)
        # Keep the fixed survey columns first, then any unexpected extras.
        extra_columns = [c for c in combined.columns if c not in survey_columns]
        combined = combined.reindex(columns=survey_columns + extra_columns)
    else:
        combined = pd.DataFrame(columns=survey_columns)

    return combined.rename(columns={'age': 'birthyear'})
    
# Parse the survey answers stored in data_raw['responses'] for every run_id,
# then show the resulting frame as this cell's output.
survey_data = surveyData(data_raw)
survey_data

100%|████████████████████████████████████████████████████████████████████████████████| 316/316 [00:07<00:00, 40.12it/s]


Unnamed: 0,prolificID,birthyear,gender,ethnic,sight,glasses,degree,eyeshadow,masquara,eyeliner,browliner,vertPosition,triedChin,keptHead,optionalNote,run_id
1,5fccc8ac636416a4288a9f3d,1995,male,caucasian,glasses,shortSight,High School,no,no,no,no,a,yes,yes,,1
1,600063f2943eab0acc812ed8,2001,male,caucasian,perfectSight,noCorrection,College / Undergraduate / Bachelor,no,no,no,no,a,,,,103
1,5d485e8415055400194b707f,,,,,,,,,,,,,,,106
1,55b237e6fdf99b19ea79d2f7,,,,,,,,,,,,,,,108
1,5c5684ef9d244c0001b29f1e,1991,male,asian,glasses,shortSight,College / Undergraduate / Bachelor,no,no,no,no,a,yes,yes,,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,5ecfc227f036c902457fc44c,1998,female,black,perfectSight,noCorrection,Graduate / PhD / Master,no,no,no,no,a,yes,yes,,94
1,5eb9940830ab6c098bd1943d,1997,male,caucasian,perfectSight,noCorrection,College / Undergraduate / Bachelor,no,no,no,no,a,yes,yes,,96
1,5f68fe5be5cc370c0a0911de,1988,female,caucasian,perfectSight,noCorrection,,no,no,no,no,b,yes,yes,,97
1,5f779405dacc842fc44bc767,1996,male,caucasian,glasses,longSight,2,no,no,no,no,b,yes,yes,,98


### Optional Notes

In [12]:
# Keep only the subjects that left an optional note.
has_note = survey_data['optionalNote'].notna()
data_optionalNotes = survey_data.loc[has_note, ['run_id', 'optionalNote']]

# Print the first note of each run_id, followed by blank lines.
for subject in data_optionalNotes['run_id'].unique():
    is_subject = data_optionalNotes['run_id'] == subject
    first_note = data_optionalNotes.loc[is_subject, 'optionalNote'].iloc[0]
    print(f'run_id: {subject}')
    print(first_note)
    print('\n')

run_id: 208
no further comments.


run_id: 227
So much white text on a black background made my eyes tired so I'm sorry if that interfered with anything!


run_id: 238
It was hard! Sorry my cat came along. Also  I am being tested for multiple sclerosis right now  but it has not been doing anything to my vision. So hopefully you can still use mine - I haven't been diagnosed with anything yet.


run_id: 256
Of course you can hear my family suddenly and randomly ask me questions in the background but I swear to you I kept my eyes on the center of the monitor and I clicked everything. 


run_id: 266
Na


run_id: 270
I think I managed to stay pretty still  but my heart raced after awhile. 


run_id: 347
I noticed my head had moved from the first picture to the last picture while my head was resting on my hand even though I had tried to hold still


run_id: 361
  


run_id: 380
i am so sorry if you can hear my wife in this and for me getting up quite a bit. i hope i'm still eligible but i'm 

### Add Prolific ID to data_raw

In [13]:
# Attach each subject's prolificID to data_raw via run_id. A 'prolificID' column
# left over from an earlier run of this cell is dropped first, so re-running the
# cell does not produce suffixed duplicate columns.
prolific_ids = survey_data[['run_id', 'prolificID']]
data_raw = pd.merge(
    data_raw.drop(columns='prolificID', errors='ignore'),
    prolific_ids,
    on='run_id',
    how='left',
)

## Binary data

In [14]:
# Recode the answers 'no'/'yes' in 'chinFirst' as 0/1; other values are left unchanged.
data_raw['chinFirst'] = data_raw['chinFirst'].replace({'no': 0, 'yes': 1}) 

## Numerics

In [15]:
# Survey items answered with 'yes'/'no'.
columns = ['eyeshadow', 'masquara', 'eyeliner', 'browliner', 'triedChin', 'keptHead']

# Recode 'no' -> 0 and 'yes' -> 1 column by column; other values stay as they are.
yes_no_codes = {'no': 0, 'yes': 1}
for col in columns:
    survey_data[col] = survey_data[col].replace(yes_no_codes)

# data_trial

In [16]:
# Trial-level columns, grouped by topic.
trial_id_columns = ['run_id', 'prolificID', 'subject', 'chinFirst', 'trial_index']
trial_meta_columns = [
    'trial_type', 'task_nr',
    'rt', 'stimulus', 'key_press',
    'time_elapsed', 'trial_duration', 'recorded_at',
    'window_width', 'window_height', 'success',
]
fixation_columns = ['chin', 'x_pos', 'y_pos']
choice_columns = [
    'choiceTask_amountLeftFirst',
    'option_topLeft', 'option_bottomLeft',
    'option_topRight', 'option_bottomRight',
    'chosenAmount', 'chosenDelay',
]

# All rows of data_raw, restricted to the trial-level columns above.
data_trial = data_raw.loc[
    :,
    trial_id_columns + trial_meta_columns + fixation_columns + choice_columns
]

## Numerics

In [17]:
# Columns of data_trial that should hold numbers.
columns = [
    'run_id', 'subject', 'chinFirst', 'chin', 'task_nr', 'trial_index', # Int
    'key_press',
    'x_pos', 'y_pos', 'time_elapsed', 'trial_duration',
    'rt',
    'window_width', 'window_height',
]

# A lone '"' is replaced by NaN before the column is parsed as numeric.
quote_as_missing = {'"': np.nan}
for col in columns:
    data_trial[col] = pd.to_numeric(data_trial[col].replace(quote_as_missing))

data_trial.dtypes

run_id                          int64
prolificID                     object
subject                         int64
chinFirst                       int64
trial_index                     int64
trial_type                     object
task_nr                       float64
rt                            float64
stimulus                       object
key_press                     float64
time_elapsed                    int64
trial_duration                float64
recorded_at                    object
window_width                  float64
window_height                 float64
success                        object
chin                          float64
x_pos                         float64
y_pos                         float64
choiceTask_amountLeftFirst      int64
option_topLeft                 object
option_bottomLeft              object
option_topRight                object
option_bottomRight             object
chosenAmount                   object
chosenDelay                    object
dtype: object

# data_et

## Compile

In [11]:
def reformatYang(text):
    """Rewrite an eye-tracking string into the x/y/t key format.

    ``$`` becomes ``,``, the keys ``relative-x``, ``relative-y`` and
    ``elapse-time`` are shortened to ``x``, ``y`` and ``t``, and finally the
    first 11 characters and the last character are cut off.
    NOTE(review): presumably the trimmed characters are a wrapper around the
    sample list — confirm against the raw data.
    """
    substitutions = (
        ('$', ','),
        ("relative-x", "x"),
        ("relative-y", 'y'),
        ('elapse-time', 't'),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    return text[11:-1]

#for subject in data_yang['run_id'].unique():
#    for i in data_raw.loc[
#        (data_raw['run_id']==subject) & 
#        (pd.notna(data_raw['et_data'])) &
#        ~(data_raw['et_data'].isin(['"', 'nan'])), 
#       :].index:
#       print('Reformat index: ' + str(i))
#       data_raw.loc[i, 'et_data'] = reformatYang(data_raw.loc[i, 'et_data'])

In [13]:
def text_to_data_frame(text):
    """Parse one eye-tracking string into a DataFrame.

    Parameters
    ----------
    text : str
        JSON list of records in which the commas were replaced by ``$``.

    Returns
    -------
    pandas.DataFrame
        One row per record, one column per key.
    """
    text = text.replace('$', ',')
    # Wrap the string in a file-like object: passing literal JSON text straight
    # to pd.read_json is deprecated in recent pandas versions.
    dataframe = pd.read_json(StringIO(text), orient='records')
    return(dataframe)

def extractEyetrackingData(data):
    """Expand the per-trial eye-tracking strings into one long DataFrame.

    Every row of `data` whose ``et_data`` value, converted to ``str``, is not
    one of ``'"'``, ``'[]'`` or ``'nan'`` is parsed with
    ``text_to_data_frame``. Each parsed frame gets three extra columns:
    ``t_task`` (``t`` minus the ``t`` of the first sample of that row),
    ``run_id`` and ``trial_index`` (both copied from the source row).

    Unlike the earlier implementation, `data` itself is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least the columns ``et_data``, ``run_id`` and
        ``trial_index``.

    Returns
    -------
    pandas.DataFrame
        All samples stacked row-wise with a fresh index; the columns ``x``,
        ``y`` and ``t`` come first. An empty frame with those three columns
        is returned when no row carries eye-tracking data.
    """
    leading_columns = ['x', 'y', 't']

    # Work on a stringified copy of the column so the caller's frame (often a
    # slice of another frame) is left untouched.
    et_text = data['et_data'].apply(str)
    has_samples = ~et_text.isin(['"', '[]', 'nan'])
    selected = data.loc[has_samples, ['run_id', 'trial_index']]

    # Collect the per-trial frames and concatenate once. Appending to a growing
    # frame inside the loop copies all previous rows on every iteration, and
    # DataFrame.append no longer exists in pandas 2.0.
    frames = []
    rows = zip(et_text[has_samples], selected['run_id'], selected['trial_index'])
    for text, run_id, trial_index in tqdm(rows, total=len(selected)):
        df = text_to_data_frame(text)

        df["t_task"] = df["t"] - df.loc[0, "t"]
        df['run_id'] = run_id
        df['trial_index'] = trial_index

        frames.append(df)

    if not frames:
        return pd.DataFrame(columns=leading_columns)

    data_eyetracking = pd.concat(frames, ignore_index=True)
    remaining_columns = [
        c for c in data_eyetracking.columns if c not in leading_columns
    ]
    return data_eyetracking.reindex(columns=leading_columns + remaining_columns)

# Build the long-format eye-tracking table from the rows of data_raw whose run_id
# occurs in data_trial.
# NOTE(review): data_trial is selected from all rows of data_raw, so this filter
# presumably keeps every row — confirm whether a subject filter was intended.
# NOTE(review): the execution counters of this section (In [11]-[14]) are out of
# sequence with the surrounding cells; verify the notebook with Restart & Run All.
data_et = extractEyetrackingData(data_raw.loc[data_raw['run_id'].isin(data_trial['run_id'].unique()), :])

100%|██████████████████████████████████████████████████████████████████████████| 66224/66224 [1:58:45<00:00,  9.29it/s]


In [14]:
# Number of non-missing x and y samples per run and trial.
# Several columns must be selected with a list ([[...]]); selecting with a
# bare tuple (['x', 'y']) was deprecated and is an error in pandas 2.0.
data_et.groupby(['run_id', 'trial_index'])[['x', 'y']].count()

  data_et.groupby(['run_id', 'trial_index'])['x', 'y'].count()


Unnamed: 0_level_0,Unnamed: 1_level_0,x,y
run_id,trial_index,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,18.0,6,6
1.0,20.0,11,11
1.0,22.0,11,11
1.0,24.0,11,11
1.0,26.0,10,10
...,...,...,...
462.0,518.0,44,44
462.0,520.0,13,13
462.0,522.0,44,44
462.0,524.0,13,13


# data_subject

In [18]:
# Subject-level columns: identifiers, browser / platform details and webcam properties.
subject_columns = [
    'run_id', 'chinFirst', 'choiceTask_amountLeftFirst',
    'browser', 'browser_version', 'device',
    'platform', 'platform_version', 'user_agent',
    'webcam_label', 'webcam_fps', 'webcam_height', 'webcam_width',
]

# One row per distinct combination of the subject-level columns.
data_subject = data_raw[subject_columns].drop_duplicates()

## Add survey data

In [19]:
# Left-join the survey answers onto the subject table by run_id.
data_subject = pd.merge(data_subject, survey_data, on='run_id', how='left')
# Rename 'age' to 'birthyear' if such a column is present.
data_subject = data_subject.rename(columns={'age': 'birthyear'})
data_subject.columns

Index(['run_id', 'chinFirst', 'choiceTask_amountLeftFirst', 'browser',
       'browser_version', 'device', 'platform', 'platform_version',
       'user_agent', 'webcam_label', 'webcam_fps', 'webcam_height',
       'webcam_width', 'prolificID', 'birthyear', 'gender', 'ethnic', 'sight',
       'glasses', 'degree', 'eyeshadow', 'masquara', 'eyeliner', 'browliner',
       'vertPosition', 'triedChin', 'keptHead', 'optionalNote'],
      dtype='object')

## Add prolific data

In [20]:
# Prolific participant exports. The paths are relative to the project root that
# the first cell makes the working directory, like the other data paths in this
# notebook, instead of being tied to one user's absolute directory.
data_prolific_int = pd.read_csv('prolific/prolific_export_int.csv') \
    .rename(columns={'participant_id': 'prolificID'})

data_prolific_us = pd.read_csv('prolific/prolific_export_us.csv') \
    .rename(columns={'participant_id': 'prolificID'})

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
data_prolific = pd.concat([data_prolific_int, data_prolific_us])

# Number of approved Prolific submissions vs. distinct prolificIDs in data_subject.
print(len(
    data_prolific.loc[data_prolific['status']=='APPROVED', :]))
print(len(data_subject.prolificID.unique()))

250
278


In [21]:
# Left-join the Prolific export onto the subject table by prolificID.
data_subject = pd.merge(data_subject, data_prolific, on='prolificID', how='left')
# Subjects without a matching Prolific record get the status 'NOTPROLIFIC'.
data_subject = data_subject.assign(
    status=data_subject['status'].fillna('NOTPROLIFIC')
)
data_subject[['run_id', 'prolificID']]

Unnamed: 0,run_id,prolificID
0,1,5fccc8ac636416a4288a9f3d
1,103,600063f2943eab0acc812ed8
2,105,
3,106,5d485e8415055400194b707f
4,108,55b237e6fdf99b19ea79d2f7
...,...,...
311,94,5ecfc227f036c902457fc44c
312,96,5eb9940830ab6c098bd1943d
313,97,5f68fe5be5cc370c0a0911de
314,98,5f779405dacc842fc44bc767


# data_prolific
For participant management

In [22]:
# NOTE(review): data_subject is built without 'chosenAmount' / 'chosenDelay'
# (they are selected into data_trial), so this rename presumably changes
# nothing — confirm where the bonus columns should come from.
bonus_names = {'chosenAmount': 'bonus_USD', 'chosenDelay': 'bonus_delay'}
temp = data_subject.rename(columns=bonus_names)

# Add to the Prolific table every subject-level column it does not have yet,
# joined on prolificID.
new_columns = ['prolificID', *temp.columns.difference(data_prolific.columns)]
data_prolific = pd.merge(
    data_prolific,
    temp.loc[:, new_columns],
    on='prolificID',
    how='left',
)

# Export data

In [24]:
# Create the export folder including any missing parent directory. os.mkdir
# fails when 'data_jupyter' itself does not exist yet; exist_ok makes re-runs safe.
export_dir = 'data_jupyter/raw'
os.makedirs(export_dir, exist_ok=True)

# Write each table as CSV without the index column.
data_et.to_csv(f"{export_dir}/data_et.csv", index=False, header=True)
print('data_et saved!')
data_trial.to_csv(f"{export_dir}/data_trial.csv", index=False, header=True)
print('data_trial saved!')
data_subject.to_csv(f"{export_dir}/data_subject.csv", index=False, header=True)
data_prolific.to_csv(f"{export_dir}/data_prolific.csv", index=False, header=True)

data_et saved!
data_trial saved!


# Feedback

In [25]:
# Final marker: reaching this line means every cell above ran without raising.
print('Success! Script ran through')

Success! Script ran through
