In [1]:
import datetime
from datetime import timedelta  
import matplotlib.pyplot as plt
import numpy as np
import os
import math
import pandas as pd
import sys
from tqdm import tqdm 

    
from IPython.display import HTML
def View(df):
    """
    Build an IPython HTML object that opens a pop-up browser window and
    shows `df` in it as a styled HTML table.

    Parameters
    ----------
    df : object with a ``to_html()`` method (e.g. a pandas DataFrame)
        The table to render.

    Returns
    -------
    IPython.display.HTML
        A ``<script>`` snippet followed by the table CSS. The script opens
        the window and writes the table markup into its body.
    """
    import json  # local import: only needed to build the JavaScript literal

    css = """<style>
    table { border-collapse: collapse; border: 3px solid #eee; }
    table tr th:first-child { background-color: #eeeeee; color: #333; font-weight: bold }
    table thead th { background-color: #eee; color: #000; }
    tr, th, td { border: 1px solid #ccc; border-width: 1px 0 0 1px; border-collapse: collapse;
    padding: 3px; font-family: monospace; font-size: 10px }</style>
    """
    # json.dumps yields a valid JavaScript string literal, so apostrophes,
    # backslashes and newlines in the table can no longer break or corrupt
    # the script. "</" is escaped so the markup cannot close the <script>
    # element early; JavaScript reads "<\/" back as "</".
    table_literal = json.dumps(df.to_html() + css).replace('</', '<\\/')

    s  = '<script type="text/Javascript">'
    s += 'var win = window.open("", "Title", "toolbar=no, location=no, directories=no, status=no, menubar=no, scrollbars=yes, resizable=yes, width=780, height=200, top="+(screen.height-400)+", left="+(screen.width-840));'
    s += 'win.document.body.innerHTML = ' + table_literal + ';'
    s += '</script>'
    return HTML(s + css)
    
# Print the directory the kernel is currently running in.
print("Current Working directory " , os.getcwd())

Current Working directory  C:\Users\User\GitHub\WebET_Analysis\jupyter_notebooks


In [2]:
# Project root; every input file below is resolved relative to it.
# NOTE(review): this is an absolute, machine-specific path — adjust it when
# the notebook is run somewhere else.
root = r'C:/Users/User/GitHub/WebET_Analysis'
data_dir = os.path.join(root, 'data', 'all_trials')

# Prolific ID of the free test participant; removed from the subject-level
# and the Prolific table below.
TEST_PARTICIPANT_ID = '5fccc8ac636416a4288a9f3d'

data_et = pd.read_csv(
    os.path.join(data_dir, 'added_var', 'data_et.csv'))

data_trial = pd.read_csv(
    os.path.join(data_dir, 'added_var', 'data_trial.csv'))

data_subject = pd.read_csv(
    os.path.join(data_dir, 'added_var', 'data_subject.csv'))
data_subject = data_subject \
    .loc[data_subject['prolificID'] != TEST_PARTICIPANT_ID, :]

# Resolved from `root` like the other inputs. The path literal used to be
# repeated here, so changing `root` would not have moved this file.
data_prolific = pd.read_csv(
    os.path.join(data_dir, 'combined', 'data_prolific.csv'))
data_prolific = data_prolific \
    .loc[data_prolific['prolificID'] != TEST_PARTICIPANT_ID, :]

# Overview of the loaded tables: row counts, distinct Prolific IDs, and
# (for the two tables with a `status` column used here) the number of
# approved rows and approved distinct IDs. '-' marks cells that are not
# computed for a table.
approved_ids_prolific = data_prolific.loc[
    data_prolific['status'] == 'APPROVED', 'prolificID']
approved_ids_subject = data_subject.loc[
    data_subject['status'] == 'APPROVED', 'prolificID']

not_computed = '-'

summary = pd.DataFrame({
    'dataset': ['data_et', 'data_trial', 'data_prolific', 'data_subject'],
    'length': [
        len(frame)
        for frame in (data_et, data_trial, data_prolific, data_subject)],
    # dropna=False counts a missing ID as one distinct value, exactly like
    # len(series.unique()).
    'prolific_ids': [not_computed] + [
        frame['prolificID'].nunique(dropna=False)
        for frame in (data_trial, data_prolific, data_subject)],
    'length_approved': [
        not_computed,
        not_computed,
        len(approved_ids_prolific),
        len(approved_ids_subject)],
    'approved_unique': [
        not_computed,
        not_computed,
        approved_ids_prolific.nunique(dropna=False),
        approved_ids_subject.nunique(dropna=False)],
})

print(summary)

         dataset   length prolific_ids length_approved approved_unique
0        data_et  2846488            -               -               -
1     data_trial   140743          292               -               -
2  data_prolific      405          378             281             263
3   data_subject      424          382             281             263


In [4]:
# Distinct values of the prolificID column in data_subject (the saved output
# includes a NaN entry).
# NOTE(review): this cell carries the execution count In [4], which a later
# cell reuses, and it prints every participant ID — consider removing it or
# limiting the output before the notebook is shared.
data_subject['prolificID'].unique()

array(['600063f2943eab0acc812ed8', nan, '5d485e8415055400194b707f',
       '55b237e6fdf99b19ea79d2f7', '5c5684ef9d244c0001b29f1e',
       '5edc20443467e28ec4e30f93', '5b8969006651ea000118e42e',
       '56b7a271e77ebe000bbeff49', '5c0e4e3d87e876000151cfec',
       '5f01a0893963d02ea1d65d12', '5b4e65de09f5af0001080f45',
       '5fea19869bf07d97b4a87ae8', '5e53a7b4d9a7ea3a2b062da0',
       '5ec2cdfd1a17930ddf6f1443', '5e9c8aace6eae81abf309d53',
       '60220f3c276dd72c78daa7bb', '5d754a0ae8351c0001f3ffc8',
       '5de80e0b521f95791f912e49', '5ead6b0e96a62440399d8caf',
       '5ff6841f306c942319b20520', '5f6560fd86ca9f0ac37ca381',
       '5a9c91196219a30001f534a5', '5ff7baf4cbc6b34ca5779e90',
       '5e331de721bbd22ed15f2405', '5f493291e3e64832df458323',
       '5e6002b9e9e2ca34e24deab8', '5d7fd2a5987f70001602d95b',
       '5df967d2f6191f6e87b236e6', '5f1b1f6a908b2609a62ab7ad',
       '5c95970cd676900016e1a940', '5eaca70e6e45b6318e89ef55',
       '5615777c7ffc8a000a811c61', '5dddef56946f7c

In [3]:
def clean_multiple_attempts(data_subject_raw):
    """
    Keep one row per Prolific ID: the attempt with the highest `max_trial`.

    Rows are ordered by `prolificID` and `max_trial`, and only the last row
    of every `prolificID` is kept. A row with a missing `max_trial` is kept
    only when that ID has no row with a recorded value. A before/after
    summary is printed.

    Parameters
    ----------
    data_subject_raw : pandas.DataFrame
        Needs the columns `prolificID` and `max_trial`; may hold several
        rows per `prolificID`. The frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame in which every `prolificID` occurs once.
    """
    # NaN is sorted first so that keep='last' never prefers an attempt
    # without a recorded max_trial over one that has a value. With the
    # default na_position='last' the NaN row would win.
    data_subject = data_subject_raw \
        .sort_values(
            by=['prolificID', 'max_trial'],
            na_position='first') \
        .drop_duplicates(
            subset=['prolificID'],
            keep='last')

    # Before/after comparison, printed for the reader of the notebook.
    summary = pd.DataFrame({
        'datasets': [
            'raw',
            'clean'],
        'runs': [
            len(data_subject_raw),
            len(data_subject)],
        'mean_trial_index': [
            round(np.mean(data_subject_raw['max_trial'])),
            round(np.mean(data_subject['max_trial']))],
        'prolific_ids': [
            len(data_subject_raw['prolificID'].unique()),
            len(data_subject['prolificID'].unique())]
    })

    print(f"""Cleaned multiple attempts: \n{summary}""")

    return data_subject

# Attach the `fps` and `max_trial` columns from data_subject to the Prolific
# table. Copies of these columns left over from an earlier run of this cell
# are dropped first; otherwise a second run would create suffixed duplicates
# (`fps_x`, `max_trial_x`, ...) and the clean-up below could no longer find
# `max_trial`.
data_prolific = data_prolific \
    .drop(columns=['fps', 'max_trial'], errors='ignore') \
    .merge(
        data_subject.loc[:, ['prolificID', 'fps', 'max_trial']],
        on='prolificID',
        how='left'
    )

# The merge yields one row per matching data_subject row; reduce to one row
# per Prolific ID again.
data_prolific = clean_multiple_attempts(data_prolific)

Cleaned multiple attempts: 
  datasets  runs  mean_trial_index  prolific_ids
0      raw   473               390           378
1    clean   378               483           378


In [4]:
def find_prolific_id_in_raw(this_id, path=None):
    """
    Print the name of every raw subject file whose text contains `this_id`.

    Parameters
    ----------
    this_id : str
        Text to look for, e.g. a Prolific ID.
    path : str, optional
        Directory to search. When omitted, the notebook-level `root` is used:
        ``<root>/data/all_trials/cognition_run``.

    Returns
    -------
    None
        Matches are reported with ``print`` only.
    """
    if path is None:
        path = os.path.join(root, 'data', 'all_trials', 'cognition_run')

    # Sorted so the report order does not depend on the file system.
    for file_name in sorted(os.listdir(path)):
        file_path = os.path.join(path, file_name)

        # Sub-directories cannot be read as text; skip them instead of failing.
        if not os.path.isfile(file_path):
            continue

        # The context manager closes every file right after reading (the
        # handles used to stay open). Undecodable bytes are replaced so a
        # single odd file does not abort the whole search.
        with open(file_path, encoding='utf-8', errors='replace') as subject_file:
            this_subject_txt = subject_file.read()

        if this_id in this_subject_txt:
            print(f'ID {this_id} is in {file_name} \n')
            
# Search the raw subject files for one specific Prolific ID.
find_prolific_id_in_raw('5ee2916b70aa643be19c0036')

# Approve subjects

## Check choice and eye-tracking data

In [5]:
# 1 for trials whose key_press equals 38, otherwise 0.
data_trial['choseTop'] = data_trial['key_press'].eq(38).astype(int)

# Trial-level variables that are copied onto every gaze sample of the same
# run and trial.
trial_columns = [
    'run_id', 'chinFirst', 'trial_index', 'choseTop',
    'trial_type', 'task_nr', 'fixTask', 'trial_duration_exact']

data_et = data_et.merge(
    data_trial.loc[:, trial_columns],
    on=['run_id', 'trial_index'],
    how='left')

def summarise_gaze(samples, label, **extra_aggs):
    """
    Per-run summary of gaze samples.

    For every `run_id` the number of `x` values and the mean and standard
    deviation of `x` and `y` are computed. `label` becomes part of the
    output column names (`n_<label>`, `x_<label>`, `x_<label>_std`,
    `y_<label>`, `y_<label>_std`).

    Parameters
    ----------
    samples : pandas.DataFrame
        Needs the columns `run_id`, `x`, `y` and whatever `extra_aggs` uses.
    label : str
        Suffix used in the output column names.
    **extra_aggs
        Additional named aggregations, appended after the standard columns.

    Returns
    -------
    pandas.DataFrame
        One row per `run_id`, with `run_id` as a regular column.
    """
    return samples \
        .groupby(['run_id'], as_index=False) \
        .agg(**{
            f'n_{label}': ('x', 'count'),
            f'x_{label}': ('x', 'mean'),
            f'x_{label}_std': ('x', 'std'),
            f'y_{label}': ('y', 'mean'),
            f'y_{label}_std': ('y', 'std'),
            **extra_aggs,
        })


# Samples recorded in trials flagged as fixation task.
is_fix_sample = data_et['fixTask'] == 1
grouped_fix = summarise_gaze(data_et.loc[is_fix_sample, :], 'fix')

# Samples recorded in choice trials, plus the mean of choseTop and of the
# trial duration.
is_choice_sample = data_et['trial_type'] == 'eyetracking-choice'
grouped_choice = summarise_gaze(
    data_et.loc[is_choice_sample, :],
    'choice',
    choseTop=('choseTop', 'mean'),
    choice_rt=('trial_duration_exact', 'mean'))

# Add the per-run fixation and choice summaries to the Prolific table. Both
# are left joins, so rows without a matching run keep NaN in the new columns.
data_prolific = data_prolific.merge(grouped_fix, on='run_id', how='left')
data_prolific = data_prolific.merge(grouped_choice, on='run_id', how='left')

# Columns shown for the visual check.
gaze_check_columns = [
    'run_id', 'prolificID',
    'n_fix', 'x_fix', 'x_fix_std', 'y_fix', 'y_fix_std',
    'n_choice', 'x_choice', 'x_choice_std', 'y_choice', 'y_choice_std',
    'choseTop', 'choice_rt',
]
data_prolific.loc[:, gaze_check_columns]

Unnamed: 0,run_id,prolificID,n_fix,x_fix,x_fix_std,y_fix,y_fix_std,n_choice,x_choice,x_choice_std,y_choice,y_choice_std,choseTop,choice_rt
0,473.0,5484655ffdf99b07b28f22cc,1394.0,0.619090,0.178720,0.388704,0.214231,2543.0,0.688555,0.175613,0.493627,0.201312,0.519858,2193.444357
1,339.0,55a83419fdf99b055f579192,1065.0,0.547281,0.174586,0.521261,0.104629,1386.0,0.558659,0.165372,0.526529,0.061349,0.479798,1556.276335
2,108.0,55b237e6fdf99b19ea79d2f7,,,,,,,,,,,,
3,,55b660dffdf99b1c029069a2,,,,,,,,,,,,
4,309.0,55cb960cfdf99b45e4bb11e0,1168.0,0.469950,0.210170,0.491282,0.220019,2864.0,0.405567,0.121336,0.522569,0.178694,0.493715,2571.211941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,422.0,6021a1418a2f591e46a92aea,1061.0,0.524749,0.250502,0.460514,0.247254,3393.0,0.482432,0.198995,0.621111,0.171880,0.519010,3300.275862
374,146.0,60220f3c276dd72c78daa7bb,1385.0,0.503798,0.214921,0.514055,0.162583,3461.0,0.554381,0.310359,0.546675,0.175166,0.495810,2439.946258
375,,6023e203579351068c9750f1,,,,,,,,,,,,
376,253.0,602407b2c3177c0365d16e07,742.0,0.500257,0.228849,0.508769,0.210017,1444.0,0.490183,0.119534,0.566813,0.125421,0.472992,1672.663435


## These subjects await review

In [23]:
# Submissions whose status is 'AWAITING REVIEW', ordered by start time.
review_columns = [
    'run_id', 'prolificID',
    'started_datetime', 'completed_date_time', 'max_trial',
    'n_fix', 'x_fix', 'x_fix_std', 'y_fix', 'y_fix_std',
    'n_choice', 'x_choice', 'x_choice_std', 'y_choice', 'y_choice_std',
    'choseTop', 'choice_rt',
]
is_awaiting_review = data_prolific['status'] == 'AWAITING REVIEW'
awaiting_review = data_prolific \
    .loc[is_awaiting_review, review_columns] \
    .sort_values(by='started_datetime')

if awaiting_review.empty:
    print('No subject awaits review')
else:
    # Comma-separated ID list in addition to the count.
    print(
        f"""Len: {len(awaiting_review)} \n"""
        f"""IDs: \n"""
        f"""{','.join(awaiting_review['prolificID'])}""")

awaiting_review

No subject awaits review


Unnamed: 0,run_id,prolificID,started_datetime,completed_date_time,max_trial,n_fix,x_fix,x_fix_std,y_fix,y_fix_std,n_choice,x_choice,x_choice_std,y_choice,y_choice_std,choseTop,choice_rt


# Costs

In [8]:
# Payment table: approved submissions only, reduced to the columns needed
# for the cost calculations.
pay_columns = [
    'run_id', 'prolificID', 'max_trial',
    'status', 'reviewed_at_datetime', 'entered_code',
    'session_id', 'started_datetime', 'completed_date_time', 'time_taken',
    'bonus_USD', 'bonus_delay'
]

# .copy() makes data_pay an independent frame. The following cells add and
# overwrite columns on it, which on a bare .loc selection triggers pandas'
# SettingWithCopyWarning.
data_pay = data_prolific \
    .loc[data_prolific['status'] == 'APPROVED', pay_columns] \
    .copy()

print(len(data_pay))
print(f"""unique: {len(data_pay['prolificID'].unique())}""")

263
unique: 263


# Bonus payment

Reformat payments

In [9]:
# Delay labels and the number they are replaced with.
bonus_delay_days = {
    'Today': 0,
    'Tomorrow': 1,
    '7 days': 7,
    '15 days': 15,
    '30 days': 30,
    '90 days': 90,
    '180 days': 180,
}

# astype(str) turns missing values into the text 'nan', which the final
# astype(float) converts back to NaN.
data_pay['bonus_delay'] = data_pay['bonus_delay'] \
    .astype(str) \
    .replace(bonus_delay_days) \
    .astype(float)

# Remove the dollar sign, replace the one non-numeric label, convert to float.
# The pattern is a raw string: '\$' in a normal string literal is an invalid
# escape sequence and makes Python emit a warning.
data_pay['bonus_USD'] = data_pay['bonus_USD'] \
    .astype(str) \
    .replace({r'\$': ''}, regex=True) \
    .replace('50 cent', 0.5) \
    .astype(float)

data_pay.loc[:, ['bonus_USD', 'bonus_delay']].head(5)

Unnamed: 0,bonus_USD,bonus_delay
0,5.0,15.0
1,4.0,15.0
4,4.0,7.0
7,4.0,30.0
9,3.5,0.0


## Missing values

In [10]:
# Rows of the payment table without a bonus amount.
missing_bonus_columns = [
    'run_id', 'prolificID', 'status', 'started_datetime', 'completed_date_time',
    'max_trial', 'bonus_USD', 'bonus_delay',
]
data_pay.loc[data_pay['bonus_USD'].isna(), missing_bonus_columns]

Unnamed: 0,run_id,prolificID,status,started_datetime,completed_date_time,max_trial,bonus_USD,bonus_delay
26,12.0,5b8969006651ea000118e42e,APPROVED,2021-01-18 08:33:12.885000,2021-01-18 08:56:36.517000,518.0,,
155,,5ec5a64c306f255ec98d5cc1,APPROVED,2021-02-13 21:00:03.633000,2021-02-13 21:23:18.932000,,,
170,,5ee2916b70aa643be19c0036,APPROVED,2021-03-18 11:20:00.367000,2021-03-18 11:50:04.312000,,,
222,392.0,5f4fe72e9468441227166179,APPROVED,2021-02-13 02:53:03.452000,2021-02-13 03:14:15.378000,232.0,,
280,61.0,5fb2af792942a58ffe303948,APPROVED,2021-01-19 09:09:32.561000,2021-01-19 09:37:01.137000,271.0,,
315,9.0,5fea6632bf9ae4a79153efdf,APPROVED,2021-01-18 08:30:53.598000,2021-01-18 09:14:43.719000,518.0,,
363,273.0,60186dc2cc1aa8103499603a,APPROVED,2021-02-12 22:28:51.483000,2021-02-12 22:52:45.588000,4.0,,


In [11]:
# These IDs get a bonus_USD of 5 and a bonus_delay of 1.
default_bonus_ids = [
    '5fea6632bf9ae4a79153efdf',
    '5b8969006651ea000118e42e',
    '5fb2af792942a58ffe303948',
    '5f4fe72e9468441227166179',
    '60186dc2cc1aa8103499603a',
    '5ec5a64c306f255ec98d5cc1',
]
gets_default_bonus = data_pay['prolificID'].isin(default_bonus_ids)
data_pay.loc[gets_default_bonus, ['bonus_USD', 'bonus_delay']] = [5, 1]

# Rows that still lack an amount or a delay after the assignment above.
still_missing = data_pay['bonus_USD'].isna() | data_pay['bonus_delay'].isna()
summary = data_pay.loc[
    still_missing,
    [
        'run_id', 'prolificID', 'status', 'started_datetime', 'completed_date_time',
        'max_trial', 'bonus_USD', 'bonus_delay',
    ]
]

if summary.empty:
    print('No more participants with missing bonus payment information')
else:
    print(summary)

     run_id                prolificID    status            started_datetime  \
170     NaN  5ee2916b70aa643be19c0036  APPROVED  2021-03-18 11:20:00.367000   

            completed_date_time  max_trial  bonus_USD  bonus_delay  
170  2021-03-18 11:50:04.312000        NaN        NaN          NaN  


In [12]:
# Set bonus amount and bonus delay to 0 for this one participant.
data_pay.loc[
    data_pay['prolificID']=='5ee2916b70aa643be19c0036', 
    ['bonus_USD', 'bonus_delay']] = [0, 0]

## Bonus in other currencies

In [13]:
# Fixed conversion factors: USD -> GBP with 0.75, then GBP -> EUR with 1.13.
# NOTE(review): the "Total costs" cells convert GBP to EUR with 1.14 instead
# of 1.13 — confirm which rate is intended.
data_pay['bonus_GBP'] = data_pay['bonus_USD'].mul(0.75)
data_pay['bonus_EUR'] = data_pay['bonus_GBP'].mul(1.13)

## When is the bonus due?

In [14]:
# Completion timestamp of run 444 is set by hand.
# NOTE(review): presumably the stored value could not be parsed with the
# format below — confirm.
data_pay.loc[data_pay['run_id'] == 444, 'completed_date_time'] = \
    '2021-02-13 21:52:30.000000'

# Calendar date of completion, parsed from the timestamp text.
timestamp_format = '%Y-%m-%d %H:%M:%S.%f'
data_pay['completed_date'] = data_pay['completed_date_time'].map(
    lambda stamp: datetime.datetime.strptime(stamp, timestamp_format).date())

# Due date = completion date + bonus_delay, interpreted as days.
delay_as_timedelta = data_pay['bonus_delay'].map(datetime.timedelta)
data_pay['due_on'] = data_pay['completed_date'] + delay_as_timedelta

print(len(data_pay))

263




In [15]:
# Payment schedule, sorted by due date.
due_columns = [
    'prolificID', 'run_id',
    'bonus_USD', 'bonus_GBP', 'bonus_EUR',
    'completed_date', 'bonus_delay', 'due_on',
]
data_due = data_pay.loc[:, due_columns].sort_values(by='due_on')
data_due

Unnamed: 0,prolificID,run_id,bonus_USD,bonus_GBP,bonus_EUR,completed_date,bonus_delay,due_on
187,5f0cb319d29147695796a208,37.0,3.0,2.250,2.54250,2021-01-18,0.0,2021-01-18
174,5eeaa0ffaa6af11cf32ce057,30.0,3.0,2.250,2.54250,2021-01-18,0.0,2021-01-18
42,5c5684ef9d244c0001b29f1e,11.0,3.5,2.625,2.96625,2021-01-18,0.0,2021-01-18
254,5f8472685956c40c720d0936,36.0,0.5,0.375,0.42375,2021-01-18,0.0,2021-01-18
232,5f561a95aa1c4ea13672f138,28.0,2.5,1.875,2.11875,2021-01-18,0.0,2021-01-18
...,...,...,...,...,...,...,...,...
62,5d36600685d1d50001affacb,446.0,5.0,3.750,4.23750,2021-02-13,180.0,2021-08-12
366,6019f33f17fdc21e027f72aa,328.0,5.0,3.750,4.23750,2021-02-13,180.0,2021-08-12
43,5c5da579bfe5280001448433,340.0,4.0,3.000,3.39000,2021-02-13,180.0,2021-08-12
223,5f50468c0868af1baaec306c,491.0,4.0,3.000,3.39000,2021-03-18,180.0,2021-09-14


## Total bonus payment costs

In [16]:
# The factor 4/3 adds one third on top of the summed bonuses (the total-cost
# table further down labels this share 'service_fee').
bonus_total_with_fee = (4/3) * sum(data_pay['bonus_GBP'])

print(
    f"""Total: n={len(data_pay)} participants, """
    f"""{bonus_total_with_fee} GBP \n"""
)

Total: n=263 participants, 1049.5 GBP 



## Bonus left to pay

In [26]:
# Bonuses whose due date lies after today's date.
today = datetime.datetime.now().date()
is_due_later = data_pay['due_on'] > today
bonuses_future = data_pay.loc[
    is_due_later,
    ['prolificID', 'completed_date', 'bonus_GBP', 'due_on']]

# Same 4/3 factor as for the overall bonus total.
total_bonus = (4/3) * sum(bonuses_future['bonus_GBP'])

print(
    f"""Bonuses due in the future: \n"""
    f"""Total: n={len(bonuses_future)} participants, """
    f"""{total_bonus} GBP, incl. fees\n\n"""
    f"""{bonuses_future}"""
)

Bonuses due in the future: 
Total: n=70 participants, 319.0 GBP, incl. fees

                   prolificID completed_date  bonus_GBP      due_on
0    5484655ffdf99b07b28f22cc     2021-03-18      3.750  2021-04-02
18   59c0a297c458e800017b5499     2021-02-12      3.375  2021-05-13
25   5b6a87d2cda8590001db8e07     2021-02-12      3.000  2021-08-11
31   5bf3761862e1bc0001f15cb2     2021-02-11      3.000  2021-08-10
32   5c0e4e3d87e876000151cfec     2021-01-19      3.375  2021-04-19
..                        ...            ...        ...         ...
362  601836ba0681ff045e6d6940     2021-02-13      3.750  2021-05-14
366  6019f33f17fdc21e027f72aa     2021-02-13      3.750  2021-08-12
371  601dfce11ffd11245d1bf9de     2021-02-11      3.000  2021-05-12
373  6021a1418a2f591e46a92aea     2021-02-13      3.750  2021-05-14
374  60220f3c276dd72c78daa7bb     2021-02-11      3.750  2021-08-10

[70 rows x 4 columns]


# Total costs

## Basic payment

In [24]:
# Same basic payment for every row of the payment table: 2.25 (GBP).
data_pay['basic_GBP'] = 2.25

## Total

In [19]:
# Cost overview for all rows of the payment table.
n = len(data_pay)
basic_total = data_pay['basic_GBP'].sum()
bonus_total = data_pay['bonus_GBP'].sum()
total_GBP = bonus_total + basic_total

# 4/3 adds the one-third fee; 1.14 converts GBP to EUR.
total_euro = (4/3) * total_GBP * 1.14

# NOTE(review): 2101 is presumably the available budget in EUR — confirm.
remaining_funds = 2101 - total_euro

summary = pd.DataFrame({
    'n': [n],
    'basic_average': [data_pay['basic_GBP'].mean()],
    'basic_total': [basic_total],
    'bonus_average': [data_pay['bonus_GBP'].mean()],
    'bonus_total': [bonus_total],
    'service_fee': [(1/3) * total_GBP],
    'total': [(4/3) * total_GBP],
    'total € (incl. fee)': [total_euro],
}).T

print(round(summary, 2))
print(f"""\n{round(remaining_funds, 2)}€ remaining funds""")

                           0
n                     263.00
basic_average           2.25
basic_total           591.75
bonus_average           2.99
bonus_total           787.12
service_fee           459.62
total                1838.50
total € (incl. fee)  2095.89

5.11€ remaining funds


# Prognosis for full budget

In [20]:
# Cost of one additional participant, based on the average basic payment and
# the average bonus so far.
# NOTE(review): this cell overwrites n, total_GBP and total_euro from the
# total-cost cell with per-participant values.
n=1
total_GBP = data_pay['bonus_GBP'].mean() + data_pay['basic_GBP'].mean()
total_euro = (4/3) * total_GBP * 1.14

summary = pd.DataFrame({
    'n': [n],
    'basic_average': [data_pay['basic_GBP'].mean()],
    'bonus_average': [data_pay['bonus_GBP'].mean()],
    'service_fee': [(1/3) * total_GBP],
    'total': [(4/3) * total_GBP],
    'total € (incl. fee)': [total_euro]
}).T

# `remaining_funds` is computed in the total-cost cell. When the budget is
# already overspent the quotient is negative; report 0 instead of a negative
# number of participants.
participants_possible = max(0, math.floor(remaining_funds / total_euro))

print(
    f"""{round(summary, 2)}\n \n"""
    f"""{participants_possible} more participants possible""")

                        0
n                    1.00
basic_average        2.25
bonus_average        2.99
service_fee          1.75
total                6.99
total € (incl. fee)  7.97
 
0 more participants possible


# Feedback

In [21]:
# Printed when execution reaches this last cell.
print('Success! Script ran through.')

Success! Script ran through.
