In [175]:
import datetime
from datetime import timedelta  
import matplotlib.pyplot as plt
import numpy as np
import os
import math
import pandas as pd
import sys
from tqdm import tqdm 

    
from IPython.display import HTML
def View(df):
    css = """<style>
    table { border-collapse: collapse; border: 3px solid #eee; }
    table tr th:first-child { background-color: #eeeeee; color: #333; font-weight: bold }
    table thead th { background-color: #eee; color: #000; }
    tr, th, td { border: 1px solid #ccc; border-width: 1px 0 0 1px; border-collapse: collapse;
    padding: 3px; font-family: monospace; font-size: 10px }</style>
    """
    s  = '<script type="text/Javascript">'
    s += 'var win = window.open("", "Title", "toolbar=no, location=no, directories=no, status=no, menubar=no, scrollbars=yes, resizable=yes, width=780, height=200, top="+(screen.height-400)+", left="+(screen.width-840));'
    s += 'win.document.body.innerHTML = \'' + (df.to_html() + css).replace("\n",'\\') + '\';'
    s += '</script>'
    return(HTML(s+css))    
    
print("Current Working directory " , os.getcwd())

Current Working directory  C:\Users\User\GitHub\WebET_Analysis\jupyter_notebooks


In [133]:
root = r'C:/Users/User/GitHub/WebET_Analysis'
data_et = pd.read_csv(
    os.path.join(root, 'data', 'all_trials', 'added_var', 'data_et.csv'))

data_trial = pd.read_csv(
    os.path.join(root, 'data', 'all_trials', 'added_var', 'data_trial.csv'))

data_subject = pd.read_csv(
    os.path.join(root, 'data', 'all_trials', 'added_var', 'data_subject.csv'))
data_subject = data_subject \
    .loc[data_subject['prolificID']!='5fccc8ac636416a4288a9f3d', :] # Free test participant

data_prolific = pd.read_csv(
    os.path.join(r'C:/Users/User/GitHub/WebET_Analysis', 'data', 'all_trials', 
                 'combined', 'data_prolific.csv')) 

data_prolific = data_prolific \
    .loc[data_prolific['prolificID']!='5fccc8ac636416a4288a9f3d', :] # Free test participant

summary = pd.DataFrame({
    'dataset': [
        'data_et', 
        'data_trial',
        'data_prolific',
        'data_subject'],
    'length': [
        len(data_et),
        len(data_trial),
        len(data_prolific),
        len(data_subject)],
    'prolific_ids': [
        '-',
        len(data_trial['prolificID'].unique()),
        len(data_prolific['prolificID'].unique()),
        len(data_subject['prolificID'].unique())],
    'length_approved': [
        '-',
        '-',
        len(data_prolific.loc[data_prolific['status']=='APPROVED', 'prolificID']),
        len(data_subject.loc[data_subject['status']=='APPROVED', 'prolificID'])],
    'approved_unique': [
        '-',
        '-',
        len(data_prolific.loc[data_prolific['status']=='APPROVED', 'prolificID'].unique()),
        len(data_subject.loc[data_subject['status']=='APPROVED', 'prolificID'].unique())],
})
        
print(summary)

         dataset   length prolific_ids length_approved approved_unique
0        data_et  2712806            -               -               -
1     data_trial   133740          278               -               -
2  data_prolific      381          356             267             250
3   data_subject      398          359             267             250


In [134]:
def clean_multiple_attempts(data_subject_raw):
    
    data_subject = data_subject_raw \
        .sort_values(by=['prolificID', 'max_trial']) \
        .drop_duplicates(
            subset=['prolificID'],
            keep='last')

    summary = pd.DataFrame({
        'datasets': [
            'raw', 
            'clean'],
        'runs': [
            len(data_subject_raw),
            len(data_subject)],
        'mean_trial_index': [
            round(np.mean(data_subject_raw['max_trial'])),
            round(np.mean(data_subject['max_trial']))],
        'prolific_ids': [
            len(data_subject_raw['prolificID'].unique()),
            len(data_subject['prolificID'].unique())]
    })
    
    print(f"""Cleaned multiple attempts: \n{summary}""")
    
    return data_subject

data_prolific = data_prolific.merge(
    data_subject.loc[:, ['prolificID', 'fps', 'max_trial']], 
    on='prolificID',
    how='left'
)

data_prolific = clean_multiple_attempts(data_prolific)

Cleaned multiple attempts: 
  datasets  runs  mean_trial_index  prolific_ids
0      raw   445               391           356
1    clean   356               482           356


# Approve subjects

## Check choice and eye-tracking data

In [135]:
data_trial['choseTop'] = 0    
data_trial['choseTop'] = (data_trial['key_press']==38).astype(int)

data_et = data_et \
    .merge(
        data_trial.loc[
            :, [
                    'run_id', 'chinFirst', 'trial_index', 'choseTop',
                    'trial_type', 'task_nr', 'fixTask', 'trial_duration_exact']
               ], 
        on=['run_id', 'trial_index'], 
        how='left')

grouped_fix = data_et \
    .loc[data_et['fixTask']==1, :] \
    .groupby(
        ['run_id'],
        as_index=False).agg(
            n_fix=('x', 'count'),
            x_fix=('x', 'mean'),
            x_fix_std=('x', 'std'),
            y_fix=('y', 'mean'),
            y_fix_std=('y', 'std'),    
    )
    
grouped_choice = data_et \
    .loc[data_et['trial_type']=='eyetracking-choice', :] \
    .groupby(
        ['run_id'],
        as_index=False).agg(
            n_choice=('x', 'count'),
            x_choice=('x', 'mean'),
            x_choice_std=('x', 'std'),
            y_choice=('y', 'mean'),
            y_choice_std=('y', 'std'),   
            choseTop=('choseTop', 'mean'),
            choice_rt =('trial_duration_exact', 'mean'))    

data_prolific = data_prolific \
    .merge(grouped_fix, on='run_id', how='left') \
    .merge(grouped_choice, on='run_id', how='left')

data_prolific.loc[
    :, 
    [
        'run_id', 'prolificID', 
        'n_fix', 'x_fix', 'x_fix_std', 'y_fix', 'y_fix_std',
        'n_choice', 'x_choice', 'x_choice_std', 'y_choice', 'y_choice_std',
        'choseTop', 'choice_rt'
        
    ]
]

Unnamed: 0,run_id,prolificID,n_fix,x_fix,x_fix_std,y_fix,y_fix_std,n_choice,x_choice,x_choice_std,y_choice,y_choice_std,choseTop,choice_rt
0,339.0,55a83419fdf99b055f579192,1065.0,0.547281,0.174586,0.521261,0.104629,1386.0,0.558659,0.165372,0.526529,0.061349,0.479798,1556.276335
1,108.0,55b237e6fdf99b19ea79d2f7,,,,,,,,,,,,
2,,55b660dffdf99b1c029069a2,,,,,,,,,,,,
3,309.0,55cb960cfdf99b45e4bb11e0,1168.0,0.469950,0.210170,0.491282,0.220019,2864.0,0.405567,0.121336,0.522569,0.178694,0.493715,2571.211941
4,172.0,5615777c7ffc8a000a811c61,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351,422.0,6021a1418a2f591e46a92aea,1061.0,0.524749,0.250502,0.460514,0.247254,3393.0,0.482432,0.198995,0.621111,0.171880,0.519010,3300.275862
352,146.0,60220f3c276dd72c78daa7bb,1385.0,0.503798,0.214921,0.514055,0.162583,3461.0,0.554381,0.310359,0.546675,0.175166,0.495810,2439.946258
353,,6023e203579351068c9750f1,,,,,,,,,,,,
354,253.0,602407b2c3177c0365d16e07,742.0,0.500257,0.228849,0.508769,0.210017,1444.0,0.490183,0.119534,0.566813,0.125421,0.472992,1672.663435


## These subjects await review

In [124]:
awaiting_review = data_prolific.loc[
    data_prolific['status']=='AWAITING REVIEW', 
    [
        'run_id', 'prolificID', 
        'started_datetime', 'time_taken', 'max_trial',
        'n_fix', 'x_fix', 'x_fix_std', 'y_fix', 'y_fix_std',
        'n_choice', 'x_choice', 'x_choice_std', 'y_choice', 'y_choice_std',
        'choseTop', 'choice_rt'
    ]
].sort_values(by='started_datetime')
awaiting_review

Unnamed: 0,run_id,prolificID,started_datetime,time_taken,max_trial,n_fix,x_fix,x_fix_std,y_fix,y_fix_std,n_choice,x_choice,x_choice_std,y_choice,y_choice_std,choseTop,choice_rt


# Costs

In [136]:
data_pay = data_prolific.loc[
    data_prolific['status']=='APPROVED', 
    [
        'run_id', 'prolificID', 'max_trial', 
        'status', 'reviewed_at_datetime', 'entered_code',
        'session_id', 'started_datetime', 'completed_date_time', 'time_taken',
        'bonus_USD', 'bonus_delay'
    ]
]
print(len(data_pay))
print(f"""unique: {len(data_pay['prolificID'].unique())}""")

250
unique: 250


# Bonus payment

Reformat payments

In [137]:
data_pay['bonus_delay'] = data_pay['bonus_delay'].astype(str)
data_pay['bonus_delay'] = data_pay['bonus_delay'] \
    .replace(['Today', 'Tomorrow', '7 days', 
          '15 days', '30 days', '90 days', 
          '180 days'], 
         [0, 1, 7, 15, 30, 90, 180]) \
    .astype(float)

data_pay['bonus_USD'] = data_pay['bonus_USD'].astype(str)
data_pay['bonus_USD'] = data_pay['bonus_USD'] \
    .replace({'\$':''}, regex = True) \
    .replace('50 cent', 0.5) \
    .astype(float)

data_pay.loc[:, ['bonus_USD', 'bonus_delay']].head(5)

Unnamed: 0,bonus_USD,bonus_delay
0,4.0,15.0
3,4.0,7.0
6,4.0,30.0
8,3.5,0.0
9,5.0,30.0


## Missing values

In [138]:
data_pay.loc[
    pd.isna(data_pay['bonus_USD']), 
    [
        'run_id', 'prolificID', 'status', 'started_datetime', 'completed_date_time', 
        'max_trial', 'bonus_USD', 'bonus_delay']
]

Unnamed: 0,run_id,prolificID,status,started_datetime,completed_date_time,max_trial,bonus_USD,bonus_delay
24,12.0,5b8969006651ea000118e42e,APPROVED,2021-01-18 08:33:12.885000,2021-01-18 08:56:36.517000,518.0,,
145,,5ec5a64c306f255ec98d5cc1,APPROVED,2021-02-13 21:00:03.633000,2021-02-13 21:23:18.932000,,,
209,392.0,5f4fe72e9468441227166179,APPROVED,2021-02-13 02:53:03.452000,2021-02-13 03:14:15.378000,232.0,,
262,61.0,5fb2af792942a58ffe303948,APPROVED,2021-01-19 09:09:32.561000,2021-01-19 09:37:01.137000,271.0,,
297,9.0,5fea6632bf9ae4a79153efdf,APPROVED,2021-01-18 08:30:53.598000,2021-01-18 09:14:43.719000,518.0,,
341,273.0,60186dc2cc1aa8103499603a,APPROVED,2021-02-12 22:28:51.483000,2021-02-12 22:52:45.588000,4.0,,


In [142]:
data_pay.loc[
    data_pay['prolificID'].isin([
        '5fea6632bf9ae4a79153efdf',
        '5b8969006651ea000118e42e',
        '5fb2af792942a58ffe303948',
        '5f4fe72e9468441227166179',
        '60186dc2cc1aa8103499603a',
        '5ec5a64c306f255ec98d5cc1',
    ]), 
    ['bonus_USD', 'bonus_delay']] = [5, 1]

summary = data_pay.loc[
    pd.isna(data_pay['bonus_USD']) |
    pd.isna(data_pay['bonus_delay']), 
    [
        'run_id', 'prolificID', 'status', 'started_datetime', 'completed_date_time', 
        'max_trial', 'bonus_USD', 'bonus_delay']
]

if len(summary)<1:
    print('No more participants with missing bonus payment information')

No more participants with missing bonus payment information


## Bonus in other currencies

In [143]:
data_pay['bonus_GBP'] = data_pay['bonus_USD'] * 0.75
data_pay['bonus_EUR'] = data_pay['bonus_GBP'] * 1.13

## When is the bonus due?

In [144]:
data_pay.loc[data_pay['run_id']==444, 'completed_date_time'] = \
    '2021-02-13 21:52:30.000000'
    
data_pay['completed_date']=data_pay.apply(
    lambda x: datetime.datetime.strptime(
        x['completed_date_time'], '%Y-%m-%d %H:%M:%S.%f') \
            .date(),
    axis=1)

data_pay['due_on'] = data_pay['completed_date'] + \
    data_pay['bonus_delay'].map(datetime.timedelta) 

print(len(data_pay))

250




In [147]:
data_due = data_pay.loc[:, 
               [
                   'prolificID', 'run_id',
                   'bonus_USD', 'bonus_GBP', 'bonus_EUR', 
                   'completed_date', 'bonus_delay', 'due_on'
               ]
              ]. sort_values(by='due_on')
data_due

Unnamed: 0,prolificID,run_id,bonus_USD,bonus_GBP,bonus_EUR,completed_date,bonus_delay,due_on
218,5f561a95aa1c4ea13672f138,28.0,2.5,1.875,2.11875,2021-01-18,0.0,2021-01-18
175,5f0cb319d29147695796a208,37.0,3.0,2.250,2.54250,2021-01-18,0.0,2021-01-18
40,5c5684ef9d244c0001b29f1e,11.0,3.5,2.625,2.96625,2021-01-18,0.0,2021-01-18
162,5eeaa0ffaa6af11cf32ce057,30.0,3.0,2.250,2.54250,2021-01-18,0.0,2021-01-18
229,5f77a902fc647a327b77004a,24.0,3.0,2.250,2.54250,2021-01-18,0.0,2021-01-18
...,...,...,...,...,...,...,...,...
323,600612a62facd195d940e24d,389.0,4.5,3.375,3.81375,2021-02-13,180.0,2021-08-12
73,5d8270af4104eb001941224c,377.0,4.0,3.000,3.39000,2021-02-13,180.0,2021-08-12
90,5de40feb61872d000d8803ff,403.0,4.5,3.375,3.81375,2021-02-13,180.0,2021-08-12
202,5f489766dbc4a2364c0f4f93,405.0,5.0,3.750,4.23750,2021-02-13,180.0,2021-08-12


## Total bonus payment costs

In [158]:
print(
    f"""Total: n={len(data_pay)} participants, """
    f"""{(4/3) * sum(data_pay['bonus_GBP'])} GBP \n"""
)

Total: n=250 participants, 999.5 GBP 



## Bonus left to pay

In [157]:
bonuses_future = data_pay.loc[
    data_pay['due_on']>datetime.datetime.now().date(), 
    ['prolificID', 'completed_date', 'bonus_GBP', 'due_on']]

print(
    f"""Bonuses due in the future: \n"""
    f"""Total: n={len(bonuses_future)} participants, """
    f"""{(4/3) * sum(bonuses_future['bonus_GBP'])} GBP \n"""
    f"""{bonuses_future}"""
)

Bonuses due in the future: 
Total: n=61 participants, 276.5 GBP 
                   prolificID completed_date  bonus_GBP      due_on
16   59c0a297c458e800017b5499     2021-02-12      3.375  2021-05-13
23   5b6a87d2cda8590001db8e07     2021-02-12      3.000  2021-08-11
29   5bf3761862e1bc0001f15cb2     2021-02-11      3.000  2021-08-10
30   5c0e4e3d87e876000151cfec     2021-01-19      3.375  2021-04-19
36   5c2f9cddc5459b0001bae5b3     2021-02-12      3.750  2021-08-11
..                        ...            ...        ...         ...
340  601836ba0681ff045e6d6940     2021-02-13      3.750  2021-05-14
344  6019f33f17fdc21e027f72aa     2021-02-13      3.750  2021-08-12
349  601dfce11ffd11245d1bf9de     2021-02-11      3.000  2021-05-12
351  6021a1418a2f591e46a92aea     2021-02-13      3.750  2021-05-14
352  60220f3c276dd72c78daa7bb     2021-02-11      3.750  2021-08-10

[61 rows x 4 columns]


# Total costs

## Basic payment

In [154]:
data_pay['basic_GBP'] = 2.25
data_pay.loc[:, ['basic_GBP', 'bonus_GBP']].head(5)

Unnamed: 0,basic_GBP,bonus_GBP
0,2.25,3.0
3,2.25,3.0
6,2.25,3.0
8,2.25,2.625
9,2.25,3.75


## Total

In [172]:
n=len(data_pay)
total_GBP = data_pay['bonus_GBP'].sum() + data_pay['basic_GBP'].sum()
total_euro = (4/3) * total_GBP * 1.14
remaining_funds = 2101-total_euro

summary = pd.DataFrame({
    'n': [n],
    'basic_average': [data_pay['basic_GBP'].mean()], 
    'basic_total': [data_pay['basic_GBP'].sum()], 
    'bonus_average': [data_pay['bonus_GBP'].mean()], 
    'bonus_total': [data_pay['bonus_GBP'].sum()], 
    'service_fee': [(1/3) * total_GBP],
    'total': [(4/3) * total_GBP],
    'total € (incl. fee)': [(4/3) * total_GBP * 1.14]    
}).T

print(round(summary, 2))
print(f"""\n{round(remaining_funds, 2)}€ remaining funds""")

                           0
n                     250.00
basic_average           2.25
basic_total           562.50
bonus_average           3.00
bonus_total           749.62
service_fee           437.38
total                1749.50
total € (incl. fee)  1994.43

106.57€ remaining funds


# Prognosis for full budget

In [178]:
n=1
total_GBP = data_pay['bonus_GBP'].mean() + data_pay['basic_GBP'].mean()
total_euro = (4/3) * total_GBP * 1.14

summary = pd.DataFrame({
    'n': [n],
    'basic_average': [data_pay['basic_GBP'].mean()], 
    'bonus_average': [data_pay['bonus_GBP'].mean()], 
    'service_fee': [(1/3) * total_GBP],
    'total': [(4/3) * total_GBP],
    'total € (incl. fee)': [(4/3) * total_GBP * 1.14]    
}).T

print(
    f"""{round(summary, 2)}\n \n"""
    f"""{math.floor(remaining_funds / total_euro)} more participants possible"""
)


                        0
n                    1.00
basic_average        2.25
bonus_average        3.00
service_fee          1.75
total                7.00
total € (incl. fee)  7.98
 
13 more participants possible


# Feedback

In [None]:
print('Success! Script ran through.')