This notebook pre-processes the data of Study 2 of the paper.

Click the "Show Code" buttons to see the code associated with each output.

In [1]:
import pandas as pd
import numpy as np

STUDY_NAME = "Study 2"

def print_collection_information(df):
    """
    Summarize when the data collection started and ended.

    Parameters:
        df: a DataFrame with datetime-like `startdate` and `enddate` columns.

    Returns:
        A sentence giving the earliest `startdate` and the latest `enddate`.
        Despite the function's name, the sentence is returned, not printed.
    """
    # %I is the 12-hour clock that goes with %p. The 24-hour %H combined
    # with %p produced strings such as "22:16 PM".
    timestamp_format = "%B %d, %Y at %I:%M %p"
    # min()/max() skip missing timestamps, whereas sorting puts them last
    # and would have returned a missing value as the end date.
    startstring = df.startdate.min().strftime(timestamp_format)
    endstring = df.enddate.max().strftime(timestamp_format)
    return f"{STUDY_NAME} was started on {startstring} and ended on {endstring}."


# Load the raw data of the study.
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
raw_data_path = f"../Data/{STUDY_NAME}/RawData.pickle"
df = pd.read_pickle(raw_data_path)

Data collection information:

In [2]:
# Display the sentence describing the first start time and the last end time in the raw data.
print_collection_information(df)

'Study 2 was started on August 05, 2019 at 22:16 PM and ended on August 06, 2019 at 01:07 AM.'

Number of participants in database:

In [3]:
# Number of rows in the raw data (one row per participant in the database).
len(df)

299

Number of valid participants:

In [4]:
def has_both_allocations(x):
    """
    Tell whether the row holds a 25-entry allocation for both items.

    Returns True only when `x.alloc_item_one` and `x.alloc_item_two`
    each have a length of exactly 25.
    """
    # Both lengths are measured up front so that each allocation is always inspected.
    n_first, n_second = len(x.alloc_item_one), len(x.alloc_item_two)
    return n_first == 25 and n_second == 25
# Keep only the participants for whom both allocations were recorded.
valid_rows = df.apply(has_both_allocations, axis=1)
df_clean = df[valid_rows].copy()
print(df_clean.shape[0])
# Mean gap between consecutive entries of `clicks`, divided by 1000
# (presumably milliseconds converted to seconds -- TODO confirm the unit).
mean_click_gap = df_clean.clicks.apply(lambda x: np.diff(x)/1000).apply(lambda x: x.mean())
# NOTE(review): 52 is presumably the number of learning intervals per participant -- confirm.
df_clean["total_learning_time"] = mean_click_gap*52
df_clean["avg_learning_time"] = mean_click_gap

296


In [5]:
def convert_allocation_to_distribution(allocation, buckets, isstr=False):
    """
    Takes an allocation of balls to buckets, and a list of buckets.
    Return the corresponding distribution of values.

    Parameters:
        allocation: the number of balls in each bucket, given as a sequence
            of counts (integers or integer-like strings) or, when `isstr`
            is True, as a single comma-separated string.
        buckets: the value associated with each bucket. It must have as many
            entries as the allocation.
        isstr: set to True when `allocation` is a comma-separated string.

    Returns:
        A numpy array in which each bucket value is repeated as many times
        as there are balls in that bucket.

    Raises:
        ValueError: if the number of counts differs from the number of
            buckets, or if a count cannot be parsed as an integer.

    Example:
        buckets = [1, 2, 3, 4, 5]
        x = "0, 3, 1, 2, 1"
        dist = convert_allocation_to_distribution(x, buckets, isstr=True)
        print(dist) -> [2 2 2 3 4 4 5]
    """
    if isstr:
        arr = allocation.split(",")
    else:
        arr = allocation
    if len(arr) != len(buckets):
        raise ValueError("The number of buckets should match the length of the allocations.")
    # Convert the counts explicitly: entries obtained from split(",") are
    # strings (possibly with surrounding spaces), and np.repeat expects
    # integer repeat counts.
    counts = [int(count) for count in arr]
    values = np.repeat(buckets, counts)
    return values

# Turn each item's allocation into its distribution of values, store it in a
# "dist" column, then discard the raw "allocation" columns.
value_buckets = np.arange(1, 51, 2)
for alloc_column, dist_column in (("alloc_item_one", "dist_item_one"),
                                  ("alloc_item_two", "dist_item_two")):
    df_clean[dist_column] = df_clean[alloc_column].apply(convert_allocation_to_distribution, buckets=value_buckets)
df_clean.drop(columns=["alloc_item_one", "alloc_item_two"], inplace=True)

In [6]:
# Every participant reported two distributions (item_one and item_two):
# reshape the data so that each of them gets its own row.
df_long = pd.wide_to_long(
    df_clean,
    stubnames=["name", "history", "dist"],
    i="pid",
    j="value",
    suffix="?(item_one|item_two)",
    sep="_",
)
df_long = df_long.reset_index()

# Flag the rows whose item name is the one stored in `name_distractor`
# (the distractor), as opposed to the focal distribution.
df_long["is_distractor"] = df_long["name"] == df_long["name_distractor"]

# Number of participants times two distributions:
print(df_long.shape[0])
df_long.to_pickle(f"../Data/{STUDY_NAME}/CleanData.pickle")

592


In [7]:
# Reshape the distributions in long form: one line per value per distribution per participant.
id_columns = ['turkid', 'avg_learning_time', 'total_learning_time',
              'name_distractor', 'is_distractor', 'name', 'sd_distractor']
# Spread each distribution over one column per value, then stack those columns into rows.
values_wide = df_long.set_index(id_columns)['dist'].apply(pd.Series)
df_dist = values_wide.stack().reset_index()
# The "*_distractor" identifiers are exported under "*_manipulated" names.
df_dist.columns = ['turkid', 'avg_learning_time', 'total_learning_time',
                   'name_manipulated', 'is_manipulated', 'name', 'sd_manipulated',
                   "value_id", "value"]
print(df_dist.shape[0])
df_dist.to_csv(f"../Data/{STUDY_NAME}/LongData.csv", index=None)

15392


In [8]:
# Values of the buckets over which the allocations are defined.
history_buckets = np.arange(1, 51, 2)


def _true_sd(row):
    """Return 4.5 for a non-distractor row or a "Med" distractor, 1.1 for a "Low" distractor, 7.5 otherwise."""
    if (row.is_distractor == False) or (row.sd_distractor == "Med"):
        return 4.5
    if row.sd_distractor == "Low":
        return 1.1
    return 7.5


def _parse_allocation(allocation_string):
    """Parse a comma-separated allocation string into a list of integer counts."""
    # Explicit conversion: np.repeat expects integer repeat counts, not strings.
    return [int(count) for count in allocation_string.split(",")]


df_long["true_sd"] = df_long.apply(_true_sd, axis=1)
# Distribution implied by the last allocation of each history, computed once
# and reused for both the standard deviation and the mean.
final_dists = df_long.history.apply(lambda x: np.repeat(history_buckets, _parse_allocation(x[-1])))
df_long["est_sd"] = final_dists.apply(lambda d: np.std(d))
# NOTE(review): diff_sd is (true - estimated) whereas diff_m is (estimated - 25);
# confirm that the opposite sign conventions are intended.
df_long["diff_sd"] = df_long["true_sd"] - df_long["est_sd"]
df_long["diff_m"] = final_dists.apply(lambda d: np.mean(d)) - 25

# Pivoting the distributions in long form: one line per allocation state per item per participant.
df_hist = df_long.set_index(['pid', 'is_distractor', 'sd_distractor', "diff_sd", "diff_m"])['history'].apply(pd.Series).stack().reset_index()
# The "*_distractor" identifiers are stored under "*_manipulated" names.
df_hist.columns = ['pid', 'is_manipulated', 'sd_manipulated', 'diff_sd', 'diff_m', 'timestamp', 'value']

# Converting the allocations to distributions
allocs = df_hist.value.apply(_parse_allocation) # All allocations, as integer counts
dists = allocs.apply(convert_allocation_to_distribution, buckets=history_buckets) # Converted to distributions
df_hist["distribution"] = dists
df_hist.to_pickle(f"../Data/{STUDY_NAME}/HistoryData.pickle")