This notebook pre-processes the data of Study 8 of the paper.

Click the "Show Code" buttons to see the code associated with each output.

In [1]:
import pandas as pd
import numpy as np

STUDY_NAME = "Study 8"

def print_collection_information(df):
    """
    Summarize when data collection started and ended.

    Args:
        df: DataFrame with datetime-like "startdate" and "enddate" columns.

    Returns:
        A sentence giving the earliest start time and the latest end time,
        prefixed with STUDY_NAME. Despite its name, the function returns the
        sentence instead of printing it.
    """
    # %I (12-hour clock) is the directive that pairs with %p. The previous %H
    # (24-hour clock) produced contradictory stamps such as "17:47 PM".
    timestamp_format = "%B %d, %Y at %I:%M %p"
    # min()/max() skip missing timestamps, whereas sorting places them last and
    # would have picked a missing value as the end date.
    startstring = df.startdate.min().strftime(timestamp_format)
    endstring = df.enddate.max().strftime(timestamp_format)
    return f"{STUDY_NAME} was started on {startstring} and ended on {endstring}."


# Load the raw data for this study (one row per participant, per the count reported below).
# NOTE(review): unpickling can execute arbitrary code; only load RawData.pickle from a trusted source.
df = pd.read_pickle(f"../Data/{STUDY_NAME}/RawData.pickle")

Data collection information:

In [2]:
# Report the first start time and the last end time found in the raw data.
print_collection_information(df)

'Study 8 was started on February 22, 2017 at 17:47 PM and ended on February 22, 2017 at 20:43 PM.'

Number of participants in the database:

In [3]:
# Row count of the raw data, reported as the number of participants.
len(df)

150

In [4]:
def has_both_allocations(x, n_buckets=25):
    """
    Check if both allocations were correctly recorded.

    Args:
        x: A row (or any object) exposing `alloc_item_one` and `alloc_item_two`.
        n_buckets: Number of entries a correctly recorded allocation must have.
            Defaults to 25.

    Returns:
        True if both allocations have exactly `n_buckets` entries, False
        otherwise (including when an allocation is missing altogether).
    """
    def _is_recorded(allocation):
        # A missing allocation (None, NaN, ...) has no length; treat it as
        # "not recorded" instead of letting len() raise a TypeError.
        try:
            return len(allocation) == n_buckets
        except TypeError:
            return False

    # Logical `and` (not bitwise `&`): both operands are plain booleans.
    return _is_recorded(x.alloc_item_one) and _is_recorded(x.alloc_item_two)

Number of valid participants (those for whom both allocations were correctly recorded):

In [5]:
# Keep only the participants for whom both allocations were recorded.
is_valid = df.apply(has_both_allocations, axis=1)
df_clean = df[is_valid].copy()
print(len(df_clean))

149


In [6]:
def convert_allocation_to_distribution(allocation, buckets, isstr=False):
    """
    Takes an allocation of balls to buckets, and a list of buckets.
    Return the corresponding distribution of values.

    Each bucket value is repeated as many times as the number of balls
    allocated to that bucket.

    Args:
        allocation: Number of balls in each bucket, either as a sequence of
            integers or as a comma-separated string (e.g. "0, 3, 1, 2, 1").
        buckets: Value associated with each bucket, in the same order as
            `allocation`.
        isstr: Set to True to parse `allocation` as a comma-separated string.
            A `str` allocation is also parsed automatically.

    Returns:
        A numpy array containing each bucket value repeated by its count.

    Raises:
        ValueError: If the number of counts differs from the number of
            buckets, or if a string token is not a valid integer.

    Example:
        buckets = [1, 2, 3, 4, 5]
        x = "0, 3, 1, 2, 1"
        dist = convert_allocation_to_distribution(x, buckets, isstr=True)
        print(dist) -> [2 2 2 3 4 4 5]
    """
    if isstr or isinstance(allocation, str):
        # np.repeat requires integer counts; the raw tokens are strings such
        # as " 3", so convert them (int() tolerates surrounding whitespace).
        counts = [int(token) for token in allocation.split(",")]
    else:
        counts = allocation
    if len(counts) != len(buckets):
        raise ValueError("The number of buckets should match the length of the allocations.")
    return np.repeat(buckets, counts)

# Converting the allocations into distributions, and storing them in respective columns, before dropping the "allocation" columns
df_clean["dist_item_one"] = df_clean.alloc_item_one.apply(convert_allocation_to_distribution, buckets=np.arange(1, 51, 2))
df_clean["dist_item_two"] = df_clean.alloc_item_two.apply(convert_allocation_to_distribution, buckets=np.arange(1, 51, 2))
df_clean.drop(columns=["alloc_item_one", "alloc_item_two", "dist_order"], inplace=True)

In [7]:
# Each participant reported two distributions (item_one and item_two):
# reshape the data to long form, one row per participant and item.
df_long = pd.wide_to_long(
    df_clean,
    stubnames=["wine", "dist"],
    i="pid",
    j="item_name",
    sep="_",
    suffix="\\D+",
).reset_index()
# Recode the column suffixes as the labels compared against name_distractor below.
item_labels = {"item_one": "ItemOne", "item_two": "ItemTwo"}
df_long["item_name"] = df_long["item_name"].map(item_labels)
# Flag whether the reported distribution is the distractor or the focal one.
df_long["is_distractor"] = df_long["name_distractor"] == df_long["item_name"]
# Number of participants times two distributions:
print(len(df_long))
df_long.to_pickle(f"../Data/{STUDY_NAME}/CleanData.pickle")

298


In [8]:
# Reshape the distributions to long form: one line per value per distribution per participant.
id_columns = ['turkid', 'is_distractor', 'wine', 'mean_distractor']
# Spread each distribution over one column per value, then stack those columns into rows.
dist_wide = df_long.set_index(id_columns)['dist'].apply(pd.Series)
df_dist = dist_wide.stack().reset_index()
df_dist.columns = ["turkid", "is_manipulated", "wine", 'mean_manipulated', "value_id", "value"]
print(len(df_dist))
df_dist.to_csv(f"../Data/{STUDY_NAME}/LongData.csv", index=None)

7450
