This notebook pre-processes the data of Study 1 in the Appendix (the "Range" study).

Click the "Show Code" buttons to see the code associated with each output.

In [1]:
import pandas as pd
import numpy as np

STUDY_NAME = "Study A1"
    
def print_collection_information(df):
    startstring = df.sort_values("startdate").startdate.iloc[0].strftime("%B %d, %Y at %H:%M %p")
    endstring = df.sort_values("enddate").enddate.iloc[-1].strftime("%B %d, %Y at %H:%M %p")
    return f"{STUDY_NAME} was started on {startstring} and ended on {endstring}."


df = pd.read_pickle(f"../Data/{STUDY_NAME}/RawData.pickle")

Data collection information:

In [2]:
print_collection_information(df)

'Study A1 was started on July 31, 2019 at 16:04 PM and ended on July 31, 2019 at 21:14 PM.'

Number of participants in database:

In [3]:
df.shape[0]

399

Number of valid participants:

In [4]:
def has_both_allocations(x):
    """
    Check if both allocations were correctly recorded.
    """
    alloc1_recorded = (len(x.alloc_item_one) == 25)
    alloc2_recorded = (len(x.alloc_item_two) == 25)
    return alloc1_recorded & alloc2_recorded

df_clean = df[df.apply(has_both_allocations, axis=1)].copy()
print(df_clean.shape [0])

395


In [5]:
def convert_allocation_to_distribution(allocation, buckets, isstr=False):
    """
    Takes an allocation of balls to buckets, and a list of buckets.
    Return the corresponding distribution of values.
    
    Example: 
        buckets = [1, 2, 3, 4, 5]
        x = "0, 3, 1, 2, 1"
        dist = convert_allocation_to_distribution(x, buckets)
        print(dist) -> (2, 2, 2, 3, 4, 4, 5)
    """
    if isstr:
        arr = allocation.split(",")
    else:
        arr = allocation
    if len(arr) != len(buckets):
        raise ValueError("The number of buckets should match the length of the allocations.")
    values = np.repeat(buckets, arr)
    return values

# Converting the allocations into distributions, and storing them in respective columns, before dropping the "allocation" columns
df_clean["dist_item_one"] = df_clean.alloc_item_one.apply(convert_allocation_to_distribution, buckets=np.arange(2, 52, 2))
df_clean["dist_item_two"] = df_clean.alloc_item_two.apply(convert_allocation_to_distribution, buckets=np.arange(2, 52, 2))
df_clean.drop(columns=["alloc_item_one", "alloc_item_two"], inplace=True)

In [6]:
# Each participant reported two distributions (item_one and item_two), pivoting the data in long form.
df_long = pd.wide_to_long(df_clean, stubnames=["wine", "history", "dist"], i="pid", j="value", suffix="\\D+", sep="_").reset_index()

# Coding if the reported distribution was for the focal distribution or the distractor
df_long["is_distractor"] = (df_long.name_distractor == df_long.wine)

#Number of participants times two distributions:
print(df_long.shape[0])
df_long.to_pickle("../Data/Appendix Study 1/CleanData.pickle")

790


FileNotFoundError: [Errno 2] No such file or directory: '../Data/Appendix Study 1/CleanData.pickle'

In [None]:
# Pivoting the distributions in long form: one line per value per distribution per participant.
df_dist = df_long.set_index(['pid', 'turkid', 'name_distractor', 'is_distractor', 'wine', 'shape_distractor'])['dist'].apply(pd.Series).stack().reset_index()
df_dist.columns = ['pid', 'turkid', 'name_manipulated', 'is_manipulated', 'wine', 
                             'shape_manipulated', "value_id", "value"]
print(df_dist.shape[0])

df_dist.to_pickle("../Data/Appendix Study 1/LongData.pickle")
df_dist.to_csv("../Data/Appendix Study 1/LongData.csv", index=None)