In [1]:
import pandas as pd
import numpy as np
# Load the raw Study 4 data.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
df = pd.read_pickle("../Data/Study 4/RawData.pickle")

Number of participants in database:

In [2]:
# Row count of the raw data.
len(df)

304

Data collection started on:

In [3]:
# Reads the `startdate` entry at index 0 and formats it as day-month-year hour:minute.
# NOTE(review): this is the start of data collection only if that entry is the
# earliest response -- confirm that the rows are in chronological order.
df.startdate[0].strftime("%d-%m-%y %H:%M")

'07-08-17 21:56'

In [4]:
def has_both_allocations(x, n_buckets=25):
    """
    Check if both allocations were correctly recorded.

    An allocation counts as recorded when it is a sized object (list, tuple,
    array, ...) holding exactly `n_buckets` entries. Anything without a length
    (e.g. None or a NaN scalar) counts as not recorded instead of raising.

    Parameters
    ----------
    x : object exposing `alloc_item_one` and `alloc_item_two` attributes
        (for instance a DataFrame row passed by `df.apply(..., axis=1)`).
    n_buckets : int, default 25
        Number of entries a complete allocation must contain.

    Returns
    -------
    bool
        True only if both allocations have exactly `n_buckets` entries.
    """
    def _is_recorded(alloc):
        # len() raises TypeError on unsized values such as None or a float NaN;
        # those are treated as "not recorded". Using len() directly (rather than
        # the truthiness of `alloc`) also avoids the ambiguous-truth-value error
        # that array-like allocations would trigger.
        try:
            return len(alloc) == n_buckets
        except TypeError:
            return False

    # Plain boolean `and` rather than the bitwise `&`.
    return _is_recorded(x.alloc_item_one) and _is_recorded(x.alloc_item_two)

Number of valid participants (both allocations recorded with 25 entries each):

In [5]:
# Keep only the rows for which both allocations were recorded, working on a copy
# of the filtered frame, then report how many rows remain.
df_clean = df[df.apply(has_both_allocations, axis=1)].copy()
print(len(df_clean))

301


In [6]:
def convert_allocation_to_distribution(allocation, buckets, isstr=False):
    """
    Takes an allocation of balls to buckets, and a list of buckets.
    Return the corresponding distribution of values.

    Parameters
    ----------
    allocation : sequence of int, or str
        Number of balls placed in each bucket, in bucket order. A string is
        interpreted as comma-separated integer counts (surrounding whitespace
        is ignored).
    buckets : sequence
        The value associated with each bucket.
    isstr : bool, default False
        Force parsing `allocation` as a comma-separated string. Strings are
        also detected automatically, so this flag is only kept for backward
        compatibility.

    Returns
    -------
    numpy.ndarray
        Each bucket value repeated as many times as balls were allocated to it.

    Raises
    ------
    ValueError
        If the number of counts differs from the number of buckets, or if a
        string token is not an integer.

    Example:
        buckets = [1, 2, 3, 4, 5]
        x = "0, 3, 1, 2, 1"
        dist = convert_allocation_to_distribution(x, buckets)
        print(dist) -> [2 2 2 3 4 4 5]
    """
    if isstr or isinstance(allocation, str):
        # np.repeat needs integer repeat counts; the raw tokens are strings
        # (possibly with leading spaces), so convert them explicitly.
        arr = [int(count) for count in allocation.split(",")]
    else:
        arr = allocation
    if len(arr) != len(buckets):
        raise ValueError("The number of buckets should match the length of the allocations.")
    return np.repeat(buckets, arr)

# Turn each recorded allocation into its distribution of values (the buckets are the
# odd numbers 1, 3, ..., 49), store the results in the "dist_item_*" columns, and then
# discard the raw allocation columns together with "dist_order".
df_clean = (
    df_clean
    .assign(
        dist_item_one=df_clean["alloc_item_one"].apply(
            convert_allocation_to_distribution, buckets=np.arange(1, 51, 2)
        ),
        dist_item_two=df_clean["alloc_item_two"].apply(
            convert_allocation_to_distribution, buckets=np.arange(1, 51, 2)
        ),
    )
    .drop(columns=["alloc_item_one", "alloc_item_two", "dist_order"])
)

In [7]:
def get_shown_dist(x, itemone):
    """
    Collect the values of `x["value_prices"]` that belong to one item.

    `x["label_prices"]` is read in parallel with `x["value_prices"]`; a value is
    kept when its label equals 1 (if `itemone` is True) or 0 (otherwise).

    Returns the kept values, in their original order, as a comma-separated string.
    """
    # Deliberately an equality test against True (not a truthiness test), so that
    # only True-like flags select label 1.
    wanted_label = 1 if itemone == True else 0
    selected = []
    for value, label in zip(x["value_prices"], x["label_prices"]):
        if label == wanted_label:
            selected.append(str(value))
    return ",".join(selected)

# For each item, extract the values shown to the participant (get_shown_dist returns
# them as a comma-separated string) and parse that string into a list of floats.
df_clean["showndist_item_one"] = (
    df_clean
    .apply(get_shown_dist, axis=1, args=(True,))
    .apply(lambda shown: [float(token) for token in shown.split(",")])
)
df_clean["showndist_item_two"] = (
    df_clean
    .apply(get_shown_dist, axis=1, args=(False,))
    .apply(lambda shown: [float(token) for token in shown.split(",")])
)

In [8]:
# Label each shown distribution "Low" when its standard deviation is below 3 and
# "High" otherwise, and record the mean of each shown distribution.
df_clean = df_clean.assign(
    variance_item_one=df_clean["showndist_item_one"].apply(
        lambda shown: "Low" if np.std(shown) < 3 else "High"
    ),
    variance_item_two=df_clean["showndist_item_two"].apply(
        lambda shown: "Low" if np.std(shown) < 3 else "High"
    ),
    mean_item_one=df_clean["showndist_item_one"].apply(lambda shown: np.mean(shown)),
    mean_item_two=df_clean["showndist_item_two"].apply(lambda shown: np.mean(shown)),
)

In [9]:
# Each participant reported two distributions (item_one and item_two); reshape the
# data to long form so that each row holds one participant-item pair.
df_long = pd.wide_to_long(
    df_clean,
    stubnames=["dist", "showndist"],
    i="pid",
    j="item",
    sep="_",
    suffix=r"\D+",
).reset_index()

In [10]:
# "focal" columns carry the variance label / mean of the row's own item, "other"
# columns carry those of the remaining item. Each column takes the item_one value
# where the condition holds and the item_two value everywhere else.
df_long = df_long.assign(
    sd_focal=lambda d: d["variance_item_one"].where(
        d["item"] == "item_one", d["variance_item_two"]
    ),
    sd_other=lambda d: d["variance_item_one"].where(
        d["item"] != "item_one", d["variance_item_two"]
    ),
    mean_focal=lambda d: d["mean_item_one"].where(
        d["item"] == "item_one", d["mean_item_two"]
    ),
    mean_other=lambda d: d["mean_item_one"].where(
        d["item"] != "item_one", d["mean_item_two"]
    ),
)

In [11]:
# Number of rows in the long frame (participants times two distributions), followed
# by saving the long frame to disk.
print(len(df_long))
df_long.to_pickle("../Data/Study 4/CleanData.pickle")

602


In [12]:
# Pivot the distributions to long form: one line per value per distribution per
# participant. Every identifying column goes into the index, each "dist" entry is
# expanded into one column per position, and stack() folds those columns into rows.
df_dist = (
    df_long
    .set_index([
        "pid", "turkid", "item",
        "variance_item_one", "variance_item_two",
        "mean_item_one", "mean_item_two",
        "sd_focal", "sd_other",
        "mean_focal", "mean_other",
    ])["dist"]
    .apply(pd.Series)
    .stack()
    .reset_index()
)
# The variance_item_* levels are exported under sd_item_* names; the last two columns
# are the position of a value within its distribution and the value itself.
df_dist.columns = [
    "pid", "turkid", "item",
    "sd_item_one", "sd_item_two",
    "mean_item_one", "mean_item_two",
    "sd_focal", "sd_other",
    "mean_focal", "mean_other",
    "value_id", "value",
]
print(len(df_dist))
df_dist.to_csv("../Data/Study 4/LongData.csv", index=None)

15652
