This notebook pre-processes the data of Study 1 of the paper.

Click the "Show Code" buttons to see the code associated with each output.

In [1]:
import pandas as pd
import numpy as np
import janitor
from datetime import datetime
import pytz

# Name of the study; used to build the data folder path and in the collection summary text.
STUDY_NAME = "Study 1"

def cet_to_utc(dt):
    """Convert a CET wall-clock timestamp string to a timezone-aware UTC datetime.

    Parameters
    ----------
    dt : str
        Timestamp formatted as "%d/%m/%Y %H:%M" (day first), interpreted as
        local time in the 'CET' zone.

    Returns
    -------
    datetime.datetime
        The same instant, expressed in UTC.
    """
    naive = datetime.strptime(dt, "%d/%m/%Y %H:%M")
    # localize() attaches the CET zone to the naive value without shifting the clock time.
    cet_aware = pytz.timezone('CET').localize(naive)
    return cet_aware.astimezone(pytz.timezone('UTC'))

def print_collection_information(df):
    """Build a sentence stating when data collection started and ended.

    Despite its name, this function prints nothing: it returns the sentence.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw export with "StartDate" and "EndDate" columns holding timestamp
        strings in the format accepted by `cet_to_utc`.

    Returns
    -------
    str
        Sentence naming the earliest start and the latest end, both
        converted from CET to UTC.
    """
    # Compare parsed datetimes rather than the raw strings: day-first text
    # ("01/04/2019" vs "31/03/2019") does not sort chronologically.
    # Missing timestamps are skipped so they cannot be picked as the extreme.
    start_utc = min(cet_to_utc(stamp) for stamp in df["StartDate"].dropna())
    end_utc = max(cet_to_utc(stamp) for stamp in df["EndDate"].dropna())

    # %I is the 12-hour clock that pairs with %p; %H with %p rendered "20:45 PM".
    time_format = "%B %d, %Y at %I:%M %p"
    startstring = start_utc.strftime(time_format)
    endstring = end_utc.strftime(time_format)
    return f"{STUDY_NAME} was started on {startstring} and ended on {endstring}."

# Load the raw export for this study; the path is relative to the notebook's working directory.
df = pd.read_csv(f"../Data/{STUDY_NAME}/RawData.csv")

Data collection information (timestamps are converted from CET to UTC):

In [2]:
# Bare expression: the returned summary string is shown as the cell output.
print_collection_information(df)

'Study 1 was started on March 13, 2019 at 20:45 PM and ended on March 14, 2019 at 01:01 AM.'

Number of participants in raw Qualtrics Export:

In [3]:
# Number of rows in the raw export.
df.shape[0]

300

Number of valid participants (for whom the difference between Maximum and Minimum price is greater than or equal to 0, i.e., not negative, for both the white and the red wine):

In [4]:
# Reported price range per wine: priciest minus cheapest.
for wine in ("White", "Red"):
    df[f"Range_{wine}"] = df[f"Priciest_{wine}"] - df[f"Cheapest_{wine}"]

# Lookup tables that decode `condid` into the Name_Manipulated and
# SD_Manipulated factors, plus a display label for each SD level.
condition_to_name = {0: "White", 1: "White", 2: "White", 3: "Red", 4: "Red", 5: "Red"}
condition_to_sd = {0: "Low", 1: "High", 2: "Med", 3: "Low", 4: "High", 5: "Med"}
sd_to_label = {
    "Low": "Low (SD = 1.1)",
    "Med": "Med (SD = 4.5)",
    "High": "High (SD = 7.5)",
}

df["Name_Manipulated"] = df["condid"].map(condition_to_name)
df["SD_Manipulated"] = df["condid"].map(condition_to_sd)
df["SD_Manipulated_Label"] = df["SD_Manipulated"].map(sd_to_label)

# A response is valid when neither wine has a negative range.
df["Valid_Response"] = df["Range_Red"].ge(0) & df["Range_White"].ge(0)

# Count of valid rows, displayed as the cell output.
int(df["Valid_Response"].sum())

293

In [5]:
# Each participant reported two sets of responses (white and red wine); pivot the
# data to long form so that each row is one participant x wine ("Target") pair.

# Qualtrics quirk: responses were coded from 33..., so subtracting 32 maps the
# first response option to 1.
QUALTRICS_CODE_OFFSET = 32

df_long = (
    pd.wide_to_long(
        df,
        i=[
            "turkid",
            "condid",
            "Name_Manipulated",
            "SD_Manipulated",
            "SD_Manipulated_Label",
        ],
        stubnames=["Priciest", "Cheapest", "Range", "Unpredictable", "Inconsistent"],
        j="Target",
        sep="_",
        suffix="(White|Red)",
    )
    .reset_index()
    .rename_columns(
        {
            "Priciest": "Memory_Max",
            "Cheapest": "Memory_Min",
            "Range": "Memory_Range",
            "turkid": "TurkID",
        }
    )
)

# Undo the Qualtrics coding offset on both rating columns.
for rating in ["Unpredictable", "Inconsistent"]:
    df_long[rating] = df_long[rating] - QUALTRICS_CODE_OFFSET

# True on the row whose wine matches the participant's Name_Manipulated value.
df_long["Is_Manipulated"] = df_long["Target"] == df_long["Name_Manipulated"]

# Columns kept in the exported long-form file, in output order.
cols = [
    "TurkID",
    "Name_Manipulated",
    "SD_Manipulated",
    "Is_Manipulated",
    "SD_Manipulated_Label",
    "Target",
    "Memory_Max",
    "Memory_Min",
    "Memory_Range",
    "Unpredictable",
    "Inconsistent",
    "Valid_Response",
]

# clean_names() (pyjanitor) normalises the column names before export.
df_long = df_long[cols].clean_names()
# index=False is the documented boolean for omitting the row index
# (previously index=None, which only worked because None is falsy).
df_long.to_csv(f"../Data/{STUDY_NAME}/LongData.csv", index=False)