This notebook pre-processes the data of Study 5 of the paper.

Click the "Show Code" buttons to see the code associated with each output.

In [1]:
import pandas as pd
import numpy as np
import janitor
from datetime import datetime
import pytz

# Study label: appears in the report sentences and selects the ../Data/<study>/ folder.
STUDY_NAME = "Study 5"

def print_prereg_time_utc(dt):
    """Return a sentence stating when the study was preregistered, in UTC.

    Parameters
    ----------
    dt : str
        Pre-registration time as US/Pacific wall-clock time, written as
        ``"MM/DD/YYYY HH:MM AM"`` or ``"... PM"`` (e.g. ``"02/15/2020 08:31 AM"``).

    Returns
    -------
    str
        ``"<STUDY_NAME> was preregistered on <Month DD, YYYY at HH:MM AM/PM>."``
        with the timestamp converted to UTC.

    Raises
    ------
    ValueError
        If ``dt`` matches neither accepted layout.
    """
    local_tz = pytz.timezone('US/Pacific')
    target_tz = pytz.timezone('UTC')
    try:
        # %p only influences the parsed hour when the hour is read with %I
        # (12-hour clock). With %H the AM/PM marker is silently ignored, so
        # "08:31 PM" used to be read as 08:31 in the morning.
        parsed = datetime.strptime(dt, "%m/%d/%Y %I:%M %p")
    except ValueError:
        # Hours 0 and 13-23 are not valid for %I; keep accepting such
        # 24-hour inputs exactly as the previous implementation did.
        parsed = datetime.strptime(dt, "%m/%d/%Y %H:%M %p")
    # The output layout (24-hour hour followed by an AM/PM marker) is kept
    # unchanged so the returned sentence is identical for inputs that were
    # already parsed correctly.
    preregstring = target_tz.normalize(local_tz.localize(parsed)).strftime("%B %d, %Y at %H:%M %p")
    return f"{STUDY_NAME} was preregistered on {preregstring}."

def cet_to_utc(dt):
    """Convert a CET wall-clock timestamp string to a timezone-aware UTC datetime.

    ``dt`` must be formatted as ``"DD/MM/YYYY HH:MM"``; a ``ValueError`` from
    ``strptime`` propagates if it is not.
    """
    naive = datetime.strptime(dt, "%d/%m/%Y %H:%M")
    # Attach the CET zone to the naive value, then express the same instant in UTC.
    cet_aware = pytz.timezone('CET').localize(naive)
    return pytz.timezone('UTC').normalize(cet_aware)

def print_collection_information(df):
    """Return a sentence with the first start and last end of data collection, in UTC.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have ``StartDate`` and ``EndDate`` columns holding strings in the
        ``"DD/MM/YYYY HH:MM"`` layout accepted by ``cet_to_utc``.

    Returns
    -------
    str
        ``"<STUDY_NAME> was started on <...> and ended on <...>."``

    Raises
    ------
    ValueError
        If either column has no non-missing value.
    """
    # Convert before comparing: the raw values are day-first strings, so
    # sorting them as text is not chronological once the data spans more than
    # one month (e.g. "01/03/2020" sorts before "28/02/2020").
    # Missing values are dropped because they cannot be parsed.
    starts = [cet_to_utc(stamp) for stamp in df.StartDate.dropna()]
    ends = [cet_to_utc(stamp) for stamp in df.EndDate.dropna()]
    if not starts or not ends:
        raise ValueError("df has no StartDate/EndDate values to report.")
    startstring = min(starts).strftime("%B %d, %Y at %H:%M %p")
    endstring = max(ends).strftime("%B %d, %Y at %H:%M %p")
    return f"{STUDY_NAME} was started on {startstring} and ended on {endstring}."

# Load the raw export for this study; the path is relative to the notebook's working directory.
df = pd.read_csv(f"../Data/{STUDY_NAME}/RawData.csv")

Pre-registration information:

In [2]:
# The timestamp is given in US/Pacific local time; the sentence reports it in UTC.
print_prereg_time_utc("02/15/2020 08:31 AM")

'Study 5 was preregistered on February 15, 2020 at 16:31 PM.'

Data collection information:

In [3]:
# StartDate/EndDate are read as CET wall-clock times; the sentence reports them in UTC.
print_collection_information(df)

'Study 5 was started on February 15, 2020 at 16:52 PM and ended on February 16, 2020 at 02:07 AM.'

Number of participants in the raw Qualtrics export:

In [4]:
# Rows in the raw export, counted before the "Excluded" flag is computed further down.
df.shape[0]

503

In [5]:
# Thresholds applied to the reported minimum price.
TRUE_MIN_PRICE = 240  # true minimum price; lower reports are flagged in Below_True_Min
MEMORY_MIN_EXCLUSION_CUTOFF = 320  # reports above this are marked Excluded = "Yes" (rationale not shown here -- TODO confirm against the pre-registration)

df["Condition"] = df.dispersion.map({"same": "Equal dispersion", "higher": "Higher dispersion"}) # Nicer condition labels
# NOTE(review): the label is the opposite of the value in `manipulated`
# ("florida" -> "Colorado" and vice versa). Left unchanged; confirm this crossing is intended.
df["Label_Category"] = df.manipulated.map({"florida": "Colorado", "colorado": "Florida"})

# The Colorado/Florida columns are collapsed into a single column by summing
# them with missing values treated as 0 (presumably only one of each pair is
# filled per participant -- verify). A row with both values missing therefore
# becomes 0, which also sets Below_True_Min to 1 for that row.
# Fix: the average previously selected "Avg_Florida" twice, doubling Florida
# reports and zeroing Colorado ones; it now mirrors the Min_* pair below.
# Assumes the export has an "Avg_Colorado" column -- confirm.
df["Memory_Avg"] = df[["Avg_Colorado", "Avg_Florida"]].fillna(0).sum(axis=1) # Reported average
df["Memory_Min"] = df[["Min_Colorado", "Min_Florida"]].fillna(0).sum(axis=1) # Reported minimum price

df["Below_True_Min"] = (df.Memory_Min < TRUE_MIN_PRICE).astype(int) # Dummy coding if minimum price below true minimum price
df["Paid_Price"] = df.PaidPrice
df["Search_Length"] = df.SearchLength
df["Excluded"] = (df.Memory_Min > MEMORY_MIN_EXCLUSION_CUTOFF).map({False: "No", True: "Yes"})
cols = ['ResponseId', 'Label_Category', 'Condition', 'Memory_Avg', 'Memory_Min', 'Below_True_Min', 'Search_Length',
        'Paid_Price', 'Excluded']

# `df` is replaced by the cleaned subset here, so re-running this cell requires
# re-running the cell that loads the raw data first.
df = df[cols].rename_columns({"ResponseId": "turkid"}).clean_names() # Subsetting columns of interests

df.to_csv(f"../Data/{STUDY_NAME}/CleanData.csv", index=False)