This notebook pre-processes the data of Study 4 of the paper.

Click the "Show Code" buttons to see the code associated with each output.

In [1]:
import pandas as pd
import numpy as np
import janitor
from datetime import datetime
import pytz

STUDY_NAME = "Study 4"

def cet_to_utc(dt):
    local_tz = pytz.timezone('CET')
    target_tz = pytz.timezone('UTC')
    dt = datetime.strptime(dt, "%d/%m/%Y %H:%M")
    return target_tz.normalize(local_tz.localize(dt))

def print_collection_information(df):
    starttime = df.sort_values("StartDate").StartDate.iloc[0]
    endtime = df.sort_values("EndDate").EndDate.iloc[-1]
    startstring = cet_to_utc(starttime).strftime("%B %d, %Y at %H:%M %p")
    endstring = cet_to_utc(endtime).strftime("%B %d, %Y at %H:%M %p")
    return f"{STUDY_NAME} was started on {startstring} and ended on {endstring}."

df = pd.read_csv(f"../Data/{STUDY_NAME}/RawData.csv")

Number of participants in raw Qualtrics Export:

In [2]:
df.shape[0]

301

Data collection information:

In [3]:
print_collection_information(df)

'Study 4 was started on June 27, 2019 at 14:14 PM and ended on June 27, 2019 at 16:01 PM.'

Data collection started on:

In [4]:
df["Condition"] = df.dispersion.map({"same": "Equal dispersion", "higher": "Higher dispersion"}) # Nicer condition labels
df["Label_Category"] = df.manipulated.map({"florida": "Colorado", "colorado": "Florida"})

df["Memory_Avg"] = df[["Avg_Florida", "Avg_Colorado"]].fillna(0).sum(axis=1) # Reported average
df["Memory_Min"] = df[["Min_Colorado", "Avg_Colorado.1"]].fillna(0).sum(axis=1) # Reported minimum price

df["Below_True_Min"] = (df.Memory_Min < 240)*1 # Dummy coding if minimum price below true minimu price

for l in range(1, 6):
    df[f"PickLikelihood_{l}"] = df[[f'{l}_WTA_Colorado', f'{l}_WTA_Florida']].fillna(0).sum(axis=1) # Likelihood of picking each price
    
cols = ['ResponseId', 'Label_Category', 'Condition', 'Memory_Avg', 'Memory_Min', 'Below_True_Min',
        'PickLikelihood_1', 'PickLikelihood_2', 'PickLikelihood_3', 'PickLikelihood_4', 'PickLikelihood_5']

df = df[cols].rename_columns({"ResponseId": "turkid"}).clean_names() # Subsetting columns of interests

df.to_csv(f"../Data/{STUDY_NAME}/CleanData.csv", index=False)

In [5]:
# Data in long form: one line per evaluation of LTA and per participant

df_long = pd.melt(df, id_vars=["turkid", "condition", "label_category", "memory_min", "memory_avg"], 
                  value_vars=[f"picklikelihood_{i}" for i in range(1, 6)], 
                  value_name="LTA", 
                  var_name="offer")

# Mapping the offer values
df_long["offer"] = df_long.offer.apply(lambda x: x[-1]).map({"1":280, "2":260, "3":240, "4":220, "5":200})

# Evaluating consistency of preferences
is_consistent = df_long.sort_values(["turkid", "offer"]).groupby("turkid")["LTA"].apply(lambda x: all(np.diff(x)<=0))
is_consistent.name = "has_consistent_prefs"
df_long = df_long.merge(is_consistent, on="turkid")
df_long.to_csv(f"../Data/{STUDY_NAME}/LongData.csv", index=False)
print(df_long.shape[0])

1505
