In [95]:
import pandas as pd
from collections import Counter
import numpy as np
from pathlib import Path

In [96]:
# Map every ANES question ID (QID) to the short topic name used as a label
# throughout the notebook.
name_dict = {
    "V201324": "Current Economy",
    "V201416": "Gay Marriage",
    "V202234": "Refugee Allowing",
    "V202257": "Income Inequality",
    "V202287": "Gender Role",
    "V202332": "Climate Change",
    "V202337": "Gun Regulation",
    "V202348": "Drug Addiction",
    "V202371": "Race Diversity",
    "V202378": "Health Insurance",
}

# Question IDs in analysis order (the insertion order of the mapping above).
qids = list(name_dict)

def get_counts(path, qids):
    """Tally the answer codes of one result set, question by question.

    For each question ID the file ``path + qid + ".csv"`` is read and its
    ``Response`` column is converted to numbers; entries that cannot be
    parsed become NaN and are dropped before counting.

    Args:
        path: Path prefix that precedes the question ID in the file name.
        qids: Iterable of question IDs; each must be a key of ``name_dict``.

    Returns:
        A list (in the order of ``qids``) of single-entry dicts
        ``{question name: {answer code: count}}``. Codes 1-5 are reported for
        "V201324" and "V202332", codes 1-3 for every other question; any
        other response value is ignored.
    """
    five_option_qids = ("V201324", "V202332")
    value_list = []

    for qid in qids:
        frame = pd.read_csv(path + qid + ".csv")
        numeric_responses = pd.to_numeric(frame['Response'], errors='coerce')
        tallies = Counter(numeric_responses.dropna())

        highest_code = 5 if qid in five_option_qids else 3
        per_code = {code: tallies[code] for code in range(1, highest_code + 1)}

        value_list.append({name_dict[qid]: per_code})

    return value_list



# Main question set: previous study, our replication, and the reformulated prompts.
original_results = get_counts(path="results/main_mq_results_unzipped/full_results_2020_", qids=qids)
replicate_results = get_counts(path="MLMM Results/main_mq_results/full_results_2020_", qids=qids)
reformulated_results = get_counts(path="MLMM Results/main_mq_reform_results/full_results_2020_", qids=qids)
reformulated_3rdP_results = get_counts(path="MLMM Results/main_mq_reform_3rdP_results/full_results_2020_", qids=qids)

# Preamble condition.
preamble_original_results = get_counts(path="MLMM Results/Preamble/Original/full_results_2020_", qids=qids)
preamble_reformulated_results = get_counts(path="MLMM Results/Preamble/Reformulated/full_results_2020_", qids=qids)

# Priming condition.
priming_original_results = get_counts(path="MLMM Results/Priming/Original/full_results_2020_", qids=qids)
priming_reformulated_results = get_counts(path="MLMM Results/Priming/Reformulated/full_results_2020_", qids=qids)


# Only a subset of the question IDs was run with reverse-coded answer options.
qids_reverse_coded = ["V201416", "V202234", "V202257", "V202287", "V202332", "V202371"]
rever_coded_results = get_counts(path="MLMM Results/main_mq_reverse_results/full_results_2020_", qids=qids_reverse_coded)
rever_coded_3rdP_results = get_counts(path="MLMM Results/main_mq_reverse_3rdP_results/full_results_2020_", qids=qids_reverse_coded)
rever_coded_V2_results = get_counts(path="MLMM Results/main_mq_reverseV2_results/full_results_2020_", qids=qids_reverse_coded)

# The altered prompts cover a single question, V202287 ("Gender Role").
altered_prompts_results = get_counts(path="MLMM Results/Altered Prompts/1stPov/full_results_2020_", qids=["V202287"])
altered_prompts_3rdP_results = get_counts(path="MLMM Results/Altered Prompts/3rdPov/full_results_2020_", qids=["V202287"])

In [97]:
def get_human_counts(path, qids):
    """Tally the ANES 2020 human answers for each question.

    Args:
        path: Path of the ANES CSV file; it must contain one column per
            question ID in ``qids``.
        qids: Iterable of question IDs; each must be a key of ``name_dict``.

    Returns:
        A list (in the order of ``qids``) of single-entry dicts
        ``{question name: {answer code: count}}``. Codes 1-5 are reported for
        "V201324" and "V202332", codes 1-3 for every other question; any
        other value in the column is ignored.
    """
    five_option_qids = ("V201324", "V202332")

    anes_df = pd.read_csv(path)

    human_values = []

    for qid in qids:
        # Coerce to numbers the same way get_counts does for the model output.
        # Without this, a column that pandas loaded with mixed types would hold
        # string codes such as "1", which the integer lookups below never match,
        # so those answers would silently be left out of the counts.
        # NOTE(review): whether any of the question columns is actually
        # mixed-type in the ANES file cannot be seen from here.
        numeric_answers = pd.to_numeric(anes_df[qid], errors='coerce').dropna()
        human_counts = Counter(numeric_answers)

        highest_code = 5 if qid in five_option_qids else 3
        answers = {code: human_counts[code] for code in range(1, highest_code + 1)}

        human_values.append({name_dict[qid]: answers})

    return human_values

# Human reference distribution: ANES 2020 answer counts for every analysed question.
human_results = get_human_counts(path="data/2020 ANES_test.csv", qids=qids)

  anes_df = pd.read_csv(path)


In [98]:
#Recoding reverse-coded questions
from copy import deepcopy

# Flatten the list of single-entry dicts into one {question: counts} dict and
# keep an untouched snapshot to read the original counts from.
# NOTE: this cell overwrites rever_coded_results, so running it a second time
# swaps the counts back again.
rever_coded_results = dict(list(entry.items())[0] for entry in rever_coded_results)
rever_coded_results_copy = deepcopy(rever_coded_results)

# Per question: answer code that is overwritten -> code in the snapshot whose
# count it receives. Codes that are not listed keep their count.
for _question, _code_map in (
    ("Gay Marriage", {1: 3, 3: 1}),
    ("Refugee Allowing", {1: 2, 2: 1}),
    ("Income Inequality", {1: 2, 2: 1}),
    ("Gender Role", {1: 2, 2: 1}),
    ("Climate Change", {1: 5, 2: 4, 4: 2, 5: 1}),
    ("Race Diversity", {1: 2, 2: 1}),
):
    for _target_code, _source_code in _code_map.items():
        rever_coded_results[_question][_target_code] = rever_coded_results_copy[_question][_source_code]

# Back to the list-of-single-entry-dicts layout used by the other result sets.
rever_coded_results = [{name: counts} for name, counts in rever_coded_results.items()]

In [99]:
# Recode the reverse-coded 3rd-person results.
# Flatten the list of single-entry dicts into one {question: counts} dict and
# keep an untouched snapshot to read the original counts from.
# NOTE: this cell overwrites rever_coded_3rdP_results, so running it a second
# time swaps the counts back again.
rever_coded_3rdP_results = dict(list(entry.items())[0] for entry in rever_coded_3rdP_results)
rever_coded_3rdP_results_copy = deepcopy(rever_coded_3rdP_results)

# Per question: answer code that is overwritten -> code in the snapshot whose
# count it receives. Codes that are not listed keep their count.
for _question, _code_map in (
    ("Gay Marriage", {1: 3, 3: 1}),
    ("Refugee Allowing", {1: 2, 2: 1}),
    ("Income Inequality", {1: 2, 2: 1}),
    ("Gender Role", {1: 2, 2: 1}),
    ("Climate Change", {1: 5, 2: 4, 4: 2, 5: 1}),
    ("Race Diversity", {1: 2, 2: 1}),
):
    for _target_code, _source_code in _code_map.items():
        rever_coded_3rdP_results[_question][_target_code] = rever_coded_3rdP_results_copy[_question][_source_code]

# Back to the list-of-single-entry-dicts layout used by the other result sets.
rever_coded_3rdP_results = [{name: counts} for name, counts in rever_coded_3rdP_results.items()]

In [100]:
# Recode the reverse-coded V2 results.
# Flatten the list of single-entry dicts into one {question: counts} dict and
# keep an untouched snapshot to read the original counts from.
# NOTE: this cell overwrites rever_coded_V2_results, so running it a second
# time swaps the counts back again.
rever_coded_V2_results = dict(list(entry.items())[0] for entry in rever_coded_V2_results)
rever_coded_V2_results_copy = deepcopy(rever_coded_V2_results)

# Per question: answer code that is overwritten -> code in the snapshot whose
# count it receives. Codes that are not listed keep their count.
for _question, _code_map in (
    ("Gay Marriage", {1: 3, 3: 1}),
    ("Refugee Allowing", {1: 2, 2: 1}),
    ("Income Inequality", {1: 2, 2: 1}),
    ("Gender Role", {1: 2, 2: 1}),
    ("Climate Change", {1: 5, 2: 4, 4: 2, 5: 1}),
    ("Race Diversity", {1: 2, 2: 1}),
):
    for _target_code, _source_code in _code_map.items():
        rever_coded_V2_results[_question][_target_code] = rever_coded_V2_results_copy[_question][_source_code]

# Back to the list-of-single-entry-dicts layout used by the other result sets.
rever_coded_V2_results = [{name: counts} for name, counts in rever_coded_V2_results.items()]

In [108]:
rever_coded_V2_results

[{'Gay Marriage': {1: 5441, 2: 0, 3: 0}},
 {'Refugee Allowing': {1: 1862, 2: 3578, 3: 1}},
 {'Income Inequality': {1: 2343, 2: 2732, 3: 366}},
 {'Gender Role': {1: 13, 2: 5390, 3: 38}},
 {'Climate Change': {1: 261, 2: 60, 3: 0, 4: 2563, 5: 2557}},
 {'Race Diversity': {1: 5422, 2: 0, 3: 19}}]

In [109]:
from scipy.stats import entropy  # entropy is a function to compute KL-divergence

def kl_divergence_between_sets(set1, set2):
    """Compute the KL divergence KL(P || Q) for every question shared by two result sets.

    Args:
        set1: List of single-entry dicts ``{question: {answer code: count}}``;
            its counts define P.
        set2: Same layout; its counts define Q.

    Returns:
        Dict mapping each question that occurs in both sets to KL(P || Q),
        computed with ``scipy.stats.entropy`` and its default natural
        logarithm. A question whose counts sum to zero in either set maps
        to NaN.
    """
    kl_results = {}

    dict1 = {list(d.keys())[0]: list(d.values())[0] for d in set1}
    dict2 = {list(d.keys())[0]: list(d.values())[0] for d in set2}

    for question, p_counts in dict1.items():
        if question not in dict2:
            continue

        q_counts = dict2[question]

        # Align both count vectors on the union of answer codes (first-seen
        # order). This name was previously used without being defined, which
        # made every call with a shared question fail with a NameError.
        all_keys = list(dict.fromkeys([*p_counts, *q_counts]))

        p = np.array([p_counts.get(k, 0) for k in all_keys], dtype=float)
        q = np.array([q_counts.get(k, 0) for k in all_keys], dtype=float)

        p_total = p.sum()
        q_total = q.sum()
        if p_total == 0 or q_total == 0:
            # No observations on one side: the distribution is undefined.
            kl_results[question] = np.nan
            continue

        p /= p_total
        q /= q_total

        # Floor zero probabilities so the log ratio stays finite.
        epsilon = 1e-10
        p = np.clip(p, epsilon, 1)
        q = np.clip(q, epsilon, 1)

        kl_results[question] = entropy(p, q)

    return kl_results

In [54]:
#Jensen-Shannon divergence (JSD) is a symmetric, bounded (always finite) variant of KL-divergence - better for our case

def jsd_divergence_between_sets(set1, set2, base=2):
    """Compute the Jensen-Shannon divergence for every question shared by two result sets.

    Args:
        set1: List of single-entry dicts ``{question: {answer code: count}}``.
            Its answer codes define which categories are compared.
        set2: Same layout. Codes that are missing here count as 0; codes that
            appear only here are ignored.
        base: Logarithm base handed to ``scipy.stats.entropy``.

    Returns:
        Dict mapping each question found in both sets to its JSD, in the
        order the questions appear in ``set1``.
    """
    reference = dict(list(entry.items())[0] for entry in set1)
    candidate = dict(list(entry.items())[0] for entry in set2)

    floor = 1e-10  # keeps zero probabilities away from log(0)
    jsd_results = {}

    for question, reference_counts in reference.items():
        if question in candidate:
            candidate_counts = candidate[question]
            codes = list(reference_counts)

            p = np.array([reference_counts.get(code, 0) for code in codes], dtype=float)
            q = np.array([candidate_counts.get(code, 0) for code in codes], dtype=float)

            # Counts -> probabilities, then floor the zeros.
            p = np.clip(p / p.sum(), floor, 1)
            q = np.clip(q / q.sum(), floor, 1)

            # JSD is the mean KL divergence of p and q from their midpoint m.
            m = 0.5 * (p + q)
            jsd_results[question] = 0.5 * (entropy(p, m, base=base) + entropy(q, m, base=base))

    return jsd_results

In [110]:
#Get all the results and divergences

# JSD (base 2) between the human answer distribution and each model run, per question.
jsd_results_previous_study = jsd_divergence_between_sets(set1=human_results, set2=original_results, base=2)
jsd_results_replicate = jsd_divergence_between_sets(set1=human_results, set2=replicate_results, base=2)
jsd_results_reformulated = jsd_divergence_between_sets(set1=human_results, set2=reformulated_results, base=2)
jsd_results_3rdP_reformulated = jsd_divergence_between_sets(set1=human_results, set2=reformulated_3rdP_results, base=2)
jsd_results_rev_coded = jsd_divergence_between_sets(set1=human_results, set2=rever_coded_results, base=2)


# Per-question change in JSD against a baseline run; a positive value means a
# larger divergence from the human answers than the baseline.
diff_replicate_vs_previous = {
    question: jsd_results_replicate[question] - jsd_results_previous_study[question]
    for question in jsd_results_replicate
}
diff_reformulated_vs_replicate = {
    question: jsd_results_reformulated[question] - jsd_results_replicate[question]
    for question in jsd_results_replicate
}
diff_reformulated_3rdP_vs_replicate = {
    question: jsd_results_3rdP_reformulated[question] - jsd_results_replicate[question]
    for question in jsd_results_replicate
}
diff_rev_coded_vs_replicate = {
    question: jsd_results_rev_coded[question] - jsd_results_replicate[question]
    for question in jsd_results_rev_coded
}


In [111]:
# JSD (base 2) between the human answer distribution and each additional
# condition, per question.
jsd_results_preamble_original = jsd_divergence_between_sets(human_results, preamble_original_results, base=2)
jsd_results_preamble_reformulated = jsd_divergence_between_sets(human_results, preamble_reformulated_results, base=2)

jsd_results_priming_original = jsd_divergence_between_sets(human_results, priming_original_results, base=2)
jsd_results_priming_reformulated = jsd_divergence_between_sets(human_results, priming_reformulated_results, base=2)

jsd_results_rev_coded_3rdP = jsd_divergence_between_sets(human_results, rever_coded_3rdP_results, base=2)
jsd_results_rev_coded_V2 = jsd_divergence_between_sets(human_results, rever_coded_V2_results, base=2)

jsd_results_altered_prompts = jsd_divergence_between_sets(human_results, altered_prompts_results, base=2)
jsd_results_altered_prompts_3rdP = jsd_divergence_between_sets(human_results, altered_prompts_3rdP_results, base=2)

# Per-question change in JSD against the replication run; a positive value
# means a larger divergence from the human answers than the replication.
diff_preamble_original_vs_replicate = {key: (jsd_results_preamble_original[key] - jsd_results_replicate[key]) for key in jsd_results_replicate.keys()}
diff_preamble_reformulated_vs_replicate = {key: (jsd_results_preamble_reformulated[key] - jsd_results_replicate[key]) for key in jsd_results_replicate.keys()}

diff_priming_original_vs_replicate = {key: (jsd_results_priming_original[key] - jsd_results_replicate[key]) for key in jsd_results_replicate.keys()}
diff_priming_reformulated_vs_replicate = {key: (jsd_results_priming_reformulated[key] - jsd_results_replicate[key]) for key in jsd_results_replicate.keys()}

# The conditions below cover only a subset of the questions, so each diff
# iterates over the keys of its own result dict rather than over the keys of
# a different run's result.
diff_rev_coded_3rdP_vs_replicate = {key: (jsd_results_rev_coded_3rdP[key] - jsd_results_replicate[key]) for key in jsd_results_rev_coded_3rdP.keys()}
diff_rev_coded_V2_vs_replicate = {key: (jsd_results_rev_coded_V2[key] - jsd_results_replicate[key]) for key in jsd_results_rev_coded_V2.keys()}

diff_altered_prompts_vs_replicate = {key: (jsd_results_altered_prompts[key] - jsd_results_replicate[key]) for key in jsd_results_altered_prompts.keys()}
diff_altered_prompts_3rdP_vs_replicate = {key: (jsd_results_altered_prompts_3rdP[key] - jsd_results_replicate[key]) for key in jsd_results_altered_prompts_3rdP.keys()}

In [112]:
jsd_results_rev_coded

{'Gay Marriage': np.float64(0.1720979525539047),
 'Refugee Allowing': np.float64(0.20359388929756037),
 'Income Inequality': np.float64(0.3198374015458996),
 'Gender Role': np.float64(0.8470767946162353),
 'Climate Change': np.float64(0.19529426926374827),
 'Race Diversity': np.float64(0.21868819405883255)}

In [113]:
def jsd_divergence_for_bootstrapping(human_sample, bootstrap_sample, base=2):
    """Jensen-Shannon divergence between one pair of answer-count mappings.

    Args:
        human_sample: Mapping ``{answer code: count}``; its keys define which
            categories are compared.
        bootstrap_sample: Mapping ``{answer code: count}`` (e.g. a Counter).
            Codes missing here count as 0; codes not in ``human_sample`` are
            ignored.
        base: Logarithm base handed to ``scipy.stats.entropy``.

    Returns:
        The JSD of the two normalised distributions.
    """
    floor = 1e-10  # keeps zero probabilities away from log(0)

    def _as_distribution(counts):
        # Counts over the human categories -> floored probability vector.
        vector = np.array([counts.get(code, 0) for code in human_sample], dtype=float)
        vector /= vector.sum()
        return np.clip(vector, floor, 1)

    p = _as_distribution(human_sample)
    q = _as_distribution(bootstrap_sample)

    # JSD is the mean KL divergence of p and q from their midpoint m.
    m = 0.5 * (p + q)
    return 0.5 * (entropy(p, m, base=base) + entropy(q, m, base=base))


In [114]:
# Raw ANES 2020 table.
# NOTE(review): nothing in the cells visible here reads human_df (bootstrap_jsd
# uses human_results instead) - confirm it is still needed further down.
human_df = pd.read_csv("data/2020 ANES_test.csv")

def bootstrap_jsd(path, qid, n_boot=2000, base=2, seed=None):
    """Bootstrap the JSD between the human answers and one model result file.

    The model responses in ``path + qid + ".csv"`` are resampled with
    replacement ``n_boot`` times; each resample is compared with the human
    counts for the same question (taken from the module-level
    ``human_results``).

    Args:
        path: Path prefix that precedes the question ID in the file name.
        qid: Question ID; must be a key of ``name_dict``.
        n_boot: Number of bootstrap resamples.
        base: Logarithm base used for the JSD.
        seed: Optional seed for a dedicated random generator, which makes the
            interval reproducible. With the default ``None`` the global
            ``np.random`` state is used, exactly as before.

    Returns:
        ``(("mean", mean), ("ci_lower", lower), ("ci_upper", upper))`` where
        the bounds are the 2.5th and 97.5th percentiles of the bootstrapped
        JSD values.
    """
    question_df = pd.read_csv(path + qid + ".csv")
    question_df['Response'] = pd.to_numeric(question_df['Response'], errors='coerce')

    responses = np.array(question_df['Response'].dropna())

    # Human answer counts for this question; looked up once, not per resample.
    human_by_question = dict(list(d.items())[0] for d in human_results)
    human_responses = human_by_question[name_dict[qid]]

    # Seeded runs draw from their own generator so they do not depend on, or
    # disturb, the global random state.
    if seed is None:
        draw = np.random.choice
    else:
        draw = np.random.default_rng(seed).choice

    jsd_samples = []

    for _ in range(n_boot):
        sample = draw(responses, size=len(responses), replace=True)  # sample with replacement

        counts = Counter(sample)

        # JSD of this resample against the human distribution.
        jsd_samples.append(jsd_divergence_for_bootstrapping(human_responses, counts, base=base))

    jsd_samples = np.array(jsd_samples)
    mean_jsd = jsd_samples.mean()
    ci_lower, ci_upper = np.percentile(jsd_samples, [2.5, 97.5])

    return (("mean", mean_jsd), ("ci_lower", ci_lower), ("ci_upper", ci_upper))



  human_df = pd.read_csv("data/2020 ANES_test.csv")


In [115]:
#Create CIs and compare them

# {question name: (("mean", ...), ("ci_lower", ...), ("ci_upper", ...))} for each run.
ci_replicate = {}
ci_reformulated = {}

for qid in qids:
    # The two runs are bootstrapped alternately, question by question.
    ci_replicate[name_dict[qid]] = bootstrap_jsd(path="MLMM Results/main_mq_results/full_results_2020_", qid=qid)
    # NOTE(review): ci_reformulated is built from the 3rd-person reformulated
    # run (main_mq_reform_3rdP_results), not from main_mq_reform_results -
    # confirm this is intended.
    ci_reformulated[name_dict[qid]] = bootstrap_jsd(path="MLMM Results/main_mq_reform_3rdP_results/full_results_2020_", qid=qid)

# Both summaries side by side, per question.
ci_comparison = {name: (ci_replicate[name], ci_reformulated[name]) for name in ci_replicate}

In [27]:
ci_replicate

{'Current Economy': (('mean', np.float64(0.3088875053766295)),
  ('ci_lower', np.float64(0.30409177164661044)),
  ('ci_upper', np.float64(0.3139900150942718))),
 'Gay Marriage': (('mean', np.float64(0.14318711910201562)),
  ('ci_lower', np.float64(0.13894765233103717)),
  ('ci_upper', np.float64(0.14748029754140993))),
 'Refugee Allowing': (('mean', np.float64(0.2755686625475066)),
  ('ci_lower', np.float64(0.2734773726296294)),
  ('ci_upper', np.float64(0.2771770190299979))),
 'Income Inequality': (('mean', np.float64(0.19750155770094405)),
  ('ci_lower', np.float64(0.18905304817860247)),
  ('ci_upper', np.float64(0.2060638724056752))),
 'Gender Role': (('mean', np.float64(0.01614773446196469)),
  ('ci_lower', np.float64(0.01334672383399805)),
  ('ci_upper', np.float64(0.019126957467939253))),
 'Climate Change': (('mean', np.float64(0.41031665563973285)),
  ('ci_lower', np.float64(0.40530775822328047)),
  ('ci_upper', np.float64(0.4154004570939942))),
 'Gun Regulation': (('mean', np.f