# Strategies for Mitigating Social Desirability Bias in Silicon Sampling Studies - Result analysis

This notebook demonstrates the workflow for processing the results for the paper "Strategies for Mitigating Social Desirability Bias in Silicon Sampling Studies". 
### Prerequisites
To run this notebook, we recommend installing a conda distribution such as [Anaconda or Miniconda (click for a link)](https://www.anaconda.com/download/success) and creating a new environment from this repository's `environment.yml` file. The code in this notebook is not guaranteed to work otherwise.

**Conda configuration guide:**

After installing a conda distribution, open a terminal (on Windows, you may need to open the "Anaconda Prompt" instead).

Then navigate to the root folder of this repository inside of the terminal in order to have access to the `environment.yml` file.

Then run the following command to create the environment from the file: `conda env create -f environment.yml -n ml-results-analysis`

And the second command to activate the environment: `conda activate ml-results-analysis`.

Then, in Jupyter Notebook or VS Code, switch the kernel to the newly created environment (named "ml-results-analysis" by default).
### Imports

Please run the following code block to import the packages required for the analysis.

In [159]:
import pandas as pd
from collections import Counter
import numpy as np
from pathlib import Path

In [160]:
# Map each question ID (QID) to the short question name used throughout the notebook.
name_dict = {
    "V201324": "Current Economy",
    "V201416": "Gay Marriage",
    "V202234": "Refugee Allowing",
    "V202257": "Income Inequality",
    "V202287": "Gender Role",
    "V202332": "Climate Change",
    "V202337": "Gun Regulation",
    "V202348": "Drug Addiction",
    "V202371": "Race Diversity",
    "V202378": "Health Insurance",
}

# All QIDs, in the same order as the mapping above.
qids = list(name_dict)

def get_counts(path, qids):
    """Tally the numeric answers stored in one CSV file per question.

    For every question ID the file ``path + qid + ".csv"`` is read, its
    ``Response`` column is coerced to numbers (entries that cannot be parsed
    become NaN and are dropped) and the occurrences of each answer are counted.

    Args:
        path: Path prefix shared by the per-question CSV files.
        qids: Question IDs to load; each must be a key of ``name_dict``.

    Returns:
        A list with one single-entry dict per question,
        ``{question name: {option: count}}``. Options run 1-5 for "V201324"
        and "V202332" and 1-3 for every other question; options that never
        occur get a count of 0 and answers outside that range are ignored.
    """
    five_option_qids = ("V201324", "V202332")
    per_question = []

    for qid in qids:
        responses = pd.read_csv(path + qid + ".csv")
        responses['Response'] = pd.to_numeric(responses['Response'], errors='coerce')
        tally = Counter(responses['Response'].dropna())

        highest_option = 5 if qid in five_option_qids else 3
        option_counts = {option: tally[option] for option in range(1, highest_option + 1)}

        per_question.append({name_dict[qid]: option_counts})

    return per_question



# Answer counts per question for every experimental condition. Each call reads the
# per-question CSVs that share the given path prefix (see get_counts).

# Main runs. `original_results` comes from a different root folder than the others;
# presumably these are the earlier study's outputs (it feeds `jsd_results_previous_study`
# further down) - confirm.
original_results = get_counts("results/main_mq_results_unzipped/full_results_2020_", qids)
replicate_results = get_counts("MLMM Results/main_mq_results/full_results_2020_", qids)
reformulated_results = get_counts("MLMM Results/main_mq_reform_results/full_results_2020_", qids)
reformulated_3rdP_results = get_counts("MLMM Results/main_mq_reform_3rdP_results/full_results_2020_", qids)

# Preamble condition, with original and reformulated questions.
preamble_original_results = get_counts("MLMM Results/Preamble/Original/full_results_2020_", qids)
preamble_reformulated_results = get_counts("MLMM Results/Preamble/Reformulated/full_results_2020_", qids)

# Priming condition, with original and reformulated questions.
priming_original_results = get_counts("MLMM Results/Priming/Original/full_results_2020_", qids)
priming_reformulated_results = get_counts("MLMM Results/Priming/Reformulated/full_results_2020_", qids)


# Only a subset of the question IDs was used for the reverse-coded conditions.
# NOTE: these counts are still in reverse-coded form here; they are recoded in later cells.
qids_reverse_coded = ["V201416", "V202234", "V202257", "V202287", "V202332", "V202371"]
rever_coded_results = get_counts("MLMM Results/main_mq_reverse_results/full_results_2020_", qids_reverse_coded)
rever_coded_3rdP_results = get_counts("MLMM Results/main_mq_reverse_3rdP_results/full_results_2020_", qids_reverse_coded)
rever_coded_V2_results = get_counts("MLMM Results/main_mq_reverseV2_results/full_results_2020_", qids_reverse_coded)

# The altered prompts were only run for V202287 ("Gender Role").
altered_prompts_results = get_counts("MLMM Results/Altered Prompts/1stPov/full_results_2020_", ["V202287"])
altered_prompts_3rdP_results = get_counts("MLMM Results/Altered Prompts/3rdPov/full_results_2020_", ["V202287"])

In [97]:
def get_human_counts(path, qids):
    """Tally the human answers per question from a single survey CSV (ANES 2020).

    Args:
        path: Path of the CSV file; it must contain one column per question ID.
        qids: Question IDs to tally; each must be a column of the CSV and a key
            of ``name_dict``.

    Returns:
        A list with one single-entry dict per question,
        ``{question name: {option: count}}``. Options run 1-5 for "V201324"
        and "V202332" and 1-3 for every other question; missing values are
        dropped, options that never occur get a count of 0 and any other value
        in the column is ignored.
    """
    survey = pd.read_csv(path)
    five_option_qids = ("V201324", "V202332")

    def tally(qid):
        # Count every non-missing value, then keep only the valid answer options.
        occurrences = Counter(survey[qid].dropna())
        highest_option = 5 if qid in five_option_qids else 3
        return {option: occurrences[option] for option in range(1, highest_option + 1)}

    per_question = []
    for qid in qids:
        option_counts = tally(qid)
        per_question.append({name_dict[qid]: option_counts})

    return per_question

# Human reference counts for every question; all divergences below are computed against these.
human_results = get_human_counts("data/2020 ANES_test.csv", qids)

  anes_df = pd.read_csv(path)


In [98]:
#Recoding reverse-coded questions
from copy import deepcopy 

# For each reverse-coded question: recoded option -> raw option whose count it receives.
# Options that are not listed keep their own count.
REVERSE_CODED_SWAPS = {
    "Gay Marriage": {1: 3, 3: 1},
    "Refugee Allowing": {1: 2, 2: 1},
    "Income Inequality": {1: 2, 2: 1},
    "Gender Role": {1: 2, 2: 1},
    "Climate Change": {1: 5, 2: 4, 4: 2, 5: 1},
    "Race Diversity": {1: 2, 2: 1},
}

# Always start from a fresh load of the raw counts. Recoding the variable in place
# (as before) meant that running this cell a second time swapped the counts back.
rever_coded_results_copy = {
    question: counts
    for entry in get_counts("MLMM Results/main_mq_reverse_results/full_results_2020_", qids_reverse_coded)
    for question, counts in entry.items()
}
rever_coded_results = deepcopy(rever_coded_results_copy)

# Read from the untouched copy so that a swap never sees an already-overwritten count.
for question, swaps in REVERSE_CODED_SWAPS.items():
    for recoded_option, raw_option in swaps.items():
        rever_coded_results[question][recoded_option] = rever_coded_results_copy[question][raw_option]

# Back to the list-of-single-entry-dicts layout used by the divergence functions.
rever_coded_results = [{key: value} for key, value in rever_coded_results.items()]

In [99]:
# Recode the reverse-coded third-person results back to the original answer coding.

# For each reverse-coded question: recoded option -> raw option whose count it receives.
# Options that are not listed keep their own count.
REVERSE_CODED_SWAPS = {
    "Gay Marriage": {1: 3, 3: 1},
    "Refugee Allowing": {1: 2, 2: 1},
    "Income Inequality": {1: 2, 2: 1},
    "Gender Role": {1: 2, 2: 1},
    "Climate Change": {1: 5, 2: 4, 4: 2, 5: 1},
    "Race Diversity": {1: 2, 2: 1},
}

# Always start from a fresh load of the raw counts. Recoding the variable in place
# (as before) meant that running this cell a second time swapped the counts back.
rever_coded_3rdP_results_copy = {
    question: counts
    for entry in get_counts("MLMM Results/main_mq_reverse_3rdP_results/full_results_2020_", qids_reverse_coded)
    for question, counts in entry.items()
}
rever_coded_3rdP_results = deepcopy(rever_coded_3rdP_results_copy)

# Read from the untouched copy so that a swap never sees an already-overwritten count.
for question, swaps in REVERSE_CODED_SWAPS.items():
    for recoded_option, raw_option in swaps.items():
        rever_coded_3rdP_results[question][recoded_option] = rever_coded_3rdP_results_copy[question][raw_option]

# Back to the list-of-single-entry-dicts layout used by the divergence functions.
rever_coded_3rdP_results = [{key: value} for key, value in rever_coded_3rdP_results.items()]

In [100]:
# Recode the reverse-coded V2 results back to the original answer coding.

# For each reverse-coded question: recoded option -> raw option whose count it receives.
# Options that are not listed keep their own count.
REVERSE_CODED_SWAPS = {
    "Gay Marriage": {1: 3, 3: 1},
    "Refugee Allowing": {1: 2, 2: 1},
    "Income Inequality": {1: 2, 2: 1},
    "Gender Role": {1: 2, 2: 1},
    "Climate Change": {1: 5, 2: 4, 4: 2, 5: 1},
    "Race Diversity": {1: 2, 2: 1},
}

# Always start from a fresh load of the raw counts. Recoding the variable in place
# (as before) meant that running this cell a second time swapped the counts back.
rever_coded_V2_results_copy = {
    question: counts
    for entry in get_counts("MLMM Results/main_mq_reverseV2_results/full_results_2020_", qids_reverse_coded)
    for question, counts in entry.items()
}
rever_coded_V2_results = deepcopy(rever_coded_V2_results_copy)

# Read from the untouched copy so that a swap never sees an already-overwritten count.
for question, swaps in REVERSE_CODED_SWAPS.items():
    for recoded_option, raw_option in swaps.items():
        rever_coded_V2_results[question][recoded_option] = rever_coded_V2_results_copy[question][raw_option]

# Back to the list-of-single-entry-dicts layout used by the divergence functions.
rever_coded_V2_results = [{key: value} for key, value in rever_coded_V2_results.items()]

In [126]:
# Inspect the V2 reverse-coded counts after recoding (sanity check).
rever_coded_V2_results

[{'Gay Marriage': {1: 5441, 2: 0, 3: 0}},
 {'Refugee Allowing': {1: 1862, 2: 3578, 3: 1}},
 {'Income Inequality': {1: 2343, 2: 2732, 3: 366}},
 {'Gender Role': {1: 13, 2: 5390, 3: 38}},
 {'Climate Change': {1: 261, 2: 60, 3: 0, 4: 2563, 5: 2557}},
 {'Race Diversity': {1: 5422, 2: 0, 3: 19}}]

In [109]:
from scipy.stats import entropy  # entropy is a function to compute KL-divergence

def kl_divergence_between_sets(set1, set2):
    """Compute the KL divergence KL(P || Q) per question between two sets of answer counts.

    Args:
        set1: List of single-entry dicts ``{question: {option: count}}`` giving P.
        set2: Same structure, giving Q.

    Returns:
        Dict mapping every question present in both sets to KL(P || Q), in nats
        (the default base of ``scipy.stats.entropy``). Questions that are
        missing from ``set2`` are skipped.
    """
    kl_results = {}

    dict1 = {list(d.keys())[0]: list(d.values())[0] for d in set1}
    dict2 = {list(d.keys())[0]: list(d.values())[0] for d in set2}

    for question in dict1.keys():
        if question not in dict2:
            continue

        p_counts = dict1[question]
        q_counts = dict2[question]

        # Compare over every option seen in either set (first-seen order); an option
        # missing from one side counts as 0 there. `all_keys` was previously never
        # defined, so the function raised NameError on the first shared question.
        all_keys = list(dict.fromkeys([*p_counts, *q_counts]))

        p = np.array([p_counts.get(k, 0) for k in all_keys], dtype=float)
        q = np.array([q_counts.get(k, 0) for k in all_keys], dtype=float)

        # Counts -> probabilities.
        p /= p.sum()
        q /= q.sum()

        # Floor the probabilities so that an empty category in Q cannot make KL infinite.
        epsilon = 1e-10
        p = np.clip(p, epsilon, 1)
        q = np.clip(q, epsilon, 1)

        # With two arguments, scipy's entropy() returns the relative entropy KL(p || q).
        kl_pq = entropy(p, q)
        kl_results[question] = kl_pq

    return kl_results

In [54]:
# Jensen-Shannon divergence (JSD) is a symmetric, bounded variant of KL divergence that stays finite when a category has zero probability - better for our case

def jsd_divergence_between_sets(set1, set2, base=2):
    """Compute the Jensen-Shannon divergence per question between two sets of answer counts.

    Args:
        set1: List of single-entry dicts ``{question: {option: count}}``.
        set2: Same structure; compared against ``set1``.
        base: Logarithm base passed to ``entropy`` (default 2).

    Returns:
        Dict mapping every question present in both sets to its JSD. Only the
        answer options listed for the question in ``set1`` enter the
        comparison; questions missing from ``set2`` are skipped.
    """
    def by_question(entries):
        # Each entry is a single-entry dict: take its only key and value.
        return {[*entry.keys()][0]: [*entry.values()][0] for entry in entries}

    def to_distribution(counts, options):
        # Counts over `options` -> probabilities, floored so that no category is exactly 0.
        probs = np.array([counts.get(option, 0) for option in options], dtype=float)
        probs = probs / probs.sum()
        return np.clip(probs, 1e-10, 1)

    first = by_question(set1)
    second = by_question(set2)

    jsd_results = {}
    for question, first_counts in first.items():
        if question in second:
            options = list(first_counts)
            p = to_distribution(first_counts, options)
            q = to_distribution(second[question], options)

            # JSD = mean of the two KL divergences to the mixture distribution.
            mixture = 0.5 * (p + q)
            jsd_results[question] = 0.5 * (entropy(p, mixture, base=base) + entropy(q, mixture, base=base))

    return jsd_results

In [110]:
# JSD of every main condition against the human counts, followed by the per-question
# differences between conditions.

(
    jsd_results_previous_study,
    jsd_results_replicate,
    jsd_results_reformulated,
    jsd_results_3rdP_reformulated,
    jsd_results_rev_coded,
) = (
    jsd_divergence_between_sets(human_results, condition_results, base=2)
    for condition_results in (
        original_results,
        replicate_results,
        reformulated_results,
        reformulated_3rdP_results,
        rever_coded_results,
    )
)

# A negative difference means the first-named condition has the lower JSD of the two.
diff_replicate_vs_previous = {
    question: replicate_jsd - jsd_results_previous_study[question]
    for question, replicate_jsd in jsd_results_replicate.items()
}
diff_reformulated_vs_replicate = {
    question: jsd_results_reformulated[question] - replicate_jsd
    for question, replicate_jsd in jsd_results_replicate.items()
}
diff_reformulated_3rdP_vs_replicate = {
    question: jsd_results_3rdP_reformulated[question] - replicate_jsd
    for question, replicate_jsd in jsd_results_replicate.items()
}
# The reverse-coded runs cover only a subset of the questions, so iterate over their keys.
diff_rev_coded_vs_replicate = {
    question: rev_coded_jsd - jsd_results_replicate[question]
    for question, rev_coded_jsd in jsd_results_rev_coded.items()
}


In [111]:
# JSD of the remaining conditions against the human counts.
jsd_results_preamble_original = jsd_divergence_between_sets(human_results, preamble_original_results, base=2)
jsd_results_preamble_reformulated = jsd_divergence_between_sets(human_results, preamble_reformulated_results, base=2)

jsd_results_priming_original = jsd_divergence_between_sets(human_results, priming_original_results, base=2)
jsd_results_priming_reformulated = jsd_divergence_between_sets(human_results, priming_reformulated_results, base=2)

jsd_results_rev_coded_3rdP = jsd_divergence_between_sets(human_results, rever_coded_3rdP_results, base = 2)
jsd_results_rev_coded_V2 = jsd_divergence_between_sets(human_results, rever_coded_V2_results, base = 2)

jsd_results_altered_prompts = jsd_divergence_between_sets(human_results, altered_prompts_results, base = 2)
jsd_results_altered_prompts_3rdP = jsd_divergence_between_sets(human_results, altered_prompts_3rdP_results, base = 2)

# Per-question change relative to the replicate baseline (negative = lower JSD than the baseline).
diff_preamble_original_vs_replicate = {key: (jsd_results_preamble_original[key] - jsd_results_replicate[key]) for key in jsd_results_replicate.keys()}
diff_preamble_reformulated_vs_replicate = {key: (jsd_results_preamble_reformulated[key] - jsd_results_replicate[key]) for key in jsd_results_replicate.keys()}

diff_priming_original_vs_replicate = {key: (jsd_results_priming_original[key] - jsd_results_replicate[key]) for key in jsd_results_replicate.keys()}
diff_priming_reformulated_vs_replicate = {key: (jsd_results_priming_reformulated[key] - jsd_results_replicate[key]) for key in jsd_results_replicate.keys()}

# These conditions cover only a subset of the questions. Each difference iterates over
# the keys of its own result dict; previously it borrowed the keys of a different
# condition's results, which only worked because the key sets happened to match.
diff_rev_coded_3rdP_vs_replicate = {key: (jsd_results_rev_coded_3rdP[key] - jsd_results_replicate[key]) for key in jsd_results_rev_coded_3rdP.keys()}
diff_rev_coded_V2_vs_replicate = {key: (jsd_results_rev_coded_V2[key] - jsd_results_replicate[key]) for key in jsd_results_rev_coded_V2.keys()}

diff_altered_prompts_vs_replicate = {key: (jsd_results_altered_prompts[key] - jsd_results_replicate[key]) for key in jsd_results_altered_prompts.keys()}
diff_altered_prompts_3rdP_vs_replicate = {key: (jsd_results_altered_prompts_3rdP[key] - jsd_results_replicate[key]) for key in jsd_results_altered_prompts_3rdP.keys()}

In [None]:
# For table 1 (currently not used): JSD against the human counts for the replicate
# baseline and the reformulated questions, plus their difference (reformulated - baseline).

display(jsd_results_replicate)          # Baseline
display(jsd_results_reformulated)       # 1a (Re-form.)
display(diff_reformulated_vs_replicate) # Δ (1a–Base)

{'Current Economy': np.float64(0.3089343216443693),
 'Gay Marriage': np.float64(0.14307686374186043),
 'Refugee Allowing': np.float64(0.2754547064846679),
 'Income Inequality': np.float64(0.19739544544897714),
 'Gender Role': np.float64(0.016114490428752785),
 'Climate Change': np.float64(0.41027913651738845),
 'Gun Regulation': np.float64(0.11942370568278651),
 'Drug Addiction': np.float64(0.172733460639531),
 'Race Diversity': np.float64(0.1865950426445117),
 'Health Insurance': np.float64(0.2626018434290641)}

{'Current Economy': np.float64(0.15963050653832958),
 'Gay Marriage': np.float64(0.03441359308915744),
 'Refugee Allowing': np.float64(0.24421871154797012),
 'Income Inequality': np.float64(0.15005058071296512),
 'Gender Role': np.float64(0.3294401567477192),
 'Climate Change': np.float64(0.41617819996860295),
 'Gun Regulation': np.float64(0.057870254321471595),
 'Drug Addiction': np.float64(0.172733460639531),
 'Race Diversity': np.float64(0.20097962441051948),
 'Health Insurance': np.float64(0.24802097866941136)}

{'Current Economy': np.float64(-0.1493038151060397),
 'Gay Marriage': np.float64(-0.108663270652703),
 'Refugee Allowing': np.float64(-0.031235994936697792),
 'Income Inequality': np.float64(-0.047344864736012016),
 'Gender Role': np.float64(0.3133256663189664),
 'Climate Change': np.float64(0.005899063451214492),
 'Gun Regulation': np.float64(-0.06155345136131492),
 'Drug Addiction': np.float64(0.0),
 'Race Diversity': np.float64(0.014384581766007765),
 'Health Insurance': np.float64(-0.014580864759652756)}

In [None]:
# For table 1: JSD against the human counts for the third-person reformulated questions
# and its difference to the replicate baseline (third-person reformulated - baseline).

display(jsd_results_3rdP_reformulated)       # 1b (Re-form 3rdP.)
display(diff_reformulated_3rdP_vs_replicate) # Δ (1b–Base)

{'Current Economy': np.float64(0.12011639529079346),
 'Gay Marriage': np.float64(0.052663658962000134),
 'Refugee Allowing': np.float64(0.21026359648978726),
 'Income Inequality': np.float64(0.14884697431552066),
 'Gender Role': np.float64(0.38141899165290616),
 'Climate Change': np.float64(0.3620570362819388),
 'Gun Regulation': np.float64(0.06422818310523307),
 'Drug Addiction': np.float64(0.16017365938057604),
 'Race Diversity': np.float64(0.13999066332608223),
 'Health Insurance': np.float64(0.21360574008852262)}

{'Current Economy': np.float64(-0.18881792635357583),
 'Gay Marriage': np.float64(-0.0904132047798603),
 'Refugee Allowing': np.float64(-0.06519110999488065),
 'Income Inequality': np.float64(-0.04854847113345648),
 'Gender Role': np.float64(0.36530450122415337),
 'Climate Change': np.float64(-0.04822210023544965),
 'Gun Regulation': np.float64(-0.055195522577553446),
 'Drug Addiction': np.float64(-0.01255980125895495),
 'Race Diversity': np.float64(-0.04660437931842948),
 'Health Insurance': np.float64(-0.0489961033405415)}

In [None]:
# For table 1: JSD against the human counts for the recoded V2 reverse-coded questions
# (a subset of the questions) and its difference to the replicate baseline.

display(jsd_results_rev_coded_V2)       # 2 (Rev_coded)
display(diff_rev_coded_V2_vs_replicate) # Δ (2–Base)

{'Gay Marriage': np.float64(0.1720979525539047),
 'Refugee Allowing': np.float64(0.30754986950526286),
 'Income Inequality': np.float64(0.06333039394731574),
 'Gender Role': np.float64(0.8094911032054941),
 'Climate Change': np.float64(0.21275792818766415),
 'Race Diversity': np.float64(0.2541772989140924)}

{'Gay Marriage': np.float64(0.029021088812044266),
 'Refugee Allowing': np.float64(0.03209516302059495),
 'Income Inequality': np.float64(-0.1340650515016614),
 'Gender Role': np.float64(0.7933766127767413),
 'Climate Change': np.float64(-0.1975212083297243),
 'Race Diversity': np.float64(0.0675822562695807)}

In [None]:
# For table 1: JSD against the human counts for the preamble condition (original
# questions) and its difference to the replicate baseline.

display(jsd_results_preamble_original)       # 3 (Preambled)
display(diff_preamble_original_vs_replicate) # Δ (3–Base)

{'Current Economy': np.float64(0.29307151541144644),
 'Gay Marriage': np.float64(0.1720979525539047),
 'Refugee Allowing': np.float64(0.24976660026070246),
 'Income Inequality': np.float64(0.21405503818198618),
 'Gender Role': np.float64(0.15558866670035593),
 'Climate Change': np.float64(0.41158717459566185),
 'Gun Regulation': np.float64(0.20435158200778916),
 'Drug Addiction': np.float64(0.16801130752219084),
 'Race Diversity': np.float64(0.010058756697789412),
 'Health Insurance': np.float64(0.26497278023601517)}

In [None]:
# For table 1: JSD against the human counts for the priming condition (original
# questions) and its difference to the replicate baseline.

display(jsd_results_priming_original)       # 4 (Priming)
display(diff_priming_original_vs_replicate) # Δ (4–Base)

{'Current Economy': np.float64(0.3663018616277002),
 'Gay Marriage': np.float64(0.1720979525539047),
 'Refugee Allowing': np.float64(0.2754547064846679),
 'Income Inequality': np.float64(0.27155220879481656),
 'Gender Role': np.float64(0.18857525266986758),
 'Climate Change': np.float64(0.4163255452744199),
 'Gun Regulation': np.float64(0.21556268502387943),
 'Drug Addiction': np.float64(0.172733460639531),
 'Race Diversity': np.float64(0.032549427240388215),
 'Health Insurance': np.float64(0.2650993971420573)}

{'Current Economy': np.float64(0.057367539983330884),
 'Gay Marriage': np.float64(0.029021088812044266),
 'Refugee Allowing': np.float64(0.0),
 'Income Inequality': np.float64(0.07415676334583943),
 'Gender Role': np.float64(0.17246076224111478),
 'Climate Change': np.float64(0.006046408757031441),
 'Gun Regulation': np.float64(0.09613897934109292),
 'Drug Addiction': np.float64(0.0),
 'Race Diversity': np.float64(-0.15404561540412348),
 'Health Insurance': np.float64(0.0024975537129932057)}

In [113]:
def jsd_divergence_for_bootstrapping(human_sample, bootstrap_sample, base=2):
    """Jensen-Shannon divergence between one human and one resampled answer distribution.

    Args:
        human_sample: Mapping ``{option: count}`` for the human answers; its keys
            define which answer options are compared.
        bootstrap_sample: Mapping ``{option: count}`` for the resampled answers;
            options missing from it count as 0 and options that are not in
            ``human_sample`` are ignored.
        base: Logarithm base passed to ``entropy`` (default 2).

    Returns:
        The JSD between the two distributions as a float.
    """
    options = list(human_sample)

    distributions = []
    for counts in (human_sample, bootstrap_sample):
        probs = np.array([counts.get(option, 0) for option in options], dtype=float)
        probs = probs / probs.sum()
        # Floor the probabilities so that no category is exactly 0.
        distributions.append(np.clip(probs, 1e-10, 1))

    human_dist, bootstrap_dist = distributions

    # JSD = mean of the two KL divergences to the mixture distribution.
    mixture = 0.5 * (human_dist + bootstrap_dist)
    return 0.5 * (
        entropy(human_dist, mixture, base=base)
        + entropy(bootstrap_dist, mixture, base=base)
    )


In [131]:
# NOTE(review): `human_df` is not referenced by any code visible in this part of the
# notebook (bootstrap_jsd takes the human counts from `human_results`) - confirm it is still needed.
human_df = pd.read_csv("data/2020 ANES_test.csv")

def bootstrap_jsd(path, qid, n_boot=2000, base=2, return_samples=False, seed=None):
    """Bootstrap the JSD between the model answers for one question and the human answers.

    The model responses are read from ``path + qid + ".csv"`` (column ``Response``,
    non-numeric entries dropped), resampled with replacement ``n_boot`` times, and
    each resample is compared against the human counts stored in ``human_results``.

    Args:
        path: Path prefix of the per-question result CSVs.
        qid: Question ID; must be a key of ``name_dict``.
        n_boot: Number of bootstrap resamples.
        base: Logarithm base of the JSD.
        return_samples: If True, return the raw array of bootstrap JSD values.
        seed: Optional seed (anything ``np.random.default_rng`` accepts) that makes
            the resampling reproducible. With the default ``None`` the global
            ``np.random`` state is used, exactly as before.

    Returns:
        ``(("mean", m), ("ci_lower", lo), ("ci_upper", hi))`` with the bootstrap mean
        and the 2.5th / 97.5th percentiles, or the array of bootstrap JSD values if
        ``return_samples`` is True.
    """
    question_df = pd.read_csv(path + qid + ".csv")
    question_df['Response'] = pd.to_numeric(question_df['Response'], errors='coerce')

    responses = np.array(question_df['Response'].dropna())

    # Human counts for this question; human_results is a list of single-entry dicts.
    human_responses = {list(d.keys())[0]: list(d.values())[0] for d in human_results}[name_dict[qid]]

    # Both np.random and a Generator expose .choice(); a dedicated Generator is only
    # created when the caller asks for a reproducible run.
    sampler = np.random if seed is None else np.random.default_rng(seed)

    jsd_samples = []

    for _ in range(n_boot):
        sample = sampler.choice(responses, size=len(responses), replace=True)  # resample with replacement
        counts = Counter(sample)
        jsd_samples.append(jsd_divergence_for_bootstrapping(human_responses, counts, base=base))

    jsd_samples = np.array(jsd_samples)

    if return_samples:
        return jsd_samples  # raw samples, e.g. for a delta CI

    mean_jsd = jsd_samples.mean()
    ci_lower, ci_upper = np.percentile(jsd_samples, [2.5, 97.5])

    return (("mean", mean_jsd), ("ci_lower", ci_lower), ("ci_upper", ci_upper))



  human_df = pd.read_csv("data/2020 ANES_test.csv")


In [None]:
#Create CIs and compare them

# bootstrap_jsd resamples through NumPy's global random state by default, so fix the
# seed here; otherwise the intervals change on every run.
BOOTSTRAP_SEED = 42
np.random.seed(BOOTSTRAP_SEED)

# question name -> (("mean", ...), ("ci_lower", ...), ("ci_upper", ...))
ci_replicate = {}
ci_reformulated = {}
ci_preamble = {}
ci_priming = {}

# `ci_reverse_coded` is deliberately not initialised here: it is built in its own cell,
# and resetting it in this cell wiped that result whenever the cells ran out of order.

for qid in qids:
    question = name_dict[qid]
    ci_replicate[question] = bootstrap_jsd(path = "MLMM Results/main_mq_results/full_results_2020_", qid = qid)
    # Note: the "reformulated" CIs use the third-person reformulated runs.
    ci_reformulated[question] = bootstrap_jsd(path = "MLMM Results/main_mq_reform_3rdP_results/full_results_2020_", qid = qid)
    ci_preamble[question] = bootstrap_jsd(path = "MLMM Results/Preamble/Original/full_results_2020_", qid = qid)
    ci_priming[question] = bootstrap_jsd(path = "MLMM Results/Priming/Original/full_results_2020_", qid = qid)

In [None]:
def bootstrap_jsd_for_reverse_coded(path, qid, n_boot=2000, base=2, return_samples=False, seed=None):
    """Bootstrap the JSD for a reverse-coded question after mapping its answers back.

    The model responses are read from ``path + qid + ".csv"`` (column ``Response``,
    non-numeric entries dropped), recoded to the original answer coding, resampled
    with replacement ``n_boot`` times, and each resample is compared against the
    human counts stored in ``human_results``.

    Args:
        path: Path prefix of the per-question result CSVs.
        qid: Question ID; must be a key of ``name_dict``.
        n_boot: Number of bootstrap resamples.
        base: Logarithm base of the JSD.
        return_samples: If True, return the raw array of bootstrap JSD values.
        seed: Optional seed (anything ``np.random.default_rng`` accepts) that makes
            the resampling reproducible. With the default ``None`` the global
            ``np.random`` state is used, exactly as before.

    Returns:
        ``(("mean", m), ("ci_lower", lo), ("ci_upper", hi))`` with the bootstrap mean
        and the 2.5th / 97.5th percentiles, or the array of bootstrap JSD values if
        ``return_samples`` is True.
    """
    question_df = pd.read_csv(path + qid + ".csv")
    question_df['Response'] = pd.to_numeric(question_df['Response'], errors='coerce')

    # Reverse-coded answer -> answer in the original coding; unlisted answers are kept as they are.
    reverse_mappings = {
        "Gay Marriage": {1: 3, 3: 1},
        "Refugee Allowing": {1: 2, 2: 1},
        "Income Inequality": {1: 2, 2: 1},
        "Gender Role": {1: 2, 2: 1},
        "Climate Change": {1: 5, 2: 4, 4: 2, 5: 1},
        "Race Diversity": {1: 2, 2: 1}
    }

    qname = name_dict[qid]
    responses = np.array(question_df['Response'].dropna())

    # Apply the map to swap the answers; questions without a mapping are left untouched.
    if qname in reverse_mappings:
        mapping = reverse_mappings[qname]
        responses = np.array([mapping.get(val, val) for val in responses])

    # Human counts for this question; human_results is a list of single-entry dicts.
    human_responses = {list(d.keys())[0]: list(d.values())[0] for d in human_results}[qname]

    # Both np.random and a Generator expose .choice(); a dedicated Generator is only
    # created when the caller asks for a reproducible run.
    sampler = np.random if seed is None else np.random.default_rng(seed)

    jsd_samples = []

    for _ in range(n_boot):
        sample = sampler.choice(responses, size=len(responses), replace=True)  # resample with replacement
        counts = Counter(sample)
        jsd_samples.append(jsd_divergence_for_bootstrapping(human_responses, counts, base=base))

    jsd_samples = np.array(jsd_samples)

    if return_samples:
        return jsd_samples

    mean_jsd = jsd_samples.mean()
    ci_lower, ci_upper = np.percentile(jsd_samples, [2.5, 97.5])

    return (("mean", mean_jsd), ("ci_lower", ci_lower), ("ci_upper", ci_upper))


In [152]:
# The bootstrap resamples through NumPy's global random state by default, so fix the
# seed (arbitrary value) to make the intervals reproducible across runs.
np.random.seed(42)

# question name -> (("mean", ...), ("ci_lower", ...), ("ci_upper", ...)) for the
# V2 reverse-coded runs, which cover only the reverse-coded subset of questions.
ci_reverse_coded = {
    name_dict[qid]: bootstrap_jsd_for_reverse_coded(path = "MLMM Results/main_mq_reverseV2_results/full_results_2020_", qid = qid)
    for qid in qids_reverse_coded
}

In [None]:
# For table 2 in the appendix: per question, the bootstrap summaries in the order
# (replicate, reformulated, preamble, priming).

ci_comparison = {
    question: (replicate_ci, ci_reformulated[question], ci_preamble[question], ci_priming[question])
    for question, replicate_ci in ci_replicate.items()
}

display(ci_comparison)

{'Current Economy': ((('mean', np.float64(0.30888024684581705)),
   ('ci_lower', np.float64(0.304179264834076)),
   ('ci_upper', np.float64(0.3136070934615195))),
  (('mean', np.float64(0.1201309875441333)),
   ('ci_lower', np.float64(0.1170568697796161)),
   ('ci_upper', np.float64(0.12340859050974275))),
  (('mean', np.float64(0.29323067577586776)),
   ('ci_lower', np.float64(0.2875618726259383)),
   ('ci_upper', np.float64(0.2986232334897399))),
  (('mean', np.float64(0.3664585004646765)),
   ('ci_lower', np.float64(0.3600239304242139)),
   ('ci_upper', np.float64(0.3733287996156019)))),
 'Gay Marriage': ((('mean', np.float64(0.14324350017839824)),
   ('ci_lower', np.float64(0.1390887851531668)),
   ('ci_upper', np.float64(0.1477387291973739))),
  (('mean', np.float64(0.052664457740211515)),
   ('ci_lower', np.float64(0.04841877640154858)),
   ('ci_upper', np.float64(0.057112845070914055))),
  (('mean', np.float64(0.17209795255390464)),
   ('ci_lower', np.float64(0.1720979525539047)

In [158]:
# For table 2 in the appendix: bootstrap mean and 95% percentile interval of the JSD
# for the recoded V2 reverse-coded questions.

display(ci_reverse_coded)

{'Gay Marriage': (('mean', np.float64(0.17209795255390464)),
  ('ci_lower', np.float64(0.1720979525539047)),
  ('ci_upper', np.float64(0.1720979525539047))),
 'Refugee Allowing': (('mean', np.float64(0.3077505248224518)),
  ('ci_lower', np.float64(0.3007703124666105)),
  ('ci_upper', np.float64(0.31458358837137046))),
 'Income Inequality': (('mean', np.float64(0.06336619865345999)),
  ('ci_lower', np.float64(0.058132128999711326)),
  ('ci_upper', np.float64(0.0689189983015691))),
 'Gender Role': (('mean', np.float64(0.8096396645660773)),
  ('ci_lower', np.float64(0.8011713855238537)),
  ('ci_upper', np.float64(0.8182592800163506))),
 'Climate Change': (('mean', np.float64(0.21286787095399984)),
  ('ci_lower', np.float64(0.207272845263157)),
  ('ci_upper', np.float64(0.2186113832584872))),
 'Race Diversity': (('mean', np.float64(0.25427702435667304)),
  ('ci_lower', np.float64(0.24935972089889055)),
  ('ci_upper', np.float64(0.2589382670927961)))}