In [2]:
from pathlib import Path

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [3]:
# Direct Democracy Study
# Read the Stata export of the survey into a DataFrame (one row per respondent).
data_path = Path(
    "../data/anes_specialstudy_2012_directdem_dta/anes_specialstudy_2012_directdem_stata12.dta"
)
data = pd.read_stata(data_path)

First, we need to collect all of the objective questions and find their respective correct answers. Luckily, I've already done this and compiled the information in an Excel spreadsheet we can use.

It appears that there are **three** objective (non-opinion) questions on the survey. These are only the ones I could identify, so there may be more. They are:

1. What is Medicare?
2. On which of the following does the U.S. federal government currently spend the least?
3. For how many years is a United States Senator elected – that is, how many years are there in one full term of office for a U.S. Senator?

Now we can load the column names and correct answers for each of these questions.

In [4]:
# Workbook with one row per objective question: its survey column name ("Question")
# and the numeric code of the right answer ("correct_answer").
question_info_path = Path("../output/objective_questions_list.xlsx")
question_info = pd.read_excel(io=question_info_path)
question_info.head()

Unnamed: 0,Question,correct_answer
0,pre_medicare,1
1,pre_leastsp,1
2,pre_senterm,6


In [24]:
# Peek at the first five recorded answers of every objective question to see
# how the responses are encoded in the survey data.
num_questions = question_info.shape[0]  # notebook-level name; other cells may read it
for curr_question in question_info["Question"]:
    first_answers = data[curr_question].head(5)
    print(first_answers)
    print("------------------------------")

0    1. A program run by the U.S. federal governmen...
1    1. A program run by the U.S. federal governmen...
2    1. A program run by the U.S. federal governmen...
3    1. A program run by the U.S. federal governmen...
4    1. A program run by the U.S. federal governmen...
Name: pre_medicare, dtype: category
Categories (6, object): [-9. Refused < -4. Error < 1. A program run by the U.S. federal governmen... < 2. A program run by state governments to provi... < 3. A private health insurance plan sold to ind... < 4. A private, non-profit organization that run...]
------------------------------
0           2. Medicare
1    4. Social Security
2        1. Foreign aid
3           2. Medicare
4    4. Social Security
Name: pre_leastsp, dtype: category
Categories (6, object): [-9. Refused < -4. Error < 1. Foreign aid < 2. Medicare < 3. National defense < 4. Social Security]
------------------------------
0    2
1    6
2    4
3    2
4    4
Name: pre_senterm, dtype: category
Categories (24, obje

We can see that for each question, we have the possibility that the participant refused to answer, or that their answer was recorded as an error. We'll have to filter these out.

 Also, for the two multiple choice questions, the text of the answer is included after the number id. We'll have to account for this when filtering the answers. 

In [23]:
# Select only the columns of interest: id, party affiliation, and the questions of interest.
wanted_columns = ["caseid", "pre_rptyid"] + list(question_info["Question"])
trivia_data = data[wanted_columns]

# Now let's filter out all of the erroneous and refusal answers.
# Non-answers are recognised by their numeric code ("-9. Refused", "-4. Error") instead of
# by their position in the category list, so a column that happens to lack one of them
# never loses a genuine answer.
invalid_codes = {"-9", "-4"}

pre = trivia_data.shape[0]
keep_row = pd.Series(True, index=trivia_data.index)
for column in wanted_columns:
    if column == "caseid":
        continue
    responses = trivia_data[column]
    # The text before the first "." is the response code; this also works for
    # categories stored as bare numbers.
    invalid_labels = [
        label
        for label in responses.cat.categories
        if str(label).split(".", 1)[0].strip() in invalid_codes
    ]
    keep_row &= ~responses.isin(invalid_labels)

# Copy so that columns added later are written to an independent frame rather than to a
# slice of `data` (which would raise SettingWithCopyWarning).
trivia_data = trivia_data[keep_row].copy()
post = trivia_data.shape[0]
rows_removed = pre - post
print("Rows removed: " + str(rows_removed))


Rows removed: 127


In [45]:
# Now we can add new columns on the correctness of each participant's answer.
# Work on an independent copy so the assignments below never write into a slice of
# another frame, and so the cell can be re-run safely.
trivia_data = trivia_data.copy()

for question, correct_answer in zip(question_info["Question"], question_info["correct_answer"]):
    # Answers are stored either as labelled text ("1. Foreign aid") or as bare numbers (6).
    # Reduce both to their leading numeric code and compare it exactly; a substring search
    # for "1" would also match "10. ..." or any label that merely contains that digit.
    response_codes = (
        trivia_data[question]
        .astype(str)
        .str.extract(r"^\s*(-?\d+)", expand=False)
    )
    trivia_data[question + "_correct"] = response_codes == str(int(correct_answer))

# Let's add one more column as an average score of the questions for each participant
score_columns = [question + "_correct" for question in question_info["Question"]]
trivia_data["avg_score"] = trivia_data[score_columns].mean(axis=1)

trivia_data.head()


Unnamed: 0,caseid,pre_rptyid,pre_medicare,pre_leastsp,pre_senterm,pre_medicare_correct,pre_leastsp_correct,pre_senterm_correct,avg_score
0,3001,1. Democrat,1. A program run by the U.S. federal governmen...,2. Medicare,2,True,False,False,0.333333
1,3004,2. Republican,1. A program run by the U.S. federal governmen...,4. Social Security,6,True,False,True,0.666667
2,3005,1. Democrat,1. A program run by the U.S. federal governmen...,1. Foreign aid,4,True,True,False,0.666667
3,3009,1. Democrat,1. A program run by the U.S. federal governmen...,2. Medicare,2,True,False,False,0.333333
4,3016,2. Republican,1. A program run by the U.S. federal governmen...,4. Social Security,4,True,False,False,0.333333


Great, now we have all of the information we need! So, now we can examine how the different parties did on these questions. Let's start with the average score.

In [49]:

# Distribution of each respondent's average score, split by party identification.
# Explicit labels and a title let the figure stand on its own when the notebook is skimmed.
fig = px.box(
    trivia_data,
    x="pre_rptyid",
    y="avg_score",
    points="all",
    labels={"pre_rptyid": "Party identification", "avg_score": "Average score"},
    title="Average score on the objective questions by party",
)
fig.show()

At first glance, it appears that Democrats, Republicans, and Other-party respondents all have similar scores, while the Independents may have a higher mean, since more of them have high averages. Because each respondent's average can take only four values (0, 1/3, 2/3, or 1), it's hard to see the trends in a box plot, so let's use a different plot.

In [58]:
# One histogram of average scores per party, laid out on a 2x2 grid.
# Each entry: (subplot title, text to look for in the party label, grid row, grid column).
party_panels = [
    ("Democrat", "Democrat", 1, 1),
    ("Republican", "Republican", 1, 2),
    ("Independent", "Independent", 2, 1),
    ("Other Party", "Other", 2, 2),
]

fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=tuple(title for title, _, _, _ in party_panels),
    shared_xaxes=True,
)

for _, party_text, row, col in party_panels:
    # na=False keeps the mask strictly boolean even if a party value is missing.
    in_party = trivia_data["pre_rptyid"].str.contains(party_text, regex=False, na=False)
    fig.add_trace(go.Histogram(x=trivia_data.loc[in_party, "avg_score"]), row=row, col=col)

fig.update_layout(showlegend=False, title_text="Average score distribution by party")
fig.show()

'fig1 = px.histogram(trivia_data[trivia_data["pre_rptyid"].str.contains("Democrat") ], x="avg_score")\nfig1.show()\nfig2 = px.histogram(trivia_data[trivia_data["pre_rptyid"].str.contains("Republican") ], x="avg_score")\nfig2.show()'

Was there any difference in accuracy between the questions?

In [74]:
# Share of respondents who answered each question correctly (mean of the boolean flags).
averages = trivia_data[score_columns].mean()

# One table column per question. Build the cells from the Series itself instead of
# averages[0], averages[1], averages[2]: integer positions on a label-indexed Series are
# deprecated, and this form works for any number of questions.
fig = go.Figure(data=[go.Table(
    header=dict(values=list(averages.index),
                fill_color='paleturquoise',
                align='left'),
    cells=dict(values=[[value] for value in averages.tolist()],
               fill_color='lavender',
               align='left'))
])

fig.show()

It seems that the Medicare question was the best known, while the budgeting question was the least well known. This is expected, since Medicare and the length of a senator's term stay constant and are encountered in school or advertised to the public. Meanwhile, the federal budget changes from year to year and is not well advertised to the public.

Now let's break down the accuracy by question and party.

In [79]:
# Mean of every correctness flag and of the overall score within each party.
# Aggregate only the numeric score columns: calling .mean() on the whole frame would also
# try to average the categorical answer columns, which raises TypeError on pandas >= 2.0.
summary_columns = score_columns + ["avg_score"]
party_breakdown = (
    trivia_data
    .groupby("pre_rptyid", observed=True)[summary_columns]  # skip party categories with no rows
    .mean()
)
# Defensive: make sure the non-answer categories never show up as parties.
party_breakdown = party_breakdown[~party_breakdown.index.isin(['-9. Refused', '-4. Error'])]
party_breakdown

Unnamed: 0_level_0,pre_medicare_correct,pre_leastsp_correct,pre_senterm_correct,avg_score
pre_rptyid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1. Democrat,0.864806,0.349754,0.50301,0.572523
2. Republican,0.89601,0.349902,0.536952,0.594288
3. Independent,0.865111,0.401821,0.575413,0.614115
5. Other party,0.8,0.32,0.531429,0.550476
