In [13]:
import pandas as pd
from df_transform import *
from statistical_tests import *

### Loading in and cleaning the dataset

In [30]:
survey = pd.read_csv("../data/cookie_survey_responses.csv")
survey.drop([c for c in survey.columns[:2]], axis=1, inplace=True)
new_df, question_map = rename_columns(survey)
new_df = new_df.drop(0)
new_df["Question 1"] = new_df["Question 1"].astype(int)
answer_encs = encode_answers(new_df)
N = len(new_df)  # number of observations
print(new_df.shape)
print(new_df.columns)
new_df.head(5)

(22, 22)
Index(['Question 1', 'Question 2', 'Question 3', 'Question 4', 'Question 5',
       'Question 6', 'Question 7', 'Question 8', 'Question 9', 'Question 10',
       'Question 11', 'Question 12', 'Question 13', 'Question 14',
       'Question 15', 'Question 16', 'Question 17', 'Question 18',
       'Question 19', 'Question 20', 'Question 21', 'Question 22'],
      dtype='object')


Unnamed: 0,Question 1,Question 2,Question 3,Question 4,Question 5,Question 6,Question 7,Question 8,Question 9,Question 10,...,Question 13,Question 14,Question 15,Question 16,Question 17,Question 18,Question 19,Question 20,Question 21,Question 22
1,20,Economics & Data Science,Safari,2,0,2,4,2,1,1,...,4,Not sure,Yes,"Generally shopping/marketplace websites, e.g. ...",No,,0,1,1,1
2,21,Linguistics,Chrome,2,0,2,4,0,1,0,...,4,No,No,,No,,0,0,0,5
3,19,Chemistry,Chrome,2,0,1,5,0,1,0,...,2,No,No,,No,,2,0,0,3
4,34,Biophysics,Chrome,0,2,0,0,2,2,0,...,3,Not sure,I didn't know that was possible,,No,,0,0,0,9
5,21,Accounting,Chrome,2,0,2,4,1,3,2,...,4,Not sure,I didn't know that was possible,,Yes,I did not the website to have my data,0,1,1,2


Adding a expertise categorization column (STEM or Non STEM)

In [4]:
# 1 means STEM, 0 means non-stem. Obtained via manual observation
stem_cat = pd.Series([1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0])
# 1 means directly related to computers/data, 0 means not computer/data related
comp_cat = pd.Series([1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0])
new_df["stem"] = stem_cat
new_df["computers"] = comp_cat

### Run Statistical Tests

Compare number of adults that claim a comprehensive understanding of cookies in our study (22 observations) to a Statista survey of adults (1100 observations). <br/> 
Corresponds to Question 5: <br/> 

Do you understand what a cookie does and what data you give away when you accept cookies?
{0: 'I understand the basics',
 1: 'No, not at all',
 2: 'Not really',
 3: 'Yes, fully'}

In [47]:
# those who have comprehensive understanding
p_exp = len(new_df[new_df["Question 5"] == 3]) / len(new_df)  # 4/22
p_comp = .28  # see cookieeyes.com link in cookie_awareness.md
print("Comprehensive Knowledge:", two_prop_z_test(p_exp, p_comp, N, 1100))
# partial knowledge
p_exp = len(new_df[new_df["Question 5"] == 0]) / len(new_df)
p_comp = .49  # see cookieeyes.com link in cookie_awareness.md
print("Partial Knowledge:", two_prop_z_test(p_exp, p_comp, N, 1100))
# no knowledge
p_exp = len(new_df.loc[(new_df["Question 5"] == 1) | (new_df["Question 5"] == 2)]) / len(new_df)
p_comp = .24  # see cookieeyes.com link in cookie_awareness.md
print("No Knowledge:", two_prop_z_test(p_exp, p_comp, N, 1100))

Comprehensive Knowledge: (-1.0176896912577567, 0.3088254478393895, 'fail to reject')
Partial Knowledge: (-0.32939162299787755, 0.7418596987241401, 'fail to reject')
No Knowledge: (1.3398507379555382, 0.18029387638173944, 'fail to reject')


Next, we check to see if there's any significance in the proportions of students that blindly accept cookies. We compare to the allaboutcookies study of 1000 observations of US adults which found that 38% of adults do so. We are interested in option "0" for Question 11: <br/> 

When you visit a website and see a cookie consent banner, what do you usually do?
{0: 'Accept all cookies without reading', 1: 'Decline non-essential cookies', 2: 'Leave the website'}
<br/> 

Then we isolate the Gen Z respondents by age (<= 28) and compare the proportion that blindly accept all cookies. In the allaboutcookies article, this number is .47.

In [48]:
# blindly accept whole sample
p_exp = len(new_df.loc[new_df["Question 11"] == 0]) / len(new_df)
p_comp = .38  # see allaboutcookies.org link in cookie_awareness.md
print("Whole Sample:", two_prop_z_test(p_exp, p_comp, N, 1000))
# blindly accept for gen Z
p_exp = len(new_df.loc[(new_df["Question 1"] <= 28) & (new_df["Question 11"] == 0)]) / len(new_df)
n_exp = sum(new_df["Question 1"] <= 28)
p_comp = .47  # see allaboutcookies.org link in cookie_awareness.md
print("Gen Z Only:", two_prop_z_test(p_exp, p_comp, n_exp, 250))

Whole Sample: (-0.15644299315234023, 0.8756838507100282, 'fail to reject')
Gen Z Only: (-1.3111114469863319, 0.18982011394345655, 'fail to reject')


### Computer vs Non-Computer Expertise Statistical Tests