In [1]:
import matplotlib.pyplot as plt 
import metapack as mp
import pandas as pd
import numpy as np
import seaborn as sns
# Apply seaborn's default plot styling; color_codes=True asks seaborn to remap
# matplotlib's one-letter color codes ("b", "g", "r", ...) to its own palette.
sns.set(color_codes=True)
%matplotlib inline

In [3]:
# Open the Metapack data package for this notebook
# (presumably located from the notebook's own context — TODO confirm how it is resolved).
pkg = mp.jupyter.open_package()
pkg  # bare last expression: show the package as the cell output

In [4]:
# Look up the 'cnss_2017' resource; its display lists each column's header, type and description.
pkg.resource('cnss_2017')

Header,Type,Description
caseid,integer,Case identification number (assigned by SRI)
survid,integer,Case identification number (assigned by SRI)
timezone,string,Time zone (provided by MSG)
state,string,State (provided by MSG)
msa,string,Metropolitan Statistical Area (provided by MSG)
msc,integer,Metropolitan Status Code (provided by MSG)
censusr,integer,Census Region (provided by MSG)
censusd,integer,Census Division (provided by MSG)
cbsamsa,integer,CBSA MSA Met Status Code (provided by MSG)
cbsamcsa,integer,CBSA MCSA Met Status Code (provided by MSG)


In [5]:
# Load the 'cnss_2017' resource through its read_csv() method.
# Every later cell works from `df`, which is used below as a pandas DataFrame.
df = pkg.resource('cnss_2017').read_csv()

In [11]:
# Respondent counts by party (rows) and gender (columns), with "All" totals.
df.pivot_table(
    values='caseid',
    index='party',
    columns='gender',
    aggfunc=len,
    margins=True,
)

gender,Female,Male,All
party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Independent (close to Neither),110,135,245
"Independent, close to Democrat",56,39,95
"Independent, close to Republican",44,52,96
Not very strong Democrat,60,55,115
Not very strong Republican,45,50,95
Other party affiliation (specify ...),17,26,43
Strong Democrat,104,71,175
Strong Republican,62,70,132
All,498,498,996


In [18]:
# Respondent counts by riq1 answer (rows) and gender (columns), with "All" totals.
df.pivot_table(
    values='caseid',
    index='riq1',
    columns='gender',
    aggfunc=len,
    margins=True,
)

gender,Female,Male,All
riq1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
All of the above,201,192,393
Bacteria,266,247,513
Fungi,7,8,15
Viruses,22,36,58
All,496,483,979


In [19]:
# Respondent counts by party (rows) and ideo (columns), with "All" totals.
# Combinations nobody chose show up as NaN rather than 0.
df.pivot_table(
    values='caseid',
    index='party',
    columns='ideo',
    aggfunc=len,
    margins=True,
)

ideo,Conservative,Extremely conservative,Extremely liberal,Liberal,Moderate or middle of the road,Slightly conservative,Slightly liberal,All
party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Independent (close to Neither),26.0,5.0,3.0,34.0,132.0,25.0,18.0,243
"Independent, close to Democrat",2.0,,14.0,25.0,36.0,5.0,13.0,95
"Independent, close to Republican",29.0,6.0,,1.0,34.0,25.0,1.0,96
Not very strong Democrat,8.0,,9.0,30.0,43.0,9.0,16.0,115
Not very strong Republican,31.0,6.0,,3.0,34.0,20.0,1.0,95
Other party affiliation (specify ...),2.0,2.0,4.0,4.0,20.0,6.0,2.0,40
Strong Democrat,8.0,3.0,38.0,62.0,43.0,2.0,17.0,173
Strong Republican,65.0,25.0,3.0,3.0,21.0,15.0,,132
All,171.0,47.0,71.0,162.0,363.0,107.0,68.0,989


In [20]:
# Respondent counts by party (rows) and idq4 answer (columns), with "All" totals.
df.pivot_table(
    values='caseid',
    index='party',
    columns='idq4',
    aggfunc=len,
    margins=True,
)

idq4,Agree,Disagree,Neither agree or disagree,Strongly agree,Strongly disagree,All
party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Independent (close to Neither),71.0,37.0,39.0,89.0,9.0,245
"Independent, close to Democrat",26.0,8.0,14.0,47.0,,95
"Independent, close to Republican",27.0,22.0,15.0,24.0,8.0,96
Not very strong Democrat,44.0,15.0,23.0,30.0,3.0,115
Not very strong Republican,27.0,23.0,24.0,15.0,6.0,95
Other party affiliation (specify ...),14.0,4.0,5.0,17.0,3.0,43
Strong Democrat,57.0,20.0,27.0,66.0,5.0,175
Strong Republican,22.0,36.0,28.0,23.0,23.0,132
All,288.0,165.0,175.0,311.0,57.0,996


In [25]:
# Respondent counts by kwq1_b (rows) and kwq1_a (columns), with "All" totals.
df.pivot_table(
    values='caseid',
    index='kwq1_b',
    columns='kwq1_a',
    aggfunc=len,
    margins=True,
)

kwq1_a,No,Yes,All
kwq1_b,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,570,149,719
Yes,138,142,280
All,708,291,999


In [27]:
# Number of non-null `age` values for each (kwq1_a, kwq1_b) answer combination.
df.groupby(['kwq1_a', 'kwq1_b'])['age'].count()

kwq1_a  kwq1_b
No      No        570
        Yes       138
Yes     No        149
        Yes       142
Name: age, dtype: int64

In [37]:
# Joint distribution of kwq1_b (rows) and race_b (columns) as shares of all
# cross-classified respondents, with row/column totals in the "All" margins.
# crosstab builds a frequency table by default, so no values/aggfunc are
# passed: `values` expects an array of data, and the literal string 'caseid'
# is not the caseid column.
pd.crosstab(df.kwq1_b, df.race_b, margins=True, normalize='all')

race_b,No,Yes,All
kwq1_b,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,0.629032,0.091734,0.720766
Yes,0.225806,0.053427,0.279234
All,0.854839,0.145161,1.0


In [79]:
# kwq1_b (rows) by race_b (columns) as shares of all cross-classified
# respondents, without margins. crosstab counts occurrences by default, so no
# values/aggfunc are needed.
# Stored under a descriptive name: `_1` is also IPython's alias for Out[1],
# so assigning to it shadows the output history.
kwq1_b_by_race_b = pd.crosstab(df.kwq1_b, df.race_b, normalize='all')

kwq1_b_by_race_b

race_b,No,Yes
kwq1_b,Unnamed: 1_level_1,Unnamed: 2_level_1
No,0.629032,0.091734
Yes,0.225806,0.053427


In [82]:
# Column-normalised table: within each race_b group, the share giving each
# kwq1_b answer (each column sums to 1). crosstab counts occurrences by
# default, so no values/aggfunc are needed.
pd.crosstab(df.kwq1_b, df.race_b, normalize='columns')


race_b,No,Yes
kwq1_b,Unnamed: 1_level_1,Unnamed: 2_level_1
No,0.735849,0.631944
Yes,0.264151,0.368056


In [93]:
# "Minorities get government advantages" (idq5) vs. Black (race_b)

# Collapse the idq5 agreement scale to Agree / Disagree. The neutral midpoint
# becomes missing so it drops out of the table; already-missing answers stay
# missing without needing their own entry.
# (The answer labels can be listed with df.idq5.unique().)
df['min_advtg'] = df.idq5.replace({
    'Strongly agree': 'Agree',
    'Agree': 'Agree',
    'Disagree': 'Disagree',
    'Strongly disagree': 'Disagree',
    'Neither agree or disagree': np.nan,
})

# Relabel race_b in a separate Series instead of overwriting df['race_b']:
# earlier cells tabulate the raw Yes/No column and would show different labels
# on re-run if it were mutated here. The Series keeps the name 'race_b', so
# the table header is unchanged.
race_label = df.race_b.replace({'Yes': 'Black', 'No': 'Not Black'})

# Share agreeing / disagreeing within each race group (each column sums to 1).
# crosstab counts occurrences by default, so no values/aggfunc are needed.
pd.crosstab(df.min_advtg, race_label, normalize='columns')


race_b,Black,Not Black
min_advtg,Unnamed: 1_level_1,Unnamed: 2_level_1
Agree,0.578125,0.430029
Disagree,0.421875,0.569971


In [99]:
# EBq2 "Worry about crime, neighborhood" vs. Black (race_b)

# Collapse the four ebq2 answers into Less / More worry.
# (The answer labels can be listed with df.ebq2.unique().)
df['worry_crime_neighborhood'] = df.ebq2.replace({
    'Not at all': 'Less',
    'Only a little': 'Less',
    'A fair amount': 'More',
    'A great deal': 'More',
})

# Relabel race_b in a separate Series rather than overwriting df['race_b'],
# so cells that tabulate the raw column keep producing the same labels. The
# replace is a no-op if the column already holds 'Black' / 'Not Black'.
race_label = df.race_b.replace({'Yes': 'Black', 'No': 'Not Black'})

# Share in each worry group within each race group (each column sums to 1).
# crosstab counts occurrences by default, so no values/aggfunc are needed.
pd.crosstab(df.worry_crime_neighborhood, race_label, normalize='columns')


race_b,Black,Not Black
worry_crime_neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1
Less,0.520833,0.724382
More,0.479167,0.275618


In [106]:
# kwq1_b "Experienced Domestic Violence" vs. Party

# Collapse the party categories to Democrat / Independent / Republican.
# "Other party affiliation" becomes missing so it drops out of the table;
# already-missing answers stay missing without needing their own entry.
# (The answer labels can be listed with df.party.unique().)
df['party_agg'] = df.party.replace({
    'Strong Democrat': 'Democrat',
    'Not very strong Democrat': 'Democrat',
    'Independent, close to Democrat': 'Independent',
    'Independent (close to Neither)': 'Independent',
    'Independent, close to Republican': 'Independent',
    'Not very strong Republican': 'Republican',
    'Strong Republican': 'Republican',
    'Other party affiliation (specify ...)': np.nan,
})

# Share giving each kwq1_b answer within each party group (each column sums
# to 1). crosstab counts occurrences by default, so no values/aggfunc are needed.
pd.crosstab(df.kwq1_b, df.party_agg, normalize='columns')


party_agg,Democrat,Independent,Republican
kwq1_b,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,0.696552,0.705747,0.801762
Yes,0.303448,0.294253,0.198238


In [110]:
# SKq5 "Feminism Good or Bad" vs. Party

# Collapse the skq5 agreement scale to Agree / Neutral / Disagree.
# The recode must read from skq5: these answer labels do not occur in `party`,
# so applying the mapping to `party` would just copy that column unchanged.
df['feminism_good'] = df.skq5.replace({
    'Strongly agree': 'Agree',
    'Agree': 'Agree',
    'Neutral': 'Neutral',
    'Disagree': 'Disagree',
    'Strongly disagree': 'Disagree',
})

# Tabulate the collapsed answer (not raw skq5) by aggregated party; each
# column sums to 1. Relies on df['party_agg'] from the party-aggregation cell.
# crosstab counts occurrences by default, so no values/aggfunc are needed.
pd.crosstab(df.feminism_good, df.party_agg, normalize='columns')





{'Neutral': 'Neutral',
 'Strongly agree': 'Strongly agree',
 'Disagree': 'Disagree',
 'Strongly disagree': 'Strongly disagree',
 'Agree': 'Agree'}

In [38]:
from scipy.stats import pointbiserialr

In [58]:
# One-hot encode the skq1 and skq2_a answers, then correlate the two "Yes"
# indicator columns.
# NOTE(review): with get_dummies' defaults a missing answer gets 0 in every
# indicator, so it counts as not-"Yes" here — confirm that is intended.
# NOTE(review): `_1` is also IPython's alias for Out[1]; the name is kept
# because a later cell reads `_1`.
_1 = pd.get_dummies(df.loc[:, ['skq1', 'skq2_a']])

pointbiserialr(_1['skq1_Yes'], _1['skq2_a_Yes'])

PointbiserialrResult(correlation=0.09969027556610287, pvalue=0.0015972627271983277)

In [57]:
# Peek at the first five rows of the indicator frame currently held in `_1`.
_1.head(n=5)

Unnamed: 0,skq1_No,skq1_Yes,skq2_a_No,skq2_a_Yes
0,1,0,1,0
1,1,0,1,0
2,1,0,0,1
3,1,0,1,0
4,1,0,1,0


In [62]:
# One-hot encode the skq1 and skq3 answers and show the pairwise Pearson
# correlations between all indicator columns.
_1 = pd.get_dummies(df.loc[:, ['skq1', 'skq3']])
_1.corr(method='pearson')

Unnamed: 0,skq1_No,skq1_Yes,skq3_No,skq3_Unfamiliar with the term alpha male,skq3_Yes
skq1_No,1.0,-1.0,0.008431,0.056115,-0.041827
skq1_Yes,-1.0,1.0,-0.008431,-0.056115,0.041827
skq3_No,0.008431,-0.008431,1.0,-0.409994,-0.796235
skq3_Unfamiliar with the term alpha male,0.056115,-0.056115,-0.409994,1.0,-0.173474
skq3_Yes,-0.041827,0.041827,-0.796235,-0.173474,1.0


In [68]:
# Frequency of each ahq3 answer, most common first (missing answers are not counted).
df['ahq3'].value_counts()

Strongly agree                450
Somewhat agree                243
Neither agree nor disagree    122
Somewhat disagree             106
Strongly disagree              78
Name: ahq3, dtype: int64

In [71]:
# Frequency of each kwq2 answer, most common first (missing answers are not counted).
df['kwq2'].value_counts()

No- they haven't had an impact on my work.    241
Yes                                           152
Name: kwq2, dtype: int64

In [74]:
# Frequency of each ebq1 answer, most common first (missing answers are not counted).
df['ebq1'].value_counts()

Not at all                     377
N/A - Not currently working    275
Only a little                  202
A fair amount                   88
A great deal                    58
Name: ebq1, dtype: int64

In [76]:
# Frequency of each mjq1 answer, most common first (missing answers are not counted).
df['mjq1'].value_counts()

Somewhat positive impact    261
Neutral                     259
Very positive impact        218
Somewhat negative impact    165
Very negative impact         94
Name: mjq1, dtype: int64