In [56]:
import pandas as pd
import numpy as np

import thinkstats2
import thinkplot

import statsmodels.formula.api as smf

# Data

In [57]:
# Read the Stata dictionary (.dct) with thinkstats2, then use the resulting object
# to read the fixed-width GSS data file (.dat) into `df`.
# NOTE(review): paths are relative to the notebook's working directory — confirm
# the notebook is launched from the folder that contains Who_Votes_2/.
dct = thinkstats2.ReadStataDct('./Who_Votes_2/GSS.dct')
df = dct.ReadFixedWidth('./Who_Votes_2/GSS.dat')
# Sanity check on the load: print (rows, columns), then preview the first rows.
print(df.shape)
df.head()

(62466, 22)


Unnamed: 0,year,partyid,vote12,relig,happy,health,trust,relpersn,wwwhr,ballot,...,id_,wrkstat,hrs2,wrkgovt,industry,age,educ,sex,race,sei10
0,1972,2,0,3,3,2,3,0,-1,0,...,1,1,-1,0,609,23,16,2,1,50.0
1,1972,1,0,2,3,3,1,0,-1,0,...,2,5,-1,0,338,70,10,1,1,46.5
2,1972,3,0,1,2,1,2,0,-1,0,...,3,2,-1,0,718,48,12,2,1,56.9
3,1972,1,0,5,3,2,2,0,-1,0,...,4,1,-1,0,319,27,17,2,1,76.3
4,1972,0,0,1,2,2,2,0,-1,0,...,5,7,-1,0,448,61,12,2,1,31.9


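For now, we are only interested in respondents who have a recorded answer to the 2012 voting question (a nonzero `vote12` code), since 2012 is the election year we will be investigating. Note that this subset includes both people who voted and people who did not.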

In [58]:
# Keep only the rows whose vote12 code is nonzero. The .copy() detaches the
# subset from df, which makes it possible to add new columns to it later.
df2012 = df.loc[df["vote12"] != 0].copy()
df2012.shape

(5405, 22)

In [59]:
# Report the size of the 2012 subset and preview its first five rows.
print(df2012.shape)
df2012.head(5)

(5405, 22)


Unnamed: 0,year,partyid,vote12,relig,happy,health,trust,relpersn,wwwhr,ballot,...,id_,wrkstat,hrs2,wrkgovt,industry,age,educ,sex,race,sei10
57061,2014,5,2,2,1,1,2,4,-1,3,...,1,1,-1,2,0,53,16,1,1,59.1
57062,2014,5,1,2,1,1,1,1,-1,3,...,2,1,-1,2,0,26,16,2,1,61.9
57063,2014,6,1,1,3,2,0,2,-1,1,...,3,4,-1,2,0,59,13,1,1,38.1
57064,2014,5,1,2,1,0,1,2,2,2,...,4,2,-1,2,0,56,16,2,1,73.9
57065,2014,3,1,2,1,1,2,2,-1,3,...,5,5,-1,1,0,74,17,2,1,76.3


The following helper functions were borrowed from [Allen Downey](https://github.com/AllenDowney/GssReligion), who wrote them as part of a different exploration of the GSS dataset.

In [60]:
# Thin wrapper around pandas' value_counts. Listing every distinct value next
# to its frequency makes strange, outlier values easy to identify.
def values(df, varname):
    """Tabulate the distinct values of a column, ordered by value.

    df: DataFrame
    varname: string column name

    returns: Series that maps from value to frequency
    """
    counts = df[varname].value_counts()
    return counts.sort_index()

# Cleans up the suspicious codes that `values` helps you find.
def fill_missing(df, varname, badvals=[98, 99]):
    """Fill missing data with random values.

    Every occurrence of a value listed in `badvals` (and every value that is
    already NaN) in column `varname` is replaced by a value drawn at random,
    with replacement, from the remaining valid entries of that column.
    The DataFrame is modified in place.

    The draw uses NumPy's global random state, so call `np.random.seed`
    beforehand if the result has to be reproducible.

    df: DataFrame
    varname: string column name
    badvals: list of values to be replaced (the default list is never mutated)

    returns: number of values that were filled in

    raises: ValueError if values need filling but the column contains no
            valid entries to sample from
    """
    # Assign the result back rather than calling replace(..., inplace=True) on
    # df[varname]: the chained in-place form acts on a temporary Series and,
    # under pandas Copy-on-Write, never reaches df.
    df[varname] = df[varname].replace(badvals, np.nan)

    null = df[varname].isnull()
    # Vectorised count; the builtin sum() would loop over the Series in Python.
    n_missing = int(null.sum())
    if n_missing == 0:
        return 0

    valid = df[varname].dropna()
    if valid.empty:
        raise ValueError(
            "cannot fill column %r: it has no valid values to sample from"
            % (varname,)
        )

    df.loc[null, varname] = np.random.choice(valid, n_missing, replace=True)
    return n_missing

# Who votes?

In [61]:
# Tabulate the religion codes to see which categories hold enough data.
values(df2012, varname="relig")

1     2496
2     1255
3       91
4     1141
5       71
6       47
7       26
8        7
9       28
10      16
11     174
12       6
13      11
98       6
99      30
Name: relig, dtype: int64

The only values with a significant amount of data are 1, 2, and 4, which are Protestant, Catholic, and None, respectively. We can lump the others together under "other".

In [62]:
# 0/1 indicator columns for the three largest religion categories
# (relig codes: 1 = Protestant, 2 = Catholic, 4 = None).
df2012["isProtestant"] = df2012["relig"].eq(1).astype(int)
df2012["isCatholic"] = df2012["relig"].eq(2).astype(int)
df2012["isNone"] = df2012["relig"].eq(4).astype(int)