In [280]:
import pandas as pd
import numpy as np

import thinkstats2
import thinkplot

import statsmodels.formula.api as smf

# Data

In [281]:
# Locations of the Stata dictionary (column layout) and the fixed-width data file
GSS_DCT_PATH = './Who_Votes_2/GSS.dct'
GSS_DAT_PATH = './Who_Votes_2/GSS.dat'

# Read the dictionary first, then use it to parse the fixed-width data file
dct = thinkstats2.ReadStataDct(GSS_DCT_PATH)
df = dct.ReadFixedWidth(GSS_DAT_PATH)
print(df.shape)
df.head()

(62466, 22)


Unnamed: 0,year,partyid,vote12,relig,happy,health,trust,relpersn,wwwhr,ballot,...,id_,wrkstat,hrs2,wrkgovt,industry,age,educ,sex,race,sei10
0,1972,2,0,3,3,2,3,0,-1,0,...,1,1,-1,0,609,23,16,2,1,50.0
1,1972,1,0,2,3,3,1,0,-1,0,...,2,5,-1,0,338,70,10,1,1,46.5
2,1972,3,0,1,2,1,2,0,-1,0,...,3,2,-1,0,718,48,12,2,1,56.9
3,1972,1,0,5,3,2,2,0,-1,0,...,4,1,-1,0,319,27,17,2,1,76.3
4,1972,0,0,1,2,2,2,0,-1,0,...,5,7,-1,0,448,61,12,2,1,31.9


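For now, we are only interested in respondents who gave a definite answer about whether they voted in 2012, the year that we will be investigating: a `vote12` value of 1 means they voted and 2 means they did not.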

In [282]:
# Rows whose vote12 answer is one of the two definite codes (1 or 2)
has_vote_answer = (df.vote12 == 1) | (df.vote12 == 2)

# Take a copy of the selection so that columns can be added or overwritten later
df2012 = df[has_vote_answer].copy()
df2012.shape

(4983, 22)

In [283]:
# Recode vote12 as a 0/1 indicator: 1 stays 1 (true) and 2 becomes 0 (false).
# The recode reads from df2012 itself and uses bracket assignment, so it does
# not rely on index alignment with the unfiltered df or on attribute-style
# column assignment.
df2012['vote12'] = df2012['vote12'] % 2

print(df2012.shape)
df2012.head()

(4983, 22)


Unnamed: 0,year,partyid,vote12,relig,happy,health,trust,relpersn,wwwhr,ballot,...,id_,wrkstat,hrs2,wrkgovt,industry,age,educ,sex,race,sei10
57061,2014,5,0,2,1,1,2,4,-1,3,...,1,1,-1,2,0,53,16,1,1,59.1
57062,2014,5,1,2,1,1,1,1,-1,3,...,2,1,-1,2,0,26,16,2,1,61.9
57063,2014,6,1,1,3,2,0,2,-1,1,...,3,4,-1,2,0,59,13,1,1,38.1
57064,2014,5,1,2,1,0,1,2,2,2,...,4,2,-1,2,0,56,16,2,1,73.9
57065,2014,3,1,2,1,1,2,2,-1,3,...,5,5,-1,1,0,74,17,2,1,76.3


The following helper functions were borrowed from [Allen Downey](https://github.com/AllenDowney/GssReligion), as part of a different exploration of the GSS dataset.

In [284]:
#Thin wrapper around a convenient feature of pandas (value_counts).
#Handy for spotting strange, outlier values in a column.
def values(df, varname):
    """Tabulate the distinct values of a column, ordered by value.

    df: DataFrame
    varname: string column name

    returns: Series that maps from value to frequency
    """
    counts = df[varname].value_counts()
    return counts.sort_index()

#This fixes the problems you can find with the previous one.
def fill_missing(df, varname, badvals=[98, 99]):
    """Fill missing data with random values.

    Values listed in `badvals` are first converted to NaN. Every NaN in the
    column (including any that were already present) is then replaced by a
    value drawn at random, with replacement, from the remaining valid values
    of the same column. `df` is modified in place.

    df: DataFrame
    varname: string column name
    badvals: list of values to be replaced (the default list is only read,
        never mutated)

    returns: number of values that were filled in
    raises: ValueError if values need filling but the column has no valid
        values to sample from
    """
    # Assign the result back rather than calling replace(..., inplace=True)
    # on df[varname]: that chained in-place call operates on a temporary
    # Series and does not update df under pandas Copy-on-Write.
    df[varname] = df[varname].replace(badvals, np.nan)

    null = df[varname].isnull()
    # Vectorised count instead of Python's sum() iterating over the Series
    n_missing = int(null.sum())
    if n_missing == 0:
        return 0

    valid = df[varname].dropna()
    if valid.empty:
        raise ValueError(
            "column %r has no valid values to sample replacements from" % (varname,)
        )

    # Sample with replacement from the observed (valid) values
    fill = np.random.choice(valid.values, n_missing, replace=True)
    df.loc[null, varname] = fill
    return n_missing

# Who votes?

The first variable to investigate is age: it is a very simple predictor, but it is known to be strongly associated with voting patterns. To quantify that, we can fit a simple logistic regression of `vote12` on age.

In [285]:
# Tabulate the age values in index order to look for unexpected codes
values(df2012, "age")

20     16
21     24
22     55
23     69
24     62
25     98
26     82
27     92
28     67
29     96
30    105
31    100
32     90
33     94
34     87
35     92
36     89
37    102
38     85
39     89
40     83
41     84
42     80
43     93
44     87
45     62
46     75
47     90
48     77
49     77
     ... 
61     93
62     98
63     82
64     70
65     80
66     80
67     90
68     63
69     65
70     63
71     54
72     46
73     44
74     48
75     39
76     49
77     36
78     52
79     46
80     41
81     33
82     18
83     32
84     21
85     17
86     19
87     16
88     14
89     39
99     17
Name: age, dtype: int64

Assuming that there *probably* weren't 17 different 99-year-olds in this relatively small sample (4,983 people), we treat 99 as a missing-value code and clean the data by replacing those entries with values drawn at random from the observed ages, using the `fill_missing` helper defined above.

In [286]:
# fill_missing draws its replacement values with np.random, so seed the
# global NumPy RNG first; otherwise the imputed ages (and any model fitted
# on them) change on every run of the notebook.
np.random.seed(17)
fill_missing(df2012, 'age')

17

In [287]:
# Logistic regression of the 0/1 vote12 indicator on age alone
age_model = smf.logit('vote12 ~ age', data=df2012)
results = age_model.fit()
results.summary()

Optimization terminated successfully.
         Current function value: 0.579072
         Iterations 5


0,1,2,3
Dep. Variable:,vote12,No. Observations:,4983.0
Model:,Logit,Df Residuals:,4981.0
Method:,MLE,Df Model:,1.0
Date:,"Tue, 02 May 2017",Pseudo R-squ.:,0.05903
Time:,02:20:03,Log-Likelihood:,-2885.5
converged:,True,LL-Null:,-3066.5
,,LLR p-value:,9.995e-81

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.9324,0.098,-9.472,0.000,-1.125,-0.740
age,0.0364,0.002,18.068,0.000,0.032,0.040


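This shows us that age is definitely a statistically significant predictor of whether someone is likely to vote: the coefficient is positive (older respondents are more likely to report voting) and its z-score is about 18. Note, however, that the pseudo R² is only about 0.06, so age alone leaves most of the variation unexplained.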

### Basic Model of voting and Religion

In [288]:
# Tabulate the relig codes in index order to see which have enough respondents
values(df2012, "relig")

1     2358
2     1132
3       84
4     1047
5       62
6       41
7       14
8        5
9       20
10      13
11     166
12       4
13      10
98       6
99      21
Name: relig, dtype: int64

The values with a significant amount of data are 1, 2, and 4, which are Protestant, Catholic, and None, respectively. The other codes are either more general categories or don't have enough data to analyze on their own.

In [289]:
# 0/1 indicator columns for the three relig codes with enough respondents:
# 1 = Protestant, 2 = Catholic, 4 = None
relig_codes = df2012['relig']
df2012['isProtestant'] = (relig_codes == 1).astype(int)
df2012['isCatholic'] = (relig_codes == 2).astype(int)
df2012['isNone'] = (relig_codes == 4).astype(int)

In [291]:
# Logistic regression of vote12 on the three religion indicator columns;
# respondents with any other relig code form the reference group.
religion_model = smf.logit('vote12 ~ isProtestant + isCatholic + isNone', data=df2012)
results = religion_model.fit()
results.summary()

Optimization terminated successfully.
         Current function value: 0.612393
         Iterations 5


0,1,2,3
Dep. Variable:,vote12,No. Observations:,4983.0
Model:,Logit,Df Residuals:,4979.0
Method:,MLE,Df Model:,3.0
Date:,"Tue, 02 May 2017",Pseudo R-squ.:,0.004889
Time:,02:23:30,Log-Likelihood:,-3051.6
converged:,True,LL-Null:,-3066.5
,,LLR p-value:,1.391e-06

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.8882,0.104,8.524,0.000,0.684,1.092
isProtestant,0.0885,0.114,0.777,0.437,-0.135,0.312
isCatholic,-0.1497,0.122,-1.226,0.220,-0.389,0.090
isNone,-0.3297,0.122,-2.694,0.007,-0.570,-0.090
