In [110]:
import pandas as pd
import numpy as np

import thinkstats2
import thinkplot

import statsmodels.formula.api as smf

# Data

In [111]:
# Load the GSS extract. The .dct file presumably describes the column layout
# of the fixed-width .dat file (both helpers live in thinkstats2 -- confirm there).
dct = thinkstats2.ReadStataDct('./Who_Votes_2/GSS.dct')
df = dct.ReadFixedWidth('./Who_Votes_2/GSS.dat')
# Sanity-check the size of the frame, then preview the first rows.
print(df.shape)
df.head()

(62466, 22)


Unnamed: 0,year,partyid,vote12,relig,happy,health,trust,relpersn,wwwhr,ballot,...,id_,wrkstat,hrs2,wrkgovt,industry,age,educ,sex,race,sei10
0,1972,2,0,3,3,2,3,0,-1,0,...,1,1,-1,0,609,23,16,2,1,50.0
1,1972,1,0,2,3,3,1,0,-1,0,...,2,5,-1,0,338,70,10,1,1,46.5
2,1972,3,0,1,2,1,2,0,-1,0,...,3,2,-1,0,718,48,12,2,1,56.9
3,1972,1,0,5,3,2,2,0,-1,0,...,4,1,-1,0,319,27,17,2,1,76.3
4,1972,0,0,1,2,2,2,0,-1,0,...,5,7,-1,0,448,61,12,2,1,31.9


For now, we are only interested in respondents who reported either voting or not voting in the 2012 election (`vote12` codes 1 and 2), the year that we will be investigating.

In [112]:
# Keep only the rows whose vote12 code is 1 or 2.
# .copy() makes df2012 an independent frame, so new columns can be added to it
# later without writing into a slice of df.
df2012 = df[df.vote12.isin([1, 2])].copy()
df2012.shape

(4983, 22)

In [113]:
# Recode vote12 as a 0/1 indicator (0 = false, 1 = true). Only codes 1 and 2
# remain at this point, so taking the value modulo 2 keeps 1 as 1 and maps 2 to 0.
df2012['vote12'] = df2012['vote12'] % 2

print(df2012.shape)
df2012.head()

(4983, 22)


Unnamed: 0,year,partyid,vote12,relig,happy,health,trust,relpersn,wwwhr,ballot,...,id_,wrkstat,hrs2,wrkgovt,industry,age,educ,sex,race,sei10
57061,2014,5,0,2,1,1,2,4,-1,3,...,1,1,-1,2,0,53,16,1,1,59.1
57062,2014,5,1,2,1,1,1,1,-1,3,...,2,1,-1,2,0,26,16,2,1,61.9
57063,2014,6,1,1,3,2,0,2,-1,1,...,3,4,-1,2,0,59,13,1,1,38.1
57064,2014,5,1,2,1,0,1,2,2,2,...,4,2,-1,2,0,56,16,2,1,73.9
57065,2014,3,1,2,1,1,2,2,-1,3,...,5,5,-1,1,0,74,17,2,1,76.3


The following helper functions were borrowed from [Allen Downey](https://github.com/AllenDowney/GssReligion), as part of a different exploration of the GSS dataset.

In [114]:
#This function wraps a convenient feature of pandas (value_counts).
#It makes it easy to spot strange, outlier values in a column.
def values(df, varname):
    """Tabulate how often each distinct value occurs in a column.

    df: DataFrame
    varname: string column name

    returns: Series mapping each value to its frequency, ordered by value
             (i.e. sorted on the index) rather than by frequency
    """
    counts = df[varname].value_counts()
    return counts.sort_index()

#This replaces the placeholder codes that values() reveals (by default 98 and 99) with randomly chosen valid values.
def fill_missing(df, varname, badvals=[98, 99], rng=None):
    """Fill missing data with random values drawn from the valid ones.

    Entries equal to any of `badvals`, plus any NaN already present, are
    treated as missing. Each one is replaced by a value sampled with
    replacement from the remaining entries of the same column. `df` is
    modified in place; the column is left untouched when nothing is missing.

    df: DataFrame
    varname: string column name
    badvals: list of values to be replaced
    rng: optional random source exposing `choice(a, size, replace=...)`,
         such as `np.random.RandomState(seed)`; when omitted, NumPy's global
         random state is used, as before

    returns: int, the number of entries that were filled

    raises: ValueError if there are entries to fill but no valid value
            to sample from
    """
    # Build a new Series and assign it back to the frame. Calling
    # replace(..., inplace=True) on df[varname] is chained assignment: it acts
    # on an intermediate object and leaves df unchanged under pandas
    # Copy-on-Write.
    column = df[varname].replace(badvals, np.nan)
    null = column.isnull()
    n_missing = int(null.sum())
    if n_missing == 0:
        return 0

    valid = column.dropna().values
    if len(valid) == 0:
        raise ValueError(
            "cannot fill %r: the column has no valid values to sample from" % (varname,)
        )

    chooser = np.random if rng is None else rng
    column[null] = chooser.choice(valid, n_missing, replace=True)
    df[varname] = column
    return n_missing

# Who votes?

The first variable to investigate is age - it's a very simple predictor, but it is known to be excellent at predicting voting patterns. To quantify that, we can build a simple logistic regression model based on age.

In [115]:
# Tabulate the ages to look for implausible or placeholder values before modelling.
values(df2012, "age")

20     16
21     24
22     55
23     69
24     62
25     98
26     82
27     92
28     67
29     96
30    105
31    100
32     90
33     94
34     87
35     92
36     89
37    102
38     85
39     89
40     83
41     84
42     80
43     93
44     87
45     62
46     75
47     90
48     77
49     77
     ... 
61     93
62     98
63     82
64     70
65     80
66     80
67     90
68     63
69     65
70     63
71     54
72     46
73     44
74     48
75     39
76     49
77     36
78     52
79     46
80     41
81     33
82     18
83     32
84     21
85     17
86     19
87     16
88     14
89     39
99     17
Name: age, dtype: int64

Under the assumption that there *probably* weren't 17 different 99 year olds in this one relatively small sample (4983 people), we can treat 99 as a placeholder for a missing age and clean the data by replacing those entries with random values drawn from the valid ages, using the cleaning function.

In [116]:
# Replace the placeholder ages (99 is one of fill_missing's default badvals);
# the displayed result is the number of entries that were filled.
# NOTE(review): the fill values are random and no seed is set in the visible
# cells, so re-running gives slightly different data -- confirm this is acceptable.
fill_missing(df2012, 'age')

17

In [117]:
# Fit a logistic regression of the 0/1 vote12 indicator on age alone,
# then display the fitted coefficients and fit statistics.
results = (
    smf.logit(formula='vote12 ~ age', data=df2012)
    .fit()
)
results.summary()

Optimization terminated successfully.
         Current function value: 0.578317
         Iterations 5


0,1,2,3
Dep. Variable:,vote12,No. Observations:,4983.0
Model:,Logit,Df Residuals:,4981.0
Method:,MLE,Df Model:,1.0
Date:,"Tue, 02 May 2017",Pseudo R-squ.:,0.06026
Time:,11:55:26,Log-Likelihood:,-2881.8
converged:,True,LL-Null:,-3066.5
,,LLR p-value:,2.303e-82

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.9510,0.099,-9.653,0.000,-1.144,-0.758
age,0.0368,0.002,18.237,0.000,0.033,0.041


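This shows us that age is definitely an excellent predictor of whether someone is likely to vote: its coefficient is positive and highly significant, so older respondents are more likely to have voted.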

### Basic Model of voting and Religion

In [118]:
# Tabulate the religion codes to see which categories have enough respondents to model.
values(df2012, "relig")

1     2358
2     1132
3       84
4     1047
5       62
6       41
7       14
8        5
9       20
10      13
11     166
12       4
13      10
98       6
99      21
Name: relig, dtype: int64

The values with a significant amount of data are 1, 2, and 4, which are Protestant, Catholic, and None, respectively. The others are more general, or don't have quite enough data to analyze.

In [119]:
# 0/1 indicator columns for the three religion codes with enough respondents:
# 1 = Protestant, 2 = Catholic, 4 = None. Every other code is 0 in all three.
df2012['isProtestant'] = df2012['relig'].eq(1).astype(int)
df2012['isCatholic'] = df2012['relig'].eq(2).astype(int)
df2012['isNone'] = df2012['relig'].eq(4).astype(int)

In [120]:
# Extend the age-only model with the three religion indicators; respondents
# in none of the three groups form the reference category.
results = (
    smf.logit(formula='vote12 ~ age + isProtestant + isCatholic + isNone', data=df2012)
    .fit()
)
results.summary()

Optimization terminated successfully.
         Current function value: 0.577512
         Iterations 5


0,1,2,3
Dep. Variable:,vote12,No. Observations:,4983.0
Model:,Logit,Df Residuals:,4978.0
Method:,MLE,Df Model:,4.0
Date:,"Tue, 02 May 2017",Pseudo R-squ.:,0.06157
Time:,11:55:26,Log-Likelihood:,-2877.7
converged:,True,LL-Null:,-3066.5
,,LLR p-value:,1.906e-80

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.7766,0.140,-5.540,0.000,-1.051,-0.502
age,0.0365,0.002,17.739,0.000,0.032,0.041
isProtestant,-0.0992,0.118,-0.838,0.402,-0.331,0.133
isCatholic,-0.2806,0.127,-2.216,0.027,-0.529,-0.032
isNone,-0.2226,0.127,-1.757,0.079,-0.471,0.026


Religion isn't a very good predictor. Catholics are less likely to vote, but that is the only statistically significant one of the three checked here.

### Adding Race and Class to the model

In [121]:
# 0/1 indicator columns for race codes 1 and 2 (white and black, going by the
# column names -- confirm against the GSS codebook). All other codes are 0 in both.
df2012['isWhite'] = df2012['race'].eq(1).astype(int)
df2012['isBlack'] = df2012['race'].eq(2).astype(int)

In [122]:
# Add the race indicators and sei10 (used in this notebook as the measure of
# class) to the age + religion model.
results = (
    smf.logit(formula='vote12 ~ age + isCatholic + isProtestant + isNone + isWhite + isBlack + sei10', data=df2012)
    .fit()
)
results.summary()

Optimization terminated successfully.
         Current function value: 0.535359
         Iterations 6


0,1,2,3
Dep. Variable:,vote12,No. Observations:,4983.0
Model:,Logit,Df Residuals:,4975.0
Method:,MLE,Df Model:,7.0
Date:,"Tue, 02 May 2017",Pseudo R-squ.:,0.1301
Time:,11:55:26,Log-Likelihood:,-2667.7
converged:,True,LL-Null:,-3066.5
,,LLR p-value:,5.7890000000000005e-168

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-2.5537,0.188,-13.587,0.000,-2.922,-2.185
age,0.0357,0.002,16.551,0.000,0.031,0.040
isCatholic,-0.0680,0.135,-0.506,0.613,-0.332,0.196
isProtestant,-0.0589,0.126,-0.468,0.640,-0.305,0.188
isNone,-0.1629,0.134,-1.213,0.225,-0.426,0.100
isWhite,0.4439,0.115,3.876,0.000,0.219,0.668
isBlack,1.2261,0.142,8.660,0.000,0.949,1.504
sei10,0.0279,0.002,17.748,0.000,0.025,0.031


### And the other things

Having seen the effects of age, class, and race (and how poorly religious preference performs), we can classify most of the rest of the variables.

In [129]:
# 0/1 indicator columns for the remaining predictors. The meaning of each code
# is inferred from the column names -- confirm against the GSS codebook.
df2012["isFemale"] = df2012["sex"].eq(2).astype(int)
# partyid 0 and 6 are the two ends of the scale; 3 is the midpoint.
df2012["strongPartyIdentify"] = df2012["partyid"].isin([0, 6]).astype(int)
df2012["isIndependent"] = df2012["partyid"].eq(3).astype(int)
df2012["isHappy"] = df2012["happy"].isin([1, 2]).astype(int)
df2012["suspicious"] = df2012["trust"].eq(2).astype(int)
df2012["isGovt"] = df2012["wrkgovt"].eq(1).astype(int)

In [130]:
# Fit the combined model: age, race, sex, sei10, party identification,
# happiness, trust and government employment. The religion indicators are
# dropped from this formula.
results = (
    smf.logit(formula='vote12 ~ age + isWhite + isBlack + isFemale + sei10 + strongPartyIdentify + isIndependent + isHappy + suspicious + isGovt', data=df2012)
    .fit()
)
results.summary()

Optimization terminated successfully.
         Current function value: 0.494250
         Iterations 6


0,1,2,3
Dep. Variable:,vote12,No. Observations:,4983.0
Model:,Logit,Df Residuals:,4972.0
Method:,MLE,Df Model:,10.0
Date:,"Tue, 02 May 2017",Pseudo R-squ.:,0.1969
Time:,11:56:42,Log-Likelihood:,-2462.8
converged:,True,LL-Null:,-3066.5
,,LLR p-value:,3.643e-253

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-2.7947,0.199,-14.038,0.000,-3.185,-2.405
age,0.0318,0.002,14.252,0.000,0.027,0.036
isWhite,0.4297,0.119,3.606,0.000,0.196,0.663
isBlack,1.0571,0.146,7.227,0.000,0.770,1.344
isFemale,0.2535,0.071,3.577,0.000,0.115,0.392
sei10,0.0245,0.002,14.575,0.000,0.021,0.028
strongPartyIdentify,1.1022,0.101,10.946,0.000,0.905,1.300
isIndependent,-0.9464,0.087,-10.821,0.000,-1.118,-0.775
isHappy,0.5246,0.096,5.448,0.000,0.336,0.713
