In [1]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

In [2]:
# Read the gzipped CPS extract and normalise every column label to lowercase
# in one chained expression.
cps = pd.read_csv("data/cps.csv.gz").rename(columns=str.lower)

In [3]:
# Display the (lowercased) column labels of `cps` to see which variables are available.
cps.columns

Index(['year', 'serial', 'month', 'hwtfinl', 'cpsid', 'asecflag', 'asecwth',
       'statefip', 'metro', 'cbsasz', 'faminc', 'pernum', 'wtfinl', 'cpsidp',
       'asecwt', 'age', 'sex', 'race', 'marst', 'citizen', 'hispan', 'empstat',
       'educ', 'diffany', 'inctot', 'adjginc', 'spmtotres', 'spmthresh',
       'spmfamunit', 'hourwage', 'voted', 'vosuppwt'],
      dtype='object')

In [4]:
# Boolean label: True where the `voted` code equals 2, False for every other
# code (missing values also compare False).
# NOTE(review): presumably code 2 is the "voted" response in the CPS voting
# supplement coding -- confirm against the codebook.
cps["vote"] = cps["voted"].eq(2)

In [5]:
TRAIN_FRAC = 0.1  # Set to 1.0 for final analysis, will take a few minutes.
SAMPLE_SEED = 0  # Fixed so the training subsample is identical on every run.

p = cps[cps.month == 3].copy()  # ASEC person file
# Training rows are drawn from the month == 11 records. Seeding the draw keeps
# the fitted model and every downstream number reproducible under
# Restart Kernel -> Run All.
train = (
    cps[cps.month == 11]
    .sample(frac=TRAIN_FRAC, random_state=SAMPLE_SEED)
    .copy()
)
# Derive the label on the sample itself so this cell does not rely on `cps`
# already carrying a `vote` column.
train["vote"] = train.voted == 2

In [6]:
# Predictor columns; the same list is used to select features from both the
# training sample and the frame that gets scored.
XCOLS = [
    "statefip",
    "age",
    "sex",
    "race",
    "marst",
    "citizen",
    "hispan",
    "empstat",
    "educ",
    "hourwage",
    "diffany",
    "metro",
    "cbsasz",
    "faminc",
]
RF_SEED = 0  # Fixed so bootstrap draws and feature subsampling repeat across runs.

# Fit a random forest on the boolean `vote` label, weighting each row by
# `vosuppwt`. The weights are passed by keyword so they cannot be mistaken for
# another positional argument; n_jobs=-1 only parallelises tree building and
# does not change the result once random_state is fixed.
rf = RandomForestClassifier(random_state=RF_SEED, n_jobs=-1).fit(
    train[XCOLS], train.vote, sample_weight=train.vosuppwt
)

In [7]:
# Score every row of `p` with the fitted forest; `preds` has one column per
# class, ordered as in `rf.classes_`.
preds = rf.predict_proba(p[XCOLS])
# Look up the column that corresponds to vote == True instead of assuming it is
# column 1, so a differently ordered (or single-class) fit cannot silently
# store the wrong probability.
voted_col = list(rf.classes_).index(True)
p["pred_vote"] = preds[:, voted_col]

In [8]:
# Unweighted share of month == 11 rows with vote == True, per survey year.
# Used as a benchmark for the mean predicted probability.
cps.loc[cps["month"].eq(11)].groupby("year")["vote"].mean()

year
2018    0.393803
2020    0.492427
Name: vote, dtype: float64

In [9]:
# Unweighted mean of the predicted voting probability across all rows of `p`.
p["pred_vote"].mean()

0.4040320349474658

In [10]:
# Spot-check ten predictions next to a few key predictors. The draw is seeded
# so the displayed rows are the same on every run.
p[["age", "educ", "citizen", "pred_vote"]].sample(10, random_state=0)

Unnamed: 0,age,educ,citizen,pred_vote
631803,80,125,4,0.7
8774,50,73,5,0.07
615815,34,73,1,0.32
327080,65,111,1,0.57
134171,4,1,1,0.0
143555,73,91,1,0.76
587071,21,81,1,0.25
456416,23,73,1,0.3
166228,26,91,1,0.63
397446,51,92,1,0.944221


In [11]:
# Sanity check: rows with age == 18 should receive a predicted probability
# above zero.
p.loc[p["age"].eq(18), ["age", "educ", "citizen", "pred_vote"]]

Unnamed: 0,age,educ,citizen,pred_vote
10,18,30,1,0.18
30,18,60,1,0.16
269,18,60,1,0.11
274,18,71,1,0.12
672,18,60,1,0.16
...,...,...,...,...
640570,18,81,1,0.45
640648,18,60,5,0.07
640662,18,81,1,0.29
640721,18,73,1,0.41


In [12]:
# Label the forest's feature importances with their column names and list them
# from most to least important.
feature_importance = pd.Series(rf.feature_importances_, index=XCOLS)
feature_importance.sort_values(ascending=False)

age         0.205533
educ        0.160152
statefip    0.112634
faminc      0.090158
citizen     0.076154
marst       0.068828
empstat     0.062846
cbsasz      0.055392
diffany     0.043371
metro       0.035541
race        0.027853
sex         0.022504
hispan      0.021843
hourwage    0.017190
dtype: float64

In [13]:
# Write `p`, including the new `pred_vote` column, to a gzipped CSV without
# the row index.
p.to_csv(
    "data/asec_vote_prob.csv.gz",
    compression="gzip",
    index=False,
)

* Share of population that comes out ahead
* Share of eligible adult citizens
* Share of predicted voters
* Share of predicted voters by state (slice by 2022)
* Each for a 2x2 of policy designs: include kids vs. include non-citizens
* Add context of other polls and poverty/inequality effects
