In [1]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

In [2]:
# Load the CPS extract and normalise every column name to lower case so later
# cells can use attribute-style access (cps.month, cps.voted, ...).
cps = pd.read_csv("data/cps.csv.gz").rename(columns=str.lower)

In [3]:
# Show the (lower-cased) column names available in the extract.
cps.columns

Index(['year', 'serial', 'month', 'hwtfinl', 'cpsid', 'asecflag', 'asecwth',
       'statefip', 'pernum', 'wtfinl', 'cpsidp', 'asecwt', 'age', 'sex',
       'race', 'marst', 'citizen', 'hispan', 'empstat', 'educ', 'inctot',
       'adjginc', 'spmtotres', 'spmthresh', 'spmfamunit', 'voted', 'vosuppwt'],
      dtype='object')

In [4]:
# Boolean target: True only where `voted` equals 2; every other code
# (including missing values) becomes False.
# NOTE(review): presumably 2 is the "voted" code in this extract - confirm
# against the codebook.
cps["vote"] = cps["voted"].eq(2)

In [5]:
# Split the extract by survey month: March rows are scored, November rows are
# used to train the vote model.
TRAIN_FRAC = 0.1  # Set to 1.0 for final analysis, will take a few minutes.
SAMPLE_SEED = 0  # Fixed seed so Restart & Run All draws the same training rows.

p = cps[cps.month == 3].copy()  # ASEC person file
train = (
    cps[cps.month == 11]
    .sample(frac=TRAIN_FRAC, random_state=SAMPLE_SEED)
    .copy()
)
# Derive the boolean label on the training subset itself so this cell does not
# silently depend on the column having been added to `cps` beforehand.
train["vote"] = train["voted"] == 2

In [6]:
# Predictor columns used both to fit on the November sample and to score the
# March rows.
XCOLS = [
    "statefip",
    "age",
    "sex",
    "race",
    "marst",
    "citizen",
    "hispan",
    "empstat",
    "educ",
]
RF_SEED = 0  # Fixed seed so the fitted forest and its predictions are reproducible.

# Fit a random forest on the boolean `vote` label, weighting each training row
# by `vosuppwt`. The weight is passed by keyword so it cannot be mistaken for
# another positional argument.
rf = RandomForestClassifier(random_state=RF_SEED).fit(
    train[XCOLS],
    train.vote,
    sample_weight=train.vosuppwt,
)

In [7]:
# Class probabilities for every March row, one column per fitted class.
preds = rf.predict_proba(p.loc[:, XCOLS])
# Keep the second column as the predicted vote probability.
# NOTE(review): assumes the fitted classes are [False, True] so that column 1
# is P(vote) - confirm via rf.classes_.
p["pred_vote"] = preds[:, 1]

In [8]:
# Unweighted share of November respondents flagged as voters, per survey year.
cps.loc[cps["month"] == 11].groupby("year")["vote"].mean()

year
2018    0.393803
2020    0.492427
Name: vote, dtype: float64

In [9]:
# Unweighted average predicted vote probability across all March rows.
p["pred_vote"].mean()

0.4152820880886519

In [10]:
# Eyeball ten random rows of key predictors next to the model's prediction.
p.loc[:, ["age", "educ", "citizen", "pred_vote"]].sample(n=10)

Unnamed: 0,age,educ,citizen,pred_vote
220004,29,81,1,0.62
171641,3,1,1,0.0
185416,25,81,5,0.0
277304,8,1,1,0.0
205529,76,73,1,0.873601
270507,40,111,1,0.70775
242095,73,111,1,0.93
123621,70,111,1,0.95
155985,11,1,1,0.0
257704,1,1,1,0.0


In [11]:
# Sanity check: 18-year-olds should receive non-zero predicted vote probabilities.
p.loc[p["age"] == 18, ["age", "educ", "citizen", "pred_vote"]]

Unnamed: 0,age,educ,citizen,pred_vote
122777,18,71,1,0.560000
122780,18,71,1,0.142513
122810,18,81,1,0.710000
122831,18,60,5,0.020000
122893,18,50,1,0.294305
...,...,...,...,...
280385,18,81,1,0.530000
280463,18,60,5,0.020000
280477,18,81,1,0.080000
280536,18,73,1,0.280000


In [12]:
# Rank the predictors by the fitted forest's feature importances, largest first.
(
    pd.Series(dict(zip(XCOLS, rf.feature_importances_)))
    .sort_values(ascending=False)
)

age         0.307478
statefip    0.209098
educ        0.193812
citizen     0.083327
marst       0.061576
empstat     0.061297
race        0.033163
hispan      0.025690
sex         0.024559
dtype: float64

In [13]:
# Write the scored March rows (including pred_vote) to a gzipped CSV,
# omitting the DataFrame index.
p.to_csv(
    "data/asec_vote_prob.csv.gz",
    compression="gzip",
    index=False,
)

* Add FAMINC
* Add wage
* Add 2018-2019 ASEC
* Share of population comes out ahead
* Share of eligible adult citizens
* Share of predicted voters
* Share of predicted voters by state (slice by 2022)
* Each for a 2x2 of policy designs: include kids vs. include non-citizens
* Add context of other polls and poverty/inequality effects
