In [19]:
###### Build VRA statistical models ######
###### This cell defines, but does not run, the modeling functions. ######
###### It does not produce any output, so it might not seem to do anything. ######
###### You will specify model parameters and run these functions later. ######

import pandas as pd, statsmodels.api as sm
# Local directory that model_votes searches first for the VRA csv files.
# When the local read fails, model_votes falls back to its Google Drive URLs.
path = '/home/jupyter/redistricting_data/'

###### Helper functions ######
def listify(x):
    """Normalize x for list-style use.

    None becomes an empty list, a lone string is wrapped in a one-element
    list, and any other value is handed back untouched (same object).
    """
    if x is None:
        return []
    if isinstance(x, str):
        return [x]
    return x

def check(x, valid):
    """Assert that x (one string or a collection of values) only holds members of valid.

    Raises AssertionError naming the offending values and the allowed ones.
    """
    given = set(listify(x))
    bad = given.difference(valid)
    assert not bad,  f'unknown {bad} ... must be one of {valid}'

###### Define function that builds weighted least squares model  ######
def model_votes(level, metric, election, races, predictors=None, interactions=None, const=True, weight='vap_pop'):
    """Fit a weighted least squares model of an election result on demographics.

    The VRA csv for `level` is read from the local `path` directory; if that
    fails for any reason, it is downloaded from Google Drive instead.

    Parameters
    ----------
    level : str
        Geographic unit of the rows: 'county' or 'cntyvtd'.
    metric : str
        Outcome to model: 'red_pct' or 'red_blue_gap'.
    election : str
        Election prefix such as 'Pres_20'; valid values are derived from the
        csv columns ending in '_red_pct'.
    races : str or list of str
        Any of 'hisp', 'black', 'white'. Each one adds the csv column
        'vap_<race>_pct' as a predictor, renamed to the short race name.
    predictors : str or list of str, optional
        Additional csv columns to use as predictors.
    interactions : iterable of (a, b) pairs, optional
        Each pair adds a column named a+b holding the product of columns a and b.
        Names refer to the columns after renaming, so races are 'hisp', 'black', ...
    const : bool
        Whether to add a constant term. Forced to False when all three races
        are included, to prevent colinearity.
    weight : str
        csv column used as the WLS weights.

    Returns
    -------
    The fitted statsmodels results object (its summary is also printed).

    Raises
    ------
    AssertionError
        If level, election, metric or races holds an unknown value.
    """
    # Check valid inputs
    valid_levels = {'county', 'cntyvtd'}
    check(level, valid_levels)
    try:
        file = path + f'vra_{level}.csv'
        df = pd.read_csv(file)
        print(f'using local {file}')
    except Exception:
        # Best-effort fallback to the shared copy. Catching Exception (rather than a
        # bare except) lets KeyboardInterrupt / SystemExit propagate as usual.
        if level == 'county':
            url = 'https://drive.google.com/file/d/143OH38F_fTqSnwwTBkkGsniBSeo8by18/view?usp=sharing'
        else:
            url = 'https://drive.google.com/file/d/149IB9m4YKcgrleAJTd44yJiu-7pHh0-n/view?usp=sharing'
        # turn the "view" link into a direct-download link using the file id
        url = 'https://drive.google.com/uc?id=' + url.split('/')[-2]
        df = pd.read_csv(url)
        print(f'using remote {url}')
    display(df.sample(n=3))

    # every "<election>_red_pct" column names one available election
    suffix = '_red_pct'
    valid_elections = {c[:-len(suffix)] for c in df.columns if c.endswith(suffix)}
    valid_metrics   = {'red_pct', 'red_blue_gap'}
    valid_races     = {'hisp', 'black', 'white'}

    # check() accepts a single string, so normalize races the same way; otherwise
    # a string such as 'hisp' would be iterated character by character below
    races = listify(races)

    check(election, valid_elections)
    check(metric  , valid_metrics)
    check(races   , valid_races)
    # if all races included, disable constant term to prevent colinearity
    if valid_races.issubset(races):
        const = False

    target = f'{election}_{metric}'
    # map csv column name -> short race name, used for renaming below
    races = {f'vap_{r}_pct': r for r in races}
    predictors = list(races.keys()) + listify(predictors)
    # cols is built before 'const' is appended because 'const' is not a csv column
    cols = [target, weight] + predictors
    if const:
        predictors.append('const')
    print(f'target column = {target}\nweight column = {weight}\npredictors = {predictors}')

    # get columns we need and rename for convenience
    X = df[cols].rename(columns=races)

    # create interaction columns
    for a, b in listify(interactions):
        X[a+b] = X[a] * X[b]

    # create constant column
    if const:
        X['const'] = 1.0

    # drop rows with missing values - typically small vtds with no recorded votes in this election
    X = X.dropna()

    # pop the target and weight columns from X
    y = X.pop(target)
    w = X.pop(weight)

    # create WLS model
    mod_sm = sm.WLS(y, X, w)
    res = mod_sm.fit()
    print(res.summary())
    return res

using local /home/jupyter/redistricting_data/vra_cntyvtd.csv


Unnamed: 0,cntyvtd,county,vap_pop,density,vap_hisp_pct,vap_black_pct,vap_white_pct,dist_border,Pres_20_red_pct,Pres_16_red_pct,...,Sen_12_Sadler,Sen_12_Cruz,Pres_20_votes,Pres_16_votes,Pres_12_votes,Sen_20_votes,Sen_18_votes,Sen_14_votes,Sen_12_votes,aland
0,25000018,Bee,1852,42.830818,46.00432,3.023758,50.971922,117.849478,73.809524,74.139729,...,268,682,1134,959,994,1100,970,639,950,43.239893
1,25000013,Bee,1577,10.903907,62.396956,0.443881,37.159163,111.368878,65.120968,59.210526,...,273,412,992,760,712,930,655,445,685,144.627052
2,25000009,Bee,764,723.040307,88.350785,1.701571,9.947644,119.880787,42.95302,27.272727,...,273,87,447,429,387,411,334,288,360,1.056649
3,25000002,Bee,213,3.639055,33.333333,1.877934,64.788732,112.012345,90.10989,91.351351,...,25,129,182,185,165,176,167,119,154,58.531673
4,25000008,Bee,210,10.53955,36.190476,0.0,63.809524,120.219405,80.246914,73.943662,...,37,96,162,142,134,157,123,105,133,19.924949


target column = Pres_20_red_blue_gap
weight column = vap_pop
predictors = ['vap_hisp_pct', 'vap_black_pct', 'vap_white_pct', 'density', 'dist_border']
                             WLS Regression Results                             
Dep. Variable:     Pres_20_red_blue_gap   R-squared:                       0.684
Model:                              WLS   Adj. R-squared:                  0.684
Method:                   Least Squares   F-statistic:                     3773.
Date:                  Sun, 10 Oct 2021   Prob (F-statistic):               0.00
Time:                          17:46:17   Log-Likelihood:                -42403.
No. Observations:                  8716   AIC:                         8.482e+04
Df Residuals:                      8710   BIC:                         8.486e+04
Df Model:                             5                                         
Covariance Type:              nonrobust                                         
                  coef    std err      

In [None]:
###### Specify model options and call function above to generate it ######
###### You may create many copies of this cell to try different model configurations ######
# Keyword arguments forwarded to model_votes.
# NOTE: all three races are listed here, so model_votes switches the constant
# term off even though const=True is requested.
opts = dict(
    level='cntyvtd',
    metric='red_blue_gap',
    election='Pres_20',
    const=True,
    races=[
        'hisp',
        'black',
        'white',
    ],
    predictors=[
        'density',
        'dist_border',
    ],
    interactions=[
        ['hisp', 'black'],
    ],
)
res = model_votes(**opts)

In [None]:
###### Specify model options and call function above to generate it ######
###### You may create many copies of this cell to try different model configurations ######
# Keyword arguments forwarded to model_votes.
# 'white' is left out of races, so the requested constant term is kept.
opts = dict(
    level='cntyvtd',
    metric='red_blue_gap',
    election='Pres_20',
    const=True,
    races=[
        'hisp',
        'black',
        # 'white',
    ],
    predictors=[
        'density',
        'dist_border',
    ],
    interactions=[
        ['hisp', 'black'],
    ],
)
res = model_votes(**opts)

In [8]:
###### Generate Raw Data - Users cannot run this unless they have access to the source table in BigQuery ######

%load_ext autoreload
%autoreload
%cd /home/jupyter/MathVGerrmandering_CMAT_2021/
from src import *
# Name of the source BigQuery table that get_vra aggregates.
# NOTE(review): data_bq is not defined in this cell - presumably it is supplied by
# the `from src import *` above; confirm.
src_tbl = f'{data_bq}.TX_2020_cd_planc2100_cntyvtd_0_nodes'
# Column names of the source table; get_vra searches them for the vote and
# population columns it needs.
# NOTE(review): get_cols is presumably a src helper returning column-name strings - confirm.
src_cols = get_cols(src_tbl)

def get_vra(level):
    """Build the SQL query that produces the VRA table for one geographic level.

    The query is assembled in three nested stages, each wrapping the previous one:
      1. aggregate votes, voting-age population, land area and geometry per `level`;
      2. add vap_pop (hisp + black + white) and the per-election vote totals;
      3. derive density, race percentages, distance to the Mexico polygon and the
         per-election red_pct / red_blue_gap metrics, keeping rows with vap_pop > 0.

    Parameters
    ----------
    level : str
        'county' or 'cntyvtd'; selects the grouping columns.

    Returns
    -------
    str
        The SQL text of the final (third) stage. Nothing is executed here.

    Raises
    ------
    Exception
        If level is neither 'county' nor 'cntyvtd'.
    IndexError
        If src_cols has no column matching one of the requested elections/parties
        (raised by .pop() on an empty list inside f).

    Relies on module-level names src_cols, src_tbl, data_bq, m_per_mi, join_str and
    subquery, none of which are defined in this function.
    NOTE(review): join_str and subquery are presumably SQL-formatting helpers from
    src - confirm their exact output.
    """
    # grouping / label columns for the chosen level
    if level == 'county':
        labels = 'county'
    elif level == 'cntyvtd':
        labels = 'cntyvtd, county'
    else:
        raise Exception(f'invalid level {level}')

    # SQL fragments filled in by f() below:
    #   sels  - summed vote count per candidate column (stage 1)
    #   votes - D + R vote total per election (stage 2)
    #   diff  - red_blue_gap per election (stage 3)
    #   red   - red_pct per election (stage 3)
    sels = []
    votes = []
    diff = []
    red = []
    def f(a, b):
        """Append the SQL fragments for one election.

        a is the election prefix searched for in the source column names
        (e.g. 'President_2020'); b is the short prefix used for the output
        column names (e.g. 'Pres_20').
        """
        short = dict()
        for p in ['D', 'R']:
            # last source column whose name contains '<a>_<p>'
            col = [x for x in src_cols if f'{a}_{p}' in x].pop()
            # fourth underscore-separated piece of the column name
            # (presumably the candidate's name, e.g. 'Cruz' - confirm)
            nm = col.split('_')[3]
            short[p] = f'{b}_{nm}'
            sels.append(f'cast(sum({col}) as int) as {short[p]}')
        votes.append(f'{short["D"]} + {short["R"]} as {b}_votes')
        # both metrics are Null when the election has no recorded votes, avoiding division by zero
        diff.append(f'case when {b}_votes > 0 then ({short["R"]} - {short["D"]}) / {b}_votes * 100 else Null end as {b}_red_blue_gap')
        red .append(f'case when {b}_votes > 0 then  {short["R"]} / {b}_votes * 100 else Null end as {b}_red_pct')

    # presidential elections
    for yr in [2020, 2016, 2012]:
        a = f'President_{yr}'
        b = f'Pres_{yr%100}'
        f(a, b)

    # US Senate elections
    for yr in [2020, 2018, 2014, 2012]:
        a = f'USSen_{yr}'
        b = f'Sen_{yr%100}'
        f(a, b)

    # Stage 1: aggregate per label. vap_black sums every source column whose name
    # contains both 'vap_nonhisp' and 'black'; vap_white sums those containing
    # 'vap_nonhisp' and 'white' but not 'black'. aland is divided by m_per_mi**2
    # (presumably square meters -> square miles - confirm).
    query = []
    query.append(f"""
select
    {labels},
    {join_str().join(sels)},
    sum(vap_hisp_pop) as vap_hisp,
    {' + '.join([f'sum({x})' for x in src_cols if 'vap_nonhisp' in x and 'black' in x])} as vap_black,
    {' + '.join([f'sum({x})' for x in src_cols if 'vap_nonhisp' in x and 'white' in x and 'black' not in x])} as vap_white,
    sum(aland) / {m_per_mi**2} as aland,
    st_union_agg(polygon) as polygon,
from
    {src_tbl}
group by
    {labels}
""")
    
    # Stage 2: vap_pop counts only the hisp, black and white groups, so the three
    # percentages computed in stage 3 add up to 100.
    query.append(f"""
select
    *,
    vap_hisp + vap_black + vap_white as vap_pop,
    {join_str().join(votes)},
from (
    {subquery(query[-1])}
    )
""")

    # Stage 3: derived metrics; the `* except (...)` keeps the remaining stage-2
    # columns (candidate vote counts and vote totals) after the explicitly listed ones.
    query.append(f"""
select
    {labels},
    vap_pop,
    vap_pop / aland as density,
    vap_hisp  / vap_pop * 100 as vap_hisp_pct,
    vap_black / vap_pop * 100 as vap_black_pct,
    vap_white / vap_pop * 100 as vap_white_pct,
    st_distance(polygon, (select polygon from {data_bq}.countries where country = 'Mexico')) / {m_per_mi} as dist_border,
    {join_str().join(red)},
    {join_str().join(diff)},
    vap_hisp,
    vap_black,
    vap_white,
    * except ({labels}, vap_hisp, vap_black, vap_white, vap_pop, aland, polygon),
    aland,
    polygon,
from (
    {subquery(query[-1])}
    )
where vap_pop > 0
""")
    # only the outermost query is returned; it embeds the earlier stages
    return query[-1]
    
# Export the VRA table of each geographic level to a csv file under data_path.
for level in ['county', 'cntyvtd']:
    print(level)
    # SQL for this level; it is only consumed by the disabled load_table call below
    query = get_vra(level)
    targ_tbl = f'{root_bq}.VRA.{level}'
    # Disabled step: presumably (re)builds targ_tbl from query - TODO confirm before re-enabling
#     load_table(tbl=targ_tbl, query=query)
    # read the existing table, leaving out the polygon column
    df = run_query(f'select * except (polygon) from {targ_tbl}')
    # NOTE: this module-level `f` is a file path, unrelated to the helper `f` nested inside get_vra
    f = data_path / f'vra_{level}.csv'
    # create the destination directory if it does not exist yet
    f.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(f, index=False)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/home/jupyter/MathVGerrmandering_CMAT_2021
county
cntyvtd
