In [1]:
###### Build VRA statistical models ######
###### This cell defines, but does not run, the modeling functions. ######
###### It does not produce any output, so it might not seem to do anything. ######
###### You will specify model parameters and run these functions later. ######

import numpy as np, pandas as pd, statsmodels.api as sm
# read_data first tries to read the VRA csv files from the local directory below.
# If that fails, it falls back to the Google Drive URLs hard-coded in read_data.
# Keep the trailing slash: read_data builds file names by plain string concatenation.
path = '/home/jupyter/redistricting_data/'

###### Helper functions ######
def listify(x):
    """Return ``x`` as a list.

    * ``None`` becomes an empty list.
    * A string (or bytes) becomes a one-element list; it is never split
      into characters.
    * A list is returned unchanged (the same object, not a copy).
    * Any other iterable (tuple, set, dict, pandas Index, ...) is copied into
      a new list, so the result can always be concatenated with ``+``.
    * A non-iterable value is wrapped in a one-element list.
    """
    if x is None:
        return []
    if isinstance(x, (str, bytes)):
        return [x]
    if isinstance(x, list):
        return x
    try:
        return list(x)
    except TypeError:
        # not iterable -> treat it as a single value
        return [x]

def check(x, valid):
    """Assert that x holds only allowed values.

    x is normalised with listify, so it may be None, a single string or a
    collection.  Any value not found in valid triggers an AssertionError
    whose message lists the offending values and the allowed ones.
    """
    candidates = listify(x)
    bad = set(candidates).difference(valid)
    assert not bad, f'unknown {bad} ... must be one of {valid}'

def read_data(level):
    """Load the VRA table for ``level`` ('county' or 'cntyvtd') as a DataFrame.

    The local file ``path + 'vra_{level}.csv'`` is tried first.  If reading it
    fails for any ordinary reason, the copy shared on Google Drive is
    downloaded instead.  A line is printed saying which source was used.

    Raises AssertionError (via ``check``) when ``level`` is not a valid level;
    errors from the remote read propagate to the caller.
    """
    valid_levels = {'county', 'cntyvtd'}
    check(level, valid_levels)
    try:
        file = path + f'vra_{level}.csv'
        df = pd.read_csv(file)
    except Exception:
        # Deliberate best-effort fallback.  ``Exception`` rather than a bare
        # ``except`` so that KeyboardInterrupt / SystemExit still propagate.
        if level == 'county':
            url = 'https://drive.google.com/file/d/143OH38F_fTqSnwwTBkkGsniBSeo8by18/view?usp=sharing'
        else:
            url = 'https://drive.google.com/file/d/149IB9m4YKcgrleAJTd44yJiu-7pHh0-n/view?usp=sharing'
        # the file id is the second-to-last path component of the sharing link;
        # rebuild it as a direct-download link
        url = 'https://drive.google.com/uc?id=' + url.split('/')[-2]
        df = pd.read_csv(url)
        print(f'using remote {url}')
    else:
        print(f'using local {file}')
    return df
    
    
###### Define function that builds weighted least squares model  ######
def model_votes(level, metric, election, races, predictors=None, interactions=None, const=True, weight='vap_pop'):
    """Fit a weighted least squares model of an election metric.

    Parameters
    ----------
    level : 'county' or 'cntyvtd'; which VRA table read_data loads.
    metric : 'red_pct' or 'red_blue_gap'; the target column is
        ``f'{election}_{metric}'``.
    election : election prefix such as 'Pres_20'; must match a
        ``*_red_pct`` column of the table.
    races : one race label or a collection of them ('hisp', 'black',
        'white').  Each adds the column ``vap_{race}_pct`` as a predictor; in
        the design matrix the column is renamed to the bare race label.
    predictors : optional extra column name(s) to use as predictors.
    interactions : optional collection of pairs ``(a, b)`` naming design
        matrix columns (race columns by their bare label, e.g.
        ``['hisp', 'black']``).  Each pair adds a column named ``a + b``
        holding the product of the two.
    const : whether to add a constant column.  Forced to False when all three
        races are included, to prevent collinearity.
    weight : column used as the WLS weights.

    Returns
    -------
    (df, mod, res) : the full table that was read, the statsmodels WLS model
    and its fitted results.  The regression summary is also printed.

    Raises AssertionError (via ``check``) for unknown election, metric or races.
    """
    df = read_data(level)
    suffix = '_red_pct'
    # match on the suffix so the slice below always removes exactly '_red_pct'
    valid_elections = {x[:-len(suffix)] for x in df.columns if x.endswith(suffix)}
    valid_metrics   = {'red_pct', 'red_blue_gap'}
    valid_races     = {'hisp', 'black', 'white'}

    check(election, valid_elections)
    check(metric  , valid_metrics)
    check(races   , valid_races)
    # accept a single race given as a plain string (or None) without iterating
    # over its characters below
    races = listify(races)
    # if all races included, disable constant term to prevent collinearity
    if valid_races.issubset(races):
        const = False

    target = f'{election}_{metric}'
    race_cols = {f'vap_{r}_pct': r for r in races}
    predictors = list(race_cols) + listify(predictors)
    cols = [target, weight] + predictors
    if const:
        predictors.append('const')
    print(f'target column = {target}\nweight column = {weight}\npredictors = {predictors}')

    # get columns we need and rename the race columns to their short labels
    X = df[cols].rename(columns=race_cols)

    # create interaction columns (names refer to the renamed columns)
    for a, b in listify(interactions):
        X[a+b] = X[a] * X[b]

    # create constant column; 100.0 rather than 1 - presumably to match the
    # 0-100 scale of the percentage columns (TODO confirm)
    if const:
        X['const'] = 100.0

    # drop rows with missing values - typically small vtds with no recorded votes in this election
    X = X.dropna()

    # pop the target and weight columns from X
    y = X.pop(target)
    w = X.pop(weight)

    # create WLS model
    mod = sm.WLS(y, X, weights=w)
    res = mod.fit()
    print(res.summary())
    return df, mod, res

In [37]:
# NOTE(review): leftover IPython help lookup; "chloropleth" is a misspelling of
# "choropleth", which is why the saved output below reports the object as not found.
px.chloropleth_mapbox?

Object `px.chloropleth_mapbox` not found.


In [29]:
# race labels; prep_data uses them to pick the vap_{race} columns and to build
# the {election}_{race}_votes estimates
races = ['white', 'black', 'hisp']
# party colour labels; prep_data builds {election}_{party}_votes for each
parties = ['red', 'blue']

def prep_data(level='cntyvtd', election='Pres_20'):
    """Load one VRA table with geometry and derive per-party / per-race vote columns.

    Parameters
    ----------
    level : 'cntyvtd' keeps the 'cntyvtd' label column; any other value is
        treated as county level.  It is also the table name suffix.
    election : election prefix such as 'Pres_20'; the table must contain
        ``{election}_votes`` and ``{election}_red_pct``.

    Returns
    -------
    GeoDataFrame holding the label columns, geometry, vap_pop, vap_{race} and,
    for the chosen election: votes, red_pct, blue_pct (= 100 - red_pct),
    {party}_votes, vote_rate (= votes / vap_pop) and {race}_votes
    (= vap_{race} * vote_rate, i.e. the same turnout rate is applied to every
    race).

    Relies on run_query, gpd and crs_census being available in the notebook
    namespace.
    """
    tbl = f'cmat-315920.VRA.{level}'
    df = run_query(f'select * from {tbl}')
    # simplify tolerance trade-off (author's note): little white space @ .001 ~5.7 mb,
    # minimal at .0001 ~10mb, with no white space ~37mb
    geo = gpd.GeoSeries.from_wkt(df['polygon'], crs=crs_census).simplify(0.003).buffer(0)
    df = gpd.GeoDataFrame(df.drop(columns='polygon'), geometry=geo).reset_index()
    if level == 'cntyvtd':
        cols = ['cntyvtd', 'county', 'geometry', 'vap_pop']
    else:
        cols = [           'county', 'geometry', 'vap_pop']
    cols += [f'vap_{r}' for r in races]
    cols += [f'{election}_votes', f'{election}_red_pct']
    # explicit copy: the assignments below must write to an independent frame,
    # not to a slice of the full table (avoids SettingWithCopyWarning)
    df = df[cols].copy()
    df[f'{election}_blue_pct'] = 100 - df[f'{election}_red_pct']
    for party in parties:
        df[f'{election}_{party}_votes'] = df[f'{election}_votes'] * df[f'{election}_{party}_pct'] / 100
    df[f'{election}_vote_rate'] = df[f'{election}_votes'] / df['vap_pop']
    for race in races:
        df[f'{election}_{race}_votes'] = df[f'vap_{race}'] * df[f'{election}_vote_rate']
    return df

def hisp_red_pct(df, white_red_pct=100, black_red_pct=10):
    """Derive the Hispanic red share implied by assumed white and black red shares.

    The election prefix is taken from the first column whose name contains
    'votes' (everything before its last underscore).  White and black red votes
    are computed from the assumed percentages, the remaining red votes are
    attributed to Hispanic voters, and the resulting share is floored at 0 and
    expressed in percent.  The new columns are added to df in place and the
    same df is returned.
    """
    first_votes_col = [name for name in df.columns if 'votes' in name][0]
    election = first_votes_col[:first_votes_col.rfind('_')]

    def col(suffix):
        # full column name for this election
        return f'{election}_{suffix}'

    assumed = {'white': white_red_pct, 'black': black_red_pct}

    # red votes implied by the assumed shares
    for race, pct in assumed.items():
        df[col(f'{race}_red_votes')] = df[col(f'{race}_votes')] * pct / 100

    # whatever red votes remain are assigned to Hispanic voters (may be negative)
    df[col('hisp_red_votes')] = df[col('red_votes')] - df[col('white_red_votes')] - df[col('black_red_votes')]

    # record the assumptions next to the result
    for race, pct in assumed.items():
        df[col(f'{race}_red_pct')] = pct

    hisp_share = df[col('hisp_red_votes')] / df[col('hisp_votes')]
    df[col('hisp_red_pct')] = np.maximum(hisp_share, 0) * 100
    return df

In [40]:
# NOTE(review): import sits in the middle of the notebook; earlier-positioned cells
# that use px only work if this cell has already been run.
import plotly_express as px
# scratch: px's gapminder data, only inspected with g.head(3) in another cell
g = px.data.gapminder()
# scratch: IPython help lookup
px.choropleth_mapbox?

Signature:
px.choropleth_mapbox(
    data_frame=None,
    geojson=None,
    featureidkey=None,
    locations=None,
    color=None,
    hover_name=None,
    hover_data=None,
    custom_data=None,
    animation_frame=None,
    animation_group=None,
    category_orders=None

In [35]:
# NOTE(review): scratch cell; g.head(3) is not the last expression, so only the
# help text for px.scatter appears in the saved output.
g.head(3)
px.scatter?

Signature:
px.scatter(
    data_frame=None,
    x=None,
    y=None,
    color=None,
    symbol=None,
    size=None,
    hover_name=None,
    hover_data=None,
    custom_data=None,
    text=None,
    facet_row=None,
    facet_col

In [30]:
# Build the derived table for both levels.
# NOTE(review): df is overwritten on every pass and the to_csv line is commented out,
# so only the last level ('cntyvtd') is still available after the loop.
for level in ['county', 'cntyvtd']:    
    df = prep_data(level=level)
    df = hisp_red_pct(df)
    # df.to_csv(f'{path}/VRA_min_hisp_red_pct_{level}.csv')
df.head(3)

Unnamed: 0,cntyvtd,county,geometry,vap_pop,vap_white,vap_black,vap_hisp,Pres_20_votes,Pres_20_red_pct,Pres_20_blue_pct,Pres_20_red_votes,Pres_20_blue_votes,Pres_20_vote_rate,Pres_20_white_votes,Pres_20_black_votes,Pres_20_hisp_votes,Pres_20_white_red_votes,Pres_20_black_red_votes,Pres_20_hisp_red_votes,Pres_20_white_red_pct,Pres_20_black_red_pct,Pres_20_hisp_red_pct
0,25000018,Bee,"POLYGON ((-97.87296 28.50344, -97.86470 28.520...",1852,944,56,852,1134,73.809524,26.190476,837.0,297.0,0.612311,578.021598,34.289417,521.688985,578.021598,3.428942,255.54946,100,10,48.985021
1,25000013,Bee,"POLYGON ((-97.98479 28.48571, -97.88110 28.518...",1577,586,7,984,992,65.120968,34.879032,646.0,346.0,0.629042,368.618897,4.403297,618.977806,368.618897,0.44033,276.940774,100,10,44.741632
2,25000009,Bee,"POLYGON ((-97.76164 28.40223, -97.75104 28.400...",764,76,13,675,447,42.95302,57.04698,192.0,255.0,0.585079,44.465969,7.606021,394.92801,44.465969,0.760602,146.773429,100,10,37.164604


In [31]:
# NOTE(review): incomplete expression (a syntax error if run); the saved output below
# cannot have come from this line - the cell was edited after it was last executed.
px.

geopandas.geodataframe.GeoDataFrame

In [15]:
# NOTE(review): hisp_redness is not defined in the visible notebook; the similar function
# defined above is hisp_red_pct(df, ...), which takes a DataFrame rather than an
# election name. This cell looks stale - confirm before re-running.
df = hisp_redness(election='Sen_18')
# 42.953, 88.548
# df.to_csv(f'{path}/VRA_min_hisp_red_pct_{level}.csv')
df.head(3)

using local /home/jupyter/redistricting_data/vra_cntyvtd.csv


Unnamed: 0,cntyvtd,county,vap_pop,vap_white,vap_black,vap_hisp,Sen_18_votes,Sen_18_red_pct,Sen_18_blue_pct,Sen_18_red_votes,...,red_votes_cum,Sen_18_white_votes,Sen_18_black_votes,Sen_18_hisp_votes,Sen_18_white_red_votes,Sen_18_black_red_votes,Sen_18_hisp_red_votes,Sen_18_white_red_pct,Sen_18_black_red_pct,Sen_18_hisp_red_pct
0,25000018,Bee,1852,944,56,852,970,72.783505,27.216495,706.0,...,497.360691,494.427646,29.330454,446.241901,494.427646,2.933045,-51.11879,100,10,0.0
1,25000013,Bee,1577,586,7,984,655,65.648855,34.351145,430.0,...,243.683259,243.392517,2.907419,408.700063,243.392517,0.290742,165.016804,100,10,40.376016
2,25000009,Bee,764,76,13,675,334,30.838323,69.161677,103.0,...,33.793455,33.225131,5.683246,295.091623,33.225131,0.568325,261.298168,100,10,88.548148


In [17]:
# scratch arithmetic; the numbers appear to be rounded values from row 1 of the
# table above (hisp_red_votes / hisp_votes and red_votes - white_red_votes) - TODO confirm.
# Only the last expression is displayed.
165/408
430-243

187

In [2]:
###### Specify model options and call function above to generate it ######
###### You may create many copies of this cell to try different model configurations ######
# NOTE: all three races are listed, so model_votes forces const=False regardless of the
# 'const' entry below (the printed predictor list contains no 'const').
# Interaction pairs use the short race labels, which is what model_votes renames the
# vap_{race}_pct columns to.
opts = {'level'       : 'cntyvtd',
        'metric'      : 'red_blue_gap',
        'election'    : 'Pres_20',
        'const'       : True,
        'races'       : [
            'hisp',
            'black',
            'white',
        ],
        'predictors'  : [
            'density',
            'dist_border',
        ],
        'interactions': [
            ['hisp','black'],
        ],
       }
df, mod, res = model_votes(**opts)

using local /home/jupyter/redistricting_data/vra_cntyvtd.csv
target column = Pres_20_red_blue_gap
weight column = vap_pop
predictors = ['vap_hisp_pct', 'vap_black_pct', 'vap_white_pct', 'density', 'dist_border']
                             WLS Regression Results                             
Dep. Variable:     Pres_20_red_blue_gap   R-squared:                       0.684
Model:                              WLS   Adj. R-squared:                  0.684
Method:                   Least Squares   F-statistic:                     3773.
Date:                  Fri, 22 Oct 2021   Prob (F-statistic):               0.00
Time:                          19:11:07   Log-Likelihood:                -42403.
No. Observations:                  8716   AIC:                         8.482e+04
Df Residuals:                      8710   BIC:                         8.486e+04
Df Model:                             5                                         
Covariance Type:              nonrobust                    

In [3]:
###### Specify model options and call function above to generate it ######
###### You may create many copies of this cell to try different model configurations ######
# NOTE: 'white' is commented out, so not all races are present and model_votes keeps the
# constant term requested by 'const' (the printed predictor list ends with 'const').
# Interaction pairs use the short race labels, which is what model_votes renames the
# vap_{race}_pct columns to.
opts = {'level'       : 'cntyvtd',
        'metric'      : 'red_blue_gap',
        'election'    : 'Pres_20',
        'const'       : True,
        'races'       : [
            'hisp',
            'black',
            # 'white',
        ],
        'predictors'  : [
            'density',
            'dist_border',
        ],
        'interactions': [
            ['hisp','black'],
        ],
       }
df, mod, res = model_votes(**opts)

using local /home/jupyter/redistricting_data/vra_cntyvtd.csv
target column = Pres_20_red_blue_gap
weight column = vap_pop
predictors = ['vap_hisp_pct', 'vap_black_pct', 'density', 'dist_border', 'const']
                             WLS Regression Results                             
Dep. Variable:     Pres_20_red_blue_gap   R-squared:                       0.684
Model:                              WLS   Adj. R-squared:                  0.684
Method:                   Least Squares   F-statistic:                     3773.
Date:                  Fri, 22 Oct 2021   Prob (F-statistic):               0.00
Time:                          19:11:13   Log-Likelihood:                -42403.
No. Observations:                  8716   AIC:                         8.482e+04
Df Residuals:                      8710   BIC:                         8.486e+04
Df Model:                             5                                         
Covariance Type:              nonrobust                            

In [16]:
###### Generate Raw Data - Users cannot run this unless they have access to the source table in BigQuery ######

%load_ext autoreload
%autoreload
%cd /home/jupyter/MathVGerrmandering_CMAT_2021/
from src import *
src_tbl = f'{root_bq}.TX_sldl_planh2100.cntyvtd_0_nodes'
src_cols = get_cols(src_tbl)

def get_vra(level):
    """Build the SQL text that produces the VRA table for one aggregation level.

    level is 'county' or 'cntyvtd' and selects the grouping columns; any other
    value raises Exception.  The query is assembled in three nested stages
    (each wraps the previous one as a subquery):

    1. aggregate the source table by the label columns: per-party vote sums
       for each contest, vap_hisp / vap_black / vap_white, land area and the
       unioned polygon;
    2. add vap_pop (sum of the three vap columns) and total votes per contest;
    3. add density, race percentages, distance to Mexico, and the red_pct /
       red_blue_gap metrics per contest, keeping only rows with vap_pop > 0.

    Returns the SQL string of the last stage.  Reads the notebook globals
    src_cols, src_tbl, data_bq and m_per_mi, and uses the helpers join_str and
    subquery, which are defined outside this cell.

    NOTE(review): get_vra is defined again in a later cell with different
    vote-column aliases; whichever cell ran last wins.
    """
    if level == 'county':
        labels = 'county'
    elif level == 'cntyvtd':
        labels = 'cntyvtd, county'
    else:
        raise Exception(f'invalid level {level}')

    # SQL select fragments collected by f() below
    sels = []    # per-party vote sums (stage 1)
    votes = []   # total votes per contest (stage 2)
    diff = []    # red-minus-blue gap in percent (stage 3)
    red = []     # red share in percent (stage 3)
    def f(a, b):
        # a: substring identifying the contest in the source column names
        # b: short prefix used for the output column names
        short = dict()
        for p in ['D', 'R']:
            # last source column whose name contains '{a}_{p}' (IndexError if none)
            col = [x for x in src_cols if f'{a}_{p}' in x].pop()
            # fourth '_'-separated field of the column name - presumably the
            # candidate name; TODO confirm against the source schema
            nm = col.split('_')[3]
            short[p] = f'{b}_{nm}'
            sels.append(f'cast(sum({col}) as int) as {short[p]}')
        votes.append(f'{short["D"]} + {short["R"]} as {b}_votes')
        # both metrics are Null when the contest has no votes
        diff.append(f'case when {b}_votes > 0 then ({short["R"]} - {short["D"]}) / {b}_votes * 100 else Null end as {b}_red_blue_gap')
        red .append(f'case when {b}_votes > 0 then  {short["R"]} / {b}_votes * 100 else Null end as {b}_red_pct')

    # presidential contests -> Pres_20, Pres_16, Pres_12
    for yr in [2020, 2016, 2012]:
        a = f'President_{yr}'
        b = f'Pres_{yr%100}'
        f(a, b)

    # US Senate contests -> Sen_20, Sen_18, Sen_14, Sen_12
    for yr in [2020, 2018, 2014, 2012]:
        a = f'USSen_{yr}'
        b = f'Sen_{yr%100}'
        f(a, b)

    # stage 1: aggregate by the label columns
    query = []
    query.append(f"""
select
    {labels},
    {join_str().join(sels)},
    sum(vap_hisp) as vap_hisp,
    {' + '.join([f'sum({x})' for x in src_cols if 'vap_nonhisp' in x and 'black' in x])} as vap_black,
    {' + '.join([f'sum({x})' for x in src_cols if 'vap_nonhisp' in x and 'white' in x and 'black' not in x])} as vap_white,
    sum(aland) as aland,
    st_union_agg(polygon) as polygon,
from
    {src_tbl}
group by
    {labels}
""")
    
    # stage 2: population and vote totals
    query.append(f"""
select
    *,
    vap_hisp + vap_black + vap_white as vap_pop,
    {join_str().join(votes)},
from (
    {subquery(query[-1])}
    )
""")

    # stage 3: derived predictors and metrics, in the final column order
    query.append(f"""
select
    {labels},
    vap_pop,
    vap_pop / aland as density,
    vap_hisp  / vap_pop * 100 as vap_hisp_pct,
    vap_black / vap_pop * 100 as vap_black_pct,
    vap_white / vap_pop * 100 as vap_white_pct,
    st_distance(polygon, (select polygon from {data_bq}.countries where country = 'Mexico')) / {m_per_mi} as dist_border,
    {join_str().join(red)},
    {join_str().join(diff)},
    vap_hisp,
    vap_black,
    vap_white,
    * except ({labels}, vap_hisp, vap_black, vap_white, vap_pop, aland, polygon),
    aland,
    polygon,
from (
    {subquery(query[-1])}
    )
where vap_pop > 0
""")
    return query[-1]
    
# For each level: build the query, materialise it as {root_bq}.VRA.{level}, then read the
# table back without the polygon column and save it as vra_{level}.csv under data_path.
# NOTE: `f` here is a module-level path variable, unrelated to the helper f() nested
# inside get_vra.
for level in ['county', 'cntyvtd']:
    print(level)
    query = get_vra(level)
    targ_tbl = f'{root_bq}.VRA.{level}'
    load_table(tbl=targ_tbl, query=query)
    df = run_query(f'select * except (polygon) from {targ_tbl}')
    f = data_path / f'vra_{level}.csv'
    f.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(f, index=False)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/home/jupyter/MathVGerrmandering_CMAT_2021
county
cntyvtd


In [20]:
###### Generate Raw Data - Users cannot run this unless they have access to the source table in BigQuery ######

%load_ext autoreload
%autoreload
%cd /home/jupyter/MathVGerrmandering_CMAT_2021/
from src import *
src_tbl = f'{root_bq}.TX_sldl_planh2100.cntyvtd_0_nodes'
src_cols = get_cols(src_tbl)

def get_vra(level):
    """Build the SQL text that produces the VRA table for one aggregation level.

    level is 'county' or 'cntyvtd' and selects the grouping columns; any other
    value raises Exception.  The query is assembled in three nested stages
    (each wraps the previous one as a subquery):

    1. aggregate the source table by the label columns: per-party vote sums
       for each contest, vap_hisp / vap_black / vap_white, land area and the
       unioned polygon;
    2. add vap_pop (sum of the three vap columns) and total votes per contest;
    3. add density, race percentages, distance to Mexico, and the red_pct /
       red_blue_gap metrics per contest, keeping only rows with vap_pop > 0.

    Returns the SQL string of the last stage.  Reads the notebook globals
    src_cols, src_tbl, data_bq and m_per_mi, and uses the helpers join_str and
    subquery, which are defined outside this cell.

    NOTE(review): this redefines get_vra from an earlier cell; the only
    difference is that per-party vote columns are aliased
    '{b}_{color}_votes_{nm}' instead of '{b}_{nm}'.
    """
    if level == 'county':
        labels = 'county'
    elif level == 'cntyvtd':
        labels = 'cntyvtd, county'
    else:
        raise Exception(f'invalid level {level}')

    # SQL select fragments collected by f() below
    sels = []    # per-party vote sums (stage 1)
    votes = []   # total votes per contest (stage 2)
    diff = []    # red-minus-blue gap in percent (stage 3)
    red = []     # red share in percent (stage 3)
    def f(a, b):
        # a: substring identifying the contest in the source column names
        # b: short prefix used for the output column names
        short = dict()
        for party, color in {'D':'blue', 'R':'red'}.items():
            # last source column whose name contains '{a}_{party}' (IndexError if none)
            col = [x for x in src_cols if f'{a}_{party}' in x].pop()
            # fourth '_'-separated field of the column name - presumably the
            # candidate name; TODO confirm against the source schema
            nm = col.split('_')[3]
            short[party] = f'{b}_{color}_votes_{nm}'
            sels.append(f'cast(sum({col}) as int) as {short[party]}')
        votes.append(f'{short["D"]} + {short["R"]} as {b}_votes')
        # both metrics are Null when the contest has no votes
        diff.append(f'case when {b}_votes > 0 then ({short["R"]} - {short["D"]}) / {b}_votes * 100 else Null end as {b}_red_blue_gap')
        red .append(f'case when {b}_votes > 0 then  {short["R"]} / {b}_votes * 100 else Null end as {b}_red_pct')

    # presidential contests -> Pres_20, Pres_16, Pres_12
    for yr in [2020, 2016, 2012]:
        a = f'President_{yr}'
        b = f'Pres_{yr%100}'
        f(a, b)

    # US Senate contests -> Sen_20, Sen_18, Sen_14, Sen_12
    for yr in [2020, 2018, 2014, 2012]:
        a = f'USSen_{yr}'
        b = f'Sen_{yr%100}'
        f(a, b)

    # stage 1: aggregate by the label columns
    query = []
    query.append(f"""
select
    {labels},
    {join_str().join(sels)},
    sum(vap_hisp) as vap_hisp,
    {' + '.join([f'sum({x})' for x in src_cols if 'vap_nonhisp' in x and 'black' in x])} as vap_black,
    {' + '.join([f'sum({x})' for x in src_cols if 'vap_nonhisp' in x and 'white' in x and 'black' not in x])} as vap_white,
    sum(aland) as aland,
    st_union_agg(polygon) as polygon,
from
    {src_tbl}
group by
    {labels}
""")
    
    # stage 2: population and vote totals
    query.append(f"""
select
    *,
    vap_hisp + vap_black + vap_white as vap_pop,
    {join_str().join(votes)},
from (
    {subquery(query[-1])}
    )
""")

    # stage 3: derived predictors and metrics, in the final column order
    query.append(f"""
select
    {labels},
    vap_pop,
    vap_pop / aland as density,
    vap_hisp  / vap_pop * 100 as vap_hisp_pct,
    vap_black / vap_pop * 100 as vap_black_pct,
    vap_white / vap_pop * 100 as vap_white_pct,
    st_distance(polygon, (select polygon from {data_bq}.countries where country = 'Mexico')) / {m_per_mi} as dist_border,
    {join_str().join(red)},
    {join_str().join(diff)},
    vap_hisp,
    vap_black,
    vap_white,
    * except ({labels}, vap_hisp, vap_black, vap_white, vap_pop, aland, polygon),
    aland,
    polygon,
from (
    {subquery(query[-1])}
    )
where vap_pop > 0
""")
    return query[-1]
    
# For each level: build the query, materialise it as {root_bq}.VRA.{level}, then read the
# whole table back and save it as vra_{level}.csv under data_path.
# NOTE: unlike the commented-out query, the active one keeps the polygon column, so the
# geometry is written into the csv as well.
# NOTE: `f` here is a module-level path variable, unrelated to the helper f() nested
# inside get_vra.
for level in ['county', 'cntyvtd']:
    print(level)
    query = get_vra(level)
    targ_tbl = f'{root_bq}.VRA.{level}'
    load_table(tbl=targ_tbl, query=query)
    # df = run_query(f'select * except (polygon) from {targ_tbl}')
    df = run_query(f'select * from {targ_tbl}')
    f = data_path / f'vra_{level}.csv'
    f.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(f, index=False)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/home/jupyter/MathVGerrmandering_CMAT_2021
county
cntyvtd
