Knudson et al., (2021). PyEI: A Python package for ecological inference. Journal of Open Source Software, 6(64), 3397, https://doi.org/10.21105/joss.03397

In [1]:
import numpy as np
import pandas as pd
from pyei import TwoByTwoEI



In [3]:
# Group label -> name of the population-count column for that group.
race_columns = dict(
    WHITE='WHT_NHSP22',
    BLACK='BLK_NHSP22',
    HISPANIC='HSP_POP22',
    ASIAN='ASN_NHSP22',
)

# Party label -> name of the vote-count column for that party.
election_columns = dict(
    REPUBLICAN='TOT_REP',
    DEMOCRATIC='TOT_DEM',
)

# Household-income bracket columns, listed from lowest to highest bracket.
income_columns = [
    '0_35K',
    '35K_60K',
    '60K_100K',
    '100K_125K',
    '125K_150K',
    '150K_MORE',
]

### Functions

In [4]:
def calculate_population_distribution(df, race_columns, columns_to_keep):
    """Compute each racial group's share of the recalculated population per row.

    Adds ``TOTAL_POP_RECALCULATED`` (row-wise sum of the columns named in
    ``race_columns``) and one ``<RACE>_PERCENT`` column per group to ``df``
    in place, then returns a selection of columns.

    Args:
        df: DataFrame containing every count column named in ``race_columns``
            and every column named in ``columns_to_keep``.
        race_columns: Mapping of group label -> population-count column name.
        columns_to_keep: Columns to carry through to the result. This
            sequence is not modified.

    Returns:
        ``df`` restricted to ``columns_to_keep`` followed by the
        ``<RACE>_PERCENT`` columns in mapping order. Despite the suffix, the
        values are fractions (count / total), not scaled to 100. Rows whose
        counts are all zero yield NaN (0 / 0).
    """
    df['TOTAL_POP_RECALCULATED'] = df[list(race_columns.values())].sum(axis=1)

    percent_columns = []
    for race, col in race_columns.items():
        percentage_col = f"{race}_PERCENT"
        df[percentage_col] = df[col] / df['TOTAL_POP_RECALCULATED']
        percent_columns.append(percentage_col)

    # Build a fresh list instead of `columns_to_keep += ...`: the augmented
    # assignment would extend the caller's list, so re-running the calling
    # cell would request every *_PERCENT column twice.
    selected_columns = list(columns_to_keep) + percent_columns
    return df[selected_columns]



In [5]:
def calculate_voting_distribution(df, election_columns, columns_to_keep):
    """Compute each party's share of the recalculated vote total per row.

    Adds ``TOTAL_VOTES_RECALCULATED`` (row-wise sum of the columns named in
    ``election_columns``) and one ``<PARTY>_PERCENT`` column per party to
    ``df`` in place, then returns a selection of columns.

    Args:
        df: DataFrame containing every vote-count column named in
            ``election_columns`` and every column named in ``columns_to_keep``.
        election_columns: Mapping of party label -> vote-count column name.
        columns_to_keep: Columns to carry through to the result. This
            sequence is not modified.

    Returns:
        ``df`` restricted to ``columns_to_keep`` followed by the
        ``<PARTY>_PERCENT`` columns in mapping order. Despite the suffix, the
        values are fractions (count / total), not scaled to 100. Rows whose
        counts are all zero yield NaN (0 / 0).
    """
    df['TOTAL_VOTES_RECALCULATED'] = df[list(election_columns.values())].sum(axis=1)

    percent_columns = []
    for party, col in election_columns.items():
        percentage_col = f"{party}_PERCENT"
        df[percentage_col] = df[col] / df['TOTAL_VOTES_RECALCULATED']
        percent_columns.append(percentage_col)

    # Build a fresh list instead of `columns_to_keep += ...`: the augmented
    # assignment would extend the caller's list, so re-running the calling
    # cell would request every *_PERCENT column twice.
    selected_columns = list(columns_to_keep) + percent_columns
    return df[selected_columns]

In [6]:
def calculate_income_distribution(df, income_columns, columns_to_keep):
    """Compute each income bracket's share of the bracket total per row.

    Adds ``TOTAL_INCOME_POPULATION`` (row-wise sum of ``income_columns``) and
    one ``<bracket>_PERCENT`` column per bracket to ``df`` in place, then
    returns a selection of columns.

    Args:
        df: DataFrame containing every column named in ``income_columns`` and
            every column named in ``columns_to_keep``.
        income_columns: Names of the income-bracket count columns.
        columns_to_keep: Columns to carry through to the result. This
            sequence is not modified.

    Returns:
        ``df`` restricted to ``columns_to_keep`` followed by the
        ``<bracket>_PERCENT`` columns in ``income_columns`` order. Despite the
        suffix, the values are fractions (count / total), not scaled to 100.
        Rows whose counts are all zero yield NaN (0 / 0).
    """
    df['TOTAL_INCOME_POPULATION'] = df[income_columns].sum(axis=1)

    percent_columns = []
    for income_range in income_columns:
        percentage_col = f"{income_range}_PERCENT"
        df[percentage_col] = df[income_range] / df['TOTAL_INCOME_POPULATION']
        percent_columns.append(percentage_col)

    # Build a fresh list instead of `columns_to_keep += ...`: the augmented
    # assignment would extend the caller's list, so re-running the calling
    # cell would request every *_PERCENT column twice.
    selected_columns = list(columns_to_keep) + percent_columns
    return df[selected_columns]

In [7]:
def run_ecological_inference(
    df,
    demographic_col,
    voting_col,
    total_votes_col,
    demographic_group_name="Demographic Group",
    candidate_name="Candidate",
    precinct_col="UNIQUE_ID",
    model_name="king99_pareto_modification",
    pareto_scale=15,
    pareto_shape=1.5,
    draws=3000,
    tune=5000,
    target_accept=0.95,
):
    """Fit a pyei ``TwoByTwoEI`` model on columns of ``df`` and print its summary.

    Args:
        df: DataFrame with one row per precinct.
        demographic_col: Column holding the demographic group's share per row.
        voting_col: Column holding the candidate's vote share per row.
        total_votes_col: Column holding the total vote count per row.
        demographic_group_name: Label passed to ``fit`` for the group.
        candidate_name: Label passed to ``fit`` for the candidate.
        precinct_col: Column whose values are passed to ``fit`` as precinct names.
        model_name: Forwarded to the ``TwoByTwoEI`` constructor.
        pareto_scale: Forwarded to the ``TwoByTwoEI`` constructor.
        pareto_shape: Forwarded to the ``TwoByTwoEI`` constructor.
        draws: Forwarded to ``fit``.
        tune: Forwarded to ``fit``.
        target_accept: Forwarded to ``fit``.

    Returns:
        Tuple ``(ei, summary)``: the fitted model and the value returned by
        its ``summary()`` method. The summary is also printed.
    """
    # Model inputs as NumPy arrays, in the positional order fit() receives them.
    # NOTE(review): the share columns are presumably fractions in [0, 1] - confirm.
    group_share = np.array(df[demographic_col])
    candidate_share = np.array(df[voting_col])
    precinct_totals = np.array(df[total_votes_col])
    precinct_labels = df[precinct_col]

    model = TwoByTwoEI(
        model_name=model_name,
        pareto_scale=pareto_scale,
        pareto_shape=pareto_shape,
    )

    # Labels and sampler settings are collected once and unpacked into fit().
    fit_options = dict(
        demographic_group_name=demographic_group_name,
        candidate_name=candidate_name,
        precinct_names=precinct_labels,
        draws=draws,
        tune=tune,
        target_accept=target_accept,
    )
    model.fit(group_share, candidate_share, precinct_totals, **fit_options)

    report = model.summary()
    print(report)

    return model, report


## South Carolina

### Import data

In [19]:
# Load the South Carolina input tables from JSON files (paths are relative to
# the notebook's working directory).
sc_election_gov_df = pd.read_json('states/south_carolina/election/sc_election_gov_22.json')
sc_race_df = pd.read_json('states/south_carolina/demographics/south_carolina_precincts_racial_population.json')
sc_economic_df = pd.read_json('states/south_carolina/economic/south_carolina_precincts_household_income.json')
sc_region_type_df = pd.read_json('states/south_carolina/geodata/south_carolina_precincts_region_type.json')

In [9]:
# Party label -> candidate name used to label the EI runs.
sc_candidate_mapping = dict(
    REPUBLICAN='Henry McMaster',
    DEMOCRATIC='Joe Cunningham',
)

### Main Script

#### Preprocess data

##### Racial Group

In [20]:
# Identifier column to carry through into the race-percentage frame.
columns_to_keep = ['UNIQUE_ID']

In [21]:
# Per-row racial shares computed from the race table.
sc_race_with_percentages = calculate_population_distribution(
    df=sc_race_df,
    race_columns=race_columns,
    columns_to_keep=columns_to_keep,
)

In [22]:
# Identifier and total-vote columns to carry through into the vote-share frame.
columns_to_keep = ['UNIQUE_ID', 'TOT_VOT']

In [23]:
# Per-row party vote shares computed from the governor-election table.
sc_election_gov_with_percentages = calculate_voting_distribution(
    df=sc_election_gov_df,
    election_columns=election_columns,
    columns_to_keep=columns_to_keep,
)

In [24]:
# Left join: keep every election row and attach racial shares where the
# UNIQUE_ID matches.
sc_race_merged_ei_df = sc_election_gov_with_percentages.merge(
    sc_race_with_percentages,
    on='UNIQUE_ID',
    how='left',
)

In [25]:
# Replace missing values (unmatched rows from the left join, 0/0 shares) with 0.
sc_race_merged_ei_df = sc_race_merged_ei_df.fillna(0)

In [26]:
# Inspect the merged election/race frame.
print(sc_race_merged_ei_df)

                   UNIQUE_ID  TOT_VOT  REPUBLICAN_PERCENT  DEMOCRATIC_PERCENT  \
0     ABBEVILLE_PRECINCT_001      975            0.736410            0.263590   
1     ABBEVILLE_PRECINCT_002      628            0.356688            0.643312   
2     ABBEVILLE_PRECINCT_003      574            0.585366            0.414634   
3     ABBEVILLE_PRECINCT_004      437            0.723112            0.276888   
4     ABBEVILLE_PRECINCT_005      722            0.829640            0.170360   
...                      ...      ...                 ...                 ...   
2256  CLARENDON_PRECINCT_024      600            0.430000            0.570000   
2257   BARNWELL_PRECINCT_016        0            0.000000            0.000000   
2258   BERKELEY_PRECINCT_099        0            0.000000            0.000000   
2259   BERKELEY_PRECINCT_098        0            0.000000            0.000000   
2260      AIKEN_PRECINCT_000        0            0.000000            0.000000   

      WHITE_PERCENT  BLACK_

In [30]:
# Make UNIQUE_ID unique: the first occurrence of an ID is left untouched and
# each later duplicate gets a "_<n>" suffix (n = 1, 2, ...).
# The previous form appended "_" + "" to first occurrences, which left a
# trailing underscore on every ID and grew by one "_" on each re-run; this
# version is a no-op once the IDs are already unique.
duplicate_rank = sc_race_merged_ei_df.groupby('UNIQUE_ID').cumcount()
sc_race_merged_ei_df['UNIQUE_ID'] = sc_race_merged_ei_df['UNIQUE_ID'].where(
    duplicate_rank == 0,
    sc_race_merged_ei_df['UNIQUE_ID'] + '_' + duplicate_rank.astype(str),
)


##### Economic Group

In [27]:
# Identifier column to carry through into the income-percentage frame.
columns_to_keep = ['UNIQUE_ID']

In [28]:
# Per-row income-bracket shares computed from the household-income table.
sc_economic_with_percentages = calculate_income_distribution(
    df=sc_economic_df,
    income_columns=income_columns,
    columns_to_keep=columns_to_keep,
)

In [29]:
# Left join: keep every election row and attach income shares where the
# UNIQUE_ID matches.
sc_economic_merged_ei_df = sc_election_gov_with_percentages.merge(
    sc_economic_with_percentages,
    on='UNIQUE_ID',
    how='left',
)

#### Running EI models

##### DEMOCRATIC Run

In [31]:
# EI run: White vs. non-White support for the Democratic candidate.
dem_white_ei, dem_white_summary = run_ecological_inference(
    df=sc_race_merged_ei_df,
    demographic_col="WHITE_PERCENT",
    voting_col="DEMOCRATIC_PERCENT",
    total_votes_col="TOT_VOT",
    demographic_group_name="White",
    candidate_name=sc_candidate_mapping['DEMOCRATIC']
)

Compiling...
Compilation time = 0:00:03.070114
Sampling...


  0%|          | 0/8000 [00:00<?, ?it/s]

  0%|          | 0/8000 [00:00<?, ?it/s]

  0%|          | 0/8000 [00:00<?, ?it/s]

  0%|          | 0/8000 [00:00<?, ?it/s]

Sampling time = 0:00:05.202504
Transforming variables...
Transformation time = 0:08:16.055366


Model: king99_pareto_modification
        Computed from the raw b_i samples by multiplying by population and then getting
        the proportion of the total pop (total pop=summed across all districts):
        The posterior mean for the district-level voting preference of
        White for Joe Cunningham is
        0.292
        The posterior mean for the district-level voting preference of
        non-White for Joe Cunningham is
        0.654
        95% equal-tailed Bayesian credible interval for district-level voting preference of
        White for Joe Cunningham is
        [0.28712629 0.29643871]
        95% equal-tailed Bayesian credible interval for district-level voting preference of
        non-White for Joe Cunningham is
        [0.64572759 0.66288096]
        


In [None]:
# EI run: Black vs. non-Black support for the Democratic candidate.
dem_black_ei, dem_black_summary = run_ecological_inference(
    df=sc_race_merged_ei_df,
    demographic_col="BLACK_PERCENT",
    voting_col="DEMOCRATIC_PERCENT",
    total_votes_col="TOT_VOT",
    demographic_group_name="Black",
    candidate_name=sc_candidate_mapping['DEMOCRATIC']
)

Compiling...
Compilation time = 0:00:02.336437
Sampling...
Compiling.. :   0%|          | 0/8000 [00:00<?, ?it/s]
[A
[A

[A[A

[A[ASampling time = 0:00:02.455560
Transforming variables...
Running chain 0:   0%|          | 0/8000 [00:02<?, ?it/s]
[A

[A[A

Running chain 0:   5%|▌         | 400/8000 [00:22<06:14, 20.29it/s]
[A

Running chain 0:   5%|▌         | 400/8000 [00:34<06:14, 20.29it/s]
[A

Running chain 0:  10%|█         | 800/8000 [00:35<04:52, 24.64it/s]
[A

[A[A
Running chain 0:  15%|█▌        | 1200/8000 [00:49<04:11, 27.06it/s]

[A[A
Running chain 0:  20%|██        | 1600/8000 [01:02<03:48, 28.05it/s]

[A[A
Running chain 0:  25%|██▌       | 2000/8000 [01:15<03:29, 28.68it/s]

[A[A
[A

[A[A
Running chain 0:  30%|███       | 2400/8000 [01:29<03:12, 29.03it/s]

[A[A
Running chain 0:  35%|███▌      | 2800/8000 [01:42<02:57, 29.29it/s]

Running chain 0:  35%|███▌      | 2800/8000 [01:54<02:57, 29.29it/s]
[A

Running chain 0:  40%|████      | 3200/8000 [

Model: king99_pareto_modification
        Computed from the raw b_i samples by multiplying by population and then getting
        the proportion of the total pop (total pop=summed across all districts):
        The posterior mean for the district-level voting preference of
        Black for Joe Cunningham is
        0.801
        The posterior mean for the district-level voting preference of
        non-Black for Joe Cunningham is
        0.307
        95% equal-tailed Bayesian credible interval for district-level voting preference of
        Black for Joe Cunningham is
        [0.79063018 0.8107545 ]
        95% equal-tailed Bayesian credible interval for district-level voting preference of
        non-Black for Joe Cunningham is
        [0.30451352 0.31051245]
        


In [None]:
# EI run: Asian vs. non-Asian support for the Democratic candidate.
dem_asian_ei, dem_asian_summary = run_ecological_inference(
    df=sc_race_merged_ei_df,
    demographic_col="ASIAN_PERCENT",
    voting_col="DEMOCRATIC_PERCENT",
    total_votes_col="TOT_VOT",
    demographic_group_name="Asian",
    candidate_name=sc_candidate_mapping['DEMOCRATIC']
)

Compiling...
Compilation time = 0:00:01.494738
Sampling...
Compiling.. :   0%|          | 0/8000 [00:00<?, ?it/s]
[A
[A

[A[A

[A[ASampling time = 0:00:03.288312
Transforming variables...
Running chain 0:   0%|          | 0/8000 [00:02<?, ?it/s]
[A

Running chain 0:   5%|▌         | 400/8000 [00:16<04:29, 28.24it/s]

[A[A
Running chain 0:  10%|█         | 800/8000 [00:25<03:19, 36.16it/s]

[A[A
Running chain 0:  15%|█▌        | 1200/8000 [00:42<03:55, 28.89it/s]

[A[A
[A

[A[A
Running chain 0:  15%|█▌        | 1200/8000 [00:55<03:55, 28.89it/s]

[A[A
Running chain 0:  25%|██▌       | 2000/8000 [01:12<03:34, 27.95it/s]

[A[A
[A
Running chain 0:  30%|███       | 2400/8000 [01:24<03:08, 29.77it/s]

[A[A

[A[A
Running chain 0:  35%|███▌      | 2800/8000 [01:34<02:40, 32.39it/s]

Running chain 0:  40%|████      | 3200/8000 [01:44<02:20, 34.15it/s]
[A
Running chain 0:  45%|████▌     | 3600/8000 [01:54<02:00, 36.41it/s]

[A[A
Running chain 0:  50%|█████     | 4000/

Model: king99_pareto_modification
        Computed from the raw b_i samples by multiplying by population and then getting
        the proportion of the total pop (total pop=summed across all districts):
        The posterior mean for the district-level voting preference of
        Asian for Joe Cunningham is
        0.866
        The posterior mean for the district-level voting preference of
        non-Asian for Joe Cunningham is
        0.405
        95% equal-tailed Bayesian credible interval for district-level voting preference of
        Asian for Joe Cunningham is
        [0.74397844 0.95550905]
        95% equal-tailed Bayesian credible interval for district-level voting preference of
        non-Asian for Joe Cunningham is
        [0.40295398 0.40684488]
        


In [None]:
# EI run: Hispanic vs. non-Hispanic support for the Democratic candidate.
# The summary is bound to dem_hispanic_summary so that it matches the
# <party>_<group>_summary naming of the other runs and is not lost under the
# Republican/Asian run's name.
dem_hispanic_ei, dem_hispanic_summary = run_ecological_inference(
    df=sc_race_merged_ei_df,
    demographic_col="HISPANIC_PERCENT",
    voting_col="DEMOCRATIC_PERCENT",
    total_votes_col="TOT_VOT",
    demographic_group_name="Hispanic",
    candidate_name=sc_candidate_mapping['DEMOCRATIC']
)

Compiling...
Compilation time = 0:00:01.620940
Sampling...
Compiling.. :   0%|          | 0/8000 [00:00<?, ?it/s]
[A
[A

[A[A

[A[ASampling time = 0:00:04.626075
Transforming variables...
Running chain 0:   0%|          | 0/8000 [00:04<?, ?it/s]
[A

[A[A
[A

Running chain 0:   5%|▌         | 400/8000 [00:25<06:33, 19.34it/s]
[A

Running chain 0:  10%|█         | 800/8000 [00:35<04:23, 27.33it/s]
[A

Running chain 0:  15%|█▌        | 1200/8000 [00:45<03:34, 31.65it/s]
[A

Running chain 0:  20%|██        | 1600/8000 [00:55<03:05, 34.53it/s]
[A

Running chain 0:  25%|██▌       | 2000/8000 [01:06<02:46, 35.93it/s]
[A

Running chain 0:  25%|██▌       | 2000/8000 [01:16<02:46, 35.93it/s]

Running chain 0:  30%|███       | 2400/8000 [01:18<02:41, 34.70it/s]
[A

Running chain 0:  35%|███▌      | 2800/8000 [01:27<02:19, 37.14it/s]
[A

Running chain 0:  40%|████      | 3200/8000 [01:37<02:06, 37.91it/s]
[A

Running chain 0:  45%|████▌     | 3600/8000 [01:47<01:53, 38.68it/s]
[

Model: king99_pareto_modification
        Computed from the raw b_i samples by multiplying by population and then getting
        the proportion of the total pop (total pop=summed across all districts):
        The posterior mean for the district-level voting preference of
        Hispanic for Joe Cunningham is
        0.593
        The posterior mean for the district-level voting preference of
        non-Hispanic for Joe Cunningham is
        0.402
        95% equal-tailed Bayesian credible interval for district-level voting preference of
        Hispanic for Joe Cunningham is
        [0.54832353 0.63604578]
        95% equal-tailed Bayesian credible interval for district-level voting preference of
        non-Hispanic for Joe Cunningham is
        [0.39961269 0.40473651]
        


#### Running Republican

In [None]:
# EI run: White vs. non-White support for the Republican candidate.
rep_white_ei, rep_white_summary = run_ecological_inference(
    df=sc_race_merged_ei_df,
    demographic_col="WHITE_PERCENT",
    voting_col="REPUBLICAN_PERCENT",
    total_votes_col="TOT_VOT",
    demographic_group_name="White",
    candidate_name=sc_candidate_mapping['REPUBLICAN']
)

Compiling...
Compilation time = 0:00:01.453011
Sampling...
Compiling.. :   0%|          | 0/8000 [00:00<?, ?it/s]
[A
[A

[A[A

[A[ASampling time = 0:00:02.673957
Transforming variables...
Running chain 0:   0%|          | 0/8000 [00:02<?, ?it/s]
[A

[A[A
[A

Running chain 0:   5%|▌         | 400/8000 [00:22<06:24, 19.75it/s]
[A

Running chain 0:  10%|█         | 800/8000 [00:37<05:07, 23.41it/s]

[A[A
[A
[A

Running chain 0:  15%|█▌        | 1200/8000 [00:53<04:40, 24.22it/s]
[A

Running chain 0:  15%|█▌        | 1200/8000 [01:07<04:40, 24.22it/s]

Running chain 0:  20%|██        | 1600/8000 [01:08<04:12, 25.37it/s]
[A
[A

Running chain 0:  25%|██▌       | 2000/8000 [01:23<03:51, 25.91it/s]
[A

Running chain 0:  30%|███       | 2400/8000 [01:37<03:31, 26.53it/s]

[A[A
Running chain 0:  30%|███       | 2400/8000 [01:47<03:31, 26.53it/s]
[A

Running chain 0:  35%|███▌      | 2800/8000 [01:52<03:16, 26.48it/s]

[A[A
Running chain 0:  35%|███▌      | 2800/8000 [02:0

Model: king99_pareto_modification
        Computed from the raw b_i samples by multiplying by population and then getting
        the proportion of the total pop (total pop=summed across all districts):
        The posterior mean for the district-level voting preference of
        White for Henry McMaster is
        0.708
        The posterior mean for the district-level voting preference of
        non-White for Henry McMaster is
        0.345
        95% equal-tailed Bayesian credible interval for district-level voting preference of
        White for Henry McMaster is
        [0.70346268 0.71275406]
        95% equal-tailed Bayesian credible interval for district-level voting preference of
        non-White for Henry McMaster is
        [0.33665187 0.35424039]
        


In [None]:
# EI run: Black vs. non-Black support for the Republican candidate.
rep_black_ei, rep_black_summary = run_ecological_inference(
    df=sc_race_merged_ei_df,
    demographic_col="BLACK_PERCENT",
    voting_col="REPUBLICAN_PERCENT",
    total_votes_col="TOT_VOT",
    demographic_group_name="Black",
    candidate_name=sc_candidate_mapping['REPUBLICAN']
)

Compiling...
Compilation time = 0:00:02.882429
Sampling...
Compiling.. :   0%|          | 0/8000 [00:00<?, ?it/s]
[A
[A

[A[A

[A[ASampling time = 0:00:02.638974
Running chain 0:   0%|          | 0/8000 [00:02<?, ?it/s]Transforming variables...

[A

[A[A
Running chain 0:   5%|▌         | 400/8000 [00:22<06:10, 20.53it/s]

Running chain 0:  10%|█         | 800/8000 [00:34<04:34, 26.24it/s]
[A

Running chain 0:  15%|█▌        | 1200/8000 [00:46<03:55, 28.86it/s]

[A[A
[A
[A

Running chain 0:  15%|█▌        | 1200/8000 [00:56<03:55, 28.86it/s]
[A

Running chain 0:  20%|██        | 1600/8000 [01:01<03:47, 28.16it/s]
[A

Running chain 0:  25%|██▌       | 2000/8000 [01:14<03:29, 28.61it/s]

Running chain 0:  25%|██▌       | 2000/8000 [01:26<03:29, 28.61it/s]
Running chain 0:  30%|███       | 2400/8000 [01:26<03:06, 30.10it/s]
[A

Running chain 0:  35%|███▌      | 2800/8000 [01:39<02:51, 30.34it/s]

[A[A
Running chain 0:  40%|████      | 3200/8000 [01:51<02:32, 31.41it/s]



Model: king99_pareto_modification
        Computed from the raw b_i samples by multiplying by population and then getting
        the proportion of the total pop (total pop=summed across all districts):
        The posterior mean for the district-level voting preference of
        Black for Henry McMaster is
        0.199
        The posterior mean for the district-level voting preference of
        non-Black for Henry McMaster is
        0.692
        95% equal-tailed Bayesian credible interval for district-level voting preference of
        Black for Henry McMaster is
        [0.18887903 0.20892499]
        95% equal-tailed Bayesian credible interval for district-level voting preference of
        non-Black for Henry McMaster is
        [0.68942933 0.69550547]
        


In [None]:
# EI run: Asian vs. non-Asian support for the Republican candidate.
rep_asian_ei, rep_asian_summary = run_ecological_inference(
    df=sc_race_merged_ei_df,
    demographic_col="ASIAN_PERCENT",
    voting_col="REPUBLICAN_PERCENT",
    total_votes_col="TOT_VOT",
    demographic_group_name="Asian",
    candidate_name=sc_candidate_mapping['REPUBLICAN']
)

Compiling...
Compilation time = 0:00:01.500531
Sampling...
Compiling.. :   0%|          | 0/8000 [00:00<?, ?it/s]
[A
[A

[A[A

[A[ASampling time = 0:00:03.441723
Transforming variables...
Running chain 0:   0%|          | 0/8000 [00:03<?, ?it/s]
[A

Running chain 0:   5%|▌         | 400/8000 [00:18<04:45, 26.60it/s]
[A

[A[A

[A[A
Running chain 0:   5%|▌         | 400/8000 [00:29<04:45, 26.60it/s]
Running chain 0:  10%|█         | 800/8000 [00:34<04:46, 25.10it/s]

[A[A
Running chain 0:  15%|█▌        | 1200/8000 [00:49<04:16, 26.49it/s]

[A[A

[A[A
Running chain 0:  20%|██        | 1600/8000 [01:03<03:54, 27.29it/s]

[A[A
[A

[A[A
Running chain 0:  20%|██        | 1600/8000 [01:19<03:54, 27.29it/s]

[A[A
Running chain 0:  25%|██▌       | 2000/8000 [01:26<04:27, 22.44it/s]

[A[A
Running chain 0:  25%|██▌       | 2000/8000 [01:39<04:27, 22.44it/s]
Running chain 0:  30%|███       | 2400/8000 [01:44<04:08, 22.52it/s]

[A[A

[A[A
Running chain 0:  35%|███▌   

Model: king99_pareto_modification
        Computed from the raw b_i samples by multiplying by population and then getting
        the proportion of the total pop (total pop=summed across all districts):
        The posterior mean for the district-level voting preference of
        Asian for Henry McMaster is
        0.150
        The posterior mean for the district-level voting preference of
        non-Asian for Henry McMaster is
        0.595
        95% equal-tailed Bayesian credible interval for district-level voting preference of
        Asian for Henry McMaster is
        [0.04853489 0.31025161]
        95% equal-tailed Bayesian credible interval for district-level voting preference of
        non-Asian for Henry McMaster is
        [0.59224262 0.59689542]
        


In [None]:
# EI run: Hispanic vs. non-Hispanic support for the Republican candidate.
rep_hispanic_ei, rep_hispanic_summary = run_ecological_inference(
    df=sc_race_merged_ei_df,
    demographic_col="HISPANIC_PERCENT",
    voting_col="REPUBLICAN_PERCENT",
    total_votes_col="TOT_VOT",
    demographic_group_name="Hispanic",
    candidate_name=sc_candidate_mapping['REPUBLICAN']
)

Compiling...
Compilation time = 0:00:01.673356
Sampling...
Compiling.. :   0%|          | 0/8000 [00:00<?, ?it/s]
[A
[A

[A[A

[A[ASampling time = 0:00:03.005287
Transforming variables...
Running chain 0:   0%|          | 0/8000 [00:02<?, ?it/s]
[A

[A[A

Running chain 0:   5%|▌         | 400/8000 [00:18<04:58, 25.43it/s]
[A

Running chain 0:  10%|█         | 800/8000 [00:26<03:23, 35.33it/s]
[A

Running chain 0:  15%|█▌        | 1200/8000 [00:35<02:48, 40.46it/s]
[A

Running chain 0:  20%|██        | 1600/8000 [00:43<02:31, 42.31it/s]
[A

Running chain 0:  25%|██▌       | 2000/8000 [00:52<02:14, 44.48it/s]
Running chain 0:  30%|███       | 2400/8000 [01:01<02:07, 43.82it/s]

[A[A
Running chain 0:  35%|███▌      | 2800/8000 [01:10<01:57, 44.30it/s]

[A[A
Running chain 0:  40%|████      | 3200/8000 [01:18<01:45, 45.53it/s]

[A[A
Running chain 0:  45%|████▌     | 3600/8000 [01:26<01:33, 46.81it/s]

[A[A
Running chain 0:  50%|█████     | 4000/8000 [01:34<01:23, 47.66i

Model: king99_pareto_modification
        Computed from the raw b_i samples by multiplying by population and then getting
        the proportion of the total pop (total pop=summed across all districts):
        The posterior mean for the district-level voting preference of
        Hispanic for Henry McMaster is
        0.406
        The posterior mean for the district-level voting preference of
        non-Hispanic for Henry McMaster is
        0.598
        95% equal-tailed Bayesian credible interval for district-level voting preference of
        Hispanic for Henry McMaster is
        [0.36271511 0.44657405]
        95% equal-tailed Bayesian credible interval for district-level voting preference of
        non-Hispanic for Henry McMaster is
        [0.59538989 0.6003309 ]
        


#### Plotting

In [None]:
# Plot the results

### Export data