# Setup

This file should be filled out once per project. It will generate the template that the rest of the project will use.

In [1]:
import pandas as pd
import geopandas as gpd
import os
from shutil import copyfile

from op_verification.reference_data import (
    geoid_to_county_name,
    state_to_fips,
    state_to_state_po,
    state_abbreviation_to_state_name,
    state_fip_to_county_to_geoid,
)

In [14]:
# Name of the state this project covers; it must be a key of `state_to_fips`.
state = "REPLACE ME"
assert state in state_to_fips, "Unknown state {!r}: set `state` to a key of state_to_fips".format(state)
state_fips = state_to_fips[state]
state_po = state_to_state_po[state]
# Count the counties of *this* state. The lookup used to be hard-coded to FIPS 1,
# which silently reported Alabama's county count for every other state.
n_counties = len(state_fip_to_county_to_geoid[int(state_fips)])
print("State:",state," | State Fips:",state_fips," | State PO:", state_po, " |  n_counties:", n_counties)

State: Alabama  | State Fips: 1  | State PO: AL  |  n_counties: 67


### Import the datasets

If you will be using any other statewide datasets (precinct shapefiles or election results) import them below. 

* `gdf` denotes "GeoDataFrame" which is the data structure that will be used to hold shapefiles
* `df` denotes "DataFrame" which is the data structure that will be used to hold election results

In [15]:
# Location of the statewide precinct shapefile, relative to the project root.
statewide_shapefile_file_path = 'data/shapefiles/census_partnership/compiled_vtds'

# Load the shapefile into a GeoDataFrame and preview the first rows.
statewide_shapefile_gdf = gpd.read_file(statewide_shapefile_file_path)
statewide_shapefile_gdf.head(n=5)

Unnamed: 0,STATEFP,COUNTYFP,VTDST,NAMELSAD,VTDI,LSAD,CHNG_TYPE,ORIG_NAME,ORIG_CODE,RELATE,NAME,VINTAGE,FUNCSTAT,JUSTIFY,MTFCC,geometry
0,1,3,1,Little River VFD,A,0,,,,,Little River VFD,90,N,,G5240,"POLYGON Z ((-87.85327 31.25596 0.00000, -87.85..."
1,1,3,2,Tensaw VFD,A,0,,,,,Tensaw VFD,90,N,,G5240,"POLYGON Z ((-87.97692 31.08658 0.00000, -87.97..."
2,1,3,3,Old Vaughn School,A,0,,,,,Old Vaughn School,90,N,,G5240,"POLYGON Z ((-87.96499 30.96896 0.00000, -87.96..."
3,1,3,4,Crossroads Durant Chapel Bapt Church,A,0,,,,,Crossroads Durant Chapel Bapt Church,90,N,,G5240,"POLYGON Z ((-87.98693 30.87518 0.00000, -87.98..."
4,1,3,5,Douglasville Boykin Ctr,A,0,,,,,Douglasville Boykin Ctr,90,N,,G5240,"POLYGON Z ((-87.80901 30.85498 0.00000, -87.80..."


In [16]:
# Location of the statewide precinct-level election results, relative to the project root.
statewide_election_results_file_path = 'data/precinct_election_results/AL_G18_pivoted.csv'

# Load the results into a DataFrame and preview the first rows.
statewide_results_df = pd.read_csv(statewide_election_results_file_path)
statewide_results_df.head(n=5)

Unnamed: 0,county,precinct,Gov_DEM,Gov_REP,LtGov_DEM,LtGov_REP,StHOR_DEM,StHOR_IND,StHOR_LIB,StHOR_REP,StSen_DEM,StSen_IND,StSen_REP,SP_DEM,SP_LIB,SP_REP,USHOR_DEM,USHOR_REP
0,Autauga County,10 JONES COMMUNITY CTR,105.0,168.0,109.0,162.0,127.0,,,,,,170.0,98.0,,110.0,118.0,153.0
1,Autauga County,100 TRINITY METHODIST,333.0,1436.0,288.0,1470.0,252.0,,,1499.0,,,1494.0,150.0,,916.0,391.0,1375.0
2,Autauga County,110 CENTRAL AL ELECTRIC,149.0,106.0,157.0,97.0,166.0,,,,,,107.0,138.0,,59.0,159.0,94.0
3,Autauga County,140 AUTAUGAVILLE VFD,308.0,280.0,314.0,271.0,351.0,,,,,,296.0,267.0,,186.0,326.0,254.0
4,Autauga County,150 PRATTMONT BAPTIST CH,175.0,472.0,163.0,478.0,158.0,,,482.0,,,507.0,94.0,,300.0,196.0,448.0


### Precondition: County ID

Make a column in `statewide_shapefile_gdf` and `statewide_results_df` called `county_id` with a common naming scheme. County name or county FIPS code would work. I highly recommend using county name, e.g. "Essex County", for readability. You can use the helper functions below to convert between county FIPS and county name.

In [18]:
def get_county_name(county_fips):
    """Return the county name for a county FIPS code within the current state.

    The 5-character GEOID is built from the module-level `state_fips`
    (zero-padded to 2 characters) followed by `county_fips` (zero-padded
    to 3 characters) and looked up in `geoid_to_county_name`.

    Prints the offending GEOID and raises AssertionError when the
    constructed GEOID is not exactly 5 characters long.
    """
    state_part = str(state_fips).zfill(2)
    county_part = str(county_fips).zfill(3)
    geoid = state_part + county_part

    is_well_formed = len(geoid) == 5
    if not is_well_formed:
        # Show the malformed GEOID before the assertion fires.
        print(geoid)
    assert is_well_formed

    return geoid_to_county_name[geoid]

# County-name -> GEOID mapping for the current state only.
county_to_geoid = state_fip_to_county_to_geoid[int(state_fips)]


def get_geoid(county_name):
    """Return the GEOID recorded for `county_name` in the current state.

    Raises KeyError when the name is not present in `county_to_geoid`.
    """
    geoid = county_to_geoid[county_name]
    return geoid

# TODO: Pass the precondition described above which takes the form of an assert statement in this cell.
statewide_shapefile_gdf['county_id'] = statewide_shapefile_gdf['COUNTYFP'].map(get_county_name)
statewide_results_df['county_id'] = statewide_results_df['county']

# Expected number of counties, taken from the reference data for the current
# state rather than a hard-coded, single-state constant.
n_counties = len(county_to_geoid)

assert 'county_id' in statewide_shapefile_gdf.columns and 'county_id' in statewide_results_df.columns, \
    "Both datasets need a `county_id` column"

# Compare the county identifiers that appear in each dataset.
county_set_statewide_shapefile_gdf = set(statewide_shapefile_gdf['county_id'].unique())
county_set_statewide_results_df = set(statewide_results_df['county_id'].unique())
print("statewide_shapefile_gdf unmatched counties: ", county_set_statewide_shapefile_gdf.difference(county_set_statewide_results_df))
print("statewide_results_df unmatched counties: ", county_set_statewide_results_df.difference(county_set_statewide_shapefile_gdf))

# The two datasets must share at least one county identifier ...
assert county_set_statewide_shapefile_gdf.intersection(county_set_statewide_results_df) != set(), \
    "No county_id values are shared between the shapefile and the election results"
# ... and together they must name exactly the expected number of counties;
# extra names usually mean the two naming schemes do not line up.
assert len(county_set_statewide_shapefile_gdf.union(county_set_statewide_results_df)) == int(n_counties), \
    "Expected {} distinct counties across both datasets, found {}".format(
        n_counties,
        len(county_set_statewide_shapefile_gdf.union(county_set_statewide_results_df)),
    )

statewide_shapefile_gdf unmatched counties:  set()
statewide_results_df unmatched counties:  {'Tuscaloosa County'}


### Precondition: Precinct Name

Identify the column that contains the precinct names to be matched by naming it `original_precinct_name`.

In [19]:
# TODO: Pass the precondition described above which takes the form of an assert statement in this cell.
# Expose each dataset's precinct-name column under the shared name used for matching.
statewide_results_df['original_precinct_name'] = statewide_results_df['precinct']
statewide_shapefile_gdf['original_precinct_name'] = statewide_shapefile_gdf['NAMELSAD']

# Precondition: both datasets now carry the shared column.
assert all(
    'original_precinct_name' in frame.columns
    for frame in (statewide_shapefile_gdf, statewide_results_df)
)

## Find easier counties

In [24]:
# Work on copies so the exploration below leaves the statewide frames untouched.
gdf, df = statewide_shapefile_gdf.copy(), statewide_results_df.copy()

In [25]:
# Distinct precinct names per county in the election results, smallest first.
# Tuscaloosa County is left out: it has no counterpart in the shapefile data.
cnty_to_n_df = (
    df.groupby('county_id')['original_precinct_name']
    .nunique()
    .sort_values()
    .loc[lambda counts: counts.index != 'Tuscaloosa County']
)
cnty_to_n_df

county_id
Bibb County           10
Perry County          14
Lowndes County        14
Coosa County          14
Henry County          15
                    ... 
Montgomery County     51
DeKalb County         53
Madison County        76
Jefferson County     177
Mobile County        193
Name: original_precinct_name, Length: 66, dtype: int64

In [26]:
# Distinct precinct names per county in the shapefile data, smallest first.
cnty_to_n_gdf = gdf.groupby('county_id').original_precinct_name.nunique().sort_values()
# Exclude the same county that is excluded from the election-results counts so
# both Series cover the same counties. The previous filter named
# 'Genesee County', which matches nothing in this data.
cnty_to_n_gdf = cnty_to_n_gdf[cnty_to_n_gdf.index!='Tuscaloosa County']
cnty_to_n_gdf

county_id
DeKalb County          4
Perry County          10
Winston County        11
Barbour County        11
Henry County          13
                    ... 
Montgomery County     49
Baldwin County        49
Madison County        72
Mobile County         88
Jefferson County     172
Name: original_precinct_name, Length: 66, dtype: int64

In [27]:
# Counties where the shapefile and the election results have the same number of
# distinct precinct names. `.eq` aligns the two Series on county_id, so this does
# not raise when their labels differ; a county missing from either side
# compares as not equal.
cnty_to_n_gdf[cnty_to_n_gdf.eq(cnty_to_n_df)]

county_id
Lowndes County       14
Chilton County       18
Russell County       19
Tallapoosa County    28
Conecuh County       29
Etowah County        33
Name: original_precinct_name, dtype: int64

In [38]:
# Mean number of distinct precinct names per county in the shapefile data.
gdf_mean_precs = cnty_to_n_gdf.mean()
gdf_mean_precs

28.28787878787879

In [39]:
# Mean number of distinct precinct names per county in the election results.
df_mean_precs = cnty_to_n_df.mean()
df_mean_precs

33.09090909090909

In [37]:
# Mean absolute per-county gap between the two precinct counts
# (the subtraction is aligned on county_id).
avg_abs_diff = cnty_to_n_gdf.sub(cnty_to_n_df).abs().mean()
avg_abs_diff

6.681818181818182

In [40]:
# Average per-county gap as a fraction of the election-results mean precinct count.
avg_abs_diff / df_mean_precs

0.2019230769230769

In [41]:
# Average per-county gap as a fraction of the shapefile mean precinct count.
avg_abs_diff / gdf_mean_precs

0.2362078200321371

On average there are more precincts in the election results than VTDs in the shapefile.

The next cell makes a directory where all the county specific matching will take place and initializes each county matching folder.

In [28]:
# Root directory that holds one matching folder per county.
path = './matching'
# exist_ok: allow this cell to be re-run (e.g. after a failure part-way through
# the loop) instead of failing immediately because the directory already exists.
os.makedirs(path, exist_ok=True)

# The template is the same for every county, so read it once up front.
# Notebook files are JSON encoded as UTF-8; be explicit so the platform default
# encoding cannot corrupt non-ASCII characters.
with open('precinct_matching_framework.ipynb', 'r', encoding='utf-8') as file:
    precinct_matching_framework_template = file.read()

for county_id in county_set_statewide_shapefile_gdf.union(county_set_statewide_results_df):
    # Make a folder for this county
    county_dir = '/'.join([path,county_id])
    os.makedirs(county_dir, exist_ok=True)

    # Make the Matching Notebook, customizing the county_id string.
    # An existing notebook is left alone so a re-run never discards matching
    # work that has already been done in it.
    notebook_filename = '{} Precinct Matching.ipynb'.format(county_id)
    notebook_filepath = "/".join([county_dir,notebook_filename])
    if not os.path.exists(notebook_filepath):
        precinct_matching_framework = precinct_matching_framework_template.replace('<$COUNTY_ID$>', county_id)
        with open(notebook_filepath, 'w', encoding='utf-8') as file:
            file.write(precinct_matching_framework)

    # Initialize a README.md file (only once, so re-runs neither duplicate the
    # heading nor touch documentation that has been written since).
    readme_filepath = '/'.join([county_dir,"README.md"])
    if not os.path.exists(readme_filepath):
        with open(readme_filepath, "w") as f:
            f.write("## Documentation for matching in `county_id` = {}".format(county_id))

    # Generate CSV for the election results for this county
    if county_id in county_set_statewide_results_df:
        county_results_filename = 'election_results_county_id={}.csv'.format(county_id)
        county_results_df = statewide_results_df[statewide_results_df.county_id == county_id]
        county_results_df.to_csv("/".join([county_dir,county_results_filename]), index=False)

    # Generate Shapefile (for GIS inspection purposes)
    if county_id in county_set_statewide_shapefile_gdf:
        county_shapefile_filename = 'shapefile_county_id={}'.format(county_id)
        county_shapefile_gdf = statewide_shapefile_gdf[statewide_shapefile_gdf.county_id == county_id]
        county_shapefile_gdf.to_file("/".join([county_dir,county_shapefile_filename]))