# Setup

This file should be filled out once per project. It will generate the template that the rest of the project will use.

In [1]:
import pandas as pd
import geopandas as gpd
import os
from shutil import copyfile

from op_verification.reference_data import (
    geoid_to_county_name,
    state_to_fips,
    state_to_state_po,
    state_abbreviation_to_state_name,
    state_fip_to_county_to_geoid,
)

In [2]:
# Choose the state for this project; it must be a key of the reference tables.
state = "New York"
assert state in state_to_fips
state_fips = state_to_fips[state]
state_po = state_to_state_po[state]
# Echo the resolved identifiers so they can be checked by eye before continuing.
print("State: {}  | State Fips: {}  | State PO: {}".format(state, state_fips, state_po))

State: New York  | State Fips: 36  | State PO: NY


Confirm that the cell above prints out the correct information for the state you're working on. 

The next cell will take in the year for the election you are working on as an input. Then it will make a congressional district shapefile and a county shapefile for the year and state you are working on. The shapefiles will be taken from national shapefiles prepared by the [U.S. Census Bureau](https://www.census.gov/). The national congressional shapefiles are stored in this repository at `./data/congressioinal_distric_shapefiles` and the national county shapefiles are stored at `./data/county_shapefiles`.

These shapefiles may come in handy as you try to match precincts for the rest of the state. It's worthwhile to make state-specific shapefiles (which this next cell does for you) because it will be faster than using the national shapefile.

Presently, this codebase supports 2014, 2016, and 2018. Feel free to submit a pull request if you want to add 2020 once the Census Bureau publishes `cb_2020_us_county_500k` or `tl_2020_us_cd117`. Of course, these aren't essential to have for precinct matching, so if you don't want to have these helper files, no sweat!

In [3]:
# Supported election years, mapped to the Congress number used in the name of
# the matching national congressional district shapefile.
years_to_congress_num = {"2014":"114", "2016":"115", "2018":"116"}
year = "2018"
assert year in years_to_congress_num.keys()
national_county_gdf = gpd.read_file("./data/county_shapefiles/cb_{}_us_county_500k".format(year))

# Census shapefiles store STATEFP as a two-character, zero-padded string, so pad
# the state FIPS before comparing. Without this, a single-digit state (e.g. "1"
# instead of "01") would silently select zero rows.
state_fips_code = str(state_fips).zfill(2)
state_county_gdf = national_county_gdf[national_county_gdf.STATEFP == state_fips_code][["NAME", "COUNTYFP", "geometry"]].reset_index()
assert len(state_county_gdf) > 0, "No counties found for state FIPS {}".format(state_fips_code)
state_county_gdf.head()

Unnamed: 0,index,NAME,COUNTYFP,geometry
0,161,Chautauqua,13,"POLYGON ((-79.76215 42.24305, -79.76196 42.251..."
1,162,Erie,29,"POLYGON ((-79.13689 42.56980, -79.13299 42.582..."
2,163,Herkimer,43,"POLYGON ((-75.21911 43.05247, -75.21251 43.059..."
3,164,Jefferson,45,"MULTIPOLYGON (((-76.14753 43.94248, -76.14566 ..."
4,165,Kings,47,"POLYGON ((-74.04201 40.62605, -74.04199 40.626..."


In [4]:
# The district-code column is named after the Congress number (CD114FP, CD115FP,
# CD116FP, ...), so derive it from the selected year instead of hard-coding 2018's.
congress_num = years_to_congress_num[year]
cd_fips_column = "CD{}FP".format(congress_num)

national_congressional_districts_gdf = gpd.read_file("./data/congressioinal_distric_shapefiles/tl_{}_us_cd{}".format(year, congress_num))

# STATEFP is a two-character, zero-padded string in Census shapefiles; pad the
# state FIPS so single-digit states are matched as well.
in_state = national_congressional_districts_gdf.STATEFP == str(state_fips).zfill(2)
state_congressional_districts_gdf = (
    national_congressional_districts_gdf[in_state][['NAMELSAD', cd_fips_column, 'geometry']]
    .sort_values(by=cd_fips_column)
    .reset_index()
)
state_congressional_districts_gdf.head()

Unnamed: 0,index,NAMELSAD,CD116FP,geometry
0,417,Congressional District 1,1,"POLYGON ((-73.29646 40.93480, -73.29655 40.934..."
1,432,Congressional District 2,2,"POLYGON ((-73.54556 40.74358, -73.54442 40.743..."
2,435,Congressional District 3,3,"POLYGON ((-73.82758 40.80394, -73.82344 40.804..."
3,438,Congressional District 4,4,"POLYGON ((-73.75557 40.58309, -73.75557 40.583..."
4,423,Congressional District 5,5,"POLYGON ((-74.03813 40.53829, -74.03749 40.542..."


### Persisting the state specific dataset.

This cell makes a folder and stores state data there for your future use (including  `state_congressional_districts_gdf` and `state_county_gdf`). If the folder already exists, then this script will throw an exception to prevent an accidental overwrite.

In [5]:
# Where the state-level extracts are written.
path = './data/state_specific_data'
shapefile_dir = path + "/" + "shapefiles"
congressional_districts_file = '%s_%s_congressional_districts' % (year, state)
counties_file = '%s_%s_counties' % (year, state)

# os.mkdir raises if the directory is already present, which guards against
# accidentally overwriting a previous run.
os.mkdir(path)
os.mkdir(shapefile_dir)

# Generate Shapefiles (for GIS inspection purposes)
state_congressional_districts_gdf.to_file(shapefile_dir + "/" + congressional_districts_file)
state_county_gdf.to_file(shapefile_dir + "/" + counties_file)

### Import the datasets

If you will be using any other statewide datasets (precinct shapefiles or election results) import them below. 

* `gdf` denotes "GeoDataFrame" which is the data structure that will be used to hold shapefiles
* `df` denotes "DataFrame" which is the data structure that will be used to hold election results

In [6]:
# Load the statewide precinct shapefile into a GeoDataFrame and preview it.
statewide_shapefile_file_path = "data/ny_precincts/NY19Partnership"
statewide_shapefile_gdf = gpd.read_file(statewide_shapefile_file_path)
statewide_shapefile_gdf.head(5)

Unnamed: 0,STATEFP,COUNTYFP,VTDST,NAMELSAD,VTDI,LSAD,CHNG_TYPE,ORIG_NAME,ORIG_CODE,RELATE,NAME,VINTAGE,FUNCSTAT,JUSTIFY,MTFCC,geometry
0,36,1,1,1,A,0,,,,,1,90,N,,G5240,"POLYGON Z ((-73.81005 42.63703 0.00000, -73.81..."
1,36,1,2,2,A,0,,,,,2,90,N,,G5240,"POLYGON Z ((-73.79377 42.63951 0.00000, -73.79..."
2,36,1,3,3,A,0,,,,,3,90,N,,G5240,"POLYGON Z ((-73.79031 42.63767 0.00000, -73.79..."
3,36,1,11,11,A,0,,,,,11,90,N,,G5240,"POLYGON Z ((-73.76981 42.64066 0.00000, -73.76..."
4,36,1,13,13,A,0,,,,,13,90,N,,G5240,"POLYGON Z ((-73.76204 42.63530 0.00000, -73.76..."


Looks like no precinct names (just numbers) in `statewide_shapefile_gdf`. Will need to use lookup tables for all counties.

In [7]:
# Check whether VTDST is just NAME left-padded with zeros to six characters;
# a lone `True` in the result means it holds for every row.
padded_names = statewide_shapefile_gdf["NAME"].str.zfill(6)
statewide_shapefile_gdf["VTDST"].eq(padded_names).unique()

array([ True])

In [8]:
# Load the statewide election results CSV into a DataFrame and preview it.
statewide_election_results_file_path = "data/precinct_election_results/NY_G18_pivoted.csv"
statewide_results_df = pd.read_csv(statewide_election_results_file_path)
statewide_results_df.head(5)

Unnamed: 0,county,precinct,HOR_DEM,HOR_REP,HOR_CON,HOR_GRE,HOR_WOR,HOR_IND,HOR_WEP,HOR_REF,...,Gov_DEM,Gov_REP,Gov_CON,Gov_GRE,Gov_WOR,Gov_IND,Gov_WEP,Gov_REF,Gov_SAM,Gov_LBT
0,Albany County,0001 ALBANY W1 ED1,40,7,0,0,5,0,1,0,...,35,9,1,3,3,0,0,0,2,1
1,Albany County,0002 ALBANY W1 ED2,182,20,0,0,21,0,6,3,...,148,32,11,15,11,1,1,2,9,3
2,Albany County,0003 ALBANY W1 ED3,241,41,0,0,11,0,1,4,...,212,55,11,4,10,1,3,0,5,5
3,Albany County,0004 ALBANY W1 ED4,245,18,0,0,18,0,5,3,...,220,30,5,4,12,4,1,3,7,4
4,Albany County,0005 ALBANY W1 ED5,4,0,0,0,0,0,0,0,...,2,0,0,1,0,0,0,0,0,0


### Precondition: County ID

Make a column in `statewide_shapefile_gdf` and `statewide_results_df` called `county_id` with a common naming scheme. County name or county FIPS code would work. I highly recommend using county name, e.g. "Essex County", for readability. You can use the helper functions below to convert between county FIPS and county name.

In [9]:
def get_county_name(county_fips):
    """Return the county name for a county FIPS code within the selected state.

    The 5-character GEOID is built from the module-level `state_fips`
    (zero-padded to 2 characters) followed by `county_fips` (zero-padded to 3
    characters) and looked up in `geoid_to_county_name`.

    Raises:
        AssertionError: if the assembled GEOID is not exactly 5 characters.
        KeyError: if the GEOID is not present in `geoid_to_county_name`.
    """
    # Pad the state part too: an unpadded single-digit state FIPS would produce
    # a 4-character GEOID and fail the length check below.
    geoid = str(state_fips).zfill(2) + str(county_fips).zfill(3)
    assert(len(geoid) == 5)
    return geoid_to_county_name[geoid]

# Lookup table for the selected state (presumably county name -> GEOID; verify
# against op_verification.reference_data).
county_to_geoid = state_fip_to_county_to_geoid[int(state_fips)]


def get_geoid(county_name):
    """Return the entry stored for `county_name` in `county_to_geoid`."""
    geoid = county_to_geoid[county_name]
    return geoid

# TODO: Pass the precondition described above which takes the form of an assert statement in this cell.
# Give both tables a `county_id` column that uses county names.
statewide_shapefile_gdf['county_id'] = statewide_shapefile_gdf['COUNTYFP'].map(get_county_name)
statewide_results_df['county_id'] = statewide_results_df['county']
n_counties = 62

assert 'county_id' in statewide_shapefile_gdf.columns
assert 'county_id' in statewide_results_df.columns

# Distinct county ids seen in each source.
county_set_statewide_shapefile_gdf = set(statewide_shapefile_gdf['county_id'].unique())
county_set_statewide_results_df = set(statewide_results_df['county_id'].unique())

# Report the counties that appear in only one of the two sources.
print("statewide_shapefile_gdf unmatched counties: ", county_set_statewide_shapefile_gdf - county_set_statewide_results_df)
print("statewide_results_df unmatched counties: ", county_set_statewide_results_df - county_set_statewide_shapefile_gdf)

# The sources must share at least one county, and together cover exactly n_counties.
assert not county_set_statewide_shapefile_gdf.isdisjoint(county_set_statewide_results_df)
assert len(county_set_statewide_shapefile_gdf | county_set_statewide_results_df) == int(n_counties)

statewide_shapefile_gdf unmatched counties:  {'Genesee County'}
statewide_results_df unmatched counties:  set()


### Precondition: Precinct Name

Identify the column that contains the precinct names to be matched and copy it into a new column named `original_precinct_name`.

In [10]:
# TODO: Pass the precondition described above which takes the form of an assert statement in this cell.
# Copy the precinct-name columns themselves. Assigning the bare column label
# (e.g. 'NAMELSAD') would broadcast that literal string to every row, leaving
# every precinct with the same "name" while still passing the assert below.
statewide_shapefile_gdf['original_precinct_name'] = statewide_shapefile_gdf['NAMELSAD']
statewide_results_df['original_precinct_name'] = statewide_results_df['precinct']
assert 'original_precinct_name' in statewide_shapefile_gdf.columns and 'original_precinct_name' in statewide_results_df.columns

The next cell makes a directory where all the county specific matching will take place and initializes each county matching folder.

In [11]:
# Root folder for all county-level matching work. os.mkdir raises if it already
# exists, so a previous run is never silently overwritten.
path = './matching'
os.mkdir(path)

# Read the template notebook once, since it is identical for every county.
# Notebooks are UTF-8 JSON, so do not rely on the platform default encoding.
with open('precinct_matching_framework.ipynb', 'r', encoding='utf-8') as file:
    precinct_matching_template = file.read()

for county_id in county_set_statewide_shapefile_gdf.union(county_set_statewide_results_df):
    # Make a folder for this county
    county_dir = '/'.join([path, county_id])
    os.mkdir(county_dir)

    # Make the Matching Notebook: the template with the county_id placeholder
    # filled in. Writing it directly replaces the earlier copy-then-overwrite.
    notebook_filename = 'precinct_matching_county_id={}.ipynb'.format(county_id)
    notebook_filepath = "/".join([county_dir, notebook_filename])
    precinct_matching_framework = precinct_matching_template.replace('<$COUNTY_ID$>', county_id)
    with open(notebook_filepath, 'w', encoding='utf-8') as file:
        file.write(precinct_matching_framework)

    # Initialize a README.md file
    with open('/'.join([county_dir, "README.md"]), "a", encoding='utf-8') as f:
        f.write("## Documentation for matching in `county_id` = {}".format(county_id))

    # Generate CSV for the election results for this county
    if county_id in county_set_statewide_results_df:
        county_results_filename = 'election_results_county_id={}.csv'.format(county_id)
        county_results_df = statewide_results_df[statewide_results_df.county_id == county_id]
        county_results_df.to_csv("/".join([county_dir, county_results_filename]), index=False)

    # Generate Shapefile (for GIS inspection purposes)
    if county_id in county_set_statewide_shapefile_gdf:
        county_shapefile_filename = 'shapefile_county_id={}'.format(county_id)
        county_shapefile_gdf = statewide_shapefile_gdf[statewide_shapefile_gdf.county_id == county_id]
        county_shapefile_gdf.to_file("/".join([county_dir, county_shapefile_filename]))