# Precinct Matching Framework

In [1]:
import pandas as pd
import geopandas as gpd
from pprint import pprint

from op_verification.reference_data import (
    geoid_to_county_name,
    state_to_fips,
    state_to_state_po,
    state_abbreviation_to_state_name,
    state_fip_to_county_to_geoid,
)

### Import the datasets

In [2]:
# County whose election results and precinct shapes are matched in this notebook.
county_id = 'Bibb County'

# Election results for the county (CSV).
county_results_filename = 'election_results_county_id={}.csv'.format(county_id)
county_results_df = pd.read_csv(county_results_filename)

# Precinct geometries for the same county.
county_shapefile_filename = 'shapefile_county_id={}'.format(county_id)
# Shapefile column names are limited to 10 characters, so
# 'original_precinct_name' arrives truncated; restore the full name.
county_shapefile_gdf = gpd.read_file(county_shapefile_filename).rename(
    columns={'original_p': 'original_precinct_name'}
)

The next cell copies `county_results_df` into `df` and `county_shapefile_gdf` into `gdf`. The shorter names make data exploration faster to type, and working on copies leaves the loaded data untouched.

In [3]:
# Work on independent copies so the frames loaded above stay unmodified.
df = county_results_df.copy(deep=True)
gdf = county_shapefile_gdf.copy(deep=True)

In [4]:
# Preview the first rows of the election results.
df.head()

Unnamed: 0,county,precinct,Gov_DEM,Gov_REP,LtGov_DEM,LtGov_REP,StHOR_DEM,StHOR_IND,StHOR_LIB,StHOR_REP,StSen_DEM,StSen_IND,StSen_REP,SP_DEM,SP_LIB,SP_REP,USHOR_DEM,USHOR_REP,county_id,original_precinct_name
0,Bibb County,ABSENTEE,41.0,155.0,38.0,157.0,14.0,,,139.0,37.0,,155.0,27.0,,125.0,35.0,159.0,Bibb County,ABSENTEE
1,Bibb County,ALTERNATIVE SCHOOL,374.0,1034.0,333.0,1065.0,,,,1087.0,329.0,,1071.0,260.0,,775.0,326.0,1070.0,Bibb County,ALTERNATIVE SCHOOL
2,Bibb County,BRENT CITY HALL,419.0,252.0,419.0,249.0,445.0,,,2.0,415.0,,255.0,374.0,,169.0,418.0,251.0,Bibb County,BRENT CITY HALL
3,Bibb County,CENTREVILLE ROCK BLDG,203.0,647.0,172.0,675.0,127.0,,,471.0,161.0,,689.0,133.0,,488.0,167.0,682.0,Bibb County,CENTREVILLE ROCK BLDG
4,Bibb County,EOLINE FIRE DEPT,83.0,434.0,78.0,436.0,82.0,,,94.0,78.0,,436.0,59.0,,324.0,71.0,444.0,Bibb County,EOLINE FIRE DEPT


In [5]:
# Preview the first rows of the precinct shapefile.
gdf.head()

Unnamed: 0,STATEFP,COUNTYFP,VTDST,NAMELSAD,VTDI,LSAD,CHNG_TYPE,ORIG_NAME,ORIG_CODE,RELATE,NAME,VINTAGE,FUNCSTAT,JUSTIFY,MTFCC,county_id,original_precinct_name,geometry
0,1,7,1,Green Pond Fire Dept.-1,A,0,,,,,Green Pond Fire Dept.-1,90,N,,G5240,Bibb County,Green Pond Fire Dept.-1,"POLYGON Z ((-87.17367 33.21599 0.00000, -87.17..."
1,1,7,2,Alternative School-2,A,0,,,,,Alternative School-2,90,N,,G5240,Bibb County,Alternative School-2,"POLYGON Z ((-87.31477 33.05873 0.00000, -87.31..."
2,1,7,3,Eoline Fire Dept-3,A,0,,,,,Eoline Fire Dept-3,90,N,,G5240,Bibb County,Eoline Fire Dept-3,"POLYGON Z ((-87.42194 33.00338 0.00000, -87.41..."
3,1,7,4,Brent City Hall-4,A,0,,,,,Brent City Hall-4,90,N,,G5240,Bibb County,Brent City Hall-4,"POLYGON Z ((-87.26785 32.88378 0.00000, -87.26..."
4,1,7,5,Rock Building-5,A,0,,,,,Rock Building-5,90,N,,G5240,Bibb County,Rock Building-5,"POLYGON Z ((-87.21260 32.83543 0.00000, -87.21..."


### Check Preconditions
These should all pass - they're here to ensure that everything in `setup.ipynb` worked correctly

In [6]:
# Preconditions: both datasets must carry the columns that the rest of the
# notebook relies on. If an assert fails, fix the inputs before continuing.
assert all('county_id' in frame.columns for frame in (df, gdf))
assert all('original_precinct_name' in frame.columns for frame in (df, gdf))

### General Modifications
It's normally beneficial to apply some modifications uniformly to all precincts. For example, it's good practice to make everything lower case. This modification is made in `edit_precinct_name` - read its specification to learn more about how to use it to make more modifications. Consider removing substrings that appear in every precinct name, like `voting district`. Be careful about removing words when doing so would produce duplicate precinct names. For example, if there are two `Newtown Voting District` precincts in the shapefile, and the election results have `Newtown Boro` and `Newtown Township`, it's okay to remove `Voting District`, but you probably don't want to remove `Boro` or `Township`. Of course, this will differ from county to county - so be vigilant!

In [7]:
# Labels used in the diagnostic printouts below to show which dataset a
# precinct name came from.
dataset_name_df, dataset_name_gdf = 'df', 'gdf'

def edit_precinct_name(prec_name,
    remove_lst=None,
    target_to_replacement=None,
    stopping_words=None,
    prec_dict=None):
    '''
    Returns a normalised, lower case precinct name (string).

    Modifications are performed in the order of the parameters they depend on.
    By convention, case is ignored by making prec_name lower case. Accordingly,
    one should pass arguments with lower case elements. That is, keys of the
    dictionaries and elements of lists should be lower case strings.

    Parameters:
        prec_name (str): precinct name. Non-string values are converted with
            str() first (so a missing value such as NaN becomes 'nan').
        remove_lst ((str) list): every occurrence of each string in this list
            is removed from prec_name. All elements should be lower case.
            Defaults to no removals.
        target_to_replacement ({str:str} dictionary): every occurrence of a key
            (target) in prec_name is replaced with its value (replacement).
            All keys should be lower case. Defaults to no replacements.
        stopping_words ((str) list): after the steps above, prec_name is split
            on whitespace and every word equal to an element of this list is
            dropped. All elements should be lower case. Defaults to no
            stopping words.
        prec_dict ({str:str} dictionary): after all the modifications above, if
            the edited prec_name is a key of prec_dict, it is replaced with
            that key's value. All keys should be lower case. Defaults to no
            overrides.

    Returns:
        prec_name (str): the edited name - lower case, with leading zeros
            stripped from each word, words separated by single spaces, and the
            modifications above applied.
    '''
    # None defaults avoid sharing mutable default objects between calls.
    if remove_lst is None:
        remove_lst = []
    if target_to_replacement is None:
        target_to_replacement = {}
    if stopping_words is None:
        stopping_words = []
    if prec_dict is None:
        prec_dict = {}

    prec_name = str(prec_name).lower()
    for fragment in remove_lst:
        prec_name = prec_name.replace(fragment, '')
    for target, replacement in target_to_replacement.items():
        prec_name = prec_name.replace(target, replacement)

    # Strip leading zeros ('007' -> '7'), but keep a single '0' for a word
    # made only of zeros; otherwise the word becomes '' and leaves a stray
    # space in the joined name.
    words = [
        word.lstrip('0') or '0'
        for word in prec_name.split()
        if word not in stopping_words
    ]
    prec_name = " ".join(words)
    return prec_dict.get(prec_name, prec_name)

# Overrides for election-results names (edited name -> replacement); none needed.
df_to_gdf = {}

# Overrides for shapefile names: edited shapefile name -> name used in the
# election results.
# NOTE(review): 'alternative school-2' maps to 'centreville rock bldg' while the
# other 'alternative school-*' keys map to 'alternative school' - confirm this
# is intentional.
gdf_to_df = {
    'alternative school-10': 'alternative school',
    'alternative school-11': 'alternative school',
    'alternative school-2': 'centreville rock bldg',
    'brent city hall-13': 'brent city hall',
    'brent city hall-4': 'brent city hall',
    'brent national guard armory': 'national guard armory',
    'eoline fire dept-3': 'eoline fire dept',
    'eoline fire dept.-12': 'eoline fire dept',
    'green pond fire dept.-1': 'greenpond fire dept',
    'greenpond fire dept-11.01': 'greenpond fire dept',
    'lawley comm center-7': 'lawley community ctr',
    'lawley comm center-8': 'lawley community ctr',
    'rock building -14': 'centreville rock bldg',
    'rock building-5': 'centreville rock bldg',
    'six mile comm center-10.01': 'six mile community ctr',
    'six mile comm center-6': 'six mile community ctr',
}

# Tune the matching by passing further optional arguments to edit_precinct_name.
df['edited_precinct_name'] = df['original_precinct_name'].map(
    lambda raw_name: edit_precinct_name(raw_name, prec_dict=df_to_gdf)
)
gdf['edited_precinct_name'] = gdf['original_precinct_name'].map(
    lambda raw_name: edit_precinct_name(raw_name, prec_dict=gdf_to_df)
)

######## Manual Corrections ###########
# Precinct-specific corrections go here, e.g. splitting one precinct into two
# because of new congressional districts:
# gdf.loc[gdf['VTDST']=='000290','edited_precinct_name'] = 'howard township'

# Pair each original name with its edited name; the matching framework below
# reads these (original, edited) tuples.
df['original_precinct_name, edited_precinct_name'] = (
    df[['original_precinct_name', 'edited_precinct_name']].apply(tuple, axis=1)
)
gdf['original_precinct_name, edited_precinct_name'] = (
    gdf[['original_precinct_name', 'edited_precinct_name']].apply(tuple, axis=1)
)

######## Matching Framework ###########
# Edited names listed here are excluded from the comparison: they are not
# expected to have a counterpart in the other dataset.
unmatched_precinct_lst_gdf = []
unmatched_precinct_lst_df = ['absentee', 'provisional']

# Unique (original, edited) name pairs for this county, ordered by edited name.
precinct_list_df = sorted(df[df['county_id'] == county_id]['original_precinct_name, edited_precinct_name'].unique(), key=lambda x: x[1])
precinct_list_gdf = sorted(gdf[gdf['county_id'] == county_id]['original_precinct_name, edited_precinct_name'].unique(), key=lambda x: x[1])

# Edited names that take part in the matching.
precinct_set_df = {x[1] for x in precinct_list_df if x[1] not in unmatched_precinct_lst_df}
precinct_set_gdf = {x[1] for x in precinct_list_gdf if x[1] not in unmatched_precinct_lst_gdf}

# Names present in one dataset but not in the other.
unmatched_precincts_df = sorted(precinct_set_df - precinct_set_gdf)
unmatched_precincts_gdf = sorted(precinct_set_gdf - precinct_set_df)
n_unmatched = len(unmatched_precincts_df) + len(unmatched_precincts_gdf)
if n_unmatched > 0:
    print("county_id: '{}' | {} precincts in {} | {} precincts in {}:\n".format(county_id, len(precinct_list_df), dataset_name_df, len(precinct_list_gdf), dataset_name_gdf))
    n_precincts_total = len(precinct_list_df) + len(precinct_list_gdf)
    print(n_unmatched, " precincts are unmatched out of ", n_precincts_total)
    # Rows behind the unmatched names, kept for interactive inspection.
    df_unmatched = df[(df['edited_precinct_name'].isin(unmatched_precincts_df)) & (df.county_id == county_id)]
    gdf_unmatched = gdf[(gdf['edited_precinct_name'].isin(unmatched_precincts_gdf)) & (gdf.county_id == county_id)]
    if n_unmatched > 100:
        print("\nLook for parterns and use change the parameters to edit_precinct_name accordingly.\n")
        for index, (original_precinct_name_df, edited_precinct_name_df) in enumerate(precinct_list_df):
            # The two sorted lists are paired by position. The shapefile list
            # may be shorter, so fall back to a blank pair instead of raising
            # IndexError.
            original_precinct_name_gdf, edited_precinct_name_gdf = precinct_list_gdf[index] if index < len(precinct_list_gdf) else ('', '')
            if edited_precinct_name_df in unmatched_precincts_df and edited_precinct_name_gdf in unmatched_precincts_gdf:
                print("{} <-- {} ({})".format(edited_precinct_name_df, original_precinct_name_df, dataset_name_df))
                print("{} <-- {} ({})\n".format(edited_precinct_name_gdf, original_precinct_name_gdf, dataset_name_gdf))
    else:
        print("unmatched_precincts_df ({}) - len = {}| '{}':".format(dataset_name_df, len(unmatched_precincts_df), county_id), unmatched_precincts_df)
        print("\nunmatched_precincts_gdf ({}) - len = {}| '{}':".format(dataset_name_gdf, len(unmatched_precincts_gdf), county_id), unmatched_precincts_gdf)
        # Starting points for the override dictionaries: the i-th unmatched
        # name of one dataset is paired with the i-th unmatched name of the
        # other ('' when the other list is shorter). Review before using them.
        precinct_modification_dictionary_df_to_gdf = {unmatched_precincts_df[i]: unmatched_precincts_gdf[i] if i < len(unmatched_precincts_gdf) else '' for i in range(len(unmatched_precincts_df))}
        precinct_modification_dicitonary_gdf_to_df = {unmatched_precincts_gdf[i]: unmatched_precincts_df[i] if i < len(unmatched_precincts_df) else '' for i in range(len(unmatched_precincts_gdf))}
        print("{}  to {} precinct modification dictionary: ".format(dataset_name_df, dataset_name_gdf))
        print("'{}':".format(county_id))
        pprint(precinct_modification_dictionary_df_to_gdf)
        print("{}  to {} precinct modification dictionary: ".format(dataset_name_gdf, dataset_name_df))
        print("'{}':".format(county_id))
        pprint(precinct_modification_dicitonary_gdf_to_df)
        for index, (original_precinct_name_df, edited_precinct_name_df) in enumerate(precinct_list_df):
            # Same positional pairing and blank-pair fallback as above.
            original_precinct_name_gdf, edited_precinct_name_gdf = precinct_list_gdf[index] if index < len(precinct_list_gdf) else ('', '')
            if edited_precinct_name_df in unmatched_precincts_df or edited_precinct_name_gdf in unmatched_precincts_gdf:
                print("{} <-- {} ({})".format(edited_precinct_name_df, original_precinct_name_df, dataset_name_df))
                print("{} <-- {} ({})\n".format(edited_precinct_name_gdf, original_precinct_name_gdf, dataset_name_gdf))
        # Always printed once the listing is done (this was a for/else on a
        # loop without break, which runs unconditionally).
        print("Add unmatched precincts to the unmatched precinct.")
else:
    print("All Done! (make sure you have one to one matches)")

All Done! (make sure you have one to one matches)


### Validate

In [8]:
# Edited names that may legitimately repeat in the election results, mapped to
# the reason each repeat is acceptable (none so far).
acceptable_duplicates_to_reason_df = {}

# Any other edited name that occurs more than once is an error.
counts = df['edited_precinct_name'].value_counts()
duplicates = counts.loc[
    (counts > 1) & ~counts.index.isin(list(acceptable_duplicates_to_reason_df))
]
assert duplicates.empty

In [9]:
# Edited shapefile names that are allowed to repeat, mapped to the reason.
# All of them share the same reason.
acceptable_duplicates_to_reason_gdf = dict.fromkeys(
    [
        'centreville rock bldg',
        'six mile community ctr',
        'eoline fire dept',
        'lawley community ctr',
        'alternative school',
        'brent city hall',
        'greenpond fire dept',
    ],
    'Only one in the results - merge the geometries',
)

# Any other edited name that occurs more than once is an error.
counts = gdf['edited_precinct_name'].value_counts()
duplicates = counts.loc[
    (counts > 1) & ~counts.index.isin(list(acceptable_duplicates_to_reason_gdf))
]
assert duplicates.empty

In [10]:
# Show the duplicated names that are not on the acceptable list (expected to be empty).
duplicates

Series([], Name: edited_precinct_name, dtype: int64)

In [11]:
# Number of distinct edited precinct names in the shapefile.
gdf.edited_precinct_name.nunique()

8

In [12]:
# Number of distinct original precinct names in the shapefile.
gdf.original_precinct_name.nunique()

16

In [13]:
# Number of distinct edited precinct names in the election results.
df.edited_precinct_name.nunique()

10

In [14]:
# NOTE(review): identical to the previous cell - possibly
# `df.original_precinct_name.nunique()` was intended, mirroring the gdf
# checks above; confirm.
df.edited_precinct_name.nunique()

10

In [15]:
# Drop the result rows whose edited name is listed in unmatched_precinct_lst_df;
# those rows are excluded from the export. The explicit copy makes the result an
# independent frame, so the column assignments in the export step do not trigger
# SettingWithCopyWarning.
df = df.loc[~df['edited_precinct_name'].isin(unmatched_precinct_lst_df)].copy()

### Export

In [16]:
# Title-case the edited names for the exported file.
df['edited_precinct_name'] = df['edited_precinct_name'].str.title()
# Join key shared with the shapefile: '<county_id>, <precinct>'.
df['loc, prec'] = county_id + ', ' + df['edited_precinct_name']
# Sum the vote counts per precinct. numeric_only=True keeps only the numeric
# columns on every pandas version; without it, pandas >= 2.0 also "sums" the
# string and tuple columns (concatenating them), which breaks the later join
# and column selection. Missing counts (NaN) contribute 0 to the sums.
df = df.groupby(by='loc, prec').sum(numeric_only=True)

In [17]:
# Title-case the edited names and build the same '<county_id>, <precinct>' key
# that the election results use.
gdf['edited_precinct_name'] = gdf['edited_precinct_name'].str.title()
gdf['loc, prec'] = gdf['edited_precinct_name'].map(
    lambda precinct_name: county_id + ', ' + precinct_name
)

# County name -> GEOID lookup. The key 1 matches the STATEFP value shown for
# these rows - presumably the state FIPS code; TODO confirm.
county_to_geoid = state_fip_to_county_to_geoid[1]

# Dissolve rows that share a key into one geometry, attach the county GEOID,
# and index by the key so the frame can be joined to the results.
gdf = (
    gdf.dissolve(by='loc, prec', as_index=False)
    .assign(GEOID=lambda dissolved: dissolved['county_id'].map(county_to_geoid))
    .set_index('loc, prec')
)

In [18]:
# Columns of the exported file, in order: identifiers first, then every
# aggregated results column, then the geometry.
cols = [
    'loc, prec',
    'GEOID',
    'county_id',
    'edited_precinct_name',
    *df.columns,
    'geometry',
]
cols

['loc, prec',
 'GEOID',
 'county_id',
 'edited_precinct_name',
 'Gov_DEM',
 'Gov_REP',
 'LtGov_DEM',
 'LtGov_REP',
 'StHOR_DEM',
 'StHOR_IND',
 'StHOR_LIB',
 'StHOR_REP',
 'StSen_DEM',
 'StSen_IND',
 'StSen_REP',
 'SP_DEM',
 'SP_LIB',
 'SP_REP',
 'USHOR_DEM',
 'USHOR_REP',
 'geometry']

In [19]:
# Left-join the dissolved shapes onto the aggregated results via the shared
# 'loc, prec' index, then turn that index back into a regular column.
joined_df = (
    df.join(other=gdf, how='left', lsuffix='_left', rsuffix='_right')
    .reset_index()
)
print(joined_df.shape)
joined_df.head(2)

(8, 38)


Unnamed: 0,"loc, prec",Gov_DEM,Gov_REP,LtGov_DEM,LtGov_REP,StHOR_DEM,StHOR_IND,StHOR_LIB,StHOR_REP,StSen_DEM,...,NAME,VINTAGE,FUNCSTAT,JUSTIFY,MTFCC,county_id,original_precinct_name,edited_precinct_name,"original_precinct_name, edited_precinct_name",GEOID
0,"Bibb County, Alternative School",374.0,1034.0,333.0,1065.0,0.0,0.0,0.0,1087.0,329.0,...,Alternative School-10,90,N,,G5240,Bibb County,Alternative School-10,Alternative School,"(Alternative School-10, alternative school)",1007
1,"Bibb County, Brent City Hall",419.0,252.0,419.0,249.0,445.0,0.0,0.0,2.0,415.0,...,Brent City Hall-4,90,N,,G5240,Bibb County,Brent City Hall-4,Brent City Hall,"(Brent City Hall-4, brent city hall)",1007


In [20]:
# Keep only the export columns, publish the edited name as 'precinct', and wrap
# the result in a GeoDataFrame so it can be written as a shapefile.
output = gpd.GeoDataFrame(
    joined_df.loc[:, cols].rename(columns={'edited_precinct_name': 'precinct'})
)
output.head()

Unnamed: 0,"loc, prec",GEOID,county_id,precinct,Gov_DEM,Gov_REP,LtGov_DEM,LtGov_REP,StHOR_DEM,StHOR_IND,...,StHOR_REP,StSen_DEM,StSen_IND,StSen_REP,SP_DEM,SP_LIB,SP_REP,USHOR_DEM,USHOR_REP,geometry
0,"Bibb County, Alternative School",1007,Bibb County,Alternative School,374.0,1034.0,333.0,1065.0,0.0,0.0,...,1087.0,329.0,0.0,1071.0,260.0,0.0,775.0,326.0,1070.0,"POLYGON Z ((-87.20296 33.13042 0.00000, -87.20..."
1,"Bibb County, Brent City Hall",1007,Bibb County,Brent City Hall,419.0,252.0,419.0,249.0,445.0,0.0,...,2.0,415.0,0.0,255.0,374.0,0.0,169.0,418.0,251.0,"POLYGON Z ((-87.26785 32.88378 0.00000, -87.26..."
2,"Bibb County, Centreville Rock Bldg",1007,Bibb County,Centreville Rock Bldg,203.0,647.0,172.0,675.0,127.0,0.0,...,471.0,161.0,0.0,689.0,133.0,0.0,488.0,167.0,682.0,"MULTIPOLYGON Z (((-87.13922 32.93553 0.00000, ..."
3,"Bibb County, Eoline Fire Dept",1007,Bibb County,Eoline Fire Dept,83.0,434.0,78.0,436.0,82.0,0.0,...,94.0,78.0,0.0,436.0,59.0,0.0,324.0,71.0,444.0,"POLYGON Z ((-87.42194 33.00338 0.00000, -87.41..."
4,"Bibb County, Greenpond Fire Dept",1007,Bibb County,Greenpond Fire Dept,188.0,1084.0,146.0,1123.0,0.0,0.0,...,1149.0,144.0,0.0,1123.0,82.0,0.0,821.0,136.0,1135.0,"POLYGON Z ((-87.19922 33.18954 0.00000, -87.19..."


In [21]:
# Write the result under parent_dir, in a path named after the county with
# whitespace replaced by underscores.
parent_dir = '../../data/shapefiles/county_level_precinct_election_results'
output.to_file(parent_dir + '/' + "_".join(county_id.split()))