# Pennsylvania 2018 General Election Shapefile Assembly Script
Joins precinct geometries with election results for the 2018 Pennsylvania General Election.

Author: [Baxter Demers](https://github.com/baxterdemers)

In [1]:
from reference_data import (
    geoid_to_county_name,
    state_to_fips,
    state_abbreviation_to_state_name,
    state_fip_to_county_to_geoid,
)

import numpy as np
import pandas as pd
import geopandas as gpd
import re

### Import the datasets

In [2]:
# Census Partnership VTD precinct geometries; `gdf_name` is the label used when
# printing diagnostics about this dataset.
gdf_name = 'Census Partnership VTDs (Shapefile)'
census_vtd_path = 'data/shapefiles/pa-precincts/census-partnership/compiled'
gdf = gpd.read_file(census_vtd_path)
gdf.head(2)

Unnamed: 0,STATEFP,COUNTYFP,VTDST,NAMELSAD,VTDI,LSAD,CHNG_TYPE,ORIG_NAME,ORIG_CODE,RELATE,NAME,VINTAGE,FUNCSTAT,JUSTIFY,MTFCC,geometry
0,42,99,10,BLAIN Voting District,A,V2,,,,,BLAIN,90,N,,G5240,"POLYGON Z ((-77.51856 40.33821 0.00000, -77.51..."
1,42,99,20,BLOOMFIELD Voting District,A,V2,,,,,BLOOMFIELD,90,N,,G5240,"POLYGON Z ((-77.20030 40.41591 0.00000, -77.20..."


In [3]:
# Open Elections precinct-level results; `df_name` is the label used when
# printing diagnostics about this dataset.
df_name = 'Open Elections (Results)'
open_elections_csv = 'data/election-results/open-elections/20181106__pa__general__precinct.csv'
df = pd.read_csv(open_elections_csv, low_memory=False)
df.head(2)

Unnamed: 0,county,precinct,office,district,candidate,party,votes,absentee,election_day
0,York,Carroll Township,U.S. Senate,,"Bob CASEY, JR.",Dem,958,,
1,York,Carroll Township,U.S. Senate,,Lou BARLETTA,Rep,1858,,


### Precondition: County ID
Create a `county_id` column in both `gdf` and `df` using a common naming scheme. Either the county name or the county FIPS code will work.

In [4]:
pa_fips_code = state_to_fips['Pennsylvania']
def get_county_name_pa(county_fips):
    """Return the county name for a Pennsylvania county FIPS code.

    The 5-character GEOID is the state code zero-padded to 2 characters
    followed by `county_fips` zero-padded to 3 characters; it is looked up in
    `geoid_to_county_name` (a KeyError propagates for an unknown GEOID).
    """
    state_part = str(pa_fips_code).zfill(2)
    county_part = str(county_fips).zfill(3)
    geoid = f"{state_part}{county_part}"
    # Guards against inputs longer than the expected 2 + 3 characters.
    assert len(geoid) == 5
    return geoid_to_county_name[geoid]

county_to_geoid_pa = state_fip_to_county_to_geoid[pa_fips_code]
def get_geoid_pa(county_name):
    """Return the GEOID stored for `county_name` in the Pennsylvania lookup table.

    Raises KeyError when `county_name` is not a key of `county_to_geoid_pa`.
    """
    geoid = county_to_geoid_pa[county_name]
    return geoid

# Shapefile side: derive the county name from the county FIPS code.
gdf['county_id'] = gdf['COUNTYFP'].map(get_county_name_pa)
# Results side: the raw county name (whitespace-trimmed) plus the ' County' suffix.
df['county_id'] = df['county'].map(lambda raw_name: raw_name.strip() + ' County')
assert all('county_id' in frame.columns for frame in (gdf, df))

# Report counties that appear in one dataset but not the other.
county_set_x = set(gdf['county_id'].unique())
county_set_y = set(df['county_id'].unique())
print(gdf_name, "unmatched counties: ", county_set_x - county_set_y)
print(df_name, "unmatched counties: ", county_set_y - county_set_x)
# The two naming schemes must overlap at least somewhere, otherwise nothing can be joined.
assert not county_set_x.isdisjoint(county_set_y)

Census Partnership VTDs (Shapefile) unmatched counties:  set()
Open Elections (Results) unmatched counties:  {'Westmoreland County', 'Delaware County', 'Armstrong County'}


In [5]:
# Keep only the three contests being assembled.
offices_of_interest = ['U.S. Senate', 'Governor', 'U.S. House']
df = df.loc[df.office.isin(offices_of_interest)]

# For each office, list the counties present in the filtered results that have no rows for that office.
counties_in_results = df.county_id.unique()
df.groupby('office').county_id.unique().map(
    lambda counties_with_office: [
        county_id for county_id in counties_in_results if county_id not in counties_with_office
    ]
)

office
Governor       [Butler County, Westmoreland County]
U.S. House                      [Schuylkill County]
U.S. Senate                     [Clearfield County]
Name: county_id, dtype: object

### Fill in missing election results

As you can see in the cell above, Open Elections is missing results for the following offices in the specified counties.

* Governor: [Butler, Westmoreland] 
* U.S. House: [Schuylkill] 
* U.S. Senate: [Clearfield]

Accordingly, I parsed some or all of the election results from each county's website. For more details, check out `county-files/{county_id}/parsing-election-results/`. The following cell adds these parsed results to the results dataframe so they can be used later for matching.

In [6]:
# Results parsed by hand from the county websites (see the markdown above).
butler_results_df = pd.read_csv('county-files/Butler County/parsing-election-results/butler_county_parsed.csv')
westmoreland_results_df = pd.read_csv('county-files/Westmoreland County/parsing-election-results/westmoreland_county_parsed.csv')
# NOTE(review): this path uses `parsing_election_results` (underscores) while the
# other counties use `parsing-election-results` — confirm it matches the directory on disk.
clearfield_results_df = pd.read_csv('county-files/Clearfield County/parsing_election_results/clearfield_county_parsed.csv')

# Drop every Open Elections row for the counties whose results are replaced wholesale.
# The name is stripped first so the filter agrees with how `county_id` is built.
replaced_counties = {"Butler", "Westmoreland", "Clearfield"}
df = df[~df.county.str.strip().isin(replaced_counties)]

# Schuylkill keeps its Open Elections rows; multi-word precinct names lose their first token.
schuylkill_rows = df.county_id == 'Schuylkill County'
df.loc[schuylkill_rows, 'precinct'] = df.loc[schuylkill_rows, 'precinct'].map(
    lambda x: x[x.index(' ')+1:] if len(x.split())> 1 else x
)
schuylkill_us_house_results_df = pd.read_csv('county-files/Schuylkill County/parsing-election-results/schuylkill_county_parsed.csv')

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# ignore_index=True gives the combined frame unique row labels instead of
# repeating the labels of each parsed CSV.
df = pd.concat(
    [
        df,
        butler_results_df,
        westmoreland_results_df,
        clearfield_results_df,
        schuylkill_us_house_results_df,
    ],
    ignore_index=True,
)
df = df.loc[df.office.isin(['U.S. Senate', 'Governor', 'U.S. House'])]
# Recompute county_id so the newly added rows get one as well.
df['county_id'] = df['county'].apply(lambda x: x.strip() + ' County')

# Re-run the county comparison on the completed results.
county_set_x = set(gdf['county_id'].unique())
county_set_y = set(df['county_id'].unique())
print(gdf_name, "unmatched counties: ", county_set_x.difference(county_set_y))
print(df_name, "unmatched counties: ", county_set_y.difference(county_set_x))
assert county_set_x.intersection(county_set_y) != set()

Census Partnership VTDs (Shapefile) unmatched counties:  set()
Open Elections (Results) unmatched counties:  {'Westmoreland County', 'Delaware County', 'Armstrong County'}


### Precondition: Precinct Name
Identify the column that should contain the precinct names to be matched by naming it `original_precinct_name`.

In [7]:
# Both frames expose the whitespace-trimmed precinct name under the shared
# column `original_precinct_name`; the assert below is the precondition check.
for frame, source_column in ((gdf, 'NAME'), (df, 'precinct')):
    frame['original_precinct_name'] = frame[source_column].str.strip()
assert all('original_precinct_name' in frame.columns for frame in (gdf, df))

### Shapefile modifications

The counties listed below in comments (e.g. `# Elk County`) were either missing from the original census VTD shapefile `('Westmoreland County', 'Armstrong County', 'Delaware County')` or needed to be replaced for accuracy reasons `('Elk County', 'Philadelphia County','Cumberland County','Dauphin County')`.

In [8]:
# Each county file gets the two shared columns (`county_id`, `original_precinct_name`)
# and is queued in `to_append` for merging into the statewide frame.

# Elk County
elk_gdf = gpd.read_file('county-files/Elk County/ElkCoPAdistricts')
elk_gdf = elk_gdf.assign(
    county_id='Elk County',
    original_precinct_name=elk_gdf['NAME'].str.strip(),
)

# Philadelphia County
philly_gdf = gpd.read_file('county-files/Philadelphia County/Divisions_2018General')
philly_gdf = philly_gdf.assign(
    county_id='Philadelphia County',
    original_precinct_name=philly_gdf['DIVISION_N'].str.strip(),
)

# Cumberland County
cumberland_gdf = gpd.read_file('county-files/Cumberland County/Voting_Precincts-shp')
cumberland_gdf = cumberland_gdf.assign(
    county_id='Cumberland County',
    original_precinct_name=cumberland_gdf['Precinct'].str.strip(),
)

to_append = [elk_gdf, philly_gdf, cumberland_gdf]

# Dauphin County
def dauphin_precs(row):
    """Build a Dauphin County precinct label from one row.

    Starts from `row['MCD_NAME']` and appends `row['WARD']` and then
    `row['PRECINCT']`, each preceded by a single space, skipping any of the
    two that is falsy (e.g. None or an empty string).
    """
    label = row['MCD_NAME']
    for field in ('WARD', 'PRECINCT'):
        part = row[field]
        if part:
            label = label + ' ' + part
    return label
dauphin_gdf = gpd.read_file('county-files/Dauphin County/Voting_Districts-shp')
dauphin_gdf['county_id'] = 'Dauphin County'
# The precinct label is assembled row by row from MCD_NAME / WARD / PRECINCT.
dauphin_precinct_names = dauphin_gdf.apply(dauphin_precs, axis=1)
dauphin_gdf['original_precinct_name'] = dauphin_precinct_names
to_append.append(dauphin_gdf)

# A second VTD shapefile supplies the three counties absent from the census file.
migurski_pa_16 = gpd.read_file('data/shapefiles/pa-precincts/VTDs_Oct17')
migurski_pa_16['county_id'] = migurski_pa_16['COUNTYNAME'].str.strip() + ' County'
migurski_pa_16['original_precinct_name'] = migurski_pa_16['NAMELSAD']

# Westmoreland, Armstrong and Delaware Counties: one independent copy per county,
# queued in that order.
westmoreland_gdf, armstrong_gdf, delaware_gdf = (
    migurski_pa_16[migurski_pa_16.COUNTYNAME == county_name].copy()
    for county_name in ('Westmoreland', 'Armstrong', 'Delaware')
)
to_append.extend([westmoreland_gdf, armstrong_gdf, delaware_gdf])

# Counties whose census geometries are discarded in favour of the county files queued in `to_append`.
county_id_to_be_replaced_lst = ['Elk County', 'Philadelphia County','Cumberland County','Dauphin County']
gdf = gdf[~(gdf.county_id.isin(county_id_to_be_replaced_lst))]

# Fall back to EPSG:4326 when the census shapefile carries no CRS, so the
# county files have a target to be reprojected into.
if not gdf.crs:
    gdf.crs = "epsg:4326"

# DataFrame.append (and with it GeoDataFrame.append) was removed in pandas 2.0;
# pd.concat is the supported equivalent. Every county file is reprojected to
# the statewide CRS before being stacked.
reprojected_counties = [cnty_gdf.to_crs(gdf.crs) for cnty_gdf in to_append]
gdf = pd.concat([gdf] + reprojected_counties, ignore_index=True)

### Data Cleaning

In [9]:
def remove_commas(df, col_lst):
    """
    Strip thousands separators from number columns and cast them to int.

    For every column in :col_lst: that holds text (object dtype or one of
    pandas' string dtypes) the values are converted to str and all commas are
    removed. Every listed column is then cast to float and finally to int.

    :df: DataFrame object; it is not modified, a converted copy is returned
    :col_lst: list of strings that are each the name of a column of :df:
    :return: DataFrame with the listed columns as integers
    """
    for col in col_lst:
        column = df[col]
        # `dtype == "object"` alone misses dedicated string dtypes, whose
        # comma-formatted values would then fail the float cast below.
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            df = df.astype({col: "str"})
            df[col] = df[col].str.replace(",", "", regex=False)
        # Go through float first so strings such as "12.0" can still be cast to int.
        df = df.astype({col: "float"}).astype({col: "int"})
    return df

#### Remove duplicate rows and cast votes to int

In [10]:
# Drop the placeholder geometry that does not correspond to a real precinct.
gdf = gdf[gdf.original_precinct_name != 'Voting Districts not defined']

# Rows without a vote count cannot be cast or summed.
df = df[df.votes.notnull()]

# Armstrong, Schuylkill, Carbon and Cumberland counties all have a 'total' precinct which has the
# countywide votes, but we don't want to include it; the other two labels are excluded the same way.
non_precinct_labels = ['Total', 'total', 'GOVERNMENT STUDY COMMISSIONER', 'SPRINGFIELD REFERENDUM']
df = df[~df.original_precinct_name.isin(non_precinct_labels)]

# Delaware had a ton of duplicates for some reason; remove them, then make `votes` an integer column.
df = remove_commas(df.drop_duplicates(), ['votes'])

#### Standardize Party Names

In [11]:
# Values of `candidate` that are excluded from the missing-party listing below.
skip = {
    'Blank Votes', 'No Vote', 'DEMOCRATIC', 'REPUBLICAN', 'GREEN', 'LIBERTARIAN',
    'YES', 'NO', 'Cast Votes', 'Over Votes', 'Under Votes',
    "Write-In", "Write In", "Write - In", 'Write In ', 'Write-ins', 'Write-in Votes',
    'WRITE-IN', 'Write In Votes', 'Write-In Votes', 'Write-Ins', 'Write-in',
}
party_is_missing = df.party.isna()
# na=True: rows with no candidate name are treated like '(W)' rows and left out.
marked_write_in = df.candidate.str.contains('(W)', na=True, regex=False)
in_skip_list = df.candidate.isin(skip)
more_than_one_vote = df.votes > 1

# candidates missing party affiliation
df[party_is_missing & ~marked_write_in & ~in_skip_list & more_than_one_vote].groupby('candidate').county_id.unique()

candidate
Bob Casey Jr.                           [Adams County]
Brent Ottaway                           [Adams County]
Dale R. Kerns Jr.                       [Adams County]
John Joyce                              [Adams County]
Ken V. Krawchuk / K. S. Smith           [Adams County]
Lou Barletta                            [Adams County]
Neale Gale                              [Adams County]
Paul Glover / J. Bowser Bostick         [Adams County]
Scott R. Wagner / Jeff Bartos           [Adams County]
TOM WOLF                           [Cumberland County]
Tom Wolf / John Fetterman               [Adams County]
Name: county_id, dtype: object

In [12]:
# Cumberland County's 'TOM WOLF' rows carry no party in the results (see the listing above).
df.loc[(df.county_id == 'Cumberland County') & (df.candidate == 'TOM WOLF'), 'party'] = 'dem'

# Party for each Adams County candidate that the listing above shows without one.
candidate_to_party_adams_county = {
    'Bob Casey Jr.':'dem',
    'Lou Barletta':'rep',
    'Neale Gale':'green',
    'Dale R. Kerns Jr.':'lib',
    'Tom Wolf / John Fetterman':'dem',
    'Scott R. Wagner / Jeff Bartos':'rep',
    'Paul Glover / J. Bowser Bostick':'green',
    'Ken V. Krawchuk / K. S. Smith':'lib',
    'Brent Ottaway':'dem',
    'John Joyce':'rep',
}
# Only fill rows whose party is actually missing: assigning to every Adams
# County row would replace an existing party with NaN for any candidate that
# is not in the mapping.
adams_missing_party = (df.county_id == 'Adams County') & df.party.isna()
df.loc[adams_missing_party, 'party'] = df.loc[adams_missing_party, 'candidate'].map(candidate_to_party_adams_county)

In [13]:
def clean_party_name(raw_party_name):
    """Map a raw party label to one of the five standard party names.

    Matching is case-insensitive and prefix-based. Anything that is not a
    `str` (e.g. NaN), any label starting with 'dem/', and any label with no
    recognised prefix is reported as 'Independent'.
    """
    if type(raw_party_name) is not str:
        return 'Independent'

    label = raw_party_name.lower()
    # NOTE(review): only 'dem/...' combinations are sent to 'Independent'; a label
    # such as 'rep/dem' falls through to 'Republican' — confirm this is intended.
    if label.startswith('dem/'):
        return 'Independent'

    prefix_to_party = (
        ('dem', 'Democrat'),
        ('rep', 'Republican'),
        ('gr', 'Green'),
        ('lib', 'Libertarian'),
        ('ln', 'Libertarian'),
    )
    for prefix, party in prefix_to_party:
        if label.startswith(prefix):
            return party
    return 'Independent'
# Standardize every party label, then list the distinct values that remain.
standardized_party = df['party'].apply(clean_party_name)
df['party'] = standardized_party
df['party'].unique()

array(['Democrat', 'Republican', 'Green', 'Libertarian', 'Independent'],
      dtype=object)

#### Fix ambiguous precinct names

In [14]:
# Somerset County reports two different precincts under the single name 'ADDISON'.
# The tables below give, per office and party, the vote count used to recognise
# which precinct a row belongs to.
addison_borough_dict = {
    'U.S. Senate': {'Democrat': 26, 'Republican': 49, 'Green': 1, 'Libertarian': 1},
    'Governor': {'Democrat': 22, 'Republican': 52, 'Green': 0, 'Libertarian': 3},
    'U.S. House': {'Democrat': 18, 'Republican': 57, 'Green': 0, 'Libertarian': 0},
}
addison_township_dict = {
    'U.S. Senate': {'Democrat': 110, 'Republican': 233, 'Green': 2, 'Libertarian': 5},
    'Governor': {'Democrat': 117, 'Republican': 230, 'Green': 2, 'Libertarian': 4},
    'U.S. House': {'Democrat': 87, 'Republican': 265, 'Green': 0, 'Libertarian': 0},
}

# Borough is resolved before Township. Rows are matched on the still-ambiguous
# name, so counts that are identical in both tables (the 0-vote U.S. House
# entries) are all claimed by the Borough pass.
for resolved_name, votes_by_office in (
    ('Addison Borough', addison_borough_dict),
    ('Addison Township', addison_township_dict),
):
    for office in ['U.S. Senate', 'Governor', 'U.S. House']:
        for party, votes in votes_by_office[office].items():
            is_matching_row = (
                (df['county_id'] == 'Somerset County')
                & (df['original_precinct_name'] == 'ADDISON')
                & (df['office'] == office)
                & (df['party'] == party)
                & (df['votes'] == votes)
            )
            df.loc[is_matching_row, 'original_precinct_name'] = resolved_name

In [15]:
# Show the names now carried by rows whose raw precinct was 'ADDISON'; a remaining 'ADDISON' would mean an unmatched row.
df.loc[df['precinct'] == 'ADDISON', 'original_precinct_name'].unique()

array(['Addison Township', 'Addison Borough'], dtype=object)

### Pivoting the election results

Now that the data has been cleaned and standardized, I will pivot the election results into their final format. Specifically, each precinct gets exactly one row, and each column holds the votes for a `(party, race)` pair, e.g. votes for the Democrat running for Congress. Each column is then renamed to conform to the 10-character maximum imposed by the shapefile format.

In [16]:
# One row per (county, precinct); one column per (party, office) holding the summed votes.
# aggfunc is given as the string 'sum': passing the NumPy callable np.sum is
# deprecated in recent pandas and slated to change behaviour.
prec_row_df = pd.pivot_table(
    df,
    index=['county_id', 'original_precinct_name'],
    columns=['party', 'office'],
    values=['votes'],
    aggfunc='sum',
)
# Flatten the ('votes', party, office) column tuples into 'votes <party> <office>' strings.
prec_row_df.columns = [' '.join(levels) for levels in prec_row_df.columns]
print(list(prec_row_df.columns))

['votes Democrat Governor', 'votes Democrat U.S. House', 'votes Democrat U.S. Senate', 'votes Green Governor', 'votes Green U.S. House', 'votes Green U.S. Senate', 'votes Independent Governor', 'votes Independent U.S. House', 'votes Independent U.S. Senate', 'votes Libertarian Governor', 'votes Libertarian U.S. House', 'votes Libertarian U.S. Senate', 'votes Republican Governor', 'votes Republican U.S. House', 'votes Republican U.S. Senate']


In [17]:
def rename(nm):
    """Shorten a flattened pivot column name to a shapefile-safe field name.

    `nm` has the form 'votes <party> <office>'. The result is 'G18' followed
    by the first three letters of the party and a three-letter office code,
    e.g. 'votes Democrat U.S. House' -> 'G18DemHOR'. Raises KeyError for an
    office other than Governor, U.S. House or U.S. Senate.
    """
    # maxsplit=2 keeps multi-word offices such as 'U.S. House' in one piece.
    _values_label, party, office = nm.split(maxsplit=2)
    party_code = party[:3]
    office_code = {
        'Governor': 'Gov',
        'U.S. House': 'HOR',
        'U.S. Senate': 'Sen',
    }[office]
    short_name = f"G18{party_code}{office_code}"
    # Shapefile field names are limited to 10 characters.
    assert len(short_name) <= 10
    return short_name
# Long pivot column name -> short shapefile field name.
long_column_names = list(prec_row_df.columns)
col_to_short_col = dict(zip(long_column_names, map(rename, long_column_names)))
col_to_short_col

{'votes Democrat Governor': 'G18DemGov',
 'votes Democrat U.S. House': 'G18DemHOR',
 'votes Democrat U.S. Senate': 'G18DemSen',
 'votes Green Governor': 'G18GreGov',
 'votes Green U.S. House': 'G18GreHOR',
 'votes Green U.S. Senate': 'G18GreSen',
 'votes Independent Governor': 'G18IndGov',
 'votes Independent U.S. House': 'G18IndHOR',
 'votes Independent U.S. Senate': 'G18IndSen',
 'votes Libertarian Governor': 'G18LibGov',
 'votes Libertarian U.S. House': 'G18LibHOR',
 'votes Libertarian U.S. Senate': 'G18LibSen',
 'votes Republican Governor': 'G18RepGov',
 'votes Republican U.S. House': 'G18RepHOR',
 'votes Republican U.S. Senate': 'G18RepSen'}

In [18]:
# Apply the short column names, treat missing (party, office) combinations as
# zero votes, and turn county_id / original_precinct_name back into ordinary columns.
prec_row_df = (
    prec_row_df
    .rename(columns=col_to_short_col)
    .fillna(0)
    .reset_index()
)
prec_row_df.head()

Unnamed: 0,county_id,original_precinct_name,G18DemGov,G18DemHOR,G18DemSen,G18GreGov,G18GreHOR,G18GreSen,G18IndGov,G18IndHOR,G18IndSen,G18LibGov,G18LibHOR,G18LibSen,G18RepGov,G18RepHOR,G18RepSen
0,Adams County,Abbottstown,120.0,108.0,120.0,2.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,5.0,185.0,201.0,183.0
1,Adams County,Arendtsville,160.0,132.0,151.0,2.0,0.0,3.0,0.0,1.0,0.0,4.0,0.0,6.0,172.0,204.0,178.0
2,Adams County,Bendersville,76.0,66.0,74.0,2.0,0.0,2.0,0.0,0.0,0.0,3.0,0.0,1.0,98.0,111.0,103.0
3,Adams County,Berwick,318.0,252.0,289.0,5.0,0.0,5.0,0.0,0.0,0.0,9.0,0.0,14.0,554.0,631.0,575.0
4,Adams County,Biglerville,168.0,133.0,152.0,2.0,0.0,7.0,1.0,0.0,1.0,5.0,0.0,3.0,215.0,261.0,231.0


In [19]:
# From here on `df` refers to the pivoted one-row-per-precinct table rather than the raw results.
# NOTE(review): `prec_row_df` already had its index reset in the previous cell, so this second
# reset_index() appears to add a redundant integer `index` column — confirm whether later cells rely on it.
df = prec_row_df.reset_index()

### General Modifications
It's beneficial to apply some modifications uniformly to all precincts. For example, it's good practice to make everything lowercase. This modification and others are made in `edit_precinct_name` - read its specification to learn more about how to use it to make modifications.

In [20]:
county_id_to_precinct_modification_dictionary_y = {
    'Adams County':
        {'gettysburg 1': 'gettysburg 1 1',
         'gettysburg 3': 'gettysburg 3 1',
         'huntington': 'huntington 1'},
    'Allegheny County':
        {'blawnox': 'blawnox 1',
         'chalfant 1': 'chalfant',
         'frazer': 'frazer 1',
         'heidelberg': 'heidelberg 1',
         'leetsdale': 'leetsdale 1',
         'mcdonald': 'mcdonald 5',
         'pennsbury vill': 'pennsbury village',
         'rosslyn farm': 'rosslyn farms',
         'thornburg': 'thornburg 1',
         'trafford': 'trafford 1'},
    'Berks County':
        {'bally': 'bally 1',
         '(4 cong)':'1 (4 cong)', 
         '(9 cong)':'1 (9 cong)',
         'exeter 5 1 (6 cong)':'exeter 5 (6 cong)',
         'exeter 5 2 (9 cong)':'exeter 5 (9 cong)',
         'bechtelsville': 'bechtelsville 1',
         'bernville': 'bernville 1',
         'centerport': 'centerport 1',
         'heidelberg': 'heidelberg 1',
         'jefferson': 'jefferson 1',
         'leesport': 'leesport 1',
         'lenhartsville': 'lenhartsville 1',
         'lyons': 'lyons 1',
         'marion': 'marion 1',
         'mohnton': 'mohnton 1',
         'mount penn': 'mount penn 1',
         'north heidelberg': 'north heidelberg 1',
         'ontelaunee': 'ontelaunee 1',
         'pike': 'pike 1',
         'reading 5 1': 'reading 5',
         'reading 7 1': 'reading 7',
         'reading 8 1': 'reading 8',
         'richmond': 'richmond 1',
         'robesonia': 'robesonia 1',
         'shoemakersville': 'shoemakersville 1',
         'upper bern': 'upper bern 1',
         'upper tulpehocken': 'upper tulpehocken 1',
         'wernersville': 'wernersville 1',
         'windsor': 'windsor 1',
         'womelsdorf': 'womelsdorf 1'},
    'Butler County':
        {'butler 6n': 'butler 6 north',
         'butler 6s': 'butler 6 south',
         'fields':'seven fields',
         'cherry vallery': 'cherry valley',
         'jefferson 1': 'jefferson i',
         'jefferson 2': 'jefferson ii'},
    'Cameron County':
        {'east emporium': 'emporium east',
         'east shippen': 'shippen east',
         'middle emporium': 'emporium middle',
         'west emporium': 'emporium west',
         'west shippen': 'shippen west'},
    'Carbon County':
        {'banks': 'banks 4',
         'ii franklin independent': 'franklin franklin ind',
         'lansford east': 'lansford 1',
         'mahoning packerton jamestown': 'mahoning packer jamestown',
         'palmerton east': 'palmerton eastern',
         'palmerton west': 'palmerton western',
         'penn forest northeast': 'penn forest north east',
         'penn forest northwest': 'penn forest north west',
         'penn forest southwest': 'penn forest south west',
         'fast penn south' : 'east penn south',
         'penn forest 2 northeast' : 'penn forest north east',
         'franklin twp�harrity' : 'franklin harrity',
         'i towamensing south 1' : 'towamensing south 1',},
    'Chester County':
        {'east downingtown east': 'downingtown east east',
         'east downingtown west':'downingtown east west',
         'north coventry 1': 'north coventry west 1',
         'north coventry 2': 'north coventry west 2',
         'london grove ch': 'london grove chatham',
         'west downingtown north': 'downingtown west north',
         'west downingtown south': 'downingtown west south',
         'west goshen m 1': 'west goshen middle 1',
         'west goshen m 2': 'west goshen middle 2',
         'east fallowfield': 'east fallowfield west',
         'east marlborough': 'east marlborough west',
         'west brandywine': 'west brandywine west'},
    'Clarion County': {'toby': 'toby east'},
    'Clearfield County':
        {'dubois city 1': 'dubois 1',
         'dubois city 2': 'dubois 2',
         'dubois city 3': 'dubois 3',
         'dubois city 4': 'dubois 4',
         'dubois city 5': 'dubois 5',
         'lawrence golden': 'lawrence golden rod hillsdale',
         'lawrence hyde': 'lawrence hyde city',
         'sandy falls creek 2': 'falls creek',
         'sandy sabula pct': 'sandy sabula'},
    'Clinton County':{'pine creek i': 'pine creek 1', 'pine creek ii': 'pine creek 2'},
    'Columbia County':
        {'briarcreek': 'briar creek',
         'briarcreek west': 'briar creek west',
         'briarcreek ne': 'briar creek northeast',
         'fishingcreek': 'fishing creek',
         'roaringcreek': 'roaring creek'},
    'Crawford County':
        {'vernon i': 'vernon 1',
         'vernon ii': 'vernon 2',
         'vernon iii': 'vernon 3',
         'west mead i': 'west mead 1',
         'west mead ii': 'west mead 2'},
    'Dauphin County':
        {'city 1 1': 'harrisburg 1 1',
         'city 1 2': 'harrisburg 1 2',
         'city 1 3': 'harrisburg 1 3',
         'city 10 1': 'harrisburg 10 1',
         'city 10 2': 'harrisburg 10 2',
         'city 10 3': 'harrisburg 10 3',
         'city 10 4': 'harrisburg 10 4',
         'city 11': 'harrisburg 11',
         'city 12': 'harrisburg 12',
         'city 13 1': 'harrisburg 13 1',
         'city 13 2': 'harrisburg 13 2',
         'city 13 3': 'harrisburg 13 3',
         'city 14': 'harrisburg 14',
         'city 15': 'harrisburg 15',
         'city 2 1': 'harrisburg 2 1',
         'city 2 2': 'harrisburg 2 2',
         'city 3': 'harrisburg 3',
         'city 4': 'harrisburg 4',
         'city 5': 'harrisburg 5',
         'city 6': 'harrisburg 6',
         'city 7 1': 'harrisburg 7 1',
         'city 7 2': 'harrisburg 7 2',
         'city 8': 'harrisburg 8',
         'city 9 1': 'harrisburg 9 1',
         'city 9 2': 'harrisburg 9 2',
         'city 9 3': 'harrisburg 9 3',
         'city 9 4': 'harrisburg 9 4',
         'city 9 5': 'harrisburg 9 5'},
    'Elk County':{
        'benezette':'benezett',
        'highland': 'highland highland',
        'north horton':'horton n',
        'south horton':'horton s',
         'james city': 'highland james city',
         'lamont': 'jones lamont',
         'north ridgway': 'ridgway north',
         'south ridgway': 'ridgway south',
         'wilcox': 'jones wilcox'},
    'Erie County':
        {'erie 2 1': 'erie 2 1 3',
         'erie 2 4': 'erie 2 4 9',
         'fairview 1 d': 'fairview 1',
         'fairview 2 d': 'fairview 2',
         'fairview 3 d': 'fairview 3',
         'fairview 4 d': 'fairview 4',
         'fairview 5 d': 'fairview 5',
         'harborcreek 1s': 'harborcreek 1',
         'harborcreek 2n': 'harborcreek 2',
         'harborcreek 3r': 'harborcreek 3',
         'harborcreek 4t': 'harborcreek 4',
         'harborcreek 5t': 'harborcreek 5',
         'harborcreek 6t': 'harborcreek 6',
         'harborcreek 7t': 'harborcreek 7',
         'union city 1 war': 'union city 1',
         'union city 2 war': 'union city 2',
         'union townsip': 'union',
         'wesleyville 1':'wesleyville east',
         'wesleyville 2':'wesleyville west',
         'waterford 1':'waterford east',
         'waterford 2':'waterford west',},
    'Forest County': {'kinglsey': 'kingsley'},
    'Franklin County':{'fannett sulphur spr': 'fannett sulphur springs', 'west end shippensburg': 'shippensburg'},
    'Greene County':
        {'cumberlandx1': 'cumberland 1',
         'cumberlandx2': 'cumberland 2',
         'cumberlandx4': 'cumberland 4',
         'cumberlandxnemacolin': 'cumberland nemacolin',
         'dunkardxbobtown': 'dunkard bobtown',
         'dunkardxdilliner': 'dunkard dilliner',
         'dunkardxlower': 'dunkard lower',
         'dunkardxupper': 'dunkard upper',
         'franklinxeast': 'franklin east',
         'franklinxnorth': 'franklin north',
         'franklinxsouth': 'franklin south',
         'franklinxwest': 'franklin west',
         'jeffersonx1': 'jefferson 1',
         'jeffersonx2': 'jefferson 2',
         'jeffersonx3': 'jefferson 3',
         'jeffersonx4': 'jefferson 4',
         'monongahelax1': 'monongahela 1',
         'monongahelax2': 'monongahela 2',
         'morganxchart tgrdn': 'morgan chart tgrdn',
         'morganxlippencott': 'morgan lippencott',
         'morganxmather': 'morgan mather',
         'waynesburgw1': 'waynesburg 1',
         'waynesburgw2': 'waynesburg 2',
         'waynesburgw3': 'waynesburg 3',
         'waynexeast': 'wayne east',
         'waynexwest': 'wayne west'},
    'Jefferson County':
        {'brockway': 'brockway 1',
         'oliver': 'oliver 1',
         'perry': 'perry 1',
         'reynoldsville': 'reynoldsville 1',
         'washington': 'washington east',
         'winslow': 'winslow 1',
         'young': 'young north 1'},
    'Juniata County':
        {'tuscarora tusc': 'tuscarora'},
    'Lackawanna County':
        {'carbondale ne': 'carbondale northeast',
         'carbondale nw': 'carbondale northwest',
         'carbondale so': 'carbondale south',
         'scranton 7 1': 'scranton 7',
         'springbrook': 'spring brook'},
    'Lawrence County':
        {'new castle 6 1': 'new castle 6',
         'new castle 7 1': 'new castle 7',
         'new castle 8 1': 'new castle 8'},
    'Lebanon County':
        {'bethel fredericksburg': 'bethel fred',
         'heidelberg kleinfeltersvl': 'heidelberg klein',
         'heidelberg schaefferstown': 'heidelberg sch',
         'south londonderry campbelltwn': 'south londonderry campbelltown',
         'union green point': 'union green pt'},
    'Lehigh County':
        {'allentown 1 1': 'allentown 1',
         'allentown 2 1': 'allentown 2',
         'allentown 3 1': 'allentown 3',
         'allentown 4 1': 'allentown 4',
         'allentown 5 1': 'allentown 5',
         'allentown 9 1': 'allentown 9',
         'bethlehem 10 1': 'bethlehem 10',
         'bethlehem 11 1': 'bethlehem 11'},
    'McKean County':
        {'bradford city 1': 'bradford 1',
         'bradford city 2': 'bradford 2',
         'bradford city 3 1': 'bradford 3 1',
         'bradford city 3 2': 'bradford 3 2',
         'bradford city 4': 'bradford 4',
         'bradford city 5': 'bradford 5',
         'bradford city 6 1': 'bradford 6 1',
         'bradford city 6 2': 'bradford 6 2',
         'bradford twnship 3': 'bradford 3',
         'foster 4 distr': 'foster 4'},
    'Mercer County':
        {'hermitage n1': 'hermitage nw 1',
         'hermitage n2': 'hermitage nw 2',
         'hermitage n3': 'hermitage nw 3',
         'hermitage n4': 'hermitage nw 4',
         'hermitage s1': 'hermitage sw 1',
         'hermitage s2': 'hermitage sw 2',
         'hermitage s3': 'hermitage sw 3'},
    'Monroe County':
        {'hamilton northern': 'hamilton north',
         'hamilton southern': 'hamilton south',
         'smithfield 4':'smithfield 1',
         'middle smithfield eastern': 'middle smithfield east',
         'middle smithfield western': 'middle smithfield west',
         'tobyhanna eastern': 'tobyhanna east',
         'tobyhanna western': 'tobyhanna west'},
    'Montgomery County':
        {'northwales 1': 'north wales 1',
         'northwales 2': 'north wales 2',
         'northwales 3': 'north wales 3',
         'plymouth 2 3': 'plymouth 2 3a',
         'plymouth 2 3 7': 'plymouth 2 3b',
         'towamencin 1 4': 'towamencin 2 1',
         'upper dublin 3 1': 'upper dublin 3 1a',
         'upper merion bel 1': 'upper merion belmont 1',
         'upper merion bel 2': 'upper merion belmont 2',
         'upper merion bel 3': 'upper merion belmont 3',
         'upper merion bel 4': 'upper merion belmont 4',
         'upper merion bel 5': 'upper merion belmont 5',
         'upper merion cand 1': 'upper merion candlebrook 1',
         'upper merion cand 2': 'upper merion candlebrook 2',
         'upper providence m c': 'upper providence mont clare',
         'west conshocken': 'west conshohocken'},
    'Montour County':
    {'mahoning i': 'mahoning 1',
     'mahoning ii': 'mahoning 2',
     'washingtonville bor': 'washingtonville'},
    'Northumberland County':
        {'chillisquaque': 'east chillisquaque',
         'shamokin city 1': 'shamokin 1',
         'shamokin city 2': 'shamokin 2 1',
         'shamokin city 3': 'shamokin 3',
         'shamokin city 4': 'shamokin 4 1',
         'shamokin city 5': 'shamokin 5 1',
         'shamokin city 6': 'shamokin 6',
         'shamokin city 7': 'shamokin 7',
         'shamokin city 8': 'shamokin 8',
         'sunbury city 1': 'sunbury 1',
         'sunbury city 2': 'sunbury 2',
         'sunbury city 3': 'sunbury 3',
         'sunbury city 4': 'sunbury 4',
         'sunbury city 5': 'sunbury 5',
         'sunbury city 6': 'sunbury 6',
         'upper mahanoy (12 district)': 'upper mahanoy'},
    'Potter County':
        {'eulalia': 'eulalia eulalia',
         'north sharon': 'sharon north',
         'south sharon': 'sharon south'},
    'Schuylkill County': {'norwegian marlin': 'norwegian mar lin', 'rush elixir': 'rush elixer'},
    'Snyder County':
        {'monroe no1': 'monroe 1',
         'monroe no2': 'monroe 2',
         'penn no 1': 'penn 1',
         'penn no 2': 'penn 2',
         'selinsgrove no1': 'selinsgrove 1',
         'selinsgrove no2': 'selinsgrove 2',
         'selinsgrove no3': 'selinsgrove 3'},
    'Somerset County':
        {'brothers valley': 'brothersvalley',
         'conemaughx1': 'conemaugh 1',
         'conemaughx2': 'conemaugh 2',
         'conemaughx3': 'conemaugh 3',
         'conemaughx4': 'conemaugh 4',
         'conemaughx5': 'conemaugh 5',
         'jennerx1': 'jenner 1',
         'jennerx2': 'jenner 2',
         'jennerx3': 'jenner 3',
         'paintx1': 'paint 1',
         'paintx2': 'paint 2',
         'paintx3': 'paint 3',
         'shadex1': 'shade 1',
         'shadex2': 'shade 2',
         'shadex3': 'shade 3',
         'somersetx1': 'somerset 1',
         'somersetx2': 'somerset 2',
         'somersetx3': 'somerset 3',
         'somersetx4': 'somerset 4',
         'somersetxeast': 'somerset east',
         'somersetxnorth west': 'somerset north west',
         'somersetxsouth west': 'somerset south west',
         'windberx1': 'windber 1',
         'windberx2': 'windber 2',
         'windberx3': 'windber 3',
         'windberx4': 'windber 4'},
    'Sullivan County':{'bernice': 'cherry bernice', 'lopez': 'colley lopez'},
    'Susquehanna County':
        {'forest city': 'forest city 1',
         'montrose 1w': 'montrose 1',
         'montrose 2w': 'montrose 2',
         'susquehanna': 'susquehanna depot',
         'union dale': 'uniondale'},
    'Venango County':
        {'oilcreek': 'oil creek', 'sugarcreek 5': 'sugarcreek 4'},
    'Warren County':
        {'warren city central': 'warren central',
         'warren city east': 'warren east',
         'warren city north': 'warren north',
         'warren city south': 'warren south',
         'warren city south east': 'warren south east',
         'warren city west': 'warren west'},
    'Washington County':
        {'bentleyville': 'bentleyville 1',
         'east bethlehem 1w': 'east bethlehem 1',
         'east bethlehem 2w': 'east bethlehem 2',
         'east bethlehem 3w': 'east bethlehem 3',
         'east bethlehem 4w': 'east bethlehem 4',
         'east finley': 'east finley 1',
         'east washington': 'east washington 1',
         'mc donald': 'mcdonald',
         'monongahela 1w': 'monongahela 1',
         'north bethlehem': 'north bethlehem 1',
         'north charleroi': 'north charleroi 1',
         'washington 4 west': 'washington 4',
         'petersxd 3':'peters 3'},
    'Wayne County':{'so canaan': 'south canaan'},
    'York County':
        {'york 1': 'york city 1',
         'york 11': 'york city 11',
         'york 12 1': 'york city 12 1',
         'york 12 2': 'york city 12 2',
         'york 12 3': 'york city 12 3',
         'york 12 4': 'york city 12 4',
         'york 13': 'york city 13',
         'york 14 1': 'york city 14 1',
         'york 14 2': 'york city 14 2',
         'york 14 3': 'york city 14 3',
         'york 15': 'york city 15',
         'york 5': 'york city 5',
         'york 6': 'york city 6',
         'york 7': 'york city 7',
         'york 8': 'york city 8',
         'york 9 1': 'york city 9 1',
         'york 9 2': 'york city 9 2'},
    'York County':
        {'york city 1': 'york 1',
         'york city 11': 'york 11',
         'york city 12 1': 'york 12 1',
         'york city 12 2': 'york 12 2',
         'york city 12 3': 'york 12 3',
         'york city 12 4': 'york 12 4',
         'york city 13': 'york 13',
         'york city 14 1': 'york 14 1',
         'york city 14 2': 'york 14 2',
         'york city 14 3': 'york 14 3',
         'york city 15': 'york 15',
         'york city 5': 'york 5',
         'york city 6': 'york 6',
         'york city 7': 'york 7',
         'york city 8': 'york 8',
         'york city 9 1': 'york 9 1',
         'york city 9 2': 'york 9 2'},
    'Lancaster County':
        {'cocalico east reamstown': 'east cocalico reamstown',
         'cocalico east smokestown': 'east cocalico smokestown',
         'cocalico east stevens': 'east cocalico stevens',
         'cocalico east swartzville': 'east cocalico swartzville',
         'cocalico west reinholds': 'west cocalico reinholds',
         'cocalico west schoeneck': 'west cocalico schoeneck',
         'donegal east maytown': 'east donegal maytown',
         'donegal east maytown west': 'east donegal maytown west',
         'donegal east springville': 'east donegal springville',
         'donegal west 1': 'west donegal 1',
         'donegal west 2': 'west donegal 2',
         'donegal west 3': 'west donegal 3',
         'donegal west 4': 'west donegal 4',
         'drumore east': 'east drumore',
         'earl east blue ball': 'east earl blue ball',
         'earl east terre hill': 'east earl terre hill',
         'east hempfield landisville': 'east hempfield landisville west',
         'lancaster 7 8 (cv)':'lancaster 7 8',
         'lancaster 7 8 (ls)':'lancaster 7 8',
         'manheim 7 41': 'manheim 7 a (hd 41)',
         'manheim 7 96': 'manheim 7 b (hd 96)',
         'west willow street': 'west willow st'},
    'Luzerne County':
        {'bear creek vlg': 'bear creek village', 'butler 2': 'butler 2 (upper lehigh)'},
    'Northampton County':
        {'bethlehem 1 2 (138)': 'bethlehem 1 2',
         'bethlehem 2 1 (135)': 'bethlehem 2 1',
         'bushkill bushkill center': 'bushkill centre',
         'bushkill cherryhill': 'bushkill cherry hill',
         'forks east 1': 'forks eastern 1',
         'forks east 2': 'forks eastern 2',
         'forks west 1': 'forks western 1',
         'forks west 2': 'forks western 2',
         'lehigh northwestern': 'lehigh northwest',
         'lower mount bethel lower ind': 'lower mount bethel independent',
         'lower saucon hellertown 1': 'lower saucon 1',
         'lower saucon leithsville 2': 'lower saucon 2',
         'lower saucon lower saucon 3': 'lower saucon 3',
         'lower saucon seidersville 4': 'lower saucon 4',
         'lower saucon shimersville 5': 'lower saucon 5',
         'lower saucon wassergass 6': 'lower saucon 6',
         'moore east': 'moore eastern',
         'palmer east': 'palmer eastern',
         'palmer upper east': 'palmer upper eastern',
         'palmer upper west': 'palmer upper western',
         'palmer upper west naz ind':'palmer upper western',
         'palmer west 1': 'palmer western 1',
         'palmer west 2': 'palmer western 2',
         'plainfield plf church': 'plainfield plainfield church',
         'williams east': 'williams eastern',
         'williams west': 'williams western'},
    'Fayette County':
        {'bellevernon': 'belle vernon',
         'bullskind1': 'bullskin 3',
         'bullskind2': 'bullskin 1',
         'bullskind3': 'bullskin 2',
         'cvilletwp': 'connellsville',
         'cvillew1': 'connellsville 1',
         'cvillew2': 'connellsville 2',
         'cvillew3': 'connellsville 3',
         'cvillew4': 'connellsville 4',
         'dawsonboro': 'dawson',
         'dunbarboro': 'dunbar',
         'dunbartwp1': 'dunbar 1',
         'dunbartwp2': 'dunbar 2',
         'fayctyboro': 'fayette city',
         'fchance': 'fairchance',
         'georgesd1': 'georges 1',
         'georgesd2': 'georges 2',
         'georgesd3': 'georges 3',
         'germand1': 'german 1',
         'germand2': 'german 2',
         'germand3': 'german 3',
         'germand4': 'german 4',
         'henryclay': 'henry clay',
         'jefftwp': 'jefferson',
         'ltyronetwp': 'lower tyrone',
         'luzerned1': 'luzerne 1',
         'luzerned2': 'luzerne 2',
         'luzerned3': 'luzerne 3',
         'luzerned4': 'luzerne 4',
         'markleysbrg': 'markleysburg',
         'menallen1': 'menallen 1',
         'menallen2': 'menallen 2',
         'menallen3': 'menallen 3',
         'newellboro': 'newell',
         'nunion1': 'north union 1',
         'nunion2': 'north union 2',
         'nunion3': 'north union 3',
         'nunion4': 'north union 4',
         'nunion5': 'north union 5',
         'perrytwp': 'perry 1',
         'ptmarion': 'point marion',
         'redstone1': 'redstone 1',
         'redstone2': 'redstone 2',
         'redstone3': 'redstone 3',
         'redstone4': 'redstone 4',
         'scvilleboro': 'south connellsville',
         'spfield1': 'springfield 1',
         'spfield2': 'springfield 2',
         'sphill1': 'springhill 1',
         'sphill2': 'springhill 2',
         'sunion1': 'south union 1',
         'sunion2': 'south union 2',
         'sunion3': 'south union 3',
         'utownw1': 'uniontown 1',
         'utownw2': 'uniontown 2',
         'utownw3': 'uniontown 3',
         'utownw4': 'uniontown 4',
         'utownw5': 'uniontown 5',
         'utownw6': 'uniontown 6',
         'utownw7': 'uniontown 7',
         'utyronetwp': 'upper tyrone',
         'vbiltboro': 'vanderbilt',
         'bvillew1':'brownsville',
         'bvillew2':'brownsville',
         'bvillew3':'brownsville',
         'franklind1':'franklin',
         'franklind2':'franklin',
         },
    'Westmoreland County':
        {'donegal four mile r': 'donegal four mile run',
         'donegal indian cree': 'donegal indian creek',
         'east huntingdon 1': 'east huntingdon bessemer ed 1',
         'east huntingdon 2': 'east huntingdon bessemer ed 2',
         'east huntingdon ruffs': 'east huntingdon ruffsdale',
         'east huntingdon stoner': 'east huntingdon stoners',
         'east huntingdon strohm': 'east huntingdon strohms',
         'fairfield north fairfie': 'fairfield north fairfield',
         'fairfield south fairfie': 'fairfield south fairfield',
         'hempfield east adamsbu': 'hempfield east adamsburg',
         'hempfield 1': 'hempfield foxhill',
         'hempfield 2': 'hempfield alwine',
         'hempfield 3': 'hempfield maplewood',
         'hempfield 4': 'hempfield carbon',
         'hempfield 5': 'hempfield wendel herm',
         'hempfield 6': 'hempfield luxor',
         'hempfield 7': 'hempfield hannastown',
         'hempfield 8': 'hempfield bovard',
         'hempfield fort alle': 'hempfield fort allen',
         'hempfield grapevill': 'hempfield grapeville',
         'hempfield lincoln east': 'hempfield lincoln heights',
         'hempfield lincoln west': 'hempfield lincoln heights west',
         'hempfield middletow': 'hempfield middletown',
         'hempfield universit': 'hempfield university',
         'hempfield weavers o': 'hempfield weavers old stand',
         'hempfield haydenvil':'hempfield haydenville', 
         'hempfield new stant':'hempfield new stanton', 
         'hempfield west hempfie':'west hempfield hempfield',
         'ligonier laughlinst': 'ligonier laughlinstown',
         'mt pleasant bridgep': 'mt pleasant bridgeport',
         'mt pleasant hecla': 'mt pleasant heccla',
         'mt pleasant laurel': 'mt pleasant laurel run',
         'mt pleasant pleasan': 'mt pleasant pleasant valley',
         'mt pleasant ridgevi': 'mt pleasant ridgeview',
         'mt pleasant spring': 'mt pleasant spring garden',
         'mt pleasant westmor': 'mt pleasant westmoreland',
         'murrysville east murrysvil': 'murrysville east murrysville',
         'murrysville south murrysvil': 'murrysville south murrysville',
         'murrysville west murrysvil': 'murrysville west murrysville',
         'rostraver collinsbu': 'rostraver collinsburg',
         'rostraver cross roa': 'rostraver cross roads',
         'sewickley east hermini': 'sewickley east herminie',
         'sewickley west hermini': 'sewickley west herminie',
         'south huntingdon jacobs': 'south huntingdon jacobs creek',
         'south huntingdon minera': 'south huntingdon mineral',
         'south huntingdon port r': 'south huntingdon port royal',
         'south huntingdon south hunt': 'south huntingdon south huntingdon',
         'sw greensburg 1': 'southwest greensburg 1',
         'sw greensburg 2': 'southwest greensburg 2',
         'unity pleasant unit': 'unity pleasant unity',
         'washington north washin': 'washington north washington',
         'washington oakland': 'washington oakland x roads',},
    'Armstrong County':
        {'cowanshannock east': 'cowanshannock eastern',
         'cowanshannock sag': 'cowanshannock sagamore',
         'cowanshannock west': 'cowanshannock western',
         'east franklin east': 'east franklin eastern',
         'east franklin north': 'east franklin northern',
         'east franklin west': 'east franklin western',
         'ford city 1 north': 'ford city northern 1',
         'ford city 1 south': 'ford city southern 1',
         'ford city 2 north': 'ford city northern 2',
         'ford city 2 south': 'ford city southern 2',
         'gilpin 2': 'gilpin 2 ed schenley',
         'kiskiminetas ohn': 'kiskiminetas orchhills north',
         'kiskiminetas ohs': 'kiskiminetas orchhills south',
         'manor north': 'manor northern',
         'manor south 1': 'manor southern 1',
         'manor south 2': 'manor southern 2',
         'north buffalo east': 'north buffalo eastern',
         'north buffalo west': 'north buffalo western',
         'parks n vandergrift': 'parks north vandergrift',
         'parks west': 'parks western',
         'south buffalo east': 'south buffalo eastern',
         'south buffalo west': 'south buffalo western'},
    'Delaware County':
        {'chadds ford ne': 'chadds ford northeast',
         'chadds ford sw': 'chadds ford southeast',
         'east lansdowne e': 'east lansdowne eastern',
         'east lansdowne w': 'east lansdowne western',
         'lwr chichester 1p': 'lower chichester 1',
         'lwr chichester 2p': 'lower chichester 2',
         'radnor 6w 1p': 'radnor 6 1 hd 165',
         'springfield 2w 3p emerg': 'springfield 2 3',
         'upper darby 6d 11p emerg': 'upper darby 6 11',
         'upper darby 6d 8p emerg': 'upper darby 6 8',
         'CHESTER TWP PCT 03': 'CHESTER CITY 3W',
         'CHESTER TWP PCT 04': 'CHESTER CITY 4W',
         'CHESTER WD 03': 'CHESTER TWP 3P',
         'CHESTER WD 04': 'CHESTER TWP 4P',
         'DARBY TWP WD 03 PCT 01': 'DARBY BORO 3W 1P',
         'DARBY TWP WD 03 PCT 02': 'DARBY BORO 3W 2P',
         'DARBY WD 03 PCT 01': 'DARBY TWP 3W 1P',
         'DARBY WD 03 PCT 02': 'DARBY TWP 3W 2P',
        },
    'Mifflin County':
        {'brown church hill': 'brown', 'brown reedsville big va': 'brown'},
}

# Per-county overrides keyed by VTD code (the shapefile's `VTDST` column); the values are
# precinct names written exactly as they should appear after matching.
#   * get_name_x: a shapefile row whose VTDST is a key here takes the mapped value as its
#     edited precinct name.
#   * get_name_y: a results row whose *original* precinct name equals one of the values for
#     its county keeps that original name instead of its normalised name.
# Together these pin both sides to the same literal string and bypass edit_precinct_name.
# NOTE(review): most entries look like township/borough (or city/township) pairs that would
# collapse to the same normalised name once 'twp'/'boro' are dropped -- confirm when adding
# new entries.
county_id_to_vtd_to_oe_prec_name = {
    'Allegheny County': {
        '000130' : 'BALDWIN BR DIST 1',
        '000140' : 'BALDWIN BR DIST 2',
        '000151' :'BALDWIN TP DIST 1',
        '000161' : 'BALDWIN TP DIST 2',},
    'Beaver County' : {
        '000630': '1401 DARLINGTON TWP', 
        '000640': '1301 DARLINGTON BORO',
        '001430': '4801 ROCHESTER TWP 1', 
        '001445': '4802 ROCHESTER TWP 2', 
        '001460': '4701 ROCHESTER BORO 1', 
        '001470': '4702 ROCHESTER BORO 2',
        # Unmatched precincts
        '001320': '4004 NORTH SEWICKLEY TWP 4',
        '001280': '3903 NEW SEWICKLEY TWP 3',
        '001260' : '3901 NEW SEWICKLEY TWP 1',
        '001270' : '3902 NEW SEWICKLEY TWP 2',
    },
    # Two VTD codes map to the same results name here, so both shapes get one shared name.
    'Berks County' : {
        '000239' : 'Cumru Twp. 01', 
        '000241' : 'Cumru Twp. 01',
        '000529': 'Laureldale Boro 01',
        '000531' : 'Laureldale Boro 01',
    },
    'Blair County' : {
        '000870': 'TYRONE TWP 1', 
        '000880': 'TYRONE TWP 2',
        '000890': 'TYRONE BORO 1',
        '000900': 'TYRONE BORO 2'},
    'Bradford County' : {
        '000120': 'BURLINGTON BOROUGH',
        '000310': 'ROME BOROUGH',
        '000230': 'MONROE BOROUGH',
        '000080': 'ATHENS BOROUGH 2ND',
        '000110': 'BURLINGTON TOWNSHIP', 
        '000070': 'ATHENS BOROUGH 1ST',
        '000600': 'WYALUSING BOROUGH',
        '000220': 'MONROE TOWNSHIP',
        '000300': 'ROME TOWNSHIP',
        '000590': 'WYALUSING TOWNSHIP',
        '000130': 'CANTON TOWNSHIP',
        '000510': 'TROY BOROUGH',
        '000140': 'CANTON BOROUGH',
        '000500': 'TROY TOWNSHIP',
        '000050': 'ATHENS TOWNSHIP 1ST',
        '000060': 'ATHENS TOWNSHIP 2ND'},
    'Bucks County' : {
        '001920': 'Newtown Boro 2nd',
        '001850': 'Newtown Twp 1',
        '001860': 'Newtown Twp 2',
        '001910': 'Newtown Boro 1st'},
    'Butler County' : {
        '000670':'0052 SLIPPERY ROCK TOWNSHIP',
        '000680':'0077 SLIPPERY ROCK BOROUGH',
        '000420':'0036 FAIRVIEW TOWNSHIP',
        '000430':'0067 FAIRVIEW BOROUGH', 
        '000330':'0025 CONNOQUENESSING TOWNSHIP',
        '000340':'0063 CONNOQUENESSING BOROUGH',},
    'Cambria County' : {
        '000636': 'Gallitzin Boro',
        '000615': 'Gallitzin Twp',},
    'Clarion County' : {
        '000070': '5.1 Clarion Borough First',
        '000080': '5.2 Clarion Borough Second',
        '000050': '6.1 Clarion Township First',
        '000060': '6.2 Clarion Township Second',
        '000210': '13.1 Knox Borough',
        '000200': '14.1 Knox Township'},
    'Clearfield County' : {
        '000120': '0002 Burnside Borough', 
        '000110': '0036 Burnside Township'},
    'Clinton County' : {
        '000050': '0004 BEECH CREEK BOROUGH',
        '000040': '0005 BEECH CREEK TOWNSHIP'},
    'Columbia County' : {
        '000020': 'Benton Township',
        '000030': 'Benton Borough',
        '000260': 'Catawissa Township',
        '000275': 'Catawissa Borough'},
    'Crawford County' : {
        '000670': 'Woodcock Boro',
        '000580': 'Venango Boro',
        '000570': 'Venango Twp',
        '000660': 'Woodcock Twp'},
    'Erie County' : {
        '001320': 'NORTH EAST TOWNSHIP 1ST',
        '001330': 'NORTH EAST TOWNSHIP 2ND',
        '001350': 'NORTH EAST BORO 2ND WAR',
        '001340': 'NORTH EAST BORO 1ST WAR'},
    'Forest County' : {
        '000100': '0008 TIONESTA TWP',
        '000110': '0009 TIONESTA BORO'},
    'Lancaster County' : {
        '000580': 'Lancaster City - 1st Ward',
        '000980': 'Lancaster Twp - 1st Dist',
        '001140': 'Manheim Twp - 1st Dist',
        '001155': 'Manheim Twp - 2nd Dist',
        '001340': 'Manheim Boro - 1st Ward',
        '001350': 'Manheim Boro - 2nd Ward',
        '000610': 'Lancaster City - 3rd Ward',
    },
    'Luzerne County' : {
        '001285': 'KINGSTON TWP D 01',
        '001295': 'KINGSTON TWP D 02',
        '001305': 'KINGSTON TWP D 03',
        '001310': 'KINGSTON BORO W 01',
        '001320': 'KINGSTON BORO W 02',
        '001330': 'KINGSTON BORO W 03',
        '001740': 'NESCOPECK TWP',
        '001750': 'NESCOPECK BORO',
        '001852': 'PITTSTON CITY W 01',
        '001862': 'PITTSTON CITY W 02',
        '001995': 'PITTSTON TWP D 01',
        '002005': 'PITTSTON TWP D 02',
        '002122': 'PLYMOUTH BORO W 01',
        '002142': 'PLYMOUTH TWP',
        '000230':'DALLAS BORO W 01',
        '000240':'DALLAS BORO W 02',
        '000165':'CONYNGHAM TWP',
        '002635': 'WILKES BARRE CITY W 01',
        '002655': 'WILKES BARRE CITY W 02',
        '003075': 'WILKES BARRE TWP W 01',
        '003095': 'WILKES BARRE TWP W 02'},
    'McKean County' : {
        '000020': '0002 BRADFORD CITY-1ST WARD',
        '000030': '0003 BRADFORD CITY-2ND WARD',
        '000110': '0010 BRADFORD TWNSHIP-1ST DIS',
        '000120': '0011 BRADFORD TWNSHIP-2ND DIST',
        '000160': '0015 ELDRED BOROUGH',
        '000170': '0016 ELDRED TOWNSHIP'},
    'Mercer County' : {'000710': '3301 SANDY LAKE BORO', '000700': '3401 SANDY LAKE TWP'},
    'Mifflin County' : {
        '000010':'0001 ARMAGH - EAST',
        '000020':'0002 ARMAGH - WEST',
        '000160':'0016 LEWISTOWN - WEST',
        '000180':'0017 LEWISTOWN - CENTRAL',
        '000210':'0018 LEWISTOWN - NORTH',
        '000230':'0019 LEWISTOWN - SOUTH',},
    'Perry County' : {'000140': '0012 LIVERPOOL BOROUGH', '000130': '0013 LIVERPOOL TOWNSHIP'},
    'Pike County' : {'000090': 'Milford Boro', '000100': 'Milford Twp'},
    'Potter County' : {'000190': 'Oswayo Boro', '000180': 'Oswayo Twp', '000310': 'Ulysses Twp', '000320': 'Ulysses Boro'},
    'Schuylkill County' : {'001580': 'Tremont Twp.', '001595': 'Tremont', '000900': 'Pine Grove Twp. Pct. 2', '000890': 'Pine Grove Twp. Pct. 1'},
    'Sullivan County' : {'000140': 'Laporte Borough', '000130': 'Laporte Township'},
    'Susquehanna County' : {'000420': 'Thompson Boro', '000410': 'Thompson Twp', '000340': 'Oakland Twp', '000350': 'Oakland Boro', '000160': 'Great Bend Boro', '000330': 'New Milford Boro', '000320': 'New Milford Twp', '000150': 'Great Bend Twp'},
    'Tioga County' : {
        '000240': 'Liberty Boro',
        '000230': 'Liberty Twp',
        '000460': 'Westfield Boro',
        '000445': 'Westfield Twp',
        '000370': 'Tioga Twp',
        '000380': 'Tioga Boro',
        '000410': 'Ward'},
    'Warren County' : {
        '000255': 'SUGAR GROVE Twp',
        '000290': 'SUGAR GROVE Boro'},
    'Wyoming County' : {
        '000230': 'Tunkhannock Township #1',
        '000240': 'Tunkhannock Township #2',
        '000130': 'Meshoppen Township',
        '000140': 'Meshoppen Borough',
        '000170': 'Nicholson Borough at Large',
        '000160': 'Nicholson Township',
        '000250': 'Tunkhannock Borough Ward 1',
        '000260': 'Tunkhannock Borough Ward 2'},
    'Bedford County' : {
        '000150': 'Hopewell Township',
        '000160': 'Hopewell Borough',
        '000410': 'Woodbury Township',
        '000420': 'Woodbury Borough',
    },
    'Centre County' : {
        '000290' : 'Howard Township',
        '000300' : 'Howard Borough'
    },
    'Somerset County' : {
        '000010':'Addison Township',
        '000020':'Addison Borough',
    }
}

In [21]:
# Default normalisation tables used by edit_precinct_name. All entries are lower case
# because precinct names are lower-cased before any of them is applied.

# Substrings that are deleted wherever they occur in the name.
default_remove_lst = ['#', '.', "'", '`', ',', 'twp-', 'w-']

# Substring -> substring rewrites, applied after the deletions above. Separators become
# spaces; the remaining entries reorder a few compound township names.
default_target_to_replacement_substring = {
    '~': ' ',
    '-': ' ',
    '/': ' ',
    'upper leacock': 'leacock upper',
    'hempfield east': 'east hempfield',
    'hempfield west': 'west hempfield',
    'west earl': 'earl west',
}

# Whole words that are dropped from the name (compared after word replacement).
default_stopping_words = [
    'pct', 'di', 'dis', 'division', 'borough', 'township', 'district', 'dist', 'br',
    'boro', 'ward', 'wrd', 'park', 'prk', 'tp', 'twp', 'wd', 'pk', 'precinct', 'p', 'd',
    'dst',
]

# Whole word -> replacement word (number words, ordinals and common abbreviations).
default_target_to_replacement_word = {
    'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5', 'six': '6', 'seven': '7',
    'sq': 'square', 'mid': 'middle', 'johnstown': 'johns',
    'first': '1', 'second': '2', 'third': '3', 'fourth': '4', 'fifth': '5', 'sixth': '6',
    'seventh': '7', 'eighth': '8',
    'seventeenth': '17', 'eighteenth': '18', 'nineteenth': '19',
    # 'twenteith' is the historical (misspelled) key and is kept in case the source data
    # contains that spelling; 'twentieth' is the correctly spelled word it stood for.
    'twenteith': '20', 'twentieth': '20',
    'spg': 'spring', 'l': 'lower', 'mt': 'mount', 'casl': 'castle',
    'n': 'north', 's': 'south', 'e': 'east', 'w': 'west',
    'hts': 'heights', 'ht': 'heights', 'hl': 'hills', 'springdal': 'springdale', 'up': 'upper',
}

# Helper Functions
def remove_pre_num_char(prec_name, target):
    '''
    Strips a marker such as ' w' or ' twp' when it sits directly in front of a number.

    Every occurrence of `target` that is immediately followed by a numeric character is
    reduced to its first character (e.g. with target ' w', 'ward w1' -> 'ward 1'; the
    leading space of the target is kept, the rest is removed).

    Parameters:
        prec_name (str): precinct name to scan.
        target (str): marker to look for; its first character is preserved.

    Returns:
        prec_name (str): the name with all qualifying markers removed. An empty `target`
            leaves the name unchanged.
    '''
    window_size = len(target)
    if window_size == 0:
        # Nothing to strip; also prevents the scan below from growing the string forever.
        return prec_name
    idx = window_size
    # The length is re-read on every pass because each removal shortens the string;
    # iterating up to the *original* length could index past the end.
    while idx < len(prec_name):
        if prec_name[idx].isnumeric() and prec_name[idx - window_size:idx] == target:
            prec_name = prec_name[:idx - (window_size - 1)] + prec_name[idx:]
        idx += 1
    return prec_name

def edit_precinct_name(prec_name, 
    remove_lst=default_remove_lst, 
    target_to_replacement_substring = default_target_to_replacement_substring,
    stopping_words=default_stopping_words,
    target_to_replacement_word = default_target_to_replacement_word,
    prec_dict={}):
    '''
    Normalises a precinct name so that names from different datasets can be compared.

    The name is lower-cased first, so every list element and dictionary key passed in
    should be lower case. The steps run in this order:

        1. ordinal suffixes directly after a digit are removed (e.g. 3rd -> 3)
        2. every string in remove_lst is deleted wherever it occurs
        3. every key of target_to_replacement_substring is replaced by its value
        4. ' w', ' d', ' p' and ' twp' directly in front of a number lose everything
           but the leading space (e.g. ' d1' -> ' 1')
        5. the name is split into words; each word that is a key of
           target_to_replacement_word is replaced by its value
        6. words found in stopping_words are dropped and leading zeros are stripped
           from the remaining words
        7. if at least one remaining word is not numeric, numeric words at the start
           of the name are dropped
        8. empty words are discarded and the rest joined with single spaces
        9. if the result is a key of prec_dict, the mapped value is returned instead

    Parameters:
        prec_name (str): precinct name (any object is accepted; it is passed through str()).
        remove_lst ((str) list): substrings to delete.
        target_to_replacement_substring ({str:str} dictionary): substring rewrites.
        stopping_words ((str) list): whole words to drop.
        target_to_replacement_word ({str:str} dictionary): whole-word rewrites; mapping a
            word to '' removes it.
        prec_dict ({str:str} dictionary): final overrides keyed by the normalised name.

    Returns:
        prec_name (str): the normalised name, or its prec_dict override.
    '''
    cleaned = re.sub(r"(?<=\d)(st|nd|rd|th)\b", '', str(prec_name).lower())
    for unwanted in remove_lst:
        cleaned = cleaned.replace(unwanted, '')
    for old, new in target_to_replacement_substring.items():
        cleaned = cleaned.replace(old, new)
    for marker in (' w', ' d', ' p', ' twp'):
        cleaned = remove_pre_num_char(cleaned, marker)

    tokens = []
    for raw_word in cleaned.split():
        token = target_to_replacement_word.get(raw_word, raw_word)
        if token in stopping_words:
            continue
        tokens.append(token.lstrip('0'))

    # Skip leading numeric tokens (precinct codes) unless the whole name is numeric, in
    # which case no non-numeric token exists and the slice starts at 0.
    first_kept = next((pos for pos, token in enumerate(tokens) if not token.isnumeric()), 0)
    tokens = tokens[first_kept:]

    cleaned = " ".join(token for token in tokens if token)
    return prec_dict.get(cleaned, cleaned)


def row_to_edited_precinct_name(row, county_id_to_precinct_modifications_dictionary, county_id_to_target_to_replacement_word):
    '''
    Normalises the precinct name of one row using that row's county-specific settings.

    Parameters:
        row: mapping/Series with 'county_id' and 'original_precinct_name' entries.
        county_id_to_precinct_modifications_dictionary ({str:{str:str}} dictionary): per-county
            final overrides; counties that are not listed get no overrides.
        county_id_to_target_to_replacement_word ({str:{str:str}} dictionary): per-county word
            replacements. A listed county's table is used INSTEAD of
            default_target_to_replacement_word, not merged with it.

    Returns:
        (str): the edited precinct name with surrounding whitespace stripped.
    '''
    county = row['county_id']
    edited = edit_precinct_name(
        row['original_precinct_name'],
        prec_dict=county_id_to_precinct_modifications_dictionary.get(county, {}),
        target_to_replacement_word=county_id_to_target_to_replacement_word.get(county, default_target_to_replacement_word),
    )
    return edited.strip()

def enumerate_unmatched(precinct_list, unmatched_precincts, dataset_name):
    '''
    Prints one line for every precinct whose edited name is still unmatched.

    Parameters:
        precinct_list: iterable of (original_precinct_name, edited_precinct_name) pairs.
        unmatched_precincts: container of edited names that found no partner.
        dataset_name (str): label printed in parentheses after each line.
    '''
    for pair in precinct_list:
        original_name, edited_name = pair
        if edited_name not in unmatched_precincts:
            continue
        print("{} <-- {} ({})".format(edited_name, original_name, dataset_name))

# County-specific whole-word replacements for the election-results frame (`df`).
# row_to_edited_precinct_name uses a county's table here INSTEAD of
# default_target_to_replacement_word, so default rewrites (number words, 'n' -> 'north', ...)
# do not apply to a listed county unless they are repeated in its table. Only Northampton
# County explicitly starts from the defaults. A word mapped to '' is removed from the name.
county_id_to_target_to_replacement_word_y = {
    'Lancaster County':{'city':'', 'lampeter':''},
    'Luzerne County':{'w':'', 'city': ''},
    # defaults plus three county-specific entries
    'Northampton County':dict(default_target_to_replacement_word, **{'city':'', 'eastern':'east','western':'west'}),
    'Westmoreland County':{'no':'','n':'north', 's':'south', 'e':'east','w':'west'},
    'Armstrong County':{'nort':'north', 'sout':'south'},
    'Delaware County':{'city':'', 'prov':'providence','upp':'upper'},
}

# County-specific whole-word replacements for the shapefile frame (`gdf`).
# As with the results-side table, a county listed here does not get
# default_target_to_replacement_word at all; its table replaces the defaults.
# A word mapped to '' is removed from the name.
county_id_to_replacement_words_x = {
    'Butler County':{'ward':'city'},
    'Lancaster County':{'city':'', 'lampeter':''},
    # NOTE(review): the '~' entry looks like a no-op -- the default substring table already
    # turns '~' into a space before names are split into words. Confirm before removing.
    'Philadelphia County':{'~':' ','philadelphia':''},
    'Elk County':{'vtd':'',},
    'Westmoreland County':{'vtd':'', 'voting':''},
    'Armstrong County':{'vtd':'', 'voting':''},
    'Delaware County':{'vtd':'', 'voting':''},
}

# Final per-county name overrides for the shapefile frame (`gdf`), passed to
# edit_precinct_name as `prec_dict`: after normalisation, a name that equals a key is
# replaced by the mapped value. Keys must therefore be written in normalised form
# (lower case, stop words removed, ...). Values are used verbatim and are not normalised
# again, so they must equal the final name produced on the results side.
# Many entries fold shapefile precincts that are split by district
# ('... a (cong N)' / '... b (cong M)', '(hd N)') into a single name.
county_id_to_precinct_modification_dictionary_x = {
    'Allegheny County': {'whitehall 1 a (cong 14)':'whitehall 1', 'whitehall 1 b (cong 18)':'whitehall 1'},
    # NOTE(review): several values below ('carroltown', 'clearfied', 'johns twenty 1') look
    # misspelled or odd; presumably they mirror the spelling on the results side -- verify
    # before "correcting" them.
    'Cambria County':  {
        'johns 21' : 'johns twenty 1',
        'johns 8 2' : 'johns 8 3',
        'carrolltown' : 'carroltown', 
        'clearfield' : 'clearfied', 
        'johns old conemaugh' : 'johns old conemaugh woodv',
        'johns 11' : 'johns old conemaugh woodv',
        'reade north' : 'reade',
        'reade south' : 'reade', 
        'northern cambria 3 a (cong 9)' : 'northern cambria 3',
        'northern cambria 3 b (cong 12)' : 'northern cambria 3',
        'barr south a (cong 9)' : 'barr',
        'barr south b (cong 12)' : 'barr',
        'johns center town 1' : 'johns center town',
        'johns center town 2' : 'johns center town',
    },
    'Chester County': {
        'phoenixville middle 1 (hd 157)' : 'phoenixville middle 1',
        'phoenixville middle 1 (hd 155)' : 'phoenixville middle 1',
        'kennett 2 a (cong 7)' : 'kennett 2',
        'kennett 2 b (cong 16)' : 'kennett 2',
    },
    'Clarion County': {
        'farmington west' : 'farmington', 
        'farmington north' : 'farmington',
        'farmington south' : 'farmington',
        'piney a (cong 3)' : 'piney',
        'piney b (cong 5)' : 'piney',
    },
    'Cumberland County': {
        'lower allen annex':'lower allen 1 annex',
        'north middleton 1 congressional 10':'north middleton 1',
        'north middleton 1 congressional 13':'north middleton 1',
        'north middleton 3 congressional 10':'north middleton 3',
        'north middleton 3 congressional 13':'north middleton 3',
    },
    'Erie County': {'corry 4':'corry 3'},
    'Fayette County': {'masontown 1':'masontwn', 'masontown 2':'masontwn',},
    'Huntingdon County':{'penn b (9 cong)':'penn', 'penn a (5 cong)':'penn'},
    'Lebanon County':{'north lebanon east b (cong 15)':'north lebanon east', 'north lebanon east a (cong 6)':'north lebanon east'},
    'Montgomery County':{
        'hatfield 5 2 a (cong 13)':'hatfield 5 2',
        'hatfield 5 2 b (cong 8)':'hatfield 5 2',
        'horsham 2 2 b (cong 13)':'horsham 2 2',
        'horsham 2 2 a (cong 7)':'horsham 2 2 7',
        'lower merion 2 2 a (cong 2)':'lower merion 2 2',
        'lower merion 2 2 b (cong 13)':'lower merion 2 2',
        'perkiomen 1 a (cong 6)':'perkiomen 1',
        'perkiomen 1 b (cong 7)':'perkiomen 1 7',},
    'Northampton County':{'bethlehem 17 b (cong 17)':'bethlehem 17', 'bethlehem 17 a (cong 15)':'bethlehem 17'},
    'Northumberland County':{'riverside b (cong 11)':'riverside', 'riverside a (cong 10)':'riverside',},
    'Perry County':{'south west madison':'madison', 'north east madison':'sandy hill',},
    'Schuylkill County': {
        'cass 1':'cass north',
        'cass 2':'cass south',
        'pine grove 1':'pine grove north',
        'pine grove 2':'pine grove south',},
    'Tioga County': {'shippen a (cong 5)':'shippen', 'shippen b (cong 10)':'shippen'},
    'Washington County': {'canton 3':'canton 1', 'fallowfield 2 a (cong 9)':'fallowfield 2', 'fallowfield 2 b (cong 18)':'fallowfield 2'},
    # NOTE(review): 'radnor 6 1 hd 16' looks truncated ('hd 165'); confirm it matches the
    # name produced on the results side.
    'Delaware County':{'radnor 6 1 hd 165': 'radnor 6 1 hd 16', 'springfield 3 2': 'springfield 3 2 16'},
    # Bedford County values are mixed-case full names rather than normalised names.
    # NOTE(review): presumably the results side is mapped to the same literal strings for
    # this county -- verify against the results-side modification dictionary.
    'Bedford County':
        {'bedford 1': 'Bedford Township No 1',
         'bedford 2': 'Bedford Township No 2',
         'bedford east': 'Bedford Borough East Ward',
         'bedford west': 'Bedford Borough West Ward',
         'bloomfield': 'Bloomfield Township',
         'broad top': 'Broad Top Township',
         'coaldale': 'Coaldale Borough',
         'colerain': 'Colerain Township',
         'cumberland valley': 'Cumberland Valley',
         'east providence': 'East Providence Township',
         'east st clair': 'East St. Clair Township',
         'everett': 'Everett Borough',
         'harrison': 'Harrison Township',
         'hyndman': 'Hyndman Borough',
         'juniata': 'Juniata Township',
         'kimmel': 'Kimmel Township',
         'king': 'King Township',
         'liberty': 'Liberty Township',
         'lincoln': 'Lincoln Township',
         'londonderry': 'Londonderry Township',
         'mann': 'Mann Township',
         'manns choice': 'Manns Choice Borough',
         # NOTE(review): doubled 'Township' -- a typo unless the results side uses the
         # same string; confirm before changing.
         'monroe': 'Monroe Township Township',
         'napier': 'Napier Township',
         'new paris': 'New Paris Borough',
         'pavia': 'Pavia Township',
         'pleasantville': 'Pleasantville Borough',
         'rainsburg': 'Rainsburg Borough',
         'saxton': 'Saxton Borough',
         'schellsburg': 'Schellsburg Borough',
         'snake spring': 'Snake Spring',
         'south woodbury': 'South Woodbury Township',
         'southampton': 'Southampton Township',
         'st clairsville': 'St. Clairsville Borough',
         'west providence': 'West Providence Township',
         'west st clair': 'West St. Clair Township',
    },
    'Centre County':
        {'ferguson north central 2': 'ferguson north central',
         'ferguson northeast 1 a': 'ferguson northeast 1',
         'ferguson northeast 1 b': 'ferguson northeast 1',
         'ferguson west central 1': 'ferguson west central',
         'halfmoon proper': 'halfmoon',
         'state college east 1': 'sc east 1',
         'state college east 2': 'sc east 2',
         'state college east 3': 'sc east 3',
         'state college east 4': 'sc east 4',
         'state college east central 1': 'sc east central 1',
         'state college west central 1': 'sc west central 1'}
}

# Alphabetical list of the counties present in the shapefile, and how many there are.
county_lst = sorted(gdf['county_id'].unique())
n_counties = len(county_lst)

# Centre County
# Results rows for Centre County are translated through a number -> name lookup read from
# a side file; the entries below override or extend that lookup by hand.
centre = pd.read_csv('county-files/Centre County/num_to_name.csv')
number_to_name_centre = pd.Series(centre.Name.values,index=centre.Precinct).to_dict()
number_to_name_centre.update({
    '24': '0024 24 SC EAST 1',
    '25': '0025 25 SC EAST 2',
    '26': '0026 26 SC EAST 3',
    '27': '0027 27 SC EAST 4',
    '28': '0028 28 SC EAST CENTRAL 1',
    '33': '0033 33 SC WEST CENTRAL 1',
    '44': 'College West',
})
# Left-pad the current value to two characters, then look it up; values without an entry
# in the lookup become NaN.
is_centre_row = df.county_id.isin(['Centre County'])
df.loc[is_centre_row, 'original_precinct_name'] = (
    df[is_centre_row].original_precinct_name
    .map(lambda x: x.zfill(2))
    .map(number_to_name_centre)
)

# Normalise precinct names on both sides with their county-specific settings.
gdf['edited_precinct_name'] = gdf.apply(
    row_to_edited_precinct_name,
    axis=1,
    args=(county_id_to_precinct_modification_dictionary_x, county_id_to_replacement_words_x),
)
df['edited_precinct_name'] = df.apply(
    row_to_edited_precinct_name,
    axis=1,
    args=(county_id_to_precinct_modification_dictionary_y, county_id_to_target_to_replacement_word_y),
)

# VTDST modifications
def get_name_x(row):
    """Pick the precinct name for one shapefile row.

    Looks up the row's county in `county_id_to_vtd_to_oe_prec_name`. If that
    county has a table and the row's `VTDST` is one of its keys, the mapped
    name is returned; otherwise the row's `edited_precinct_name` is returned.
    """
    county_overrides = county_id_to_vtd_to_oe_prec_name.get(row['county_id'], {})
    vtd_code = row['VTDST']
    if vtd_code in county_overrides:
        return county_overrides[vtd_code]
    return row['edited_precinct_name']

def get_name_y(row):
    """Pick the precinct name for one election-results row.

    Looks up the row's county in `county_id_to_vtd_to_oe_prec_name`. If the
    row's `original_precinct_name` is one of that table's values, the original
    name is returned unchanged; otherwise the row's `edited_precinct_name`.
    """
    county_overrides = county_id_to_vtd_to_oe_prec_name.get(row['county_id'], {})
    original_name = row['original_precinct_name']
    if original_name in county_overrides.values():
        return original_name
    return row['edited_precinct_name']

# Apply the per-row name selection: `get_name_x` to shapefile rows,
# `get_name_y` to election-results rows.
gdf['edited_precinct_name'] = gdf.apply(get_name_x, axis='columns')
df['edited_precinct_name'] = df.apply(get_name_y, axis='columns')

######## Manual Corrections ###########

# cross county precincts
# Each entry: (edited precinct name, county_id it has in the shapefile, county_id to assign instead).
cross_county_moves = [
    ('springs', 'Fayette County', 'Somerset County'),
    ('adamstown', 'Berks County', 'Lancaster County'),
    ('tunnelhill', 'Blair County', 'Cambria County'),
    ('emlenton', 'Clarion County', 'Venango County'),
    ('ashland', 'Columbia County', 'Schuylkill County'),
]
for moved_precinct, shapefile_county, assigned_county in cross_county_moves:
    is_moved = (gdf.county_id == shapefile_county) & (gdf.edited_precinct_name == moved_precinct)
    gdf.loc[is_moved, 'county_id'] = assigned_county
# Runs after the moves above, so it renames every 'ashland' row that is now in
# Schuylkill County, including the one just reassigned from Columbia County.
is_schuylkill_ashland = (gdf.county_id == 'Schuylkill County') & (gdf.edited_precinct_name == 'ashland')
gdf.loc[is_schuylkill_ashland, 'edited_precinct_name'] = 'ashland 2'

# Fulton County
# The shapefile carries a single 'brush creek and valley hi' row. Copy it, name
# the copy 'valley hi', and give the copy the VALLEY-HI geometry from the
# PA_VTDs shapefile. The original row is renamed 'brush creek' at the end.
# The selection is limited to Fulton County, matching the filter used for the
# rename at the end of this section.
is_combined_fulton = (gdf.edited_precinct_name == 'brush creek and valley hi') & (gdf.county_id == 'Fulton County')
valley_hi = gdf[is_combined_fulton]
# Relabel from 0 so the copied row can be addressed as label 0 (the old label is kept as a column).
valley_hi = valley_hi.reset_index()
valley_hi.loc[0,'edited_precinct_name'] = 'valley hi'
MGGG_gdf = gpd.read_file('data/shapefiles/pa-precincts/PA_VTDs')
fulton_gdf = MGGG_gdf[(MGGG_gdf.COUNTYFP10 == '057')]
valley_hi.loc[0:,'geometry'] = fulton_gdf[fulton_gdf.NAME10 == 'VALLEY-HI'].reset_index().at[0,'geometry']
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0.
gdf = pd.concat([gdf, valley_hi])
# The Fulton 'wells' row also takes its geometry from the PA_VTDs shapefile.
gdf.loc[(gdf.edited_precinct_name == 'wells') & (gdf.county_id == 'Fulton County'),'geometry'] = fulton_gdf[fulton_gdf.NAME10 == 'WELLS TWP'].reset_index().at[0,'geometry']
gdf.loc[(gdf.edited_precinct_name == 'brush creek and valley hi') & (gdf.county_id == 'Fulton County'),'edited_precinct_name'] = 'brush creek'

# Precinct Splitting
# Read the cd116 district shapefile and keep only the rows whose STATEFP is '42'.
cds = gpd.read_file("data/shapefiles/tl_2018_us_cd116").loc[lambda districts: districts['STATEFP'] == '42']

# Berks County
# Intersect the listed Berks precincts with districts 04, 06 and 09 and replace
# each precinct with the resulting pieces, named '<precinct> (<district> cong)'.
cds_4_6_9 = cds[cds.CD116FP.isin({'04','06','09'})][['CD116FP','geometry']]
precs_to_split = ['1', 'exeter 5']
# One county-scoped mask drives both the selection and the removal. Removing by
# name alone would also drop a precinct called '1' in any other county without
# putting anything back in its place.
is_berks_split = (gdf.county_id == 'Berks County') & gdf.edited_precinct_name.isin(precs_to_split)
berks_county_gdf = gdf[is_berks_split]
overlayed = gpd.overlay(cds_4_6_9, berks_county_gdf, how='intersection')
overlayed['edited_precinct_name'] = overlayed.apply(lambda row: row['edited_precinct_name'] + ' ({} cong)'.format(str(int(row['CD116FP']))), axis=1)
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0.
gdf = pd.concat([gdf[~is_berks_split], overlayed[gdf.columns]])

# Cambria County
# Intersect the four East Taylor precincts with districts 13 and 15. Every piece
# inside district 15 is named 'east taylor 1'; every other piece 'east taylor 2'.
cds_13_15 = cds[cds.CD116FP.isin({'13','15'})][['CD116FP','geometry']]
precs_to_split = ['east taylor 1', 'east taylor 2', 'east taylor 3', 'east taylor 4']
# One county-scoped mask drives both the selection and the removal, so rows with
# these names in any other county are never dropped.
is_cambria_split = (gdf.county_id == 'Cambria County') & gdf.edited_precinct_name.isin(precs_to_split)
cambria_county_gdf = gdf[is_cambria_split]
overlayed = gpd.overlay(cds_13_15, cambria_county_gdf, how='intersection')
overlayed['edited_precinct_name'] = overlayed.apply(lambda row: 'east taylor 1' if row['CD116FP'] == '15' else 'east taylor 2', axis=1)
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0.
gdf = pd.concat([gdf[~is_cambria_split], overlayed[gdf.columns]])

# Montgomery County precinct splitting
# Intersect the listed Montgomery precincts with districts 01, 04 and 05 and
# replace each precinct with the resulting pieces, named '<precinct> <district number>'.
cds_1_4_5 = cds[cds.CD116FP.isin({'01','04','05'})][['CD116FP','geometry']]
precs_to_split = ['franconia 2', 'horsham 2 1', 'horsham 4 2', 'lower merion 12 2', 'lower merion 12 3']
# One county-scoped mask drives both the selection and the removal, so rows with
# these names in any other county are never dropped.
is_montgomery_split = (gdf.county_id == 'Montgomery County') & gdf.edited_precinct_name.isin(precs_to_split)
montgomery_county_gdf = gdf[is_montgomery_split]
overlayed = gpd.overlay(cds_1_4_5, montgomery_county_gdf, how='intersection')
overlayed['edited_precinct_name'] = overlayed.apply(lambda row: row['edited_precinct_name'] + ' ' + str(int(row['CD116FP'])), axis=1)
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0.
gdf = pd.concat([gdf[~is_montgomery_split], overlayed[gdf.columns]])
# Renumber the rows; the previous index labels are kept as a column.
gdf = gdf.reset_index()

# delaware
def del_precs(nm):
    """Drop the last character of every multi-character token that starts with a digit.

    The name is split on whitespace; tokens that begin with a digit and are
    longer than one character lose their final character, all other tokens are
    kept as they are. The tokens are re-joined with single spaces.
    """
    return ' '.join(
        token[:-1] if token[0].isdigit() and len(token) > 1 else token
        for token in nm.split()
    )
# Normalize the Delaware County results names with `del_precs`.
is_delaware_df = df.county_id == 'Delaware County'
df.loc[is_delaware_df, 'edited_precinct_name'] = df.loc[is_delaware_df, 'edited_precinct_name'].map(del_precs)
# For these four normalized names the results rows fall back to their original
# name, and the shapefile rows are renamed through `delaware_corrections`
# (presumably because two different municipalities normalize to the same name -- confirm).
ambiguous_delaware_names = ['chester 3', 'chester 4', 'darby 3 1', 'darby 3 2']
is_ambiguous_df = is_delaware_df & df.edited_precinct_name.isin(ambiguous_delaware_names)
df.loc[is_ambiguous_df, 'edited_precinct_name'] = df.original_precinct_name
# Shapefile `original_precinct_name` -> name to use as `edited_precinct_name`.
delaware_corrections = {
    'CHESTER TWP PCT 03': 'CHESTER CITY 3W',
    'CHESTER TWP PCT 04': 'CHESTER CITY 4W',
    'CHESTER WD 03': 'CHESTER TWP 3P',
    'CHESTER WD 04': 'CHESTER TWP 4P',
    'DARBY TWP WD 03 PCT 01': 'DARBY BORO 3W 1P',
    'DARBY TWP WD 03 PCT 02': 'DARBY BORO 3W 2P',
    'DARBY WD 03 PCT 01': 'DARBY TWP 3W 1P',
    'DARBY WD 03 PCT 02': 'DARBY TWP 3W 2P',
}
is_ambiguous_gdf = (gdf.county_id == 'Delaware County') & gdf.edited_precinct_name.isin(ambiguous_delaware_names)
gdf.loc[is_ambiguous_gdf, 'edited_precinct_name'] = gdf.original_precinct_name.map(delaware_corrections)


# Philly
# Philadelphia results rows use their original precinct name as the edited name.
is_philly_df = df.county_id == 'Philadelphia County'
df.loc[is_philly_df, 'edited_precinct_name'] = df.original_precinct_name
def philly_precs(nm):
    """Left-pad `nm` with zeros to at least four characters and insert '~' after the second character."""
    padded = nm.zfill(4)
    return '~'.join((padded[:2], padded[2:]))
# Reformat the Philadelphia shapefile names with `philly_precs`.
is_philly_gdf = gdf.county_id == 'Philadelphia County'
gdf.loc[is_philly_gdf, 'edited_precinct_name'] = gdf.loc[is_philly_gdf, 'edited_precinct_name'].map(philly_precs)

# Fayette County
# The Fayette row with VTDST '000020' is named 'bvilletwp'.
is_fayette_vtd_20 = (gdf.county_id == 'Fayette County') & (gdf.VTDST == '000020')
gdf.loc[is_fayette_vtd_20, 'edited_precinct_name'] = 'bvilletwp'

# Tioga County
# The Tioga results row originally named 'Ward Twp' is named 'Ward'.
is_tioga_ward = (df.original_precinct_name == 'Ward Twp') & (df.county_id == 'Tioga County')
df.loc[is_tioga_ward, 'edited_precinct_name'] = 'Ward'

# Bedford County
# Build a lookup from 'Precinct NNNN' (the file's `num`, zero-padded to four
# characters) to the file's `name`, then overwrite both the original and the
# edited name of the Bedford results rows with the looked-up name.
bedford = pd.read_csv('county-files/Bedford County/split_polling_locations.csv')
bedford['num'] = bedford['num'].map(lambda number: 'Precinct ' + str(number).zfill(4))
number_to_name_bedford = bedford.set_index('num')['name'].to_dict()
is_bedford = df.county_id.isin(['Bedford County'])
# Both columns are filled from the same lookup of the pre-update original names.
bedford_names = df.loc[is_bedford, 'original_precinct_name'].map(number_to_name_bedford)
df.loc[is_bedford, 'edited_precinct_name'] = bedford_names
df.loc[is_bedford, 'original_precinct_name'] = bedford_names

# On both datasets, store the (original name, edited name) pair as a tuple in one column.
for frame in (gdf, df):
    frame['original_precinct_name, edited_precinct_name'] = frame[['original_precinct_name','edited_precinct_name']].apply(tuple, axis=1)

In [22]:
#### Dissolve Shapefile on `loc, prec`

In [23]:
# Build the join key '<county_id>, <edited_precinct_name>' and merge all rows
# that share a key into a single feature.
gdf['loc, prec'] = [
    county + ', ' + precinct
    for county, precinct in zip(gdf['county_id'], gdf['edited_precinct_name'])
]
gdf = gdf.dissolve(by='loc, prec', as_index=False)
# Use `pa_fips_code` (set in the county-ID cell) instead of repeating the literal 42.
county_to_geoid = state_fip_to_county_to_geoid[pa_fips_code]
# COUNTYFP is overwritten with the value looked up from the county name.
gdf['COUNTYFP'] = gdf['county_id'].map(county_to_geoid)
gdf = gdf.set_index('loc, prec')

#### Final validation for the election results before joining with the shapefile

In [24]:
# create 'loc, prec' to be used for the join 
df['loc, prec'] = df.apply(lambda row: row['county_id'] + ', ' + row['edited_precinct_name'], axis=1)

# screen for duplicate names
# Keys listed here are allowed to appear on more than one results row; the
# value records why. Any other repeated key fails the assertion below.
acceptable_duplicates_to_reason = {
    'Clarion County, farmington':"Multiple polling locations - same precinct",
    'Delaware County, springfield 2 3':"combine 'emerg' results with normal results",
    'Delaware County, springfield 3 2 16':"originally seperate because state leg distirct splits the precinct and thus they get different ballots. merging since we aren't including state leg resutls.",
    'Delaware County, upper darby 6 1':"combine 'emerg' results with normal results",
    'Delaware County, upper darby 6 8':"combine 'emerg' results with normal results",
    'Fayette County, brownsville':"Fayette Board of elections said they don't have a shapefile with brownsville seperated into 'bvillew1', 'bvillew2', and 'bvillew3'.",
    'Fayette County, franklin':"Fayette Board of elections said they don't have a shapefile with franklin seperated into 'franklind1' and 'franklind2'.",
    'Lancaster County, lancaster 7 8':"Spoke with Diane who told me that 'lancaster 7 8' has two different school districts, but their ballots are the same other than school district related items. Therefore I combined their results under the single shapefiles.",
    'Mifflin County, brown':"'brown reedsville big va', 'brown church hill' are contigous and County is unwilling to share shapefile so I am aggregating the results for 'brown reedsville big va' and 'brown church hill' under the 'brown' geometry.",
    'Monroe County, smithfield 1':"Monroe County Shapefile from the county doesn't have a 'smithfield 4' precinct. The polling location for smithfield 1 and smithfield 4 is the same address, so I decided to just add the smithfield 4 results to smithfield 1.",
    'Northampton County, palmer upper western':"Called 610-829-6260 and learned that 'NAZ IND' is only seperate for school district related elections. For Federal Elections, it is the same precinct as PALMER TOWNSHIP UPPER WEST DISTRICT EASTON. ",
    'Washington County, peters 3':"'petersxd 3' looks like a mis-scan of 'peter d-3' renamed it accordingly. ",
}
counts = df['loc, prec'].value_counts()
duplicates = counts[(counts>1) & (~counts.index.isin(list(acceptable_duplicates_to_reason)))]
# Name the offending keys so a failure can be acted on without re-running by hand.
assert len(duplicates) == 0, "unexpected duplicate 'loc, prec' keys: {}".format(list(duplicates.index))

# aggregate rows that share a 'loc, prec' key into one row (summing the results)
# numeric_only=True is passed explicitly: since pandas 2.0 the default is False,
# which would also aggregate the non-numeric columns and keep them in the result.
df = df.groupby(by='loc, prec').sum(numeric_only=True)

In [25]:
# The shapefile and the results must have exactly the same set of 'loc, prec'
# keys. On failure, list the keys found on only one side.
shapefile_keys = set(gdf.index.unique())
results_keys = set(df.index.unique())
assert shapefile_keys == results_keys, (
    "'loc, prec' keys only in the shapefile: {}; only in the results: {}".format(
        sorted(shapefile_keys - results_keys, key=str),
        sorted(results_keys - shapefile_keys, key=str),
    )
)

### Join the election results and shapefile on `loc, prec`

In [26]:
# Join the shapefile columns onto the summed results using their shared
# 'loc, prec' index; column names present on both sides get '_left' / '_right'.
joined_df = df.join(gdf, lsuffix='_left', rsuffix='_right')
# Turn the 'loc, prec' index back into an ordinary column.
joined_df = joined_df.reset_index()
print(joined_df.shape)
joined_df.head(2)

(9150, 123)


Unnamed: 0,"loc, prec",index_left,G18DemGov,G18DemHOR,G18DemSen,G18GreGov,G18GreHOR,G18GreSen,G18IndGov,G18IndHOR,...,T16TREASR,T16CONGD,T16CONGR,T16STSD,T16STSR,T16STHD,T16STHR,edited_precinct_name,index_right,"original_precinct_name, edited_precinct_name"
0,"Adams County, abbottstown",0,120.0,108.0,120.0,2.0,0.0,2.0,0.0,0.0,...,,,,,,,,abbottstown,,"(ABBOTTSTOWN, abbottstown)"
1,"Adams County, arendtsville",1,160.0,132.0,151.0,2.0,0.0,3.0,0.0,1.0,...,,,,,,,,arendtsville,,"(ARENDTSVILLE, arendtsville)"


### Choose columns, check that there are no empty values, and save the file

In [27]:
# Columns written to the output file: identifiers, the 2018 Senate, Governor
# and House vote columns, and the geometry.
cols = ['loc, prec', 'COUNTYFP', 'county_id', 'edited_precinct_name',
 'G18DemSen', 'G18RepSen', 'G18LibSen', 'G18GreSen', 'G18IndSen',
 'G18DemGov', 'G18RepGov', 'G18LibGov', 'G18GreGov', 'G18IndGov', 
 'G18DemHOR', 'G18RepHOR', 'G18LibHOR','G18GreHOR', 'G18IndHOR', 
 'geometry']
# `joined_df` is a plain pandas DataFrame, so pass the shapefile's CRS
# explicitly instead of relying on it surviving the join.
output = gpd.GeoDataFrame(
    joined_df[cols].rename(columns={'COUNTYFP':'GEOID', 'edited_precinct_name':'precinct'}),
    crs=gdf.crs,
)
# Fail before saving if any selected column contains a missing value.
missing_per_column = output.isna().sum()
assert not missing_per_column.any(), 'output has empty values:\n{}'.format(missing_per_column[missing_per_column > 0])
output.head()

Unnamed: 0,"loc, prec",GEOID,county_id,precinct,G18DemSen,G18RepSen,G18LibSen,G18GreSen,G18IndSen,G18DemGov,G18RepGov,G18LibGov,G18GreGov,G18IndGov,G18DemHOR,G18RepHOR,G18LibHOR,G18GreHOR,G18IndHOR,geometry
0,"Adams County, abbottstown",42001,Adams County,abbottstown,120.0,183.0,5.0,2.0,0.0,120.0,185.0,2.0,2.0,0.0,108.0,201.0,0.0,0.0,0.0,"POLYGON Z ((-76.99801 39.88359 0.00000, -76.99..."
1,"Adams County, arendtsville",42001,Adams County,arendtsville,151.0,178.0,6.0,3.0,0.0,160.0,172.0,4.0,2.0,0.0,132.0,204.0,0.0,0.0,1.0,"POLYGON Z ((-77.31141 39.92625 0.00000, -77.30..."
2,"Adams County, bendersville",42001,Adams County,bendersville,74.0,103.0,1.0,2.0,0.0,76.0,98.0,3.0,2.0,0.0,66.0,111.0,0.0,0.0,0.0,"POLYGON Z ((-77.25596 39.98075 0.00000, -77.25..."
3,"Adams County, berwick",42001,Adams County,berwick,289.0,575.0,14.0,5.0,0.0,318.0,554.0,9.0,5.0,0.0,252.0,631.0,0.0,0.0,0.0,"MULTIPOLYGON Z (((-77.02734 39.87105 0.00000, ..."
4,"Adams County, biglerville",42001,Adams County,biglerville,152.0,231.0,3.0,7.0,1.0,168.0,215.0,5.0,2.0,1.0,133.0,261.0,0.0,0.0,0.0,"POLYGON Z ((-77.25594 39.93043 0.00000, -77.25..."


In [28]:
# Write `output` to the path below using GeoDataFrame.to_file's default settings.
output_path = 'Pennsylvania 2018 General Election Shapefile'
output.to_file(output_path)