# Match 2016 NCSBE House Votes

Retrieve precinct-sorted 2016 general election results from [`precinct_sort_20161108.zip`](https://dl.ncsbe.gov/index.html?prefix=ENRS/2016_11_08/precinct%20sort/), match them to [parties from Ballotpedia 2016 races](https://docs.google.com/spreadsheets/d/1LmNoDfZH9lMtGh5kLwQG7Te7o3XugiarnK2zron72pE/edit#gid=1587907766), and output [new 2016 North Carolina model data](https://docs.google.com/spreadsheets/d/1aMPAXJN7Km3fxglhHXPqsCl4TKGCE_JXooufT6KVom0/edit#gid=684088115).

In [1]:
import os; os.environ['DYLD_LIBRARY_PATH'] = './.venv-NC/lib'
import pandas, editdistance, numpy, geopandas, re

## Import NCSBE Votes

In [2]:
# Read every column as text so zero-padded codes such as precinct "01" are
# kept exactly as published.
df_2016 = pandas.read_csv('precinct_sort_20161108.txt.gz', sep='\t', dtype=str)

# State House rows. na=False treats a missing contest_name as "no match";
# without it the mask would contain NaN and pandas refuses to index with it.
is_sldl = df_2016.contest_name.str.startswith('NC HOUSE OF REPRESENTATIVES DISTRICT ', na=False)
df_2016sldl = df_2016[is_sldl]

# Presidential rows.
df_2016uspres = df_2016[df_2016.contest_name == 'US PRESIDENT']

df_2016sldl

Unnamed: 0,county_id,county_desc,precinct_code,precinct_desc,contest_name,vote_for,candidate_name,votes
80,1,ALAMANCE,01,PATTERSON,NC HOUSE OF REPRESENTATIVES DISTRICT 064,1,Dennis Riddell,2029
81,1,ALAMANCE,01,PATTERSON,NC HOUSE OF REPRESENTATIVES DISTRICT 064,1,OVER VOTES,0
82,1,ALAMANCE,01,PATTERSON,NC HOUSE OF REPRESENTATIVES DISTRICT 064,1,UNDER VOTES,338
210,1,ALAMANCE,02,COBLE,NC HOUSE OF REPRESENTATIVES DISTRICT 064,1,Dennis Riddell,2149
211,1,ALAMANCE,02,COBLE,NC HOUSE OF REPRESENTATIVES DISTRICT 064,1,OVER VOTES,0
...,...,...,...,...,...,...,...,...
371483,100,YANCEY,10 PEN,PENSACOLA,NC HOUSE OF REPRESENTATIVES DISTRICT 118,1,UNDER VOTES,23
371605,100,YANCEY,11 PRI,PRICES CREEK,NC HOUSE OF REPRESENTATIVES DISTRICT 118,1,Michele D. Presnell,567
371606,100,YANCEY,11 PRI,PRICES CREEK,NC HOUSE OF REPRESENTATIVES DISTRICT 118,1,OVER VOTES,0
371607,100,YANCEY,11 PRI,PRICES CREEK,NC HOUSE OF REPRESENTATIVES DISTRICT 118,1,Rhonda Cole Schandevel,288


## Import Ballotpedia Candidates

Match candidate names between Ballotpedia and NCSBE in each house district to later determine party votes.

In [3]:
# Load the Ballotpedia 2016 State House candidate export.
df_candidates = pandas.read_csv('North Carolina State House Candidates - 2016 Candidates.csv')

def closest_name(name, names):
    """Return the entry of `names` with the smallest edit distance to `name`.

    Parameters:
        name: candidate name to look up. A missing value (None or any NaN)
            or the literal placeholder 'No candidate' means there is
            nobody to match.
        names: iterable of candidate-name strings to choose from.

    Returns:
        The closest string from `names`; ties on distance resolve to the
        alphabetically first name. None when there is nobody to match or
        `names` is empty.
    """
    # pandas.isna() recognises every NaN object and None, not only the
    # numpy.nan singleton that a tuple-membership test would catch.
    if pandas.isna(name) or name == 'No candidate':
        return None

    target = str(name)
    scored = [(editdistance.distance(n, target), n) for n in names]
    if not scored:
        # Nothing to compare against, so there is no match to report.
        return None

    # min() over (distance, name) pairs picks the same element a full sort
    # would place first.
    return min(scored)[1]

# Per-district results, collected in the same order as the rows of
# df_candidates so they can be written back as columns afterwards.
DEMs, REPs, contests = [], [], []

ballotpedia_columns = zip(df_candidates['State House District'],
                          df_candidates.Democrat,
                          df_candidates.Republican)

for district, democrat, republican in ballotpedia_columns:
    # NCSBE contest names carry the district number zero-padded to 3 digits.
    contest = 'NC HOUSE OF REPRESENTATIVES DISTRICT {:03d}'.format(district)

    # Candidate names NCSBE reports for this contest.
    in_contest = df_2016sldl.contest_name == contest
    ncsbe_names = set(df_2016sldl.loc[in_contest, 'candidate_name'])

    DEMs.append(closest_name(democrat, ncsbe_names))
    REPs.append(closest_name(republican, ncsbe_names))
    contests.append(contest)

# Replace the Ballotpedia spellings with the matched NCSBE spellings and
# record the NCSBE contest name for each district.
df_candidates['Democrat'] = DEMs
df_candidates['Republican'] = REPs
df_candidates['Contest'] = contests

df_candidates

Unnamed: 0,State House District,Winning Party,Incumbent Party,Democrat,Republican,Other,Contest
0,1,R,R,Sam Davis,Bob Steinburg,,NC HOUSE OF REPRESENTATIVES DISTRICT 001
1,2,R,R,Joe Parrish,Larry Yarborough,,NC HOUSE OF REPRESENTATIVES DISTRICT 002
2,3,R,R,Marva Fisher Baldwin,Michael Speciale,,NC HOUSE OF REPRESENTATIVES DISTRICT 003
3,4,R,R,,Jimmy Dixon,,NC HOUSE OF REPRESENTATIVES DISTRICT 004
4,5,D,D,Howard J. Hunter III,,,NC HOUSE OF REPRESENTATIVES DISTRICT 005
...,...,...,...,...,...,...,...
115,116,D,D,Brian Turner,,,NC HOUSE OF REPRESENTATIVES DISTRICT 116
116,117,R,R,,Chuck McGrady,,NC HOUSE OF REPRESENTATIVES DISTRICT 117
117,118,R,R,Rhonda Cole Schandevel,Michele D. Presnell,,NC HOUSE OF REPRESENTATIVES DISTRICT 118
118,119,R,D,Joe Sam Queen,Mike Clampitt,,NC HOUSE OF REPRESENTATIVES DISTRICT 119


## Count Precinct Party Votes

Create a new `df_2016new` DataFrame with vote counts for State representatives and U.S. President by party.

In [4]:
def _first_votes(rows, candidate_name):
    """Return the vote count of the first row in `rows` for `candidate_name`.

    Returns 0 when no row matches, which includes a `candidate_name` of
    None (no candidate of that party in the contest).
    """
    matches = rows[rows.candidate_name == candidate_name]
    if matches.empty:
        return 0
    return int(matches.votes.iloc[0])


# Column buffers for df_2016new. State House rows and presidential rows are
# appended separately; each kind fills the other's vote columns with 0.
arrays = dict(county_desc=list(), precinct_code=list(), district=list(),
              incumbent=list(), winner=list(), sldl_votes_D=list(), sldl_votes_R=list(),
              uspres_votes_D=list(), uspres_votes_R=list())

precinct_keys = ['county_desc', 'precinct_code']

# One output row per (county, precinct, State House contest). Iterating the
# groupby hands over each sub-frame directly instead of re-filtering the
# whole results frame for every key.
for (county_desc, precinct_code, contest_name), subdf_2016sldl in \
        df_2016sldl.groupby(precinct_keys + ['contest_name']):
    arrays['county_desc'].append(county_desc)
    arrays['precinct_code'].append(precinct_code)

    row_candidates = df_candidates[df_candidates.Contest == contest_name].iloc[0]
    arrays['district'].append(row_candidates['State House District'].astype(str))
    arrays['incumbent'].append(row_candidates['Incumbent Party'])
    arrays['winner'].append(row_candidates['Winning Party'])

    arrays['sldl_votes_D'].append(_first_votes(subdf_2016sldl, row_candidates.Democrat))
    arrays['sldl_votes_R'].append(_first_votes(subdf_2016sldl, row_candidates.Republican))

    arrays['uspres_votes_D'].append(0)
    arrays['uspres_votes_R'].append(0)


# Presidential rows split once by precinct, so the loop below is a dict
# lookup rather than a scan of the whole frame per precinct.
uspres_by_precinct = {key: rows for key, rows in df_2016uspres.groupby(precinct_keys)}
no_uspres_rows = df_2016uspres.iloc[0:0]

# One output row per precinct that has a State House contest (the precinct
# list deliberately comes from the State House rows, as before).
for precinct_key in df_2016sldl.groupby(precinct_keys).groups:
    county_desc, precinct_code = precinct_key
    subdf_2016uspres = uspres_by_precinct.get(precinct_key, no_uspres_rows)

    arrays['county_desc'].append(county_desc)
    arrays['precinct_code'].append(precinct_code)

    arrays['district'].append(None)
    arrays['incumbent'].append(None)
    arrays['winner'].append(None)

    arrays['sldl_votes_D'].append(0)
    arrays['sldl_votes_R'].append(0)

    arrays['uspres_votes_D'].append(_first_votes(subdf_2016uspres, 'Hillary Clinton'))
    arrays['uspres_votes_R'].append(_first_votes(subdf_2016uspres, 'Donald J. Trump'))


df_2016new = pandas.DataFrame(arrays)
print(df_2016new.shape)
print('SLDL Votes:', df_2016new.sldl_votes_D.sum(), df_2016new.sldl_votes_R.sum())
print('US Pres Votes:', df_2016new.uspres_votes_D.sum(), df_2016new.uspres_votes_R.sum())
df_2016new

(5848, 9)
SLDL Votes: 1949386 2150214
US Pres Votes: 2187676 2362130


Unnamed: 0,county_desc,precinct_code,district,incumbent,winner,sldl_votes_D,sldl_votes_R,uspres_votes_D,uspres_votes_R
0,ALAMANCE,01,64,R,R,0,2029,0,0
1,ALAMANCE,02,64,R,R,0,2149,0,0
2,ALAMANCE,035,64,R,R,0,1926,0,0
3,ALAMANCE,03C,63,R,R,0,1342,0,0
4,ALAMANCE,03C,64,R,R,0,107,0,0
...,...,...,...,...,...,...,...,...,...
5843,YANCEY,07 BRU,,,,0,0,91,163
5844,YANCEY,08 CRA,,,,0,0,522,1260
5845,YANCEY,09 SOU,,,,0,0,596,766
5846,YANCEY,10 PEN,,,,0,0,106,270


## Assign Precinct Geography PSIDs

Read precinct PSID values from `NC-Geographies.gpkg` and merge them into the vote rows by county name and precinct ID.

In [5]:
# Precinct geometries and their PSID values.
geogs_2016 = geopandas.read_file('NC-Geographies.gpkg', layer='precincts')

# Prefix every PSID with 'PSID:'. Plain concatenation gives the same text as
# substituting at the '^' anchor, and does not depend on how
# Series.str.replace treats a compiled pattern when `regex` is left at its
# default (False since pandas 2.0).
geogs_2016['psid'] = 'PSID:' + geogs_2016.psid.astype(str)

# Left merge: every vote row is kept, and rows with no matching precinct
# geometry get missing values in the geography columns. Key columns are
# passed as lists, the documented form for multi-column keys.
df_2016out = df_2016new.merge(geogs_2016, how='left',
                              left_on=['county_desc', 'precinct_code'],
                              right_on=['county_name', 'precinct_id'])

df_2016out

Unnamed: 0,county_desc,precinct_code,district,incumbent,winner,sldl_votes_D,sldl_votes_R,uspres_votes_D,uspres_votes_R,psid,gid,year,county_fips,county_name,precinct_id,geometry
0,ALAMANCE,01,64,R,R,0,2029,0,0,PSID:1158854937,2344.0,2016,1,ALAMANCE,01,(POLYGON ((-79.43197308657012 35.8892919150817...
1,ALAMANCE,02,64,R,R,0,2149,0,0,PSID:1158848961,2342.0,2016,1,ALAMANCE,02,(POLYGON ((-79.53483899133798 36.0438210001847...
2,ALAMANCE,035,64,R,R,0,1926,0,0,PSID:1158850573,2337.0,2016,1,ALAMANCE,035,(POLYGON ((-79.50051299120325 36.1542650001393...
3,ALAMANCE,03C,63,R,R,0,1342,0,0,PSID:1158854377,2350.0,2016,1,ALAMANCE,03C,(POLYGON ((-79.5203239911727 36.08219700003401...
4,ALAMANCE,03C,64,R,R,0,107,0,0,PSID:1158854377,2350.0,2016,1,ALAMANCE,03C,(POLYGON ((-79.5203239911727 36.08219700003401...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5843,YANCEY,07 BRU,,,,0,0,91,163,PSID:1158854169,1089.0,2016,100,YANCEY,07 BRU,(POLYGON ((-82.19135299166359 35.9750180004796...
5844,YANCEY,08 CRA,,,,0,0,522,1260,PSID:1158852683,1092.0,2016,100,YANCEY,08 CRA,(POLYGON ((-82.18617999165113 35.9435280000667...
5845,YANCEY,09 SOU,,,,0,0,596,766,PSID:1158854577,1096.0,2016,100,YANCEY,09 SOU,(POLYGON ((-82.22187599103363 35.8651619998303...
5846,YANCEY,10 PEN,,,,0,0,106,270,PSID:1158853255,1095.0,2016,100,YANCEY,10 PEN,(POLYGON ((-82.26803599172992 35.7637380002675...


## Group Votes by Precinct

Add up all votes, grouping by precinct and joining each precinct's distinct PSIDs, districts, winners, and incumbents with semicolons.

In [6]:
# Bucket the merged rows by precinct (county name + precinct code).
grouped = df_2016out.groupby(['county_desc', 'precinct_code'])

def semicolon(series):
    """Join the distinct, non-missing values of `series` with semicolons.

    Values are converted with str(). Missing values (None or any NaN) and
    other falsy values such as '' are skipped. Duplicates are dropped and
    the remaining values keep their first-seen order, so the result is the
    same on every run.

    Returns '' when nothing is left to join.
    """
    # Test for missing first: pandas.isna() catches every NaN object, not
    # only the numpy.nan singleton, and NaN itself is truthy.
    kept = (str(v) for v in series.values if not pandas.isna(v) and v)

    # dict.fromkeys() de-duplicates while preserving insertion order; a set
    # would make the order depend on string hashing.
    return ';'.join(dict.fromkeys(kept))

def doit(df):
    """Summarise one precinct's rows as a single-row DataFrame.

    The psid, district, winner and incumbent columns are each collapsed
    with semicolon(); the four vote-count columns are summed.
    """
    label_columns = ('psid', 'district', 'winner', 'incumbent')
    vote_columns = ('sldl_votes_D', 'sldl_votes_R',
                    'uspres_votes_D', 'uspres_votes_R')

    # Each value is wrapped in a one-element list so the result has one row.
    summary = {column: [semicolon(df[column])] for column in label_columns}
    for column in vote_columns:
        summary[column] = [df[column].sum()]

    return pandas.DataFrame(summary)

# Collapse each precinct's rows into one summary row via doit().
df_2016final = grouped.apply(doit)

df_2016final

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,psid,district,winner,incumbent,sldl_votes_D,sldl_votes_R,uspres_votes_D,uspres_votes_R
county_desc,precinct_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ALAMANCE,01,0,PSID:1158854937,64,R,R,0,2029,411,1865
ALAMANCE,02,0,PSID:1158848961,64,R,R,0,2149,403,2004
ALAMANCE,035,0,PSID:1158850573,64,R,R,0,1926,995,1485
ALAMANCE,03C,0,PSID:1158854377,64;63,R,R,0,1449,679,1059
ALAMANCE,03N,0,PSID:1158852503,64,R,R,0,1627,1036,1160
...,...,...,...,...,...,...,...,...,...,...
YANCEY,07 BRU,0,PSID:1158854169,118,R,R,109,149,91,163
YANCEY,08 CRA,0,PSID:1158852683,118,R,R,728,1095,522,1260
YANCEY,09 SOU,0,PSID:1158854577,118,R,R,691,681,596,766
YANCEY,10 PEN,0,PSID:1158853255,118,R,R,151,218,106,270


In [7]:
# Write the per-precinct totals to disk.
df_2016final.to_csv('df_2016out.csv')