# Match 2016 NCSBE Senate Votes

Retrieve precinct-sorted 2016 general election results from [`precinct_sort_20161108.zip`](https://dl.ncsbe.gov/index.html?prefix=ENRS/2016_11_08/precinct%20sort/), match them to [parties from Ballotpedia 2016 races](https://docs.google.com/spreadsheets/d/1LmNoDfZH9lMtGh5kLwQG7Te7o3XugiarnK2zron72pE/edit#gid=1587907766), and output [new 2016 North Carolina model data](https://docs.google.com/spreadsheets/d/1aMPAXJN7Km3fxglhHXPqsCl4TKGCE_JXooufT6KVom0/edit#gid=684088115).

In [1]:
import os; os.environ['DYLD_LIBRARY_PATH'] = './.venv-NC/lib'
import pandas, editdistance, numpy, geopandas, re

## Import NCSBE Votes

In [2]:
# Load the tab-separated NCSBE precinct results with every column read as
# str, so codes such as precinct '01' are not reinterpreted as numbers.
df_2016 = pandas.read_csv('precinct_sort_20161108.txt.gz', sep='\t', dtype=str)

# Row masks for the two kinds of contest used below.
is_sldu_contest = df_2016.contest_name.str.startswith('NC STATE SENATE DISTRICT ')
is_uspres_contest = df_2016.contest_name == 'US PRESIDENT'

df_2016sldu = df_2016[is_sldu_contest]
df_2016uspres = df_2016[is_uspres_contest]

df_2016sldu

Unnamed: 0,county_id,county_desc,precinct_code,precinct_desc,contest_name,vote_for,candidate_name,votes
92,1,ALAMANCE,01,PATTERSON,NC STATE SENATE DISTRICT 24,1,John Thorpe,474
93,1,ALAMANCE,01,PATTERSON,NC STATE SENATE DISTRICT 24,1,OVER VOTES,0
94,1,ALAMANCE,01,PATTERSON,NC STATE SENATE DISTRICT 24,1,Rick Gunn,1826
95,1,ALAMANCE,01,PATTERSON,NC STATE SENATE DISTRICT 24,1,UNDER VOTES,67
222,1,ALAMANCE,02,COBLE,NC STATE SENATE DISTRICT 24,1,John Thorpe,436
...,...,...,...,...,...,...,...,...
371496,100,YANCEY,10 PEN,PENSACOLA,NC STATE SENATE DISTRICT 47,1,UNDER VOTES,22
371618,100,YANCEY,11 PRI,PRICES CREEK,NC STATE SENATE DISTRICT 47,1,Mary Jane Boyd,293
371619,100,YANCEY,11 PRI,PRICES CREEK,NC STATE SENATE DISTRICT 47,1,OVER VOTES,0
371620,100,YANCEY,11 PRI,PRICES CREEK,NC STATE SENATE DISTRICT 47,1,Ralph Hise,566


## Import Ballotpedia Candidates

Match candidate names between Ballotpedia and NCSBE in each State Senate district so that votes can later be attributed to a party.

In [3]:
# Candidate sheet for the 2016 Senate races, read from its CSV export.
candidates_csv = 'North Carolina State Lege Candidates - 2016 Senate.csv'
df_candidates = pandas.read_csv(candidates_csv)

def closest_name(name, names):
    """Return the entry of `names` closest to `name` by edit distance.

    Parameters:
        name: candidate name to look up. A missing value (None or any NaN)
            or the literal 'No candidate' means there is nothing to match.
        names: collection of candidate-name strings to choose from.

    Returns:
        The closest string from `names`, or None when `name` is missing,
        is 'No candidate', or `names` is empty. Ties on distance are broken
        by taking the alphabetically smallest name.
    """
    # pandas.isna catches every NaN object and None; an identity test
    # against numpy.nan only catches that one singleton.
    if pandas.isna(name) or name == 'No candidate':
        return None
    if not names:
        return None
    target = str(name)
    # Comparing (distance, name) tuples picks the smallest distance first.
    return min((editdistance.distance(candidate, target), candidate)
               for candidate in names)[1]

# For every row of the candidate sheet, build the NCSBE contest label and
# replace each party's candidate with the closest NCSBE spelling of that
# name within the same contest (None where closest_name finds no candidate).
DEMs = []
REPs = []
contests = []

for _, candidate_row in df_candidates.iterrows():
    district_number = candidate_row['State Senate District']
    contest_label = 'NC STATE SENATE DISTRICT {:02d}'.format(district_number)

    in_contest = df_2016sldu.contest_name == contest_label
    ncsbe_names = set(df_2016sldu.loc[in_contest, 'candidate_name'])

    contests.append(contest_label)
    DEMs.append(closest_name(candidate_row.Democrat, ncsbe_names))
    REPs.append(closest_name(candidate_row.Republican, ncsbe_names))

df_candidates['Democrat'] = DEMs
df_candidates['Republican'] = REPs
df_candidates['Contest'] = contests

df_candidates

Unnamed: 0,State Senate District,Winning Party,Incumbent Party,Democrat,Republican,Other,Contest
0,1,,R,Brownie Futrell,Bill Cook,,NC STATE SENATE DISTRICT 01
1,2,,R,Dorothea E. White,Norman Sanderson,,NC STATE SENATE DISTRICT 02
2,3,,D,Erica Smith-Ingram,,,NC STATE SENATE DISTRICT 03
3,4,,D,Angela R. Bryant,Richard Scott,,NC STATE SENATE DISTRICT 04
4,5,,D,Don Davis,,,NC STATE SENATE DISTRICT 05
5,6,,R,,Harry Brown,,NC STATE SENATE DISTRICT 06
6,7,,R,,"Louis M. Pate, Jr.",,NC STATE SENATE DISTRICT 07
7,8,,R,,Bill Rabon,,NC STATE SENATE DISTRICT 08
8,9,,R,Andrew Barnhill,Michael Lee,,NC STATE SENATE DISTRICT 09
9,10,,R,,Brent Jackson,,NC STATE SENATE DISTRICT 10


## Count Precinct Party Votes

Create a new `df_2016new` DataFrame with vote counts for State Senate and U.S. President by party.

In [5]:
def _first_votes(subdf, candidate_name):
    """Return the vote count on the first row of `subdf` for `candidate_name`.

    Returns 0 when no row matches, including when `candidate_name` is None.
    """
    matches = subdf[subdf.candidate_name == candidate_name]
    if matches.empty:
        return 0
    return int(matches.iloc[0].votes)


# Column-wise accumulator for df_2016new; every loop iteration below appends
# exactly one value to each list so the columns stay aligned.
arrays = dict(county_desc=[], precinct_code=[], district=[],
              incumbent=[], winner=[], sldu_votes_D=[], sldu_votes_R=[],
              uspres_votes_D=[], uspres_votes_R=[])

# Pass 1: one row per (county, precinct, Senate contest) holding the Senate
# votes by party. Iterating the GroupBy yields each group's rows directly,
# so the full frame is not re-filtered for every key.
sldu_groups = df_2016sldu.groupby(['county_desc', 'precinct_code', 'contest_name'])

for (county_desc, precinct_code, contest_name), subdf_2016sldu in sldu_groups:
    arrays['county_desc'].append(county_desc)
    arrays['precinct_code'].append(precinct_code)

    row_candidates = df_candidates[df_candidates.Contest == contest_name].iloc[0]
    # str() works for both NumPy and plain Python scalars.
    arrays['district'].append(str(row_candidates['State Senate District']))
    arrays['incumbent'].append(row_candidates['Incumbent Party'])
    arrays['winner'].append(row_candidates['Winning Party'])

    arrays['sldu_votes_D'].append(_first_votes(subdf_2016sldu, row_candidates.Democrat))
    arrays['sldu_votes_R'].append(_first_votes(subdf_2016sldu, row_candidates.Republican))

    # Presidential votes are filled in by the second pass.
    arrays['uspres_votes_D'].append(0)
    arrays['uspres_votes_R'].append(0)


# Pass 2: one row per (county, precinct) that has a Senate contest, holding
# the presidential votes by party.
uspres_by_precinct = {
    precinct_key: precinct_rows
    for precinct_key, precinct_rows
    in df_2016uspres.groupby(['county_desc', 'precinct_code'])
}
# Stand-in for precincts with no presidential rows; yields 0 votes.
no_uspres_rows = df_2016uspres.iloc[:0]

groups2 = df_2016sldu.groupby(['county_desc', 'precinct_code']).groups

for (county_desc, precinct_code) in groups2.keys():
    subdf_2016uspres = uspres_by_precinct.get((county_desc, precinct_code), no_uspres_rows)

    arrays['county_desc'].append(county_desc)
    arrays['precinct_code'].append(precinct_code)

    # These rows carry no Senate information.
    arrays['district'].append(None)
    arrays['incumbent'].append(None)
    arrays['winner'].append(None)

    arrays['sldu_votes_D'].append(0)
    arrays['sldu_votes_R'].append(0)

    arrays['uspres_votes_D'].append(_first_votes(subdf_2016uspres, 'Hillary Clinton'))
    arrays['uspres_votes_R'].append(_first_votes(subdf_2016uspres, 'Donald J. Trump'))


df_2016new = pandas.DataFrame(arrays)
print(df_2016new.shape)
print('SLDU Votes:', df_2016new.sldu_votes_D.sum(), df_2016new.sldu_votes_R.sum())
print('US Pres Votes:', df_2016new.uspres_votes_D.sum(), df_2016new.uspres_votes_R.sum())
df_2016new

(5679, 9)
SLDU Votes: 1823676 2242349
US Pres Votes: 2187676 2362130


Unnamed: 0,county_desc,precinct_code,district,incumbent,winner,sldu_votes_D,sldu_votes_R,uspres_votes_D,uspres_votes_R
0,ALAMANCE,01,24,R,,474,1826,0,0
1,ALAMANCE,02,24,R,,436,1964,0,0
2,ALAMANCE,035,24,R,,950,1530,0,0
3,ALAMANCE,03C,24,R,,641,1172,0,0
4,ALAMANCE,03N,24,R,,960,1243,0,0
...,...,...,...,...,...,...,...,...,...
5674,YANCEY,07 BRU,,,,0,0,91,163
5675,YANCEY,08 CRA,,,,0,0,522,1260
5676,YANCEY,09 SOU,,,,0,0,596,766
5677,YANCEY,10 PEN,,,,0,0,106,270


## Assign Precinct Geography PSIDs

Read precinct PSID values from `NC-Geographies.gpkg`, merge by county name and precinct ID.

In [6]:
# Precinct layer of the geography package; the merge below uses its
# county_name, precinct_id and psid columns.
geogs_2016 = geopandas.read_file('NC-Geographies.gpkg', layer='precincts')

# Prefix every PSID with 'PSID:'. Plain concatenation gives the same result
# as substituting at the regex anchor '^' and does not depend on the default
# regex mode of Series.str.replace.
geogs_2016['psid'] = 'PSID:' + geogs_2016.psid.astype(str)

# Left merge keeps every vote row even when no precinct geography matches;
# such rows get missing values in the geography columns.
df_2016out = df_2016new.merge(geogs_2016, how='left',
                              left_on=['county_desc', 'precinct_code'],
                              right_on=['county_name', 'precinct_id'])

df_2016out

Unnamed: 0,county_desc,precinct_code,district,incumbent,winner,sldu_votes_D,sldu_votes_R,uspres_votes_D,uspres_votes_R,psid,gid,year,county_fips,county_name,precinct_id,geometry
0,ALAMANCE,01,24,R,,474,1826,0,0,PSID:1158854937,2344.0,2016,1,ALAMANCE,01,(POLYGON ((-79.43197308657012 35.8892919150817...
1,ALAMANCE,02,24,R,,436,1964,0,0,PSID:1158848961,2342.0,2016,1,ALAMANCE,02,(POLYGON ((-79.53483899133798 36.0438210001847...
2,ALAMANCE,035,24,R,,950,1530,0,0,PSID:1158850573,2337.0,2016,1,ALAMANCE,035,(POLYGON ((-79.50051299120325 36.1542650001393...
3,ALAMANCE,03C,24,R,,641,1172,0,0,PSID:1158854377,2350.0,2016,1,ALAMANCE,03C,(POLYGON ((-79.5203239911727 36.08219700003401...
4,ALAMANCE,03N,24,R,,960,1243,0,0,PSID:1158852503,2351.0,2016,1,ALAMANCE,03N,(POLYGON ((-79.49082799096274 36.1056899996169...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5674,YANCEY,07 BRU,,,,0,0,91,163,PSID:1158854169,1089.0,2016,100,YANCEY,07 BRU,(POLYGON ((-82.19135299166359 35.9750180004796...
5675,YANCEY,08 CRA,,,,0,0,522,1260,PSID:1158852683,1092.0,2016,100,YANCEY,08 CRA,(POLYGON ((-82.18617999165113 35.9435280000667...
5676,YANCEY,09 SOU,,,,0,0,596,766,PSID:1158854577,1096.0,2016,100,YANCEY,09 SOU,(POLYGON ((-82.22187599103363 35.8651619998303...
5677,YANCEY,10 PEN,,,,0,0,106,270,PSID:1158853255,1095.0,2016,100,YANCEY,10 PEN,(POLYGON ((-82.26803599172992 35.7637380002675...


## Group Votes by Precinct

Add up all votes, grouping by precinct and concatenating multiple districts and incumbents.

In [7]:
# One group per precinct, keyed by county name and precinct code.
precinct_group_keys = ['county_desc', 'precinct_code']
grouped = df_2016out.groupby(precinct_group_keys)

def semicolon(series):
    """Join the distinct, non-missing values of `series` with semicolons.

    Values are converted to str, de-duplicated and sorted so the result is
    the same on every run. Missing values (None or any NaN) and other falsy
    values such as the empty string are skipped.

    Parameters:
        series: pandas Series of scalar values.

    Returns:
        A string such as 'D;R', or '' when nothing remains.
    """
    # pandas.isna is checked first: NaN is truthy, so truthiness alone would
    # let it through as the text 'nan'.
    labels = {str(v) for v in series.values if not pandas.isna(v) and v}
    return ';'.join(sorted(labels))

def doit(df):
    """Collapse the rows of one precinct group into a one-row DataFrame.

    The psid, district, winner and incumbent columns are combined with
    semicolon(); the four vote-count columns are summed.
    """
    label_columns = ('psid', 'district', 'winner', 'incumbent')
    vote_columns = ('sldu_votes_D', 'sldu_votes_R',
                    'uspres_votes_D', 'uspres_votes_R')

    # Insertion order fixes the output column order: labels, then votes.
    summary = {}
    for column in label_columns:
        summary[column] = [semicolon(df[column])]
    for column in vote_columns:
        summary[column] = [df[column].sum()]

    return pandas.DataFrame(summary)

# Reduce each precinct group to a single summary row with doit().
df_2016final = grouped.apply(doit)

# Display the result as the cell's output.
df_2016final

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,psid,district,winner,incumbent,sldu_votes_D,sldu_votes_R,uspres_votes_D,uspres_votes_R
county_desc,precinct_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ALAMANCE,01,0,PSID:1158854937,24,,R,474,1826,411,1865
ALAMANCE,02,0,PSID:1158848961,24,,R,436,1964,403,2004
ALAMANCE,035,0,PSID:1158850573,24,,R,950,1530,995,1485
ALAMANCE,03C,0,PSID:1158854377,24,,R,641,1172,679,1059
ALAMANCE,03N,0,PSID:1158852503,24,,R,960,1243,1036,1160
...,...,...,...,...,...,...,...,...,...,...
YANCEY,07 BRU,0,PSID:1158854169,47,,R,117,144,91,163
YANCEY,08 CRA,0,PSID:1158852683,47,,R,717,1098,522,1260
YANCEY,09 SOU,0,PSID:1158854577,47,,R,669,694,596,766
YANCEY,10 PEN,0,PSID:1158853255,47,,R,154,216,106,270


In [8]:
# Write the per-precinct summary, index included, to a CSV file.
output_csv = 'df_2016-senate.csv'
df_2016final.to_csv(output_csv)