In [1]:
import numpy as np
import pandas as pd
from src.modules import *
import os

In [2]:
# All raw election data is stored in the same folder, ordered by the State abbreviation.

elec_folder_path = 'data/raw_elec_totals'

# os.listdir() returns entries in arbitrary order. Data loaded from this list is
# indexed by position further down, so sort the filenames to get a stable,
# reproducible order on every machine.
OH_files = sorted(
    file for file in os.listdir(elec_folder_path)
    if os.path.isfile(os.path.join(elec_folder_path, file)) and file.startswith('OH')
)

#create empty list to hold OH formatted dataframes
formatted_OH = [] 
OH_names = []

In [3]:
# Get a list of sheetnames in the OH data files
for i in OH_files:
    # Open each workbook in a context manager so its file handle is closed
    # as soon as the sheet names have been printed.
    with pd.ExcelFile(os.path.join(elec_folder_path, i)) as file:
        print(file.sheet_names)

['Contents', 'Master', 'President', 'U.S. Congress', 'General Assembly', 'Judicial']
['U.S. Congress']
['Contents', 'Master', 'President', 'U.S. Congress', 'General Assembly', 'State Board of Education', 'Judicial']
['Contents', 'Master', 'Statewide Offices', 'U.S. Congress', 'Gen Assembly', 'State Board of Education', 'Judicial']
['Contents', 'Master', 'President and Vice President', 'U.S. Congress', 'Ohio General Assembly', 'State Board of Education', 'Judicial']


In [4]:
OH_data = []

for i in OH_files:
    # Every workbook is read from its 'U.S. Congress' sheet; header=1 takes the
    # column labels from the second row of the sheet.
    file = pd.read_excel(f'{elec_folder_path}/{i}', sheet_name='U.S. Congress', header=1)

    # Generate a name for each dataframe based on the filename without the file extension.
    # splitext removes the whole extension regardless of its length (slicing off a
    # fixed four characters left a stray '.' behind for five-character extensions).
    name = os.path.splitext(i)[0] + '_f'

    # Append both to the empty list as a (name, dataframe) pair
    OH_data.append((name, file))


In [5]:
# Report the dimensions and the column labels of every loaded dataframe
for label, frame in OH_data:
    print(f"{label} has a shape of {frame.shape}")
    print(frame.columns)

OH_2012precinct._f has a shape of (9232, 56)
Index(['County Name', 'Precinct Name', 'Precinct Code', 'Region Name',
       'Media Market', 'Registered Voters', 'Total Voters',
       'Turnout Percentage', 'Sherrod Brown (D)', 'Josh Mandel (R)',
       'Scott Rupert', 'Jim Berns (L)', 'Steve Chabot (R)', 'Jeff Sinnard (D)',
       'Rich Stevenson (G)', 'William Smith (D)', 'Brad Wenstrup (R)',
       'Joyce Beatty (D)', 'Jeff Brown (WI)*', 'Richard Ehrbar (L)',
       'Bob Fitrakis (G)', 'Chris Long (R)', 'Jim Jordan (R)',
       'Chris Kalla (L)', 'Jim Slone (D)', 'Eric Eberly (L)', 'Bob Latta (R)',
       'Angela Zimmann (D)', 'Bill Johnson (R)', 'Charlie Wilson (D)',
       'Bob Gibbs (R)', 'Joyce Healy-Abrams (D)', 'John Boehner (R)',
       'James Condit (WI)*', 'Marcy Kaptur (D)', 'Sean Stipe (L)',
       'Samuel Wurzelbacher (R)', 'David Harlow (L)', 'Sharen Neuhardt (D)',
       'Mike Turner (R)', 'Marcia Fudge (D)', 'Jim Reese (D)',
       'Pat Tiberi (R)', 'Marisha Agana (R)',

In [6]:
# The Ohio data appears to be formatted with rows for each county and precinct, and columns for each Candidate (amongst other columns of data)
# While the Candidates here are identified with the party initial, in the interest of consistency we should format them to only have the candidate names
# since we will be getting the party IDs from the FEC data

## The final format will also have to match the IRS data, meaning that we will need to retain the orientation of the data (rows for each county)

# Our goals for formatting this data will be - 
## Build a dataframe with columns for each Candidate containing:

### Vote totals broken down by county

### Party ID for that Candidate

### A binary marker for Incumbent/Challenger
#### These last two will provide the keys to merging candidate vote totals, since our dependent variables in the initially-proposed analysis will be 'Vote Total for Incumbents' and 'Vote Total for Challengers'
#### Tagging candidates by Party ID will allow future analysis to examine effects of party popularity on the IRS and vote-total data

## The list of columns should be reduced to:

### County names
#### This will be the Primary Key for merging the IRS data
#### These counties should be formatted in all lowercase to match the IRS data and to avoid any inconsistencies with naming conventions year-to-year

### Candidate names
#### Each candidate should have their name formatted to match the FEC data while avoiding differences in formatting between States ('Lastname, Firstname' vs 'Firstname M. Lastname' vs. 'Firstname Middlename Lastname' vs. '(Title) Firstname Lastname(, Jr.)', etc etc) and year-to-year
#### Anticipated differences include name suffixes ("Jr., Sr.") and titles ("Dr.") which are included inconsistently across the datasets
#### The formatting initially agreed on is 'Firstname Lastname' to balance legibility, consistency, and ease
##### EXPECTED ISSUES: It is conceivable that candidates will have the same first and last names within a given year
##### Because we are transforming the data for each State in each year before joining, replacing Candidate names with Party ID and Incumbency means that data is unlikely to become polluted by merging vote totals for candidates with the same simplified name.
##### Nonetheless, future analysis may benefit from developing more rigorous methods of ensuring this pollution cannot occur

In [7]:
# Before general formatting, check the health of the data, especially the consistency of county names.
# The county column is located by its 'County' prefix because its exact label differs between files.
for year, frame in OH_data:
    county_cols = [label for label in frame.columns if label.startswith('County')]
    n_counties = frame[county_cols[0]].nunique()
    print(f"{year} has {n_counties} unique counties")

OH_2012precinct._f has 90 unique counties
OH_2014precinct._f has 89 unique counties
OH_2016precinct._f has 90 unique counties
OH_2018precinct._f has 90 unique counties
OH_2020precinct._f has 90 unique counties


In [8]:
# We know (internet) that Ohio has 88 counties, so we expect any additional entries in the 'County' data field to be aggregate totals or something similar

In [9]:
# Similar to our examination of the IRS data, we will build a list of County Names common to all years and quickly examine it for the expected aggregate fields
county_sets = []
for year, i in OH_data:
    # Locate the county column by prefix for EVERY dataset (the label is not the
    # same in all files), rather than hard-coding it for the first one.
    county_col = [col for col in i.columns if col.startswith('County')]  # Get the name of the column containing county names
    county_sets.append(set(i[county_col[0]].tolist()))

# Values present in every dataset, as an alphabetically sorted list
common_counties = sorted(set.intersection(*county_sets))
print(common_counties)
print(len(common_counties))

['Adams', 'Allen', 'Ashland', 'Ashtabula', 'Athens', 'Auglaize', 'Belmont', 'Brown', 'Butler', 'Carroll', 'Champaign', 'Clark', 'Clermont', 'Clinton', 'Columbiana', 'Coshocton', 'Crawford', 'Cuyahoga', 'Darke', 'Defiance', 'Delaware', 'Erie', 'Fairfield', 'Fayette', 'Franklin', 'Fulton', 'Gallia', 'Geauga', 'Greene', 'Guernsey', 'Hamilton', 'Hancock', 'Hardin', 'Harrison', 'Henry', 'Highland', 'Hocking', 'Holmes', 'Huron', 'Jackson', 'Jefferson', 'Knox', 'Lake', 'Lawrence', 'Licking', 'Logan', 'Lorain', 'Lucas', 'Madison', 'Mahoning', 'Marion', 'Medina', 'Meigs', 'Mercer', 'Miami', 'Monroe', 'Montgomery', 'Morgan', 'Morrow', 'Muskingum', 'Noble', 'Ottawa', 'Paulding', 'Perry', 'Pickaway', 'Pike', 'Portage', 'Preble', 'Putnam', 'Richland', 'Ross', 'Sandusky', 'Scioto', 'Seneca', 'Shelby', 'Stark', 'Summit', 'Total', 'Trumbull', 'Tuscarawas', 'Union', 'Van Wert', 'Vinton', 'Warren', 'Washington', 'Wayne', 'Williams', 'Wood', 'Wyandot']
89


In [10]:
# The total length of '89' for this confirms (at least superficially) that we have all of the counties, plus an extra common to all datasets
# As expected, we easily spot a 'Total' in there
# To find the other +1 contained only in the 2012 and 2016-2020 data, we'll drop 'Total' and print all strings not found in the 2014 data


In [11]:
# Now is also a good time to mention that our data here is split by precinct, with each county having multiple precincts:

In [12]:
# Preview the first ten precinct names of each dataset
for label, frame in OH_data:
    preview = frame['Precinct Name'].head(10)
    print(preview)

0                  NaN
1                  NaN
2     0101 BRATTON TWP
3     0201 CEDAR MILLS
4            0301 LYNX
5    0401 LOCUST GROVE
6       0601 GREEN TWP
7    0501 ROME VILLAGE
8     0701 CHURN CREEK
9         0801 WAMSLEY
Name: Precinct Name, dtype: object
0                      NaN
1         BRATTON TOWNSHIP
2          BRUSH CREEK TWP
3           GREEN TOWNSHIP
4            JEFFERSON TWP
5            LIBERTY NORTH
6            LIBERTY SOUTH
7             LOCUST GROVE
8    MANCHESTER UNITED TWP
9           MEIGS TOWNSHIP
Name: Precinct Name, dtype: object
0                           NaN
1                           NaN
2              BRATTON TOWNSHIP
3          BRUSH CREEK TOWNSHIP
4                  LOCUST GROVE
5                GREEN TOWNSHIP
6            JEFFERSON TOWNSHIP
7                 LIBERTY SOUTH
8    MANCHESTER UNITED TOWNSHIP
9                MEIGS TOWNSHIP
Name: Precinct Name, dtype: object
0                           NaN
1                           NaN
2          

In [13]:
# So for the final dataframe we will just need to group the vote-totals by County

In [14]:
# The second loaded dataset (the one whose county column is labelled 'County')
# is the baseline; every other dataset is compared against it.
base_data = OH_data[1][1]['County'].tolist()
# Set copy for constant-time membership tests; checking each of the thousands of
# precinct rows against the list form made this loop needlessly quadratic.
base_lookup = set(base_data)

sample_datasets = OH_data[:1] + OH_data[2:]
for year, i in sample_datasets:
    current_counties = i['County Name'].tolist()
    for county in current_counties:
        # Print every county-column value that the baseline does not contain
        if county not in base_lookup:
            print(county)

Percentage
Percentage
Percentage
Percentage


In [15]:
# We'll need to drop rows with 'Total' and 'Percentage' as the County name to be left with only data for the 88 counties in Ohio
# But this should be verified with the IRS data to ensure that there are no differences in formatting or spelling errors

In [16]:
# To format the candidate names in the Ohio data, we will build a general purpose function to trim the party identifier, which in the OH data is bracketed by '()' or '()*' 
# We can also use this fact to pull all candidate names from the columns by creating a list of column names filtering those ending in ')' or '*'

In [31]:
def trim_party(data,delimiter='('):
    """
    Strip the party designation from each candidate name.

    Each item of `data` is cut at the first occurrence of `delimiter` (default
    '(', the marker used by the OH SoS) and only the leading piece is kept,
    with surrounding whitespace removed. Passing another delimiter lets the
    same routine return, for example, just the first name from a column that
    also holds middle names.

    Returns a list of the trimmed strings, in the same order as `data`.
    """
    # Only the text before the first delimiter is needed, so split once.
    return [cand.split(delimiter, 1)[0].strip() for cand in data]

In [None]:
# A cursory look at the candidate names also reveals that some have their middle initial.
# We will use a function to remove this, creating a candidate name containing only the first and last words after stripping any other titles or markers

In [None]:
def remove_middle_name(data):
    """
    Reduce each full name in `data` to 'Firstname Lastname'.

    Rules applied to every item:
      * Items with fewer than two whitespace-separated words are returned
        unchanged (e.g. a 'County' column label).
      * A trailing word that ends in '.' or equals 'jr'/'sr' (any case) is
        treated as a suffix and dropped before the last name is picked.
      * A comma left on the last name by a ', Jr.' style suffix is removed.
      * If only one word remains after dropping the suffix, that word alone
        is returned rather than being repeated as both first and last name.

    Parameters
    ----------
    data : iterable of str
        Names (or column labels) to simplify.

    Returns
    -------
    list of str
        Simplified names in the same order as `data`.
    """
    suffixes = {'jr', 'sr'}
    no_middle = []
    for full_name in data:
        parts = full_name.split()  # split once and reuse
        if len(parts) < 2:
            # nothing to trim: single word, empty or whitespace-only item
            no_middle.append(full_name)
            continue

        tail = parts[-1]
        if tail.endswith('.') or tail.lower() in suffixes:
            # comparing on .lower() ignores case differences between datasets
            parts = parts[:-1]

        if len(parts) == 1:
            # name was just 'First Jr.' - do not duplicate the first name
            no_middle.append(parts[0])
        else:
            # rstrip(',') cleans 'Smith,' left over from 'Smith, Jr.'
            no_middle.append(parts[0] + ' ' + parts[-1].rstrip(','))
    return no_middle


In [28]:
def format_OH(data):
    """
    Prepare one sheet of Ohio Sec. of State election data for merging.

    Steps, in order:
      1. Pick the candidate columns: headers ending in ')' (party marker) or
         '*' (write-in marker).
      2. Rename 'County Name' to 'County' so the county column has the same
         label for every year.
      3. Drop the aggregate rows whose county value is 'Total' or 'Percentage'.
      4. Keep only the first column (the county column) plus the candidate columns.
      5. Simplify candidate headers with trim_party and remove_middle_name.
      6. Sum the precinct rows up to one row per county.

    Returns a new dataframe with a 'County' column followed by one column per
    candidate; the input dataframe is not modified.

    NOTE - header simplification can make two candidates collide if they share
    a first and last name; edge case, but must be accounted for in the future.
    """
    rename_map = {'County Name': 'County'}   # unify the county column label
    aggregate_rows = ['Total', 'Percentage'] # rows holding aggregates, not county data

    # candidate columns are identified from the headers as delivered by OH
    candidate_cols = [col for col in data.columns if col.endswith((')', '*'))]

    renamed = data.rename(columns=rename_map)

    # the county column is always the first one; put it in front of the candidates
    keep_cols = [renamed.columns[0]] + candidate_cols

    # discard aggregate rows, then narrow to the columns of interest
    # (future cases may want to branch here to evaluate races other than House races)
    county_rows = renamed.loc[~renamed['County'].isin(aggregate_rows)]
    votes = county_rows[keep_cols].copy()

    # strip party designations, then middle names/suffixes, for FEC compatibility
    votes.columns = remove_middle_name(list(trim_party(votes.columns)))

    # OH reports by precinct, we only need totals by 'County'
    return votes.groupby('County').sum().reset_index()