In [4]:
import numpy as np
import pandas as pd
from src.modules import *
import os

In [6]:
# Folder (relative to the notebook) that is scanned for the raw election files
elec_folder_path = 'data/raw_elec_totals'

# Keep only regular files (not sub-folders) whose names begin with 'OH'
OH_files = [
    entry
    for entry in os.listdir(elec_folder_path)
    if entry.startswith('OH') and os.path.isfile(os.path.join(elec_folder_path, entry))
]

# Empty lists: one for the formatted OH dataframes, one (presumably) for their names
formatted_OH, OH_names = [], []

In [None]:
#List every regular file in the election folder whose name starts with 'OH'
#(this repeats the OH_files selection made in the earlier cell)
OH_files = [
    fname
    for fname in os.listdir(elec_folder_path)
    if fname.startswith('OH') and os.path.isfile(os.path.join(elec_folder_path, fname))
]


In [12]:
# Print the worksheet names of every Ohio workbook so we can see which sheet
# names are shared across all of the files.
for workbook_name in OH_files:
    # The context manager closes the workbook handle as soon as the names are
    # read; a bare pd.ExcelFile(...) in a loop leaves every file open.
    with pd.ExcelFile(os.path.join(elec_folder_path, workbook_name)) as workbook:
        print(workbook.sheet_names)

['Contents', 'Master', 'President', 'U.S. Congress', 'General Assembly', 'Judicial']
['U.S. Congress']
['Contents', 'Master', 'President', 'U.S. Congress', 'General Assembly', 'State Board of Education', 'Judicial']
['Contents', 'Master', 'Statewide Offices', 'U.S. Congress', 'Gen Assembly', 'State Board of Education', 'Judicial']
['Contents', 'Master', 'President and Vice President', 'U.S. Congress', 'Ohio General Assembly', 'State Board of Education', 'Judicial']


In [8]:
# Load the 'U.S. Congress' sheet of every Ohio workbook as (name, dataframe) pairs
OH_data = []

for i in OH_files:
    # header=1 tells pandas to take the column labels from the sheet's second row
    file = pd.read_excel(os.path.join(elec_folder_path, i), sheet_name='U.S. Congress', header=1)

    # Generate a name for each dataframe based on the filename without the file extension.
    # splitext copes with any extension length; the previous fixed slice [:-4]
    # left the dot behind (e.g. 'OH_2012precinct._f').
    name = os.path.splitext(i)[0] + '_f'

    # Append both to the list
    OH_data.append((name, file))


In [17]:
# Show each loaded dataframe's name, dimensions and column labels
for label, frame in OH_data:
    print(f"{label} has a shape of {frame.shape}")
    print(frame.columns)

OH_2012precinct._f has a shape of (9232, 56)
Index(['County Name', 'Precinct Name', 'Precinct Code', 'Region Name',
       'Media Market', 'Registered Voters', 'Total Voters',
       'Turnout Percentage', 'Sherrod Brown (D)', 'Josh Mandel (R)',
       'Scott Rupert', 'Jim Berns (L)', 'Steve Chabot (R)', 'Jeff Sinnard (D)',
       'Rich Stevenson (G)', 'William Smith (D)', 'Brad Wenstrup (R)',
       'Joyce Beatty (D)', 'Jeff Brown (WI)*', 'Richard Ehrbar (L)',
       'Bob Fitrakis (G)', 'Chris Long (R)', 'Jim Jordan (R)',
       'Chris Kalla (L)', 'Jim Slone (D)', 'Eric Eberly (L)', 'Bob Latta (R)',
       'Angela Zimmann (D)', 'Bill Johnson (R)', 'Charlie Wilson (D)',
       'Bob Gibbs (R)', 'Joyce Healy-Abrams (D)', 'John Boehner (R)',
       'James Condit (WI)*', 'Marcy Kaptur (D)', 'Sean Stipe (L)',
       'Samuel Wurzelbacher (R)', 'David Harlow (L)', 'Sharen Neuhardt (D)',
       'Mike Turner (R)', 'Marcia Fudge (D)', 'Jim Reese (D)',
       'Pat Tiberi (R)', 'Marisha Agana (R)',

In [1]:
# The Ohio data appears to be formatted with rows for each county and precinct, and columns for each Candidate (amongst other columns of data)
# While the Candidates here are identified with the party initial, in the interest of consistency we should format them to only have the candidate names
# since we will be getting the party IDs from the FEC data

## The final format will also have to match the IRS data, meaning that we will need to retain the orientation of the data (rows for each county)

# Our goals for formatting this data will be - 
## Build a dataframe with columns for each Candidate containing:

### Vote totals broken down by county

### Party ID for that Candidate

### A binary marker for Incumbent/Challenger
#### These last two will provide the keys to merging candidate vote totals, since our dependent variables in the initially-proposed analysis will be 'Vote Total for Incumbents' and 'Vote Total for Challengers'
#### Tagging candidates by Party ID will allow future analysis to examine effects of party popularity on the IRS and vote-total data

## The list of columns should be reduced to:

### County names
#### This will be the Primary Key for merging the IRS data
#### These counties should be formatted in all lowercase to match the IRS data and to avoid any inconsistencies with naming conventions year-to-year

### Candidate names
#### Each candidate should have their name formatted to match the FEC data, the simplest of which is first and last name only (no party tag or middle name)

In [None]:
def format_OH(data):
    """
    Prepare precinct-level election data from the Ohio Sec. of State.

    Keeps the county column plus every candidate column, drops the summary
    rows, cleans the candidate column labels with the project helpers
    ``trim_party`` and ``remove_middle_name``, and sums the rows per county.

    Parameters
    ----------
    data : pandas.DataFrame
        One sheet of an Ohio results workbook. It must contain a
        'County Name' (or already renamed 'County') column. Candidate columns
        are the ones whose label ends with ')' (party tag) or '*' (write-in).

    Returns
    -------
    pandas.DataFrame
        One row per county: a 'County' column followed by the summed
        candidate columns.

    Raises
    ------
    KeyError
        If the input has neither a 'County Name' nor a 'County' column.
    """

    col_dic = {'County Name':'County'} #column names to reformat
    bad_counties = ['Total','Percentage'] #rows with totals rather than county data

    data = data.rename(columns=col_dic)
    if 'County' not in data.columns:
        raise KeyError("format_OH expects a 'County Name' (or 'County') column")

    #remove general information about the election, we only want the details about each candidate,
    #OH marks candidates by party or * if write-in. Non-string labels are skipped rather than crashing.
    candidates = [
        col for col in data.columns
        if isinstance(col, str) and col.endswith((')', '*'))
    ]

    #select the county column by label instead of assuming it is the first column,
    #so the selection agrees with the 'County' lookups below
    keep = ['County'] + candidates

    #use defined "bad terms" to remove unneeded information from the table
    #future cases may want to split the flow here, to evaluate other races than House races
    is_county_row = ~data['County'].isin(bad_counties)
    data_copy = data.loc[is_county_row, keep].copy()

    #apply trim_party to get just candidate names w/o party designation
    data_copy.columns = trim_party(data_copy.columns)

    #ensure all column names will be compatible with the FEC data for future mergers
    ###
    ### NOTE - this step will be complicated for states that have candidates with the same first and last name running - edge cases, but must be accounted for in the future
    ###
    data_copy.columns = remove_middle_name(list(data_copy.columns))

    #OH reports by precinct, we only need data by 'County'
    data_copy = data_copy.groupby('County').sum().reset_index()
    return data_copy