In [1]:
import numpy as np
import pandas as pd
from modules import *

import os #Used when reading/writing csv files programatically

In [2]:
# Folder that holds the FEC candidate files
fec_folder_path = '../data/FEC/'

# Keep regular files only (sub-directories are skipped).
# os.listdir() returns entries in arbitrary order, so sort them: the dataframes built
# from this list are later paired positionally (via zip) with the state election files,
# and that pairing must be deterministic.
fec_files = sorted(
    file for file in os.listdir(fec_folder_path)
    if os.path.isfile(os.path.join(fec_folder_path, file))
)

In [3]:
# List that collects one dataframe per FEC file, in the same order as fec_files
FEC_files = []

for fec_filename in fec_files:

    # Read the file, using its first column as the dataframe index
    fec_df = pd.read_csv(os.path.join(fec_folder_path, fec_filename), index_col=0)

    # Name the dataframe after the file, minus its extension.
    # splitext copes with any extension length, unlike a fixed [:-4] slice.
    fec_name = os.path.splitext(fec_filename)[0]

    # globals() returns the notebook's global namespace as a dict, so this creates a
    # top-level variable named after the file that points at the dataframe
    globals()[fec_name] = fec_df

    # Also keep the dataframe in the list for positional access
    FEC_files.append(fec_df)

In [4]:
# Preview the first FEC dataframe that was loaded
FEC_files[0]

Unnamed: 0_level_0,D,CANDIDATE NAME (First),CANDIDATE NAME (Last),CANDIDATE NAME(f),CANDIDATE NAME,PARTY,(I),GENERAL VOTES
STATE ABBREVIATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AL,01,Jo,Bonner,jo bonner,"Bonner, Jo",R,1,196374
AL,02,Martha,Roby,martha roby,"Roby, Martha",R,1,180591
AL,02,Therese,Ford,therese ford,"Ford, Therese",D,0,103092
AL,03,Mike,Rogers,mike rogers,"Rogers, Mike",R,1,175306
AL,03,John Andrew,Harris,john harris,"Harris, John Andrew",D,0,98141
...,...,...,...,...,...,...,...,...
WY,00,Daniel Clyde,Cummings,daniel cummings,"Cummings, Daniel Clyde",OTHER,0,4963
WY,00,Don,Wills,don wills,"Wills, Don",OTHER,0,3775
WY,S,John,Barrasso,john barrasso,"Barrasso, John",R,1,185250
WY,S,Tim,Chesnut,tim chesnut,"Chesnut, Tim",D,0,53019


In [5]:
# Folder that holds the raw state election totals
elec_folder_path = '../data/raw_elec_totals'

# Keep only regular files whose name starts with 'PA'.
# os.listdir() returns entries in arbitrary order, so sort them: the dataframes built
# from this list are later paired positionally (via zip) with the FEC dataframes,
# and that pairing must be deterministic.
PA_files = sorted(
    file for file in os.listdir(elec_folder_path)
    if file.startswith('PA') and os.path.isfile(os.path.join(elec_folder_path, file))
)

In [6]:
def candidate_firstname(data, delimiter=' '):
    """
    Get the first name from candidate names stored as 'LastName, First Name middlename/title'.

    Each name is split on `delimiter` and the second non-empty token is returned,
    stripped of surrounding whitespace.

    Parameters
    ----------
    data : iterable of str
        Candidate names, e.g. a pandas Series or a list.
    delimiter : str, default ' '
        Separator used to split each name into tokens.

    Returns
    -------
    list of str
        One first name per input name. An empty string is returned for a name
        that has no second token (e.g. a single-word entry) instead of raising IndexError.
    """
    first_names = []
    for cand in data:
        # Ignore empty tokens produced by repeated delimiters (e.g. a double space),
        # which would otherwise be picked up as the "first name"
        tokens = [token.strip() for token in cand.split(delimiter) if token.strip()]
        first_names.append(tokens[1] if len(tokens) > 1 else '')
    return first_names

In [7]:
def format_PA(data):
    """
    Format election data as recorded by the Pennsylvania Department of State
    into one row per county and one vote-total column per candidate.

    The PA DoS elections website allows you to generate data for individual election years.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Candidate Name', 'County Name' and 'Votes' columns.
        The frame is copied, so the caller's object is not modified.

    Returns
    -------
    pandas.DataFrame
        'County Name' followed by one column per unique formatted candidate name
        (lower-case, letters and whitespace only, in order of first appearance).
        Each candidate column holds that candidate's summed votes per county;
        counties where a candidate has no rows are filled with 0.

    Raises
    ------
    ValueError
        If `data` contains no candidates.
    """
    # Work on a copy: callers pass a filtered slice, and adding columns to it would
    # mutate their data and trigger pandas' SettingWithCopyWarning
    data = data.copy()

    # Build a 'first last' name so it matches the FEC data.
    # trim_party comes from the project's modules import; presumably it returns the text
    # before the delimiter (the last name) -- confirm against its definition.
    data['CanLastName'] = trim_party(data['Candidate Name'], ',')
    data['CanFirstName'] = candidate_firstname(data['Candidate Name'], ' ')
    data['Candidate Name(f)'] = data['CanFirstName'] + ' ' + data['CanLastName']
    # Format names as lower-case
    data['Candidate Name(f)'] = data['Candidate Name(f)'].astype(str).str.lower()

    # Reduce names to letters and whitespace, in case any punctuation remains.
    # This is applied to the column itself (not only to the candidate list) so that
    # the row filter below still matches every candidate's rows.
    data['Candidate Name(f)'] = [
        ''.join(char for char in name if char.isalpha() or char.isspace())
        for name in data['Candidate Name(f)']
    ]

    # Unique candidates, in order of first appearance
    cand_list = list(data['Candidate Name(f)'].unique())
    if not cand_list:
        raise ValueError("format_PA received no candidate rows to format")

    # All races appear in a single sheet: build one per-county vote table per candidate,
    # naming the vote column after the candidate, and outer-merge the tables on county
    merged_df = None
    for cand in cand_list:
        candidate_df = data.loc[data['Candidate Name(f)'] == cand, ['County Name', 'Votes']]
        candidate_df = candidate_df.groupby('County Name').sum().reset_index()
        candidate_df = candidate_df.rename(columns={'Votes': cand})
        if merged_df is None:
            merged_df = candidate_df
        else:
            merged_df = pd.merge(merged_df, candidate_df, on='County Name', how='outer')

    # A candidate with no rows in a county received zero votes there
    merged_df = merged_df.fillna(0)
    return merged_df

In [8]:
# Create empty lists to hold dataframes and their names
# Accumulators for the formatted PA dataframes and their matching names
formatted_PA, PA_names = [], []

In [9]:
# Start from empty accumulators so that re-running this cell does not append duplicates
formatted_PA = []
PA_names = []

for pa_filename in PA_files:
    # Read the 'Official' sheet of the workbook
    raw_df = pd.read_excel(os.path.join(elec_folder_path, pa_filename), sheet_name='Official')

    # Keep only congressional races; copy so the filtered rows are an independent
    # dataframe rather than a slice of raw_df
    is_congress = raw_df['Office Name'].str.contains('congress', case=False, na=False)
    congress_df = raw_df[is_congress].copy()

    # Apply PA formatting function
    formatted = format_PA(congress_df)

    # Name the dataframe after the file, minus its extension, plus an '_f' suffix.
    # splitext copes with any extension length, unlike a fixed [:-5] slice.
    name = os.path.splitext(pa_filename)[0] + '_f'

    # Keep dataframes and names in two parallel lists
    formatted_PA.append(formatted)
    PA_names.append(name)


In [10]:

# Pair each formatted state dataframe with an FEC dataframe and the state dataframe's name.
# Pairing is positional, and zip stops at the shortest of the three lists.
# Materialised as a list because a bare zip iterator is exhausted after one pass,
# which would make any cell that loops over it silently do nothing when re-run.
zipped_PA_FEC = list(zip(formatted_PA, FEC_files, PA_names))

In [11]:
# Further process and transform election data, grouping vote totals by party and incumbency.
# Allows analysis on these two metrics.

# Make sure the output folder exists; to_csv does not create missing directories
output_dir = "../data/formatted_house_totals"
os.makedirs(output_dir, exist_ok=True)

# i = formatted state dataframe, j = FEC dataframe, k = name of the state dataframe.
# The short names are kept because a later cell reuses i and j after this loop.
for i, j, k in zipped_PA_FEC:

    # Joins FEC and State data for each year, produces list of counties as well.
    # If an error is generated here, there is likely a mismatch between the counties in these files.
    formatted_PA_FEC, counties = state_join_FEC(i, j)
    transformed_data = state_trans(formatted_PA_FEC, counties)

    # Write the transformed data to a .csv named after the first 7 characters of the dataframe name
    transformed_data.to_csv(os.path.join(output_dir, f"{k[:7]}.csv"), index=False)

In [None]:
# Inspect the dataframe names generated from the PA filenames
PA_names

In [None]:
# Inspect the first FEC dataframe
FEC_files[0]

In [None]:
# Print the column summary (names, dtypes, non-null counts) of the first formatted PA dataframe
formatted_PA[0].info()

In [None]:
# Manually join the first formatted PA dataframe with the first FEC dataframe.
# NOTE(review): the variable name suggests both index-0 entries are 2012 data -- confirm.
PA_2012_FEC_joined, counties = state_join_FEC(formatted_PA[0],FEC_files[0])

In [None]:
# Inspect the manually joined dataframe
PA_2012_FEC_joined

In [None]:
# Transform the manual join; this rebinds transformed_data, which the batch loop also assigns
transformed_data = state_trans(PA_2012_FEC_joined, counties)

In [None]:
# Inspect the first formatted PA dataframe
formatted_PA[0]

In [None]:
# Use the first formatted PA dataframe as the working frame for the step-by-step cells that follow
data = formatted_PA[0]

In [None]:
# The first column holds the county names
county_col = data.columns[0]

# Keep the lower-cased county names aside; they will be reinserted later for the merger with IRS data
counties = [county.lower() for county in data[county_col].tolist()]

# Independent copy of the frame without the county column
data_t = data.drop(columns=[county_col]).copy()


In [None]:
# Inspect the frame after the county column has been dropped
data_t

In [None]:

# Flip the frame: candidates become the rows and the per-county vote totals become the columns.
# This shape makes it easier to transform and group the candidates by party.
data_t = data_t.T
cand_list = data_t.index.tolist()


In [None]:
# Inspect the transposed frame
data_t

In [None]:

# This cell appears to be pasted from a function body (presumably state_join_FEC): the
# original ended in a top-level `return`, which is a SyntaxError in a notebook cell, and
# read an undefined `fec_data`. Bind fec_data to the first FEC dataframe, matching the
# first formatted PA dataframe that `data` was taken from.
fec_data = FEC_files[0]

# Index the transposed frame by candidate name
data_t.index = cand_list

# Merge FEC data, associating each candidate with their party and incumbency
data_t = pd.merge(data_t, fec_data, left_index=True, right_on='CANDIDATE NAME(f)').reset_index(drop=True)

# Display the merged candidates; the county names remain available in `counties`
data_t

In [None]:
# Re-run the FEC join on i and j.
# NOTE(review): i and j are not defined in this cell; presumably they are left over from the
# last iteration of the loop over zipped_PA_FEC -- confirm, as this depends on execution order.
formatted_PA_FEC, counties = state_join_FEC(i,j)

In [None]:

# Transform the joined dataframe produced by state_join_FEC
transformed_data = state_trans(formatted_PA_FEC, counties)

In [None]:
# Inspect the joined state/FEC dataframe
formatted_PA_FEC

In [None]:
# Number of distinct county names in the first formatted PA dataframe
formatted_PA[0]['County Name'].nunique()

In [None]:
# Number of distinct county names in the second formatted PA dataframe
formatted_PA[1]['County Name'].nunique()

In [None]:
# Number of distinct county names in the third formatted PA dataframe
formatted_PA[2]['County Name'].nunique()

In [None]:
# Number of distinct county names in the fourth formatted PA dataframe
formatted_PA[3]['County Name'].nunique()

In [None]:
# Number of distinct county names in the fifth formatted PA dataframe
formatted_PA[4]['County Name'].nunique()

In [None]:
# NOTE(review): a pandas Series has no .split method (string splitting lives under the .str
# accessor), and format_PA returns 'County Name' plus one column per candidate, so a
# 'Candidate Name' column is presumably absent here -- confirm which dataframe was intended.
data_split = formatted_PA[2]['Candidate Name'].split(',')

In [None]:
# formal_il ALMOST works; the candidate names still need to be split at the comma and then again at the space

In [None]:
# NOTE(review): formatted_PA[4] is already an output of format_PA, and format_PA reads a
# 'Candidate Name' column from its input; re-formatting it presumably fails -- confirm intent.
df = format_PA(formatted_PA[4])

In [None]:
# Inspect the result of the format_PA call above
df