In [1]:
import pandas as pd
import numpy as np
import pickle

In [4]:
# %load functions/houseFunctions.py
'''
Parse 1976-2016 house data from https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/IG0UN2

@relFilePath : file path of house results data, relative to python notebook

@return:  dataframe with one row per (year, state, district): the candidate with the most votes
'''

def parse_data(relFilePath, minYear=2010):
    '''
    Load the house results CSV and keep the top vote-getter of every race.

    @relFilePath : file path of house results data, relative to python notebook
    @minYear     : earliest election year (inclusive) to keep

    @return:  dataframe with one row per (year, state_po, district) race held in
              or after minYear -- the row with the most candidatevotes. Rows keep
              the labels and dtypes they had in the CSV and come back ordered by
              (year, state_po, district). If no race qualifies, an empty frame
              with the CSV columns is returned.
    '''
    data_df = pd.read_csv(relFilePath)

    recent_df = data_df[data_df['year'] >= minYear]
    if recent_df.empty:
        # Nothing to group; keep the column layout so callers can still select columns.
        return recent_df.copy()

    # One grouped idxmax replaces the per-race DataFrame.append loop: append was
    # removed in pandas 2.0 and re-copied the whole frame on every iteration.
    # idxmax gives the row label of the first maximum, i.e. the same winner
    # (including tie-breaking) as calling idxmax on each shard separately.
    winner_labels = recent_df.groupby(['year', 'state_po', 'district'])['candidatevotes'].idxmax()
    return data_df.loc[winner_labels.values]

def parse_index(full_interest, save=False, load=False, path='Datasets/master_index.p'):
    '''
    Build (or reload) the master index: one row per race, labelled STATE_DD_YEAR.

    @full_interest : dataframe with 'district', 'state_po' and 'year' columns;
                     it is not read when load is True
    @save          : if True, pickle the freshly built index to path
    @load          : if True, skip building and return the index pickled at path
    @path          : location of the pickle file, relative to python notebook

    @return:  dataframe with columns district, state, year (district and year as
              int) whose row labels look like 'AK_01_2012'
    '''
    if load:
        # NOTE: unpickling can run arbitrary code -- only load a file written by this notebook.
        with open(path, 'rb') as pickle_file:
            return pickle.load(pickle_file)

    # Make a dummy dataframe so everyone else can make complete dataframes
    master_index = (
        full_interest[['district', 'state_po', 'year']]
        .reset_index(drop=True)                     # discard the old labels, whatever the index is called
        .rename(columns={'state_po': 'state'})      # rename state code
        .astype({'year': int, 'district': int})
    )
    # make sure all districts start with 1
    master_index.loc[master_index['district'] == 0, 'district'] = 1

    # glue together the columns to get a more descriptive index
    master_index.index = [
        '{0}_{1:02d}_{2}'.format(state, district, year)
        for state, district, year in zip(
            master_index['state'], master_index['district'], master_index['year']
        )
    ]

    if save:
        # The with-block closes the handle, so the pickle is fully flushed before returning.
        with open(path, 'wb') as pickle_file:
            pickle.dump(master_index, pickle_file)
    return master_index
    
def fetch_training_set(full_interest):
    '''
    Reduce the winners table to the columns used for training and normalise party labels.

    @full_interest : dataframe with 'district', 'state_po', 'year', 'party',
                     'candidatevotes' and 'totalvotes' columns; it is left unmodified

    @return:  new dataframe holding only those six columns, with
              'democratic-farmer-labor' relabelled 'democrat' and
              'tax revolt' relabelled 'republican'
    '''
    # Minor-party labels and the major party they are counted as
    # (one entry each in 2012).
    party_aliases = {
        'democratic-farmer-labor': 'democrat',
        'tax revolt': 'republican',
    }
    wanted_columns = ['district', 'state_po', 'year', 'party', 'candidatevotes', 'totalvotes']

    # Work on an explicit copy: writing into a bare column slice triggers
    # SettingWithCopyWarning and leaves it unclear which frame is being edited.
    sub_interest = full_interest[wanted_columns].copy()
    sub_interest['party'] = sub_interest['party'].replace(party_aliases)

    return sub_interest

In [7]:
# Keep only the winning row of each House race from 2012 onwards.
full_interest = parse_data("Datasets/fec/1976-2016-house.csv", minYear=2012)
# Build the master index, pickle it (save=True) and show it as the cell output.
parse_index(full_interest, save=True, load=False)

Unnamed: 0,district,state,year
AK_01_2012,1,AK,2012
AL_01_2012,1,AL,2012
AL_02_2012,2,AL,2012
AL_03_2012,3,AL,2012
AL_04_2012,4,AL,2012
AL_05_2012,5,AL,2012
AL_06_2012,6,AL,2012
AL_07_2012,7,AL,2012
AR_01_2012,1,AR,2012
AR_02_2012,2,AR,2012


In [None]:
# Slim the winners table down to the training columns and normalise the party labels.
training_set = fetch_training_set(full_interest)
# Sanity check (uncomment to run): set(training_set['party']) lists the party labels left after relabelling.
training_set

In [None]:
# full_interest  # uncomment to display the full winners table

In [None]:
# Rebuild the master index and pickle it again; this repeats the call made in the cell that creates full_interest.
parse_index(full_interest, save=True, load=False)

In [None]:
%reset