In [7]:
import pandas as pd
import numpy as np
import pickle

In [24]:
# %load functions/houseFunctions.py
'''
Parse 1976-2016 house data from https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/IG0UN2

@relFilePath : file path of house results data, relative to python notebook

@return:  dataframe indexed by (year, state, district)
'''

def parse_data(relFilePath, minYear=2010):

    data_df = pd.read_csv(relFilePath)

    winners_df = pd.DataFrame()
    for key, shard in data_df.groupby(['year', 'state_po', 'district']):
        if int(key[0]) > minYear:
            winners_df = winners_df.append(shard.loc[shard['candidatevotes'].idxmax()])
    return winners_df

def parse_index(full_interest, save=False,load=False):

    if not load:
        # Make a dummy dataframe so everyone else can make complete dataframes
        master_index = full_interest[['district','state_po','year']].reset_index().drop('index', axis=1)
        master_index = master_index.rename(columns={'state_po' : 'state'}) # rename state code
        master_index = master_index.astype({'year' : int, 'district' : int}) # 
        master_index.loc[master_index['district']==0, 'district'] = 1 # make sure all districts start with 1

        # glue together the columns to get a more descriptive index
        master_index.index = ['{0}_{1:02d}_{2}'.format(row['state'],row['district'],row['year']) for _,row in master_index.iterrows()]

        if save:
            pickle.dump(master_index, open('Datasets/master_index.p', 'wb'))
        return master_index

    else:
        # Load the file
        master_index = pickle.load(open('Datasets/master_index.p', 'rb'))
        return master_index
    
def fetch_training_set(full_interest):
    
    sub_interest = full_interest[['district', 'state_po', 'year', 'party', 'candidatevotes', 'totalvotes']]    
    
    #if democratic-farmer-labor it's made to be democratic party (one entry in 2012)
    sub_interest.loc[sub_interest['party'] == 'democratic-farmer-labor', 'party'] = 'democrat'
    #if tax revolt it's made to be republican party (one entry in 2012)
    sub_interest.loc[sub_interest['party'] == 'tax revolt', 'party'] = 'republican'
    
    return sub_interest

In [22]:
full_interest = parse_data("Datasets/fec/1976-2016-house.csv", minYear=2010)

In [27]:
training_set = fetch_training_set(full_interest)
#set(fetch_training_set(training_set)['party'])
training_set

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,district,state_po,year,party,candidatevotes,totalvotes
18,0.0,AK,2012.0,republican,185296.0,289804.0
466,1.0,AL,2012.0,republican,196374.0,200676.0
495,2.0,AL,2012.0,republican,180591.0,283953.0
116,3.0,AL,2012.0,republican,175306.0,273930.0
353,4.0,AL,2012.0,republican,199071.0,269118.0
316,5.0,AL,2012.0,republican,189185.0,291293.0
513,6.0,AL,2012.0,republican,219262.0,308102.0
336,7.0,AL,2012.0,democrat,232520.0,306558.0
691,1.0,AR,2012.0,republican,138800.0,246843.0
533,2.0,AR,2012.0,republican,158175.0,286598.0


In [None]:
# full_interest

In [11]:
parse_index(full_interest, save=True, load=False)

Unnamed: 0,district,state,year
AK_01_1976,1,AK,1976
AL_01_1976,1,AL,1976
AL_02_1976,2,AL,1976
AL_03_1976,3,AL,1976
AL_04_1976,4,AL,1976
AL_05_1976,5,AL,1976
AL_06_1976,6,AL,1976
AL_07_1976,7,AL,1976
AR_01_1976,1,AR,1976
AR_02_1976,2,AR,1976


In [None]:
%reset