In [1]:
import pandas as pd
import datetime
import numpy as np
import pickle

In [2]:
def drop_rows(df, column, value):
    ''' Drop a row where 'value' is in 'column'. Only grabs first item.
    arguments:
        df -- dataframe to modify
        column --  the column you want to search
        value -- if you find this value in the column, drop the row
    returns:
        dataframe without specified rows
    '''
    try:
        new_df = df.drop(df[column][df[column]==value].index[0])
    except IndexError:
        new_df = df
    return new_df

In [3]:
def get_election_day(year):
    ''' Get the date of election day in a given year
    arguments:
        year -- year as int
    retuns:
        datetime object of election day. '''
    if year%2 == 1: 
        raise ValueError('No election in even years.')
    # possible days = Nov. 2 - Nov. 8
    possible_days = [datetime.datetime(year, 11, d) for d in range(2,9)]
    for day in possible_days:
        if day.weekday()==1: # return if it's a Tuesday
            return day 

In [4]:
def get_mean_spread(year,n_days_before_election):
    # read in data file 
    ballot_df = pd.read_csv('Datasets/'+str(year)+'_generic_congressional_vote.csv')
    
    # Data Cleaning
    ballot_df = drop_rows(ballot_df, 'Poll', 'Final Results')
    ballot_df = drop_rows(ballot_df, 'Poll', 'RCP Average')

    election_day = get_election_day(year)

    # make spread standardized around 0
    if year >= 2014: # they changed their column names after 2014
        ballot_df['Spread'] = ballot_df['Democrats (D)'] - ballot_df['Republicans (R)']
    else:
        ballot_df['Spread'] = ballot_df['Democrats'] - ballot_df['Republicans']

    spread = []
    for index, row in ballot_df.iterrows():
        # clean up the date format
        dates = row['Date'].split('-')
        start = datetime.datetime.strptime(str(year)+'/'+dates[0].strip(),'%Y/%m/%d')
        #end = datetime.datetime.strptime(str(year)+'/'+dates[1].strip(),'%Y/%m/%d')

        # take all the polls which started less than 4 weeks ago
        if (start - election_day).days <= n_days_before_election:
            spread.append(row['Spread'])

    # find the mean of the spread over the last 4 weeks
    return np.mean(spread)

In [5]:

n_days_before_election = 28
spread_dict = {}
for year in [2002,2004,2006,2008,2010,2012,2014,2016,2018]:
    spread_dict[year] = get_mean_spread(year,n_days_before_election)

In [6]:
# todo: for some reason only getting the last number
# plug it into Manish's dummy dataset
formatted_poll_df = pickle.load(open('Datasets/master_index.p','rb'))
formatted_poll_df['national_poll'] = np.nan # add new column to the df 
for year in [2002,2004,2006,2008,2010,2012,2014,2016,2018]:
    formatted_poll_df.loc[formatted_poll_df['year']==year, 'national_poll'] = spread_dict[year]

# put in same format as Jahira's 
# index: district ID, state, year 
# make a function to load to create the file

In [7]:
pickle.dump(formatted_poll_df, open('Datasets/national_poll.p','wb'))

Unnamed: 0,district,state,year,national_poll
AK_01_2012,1,AK,2012,1.988304
AL_01_2012,1,AL,2012,1.988304
AL_02_2012,2,AL,2012,1.988304
AL_03_2012,3,AL,2012,1.988304
AL_04_2012,4,AL,2012,1.988304
AL_05_2012,5,AL,2012,1.988304
AL_06_2012,6,AL,2012,1.988304
AL_07_2012,7,AL,2012,1.988304
AR_01_2012,1,AR,2012,1.988304
AR_02_2012,2,AR,2012,1.988304


In [8]:
# fec_data_loc = "ac209a_project/Datasets/1976-2016-house.csv"

# data_df = pd.read_csv(fec_data_loc)

# district_info = {}

# for key, shard in data_df.groupby(['state', 'year']):
#     state, year = key
#     if year in district_info:
#         district_info[year][state] = np.unique(shard['district'].values)
#     else:
#         district_info[year] = {}
#         district_info[year][state] = np.unique(shard['district'].values)
    
# # total_districts_count_2000 = 0
# # for state, districts in district_info[2000].items():
# #     print(state)
# #     print(districts)
# #     total_districts_count_2000 += len(districts)
# # total_districts_count_2000

# df = pd.DataFrame(district_info)
# df