In [1]:
#imports
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
import numpy as np
import pandas as pd
import pickle
%matplotlib inline

In [2]:
# lookup table: numeric state code -> two-letter state abbreviation
state_codes = dict([
    (1, "AL"), (2, "AK"), (4, "AZ"), (5, "AR"), (6, "CA"), (8, "CO"),
    (9, "CT"), (10, "DE"), (11, "DC"), (12, "FL"), (13, "GA"), (15, "HI"),
    (16, "ID"), (17, "IL"), (18, "IN"), (19, "IA"), (20, "KS"), (21, "KY"),
    (22, "LA"), (23, "ME"), (24, "MD"), (25, "MA"), (26, "MI"), (27, "MN"),
    (28, "MS"), (29, "MO"), (30, "MT"), (31, "NE"), (32, "NV"), (33, "NH"),
    (34, "NJ"), (35, "NM"), (36, "NY"), (37, "NC"), (38, "ND"), (39, "OH"),
    (40, "OK"), (41, "OR"), (42, "PA"), (44, "RI"), (45, "SC"), (46, "SD"),
    (47, "TN"), (48, "TX"), (49, "UT"), (50, "VT"), (51, "VA"), (53, "WA"),
    (54, "WV"), (55, "WI"), (56, "WY"),
])

In [3]:
# locations of the raw ACS extracts, one file per survey year
path10 = 'Datasets/demographics/ACS_10_1YR_S0201_with_ann.csv'
path12 = 'Datasets/demographics/ACS_12_1YR_S0201_with_ann.csv'
path14 = 'Datasets/demographics/ACS_14_1YR_S0201_with_ann.csv'
path16 = 'Datasets/demographics/ACS_16_1YR_S0201_with_ann.csv'
path17 = 'Datasets/demographics/ACS_17_1YR_S0201_with_ann.csv'

# load every extract; header=1 takes the column names from the second row of each file
df10, df12, df14, df16, df17 = [
    pd.read_csv(csv_path, header=1)
    for csv_path in (path10, path12, path14, path16, path17)
]

In [4]:
def clean(df, year):
    """Return a cleaned copy of one raw ACS table, indexed by ``state_district_year``.

    The input frame is not modified, so the function can be called repeatedly
    on the same raw frame (e.g. when the calling cell is re-run).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw ACS table. Must contain an integer ``Id2`` column: its last two
        digits are read as the district number and the remaining leading
        digits as the numeric state code.
    year : int
        Value stored in the new ``year`` column and used as the index suffix.

    Returns
    -------
    pandas.DataFrame
        New frame whose first three columns are ``year``, ``state`` (2-char
        code, like: MA) and ``district`` (integer), followed by every original
        column whose name does not contain ``'Margin of Error;'``. The index
        is ``<state>_<district, zero-padded to 2 digits>_<year>``.

    Raises
    ------
    KeyError
        If a state code derived from ``Id2`` is not a key of ``state_codes``.
    """
    # work on a copy: inserting columns into the caller's frame made a second
    # call fail with "cannot insert year, already exists"
    cleaned = df.copy()

    # insert year column
    cleaned.insert(0, 'year', year)

    # insert state column (as 2-char code, like: MA)
    state_numbers = cleaned['Id2'] // 100
    cleaned.insert(1, 'state', state_numbers.apply(lambda code: state_codes[code]))

    # insert district column, as an integer
    # districts should start with district 1, not 0
    cleaned.insert(2, 'district', (cleaned['Id2'] % 100).replace(0, 1))

    # filter out Margin of Error columns
    columns_to_keep = [col for col in cleaned.columns if 'Margin of Error;' not in col]
    cleaned = cleaned[columns_to_keep].copy()

    # create index as: state_district_year
    cleaned.index = [
        '{0}_{1:02d}_{2}'.format(state, district, row_year)
        for state, district, row_year in zip(cleaned['state'], cleaned['district'], cleaned['year'])
    ]

    return cleaned

In [5]:
# clean each raw table, tagging it with its survey year
acs10 = clean(df=df10, year=2010)
acs12 = clean(df=df12, year=2012)
acs14 = clean(df=df14, year=2014)
acs16 = clean(df=df16, year=2016)
acs17 = clean(df=df17, year=2017)

In [6]:
# check order of unemployment columns
unemployed_label = 'EMPLOYMENT STATUS - In labor force - Civilian labor force - Unemployed'
[name for name in acs10.columns if unemployed_label in name]

['Estimate; EMPLOYMENT STATUS - In labor force - Civilian labor force - Unemployed',
 'Estimate; EMPLOYMENT STATUS - In labor force - Civilian labor force - Unemployed - Percent of civilian labor force',
 'Estimate; EMPLOYMENT STATUS - In labor force - Civilian labor force - Unemployed.1',
 'Estimate; EMPLOYMENT STATUS - In labor force - Civilian labor force - Unemployed - Percent of civilian labor force.1']

In [7]:
len(acs17.columns)

316

In [8]:
# count the column labels shared by the 2016 and 2017 tables
len(set(acs16.columns) & set(acs17.columns))

287

In [9]:
# column labels shared by the 2010 and 2012 tables
set(acs10.columns) & set(acs12.columns)

{'Estimate; CLASS OF WORKER - Civilian employed population 16 years and over',
 'Estimate; CLASS OF WORKER - Government workers',
 'Estimate; CLASS OF WORKER - Private wage and salary workers',
 'Estimate; CLASS OF WORKER - Self-employed workers in own not incorporated business',
 'Estimate; CLASS OF WORKER - Unpaid family workers',
 'Estimate; COMMUTING TO WORK - Car, truck, or van - carpooled',
 'Estimate; COMMUTING TO WORK - Car, truck, or van - drove alone',
 'Estimate; COMMUTING TO WORK - Mean travel time to work (minutes)',
 'Estimate; COMMUTING TO WORK - Other means',
 'Estimate; COMMUTING TO WORK - Public transportation (excluding taxicab)',
 'Estimate; COMMUTING TO WORK - Walked',
 'Estimate; COMMUTING TO WORK - Worked at home',
 'Estimate; COMMUTING TO WORK - Workers 16 years and over',
 'Estimate; DISABILITY STATUS - Civilian noninstitutionalized population 18 to 64 years',
 'Estimate; DISABILITY STATUS - Civilian noninstitutionalized population 65 years and older',
 'Estima

In [10]:
#[col for col in df.columns if not 'Margin of Error;' in col]


In [11]:
# 2010 and 2012
# Raw ACS column labels of the predictors kept from the 2010 and 2012 tables.
# The order matters: the short names in renamed_predictors are assigned to
# these columns by position, so both lists must stay aligned entry for entry.
raw_predictor_name_subset_10_12 = [
"Estimate; SEX AND AGE - Female",
"Estimate; SEX AND AGE - 18 to 24 years",
"Estimate; SEX AND AGE - 25 to 34 years",
"Estimate; SEX AND AGE - Median age (years)",
"Estimate; RELATIONSHIP - Nonrelatives - Unmarried partner",
"Estimate; HOUSEHOLDS BY TYPE - Nonfamily households - Male householder - Living alone",
"Estimate; EDUCATIONAL ATTAINMENT - Bachelor's degree or higher",
"Estimate; FERTILITY - Women 15 to 50 years who had a birth in the past 12 months - Unmarried women 15 to 50 years who had a birth in the past 12 months - As a percent of all women with a birth in the past 12 months",
"Estimate; VETERAN STATUS - Civilian veteran",
"Estimate; RESIDENCE 1 YEAR AGO - Same house",
"Estimate; PLACE OF BIRTH, CITIZENSHIP STATUS AND YEAR OF ENTRY - Native",
"Estimate; PLACE OF BIRTH, CITIZENSHIP STATUS AND YEAR OF ENTRY - Foreign born",
"Estimate; WORLD REGION OF BIRTH OF FOREIGN BORN - Latin America",
"Estimate; LANGUAGE SPOKEN AT HOME AND ABILITY TO SPEAK ENGLISH - Language other than English",
"Estimate; EMPLOYMENT STATUS - In labor force - Civilian labor force - Unemployed",
"Estimate; COMMUTING TO WORK - Public transportation (excluding taxicab)",
"Estimate; HEALTH INSURANCE COVERAGE - No health insurance coverage",
"Estimate; POVERTY RATES FOR FAMILIES AND PEOPLE FOR WHOM POVERTY STATUS IS DETERMINED - All people",
"Estimate; OWNER CHARACTERISTICS - Median value (dollars)"] 

# The income labels embed the survey year ("IN 2010 INFLATION-ADJUSTED DOLLARS"),
# so each year needs its own two-entry supplement.
raw_predictor_name_supplement_10 = [
"Estimate; INCOME IN THE PAST 12 MONTHS (IN 2010 INFLATION-ADJUSTED DOLLARS) - Median household income (dollars)",
"Estimate; INCOME IN THE PAST 12 MONTHS (IN 2010 INFLATION-ADJUSTED DOLLARS) - With Food Stamp/SNAP benefits"]

raw_predictor_name_supplement_12 = [
"Estimate; INCOME IN THE PAST 12 MONTHS (IN 2012 INFLATION-ADJUSTED DOLLARS) - Median household income (dollars)",
"Estimate; INCOME IN THE PAST 12 MONTHS (IN 2012 INFLATION-ADJUSTED DOLLARS) - With Food Stamp/SNAP benefits"]

# full per-year label lists: shared subset first, then the year-specific income labels
raw_predictor_names_10 = raw_predictor_name_subset_10_12 + raw_predictor_name_supplement_10
raw_predictor_names_12 = raw_predictor_name_subset_10_12 + raw_predictor_name_supplement_12


# 2014 and beyond
# Same predictors, in the same order, as the 2010/2012 list. Many labels here
# carry an extra qualifier segment (e.g. "Total population", "Population in
# households") that the 2010/2012 labels do not have, hence the separate list.
raw_predictor_name_subset_14_beyond = [
"Estimate; SEX AND AGE - Total population - Female",
"Estimate; SEX AND AGE - 18 to 24 years",
"Estimate; SEX AND AGE - 25 to 34 years",
"Estimate; SEX AND AGE - Median age (years)",
"Estimate; RELATIONSHIP - Population in households - Nonrelatives - Unmarried partner",
"Estimate; HOUSEHOLDS BY TYPE - Households - Nonfamily households - Male householder - Living alone",
"Estimate; EDUCATIONAL ATTAINMENT - Bachelor's degree or higher",
"Estimate; FERTILITY - Women 15 to 50 years - Women 15 to 50 years who had a birth in the past 12 months - Unmarried women 15 to 50 years who had a birth in the past 12 months - As a percent of all women with a birth in the past 12 months",
"Estimate; VETERAN STATUS - Civilian population 18 years and over - Civilian veteran",
"Estimate; RESIDENCE 1 YEAR AGO - Population 1 year and over - Same house",
"Estimate; PLACE OF BIRTH, CITIZENSHIP STATUS AND YEAR OF ENTRY - Native",
"Estimate; PLACE OF BIRTH, CITIZENSHIP STATUS AND YEAR OF ENTRY - Foreign born",
"Estimate; WORLD REGION OF BIRTH OF FOREIGN BORN - Foreign-born population excluding population born at sea - Latin America",
"Estimate; LANGUAGE SPOKEN AT HOME AND ABILITY TO SPEAK ENGLISH - Population 5 years and over - Language other than English",
"Estimate; EMPLOYMENT STATUS - Population 16 years and over - In labor force - Civilian labor force - Unemployed",
"Estimate; COMMUTING TO WORK - Workers 16 years and over - Public transportation (excluding taxicab)",
"Estimate; HEALTH INSURANCE COVERAGE - Civilian noninstitutionalized population - No health insurance coverage",
"Estimate; POVERTY RATES FOR FAMILIES AND PEOPLE FOR WHOM POVERTY STATUS IS DETERMINED - All people",
"Estimate; OWNER CHARACTERISTICS - Owner-occupied housing units - Median value (dollars)"]

# Year-specific income labels (the survey year is part of the label text).
# Note that only the median-income label has the extra "Households" segment.
raw_predictor_name_supplement_14 = [
"Estimate; INCOME IN THE PAST 12 MONTHS (IN 2014 INFLATION-ADJUSTED DOLLARS) - Households - Median household income (dollars)",
"Estimate; INCOME IN THE PAST 12 MONTHS (IN 2014 INFLATION-ADJUSTED DOLLARS) - With Food Stamp/SNAP benefits"]

raw_predictor_name_supplement_16 = [
"Estimate; INCOME IN THE PAST 12 MONTHS (IN 2016 INFLATION-ADJUSTED DOLLARS) - Households - Median household income (dollars)",
"Estimate; INCOME IN THE PAST 12 MONTHS (IN 2016 INFLATION-ADJUSTED DOLLARS) - With Food Stamp/SNAP benefits"]

raw_predictor_name_supplement_17 = [
"Estimate; INCOME IN THE PAST 12 MONTHS (IN 2017 INFLATION-ADJUSTED DOLLARS) - Households - Median household income (dollars)",
"Estimate; INCOME IN THE PAST 12 MONTHS (IN 2017 INFLATION-ADJUSTED DOLLARS) - With Food Stamp/SNAP benefits"]

# full per-year label lists: shared subset first, then the year-specific income labels
raw_predictor_names_14 = raw_predictor_name_subset_14_beyond + raw_predictor_name_supplement_14
raw_predictor_names_16 = raw_predictor_name_subset_14_beyond + raw_predictor_name_supplement_16
raw_predictor_names_17 = raw_predictor_name_subset_14_beyond + raw_predictor_name_supplement_17

In [12]:
# short column names, listed in the same order as the raw predictor label lists
renamed_predictors = [
    # sex and age
    "female_pct", "age18_24_pct", "age25_34_pct", "median_age",
    # household composition
    "unmarried_partner_pct", "male_living_alone_pct",
    # education, fertility, veterans, residence
    "bachelors_deg_or_higher_pct",
    "past_year_births_to_unmarried_women_pct",
    "civilian_veteran_pct",
    "live_same_house_past_year_pct",
    # place of birth and language
    "native_born_population", "foreign_born_population",
    "foreign_born_proportion_from_LatinAmerica",
    "speak_other_language_at_home_pct",
    # work, insurance, poverty
    "labor_force_unemployed_pct", "public_transit_commuter_pct",
    "no_health_insurance_pct", "poverty_rate_pct",
    # housing value and income
    "median_housing_value", "median_household_income",
    "food_stamp_benefits_pct",
]

In [13]:
# check number of predictors (column count of the 2017 selection)
acs17[raw_predictor_names_17].shape[1]

21

In [14]:
# columns that identify a row; kept in front of the predictors in every frame
identifier_cols = ['state', 'district', 'year']

# Keep only the identifier and predictor columns for each year.
# .copy() makes each result an independent frame, so the later in-place column
# renames and additions do not trigger pandas' chained-assignment warning.
# NOTE: this cell expects the raw ACS labels; once the columns have been
# renamed it cannot be re-run without re-running the cleaning cell first.
acs10 = acs10[identifier_cols + raw_predictor_names_10].copy()
acs12 = acs12[identifier_cols + raw_predictor_names_12].copy()
acs14 = acs14[identifier_cols + raw_predictor_names_14].copy()
acs16 = acs16[identifier_cols + raw_predictor_names_16].copy()
acs17 = acs17[identifier_cols + raw_predictor_names_17].copy()

In [15]:
# Give the 2010 frame its short column names and preview the first rows.
# The rename is positional: it relies on the frame's columns being in the
# order identifier_cols + raw_predictor_names_10.
acs10.columns = identifier_cols + renamed_predictors
acs10.head()

Unnamed: 0,state,district,year,female_pct,age18_24_pct,age25_34_pct,median_age,unmarried_partner_pct,male_living_alone_pct,bachelors_deg_or_higher_pct,...,foreign_born_population,foreign_born_proportion_from_LatinAmerica,speak_other_language_at_home_pct,labor_force_unemployed_pct,public_transit_commuter_pct,no_health_insurance_pct,poverty_rate_pct,median_housing_value,median_household_income,food_stamp_benefits_pct
AL_01_2010,AL,1,2010,51.0,9.0,12.3,38.4,1.6,12.3,21.5,...,22296,45.1,4.9,7.5,0.4,17.3,19.1,136000,41172,15.7
AL_02_2010,AL,2,2010,52.1,10.1,12.6,37.7,1.7,12.3,20.1,...,16268,49.5,3.7,5.5,0.2,12.9,18.6,111200,40567,15.3
AL_03_2010,AL,3,2010,51.7,12.5,12.1,36.7,1.6,12.8,19.9,...,22689,45.2,5.0,7.9,0.5,14.9,24.0,110100,35452,17.2
AL_04_2010,AL,4,2010,50.6,8.4,11.3,40.2,1.2,10.6,12.1,...,26109,87.5,5.8,7.4,0.1,16.3,19.8,93400,36715,14.6
AL_05_2010,AL,5,2010,51.0,9.8,12.4,38.3,1.2,12.3,26.9,...,31009,53.8,6.0,7.0,0.3,13.1,15.3,135600,45856,10.6


In [16]:
# replace the two raw population counts with a single foreign/native ratio
# (each frame is renamed positionally first, then modified in place)
population_count_cols = ['native_born_population', 'foreign_born_population']
for df in (acs10, acs12, acs14, acs16, acs17):
    df.columns = identifier_cols + renamed_predictors
    df['foreign_to_native_born_ratio'] = df['foreign_born_population'].div(df['native_born_population'])
    df.drop(columns=population_count_cols, inplace=True)

In [17]:
# use 2017 demographics as 2018 data
# Copy first: a plain `acs18 = acs17` only aliases the frame, so relabelling
# the year would silently rewrite the 2017 frame as well.
acs18 = acs17.copy()
acs18['year'] = 2018
# Rebuild the state_district_year index from acs18 itself, not from a loop
# variable that happens to be left over from an earlier cell.
acs18.index = [
    '{0}_{1:02d}_{2}'.format(state, district, year)
    for state, district, year in zip(acs18['state'], acs18['district'], acs18['year'])
]

In [18]:
acs18

Unnamed: 0,state,district,year,female_pct,age18_24_pct,age25_34_pct,median_age,unmarried_partner_pct,male_living_alone_pct,bachelors_deg_or_higher_pct,...,foreign_born_proportion_from_LatinAmerica,speak_other_language_at_home_pct,labor_force_unemployed_pct,public_transit_commuter_pct,no_health_insurance_pct,poverty_rate_pct,median_housing_value,median_household_income,food_stamp_benefits_pct,foreign_to_native_born_ratio
AL_01_2018,AL,1,2018,51.8,8.2,12.9,40.0,1.4,14.0,25.0,...,38.3,4.2,3.2,0.3,10.5,17.0,147100,47984,13.5,0.035991
AL_02_2018,AL,2,2018,51.2,9.7,13.0,38.5,1.7,13.0,23.1,...,41.6,3.9,3.4,0.1,9.3,18.0,131400,46579,15.8,0.026709
AL_03_2018,AL,3,2018,51.2,10.7,13.0,38.1,2.1,13.8,21.7,...,34.7,4.2,2.9,0.2,8.9,17.8,127600,46484,14.0,0.030215
AL_04_2018,AL,4,2018,51.4,8.8,11.7,40.7,1.3,13.0,17.9,...,76.4,6.7,3.2,0.2,10.6,17.6,117700,43218,14.0,0.040059
AL_05_2018,AL,5,2018,50.8,9.3,13.2,39.5,1.6,12.6,31.9,...,43.6,6.1,2.8,0.3,9.0,13.8,160400,54707,12.4,0.045012
AL_06_2018,AL,6,2018,52.3,8.6,12.3,39.1,1.3,9.9,37.6,...,44.7,6.1,2.7,0.2,7.0,9.9,193400,65170,6.9,0.045651
AL_07_2018,AL,7,2018,52.3,12.5,14.2,36.3,1.6,16.0,20.7,...,49.8,4.6,4.8,1.3,10.4,24.6,96500,35988,21.9,0.027468
AK_01_2018,AK,1,2018,47.9,9.9,16.0,34.5,2.7,13.8,28.8,...,14.9,15.9,4.9,1.7,13.7,11.1,273100,73181,10.8,0.085367
AZ_01_2018,AZ,1,2018,49.5,10.7,12.2,37.6,2.3,13.0,25.3,...,54.4,26.8,4.2,0.7,10.2,19.2,180500,50303,13.7,0.065604
AZ_02_2018,AZ,2,2018,50.5,9.5,12.8,41.4,3.0,15.5,35.4,...,53.3,22.5,3.8,2.2,6.6,13.9,185900,53516,11.9,0.114537


In [19]:
# stack 5 dataframes from each year on top of each other
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# row order and the state_district_year index labels are unchanged)
demographics_data = pd.concat([acs18, acs16, acs14, acs12, acs10])

In [20]:
demographics_data

Unnamed: 0,state,district,year,female_pct,age18_24_pct,age25_34_pct,median_age,unmarried_partner_pct,male_living_alone_pct,bachelors_deg_or_higher_pct,...,foreign_born_proportion_from_LatinAmerica,speak_other_language_at_home_pct,labor_force_unemployed_pct,public_transit_commuter_pct,no_health_insurance_pct,poverty_rate_pct,median_housing_value,median_household_income,food_stamp_benefits_pct,foreign_to_native_born_ratio
AL_01_2018,AL,1,2018,51.8,8.2,12.9,40.0,1.4,14.0,25.0,...,38.3,4.2,3.2,0.3,10.5,17.0,147100,47984,13.5,0.035991
AL_02_2018,AL,2,2018,51.2,9.7,13.0,38.5,1.7,13.0,23.1,...,41.6,3.9,3.4,0.1,9.3,18.0,131400,46579,15.8,0.026709
AL_03_2018,AL,3,2018,51.2,10.7,13.0,38.1,2.1,13.8,21.7,...,34.7,4.2,2.9,0.2,8.9,17.8,127600,46484,14.0,0.030215
AL_04_2018,AL,4,2018,51.4,8.8,11.7,40.7,1.3,13.0,17.9,...,76.4,6.7,3.2,0.2,10.6,17.6,117700,43218,14.0,0.040059
AL_05_2018,AL,5,2018,50.8,9.3,13.2,39.5,1.6,12.6,31.9,...,43.6,6.1,2.8,0.3,9.0,13.8,160400,54707,12.4,0.045012
AL_06_2018,AL,6,2018,52.3,8.6,12.3,39.1,1.3,9.9,37.6,...,44.7,6.1,2.7,0.2,7.0,9.9,193400,65170,6.9,0.045651
AL_07_2018,AL,7,2018,52.3,12.5,14.2,36.3,1.6,16.0,20.7,...,49.8,4.6,4.8,1.3,10.4,24.6,96500,35988,21.9,0.027468
AK_01_2018,AK,1,2018,47.9,9.9,16.0,34.5,2.7,13.8,28.8,...,14.9,15.9,4.9,1.7,13.7,11.1,273100,73181,10.8,0.085367
AZ_01_2018,AZ,1,2018,49.5,10.7,12.2,37.6,2.3,13.0,25.3,...,54.4,26.8,4.2,0.7,10.2,19.2,180500,50303,13.7,0.065604
AZ_02_2018,AZ,2,2018,50.5,9.5,12.8,41.4,3.0,15.5,35.4,...,53.3,22.5,3.8,2.2,6.6,13.9,185900,53516,11.9,0.114537


In [21]:
# write the stacked demographics table to CSV (the index is written as the first column)
demographics_data.to_csv('Datasets/demographics_data_2010_to_2018.csv')

In [22]:
demographics_data.columns

Index(['state', 'district', 'year', 'female_pct', 'age18_24_pct',
       'age25_34_pct', 'median_age', 'unmarried_partner_pct',
       'male_living_alone_pct', 'bachelors_deg_or_higher_pct',
       'past_year_births_to_unmarried_women_pct', 'civilian_veteran_pct',
       'live_same_house_past_year_pct',
       'foreign_born_proportion_from_LatinAmerica',
       'speak_other_language_at_home_pct', 'labor_force_unemployed_pct',
       'public_transit_commuter_pct', 'no_health_insurance_pct',
       'poverty_rate_pct', 'median_housing_value', 'median_household_income',
       'food_stamp_benefits_pct', 'foreign_to_native_born_ratio'],
      dtype='object')

In [23]:
# load the FEC / national-poll table
# NOTE: pickle.load can execute arbitrary code from the file, so only load
# pickles produced by this project.
with open('Datasets/data_FEC_NATIONALPOLL_2004_2018.p', 'rb') as fec_file:
    fec = pickle.load(fec_file)

In [24]:
fec.index

Index(['AK_01_2004', 'AL_01_2004', 'AL_02_2004', 'AL_03_2004', 'AL_04_2004',
       'AL_05_2004', 'AL_06_2004', 'AL_07_2004', 'AR_01_2004', 'AR_02_2004',
       ...
       'WV_03_2018', 'WI_01_2018', 'WI_02_2018', 'WI_03_2018', 'WI_04_2018',
       'WI_05_2018', 'WI_06_2018', 'WI_07_2018', 'WI_08_2018', 'WY_01_2018'],
      dtype='object', length=3431)

In [25]:
# Drop the identifier columns so they do not collide with the columns of the
# same name in the FEC table when the two are joined on the index.
# errors='ignore' keeps the cell re-runnable after the columns are already gone.
demographics_data = demographics_data.drop(columns=['district', 'state', 'year'], errors='ignore')

In [26]:
# try joining FEC election data with ACS demographic data
# The join is on the index of both frames; how='inner' keeps only the index
# labels present in both fec and demographics_data.


fec_demographics_data = fec.join(demographics_data, how='inner')

In [27]:
set(fec_demographics_data['year'])

{2010, 2012, 2014, 2016, 2018}

In [28]:
# pickle.dump(fec_demographics_data, open('Datasets/data_FEC_NATIONALPOLL_DEMOGRAPHICS_2010_2018.p', 'wb'))
# fec_demographics_data.to_csv('Datasets/data_FEC_NATIONALPOLL_DEMOGRAPHICS_2010_2018.csv')
# test = pickle.load(open('Datasets/data_FEC_NATIONALPOLL_DEMOGRAPHICS_2010_2018.p', 'rb'))
# fec_demographics_data.shape, test.shape

In [29]:
fec.shape, demographics_data.shape

((3431, 22), (2180, 20))

In [30]:
# Outer-join the two tables on their index, then keep only the rows whose
# label exists in fec. A single boolean mask replaces the previous per-row
# re-filtering of the whole frame, which was quadratic in the row count;
# the resulting rows, their order and the column dtypes are the same.
fec_demographics_imputed_data = fec.join(demographics_data, how='outer')
fec_demographics_imputed_data = fec_demographics_imputed_data[
    fec_demographics_imputed_data.index.isin(fec.index)
]

fec_demographics_imputed_data.shape

(3431, 42)

In [31]:
# the filtered join must contain exactly the fec index labels (order ignored)
joined_labels = sorted(fec_demographics_imputed_data.index)
fec_labels = sorted(fec.index)
assert joined_labels == fec_labels

In [32]:
demographics_data.columns

Index(['female_pct', 'age18_24_pct', 'age25_34_pct', 'median_age',
       'unmarried_partner_pct', 'male_living_alone_pct',
       'bachelors_deg_or_higher_pct',
       'past_year_births_to_unmarried_women_pct', 'civilian_veteran_pct',
       'live_same_house_past_year_pct',
       'foreign_born_proportion_from_LatinAmerica',
       'speak_other_language_at_home_pct', 'labor_force_unemployed_pct',
       'public_transit_commuter_pct', 'no_health_insurance_pct',
       'poverty_rate_pct', 'median_housing_value', 'median_household_income',
       'food_stamp_benefits_pct', 'foreign_to_native_born_ratio'],
      dtype='object')

In [33]:
# Rows before 2010 have no demographics of their own: copy the demographic
# columns from the row with the same state_district prefix and year 2010.
demographic_cols = demographics_data.columns
for row_label, record in fec_demographics_imputed_data.iterrows():
    if not record['year'] < 2010:
        continue
    # swap the trailing "_YYYY" of the label for "_2010"
    donor_label = row_label[:-5] + '_2010'
    donor_values = fec_demographics_imputed_data[
        fec_demographics_imputed_data.index == donor_label
    ][demographic_cols].values
    target_rows = fec_demographics_imputed_data.index == row_label
    fec_demographics_imputed_data.loc[target_rows, demographic_cols] = donor_values

In [34]:
fec_demographics_imputed_data[fec_demographics_imputed_data.index == 'AK_01_2010']

Unnamed: 0,district,state,year,party,candidatevotes,totalvotes,candidate,national_poll,national_poll_prev,national_poll_delta_subtract,...,foreign_born_proportion_from_LatinAmerica,speak_other_language_at_home_pct,labor_force_unemployed_pct,public_transit_commuter_pct,no_health_insurance_pct,poverty_rate_pct,median_housing_value,median_household_income,food_stamp_benefits_pct,foreign_to_native_born_ratio
AK_01_2010,1.0,AK,2010.0,republican,175384,254335,Don Young,-2.622642,9.824324,-12.446966,...,19.2,16.5,6.6,1.2,19.9,9.9,241400,64576.0,10.6,0.074201


In [35]:
fec_demographics_imputed_data[fec_demographics_imputed_data.index == 'AK_01_2004']

Unnamed: 0,district,state,year,party,candidatevotes,totalvotes,candidate,national_poll,national_poll_prev,national_poll_delta_subtract,...,foreign_born_proportion_from_LatinAmerica,speak_other_language_at_home_pct,labor_force_unemployed_pct,public_transit_commuter_pct,no_health_insurance_pct,poverty_rate_pct,median_housing_value,median_household_income,food_stamp_benefits_pct,foreign_to_native_born_ratio
AK_01_2004,1.0,AK,2004.0,republican,213216,299996,Don Young,3.680556,-0.989011,4.669567,...,19.2,16.5,6.6,1.2,19.9,9.9,241400,64576.0,10.6,0.074201


In [36]:
# fail loudly, naming the offending columns, if any demographic value is still missing
imputed_columns_with_nulls = [
    col for col in demographics_data.columns
    if fec_demographics_imputed_data[col].isnull().values.any()
]
if imputed_columns_with_nulls:
    raise ValueError(
        'demographic columns still contain nulls after imputation: {}'.format(imputed_columns_with_nulls)
    )

In [37]:
set(fec_demographics_imputed_data['year'].values)

{2004.0, 2006.0, 2008.0, 2010.0, 2012.0, 2014.0, 2016.0, 2018.0}

In [38]:
# Save the imputed table as pickle and CSV, then reload the pickle as a
# round-trip check. The files are opened with `with` so each handle is closed
# (and the pickle fully flushed) before the next step reads it.
imputed_pickle_path = 'Datasets/data_FEC_NATIONALPOLL_DEMOGRAPHICSIMPUTED_2004_2018.p'
with open(imputed_pickle_path, 'wb') as pickle_out:
    pickle.dump(fec_demographics_imputed_data, pickle_out)
fec_demographics_imputed_data.to_csv('Datasets/data_FEC_NATIONALPOLL_DEMOGRAPHICSIMPUTED_2004_2018.csv')
with open(imputed_pickle_path, 'rb') as pickle_in:
    test = pickle.load(pickle_in)
fec_demographics_imputed_data.shape, test.shape

((3431, 42), (3431, 42))

In [39]:
set(test['year'].values)

{2004.0, 2006.0, 2008.0, 2010.0, 2012.0, 2014.0, 2016.0, 2018.0}

In [40]:
# the reloaded pickle must not contain missing demographic values either
reloaded_columns_with_nulls = [
    col for col in demographics_data.columns
    if test[col].isnull().values.any()
]
if reloaded_columns_with_nulls:
    raise ValueError(
        'reloaded data contains nulls in demographic columns: {}'.format(reloaded_columns_with_nulls)
    )

In [41]:
fec[fec.index == 'WI_06_2018']

Unnamed: 0,district,state,year,party,candidatevotes,totalvotes,candidate,national_poll,national_poll_prev,national_poll_delta_subtract,...,dem_win_margin_prev,rep_win_margin_prev,margin_signed_minus_prev,margin_signed_divide_prev,margin_unsigned_minus_prev,margin_unsigned_divide_prev,dem_win_prev,rep_win_prev,dem_win,rep_win
WI_06_2018,6,WI,2018,republican,,,,7.168919,3.175439,3.99348,...,0.37256,0.571547,-0.198988,0.651844,0.198988,1.534109,0.0,1.0,0.0,1.0


In [42]:
demographics_data[demographics_data.index == 'WI_06_2018']

Unnamed: 0,female_pct,age18_24_pct,age25_34_pct,median_age,unmarried_partner_pct,male_living_alone_pct,bachelors_deg_or_higher_pct,past_year_births_to_unmarried_women_pct,civilian_veteran_pct,live_same_house_past_year_pct,foreign_born_proportion_from_LatinAmerica,speak_other_language_at_home_pct,labor_force_unemployed_pct,public_transit_commuter_pct,no_health_insurance_pct,poverty_rate_pct,median_housing_value,median_household_income,food_stamp_benefits_pct,foreign_to_native_born_ratio
WI_06_2018,49.6,8.9,11.5,42.1,3.1,13.4,26.8,23.9,7.8,86.0,33.5,6.9,1.9,0.5,4.6,8.8,163500,59685,9.5,0.037647


In [43]:
fec_demographics_data[fec_demographics_data.index == 'WI_06_2018']

Unnamed: 0,district,state,year,party,candidatevotes,totalvotes,candidate,national_poll,national_poll_prev,national_poll_delta_subtract,...,foreign_born_proportion_from_LatinAmerica,speak_other_language_at_home_pct,labor_force_unemployed_pct,public_transit_commuter_pct,no_health_insurance_pct,poverty_rate_pct,median_housing_value,median_household_income,food_stamp_benefits_pct,foreign_to_native_born_ratio
WI_06_2018,6,WI,2018,republican,,,,7.168919,3.175439,3.99348,...,33.5,6.9,1.9,0.5,4.6,8.8,163500,59685,9.5,0.037647


In [44]:
test.columns

Index(['district', 'state', 'year', 'party', 'candidatevotes', 'totalvotes',
       'candidate', 'national_poll', 'national_poll_prev',
       'national_poll_delta_subtract', 'national_poll_delta_divide',
       'previous_party', 'dem_win_margin_prev', 'rep_win_margin_prev',
       'margin_signed_minus_prev', 'margin_signed_divide_prev',
       'margin_unsigned_minus_prev', 'margin_unsigned_divide_prev',
       'dem_win_prev', 'rep_win_prev', 'dem_win', 'rep_win', 'female_pct',
       'age18_24_pct', 'age25_34_pct', 'median_age', 'unmarried_partner_pct',
       'male_living_alone_pct', 'bachelors_deg_or_higher_pct',
       'past_year_births_to_unmarried_women_pct', 'civilian_veteran_pct',
       'live_same_house_past_year_pct',
       'foreign_born_proportion_from_LatinAmerica',
       'speak_other_language_at_home_pct', 'labor_force_unemployed_pct',
       'public_transit_commuter_pct', 'no_health_insurance_pct',
       'poverty_rate_pct', 'median_housing_value', 'median_household_inc

In [45]:
fec.shape

(3431, 22)

In [None]:
demographics_data.shape

In [None]:
demographics_data.columns