In [2]:
import numpy as np
import pandas as pd
from code.load_data import Census_Data_Loader
from code.featurize import Featurize
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline
pd.options.display.max_rows = 999
import statsmodels.stats.api as smf
from code.build_model import *

import plotly.plotly as py
import plotly.graph_objs as go

In [113]:
class one_party_strat(object):
    '''Simulate one party's field-office placement and its effect on county votes.

    Two GLSAR models are fitted on construction:

    * the full model, whose last three parameters (and their standard errors)
      are stored as the one-office, two-office and cook-interaction effects;
    * a counterfactual model without the office columns, whose predictions
      (scaled by ``CVAP_EST`` and floored at zero) become ``votes_predicted``.

    NOTE(review): this assumes the last three columns of ``X`` are, in order,
    '1_office', '2_office' and 'cook * office_bool' -- confirm against make_X_y.

    The DataFrame passed in is stored by reference and gains new columns.
    '''

    # Design-matrix columns carrying the field-office terms.
    _OFFICE_COLS = ['1_office', '2_office', 'cook * office_bool']

    def __init__(self, df, X, y, num_offices, mod_type='dem'):
        '''Fit both models and cache per-county data.

        df          -- county-level DataFrame with at least 'NAME' and 'CVAP_EST'
        X, y        -- design matrix and response used for both GLSAR fits
        num_offices -- total number of offices place_offices() distributes
        mod_type    -- 'dem' or 'rep'; 'rep' flips the sign of int_std
        '''
        self.df = df
        self.num_offices = num_offices

        model = sm.GLSAR(y, X, rho=1).iterative_fit(1)
        [self.one_coef, self.two_coef, self.int_coef] = model.params[-3:]
        [self.one_std, self.two_std, self.int_std] = model.bse[-3:]

        # Counterfactual: same fit with the office terms removed.
        X_no_offices = X.drop(self._OFFICE_COLS, axis=1)
        counter_model = sm.GLSAR(y, X_no_offices, rho=1).iterative_fit(1)
        self.df['votes_predicted'] = counter_model.predict(X_no_offices) * self.df['CVAP_EST']
        self.df['votes_predicted'] = self.df['votes_predicted'].apply(lambda x: max(x, 0))

        self.data_dict = df.set_index('NAME').to_dict('index')

        if mod_type == 'rep':
            # The signed value feeds get_av_increase() (int_coef + int_std).
            # Sampling in _get_vote_increase() uses its magnitude instead.
            self.int_std = -self.int_std

    def set_params(self, one_coef, two_coef, int_coef, one_std, two_std, int_std):
        '''Override the fitted effect sizes and their standard errors.'''
        self.one_coef, self.two_coef, self.int_coef = one_coef, two_coef, int_coef
        self.one_std, self.two_std, self.int_std = one_std, two_std, int_std

    def _weighted_sample(self, weights, sample_size):
        '''Draw ``sample_size`` positions without replacement, weighted by ``weights``.

        Returns a list of integer positions into ``weights``. The caller's
        ``weights`` object is not modified.
        '''
        # Positional float array: label-indexed inputs (e.g. a pandas Series)
        # are treated by position.
        weights = np.asarray(weights, dtype=float)
        totals = np.cumsum(weights)
        sample = []
        for _ in range(sample_size):
            # np.random keeps every draw in this class on one seedable generator.
            rnd = np.random.random() * totals[-1]
            idx = np.searchsorted(totals, rnd, 'right')
            sample.append(idx)
            # Remove the chosen weight so this position cannot be drawn again.
            totals[idx:] -= weights[idx]
        return sample

    def place_offices(self, weights):
        '''Randomly distribute ``self.num_offices`` offices and simulate votes.

        Counties are visited in a weighted random order; each visited county
        receives one or two offices until the budget is spent. Adds the
        columns 'num_offices_sim', 'votes_added' and 'total_votes' to
        ``self.df`` and refreshes ``self.data_dict``.
        '''
        num_counties = len(self.df)
        options = np.zeros(num_counties)
        idxes = self._weighted_sample(weights, num_counties)

        counter, iterator = self.num_offices, 0
        while counter > 0:
            choice = np.random.choice([1, 2])  # room for improvement
            if counter == 1:
                # Only one office left in the budget.
                choice = 1

            options[idxes[iterator]] = choice
            counter -= choice
            iterator += 1
        self.df['num_offices_sim'] = options
        self.data_dict = self.df.set_index('NAME').to_dict('index')

        # Walk the rows in DataFrame order so each simulated increase lands on
        # the county it was computed for (dict key order is not row order).
        self.df['votes_added'] = [
            self._get_vote_increase(n_sim, county)
            for county, n_sim in zip(self.df['NAME'], options)
        ]
        self.df['total_votes'] = self.df['votes_added'] + self.df['votes_predicted']

    def _get_vote_increase(self, num_offices, county):
        '''Sample the number of votes added in ``county`` by ``num_offices`` offices.

        Returns 0 unless ``num_offices`` is 1 or 2. Otherwise the office and
        cook-interaction effects are drawn from normal distributions centred
        on the fitted coefficients.
        '''
        if num_offices == 1:
            office_effect = np.random.normal(loc=self.one_coef, scale=self.one_std)
        elif num_offices == 2:
            office_effect = np.random.normal(loc=self.two_coef, scale=self.two_std)
        else:
            return 0

        # int_std is stored negated for the 'rep' model; a normal scale must be
        # non-negative, so sample with its magnitude.
        cook_effect = np.random.normal(loc=self.int_coef, scale=abs(self.int_std))

        county_cook = self.data_dict[county]['cook_score']
        county_pop = self.data_dict[county]['CVAP_EST']

        return county_pop * (office_effect + county_cook * cook_effect)

    def get_av_increase(self):
        '''Compute the expected and "max" (coefficient + one std) vote increases.

        Uses the observed '1_office' / '2_office' columns of ``self.df``, adds
        the derived columns to ``self.df`` and returns a selection of columns.
        Negative effects are floored at zero before scaling by ``CVAP_EST``.
        'av_increase % of Predicted' divides by 'votes_predicted', which is
        itself floored at zero, so it is not finite where that is zero.
        '''
        frame = self.df
        frame['vote_effect'] = (frame['cook_score'] * self.int_coef
                                + frame['1_office'] * self.one_coef
                                + frame['2_office'] * self.two_coef)
        frame['max_vote_effect'] = (frame['cook_score'] * (self.int_coef + self.int_std)
                                    + frame['1_office'] * (self.one_coef + self.one_std)
                                    + frame['2_office'] * (self.two_coef + self.two_std))
        frame['av_vote_increase'] = frame['CVAP_EST'] * frame['vote_effect'].apply(lambda x: max([0, x]))
        frame['max_vote_increase'] = frame['CVAP_EST'] * frame['max_vote_effect'].apply(lambda x: max([0, x]))
        frame['av_increase % of Predicted'] = frame['av_vote_increase'] / frame['votes_predicted']
        return frame[['NAME', 'state_abbr', 'votes_predicted', 'max_vote_increase', 'av_vote_increase', 'av_increase % of Predicted', 'vote_effect', 'max_vote_effect', 'cook_score', 'CVAP_EST']]

In [4]:
# Load every raw input, join them into one county-level frame per party and
# build the design matrices used by the models below.
featurizer = Featurize()
print('Loading Data...')  # call form prints the same text on Python 2 and 3
# 2012 Election Data (non-mutating drop keeps this cell safe to re-run)
election_data = pd.read_csv('data/election_2012_cleaned.csv').drop('Unnamed: 0', axis=1)
# 2013 ACS summary data
census_data = featurizer.load_summary_cols()
census_cols = featurizer.load_census_cols()  # not referenced again in the cells visible here
# 2013 Population Data
population = pd.read_csv('data/profile_census/census_pop.csv')  # not referenced again in the cells visible here
# 2013 Citizens of voting age by county
CVAP = featurizer.load_CVAP()
# Location of Field offices 2012
obama_offices = featurizer.load_offices('data/Obama_Office_Locations_Parsed_Cleaned.csv', suffix='dem')
romney_offices = featurizer.load_offices('data/Romney_Office_Locations_Parsed_Cleaned.csv', suffix='rep')
# Turnout by state
total_turnout = featurizer.load_turnout('data/turnout/total_turnout.csv', prefix='tot')  # not referenced again in the cells visible here
dem_turnout = featurizer.load_turnout('data/turnout/democratic_turnout.csv', prefix='dem')
rep_turnout = featurizer.load_turnout('data/turnout/republican_turnout.csv', prefix='rep')
# More county factors
education = featurizer.load_education()
# education = featurizer.normalize_by_col(education, cols=['Percent of adults with a high school diploma only, 2009-2013',
#                                                          'Percent of adults completing some college or associate\'s degree, 2009-2013',
#                                                          'Percent of adults with less than a high school diploma, 2009-2013',
#                                                          'Percent of adults with a bachelor\'s degree or higher, 2009-2013'], col='st_num')
unemployment = featurizer.load_unemployment()
# unemployment = featurizer.normalize_by_col(unemployment, cols=['Unemployment_rate_2013'], col='st_num')
poverty = featurizer.load_poverty()
# poverty = featurizer.normalize_by_col(poverty, cols=['PCTPOVALL_2013', 'PCTPOV017_2013'], col='st_num')
# NOTE(review): `electoral` is reassigned from data/electoral_college.csv in the
# next cell, so this value is never used -- confirm the call can be removed.
electoral = featurizer.get_electoral_df()
print('Making df and fitting NMF...')
obama_df = make_joined_df(census_data, CVAP, dem_turnout, election_data, obama_offices, education, unemployment, poverty, featurizer, mod_type='dem', k=2)
romney_df = make_joined_df(census_data, CVAP, rep_turnout, election_data, romney_offices, education, unemployment, poverty, featurizer, mod_type='rep', k=2)

X_obama, y_obama, feat_names_obama = make_X_y(obama_df, mod_type='dem')
X_romney, y_romney, feat_names_romney = make_X_y(romney_df, mod_type='rep')

Loading Data...
Making df and fitting NMF...



Columns (6,17) have mixed types. Specify dtype option on import or set low_memory=False.



In [5]:
# State abbreviation -> electoral votes lookup, read from a headerless
# tab-separated file. The separator is passed by keyword: newer pandas
# releases accept only the path as a positional argument to read_csv.
electoral = pd.read_csv('data/electoral_college.csv', sep='\t', header=None)
electoral.columns = ['state_abbr', 'electoral_votes']
electoral_dict = electoral.set_index('state_abbr')['electoral_votes'].to_dict()

In [114]:
# Columns removed from each party's design matrix before fitting. Both lists
# share the same layout: CVAP_EST, the party's own vote deltas for five past
# elections, then a common tail.
_delta_years = (2008, 2004, 2000, 1996, 1992)
_common_tail = ['Percent of adults with a high school diploma only, 2009-2013',
                'PCTPOVALL_2013',
                'PCTPOV017_2013',
                'rep_prez_amount',
                'dem_prez_amount',
                'Rep Cont % - Dem Cont %']

drop_cols_dem = ['CVAP_EST'] + ['dem_%d_delta' % year for year in _delta_years] + _common_tail
drop_cols_rep = ['CVAP_EST'] + ['rep_%d_delta' % year for year in _delta_years] + _common_tail

X_obama_dropped = X_obama.drop(drop_cols_dem, axis=1)
X_romney_dropped = X_romney.drop(drop_cols_rep, axis=1)

# One simulation object per party, each distributing 800 offices.
dem = one_party_strat(obama_df, X_obama_dropped, y_obama, 800)
rep = one_party_strat(romney_df, X_romney_dropped, y_romney, 800, mod_type='rep')
# Disabled alternative: reuse the Democratic fit for the Republican object,
# with the interaction coefficient negated.
# model = sm.GLSAR(y_obama, X_obama_dropped, rho=1).iterative_fit(1)
# [one_coef, two_coef, int_coef] = model.params[-3:]
# [one_std, two_std, int_std] = model.bse[-3:]
# rep.set_params(one_coef, two_coef, -int_coef, one_std, two_std, int_std)

In [62]:
def get_plotly_vars(obama_df, romney_df, electoral_dict, thresh=.1):
    '''Aggregate county results to state level and derive plot attributes.

    obama_df, romney_df -- county-level frames with 'state_abbr', 'votes' and
                           'CVAP_EST' columns
    electoral_dict      -- mapping of state abbreviation to electoral votes
    thresh              -- a state is a "close call" when the absolute vote
                           margin divided by its voting-age population is
                           below this value

    Returns ``(state_obama_df, state_romney_df, color, close_calls,
    voting_pop, size, text)``. The list-valued entries are ordered like
    ``state_obama_df.index``: ``color`` is blue where Obama has more votes and
    red otherwise, ``close_calls`` holds marker opacities (1 or .3), ``size``
    holds electoral votes and ``text`` holds hover strings.
    '''
    red = 'rgb(255, 65, 54)'
    blue = 'rgb(93, 164, 214)'

    state_obama_df = obama_df.groupby('state_abbr').sum()
    state_romney_df = romney_df.groupby('state_abbr').sum()

    winner = state_obama_df['votes'] - state_romney_df['votes']
    voting_pop = state_obama_df['CVAP_EST']

    # Series arithmetic aligns on the state label, so no integer-position
    # lookups on a label-indexed Series are needed.
    margin_share = (winner / voting_pop.astype(float)).abs()
    close_calls = [1 if share < thresh else .3 for share in margin_share]

    color = [blue if diff > 0 else red for diff in winner]
    size = [electoral_dict[state] for state in state_obama_df.index]

    text = []
    for (state, row), e_votes in zip(state_obama_df.iterrows(), size):
        # Pair the Romney row by state label rather than by position.
        r_row = state_romney_df.loc[state]
        pop = row['CVAP_EST']
        # Scale to percent before rounding; rounding first and multiplying
        # afterwards leaves artefacts such as 51.300000000000004.
        ob_v = np.round(100 * row['votes'] / float(pop), 1)
        ro_v = np.round(100 * r_row['votes'] / float(pop), 1)
        temp = 'State: %s<br>Electoral Votes: %s<br>Voting Age Pop: %s<br>Obama Vote: %s<br>Romney Vote: %s' %(state, e_votes, pop, str(ob_v)+'%', str(ro_v)+'%')
        text.append(temp)
    return state_obama_df, state_romney_df, color, close_calls, voting_pop, size, text

In [63]:
# Build the state-level frames and marker attributes for the scatter plots.
# thresh=.05 marks a state as a close call when the absolute vote margin is
# under 5% of its voting-age population.
state_obama_df, state_romney_df, color, close_calls, voting_pop, size, text = get_plotly_vars(obama_df, romney_df, electoral_dict, thresh=.05)

In [14]:
# Bubble chart of state vote shares ("naive" view): opacity highlights states
# inside the margin threshold, bubble size grows with log(electoral votes).
# NOTE(review): the other bubble-chart cells in this notebook pass the same
# filename to py.plot; presumably each upload replaces the previous -- confirm.
axis_title_font = {'family': 'Courier New, monospace', 'size': 18, 'color': '#7f7f7f'}

marker_style = dict(color=color, opacity=close_calls, size=np.log(size) * 5)
trace0 = go.Scatter(
    x=state_obama_df['votes'] / voting_pop,
    y=state_romney_df['votes'] / voting_pop,
    mode='markers',
    text=text,
    marker=marker_style,
)
data = [trace0]

x_axis = dict(range=[0, .6], title='% Obama Vote', titlefont=dict(axis_title_font))
y_axis = dict(range=[0, .6], title='% Romney Vote', titlefont=dict(axis_title_font))
layout = go.Layout(
    title='State Results and Electoral Votes - Naive View',
    xaxis=x_axis,
    yaxis=y_axis,
    height=600,
    width=600,
)

fig = go.Figure(data=data, layout=layout)
plot_url = py.plot(fig, filename='bubblechart-color')

In [115]:
# "Smart" close calls: a state is highlighted when the losing party's
# predicted votes plus its maximum modelled increase would exceed the other
# party's predicted votes.
state_inc_dem = dem.get_av_increase().groupby('state_abbr').sum()
state_inc_rep = rep.get_av_increase().groupby('state_abbr').sum()

# True where Obama received more votes than Romney.
winner = state_obama_df['votes'] - state_romney_df['votes']
winner = winner.apply(lambda x: x > 0)

# Plain arrays keep the comparison positional, as in the row-by-row version,
# without integer lookups on label-indexed Series.
# Assumes every frame lists the same states in the same order -- TODO confirm.
dem_won = winner.values.astype(bool)
dem_base = state_inc_dem['votes_predicted'].values
rep_base = state_inc_rep['votes_predicted'].values
dem_best = state_inc_dem['max_vote_increase'].values + dem_base
rep_best = state_inc_rep['max_vote_increase'].values + rep_base

dem_could_flip = (dem_best > rep_base) & ~dem_won
rep_could_flip = (rep_best > dem_base) & dem_won

# Opacity 1 for flippable states, .3 for the rest.
close_calls_smart = np.where(dem_could_flip | rep_could_flip, 1, .3)
        


In [10]:
# Show the per-state Democratic aggregate.
# NOTE: when state_inc_dem comes straight from groupby().sum(), its
# 'av_increase % of Predicted' column is a sum of county-level ratios,
# not a state-level ratio.
state_inc_dem

Unnamed: 0_level_0,votes_predicted,max_vote_increase,av_vote_increase,av_increase % of Predicted
state_abbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AL,760055.976313,7618.533949,5763.228015,0.213998
AR,386257.542558,4367.620181,3306.449486,0.100005
AZ,885629.090786,27683.787888,22965.203479,0.103148
CA,8829897.727582,398181.105919,316609.068594,0.538684
CO,1288529.638014,96944.128241,83767.263331,1.33065
CT,1014518.802045,24156.52115,17848.85776,0.087571
DC,251068.47192,15046.513195,10696.236679,0.042603
DE,246279.563772,7850.191957,5785.210676,0.035664
FL,3921530.669771,326947.741178,283696.735881,2.046797
GA,1848035.015695,28613.26942,20519.306964,0.339898


In [65]:
# Bubble chart of state vote shares ("smart" view): opacity comes from
# close_calls_smart, bubble size grows with log(electoral votes).
# NOTE(review): the other bubble-chart cells in this notebook pass the same
# filename to py.plot; presumably each upload replaces the previous -- confirm.
axis_title_font = {'family': 'Courier New, monospace', 'size': 18, 'color': '#7f7f7f'}

marker_style = dict(color=color, opacity=close_calls_smart, size=np.log(size) * 5)
trace0 = go.Scatter(
    x=state_obama_df['votes'] / voting_pop,
    y=state_romney_df['votes'] / voting_pop,
    mode='markers',
    text=text,
    marker=marker_style,
)
data = [trace0]

x_axis = dict(range=[0, .6], title='% Obama Vote', titlefont=dict(axis_title_font))
y_axis = dict(range=[0, .6], title='% Romney Vote', titlefont=dict(axis_title_font))
layout = go.Layout(
    title='State Results and Electoral Votes - Smart View',
    xaxis=x_axis,
    yaxis=y_axis,
    height=600,
    width=600,
)

fig = go.Figure(data=data, layout=layout)
plot_url = py.plot(fig, filename='bubblechart-color')

In [66]:
# Re-aggregate to state level, then recompute the percentage from the state
# totals: summing the county-level ratios does not give a state-level ratio.
pct_col = 'av_increase % of Predicted'
state_totals = []
for strat in (dem, rep):
    totals = strat.get_av_increase().groupby('state_abbr').sum()
    totals[pct_col] = totals['av_vote_increase'] / totals['votes_predicted']
    state_totals.append(totals)
state_inc_dem, state_inc_rep = state_totals
state_inc_dem.head()

Unnamed: 0_level_0,votes_predicted,max_vote_increase,av_vote_increase,av_increase % of Predicted,vote_effect,max_vote_effect,cook_score
state_abbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AL,760055.976313,7618.533949,5763.228015,0.007583,-0.314152,-0.472241,-9.39363
AR,386257.542558,4367.620181,3306.449486,0.00856,-0.395182,-0.593515,-11.748698
AZ,885629.090786,27683.787888,22965.203479,0.025931,-0.023408,-0.039159,-1.206378
CA,8829897.727582,398181.105919,316609.068594,0.035856,0.127721,0.160525,0.632029
CO,1288529.638014,96944.128241,83767.263331,0.06501,0.28653,0.28632,-4.892152


In [15]:
# Back-to-back horizontal bars: the Republican effect is negated so it extends
# left, the Democratic effect extends right. States are listed in reverse
# index order.
tracea = go.Bar(
    y=state_inc_rep.index[::-1],
    x=-state_inc_rep['av_increase % of Predicted'][::-1],
    name='Republican Effect',
    orientation = 'h',
    marker = dict(
        # 'rgba', not 'rbga': the misspelt prefix is not a valid colour string.
        color = 'rgba(255, 65, 54, .6)',
        line = dict(
            color = 'rgba(255, 65, 54, 1.0)',
            width = 1,
        )
    )
)
traceb = go.Bar(
    # Take the labels from the same frame as the values so they cannot drift
    # apart if the two frames ever list different states.
    y=state_inc_dem.index[::-1],
    x=state_inc_dem['av_increase % of Predicted'][::-1],
    name='Democratic Effect',
    orientation = 'h',
    marker = dict(
        color = 'rgba(93, 164, 214, .6)',
        line = dict(
            color = 'rgba(93, 164, 214, 1.0)',
            width = 1,
        )
    )
)
data = [tracea, traceb]
layout = go.Layout(
    barmode='overlay',
    title='Average Vote Increase by State',
)
fig = go.Figure(data=data, layout=layout)
plot_url = py.plot(fig, filename='marker-h-bar')

In [117]:
# County-level increase frames for both parties, plus a mask selecting the
# counties of the states examined in the following cells.
rep_df = rep.get_av_increase()
dem_df = dem.get_av_increase()
# Element-wise sum of the two frames (aligned on index and column labels).
# NOTE(review): `df` is not referenced by any later cell visible here -- confirm
# whether it is still needed.
df = dem_df + rep_df
# The addition also combines the string columns, so restore them from rep_df.
df['NAME'] = rep_df['NAME']
df['state_abbr'] = rep_df['state_abbr']
states = ['NH', 'OH', 'FL', 'NC', 'VA', 'CA', 'MO', 'IN']
# Boolean mask built from rep_df; later cells also apply it to dem_df.
# presumably both frames share the same row index -- verify.
mask = rep_df['state_abbr'].isin(states)

In [132]:
# For each selected state, find the county with the largest average vote
# increase and collect the attributes plotted in the next cell.
red = 'rgb(255, 65, 54)'
blue = 'rgb(93, 164, 214)'


def _top_county_per_state(frame):
    '''Return one row per state: the county with the largest av_vote_increase.

    The result is indexed by state_abbr in sorted order. A stable sort keeps
    the first-listed county on ties.
    '''
    ranked = frame.sort_values('av_vote_increase', ascending=False, kind='mergesort')
    return ranked.drop_duplicates('state_abbr').set_index('state_abbr').sort_index()


rep_by_state = rep_df[mask].groupby('state_abbr')
dem_by_state = dem_df[mask].groupby('state_abbr')

state_list = rep_by_state['NAME'].count().index
count_per_state = rep_by_state['NAME'].count().values
# Largest single-county increase relative to the state's total predicted votes.
size_effect_rep = rep_by_state['av_vote_increase'].max() / rep_by_state['votes_predicted'].sum()
size_effect_dem = dem_by_state['av_vote_increase'].max() / dem_by_state['votes_predicted'].sum()

# One entry per state, in state_list (alphabetical) order.
# NOTE(review): hand-maintained -- confirm each entry against the data.
winner = ['dem', 'rep', 'rep', 'rep', 'rep', 'dem', 'dem', 'dem']
assert len(winner) == len(state_list), 'winner must have one entry per selected state'

# groupby().max()['NAME'] would give the alphabetically last county name, not
# the county with the largest effect, so pick the arg-max row explicitly.
rep_top = _top_county_per_state(rep_df[mask])
dem_top = _top_county_per_state(dem_df[mask])
rep_counties = rep_top['NAME']
dem_counties = dem_top['NAME']

# NOTE(review): the effect comes from the *other* party's frame while the
# county, cook score and population come from the winner's frame, as before
# -- confirm this pairing is intended.
size_effect, county_list, cook_scores, colors, county_pop = [], [], [], [], []
for state, party in zip(state_list, winner):
    if party == 'dem':
        effect_source, top_rows, party_color = size_effect_rep, dem_top, blue
    else:
        effect_source, top_rows, party_color = size_effect_dem, rep_top, red
    size_effect.append(effect_source.loc[state])
    county_list.append(top_rows.loc[state, 'NAME'])
    cook_scores.append(top_rows.loc[state, 'cook_score'])
    county_pop.append(top_rows.loc[state, 'CVAP_EST'])
    colors.append(party_color)

state_pop = rep_by_state['CVAP_EST'].sum().values

# Share of the state's voting-age population living in the chosen county.
size = np.array(county_pop) / state_pop.astype(float)

text = [
    'State: %s<br>County: %s<br>Cook Score: %s<br>Vote Effect: %s' %(state, county, cook, effect)
    for state, county, cook, effect in zip(state_list, county_list, cook_scores, size_effect)
]

    

# print state_list
# print size_effect

In [137]:
# Bubble chart: number of counties per state against the largest single-county
# vote effect; bubble size is the county's share of state population, scaled.
# NOTE(review): the other bubble-chart cells in this notebook pass the same
# filename to py.plot; presumably each upload replaces the previous -- confirm.
axis_title_font = {'family': 'Courier New, monospace', 'size': 18, 'color': '#7f7f7f'}

marker_style = dict(color=colors, size=size * 3000)
trace0 = go.Scatter(
    x=count_per_state,
    y=size_effect,
    mode='markers',
    text=text,
    marker=marker_style,
)
data = [trace0]

x_axis = dict(title='Counties per State', titlefont=dict(axis_title_font))
y_axis = dict(title='Max Size of Vote Effect', titlefont=dict(axis_title_font))
layout = go.Layout(
    title='States by Most Influential county',
    showlegend=True,
    xaxis=x_axis,
    yaxis=y_axis,
)

fig = go.Figure(data=data, layout=layout)
plot_url = py.plot(fig, filename='bubblechart-color')

NameError: name 'blue' is not defined