In [68]:
import random
import numpy as np
import pandas as pd
from pandas import *
import matplotlib.pyplot as plt
import math
from datetime import datetime, date, timedelta
from scipy.stats.stats import spearmanr 
from scipy import stats
import re
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix
import pickle
import us
plt.style.use(['seaborn-darkgrid'])
import datetime as dt

import warnings
warnings.filterwarnings('ignore')

### Reading and reformatting data

In [85]:
def load_polls(path):
    """Load a pickled poll DataFrame and add a yearly-period ``year`` column.

    Parameters
    ----------
    path : str
        Path to a pickle file holding a DataFrame with a datetime ``date`` column.

    Returns
    -------
    pandas.DataFrame
        The unpickled frame with ``year`` set to ``date`` converted to a 'Y' period.
    """
    # NOTE: unpickling can execute arbitrary code -- only load pickle files
    # from a trusted source.
    polls = pd.read_pickle(path)
    polls['year'] = polls['date'].dt.to_period('Y')
    return polls


hp = load_polls('HousePollwithWinner_Final.pkl')
# Create unique index for race name (district, state, year)
hp['race_name'] = hp['race_name'] + ', ' + hp['year'].dt.strftime('%Y')

sp = load_polls('SenatePollwithWinner_Final.pkl')

gp = load_polls('GovernorPollwithWinner_Final.pkl')

### Building the model

In [91]:
# Build a dictionary keyed by race name whose values are DataFrames holding
# every poll row for that race, restricted to the columns used downstream.
# Rows are selected with a boolean mask, so hp itself is not modified
# (no per-race indicator columns are added to it).
RACE_COLUMNS = ['race_name', 'state', 'poll_results', 'poll_leader',
                'poll_spread', 'date', 'winner', 'election_date']

race_dict = {}
for race in hp['race_name'].unique():
    race_polls = hp.loc[hp['race_name'].isin([race]), RACE_COLUMNS]
    # There are entries still marked as <BLANK>...
    race_polls = race_polls.replace('<BLANK>', np.nan)
    race_polls['election_date'] = pd.to_datetime(race_polls['election_date'])
    race_dict[race] = race_polls

In [92]:
# Example entry from race_dict: the poll rows selected for a single race
race_dict['pennsylvania 12th district special election, 2010']

Unnamed: 0,race_name,state,poll_results,poll_leader,poll_spread,date,winner,election_date
0,"pennsylvania 12th district special election, 2010",PA,"{'burns': 44.0, 'critz': 41.0}",burns,3.0,2010-04-20,critz,2010-11-02
6,"pennsylvania 12th district special election, 2010",PA,"{'burns': 46.0, 'critz': 40.0}",burns,6.0,2010-05-01,critz,2010-11-02
7,"pennsylvania 12th district special election, 2010",PA,"{'critz': 40.0, 'burns': 46.0}",burns,6.0,2010-05-01,critz,2010-11-02
9,"pennsylvania 12th district special election, 2010",PA,"{'critz': 44.0, 'burns': 38.0}",critz,6.0,2010-05-12,critz,2010-11-02
10,"pennsylvania 12th district special election, 2010",PA,"{'burns': 38.0, 'critz': 44.0}",critz,6.0,2010-05-12,critz,2010-11-02
11,"pennsylvania 12th district special election, 2010",PA,"{'critz': 47.0, 'burns': 48.0}",burns,1.0,2010-05-17,critz,2010-11-02


In [96]:
# From the dictionaries within poll results (for specific elections) create a
# time series of poll data (formatted as df): one row per poll date, one column
# per candidate, with polls sharing a date averaged together.
# The last row added to each df is the date of the election (all values NaN).
# Put these time series in a dictionary with the key being race name.
race_dict_updated = {}
for race_name, race_polls in race_dict.items():
    # Named poll_dates (not `date`) so the imported datetime.date is not clobbered.
    poll_dates = pd.to_datetime(race_polls['date']).reset_index(drop=True)
    # One row per poll, one column per candidate, built in a single call
    # rather than by concatenating a one-row frame per poll.
    poll_numbers = pd.DataFrame(list(race_polls['poll_results']))
    timeline = (
        pd.concat([poll_dates, poll_numbers], axis=1)
        .set_index('date')
        .groupby('date')
        .mean()
    )
    # Add the election-date row; the date is taken from the race's first poll row.
    timeline.loc[race_polls['election_date'].iloc[0]] = np.nan
    race_dict_updated[race_name] = timeline

In [101]:
# Example entry from race_dict_updated: the date-indexed poll time series for a single race
race_dict_updated['pennsylvania 12th district special election, 2010']

Unnamed: 0_level_0,burns,critz
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-04-20,44.0,41.0
2010-05-01,46.0,40.0
2010-05-12,38.0,44.0
2010-05-17,48.0,47.0
2010-11-02,,


In [100]:
# Example from race_dict_updated -- some races only have one poll taken
race_dict_updated['alaska at-large district, 2018']

Unnamed: 0_level_0,galvin,young
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-11-02,49.0,48.0
NaT,,


In [103]:
### MODEL METHODOLOGY BRIEFLY EXPLAINED HERE ###

# Projection for the day of the election, per candidate:
#   1. take the change between consecutive rows of the poll time series and
#      divide it by the number of days between those rows (rate of change per day);
#   2. smooth the rates with an exponential moving average (span = EWM_SPAN);
#   3. multiply the smoothed rate at the last row by the day gap of the last
#      row (the election-date row, i.e. days from the last poll to the election).
#
# NOTE(review): the value stored here is only that projected change. The most
# recent poll is NOT added to it, so it is not yet a vote-share prediction --
# confirm whether the last poll level should be added.

# Projections are in a dictionary with the key being race name
EWM_SPAN = 3

race_dict_final = {}
for race_name, timeline in race_dict_updated.items():
    # Read from race_dict_updated (the time series), never from race_dict_final,
    # so this cell runs on a fresh kernel and is safe to re-run.
    polling_diff = timeline.reset_index().diff()
    polling_diff['date'] = polling_diff['date'].dt.days
    polling_diff = polling_diff.set_index('date')

    daily_rate = polling_diff.div(polling_diff.index, axis=0)
    smoothed_rate = (
        daily_rate.ewm(span=EWM_SPAN, min_periods=1).mean().reset_index(drop=True)
    )
    final_polling = (
        smoothed_rate[-1:]
        .multiply(polling_diff.index[-1:], axis=0)
        .reset_index(drop=True)
    )
    race_dict_final[race_name] = final_polling

In [105]:
# Unbelievably inaccurate
race_dict_final['pennsylvania 12th district special election, 2010']

Unnamed: 0,burns,critz
0,162.415584,73.306494


In [104]:
# With too little data we output nan (this race has a single poll and no
# election date, so no rate of change can be computed)
race_dict_final['alaska at-large district, 2018']

Unnamed: 0,galvin,young
0,,


### Results
* Plenty of nans (case where we don't have an election date and/or we have <3 total polls taken)
* Most values that aren't nan are wrong, mostly because there's often a huge gap between the last poll and the date of the election
* For example, in PA District 12 the six polls taken are on average 10 days apart, but the actual election is 6 months from the last poll!
* Very small sample sizes and predictions far outside our date range are a recipe for disaster (even with this ultra-simple model)
* **This makes accurate prediction/extrapolation next to impossible**

In [110]:
# Conversion of the pretty dict into an ugly df
# Demonstrative of my points above

pd.DataFrame.from_dict(race_dict_final, orient='index')

Unnamed: 0,0
"pennsylvania 12th district special election, 2010",burns critz 0 162.415584 73.306494
"new hampshire 1st district, 2010",guinta shea-porter 0 -2.357714 -4.275143
"new hampshire 2nd district, 2010",bass kuster swett 0 -38.393226 8....
"maryland 1st district, 2010",harris kratovil 0 37.721519 37.721519
"hawaii 1st district special election, 2010",djou case hanabusa 0 NaN NaN NaN
"south carolina 4th district, 2010",gowdy inglis lee thomas jeffrey 0 Na...
"north carolina 2nd district, 2010",ellmers etheridge rose 0 NaN ...
"minnesota 6th district, 2010",anderson bachmann clark 0 NaN 0...
"nevada 3rd district, 2010",heck titus 0 0.560192 -0.104884
"virginia 5th district, 2010",hurt perriello 0 -8.722852 -0.858313
