In [None]:
%matplotlib inline 

from fnmatch import fnmatch
from IPython.display import display, HTML
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from pattern import web


from matplotlib import rcParams

# Eight RGB triples used as the default line-colour cycle for every figure.
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
# 'axes.color_cycle' was replaced by 'axes.prop_cycle' and later removed from
# matplotlib, so assigning it raises KeyError on current releases. Prefer the
# new key when it exists and only fall back to the old one otherwise.
if 'axes.prop_cycle' in rcParams:
    rcParams['axes.prop_cycle'] = plt.cycler('color', dark2_colors)
else:
    rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = True
rcParams['axes.facecolor'] = '#eeeeee'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'



In [None]:
def get_poll_xml(poll_id):
    """
    Download the RealClearPolitics chart XML for one poll.

    poll_id is coerced with int(), so numeric strings are accepted.
    Returns the response body as text.
    """
    chart_url = "http://charts.realclearpolitics.com/charts/%i.xml" % int(poll_id)
    response = requests.get(chart_url)
    return response.text

In [None]:
import re

def _strip(s):
    return re.sub(r'[\W_]+', '', s)

def plot_colors(xml):
    """
    Map each <graph> element's title to its color attribute.

    Titles are passed through _strip() before being used as keys.
    Returns a dict; it is empty when the XML contains no <graph> elements.
    """
    graphs = web.Element(xml).by_tag('graph')
    # Build (key, value) pairs so the title is read before the color,
    # and a repeated title keeps the color of its last occurrence.
    return dict(
        (_strip(g.attributes['title']), g.attributes['color'])
        for g in graphs
    )

In [None]:
def rcp_poll_data(xml):
    """
    Parse chart XML into a DataFrame sorted by date.

    The first <series> element supplies the dates (keyed by each value's
    `xid` attribute). Every <graph> element becomes one column named after
    its title; empty values become NaN.
    """
    dom = web.Element(xml)

    # xid -> raw date string, taken from the first <series> element
    series = dom.by_tag('series')[0]
    date_by_xid = {}
    for node in series.by_tag('value'):
        date_by_xid[node.attributes['xid']] = str(node.content)

    # The same xid sequence is reused below so every column lines up row-for-row
    xids = date_by_xid.keys()

    columns = {}
    columns['date'] = pd.to_datetime([date_by_xid[x] for x in xids])

    for graph in dom.by_tag('graph'):
        title = graph.attributes['title']
        value_by_xid = {}
        for node in graph.by_tag('value'):
            value_by_xid[node.attributes['xid']] = (
                float(node.content) if node.content else np.nan
            )
        columns[title] = [value_by_xid[x] for x in xids]

    frame = pd.DataFrame(columns)
    return frame.sort_values(by=['date'])

In [None]:
def poll_plot(poll_id):
    """
    Plot the normalized polling history for one RealClearPolitics chart.

    The chart XML is downloaded, column names are reduced with _strip(),
    and each row is rescaled so the plotted columns sum to 100. One line is
    drawn per graph, using the color given in the XML. Draws on the current
    matplotlib figure and returns None.
    """
    xml = get_poll_xml(poll_id)
    data = rcp_poll_data(xml)
    colors = plot_colors(xml)

    data = data.rename(columns={c: _strip(c) for c in data.columns})

    # Use a real list for column selection: a dict keys() view is unhashable
    # and is not reliably treated as a list of labels on Python 3.
    candidates = list(colors)

    norm = data[candidates].sum(axis=1) / 100
    for c in candidates:
        data[c] /= norm

    for label, color in colors.items():
        plt.plot(data.date, data[label], color=color, label=label)

    plt.xticks(rotation=70)
    plt.legend(loc='best')
    plt.xlabel("Date")
    plt.ylabel("Normalized Poll Percentage")

In [None]:
# Plot the normalized polling history for RealClearPolitics chart 1044.
poll_plot(1044)
plt.title("Obama Job Approval")

In [None]:
def is_gov_race(l):
    """Return True when the URL `l` matches the glob for an RCP governor-race page."""
    gov_race_glob = 'http://www.realclearpolitics.com/epolls/????/governor/??/*-*.html'
    matched = fnmatch(l, gov_race_glob)
    return matched
        
def find_governor_races(html):
    """
    Collect the distinct governor-race URLs linked from an HTML page.

    Every <a> element's href is tested with is_gov_race(). Each matching
    URL is returned once, in the order it first appears in the document,
    so repeated runs visit the races in the same order.
    """
    dom = web.Element(html)

    seen = set()
    links = []
    for a in dom.by_tag('a'):
        href = a.attributes.get('href', '')
        # Eliminate duplicates while keeping first-seen order
        if is_gov_race(href) and href not in seen:
            seen.add(href)
            links.append(href)
    return links

In [None]:
def race_result(url):
    """
    Read the final result of a race from its RCP page.

    Returns a dict mapping each header label to its share of the vote,
    rescaled so the listed shares sum to 100.
    """
    page_dom = web.Element(requests.get(url).text)
    polling_table = page_dom.by_tag('div#polling-data-rcp')[0]

    # Cells of the first row tagged 'final'. The slice skips the first three
    # cells and the last one -- presumably non-candidate columns; TODO confirm
    # against the page layout.
    final_cells = polling_table.by_tag('tr.final')[0].by_tag('td')
    shares = []
    for cell in final_cells[3:-1]:
        shares.append(float(cell.content))
    scale = sum(shares) / 100

    # Header cells, sliced the same way; anything from the first '(' on is dropped
    header_cells = polling_table.by_tag('th')
    names = []
    for cell in header_cells[3:-1]:
        names.append(str(cell.content).split('(')[0].strip())

    normalized = {}
    for name, share in zip(names, shares):
        normalized[name] = share / scale
    return normalized

In [None]:
def id_from_url(url):
    """Return the text between the last '-' in `url` and the following '.html'."""
    tail = url.rsplit('-', 1)[-1]
    poll_id, _, _ = tail.partition('.html')
    return poll_id

def plot_race(url):
    """
    Plot the polling history of one race with its final result overlaid.

    Does nothing when the chart XML for the race has no graphs. Otherwise
    the polls are drawn by poll_plot() and each final share is added as a
    dashed horizontal line in the matching color.
    """
    poll_id = id_from_url(url)
    colors = plot_colors(get_poll_xml(poll_id))

    # No graphs in the chart XML: nothing to draw
    if not colors:
        return

    outcome = race_result(url)

    poll_plot(poll_id)
    plt.xlabel("Date")
    plt.ylabel("Polling Percentage")
    for candidate, share in outcome.items():
        plt.axhline(share, color=colors[_strip(candidate)], alpha=0.6, ls='--')

In [None]:
# Page that links to the individual 2010 governor races; non-ASCII characters
# are dropped from the downloaded text.
governor_map_url = 'http://www.realclearpolitics.com/epolls/2010/governor/2010_elections_governor_map.html'
page = requests.get(governor_map_url).text.encode('ascii', 'ignore')

# One figure per race linked from the page
for race in find_governor_races(page):
    plot_race(race)
    plt.show()

In [None]:
def party_from_color(color):
    """
    Classify a chart color as 'democrat', 'republican' or 'other'.

    Matching is exact and case-sensitive against the known color strings.
    """
    known_parties = (
        ('democrat', ('#0000CC', '#3B5998')),
        ('republican', ('#FF0000', '#D30015')),
    )
    for party, hex_codes in known_parties:
        if color in hex_codes:
            return party
    return 'other'


def error_data(url):
    """
    Build a table of polling errors for one race.

    The race's chart XML is resampled to daily means, each day is rescaled
    so the candidates sum to 100, and the final result from race_result()
    is subtracted.

    Returns a DataFrame with one row per (candidate, day) and the columns
    'percentage', 'error', 'candidate', 'party' and 'forecast_length'
    (days between that day and the last day with data). Returns an empty
    DataFrame when the chart XML has no graphs.
    """
    poll_id = id_from_url(url)
    xml = get_poll_xml(poll_id)

    colors = plot_colors(xml)
    if len(colors) == 0:
        return pd.DataFrame()

    df = rcp_poll_data(xml)
    df = df.rename(columns={c: _strip(c) for c in df.columns})

    # Build a new dict instead of inserting into the one being iterated,
    # which raises RuntimeError on Python 3.
    outcome = {_strip(k): v for k, v in race_result(url).items()}

    # Select the candidate columns by name. Compare with != (string identity
    # is not guaranteed) and do not rely on where the 'date' column sits.
    candidates = [c for c in df.columns if c != 'date']

    # Daily means of the candidate columns; days missing any candidate are dropped.
    daily = df.set_index('date')[candidates].resample('D').mean().dropna()

    # Days between each row and the most recent row. Taken from the resampled
    # index because the 'date' column is not guaranteed to survive the mean.
    forecast_length = (daily.index.max() - daily.index).values
    forecast_length = forecast_length / np.timedelta64(1, 'D')  # convert to number of days

    totals = daily[candidates].sum(axis=1).values
    normalized = {}
    errors = {}
    for c in candidates:
        share = daily[c].values / totals * 100.
        normalized[c] = share
        errors[c] = share - outcome[c]

    n = forecast_length.size

    # np.hstack needs a real sequence; generators are rejected by newer NumPy.
    out = {}
    out['percentage'] = np.hstack([normalized[c] for c in candidates])
    out['error'] = np.hstack([errors[c] for c in candidates])
    out['candidate'] = np.hstack([np.repeat(c, n) for c in candidates])
    out['party'] = np.hstack([np.repeat(party_from_color(colors[c]), n) for c in candidates])
    out['forecast_length'] = np.hstack([forecast_length for _ in candidates])

    return pd.DataFrame(out)

In [None]:
def all_error_data():
    """
    Concatenate error_data() for every governor race linked from `page`.

    Reads the module-level `page` HTML. The combined frame gets a fresh
    0..n-1 index.
    """
    frames = []
    for race_page in find_governor_races(page):
        frames.append(error_data(race_page))
    return pd.concat(frames, ignore_index=True)

In [None]:
# Gather the polling-error rows for every governor race linked from `page`.
errors = all_error_data()

In [None]:
# Histogram of the `error` column over all races, in 50 bins.
errors.error.hist(bins=50)
plt.xlabel("Polling Error")
plt.ylabel('N')

In [None]:
# Standard deviation of the `error` column over all rows.
errors.error.std()

In [None]:
# Spread of the polling error for rows less than 7 days, and more than 30 days,
# before the last day with data. print() with one parenthesised argument
# behaves the same on Python 2 and Python 3.
print("< 7 days: %0.2f" % errors[errors.forecast_length < 7].error.std())
print(">30 days: %0.2f" % errors[errors.forecast_length > 30].error.std())

In [None]:
def bootstrap_result(c1, c2, errors, nsample=1000, seed=None):
    """
    Estimate each candidate's win probability by resampling polling errors.

    c1, c2  -- current poll numbers of the two candidates; they are rescaled
               to sum to 100 before use.
    errors  -- DataFrame with an 'error' column to draw from (with replacement).
    nsample -- number of bootstrap draws.
    seed    -- optional seed for a private random generator. When None the
               global numpy random state is used, as before.

    Each draw subtracts the sampled error from c1 and adds it to c2.
    Returns (p1, p2): the fraction of draws in which candidate 1 ends up
    strictly ahead, and its complement.
    """
    tot = (c1 + c2)
    c1 = 100. * c1 / tot
    c2 = 100. * c2 / tot

    rng = np.random if seed is None else np.random.RandomState(seed)
    indices = rng.randint(0, errors.shape[0], nsample)
    # Positional selection; Series.irow is no longer available in pandas.
    sampled = errors['error'].iloc[indices].values

    c1_actual = c1 - sampled
    c2_actual = c2 + sampled

    p1 = (c1_actual > c2_actual).mean()
    p2 = 1 - p1
    return p1, p2


# Bootstrap win probabilities for two races from hard-coded poll numbers,
# using the error sample gathered in `errors`. The print calls use the
# single-argument parenthesised form so the cell runs on Python 2 and 3.
nsample = 10000
mcauliffe, cuccinelli = 43.0, 39.0

pm, pc = bootstrap_result(mcauliffe, cuccinelli, errors, nsample=nsample)
print("Virginia Race")
print("-------------------------")
print("P(McAuliffe wins)  = %0.2f" % pm)
print("P(Cuccinelli wins) = %0.2f" % pc)

#new jersey
print("\n\n")
print("New Jersey Race")
print("-----------------------")
christie, buono = 55.4, 31.8
pc, pb = bootstrap_result(christie, buono, errors, nsample=nsample)
print("P(Christie wins) = %0.2f" % pc)
print("P(Buono wins)    = %0.2f" % pb)