In [42]:
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource, LinearColorMapper
import re

In [2]:
# Path of the log file opened by the data-loading cell.
filepath = "sample_data.log"
# Configure Bokeh to write its output to this HTML file.
output_file("journals_verification.html")

In [33]:
class JournalCollection:
    """Groups every edition of one journal under a single title.

    Raw ids such as americanjournalo02amer, americanjournalo03amer,
    americanjournalo05amer and americanjournalo06amer are editions of the same
    journal. Rather than keeping one data point per edition, all of them are
    stored in one collection.

    Only the Verified to Unverified Ratio of each edition is tracked for now.
    """

    def __init__(self, title):
        # Maps an edition to that edition's ratio; starts out empty.
        self.edition_to_ratio = {}
        self.title = title

    def get_average_ratio(self):
        """Return the arithmetic mean of the stored ratios, or 0 if there are none."""
        count = len(self.edition_to_ratio)
        return sum(self.edition_to_ratio.values()) / count if count else 0
    
def extract_journal(name):
    """Split a raw journal id into its title prefix and edition number.

    The edition is the first run of decimal digits found in ``name``; the
    title is everything that precedes that run.

    Args:
        name: Raw journal id, e.g. "americanjournalo02amer".

    Returns:
        A ``(title, edition)`` tuple such as ``("americanjournalo", 2)``, or
        ``("", 0)`` when ``name`` contains no digits.
    """
    # Raw string: "\d" inside a normal string literal is an invalid escape sequence.
    match = re.search(r"\d+", name)
    if match is None:
        return "", 0
    return name[:match.start()], int(match.group())

In [38]:
# Setting up our data by mapping from the journal name to a JournalCollection object.
# The log is consumed as consecutive line pairs: a journal-id line followed by a
# line whose last whitespace-separated field is parsed as the ratio.
MAX_RECORDS = 37500  # upper bound on the number of (id, ratio) line pairs to read
journals = dict()
with open(filepath, "r") as f:
    for _ in range(MAX_RECORDS):
        name_line = f.readline()
        ratio_line = f.readline()
        # readline() returns "" at end of file; stop cleanly instead of raising
        # IndexError when the log holds fewer pairs than MAX_RECORDS.
        if not name_line or not ratio_line:
            break
        name, edition = extract_journal(name_line)
        ratio = float(ratio_line.split()[-1])

        # Look up first so a new collection is only built for unseen names.
        journal = journals.get(name)
        if journal is None:
            journal = journals[name] = JournalCollection(name)
        journal.edition_to_ratio[edition] = ratio

In [39]:
# Average ratio per journal name, and the names in the same order.
data = {k : v.get_average_ratio() for (k, v) in journals.items()}
keys = list(data.keys())
# One row per journal so the hover tool can look fields up by column name.
source = ColumnDataSource(data = dict(
    x = list(range(len(keys))),
    ratio = [data[k] for k in keys],
    journal = keys,
))
# "$index" is the point's position; "@column" reads the matching column of `source`,
# so hovering identifies the journal instead of showing only its index.
TOOLTIPS = [("index", "$index"), ("journal", "@journal"), ("ratio", "@ratio")]
plot = figure(plot_height = 1000, 
              plot_width = 1000, 
              title = "Journal vs Verification Ratio", 
              tools = "pan, crosshair, wheel_zoom, reset, hover", 
              toolbar_location = "below", 
              toolbar_sticky = False,
              tooltips = TOOLTIPS)
plot.circle(x = "x", y = "ratio", source = source, tags = keys, size = 10, color = "blue", alpha = 0.5)
show(plot)

In [43]:
from math import pi
import pandas as pd

from bokeh.io import show
from bokeh.models import LinearColorMapper, BasicTicker, PrintfTickFormatter, ColorBar
from bokeh.plotting import figure
from bokeh.sampledata.unemployment1948 import data

# Build a reshaped copy: Year becomes a string index, the 'Annual' column is
# dropped, and the remaining columns axis is named 'Month'.
# Chaining returns new frames, so nothing is written into the DataFrame object
# imported from bokeh.sampledata and no inplace=True is needed.
data = (
    data.assign(Year=data['Year'].astype(str))
        .set_index('Year')
        .drop('Annual', axis=1)
        .rename_axis('Month', axis=1)
)

years = list(data.index)
months = list(data.columns)

# reshape to a 1D table of rates with a month and year for each row.
df = pd.DataFrame(data.stack(), columns=['rate']).reset_index()

# this is the colormap from the original NYTimes plot
colors = ["#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", "#dfccce", "#ddb7b1", "#cc7878", "#933b41", "#550b1d"]
# Map the 'rate' column onto the palette, spanning its observed min..max.
mapper = LinearColorMapper(palette=colors, low=df.rate.min(), high=df.rate.max())

# Toolbar tools enabled on the figure.
TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"

# Years go on the x axis (labels above the plot), months on the y axis in
# reversed column order. Tooltips read the Month, Year and rate columns.
p = figure(title="US Unemployment ({0} - {1})".format(years[0], years[-1]),
           x_range=years, y_range=list(reversed(months)),
           x_axis_location="above", plot_width=900, plot_height=400,
           tools=TOOLS, toolbar_location='below',
           tooltips=[('date', '@Month @Year'), ('rate', '@rate%')])

# Strip grid lines, axis lines and tick marks; keep small, rotated labels.
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "5pt"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = pi / 3

# One 1x1 rectangle per row of df, filled according to its rate via the mapper.
p.rect(x="Year", y="Month", width=1, height=1,
       source=df,
       fill_color={'field': 'rate', 'transform': mapper},
       line_color=None)

# Legend for the colour scale: one tick per palette colour, formatted as a percentage.
color_bar = ColorBar(color_mapper=mapper, major_label_text_font_size="5pt",
                     ticker=BasicTicker(desired_num_ticks=len(colors)),
                     formatter=PrintfTickFormatter(format="%d%%"),
                     label_standoff=6, border_line_color=None, location=(0, 0))
p.add_layout(color_bar, 'right')

show(p)      # show the plot