In [1]:
from pathlib import Path
import pandas as pd
from bokeh.plotting import figure, ColumnDataSource, show
from bokeh.models import HoverTool
from bokeh.models.mappers import LinearColorMapper
from bokeh.palettes import brewer
from bokeh.io import output_notebook
from math import sqrt, log, exp

# Configure Bokeh so that subsequent show() calls render inside notebook cells.
output_notebook()

In [42]:
def comet_chart(df, columns=None, **kwargs):
    """
    Generates comet-charts as originally described by Zan Armstrong with bokeh.figure.patches().
    See https://www.zanarmstrong.com/#/infovisresearch/ for details.

    A comet chart compares two scenarios: (weight_start, value_start) vs (weight_end, value_end).
    weight_start and weight_end denote the size of the population; value_start and value_end
    denote the metric. Each record is drawn as a triangle whose tail sits at the start point
    and whose head (the wide side) is centred on the end point; weight is on the x-axis and
    value on the y-axis.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per comet. Must provide the four numeric columns described under ``columns``.
        Any other columns (e.g. a hierarchy of subpopulations or segments) are ignored here.
    columns : sequence of 4 column labels, optional
        Labels of the columns holding ``[weight_start, weight_end, value_start, value_end]``,
        in that order. Labels are looked up in ``df.reset_index()``, so a named index level
        may be used as well.
        When ``None`` the last four columns of ``df.reset_index()`` are used and are read as
        ``value_start, value_end, weight_start, weight_end`` (the layout this function has
        always assumed for its default).
    **kwargs
        Accepted for forward compatibility; currently ignored.

    Returns
    -------
    (cdf, plot) : tuple
        ``cdf`` is a DataFrame with one row per comet and the columns ``_ids`` (positional
        row number, matching ``df.reset_index()``), ``_delta_weight``
        (``weight_end - weight_start``), ``_xs`` and ``_ys`` (triangle vertices).
        ``plot`` is the bokeh figure containing the patches.

    Raises
    ------
    ValueError
        If ``columns`` does not name exactly four existing columns, if fewer than four
        columns are available for the default, or if ``df`` has no rows.
    """
    # Half-width of the comet head as a fraction of the comet's length.
    head_ratio = 1 / 16

    def values_to_points(_id, weight_start, weight_end, value_start, value_end):
        """Returns dict with xs, ys, delta_weight for single comet"""
        a = weight_end - weight_start
        b = value_end - value_start
        # The head corners are the end point shifted by +/- head_ratio along the
        # perpendicular (b, -a). Using the ratio directly instead of
        # (dist / 16) / dist gives the same numbers but does not divide by zero
        # when start and end coincide (the comet then collapses to a point).
        return {
            '_ids': _id,
            '_delta_weight': a,
            '_xs': [weight_start, weight_end + head_ratio * b, weight_end - head_ratio * b],
            '_ys': [value_start, value_end - head_ratio * a, value_end + head_ratio * a],
        }

    # reset index so that comet ids are positional and can be joined back onto df.reset_index()
    _df = df.reset_index()

    if columns is None:
        if _df.shape[1] < 4:
            raise ValueError("comet_chart needs at least four data columns")
        # Positional selection: reorder the trailing (value_start, value_end,
        # weight_start, weight_end) columns into the argument order of values_to_points.
        selected = _df.iloc[:, [-2, -1, -4, -3]]
    else:
        columns = list(columns)
        if len(columns) != 4:
            raise ValueError(
                "columns must list exactly four labels: "
                "[weight_start, weight_end, value_start, value_end]"
            )
        missing = [c for c in columns if c not in _df.columns]
        if missing:
            raise ValueError("columns not found in dataframe: {}".format(missing))
        selected = _df.loc[:, columns]

    # parse data into dataframe of comets; each tuple is (row number, ws, we, vs, ve)
    comets = [values_to_points(*row) for row in selected.itertuples(index=True, name=None)]
    if not comets:
        raise ValueError("comet_chart needs at least one row of data")
    cdf = pd.DataFrame(comets)
    source = ColumnDataSource(cdf)

    # configure plot; every '@field' below must exist as a column of cdf
    hover = HoverTool(tooltips=[
        ("index", "$index"),
        ("(x,y)", "($x, $y)"),
        ("delta weight", "@_delta_weight"),
        ("id", "@_ids"),
    ])

    plot = figure(tools=[hover, 'box_zoom', 'reset'])

    # TO DO: add option to choose palette
    delta_weight = cdf['_delta_weight']
    color_mapper = LinearColorMapper(palette=brewer['RdBu'][11],
                                     high=delta_weight.max(),
                                     low=delta_weight.min(),
                                     )
    plot.patches('_xs', '_ys', source=source,
                 fill_color={'field': '_delta_weight', 'transform': color_mapper},
                 fill_alpha=0.7,
                 line_color={'field': '_delta_weight', 'transform': color_mapper},
                 )

    return (cdf, plot)
        

In [43]:
# test using CDC wonder dataset
data = Path.cwd() / 'data.csv'
# DataFrame.from_csv was removed in pandas 1.0; read_csv with these arguments
# reproduces its defaults (first column becomes the index, dates are parsed).
df = pd.read_csv(data, index_col=0, parse_dates=True)

# calculate log values of the last four columns; the column list is taken once,
# before the loop starts appending the new 'log_*' columns
for col in df.columns[-4:]:
    df['log_' + col] = df[col].map(log)

# comet_chart defaults to the last four columns, i.e. the log-transformed ones
cdf, plot = comet_chart(df)
show(plot)

In [8]:
#TO DO: add hover tool with population/segment characteristics, _weight_value
#TO DO: colors and axis are off
#TO DO: add brushed sortable bar chart

In [30]:
# Put the input rows and their comet geometry side by side, aligned on the
# positional index; join='inner' keeps only index labels present in both frames.
jdf = pd.concat(
    [df.reset_index(), cdf],
    axis=1,
    join='inner',
)

In [31]:
# Inspect one joined record (position 51) to compare input values with the comet geometry.
jdf.iloc[51]

state                                                          Texas
birthweight                                        499 grams or less
startvalue                                                    703.34
endvalue                                                      708.93
startweight                                                     1409
endweight                                                       2642
log_startvalue                                               6.55584
log_endvalue                                                 6.56376
log_startweight                                              7.25064
log_endweight                                                7.87929
_delta_weight                                               -3.43505
_ids                                                      California
_xs                [11.314340370906502, 7.8797862585, 7.87879671167]
_ys                    [6.55584041643, 6.77844734642, 6.34906623569]
test                              

In [16]:
# Same position in the input frame, for comparison with the joined record.
df.iloc[51]

birthweight        499 grams or less
startvalue                    703.34
endvalue                      708.93
startweight                     1409
endweight                       2642
log_startvalue               6.55584
log_endvalue                 6.56376
log_startweight              7.25064
log_endweight                7.87929
Name: Texas, dtype: object

In [17]:
# Presumably a sanity check: 1409 is the startweight of the record inspected above,
# so this should agree with its log_startweight.
log(1409)

7.25063551189868

In [27]:
# Debug dump: iterrows() yields (index label, row Series) pairs; print each one.
# NOTE(review): this prints every row of df — consider df.head() for a shorter view.
for row in df.iterrows():
    _index, _row = row
    print(_index, _row)

Ohio birthweight        2500 - 2999 grams
startvalue                      5.53
endvalue                        4.88
startweight                   101227
endweight                     109151
log_startvalue               1.71019
log_endvalue                 1.58515
log_startweight              11.5251
log_endweight                11.6005
Name: Ohio, dtype: object
Ohio birthweight        1500 - 1999 grams
startvalue                     29.19
endvalue                       26.96
startweight                     9078
endweight                       9904
log_startvalue               3.37383
log_endvalue                 3.29435
log_startweight              9.11361
log_endweight                9.20069
Name: Ohio, dtype: object
Ohio birthweight        1000 - 1499 grams
startvalue                     67.39
endvalue                       59.15
startweight                     4526
endweight                       5038
log_startvalue                4.2105
log_endvalue                 4.08008
log_star