# Boxplot

In [2]:
import pandas as pd
import io
import textwrap
import numpy as np
import pandas as pd
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.transform import factor_cmap
from bokeh.palettes import Spectral6
from utilities.colors import get_colors, plot_colors
from bokeh.models import Legend, LegendItem
from bokeh.models import DatetimeTickFormatter
from bokeh.models.ranges import Range1d, DataRange1d
from bokeh.models import ColumnDataSource, HoverTool, GlyphRenderer, Line
import itertools
from bokeh.models import Span

from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.core.common import flatten
from bokeh.plotting import figure, output_notebook, show
output_notebook()

## Generate fake data

In [3]:
def _read_csv_string(string, index_cols=None):
    """Parse CSV text held in a (possibly indented) string into a DataFrame.

    Args:
        string (str): CSV content. Common leading whitespace is removed
            before parsing, so the text may be written as an indented
            triple-quoted literal.
        index_cols: Forwarded unchanged to ``pd.read_csv`` as ``index_col``.

    Returns:
        pd.DataFrame

    """
    # strip the shared indentation first, then let pandas read from memory
    csv_text = textwrap.dedent(string)
    buffer = io.StringIO(csv_text)
    frame = pd.read_csv(buffer, index_col=index_cols)
    return frame


# Synthetic rows used to exercise the boxplot pipeline: one row per
# combination of gender, working_status, essential_worker and covid.
_synth_csv = """
    gender,working_status,essential_worker,cc_hours,covid
    male,full_time,both,20,pre
    female,full_time,both,-10,pre
    male,full_time,both,30,post
    female,full_time,both,-10,post
    male,part_time,both,15,pre
    female,part_time,both,-10,pre
    male,part_time,both,40,post
    female,part_time,both,20,post
    male,full_time,father,10,pre
    female,full_time,father,20,pre
    male,full_time,father,15,post
    female,full_time,father,10,post
    male,part_time,father,20,pre
    female,part_time,father,5,pre
    male,part_time,father,30,post
    female,part_time,father,40,post
"""
synth_data = _read_csv_string(_synth_csv)
synth_data

Unnamed: 0,gender,working_status,essential_worker,cc_hours,covid
0,male,full_time,both,20,pre
1,female,full_time,both,-10,pre
2,male,full_time,both,30,post
3,female,full_time,both,-10,post
4,male,part_time,both,15,pre
5,female,part_time,both,-10,pre
6,male,part_time,both,40,post
7,female,part_time,both,20,post
8,male,full_time,father,10,pre
9,female,full_time,father,20,pre


**Outcome variable** (fixed): `cc_hours`

**Main background variable**: `working_status` 

**Secondary background variable** (fixed): `covid` 

## Process data

In [4]:
def process_data(df, bg_vars_1, bg_var_2, outcome, sample_var):
    """Compute boxplot data for every pair of background variables.

    The computation is done once on the whole dataset (stored under the key
    ``"all"``) and once for each distinct value of ``sample_var`` (stored
    under that value).

    Args:
        df (pd.DataFrame): Dataset.
        bg_vars_1 (list or str): Main background variable(s).
        bg_var_2 (list or str): Secondary background variable(s).
        outcome (str): Outcome variable.
        sample_var (str): Variable that divides the dataset into samples.

    Returns:
        dict: ``{"all": {...}, <sample value>: {...}, ...}``. Every inner
        dict is keyed by ``(main variable, secondary variable)`` and holds
        the entries produced by ``compute_quantities`` for that pair.

    """
    # A bare string would be iterated character by character by
    # itertools.product, so treat it as a single variable name.
    if isinstance(bg_vars_1, str):
        bg_vars_1 = [bg_vars_1]
    if isinstance(bg_var_2, str):
        bg_var_2 = [bg_var_2]

    # Materialise the pairs once so they can be reused for every sample
    # (also keeps one-shot iterables working for the per-sample pass).
    pairs = list(itertools.product(bg_vars_1, bg_var_2))

    def _collect(frame):
        # Merge the results of all variable pairs for one (sub)dataset.
        collected = {}
        for var_1, var_2 in pairs:
            collected.update(compute_quantities(frame, var_1, var_2, outcome))
        return collected

    tot_res = {"all": _collect(df)}

    # Visit each distinct sample value once, in order of first appearance;
    # looping over every row would recompute identical results repeatedly.
    for s in df[sample_var].drop_duplicates():
        tot_res[s] = _collect(df[df[sample_var] == s])

    return tot_res

In [5]:
def compute_quantities(df, bg_var_1, bg_var_2, outcome):
    """Compute data for boxplot, for one main background variable.

    The outcome is grouped by ``(bg_var_1, bg_var_2)``; for every group the
    25th, 50th and 75th percentiles are computed, together with the stem
    ends ``upper = q75 + 1.5 * (q75 - q25)`` and
    ``lower = q25 - 1.5 * (q75 - q25)``.

    Args:
        df (pd.DataFrame): Dataset.
        bg_var_1 (str): Main background variable.
        bg_var_2 (str): Secondary background variable.
        outcome (str): Outcome variable.

    Returns:
        dict: A single-entry dict keyed by ``(bg_var_1, bg_var_2)`` whose
        value has the keys ``"cats"`` (list of group tuples), ``"data"``
        (dict of lists with keys ``q25``, ``q50``, ``q75``, ``upper``,
        ``lower``) and ``"order"`` (second element of every group tuple).

    """
    # Restrict the aggregation to the outcome column: the remaining columns
    # are not needed, and non-numeric ones make recent pandas versions raise
    # a TypeError in ``quantile``.
    grouped = df.groupby([bg_var_1, bg_var_2])[outcome]

    quantiles = pd.DataFrame(
        {
            "q25": grouped.quantile(0.25),
            "q50": grouped.quantile(0.5),
            "q75": grouped.quantile(0.75),
        }
    )

    # "upper" and "lower" extremes for the boxplot stems
    iqr = quantiles["q75"] - quantiles["q25"]
    quantiles["upper"] = quantiles["q75"] + 1.5 * iqr
    quantiles["lower"] = quantiles["q25"] - 1.5 * iqr

    # convert result to dictionary of results
    cats = quantiles.index.tolist()
    return {
        (bg_var_1, bg_var_2): {
            "cats": cats,
            "data": quantiles.to_dict("list"),
            "order": [cat[1] for cat in cats],
        }
    }

In [6]:
# "gender" is added to the main background variables to clarify the dict structure
data_dict = process_data(
    df=synth_data,
    bg_vars_1=["working_status", "gender"],
    bg_var_2=["covid"],
    outcome="cc_hours",
    sample_var="essential_worker",
)

In [7]:
# Bare expression: in a notebook cell this displays the nested results dict.
data_dict

{'all': {('working_status',
   'covid'): {'cats': [('full_time', 'post'),
    ('full_time', 'pre'),
    ('part_time', 'post'),
    ('part_time', 'pre')], 'data': {'q25': [5.0, 5.0, 27.5, 1.25],
    'q50': [12.5, 15.0, 35.0, 10.0],
    'q75': [18.75, 20.0, 40.0, 16.25],
    'upper': [39.375, 42.5, 58.75, 38.75],
    'lower': [-15.625, -17.5, 8.75, -21.25]}, 'order': ['post',
    'pre',
    'post',
    'pre']},
  ('gender',
   'covid'): {'cats': [('female', 'post'),
    ('female', 'pre'),
    ('male', 'post'),
    ('male', 'pre')], 'data': {'q25': [5.0, -10.0, 26.25, 13.75],
    'q50': [15.0, -2.5, 30.0, 17.5],
    'q75': [25.0, 8.75, 32.5, 20.0],
    'upper': [55.0, 36.875, 41.875, 29.375],
    'lower': [-25.0, -38.125, 16.875, 4.375]}, 'order': ['post',
    'pre',
    'post',
    'pre']}},
 'both': {('working_status',
   'covid'): {'cats': [('full_time', 'post'),
    ('full_time', 'pre'),
    ('part_time', 'post'),
    ('part_time', 'pre')], 'data': {'q25': [0.0, -2.5, 25.0, -3.75],
  

In [8]:
# Human-readable labels for the raw category codes used in the data
nice_names = dict(
    full_time="Full-time work",
    part_time="Part-time work",
    post="During CoVid-19",
    pre="Before CoVid-19",
)

## Create boxplot

In [9]:
def boxplot(data_dict, bg_var_1, bg_var_2, nice_names, sample):
    """Create, show and return a horizontal boxplot.

    Args:
        data_dict (dict): Dictionary of data, indexed first by sample and then
            by ``(bg_var_1, bg_var_2)``; each entry provides ``"cats"``,
            ``"data"`` and ``"order"``.
        bg_var_1 (str): Main background variable.
        bg_var_2 (str): Secondary background variable.
        nice_names (dict): Dictionary mapping variables to nice names. It must
            contain every category appearing in ``"cats"`` and ``"order"``.
        sample (str): Either one of the categories of the variable dividing
            dataset into samples or "all" for the whole dataset.

    Returns:
        The Bokeh figure, after it has been displayed with ``show``.

    """

    # get data from dictionary
    entry = data_dict[sample][(bg_var_1, bg_var_2)]
    cats = entry["cats"]
    data = entry["data"]
    order = entry["order"]

    # change names according to nice_names
    cats = [(nice_names[s], nice_names[f]) for s, f in cats]
    order = [nice_names[s] for s in order]

    # create figure (the categorical factors go on the y-axis: boxes are horizontal)
    # NOTE(review): recent Bokeh releases appear to use `height`/`width`
    # instead of `plot_height`/`plot_width` -- confirm against the installed version.
    p = figure(
        tools="",
        y_range=FactorRange(*cats, factor_padding=-0.42),
        plot_height=500,
        plot_width=800,
        toolbar_location=None,
    )

    # create ColumnDataSource (see https://tinyurl.com/y46stcab)
    # "x" holds the (main, secondary) factor of every box, even though it is
    # drawn along the y-axis.
    source = ColumnDataSource(dict(
        x=cats,
        q25=data["q25"],
        q50=data["q50"],
        q75=data["q75"],
        upper=data["upper"],
        lower=data["lower"],
        order=order,
    ))

    # get palette
    palette = get_colors("categorical", number=2)
    palette.reverse()

    # Colour every box by the second level of its factor (the CoVid-19
    # status), so boxes with the same status share a colour. `order` repeats
    # each status once per main category, so de-duplicate it (keeping the
    # order of first appearance) to pass each factor exactly once.
    factors = list(dict.fromkeys(order))
    mapper = factor_cmap(field_name='x', palette=palette, factors=factors, start=1, end=2)

    # stems
    r_75 = p.segment("upper", "x", "q75", "x", line_color="black", source=source)
    r_25 = p.segment("lower", "x", "q25", "x", line_color="black", source=source)

    # vertical line at 0
    vline = Span(location=0, dimension='height', line_color='black', line_width=2, line_dash="dashed")
    p.renderers.extend([vline])

    # boxes (drawn in two halves that meet at the median)
    r_box_1 = p.hbar("x", left="q25", right="q50", height=0.575, line_color="black", source=source, color=mapper, legend_field="order")
    r_box_2 = p.hbar("x", left="q50", right="q75", height=0.575, line_color="black", source=source, color=mapper)

    # whiskers (thin rectangles of width 0.05, simpler than segments)
    r_lower = p.rect("lower", "x", 0.05, 0.3, line_color="black", source=source)
    r_upper = p.rect("upper", "x", 0.05, 0.3, line_color="black", source=source)

    _apply_styling(p)

    # NOTE: legend items are reversed because of a bug, see https://github.com/holoviz/holoviews/issues/4799
    p.legend.orientation = "vertical"
    p.legend.location = "center_right"

    TOOLTIPS = [
        ("Lower whisker", "@lower"),
        ("25th quantile", "@q25"),
        ("Median", "@q50"),
        ("75th quantile", "@q75"),
        ("Upper whisker", "@upper"),
    ]

    p.add_tools(
        HoverTool(
            renderers=[r_25, r_75, r_box_1, r_box_2, r_upper, r_lower],
            tooltips=TOOLTIPS,
        ))

    show(p)

    # Hand the figure back so callers (e.g. `fig = boxplot(...)`) can reuse it.
    return p

In [10]:
def _apply_styling(p):
    """Apply the shared look (grid, title, axes, borders) to a figure in place.

    Args:
        p: Figure to style. Its attributes are modified directly.

    Returns:
        The same object ``p``, to allow chaining.

    """

    # grid
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_alpha = 0.5
    p.outline_line_color = None

    # title
    p.title.text = "Childcare: mother's hours - father's hours"
    p.title.align = "center"
    p.title_location = "below"
    p.title.text_font_size = "15pt"

    # y-axis
    p.yaxis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.minor_tick_line_color = None
    p.yaxis.separator_line_alpha = 0
    p.y_range.range_padding = 0.2

    # y-axis labels
    p.yaxis.axis_label_text_font_style = "normal"
    p.yaxis.axis_label_standoff = 30
    p.yaxis.group_label_orientation = "horizontal"
    p.yaxis.group_text_font_size = "11pt"
    p.yaxis.group_text_color = "black"
    p.yaxis.group_text_font_style = "normal"

    # x-axis labels
    p.xaxis.axis_label_text_font_size = "12pt"
    p.xaxis.major_label_text_font_size = "11pt"
    # a 0pt size hides the inner (per-box) labels on the y-axis
    p.yaxis.major_label_text_font_size = "0pt"
    # "normal" is a font *style*; setting it on the font-family property
    # would request a font literally named "normal"
    p.yaxis.major_label_text_font_style = "normal"

    # borders
    p.min_border_left = 50
    p.min_border_right = 50
    p.min_border_top = 20
    p.min_border_bottom = 50

    return p

In [11]:
# Draw the boxplot for the "both" sample, split by working status and CoVid-19 period
fig = boxplot(
    data_dict=data_dict,
    bg_var_1="working_status",
    bg_var_2="covid",
    nice_names=nice_names,
    sample="both",
)



## TO-DO

- [x] Add nice labels (i.e. "Before CoVid-19" rather than "pre")
- [x] Implement different samples conditional on variable(s)
- [x] Implement HoverTool (see https://tinyurl.com/y5vkjewb)