In [3]:
# Execute preprocessing.ipynb inside this kernel's namespace.
# NOTE(review): presumably this is what defines Attribute, AttributeDataType and
# Preprocessor, which are used below without an import — confirm.
%run preprocessing.ipynb

In [4]:

import functools
import logging
import math
import os
import sys

import ipysheet
import ipywidgets as widgets
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from ipywidgets import VBox, HBox, Layout, Label, Text, GridBox, GridspecLayout, Dropdown, Tab, Button, HTML, Box
from pycelonis import get_celonis
from pycelonis.celonis_api.pql.pql import PQL, PQLColumn, PQLFilter
from scipy import stats

from DataModel import DataModelInfo
from DecisionRuleMiner import DecisionRuleMiner
from errors import MaximumValueReachedError, MinimumValueReachedError
# from preprocessing import Attribute, AttributeDataType, Preprocessor

In [None]:
# Silence logging for the whole kernel: logging.disable(level) suppresses every
# logging call of severity `level` and below, and sys.maxsize is used as that level.
logging.disable(sys.maxsize)

In [None]:
# Connection settings that can be passed to get_celonis(**login).
# The API token is a secret and must not be stored in the notebook, so it is read
# from the environment (None when CELONIS_API_TOKEN is not set).
# SECURITY: a real token used to be committed in this cell — treat it as compromised
# and revoke it in Celonis.
login = {
    "celonis_url": os.environ.get(
        "CELONIS_URL", "academic-michael-schulten-rwth-aachen-de.eu-2.celonis.cloud"
    ),
    "api_token": os.environ.get("CELONIS_API_TOKEN"),
    # The following 2 lines are only necessary when connecting to CPM4.5, not for IBC:
    # "api_id": "paste_here_your_api_id",
    # "username": "paste_here_your_username",
}

In [24]:
def get_dm(datamodel, celonis_login = None):
    """Connect to Celonis and look up a data model.

    Args:
        datamodel: Identifier handed to ``celonis.datamodels.find``.
        celonis_login: Optional dict of keyword arguments for ``get_celonis``.
            When ``None``, ``get_celonis()`` is called without arguments.

    Returns:
        The result of ``celonis.datamodels.find(datamodel)``.
    """
    if celonis_login is None:
        celonis = get_celonis()
    else:
        # Use the credentials supplied by the caller. Previously the module-level
        # `login` dict was unpacked here, so the argument's content was ignored.
        celonis = get_celonis(**celonis_login)
    return celonis.datamodels.find(datamodel)

In [4]:
"""
login = {
    "celonis_url": "academic-michael-schulten-rwth-aachen-de.eu-2.celonis.cloud",
    "api_token": "ODJkZDhjNmQtYTQ2Ny00NWRlLWJkMGYtZWJjY2FjOGVhYmQyOllBYlBkRXNHV2psZ1o1MDJacmlsRVU3KytxaDdLVHY5N1lBOHJQdTJnOXR0",
    # The following 2 lines are only necessary when connecting to CPM4.5, not for IBC:
    # "api_id": "paste_here_your_api_id",
    # "username": "paste_here_your_username",
}
celonis = get_celonis(**login)

dm = celonis.datamodels.find("P2P_Wils_Course")
"""

### Create a scrollable field with a button for each attribute and its correlation

Create a box for each attribute that contains two labels (or HTML widgets) and one button.

In [26]:
class StatisticalAnalysisBox:
    """Builds the statistical-analysis view: a scrollable list of attribute cards.

    Attributes:
        p: Preprocessor providing ``attributes`` (each with a ``correlation``).
        df: Data frame handed through to every ``AttributeField``.
    """

    # Attributes whose absolute correlation is below this value are not listed.
    MIN_ABS_CORRELATION = 0.3

    def __init__(self, p:Preprocessor, df):
        self.p = p
        self.df = df

    def get_statistical_box(self):
        """Create the view.

        Returns:
            A VBox with three children: the title, the scroll box holding one
            card per listed attribute, and an empty VBox. The empty VBox is a
            placeholder that ``AttributeField.on_button_clicked`` replaces with
            the detail view of the selected attribute.
        """
        parent_vbox = VBox()
        title_scrollbox = HTML(
            "<span style=\"font-weight:bold;  font-size:16px\"> Attributes with potential effect on case duration:</span>",
            layout=Layout(margin="5px 0px 0px 0px"),
        )
        scroll_box = VBox(
            layout=Layout(
                overflow_y='scroll',
                max_height='400px',
                border='3px solid grey',
                padding="3px 3px 3px 3px",
            )
        )

        # Strongest correlations first, regardless of sign.
        attrs_sorted = sorted(self.p.attributes, key=lambda x: abs(x.correlation), reverse=True)

        # Filter on the absolute value as well, so that the filter agrees with the
        # sort key; comparing the signed value dropped strong negative correlations.
        scroll_box.children = [
            AttributeField(attr, "Case duration", self.p, parent_vbox, self.df).box
            for attr in attrs_sorted
            if abs(attr.correlation) >= self.MIN_ABS_CORRELATION
        ]
        parent_vbox.children = [title_scrollbox, scroll_box, VBox()]

        return parent_vbox


class AttributeField:
    """One attribute card of the statistical-analysis list, plus its detail view.

    The constructor creates the name label, the metrics label, the "Details"
    button and the surrounding box, which is exposed as ``self.box``. Clicking
    the button replaces the last child of ``parent_box`` with a freshly
    generated detail view for the attribute.
    """

    def __init__(self, attribute: Attribute, label: str, p: Preprocessor, parent_box, df):
        self.label = label
        self.attribute = attribute
        self.p = p
        self.df = df
        # create_box() assembles the three widgets below, so it has to run last.
        self.attribute_name_label = self.create_attribute_label()
        self.metrics_label = self.create_metrics_label()
        self.button = self.create_button(p, parent_box)
        self.box = self.create_box()

    def create_box(self):
        """Stack name label, metrics and button inside a bordered VBox."""
        return VBox(
            children=[self.attribute_name_label, self.metrics_label, self.button],
            layout=Layout(
                border='2px solid gray',
                min_height='100px',
                width='auto',
                padding="0px 0px 0px 0px",
                margin="0px 3px 3px 3px",
            ),
        )

    def create_attribute_label(self):
        """Return an HTML widget showing the attribute's display name."""
        text = f"<span style=\"font-weight:bold\"> Attribute: <span style=\"color: Blue\">{self.attribute.display_name}</span></span>"
        return HTML(text, layout=Layout(padding="0px 0px 0px 0px"))

    def create_metrics_label(self):
        """Return the metrics widget of the card.

        Numerical attributes only get the correlation label. All other
        attributes get an HBox with the effect on the label, the number of
        cases with the attribute and the correlation.
        """
        left_padding = Layout(padding="0px 0px 0px 12px")
        correlation_label = HTML(
            "<span style=\"font-weight:bold\"> Correlation: </span>" + str(round(self.attribute.correlation, 2)),
            layout=left_padding,
        )
        if self.attribute.attribute_data_type == AttributeDataType.NUMERICAL:
            return correlation_label

        # Positive influences get an explicit "+"; negative ones already carry "-".
        sign = "+" if self.attribute.label_influence > 0 else ""
        effect_label = HTML(
            "<span style=\"font-weight:bold\">Effect on " + self.label + ": </span>"
            + sign + str(round(self.attribute.label_influence)) + "\xa0" + self.p.label.unit
        )
        cases_label = HTML(
            "<span style=\"font-weight:bold\">Cases with attribute: </span>" + str(self.attribute.cases_with_attribute),
            layout=left_padding,
        )
        return HBox([effect_label, cases_label, correlation_label])

    def create_button(self, p, parent_box):
        """Create the "Details" button and bind it to ``on_button_clicked``."""
        details_button = Button(description="Details", layout=Layout(min_height="30px"))
        # on_click passes only the button, so p and parent_box are bound up front.
        details_button.on_click(functools.partial(self.on_button_clicked, p=p, parent_box=parent_box))
        return details_button

    def on_button_clicked(self, b, p, parent_box):
        """Swap the last child of ``parent_box`` for this attribute's detail view."""
        detail_view = self.gen_attribute_box(self.attribute, p)
        parent_box.children = parent_box.children[:-1] + (detail_view,)

    def gen_attribute_box(self, attribute: Attribute, p: Preprocessor):
        """Query the data model and build the detail view for ``attribute``.

        The query fetches case id, case start time, the attribute and the case
        duration. Categorical attributes get average metrics and a monthly
        case-count chart; all other attributes get a chart of the mean case
        duration per attribute value and a chart of the monthly mean value.

        Returns:
            A VBox holding the "Attribute details" title and the details box.
        """
        details_layout = Layout(border='3px solid grey', padding="5px 5px 5px 5px")
        title_label = HTML(
            "<span style=\"font-weight:bold;  font-size:16px\"> Attribute details:</span>",
            layout=Layout(margin="15px 0px 0px 0px"),
        )
        fig_layout = Layout(margin="15px 0px 0px 0px", width='100%')
        name_label = self.attribute_name_label
        attr_col = attribute.display_name

        query = PQL()
        for column in (
            p.get_query_case_ids(),
            PQLColumn(
                name="starttime",
                query="PU_FIRST(\"" + p.case_table_name + "\", \"" + p.activity_table_name + "\".\"" + p.eventtime_col + "\")",
            ),
            PQLColumn(name=attr_col, query=attribute.query),
            PQLColumn(name="Case duration", query=p.label.query),
        ):
            query.add(column)

        # Work on a copy in which the start time is reduced to a "YYYY-MM" string.
        monthly_df = p.dm.get_data_frame(query).copy()
        monthly_df['starttime'] = monthly_df['starttime'].dt.to_period('M').astype(str)

        if attribute.attribute_data_type == AttributeDataType.CATEGORICAL:
            hbox_metrics = self.gen_avg_metrics()
            all_cases_df = (
                monthly_df.groupby('starttime', as_index=False)['caseid']
                .count()
                .fillna(0)
                .rename({'caseid': 'All cases'}, axis=1)
            )
            attr_cases_df = (
                monthly_df[monthly_df[attr_col] == 1]
                .groupby('starttime', as_index=False)['caseid']
                .count()
                .fillna(0)
                .rename({'caseid': 'Cases with attribute'}, axis=1)
            )
            # Months without any case carrying the attribute end up as 0 after the join.
            complete_df = p._join_dfs([all_cases_df, attr_cases_df], keys=['starttime'] * 2).fillna(0)

            fig = go.Figure(layout_title_text="Attribute development")
            for series_name, fill_mode in (("All cases", 'tonexty'), ("Cases with attribute", 'tozeroy')):
                fig.add_trace(
                    go.Scatter(x=complete_df["starttime"], y=complete_df[series_name], fill=fill_mode, name=series_name)
                )
            fig.update_layout(
                xaxis_title=None,
                yaxis_title=None,
                height=300,
                margin={'l': 10, 'r': 10, 't': 40, 'b': 10},
            )
            fig_box = VBox([go.FigureWidget(fig)], layout=fig_layout)
            vbox_details = VBox(children=[name_label, hbox_metrics, fig_box], layout=details_layout)
        else:
            # Mean case duration for every distinct attribute value.
            duration_by_value = monthly_df.groupby(attr_col, as_index=False)['Case duration'].mean().fillna(0)
            fig_effect = go.Figure(layout_title_text="Case duration over attribute value")
            fig_effect.add_trace(
                go.Scatter(x=duration_by_value[attr_col], y=duration_by_value['Case duration'], fill='tonexty')
            )
            fig_effect.update_layout(
                title="Effect of attribute on case duration",
                xaxis_title=None,
                yaxis_title=None,
                height=300,
                margin={'l': 5, 'r': 10, 't': 40, 'b': 10},
            )
            fig_effect_box = VBox([go.FigureWidget(fig_effect)], layout=fig_layout)

            # Mean attribute value per start month.
            value_by_month = monthly_df.groupby('starttime', as_index=False)[attr_col].mean().fillna(0)
            fig_dev = go.Figure(layout_title_text="Attribute value development")
            fig_dev.add_trace(
                go.Scatter(x=value_by_month["starttime"], y=value_by_month[attr_col], fill='tonexty')
            )
            fig_dev.update_layout(
                title="Average attribute value development",
                xaxis_title=None,
                yaxis_title=None,
                height=300,
                margin={'l': 10, 'r': 10, 't': 40, 'b': 10},
            )
            fig_dev_box = VBox([go.FigureWidget(fig_dev)], layout=fig_layout)

            vbox_details = VBox(children=[name_label, fig_effect_box, fig_dev_box], layout=details_layout)

        return VBox([title_label, vbox_details])

    def gen_avg_metrics(self):
        """Return an HBox comparing the mean label value with and without the attribute.

        Rows of ``self.df`` whose attribute column equals 1 count as "with
        attribute"; all other rows count as "without attribute".
        """
        attr_col = self.attribute.df_attribute_name
        label_col = self.p.label.df_attribute_name
        avg_with_attr = round(self.df[self.df[attr_col] == 1][label_col].mean())
        avg_without_attr = round(self.df[self.df[attr_col] != 1][label_col].mean())

        with_text = (
            "<center><span style=\"font-weight:bold\"> Average " + self.label + " with attribute"
            + "</span><br><span style=\"color: Red; font-size:16px; text-align: center\">"
            + str(avg_with_attr) + "\xa0" + self.p.label.unit + "</span></center>"
        )
        without_text = (
            "<center><span style=\"font-weight:bold; text-align: center\"> Average " + self.label + " without attribute"
            + "</span><br><span style=\"color: Green; font-size:16px; text-align: center\">"
            + str(avg_without_attr) + "\xa0" + self.p.label.unit + "</span></center>"
        )
        html_avg_with_attr = Box(
            [HTML(with_text)],
            layout=Layout(border='3px double CornflowerBlue', margin='0px 10px 0px 0px'),
        )
        html_avg_without_attr = Box(
            [HTML(without_text)],
            layout=Layout(border='3px double CornflowerBlue', color='CornflowerBlue', margin='0px 0px 0px 10px'),
        )
        return HBox([html_avg_with_attr, html_avg_without_attr], layout=Layout(margin="5px 0px 0px 0px"))
        # Label with average case duration with attribute


In [16]:
"""
def get_parent_box():
    print("pbox start0")
    parent_vbox = VBox()
    title_scrollbox_layout = Layout(margin="5px 0px 0px 0px")
    title_scrollbox_html = "<span style=\"font-weight:bold;  font-size:16px\"> Attributes with potential effect on case duration:</span>"
    title_scrollbox = HTML(title_scrollbox_html, layout = title_scrollbox_layout)
    scroll_box_layout = Layout(
        overflow_y='scroll',
        max_height='300px',
        border='3px solid grey',
        padding = "3px 3px 3px 3px"
    )
    print("pbox start1")
    scroll_box = VBox(layout = scroll_box_layout)
    print("pbox start2")
    attr_boxes = []

    # sort attributes by correlation coefficient
    attrs_sorted = sorted(p.attributes, key=lambda x: abs(x.correlation), reverse=True)

    for attr in attrs_sorted:
        print(f"{attr.display_name}, {attr.correlation}")
        if attr.correlation >= 0.3:
            attr_field = AttributeField(attr, "Case duration", p, parent_vbox, df_total_time)
            attr_boxes.append(attr_field.box)
    print("pbox start3")
    scroll_box.children = attr_boxes
    print("pbox start4")
    parent_vbox.children = [title_scrollbox, scroll_box, VBox()]
    print("pbox start5")
    return parent_vbox
"""

In [21]:
class OverviewBox:
    """Builds the overview view: average case duration, its development and its distribution."""

    def __init__(self, dm_info, p):
        self.dm_info = dm_info
        self.p = p

    def get_overview_box(self):
        """Query the data model and assemble the overview widgets.

        Returns:
            A VBox with three children: the average-case-duration box, the
            case-duration development chart and the case-duration distribution chart.
        """

        def compact_layout():
            # Shared figure styling: no axis titles and tight margins.
            return {
                'xaxis_title': None,
                'yaxis_title': None,
                'margin': {'l': 10, 'r': 10, 't': 40, 'b': 10},
            }

        overview = VBox(layout=Layout(border='2px solid gray', grid_gap='30px'))

        # Average case duration shown as a highlighted number.
        avg_case_duration = get_case_duration_pql(self.dm_info)
        duration_html = (
            "<center><span style=\"font-weight:bold\"> Average Case Duration</span><br><span "
            "style=\"color: Red; font-size:16px\">" + str(round(avg_case_duration)) + "\xa0" + self.p.label.unit + "</span></center>"
        )
        duration_frame = Box(
            [HTML(duration_html)],
            layout=Layout(border='3px double CornflowerBlue', margin='20px 50px 0px 10px'),
        )
        avg_case_duration_box = HBox([duration_frame], layout=Layout(margin="0px 30px 0px 0px"))

        # Development of the case duration over time.
        development_fig = px.area(
            get_case_duration_development_pql(self.dm_info),
            x="datetime", y="case duration", title="Case duration development", height=250,
        )
        development_fig.update_layout(**compact_layout())
        development_widget = go.FigureWidget(development_fig)

        # Distribution of the case duration over 10 bins.
        distribution_fig = px.bar(
            get_bins_trace_times(self.dm_info, 10, time_aggregation="DAYS"),
            x="range", y="cases", title="Case duration distribution", height=300,
        )
        distribution_fig.update_layout(**compact_layout())
        distribution_widget = go.FigureWidget(distribution_fig)

        overview.children = [avg_case_duration_box, development_widget, distribution_widget]
        return overview

In [1]:
class AnalysisCaseDuration:
    """Entry point of the case-duration analysis GUI.

    ``run()`` connects to Celonis, preprocesses the data model and displays a
    tab widget with the overview, the statistical analysis and the decision rules.

    Args:
        datamodel: Identifier of the data model, passed to ``get_dm``.
        celonis_login: Optional login dict passed to ``get_dm``.
    """

    def __init__(self, datamodel, celonis_login=None):
        self.datamodel = datamodel
        self.celonis_login = celonis_login
        # Everything below is filled in by run().
        self.p = None
        self.dm_info = None
        self.df_total_time=None
        self.overview_box = None
        self.stat_analysis_box = None
        self.dec_rule_box = None
        self.tabs=None

    def run(self):
        """Run the whole pipeline and display the resulting tabs.

        Progress messages are printed into a temporary Output widget, which is
        closed once the tabs are ready.
        """
        out = widgets.Output(layout={'border': '1px solid black'})
        display(out)

        def report(message):
            # Print into the progress widget instead of the cell output.
            with out:
                print(message)

        # Connect and get dm
        report("Connecting to Celonis...")
        dm = get_dm(self.datamodel, celonis_login = self.celonis_login)
        report("Done")

        # Create dm_info and preprocessor
        report("Fetching data and preprocessing...")
        self.dm_info, self.p, self.df_total_time = self.preprocess(dm)
        report("Done")

        # The message used to be misspelled ("Creatng GUI...").
        report("Creating GUI...")

        # Create overview box
        overview_box_obj = OverviewBox(self.dm_info, self.p)
        self.overview_box = overview_box_obj.get_overview_box()

        # Create statistical analysis tab
        stat_analysis_obj = StatisticalAnalysisBox(self.p, self.df_total_time)
        self.stat_analysis_box = stat_analysis_obj.get_statistical_box()

        # Create decision rule miner box
        dec_rule_box_obj = DecisionRulesBox(self.df_total_time, self.p.label.df_attribute_name, self.p.label.unit, self.p.attributes_dict, pos_class=None)
        self.dec_rule_box = dec_rule_box_obj.create_view()

        # Create tabs and replace the progress widget with them
        self.tabs = self.create_tabs()
        out.close()
        display(self.tabs)

    def preprocess(self, dm):
        """Create the data model info and the preprocessor, and fetch the data frame.

        Returns:
            Tuple ``(dm_info, p, df_total_time)`` where ``df_total_time`` is the
            result of ``p.run_total_time_PQL(20, time_aggregation="DAYS")``.
        """
        dm_info = DataModelInfo(dm)
        p = Preprocessor(dm)
        df_total_time = p.run_total_time_PQL(20, time_aggregation="DAYS")
        return dm_info, p, df_total_time

    def create_tabs(self):
        """Return a Tab widget holding the three boxes created by ``run()``."""
        tab_names = ["Overview", "Statistical Analysis", "Decision Rules"]
        tab = Tab([self.overview_box, self.stat_analysis_box, self.dec_rule_box])
        for i, el in enumerate(tab_names):
            tab.set_title(i, el)

        return tab



In [17]:
def get_overview_box(dm_info):
    """Build the overview widget for ``dm_info`` (function variant of the overview).

    Returns:
        A VBox with three children: a centred box with the rounded average case
        duration, the case-duration development chart and the distribution chart.
    """

    def compact_layout():
        # Shared figure styling: no axis titles and tight margins.
        return {
            'xaxis_title': None,
            'yaxis_title': None,
            'margin': {'l': 10, 'r': 10, 't': 40, 'b': 10},
        }

    overview = VBox(layout=Layout(border='2px solid gray', grid_gap='40px'))

    # Average case duration as two centred labels.
    avg_case_duration = get_case_duration_pql(dm_info)
    duration_box = VBox(layout=Layout(align_items='center'))
    duration_box.children = [
        Label(value="Average Case Duration"),
        Label(value=str(round(avg_case_duration))),
    ]

    # Development of the case duration over time.
    development_fig = px.area(
        get_case_duration_development_pql(dm_info),
        x="datetime", y="case duration", title="Case duration development", height=300,
    )
    development_fig.update_layout(**compact_layout())
    development_widget = go.FigureWidget(development_fig)

    # Distribution of the case duration over 10 bins.
    distribution_fig = px.bar(
        get_bins_trace_times(dm_info, 10, time_aggregation="DAYS"),
        x="range", y="cases", title="Case duration distribution", height=300,
    )
    distribution_fig.update_layout(**compact_layout())
    distribution_widget = go.FigureWidget(distribution_fig)

    overview.children = [duration_box, development_widget, distribution_widget]
    return overview

In [None]:
class DecisionRulesBox:
    """Decision-rule view: threshold selection, rule mining and rule metrics.

    The user picks a case-duration threshold, mines rules with a
    ``DecisionRuleMiner`` and can then simplify or elaborate the mined rules.

    Args:
        df: Data frame containing the label column and the attribute columns.
        label: Name of the label column in ``df``.
        label_unit: Unit text shown next to label values.
        attributes_dict: Mapping from attribute name to attribute object; the
            keys are handed to the miner, the values are used for their
            ``attribute_data_type`` when rendering rules.
        pos_class: Value of the positive class. Leave at None if class is numerical.
    """

    def __init__(self, df, label, label_unit, attributes_dict, pos_class=None):
        self.df = df
        self.label = label
        self.label_unit = label_unit
        self.attributes_dict = attributes_dict
        self.pos_class = pos_class  # the value of the positive class. Leave at None if class is numerical
        self.default_long_perc = 20  # default top percentage of long case duration
        self.min_display_perc = 1  # minimum percentage of data to display in plot
        self.max_display_perc = 99  # maximum percentage of data to display in plot
        self.default_val = None
        self.min_val = None
        self.max_val = None
        self.min_display_val = None
        self.max_display_val = None
        self.get_statistics_from_df()
        self.high_duration_box = None
        self.dr_miner = None
        self.decision_rules = None
        self.current_case_duration = None  # threshold of the rules currently shown
        self.rule_box = HBox()
        self.view = None
        self.button_run = None
        self.button_elaborate_rules = None
        self.button_simplify_rules = None

    def get_statistics_from_df(self):
        """Derive default threshold, value range and plot range from the label column."""
        label_values = self.df[self.label]
        self.default_val = label_values.quantile((100 - self.default_long_perc) / 100)
        self.min_val = label_values.min()
        self.max_val = label_values.max()
        self.min_display_val = label_values.quantile(self.min_display_perc / 100)
        self.max_display_val = label_values.quantile(self.max_display_perc / 100)

    def create_view(self):
        """Create and return the top-level VBox: selection box above the rule box."""
        selection_box = self.create_duration_selection_box()
        vbox_view = VBox(children=[selection_box, self.rule_box])
        self.view = vbox_view
        return vbox_view

    def _show_rule_box(self):
        """Rebuild the rule box from the current miner state and put it into the view."""
        self.rule_box = self.create_rule_box()
        self.view.children = [self.view.children[0]] + [self.rule_box]

    def create_duration_selection_box(self):
        """Build the threshold input, the default and run buttons and the ECDF plot."""
        label_title = Label("Define high case duration:")

        label_description = Label("Case duration >=\xa0")

        label_unit = Label("\xa0" + self.label_unit, layout=Layout(width='auto'))
        label_percentage = Label(
            "\xa0 (Top\xa0" + str(self.get_percentile_score(self.df[self.label], self.default_val)) + "%)",
            layout=Layout(width='auto'))
        selection_box_layout = Layout(max_width="80px")
        selection_box_number = widgets.BoundedIntText(value=self.default_val, min=self.min_val, max=self.max_val,
                                                      step=1, layout=selection_box_layout)
        self.high_duration_box = selection_box_number

        def handle_label_percentage_description(change):
            # Keep the "(Top x%)" text in sync with the entered threshold.
            new_description = "\xa0 (Top\xa0" + str(self.get_percentile_score(self.df[self.label], change.new)) + "%)"
            label_percentage.value = new_description

        selection_box_number.observe(handle_label_percentage_description, names="value")

        # Default value Button
        def on_button_default_clicked(b):
            selection_box_number.value = self.default_val

        button_default = Button(description="Default: Top\xa0" + str(self.default_long_perc) + "%")
        button_default.on_click(on_button_default_clicked)

        hbox_selection_right = HBox(children=[selection_box_number, label_unit, label_percentage])
        vbox_selection_right = VBox(children=[hbox_selection_right, button_default])
        hbox_selection = HBox(children=[label_description, vbox_selection_right])

        # Run button
        def on_button_run_clicked(b):
            # Snapshot the threshold once so that the whole run uses one value.
            case_duration_th = self.high_duration_box.value
            # Do not run the miner twice for the same threshold value
            if case_duration_th == self.current_case_duration:
                return
            button_run.disabled = True
            try:
                if self.dr_miner is None or self.dr_miner.threshold != case_duration_th:
                    # NOTE(review): self.pos_class is stored but not forwarded here;
                    # None is passed as before — confirm whether that is intended.
                    self.dr_miner = DecisionRuleMiner(self.df, self.label, self.attributes_dict.keys(),
                                                      pos_class=None, threshold=case_duration_th)
                self.run_decision_miner()
                self.current_case_duration = case_duration_th
                self._show_rule_box()
            finally:
                # Re-enable the button even if mining failed; otherwise it would
                # stay disabled for the rest of the session.
                button_run.disabled = False

        button_run = Button(description="Mine rules!")
        self.button_run = button_run
        button_run.on_click(on_button_run_clicked)

        vbox_run_button_layout = Layout(flex='1', justify_content='flex-end', align_items='center')
        vbox_run_button = VBox([button_run], layout=vbox_run_button_layout)

        vbox_duration_selection_layout = Layout(min_width="350px")
        vbox_duration_selection = VBox(children=[label_title, hbox_selection, vbox_run_button],
                                       layout=vbox_duration_selection_layout)
        prob_figure_widget = self.create_probability_figure_widget()
        hbox_all = HBox(children=[vbox_duration_selection, prob_figure_widget])
        return hbox_all

    def get_percentile_score(self, series, val):
        """Return the rounded percentage of ``series`` that lies above ``val``."""
        return round(100 - stats.percentileofscore(series, val))

    def create_probability_figure_widget(self):
        """Return a FigureWidget with the ECDF of the label, cropped to the display range."""
        df_float = pd.DataFrame(self.df[self.label].astype(float))
        fig = px.ecdf(df_float, x=self.label)
        fig.update_layout({'xaxis_title': "Case duration (days)", 'yaxis_title': "cumulative probability"})
        fig.update_xaxes(range=[self.min_display_val, self.max_display_val])
        fig.update_layout(height=300, margin={'l': 10, 'r': 10, 't': 40, 'b': 10})
        fig_widget = go.FigureWidget(fig)
        return fig_widget

    def create_rule_box(self):
        """Return the framed box that holds the rules on the left and the metrics on the right."""
        rule_box_rules = self.create_rule_box_rules()
        rule_box_metrics = self.gen_rule_box_metrics()
        layout_rule_box = Layout(margin='20px 0px 0px 0px', border='3px groove lightblue')
        layout_rule_box_all = Layout(margin='20px 20px 20px 20px', border='3px groove lightblue')
        rule_box_all = HBox(children=[rule_box_rules, rule_box_metrics], layout=layout_rule_box_all)
        rule_box_parent = Box(children=[rule_box_all], layout=layout_rule_box)
        return rule_box_parent

    def create_rule_box_rules(self):
        """Return the VBox with the rule caption, the rendered rules and the simplify/elaborate buttons.

        While one of the two buttons is being processed, both of them and the
        run button are disabled. On success the whole rule box is rebuilt with
        fresh buttons. If the miner reports that its limit is reached, only the
        clicked button stays disabled and the other one gets its previous state back.
        """
        html_rule_caption = HTML(
            "<span style=\"font-weight:bold; font-size: 16px\">" + "Rule for case duration >=\xa0" + str(
                self.current_case_duration) + ":</span>", layout=Layout(margin='0px 0px 10px 0px'))
        html_rules = self.get_pretty_html_rules()
        rules_html_widget = Box([HTML(value=html_rules)])

        def on_click_simplify_rules(b):
            elaborate_was_disabled = button_elaborate_rules.disabled
            button_simplify_rules.disabled = True
            button_elaborate_rules.disabled = True
            self.button_run.disabled = True
            try:
                self.dr_miner.simplify_rule_config()
                self.run_decision_miner()
                self._show_rule_box()
            except MinimumValueReachedError:
                # Already at the simplest configuration: keep "Simplify" disabled
                # but give "Elaborate" back, otherwise the user would be stuck.
                button_simplify_rules.disabled = True
                button_elaborate_rules.disabled = elaborate_was_disabled
            finally:
                self.button_run.disabled = False

        def on_click_elaborate_rules(b):
            simplify_was_disabled = button_simplify_rules.disabled
            button_simplify_rules.disabled = True
            button_elaborate_rules.disabled = True
            self.button_run.disabled = True
            try:
                self.dr_miner.elaborate_rule_config()
                self.run_decision_miner()
                self._show_rule_box()
            except MaximumValueReachedError:
                # Already at the most elaborate configuration: keep "Elaborate"
                # disabled but give "Simplify" back.
                button_elaborate_rules.disabled = True
                button_simplify_rules.disabled = simplify_was_disabled
            finally:
                self.button_run.disabled = False

        button_simplify_rules = Button(description="Simplify rules")
        self.button_simplify_rules = button_simplify_rules
        button_simplify_rules.on_click(on_click_simplify_rules)
        if self.dr_miner.config_index == 0:
            button_simplify_rules.disabled = True
        button_elaborate_rules = Button(description="Elaborate rules")
        self.button_elaborate_rules = button_elaborate_rules
        button_elaborate_rules.on_click(on_click_elaborate_rules)
        if self.dr_miner.config_index >= len(self.dr_miner.configs) - 1:
            button_elaborate_rules.disabled = True

        hbox_change_rules = HBox(children=[button_simplify_rules, button_elaborate_rules])
        vbox_rule = VBox(children=[html_rule_caption, rules_html_widget, hbox_change_rules])

        return vbox_rule

    def run_decision_miner(self):
        """Run the miner pipeline and store its structured rules."""
        self.dr_miner.run_pipeline()
        self.decision_rules = self.dr_miner.structured_rules

    def enable_buttons(self):
        """Enable the elaborate, simplify and run buttons (they must already exist)."""
        self.button_elaborate_rules.disabled = False
        self.button_simplify_rules.disabled = False
        self.button_run.disabled = False

    def disable_buttons(self):
        """Disable the elaborate, simplify and run buttons (they must already exist)."""
        self.button_elaborate_rules.disabled = True
        self.button_simplify_rules.disabled = True
        self.button_run.disabled = True

    def get_pretty_html_rules(self):
        """Render ``self.decision_rules`` as HTML.

        Each rule is a list of conditions with the keys ``attribute``,
        ``value`` and ``unequal_sign``. Conditions of one rule are joined with
        "AND", rules are joined with "OR".
        """
        and_separator = "<span style=\"color: Green; font-weight: bold;\"><br>AND<br></span>"
        or_separator = "<div style=\"color: DodgerBlue; font-weight: bold; " \
                       "margin-top: 5px; margin-bottom: 5px;\">&emsp;OR</div>"

        pretty_rules = []
        for rule in self.decision_rules:
            pretty_conds = []
            for cond in rule:
                attr = cond['attribute']
                val = cond['value']
                unequality = cond['unequal_sign']
                if self.attributes_dict[attr].attribute_data_type == AttributeDataType.NUMERICAL:
                    if unequality != "between":
                        pretty_str = attr + " " + unequality + "= " + val
                    else:
                        pretty_str = attr + " is in range " + val
                else:
                    # Non-numerical attributes are flags: "1" means the attribute is present.
                    if val == "1":
                        pretty_str = attr
                    else:
                        pretty_str = "<span style=\"color: Red\">NOT</span> " + attr
                pretty_conds.append(pretty_str)
            pretty_rules.append(
                "<div style=\"line-height:140%; margin-top: 0px; margin-bottom: 0px;\">"
                + and_separator.join(pretty_conds)
                + "</div>"
            )

        return or_separator.join(pretty_rules)

    def gen_rule_box_metrics(self):
        """Return the VBox with the confusion matrix above the average metrics."""
        conf_matrix = self.gen_conf_matrix()
        avg_metrics = self.gen_avg_rule_metrics()
        metrics_box = VBox([conf_matrix, avg_metrics], layout=Layout(margin='35px 0px 0px 30px'))
        return metrics_box

    def gen_conf_matrix(self):
        """Return a VBox with a 4x4 read-only sheet showing the miner's confusion-matrix metrics."""
        header_color = "AliceBlue"
        cell_color = "Snow"
        font_size = '12px'
        metrics = self.dr_miner.metrics

        def header_cell(row, col, text, color=None):
            # Bold header cell, optionally with a font colour.
            style = {'font-weight': 'bold', 'font-size': font_size}
            if color is not None:
                style['color'] = color
            ipysheet.cell(row, col, text, read_only=True, style=style, background_color=header_color)

        def value_cell(row, col, text):
            ipysheet.cell(row, col, text, read_only=True, style={'font-size': font_size},
                          background_color=cell_color)

        def percent(key):
            return str(round(metrics[key] * 100)) + "%"

        html_rule_performance = HTML("<span style=\"font-weight: bold; font-size:16px\">Rule Performance</span>")
        # ipysheet.cell() adds cells to the sheet created last, so the sheet comes first.
        conf_matrix = ipysheet.sheet(rows=4, columns=4, column_headers=False, row_headers=False)
        ipysheet.cell(0, 0, '', read_only=True, background_color=header_color)
        header_cell(0, 1, 'Rule = True', color='Green')
        header_cell(0, 2, 'Rule = False', color='Red')
        header_cell(0, 3, 'Covered by rule')
        header_cell(1, 0, 'High case duration')
        value_cell(1, 1, str(metrics['true_p']))
        value_cell(1, 2, str(metrics['false_n']))
        value_cell(1, 3, percent('recall_p'))
        header_cell(2, 0, 'Low case duration')
        value_cell(2, 1, str(metrics['false_p']))
        value_cell(2, 2, str(metrics['true_n']))
        value_cell(2, 3, percent('recall_n'))
        header_cell(3, 0, 'Rule correct')
        value_cell(3, 1, percent('precision_p'))
        value_cell(3, 2, percent('precision_n'))
        ipysheet.cell(3, 3, '', read_only=True, background_color=cell_color)
        vbox_all = VBox(children=[html_rule_performance, conf_matrix])
        return vbox_all

    def gen_avg_rule_metrics(self):
        """Return a VBox with the average case duration where the rule is True and where it is False."""
        html_avg_true = Box([HTML("<center><span style=\"font-weight:bold\"> Rule = True</span><br><span "
                                  "style=\"color: Red; font-size:16px\">" + str(
            round(self.dr_miner.metrics['avg_True'])) + "\xa0" + self.label_unit + "</span></center>")],
                            layout=Layout(border='3px double CornflowerBlue', margin='0px 10px 0px 0px'))
        html_avg_false = Box([HTML("<center><span style=\"font-weight:bold\"> Rule = False</span><br><span "
                                   "style=\"color: Green; font-size:16px\">" + str(
            round(self.dr_miner.metrics['avg_False'])) + "\xa0" + self.label_unit + "</span></center>")],
                             layout=Layout(border='3px double CornflowerBlue', color='CornflowerBlue',
                                           margin='0px 0px 0px 10px'))
        hbox_metrics = HBox([html_avg_true, html_avg_false])
        html_avg_case_duration = HTML("<span style=\"font-weight: bold; font-size:16px\">Average case duration</span>")
        vbox_metrics = VBox([html_avg_case_duration, hbox_metrics], layout=Layout(margin='10px 0px 0px 0px'))
        return vbox_metrics

In [2]:
def get_case_duration_pql(dm_info, aggregation="AVG", time_aggregation="DAYS"):
    """Query one aggregated case duration from the data model.

    The duration of a case is the throughput time between 'Process Start' and
    'Process End', measured on the event-time column of the activity table.

    :param dm_info: object providing ``activity_table_name``, ``eventtime_col`` and ``dm``
    :param aggregation: PQL aggregation function applied to the durations
    :param time_aggregation: time unit handed to REMAP_TIMESTAMPS
    :return: first value of the "average case duration" column of the result
    """
    column_name = "average case duration"
    expression = "".join([
        aggregation,
        "(CALC_THROUGHPUT(ALL_OCCURRENCE['Process Start'] TO ALL_OCCURRENCE['Process End'], REMAP_TIMESTAMPS(\"",
        dm_info.activity_table_name,
        '"."',
        dm_info.eventtime_col,
        '", ',
        time_aggregation,
        ")))",
    ])

    pql_query = PQL()
    pql_query.add(PQLColumn(expression, column_name))
    result = dm_info.dm.get_data_frame(pql_query)
    return result[column_name].values[0]


def get_case_duration_development_pql(
    dm_info,
    duration_aggregation="AVG",
    date_aggregation="ROUND_MONTH",
    time_aggregation="DAYS",
):
    """Query the aggregated case duration per date bucket.

    The result has two columns: "datetime" (the event-time column passed
    through ``date_aggregation``) and "case duration" (the throughput time
    between 'Process Start' and 'Process End' passed through
    ``duration_aggregation``).

    :param dm_info: object providing ``activity_table_name``, ``eventtime_col`` and ``dm``
    :param duration_aggregation: PQL aggregation function applied to the durations
    :param date_aggregation: PQL function applied to the event timestamps
    :param time_aggregation: time unit handed to REMAP_TIMESTAMPS
    :return: whatever ``dm_info.dm.get_data_frame`` returns for the query
    """
    # Quoted "table"."column" reference shared by both expressions.
    timestamp_ref = "".join(
        ['"', dm_info.activity_table_name, '"."', dm_info.eventtime_col, '"']
    )

    date_expression = "".join([date_aggregation, "(", timestamp_ref, ")"])
    duration_expression = "".join([
        duration_aggregation,
        "(CALC_THROUGHPUT(ALL_OCCURRENCE['Process Start'] TO ALL_OCCURRENCE['Process End'], REMAP_TIMESTAMPS(",
        timestamp_ref,
        ", ",
        time_aggregation,
        ")))",
    ])

    pql_query = PQL()
    for expression, column_name in (
        (date_expression, "datetime"),
        (duration_expression, "case duration"),
    ):
        pql_query.add(PQLColumn(expression, column_name))
    return dm_info.dm.get_data_frame(pql_query)


def get_quantiles_tracetime_pql(dm_info, quantiles, time_aggregation="DAYS"):
    """Query the requested quantiles of the case duration.

    The duration of a case is the throughput time between 'Process Start' and
    'Process End'. One result column is requested per entry of ``quantiles``
    and the quantile value itself is passed to ``PQLColumn`` as the column name.

    NOTE: a second function with this name is defined further down in the same
    cell and replaces this one when the cell runs; the two are kept in
    agreement here.

    :param dm_info: object providing ``activity_table_name``, ``eventtime_col`` and ``dm``
    :param quantiles: iterable of quantile values
    :param time_aggregation: time unit handed to REMAP_TIMESTAMPS
    :return: whatever ``dm_info.dm.get_data_frame`` returns for the query
    """
    q_quantiles = []
    for quantile in quantiles:
        q = (
            "QUANTILE(CALC_THROUGHPUT(ALL_OCCURRENCE['Process Start'] TO ALL_OCCURRENCE['Process End'], "
            # No blank after the opening quote: it would become part of the table name.
            'REMAP_TIMESTAMPS("'
            + dm_info.activity_table_name
            + '"."'
            + dm_info.eventtime_col
            + '", '
            + time_aggregation
            + ")), "
            + str(quantile)
            + ")"
        )
        # Keep the expression together with its quantile so the column can be named.
        q_quantiles.append((q, quantile))

    query = PQL()
    for expression, quantile in q_quantiles:
        query.add(PQLColumn(expression, quantile))

    df_quantiles = dm_info.dm.get_data_frame(query)
    return df_quantiles


def get_num_cases_with_durations(dm_info, durations, time_aggregation="DAYS"):
    """Count the cases whose duration falls into each of the given ranges.

    One result column is requested per range and named ``str(range)``. A bound
    of ``None`` leaves that side of the range open; the range ``(None, None)``
    is skipped. Both bounds are inclusive.

    :param dm_info: object providing ``activity_table_name``, ``eventtime_col`` and ``dm``
    :param durations: list of ``(from, to)`` tuples
    :param time_aggregation: time unit handed to REMAP_TIMESTAMPS
    :return: whatever ``dm_info.dm.get_data_frame`` returns for the query
    """
    pql_query = PQL()
    for bounds in durations:
        if bounds == (None, None):
            continue

        # Collect the comparisons that apply to this range.
        if bounds[0] is None:
            limits = [(" <= ", bounds[1])]
        elif bounds[1] is None:
            limits = [(" >= ", bounds[0])]
        else:
            limits = [(" >= ", bounds[0]), (" <= ", bounds[1])]

        throughput = "".join([
            'CALC_THROUGHPUT(CASE_START TO CASE_END, REMAP_TIMESTAMPS("',
            dm_info.activity_table_name,
            '"."',
            dm_info.eventtime_col,
            '", ',
            time_aggregation,
            "))",
        ])
        checks = [
            "(" + throughput + operator + str(limit) + ")"
            for operator, limit in limits
        ]
        expression = "SUM(CASE WHEN " + " AND ".join(checks) + " THEN 1 ELSE 0 END)"
        pql_query.add(PQLColumn(expression, str(bounds)))

    return dm_info.dm.get_data_frame(pql_query)


def get_quantiles_tracetime_pql(dm_info, quantiles, time_aggregation="DAYS"):
    """Query the requested quantiles of the case duration.

    The duration of a case is the throughput time between 'Process Start' and
    'Process End'. One result column is requested per entry of ``quantiles``
    and the quantile value itself is passed to ``PQLColumn`` as the column name.

    :param dm_info: object providing ``activity_table_name``, ``eventtime_col`` and ``dm``
    :param quantiles: iterable of quantile values
    :param time_aggregation: time unit handed to REMAP_TIMESTAMPS
    :return: whatever ``dm_info.dm.get_data_frame`` returns for the query
    """
    # First build every (expression, quantile) pair, then assemble the query.
    named_expressions = [
        (
            "".join([
                "QUANTILE(CALC_THROUGHPUT(ALL_OCCURRENCE['Process Start'] TO ALL_OCCURRENCE['Process End'], ",
                'REMAP_TIMESTAMPS("',
                dm_info.activity_table_name,
                '"."',
                dm_info.eventtime_col,
                '", ',
                time_aggregation,
                ")), ",
                str(level),
                ")",
            ]),
            level,
        )
        for level in quantiles
    ]

    pql_query = PQL()
    for expression, level in named_expressions:
        pql_query.add(PQLColumn(expression, level))
    return dm_info.dm.get_data_frame(pql_query)


def _case_throughput_pql(dm_info, time_aggregation):
    """Return the PQL expression for the throughput time of a case in the given unit."""
    return (
        'CALC_THROUGHPUT(CASE_START TO CASE_END, REMAP_TIMESTAMPS("'
        + dm_info.activity_table_name
        + '"."'
        + dm_info.eventtime_col
        + '", '
        + time_aggregation
        + "))"
    )


def get_num_cases_with_durations(dm_info, durations, time_aggregation="DAYS"):
    """Get the number of cases with the durations.

    One result column is requested per range and named ``str(range)``. Both
    bounds are inclusive. A bound of ``None`` leaves that side of the range
    open; a range with both bounds ``None`` is skipped, whether it is given as
    a tuple or as any other two-element sequence.

    :param dm_info: object providing ``activity_table_name``, ``eventtime_col`` and ``dm``
    :param durations: List of Tuples from ... to...
    :param time_aggregation: time unit handed to REMAP_TIMESTAMPS
    :return: whatever ``dm_info.dm.get_data_frame`` returns for the query
    """
    # The throughput expression is identical for every range, so build it once.
    throughput = _case_throughput_pql(dm_info, time_aggregation)

    query = PQL()
    for d in durations:
        lower, upper = d[0], d[1]
        if lower is None and upper is None:
            # Completely open range: nothing to count.
            continue
        conditions = []
        if lower is not None:
            conditions.append("(" + throughput + " >= " + str(lower) + ")")
        if upper is not None:
            conditions.append("(" + throughput + " <= " + str(upper) + ")")
        q = "SUM(CASE WHEN " + " AND ".join(conditions) + " THEN 1 ELSE 0 END)"
        query.add(PQLColumn(q, str(d)))
    df_durations = dm_info.dm.get_data_frame(query)
    return df_durations


def get_potential_extra_bins(
    lower_end, upper_end, bin_width, num_bins, min_val, max_val
):
    """Propose candidate bins that lie beyond the current borders.

    Up to ``num_bins`` candidates of width ``bin_width`` are proposed on each
    side. Both returned lists are ordered from the bin closest to the current
    border to the bin farthest from it.

    :param lower_end: lower border of the bins that already exist
    :param upper_end: upper border (inclusive) of the bins that already exist
    :param bin_width: width of each proposed bin
    :param num_bins: maximum number of candidates per side
    :param min_val: a lower candidate is kept only if its lower bound is above this value
    :param max_val: an upper candidate is kept only if its upper bound is below this value
    :return: tuple ``(potential_lowers, potential_uppers)`` of lists of ``(from, to)`` tuples
    """
    steps = range(1, num_bins + 1)

    # Candidates below the lower border, walking downwards.
    potential_lowers = [
        (lower_end - step * bin_width, lower_end - (step - 1) * bin_width - 1)
        for step in steps
        if lower_end - step * bin_width > min_val
    ]
    # Candidates above the upper border, walking upwards.
    potential_uppers = [
        (upper_end + 1 + (step - 1) * bin_width, upper_end + step * bin_width)
        for step in steps
        if upper_end + step * bin_width < max_val
    ]
    return potential_lowers, potential_uppers


def choose_extra_bins(potential_lowers, potential_uppers, num_bins):
    """Pick up to ``num_bins`` extra bins, alternating between both sides.

    Both candidate lists are expected in the order produced by
    ``get_potential_extra_bins``: the bin adjacent to the existing bins first,
    the farthest bin last. Selection starts on the upper side and alternates;
    when one side runs out, the remaining picks come from the other side.

    The nearest remaining candidate is always taken, so the chosen bins on each
    side are contiguous with the existing bins. Taking the farthest lower
    candidate first would leave a gap next to the existing bins or return the
    lower bins in descending order.

    :param potential_lowers: candidate bins below the existing bins, nearest first
    :param potential_uppers: candidate bins above the existing bins, nearest first
    :param num_bins: maximum number of bins to choose in total
    :return: tuple ``(extra_bins_lower, extra_bins_upper)``, each in ascending order
    """
    # Work on copies so the caller's lists are never modified.
    remaining_lowers = list(potential_lowers)
    remaining_uppers = list(potential_uppers)

    extra_bins_lower = []
    extra_bins_upper = []
    take_from_upper = True
    for _ in range(num_bins):
        if not remaining_lowers and not remaining_uppers:
            break
        if remaining_lowers and (not remaining_uppers or not take_from_upper):
            # Nearest lower candidate goes to the front to keep ascending order.
            extra_bins_lower.insert(0, remaining_lowers.pop(0))
            take_from_upper = True
        else:
            extra_bins_upper.append(remaining_uppers.pop(0))
            take_from_upper = False
    return extra_bins_lower, extra_bins_upper


def get_bins_trace_times(dm_info, num_bins, time_aggregation="DAYS"):
    """Build a histogram of case durations with ``num_bins`` target bins.

    The inner bins are laid out between the ``1 / (2 * num_bins)`` quantile and
    its mirror quantile; one outer bin on each side reaches to the minimum and
    to the maximum duration. If the inner range yields fewer bins than wanted,
    extra bins are added beyond its borders.

    :param dm_info: object handed on to the PQL helper functions
    :param num_bins: number of bins aimed for; must be at least 3 because two
        bins are reserved for the outer ranges
    :param time_aggregation: time unit handed to the PQL helper functions
    :return: data frame with the columns "range" and "cases"
    :raises ValueError: if ``num_bins`` is smaller than 3
    """
    if num_bins < 3:
        # num_bins - 2 inner bins are needed; smaller values divide by zero or less.
        raise ValueError("num_bins must be at least 3, got " + str(num_bins))

    min_percentile = 0.0
    max_percentile = 1.0
    lower_percentile = 1 / (2 * num_bins)
    upper_percentile = 1 - lower_percentile
    df_qs = get_quantiles_tracetime_pql(
        dm_info,
        [lower_percentile, upper_percentile, min_percentile, max_percentile],
        time_aggregation,
    )
    # The quantile columns are looked up by the string form of the percentile.
    min_val = df_qs[str(min_percentile)].values[0]
    max_val = df_qs[str(max_percentile)].values[0]

    lower_end = df_qs[str(lower_percentile)].values[0]
    upper_end = df_qs[str(upper_percentile)].values[0]

    # At least 1: both inner quantiles may coincide, and a width of 0 would
    # divide by zero below.
    bin_width = max(1, int(np.ceil((upper_end - lower_end) / (num_bins - 2))))
    if (max_val - min_val + 1) / bin_width < num_bins and bin_width > 1:
        bin_width -= 1
    # range() needs a plain integer.
    bins_within = int((upper_end - lower_end + 1) // bin_width)
    bins = [
        (lower_end + i * bin_width, lower_end + (i + 1) * bin_width - 1)
        for i in range(bins_within)
    ]
    # Number of inner bins still missing; filled with bins beyond the borders.
    diff_bins = num_bins - 2 - bins_within
    upper_end_within = lower_end + bin_width * bins_within - 1
    potential_lowers, potential_uppers = get_potential_extra_bins(
        lower_end, upper_end_within, bin_width, diff_bins, min_val, max_val
    )

    extra_bins_lower, extra_bins_upper = choose_extra_bins(
        potential_lowers, potential_uppers, diff_bins
    )
    if len(extra_bins_lower) > 0:
        min_inner_bin = extra_bins_lower[0][0]
    else:
        min_inner_bin = lower_end
    if len(extra_bins_upper) > 0:
        max_inner_bin = extra_bins_upper[-1][1]
    else:
        max_inner_bin = upper_end_within

    # Outer bins collect everything below and above the inner bins.
    min_bin = (min_val, min_inner_bin - 1)
    max_bin = (max_inner_bin + 1, max_val)
    bins = [min_bin] + extra_bins_lower + bins + extra_bins_upper + [max_bin]
    df_histogram = get_num_cases_with_durations(
        dm_info, bins, time_aggregation=time_aggregation
    )
    # One row per bin: the column names (the ranges) become the first column.
    df_histogram = df_histogram.transpose().reset_index()
    df_histogram = df_histogram.rename(
        columns={df_histogram.columns[0]: "range", df_histogram.columns[1]: "cases"},
    )
    return df_histogram