# Creating a dataset for customer workflow configuration usage behaviour

This script analyzes all customer configurations, extracting knowledge about the workflow usage, like most common `Event` and `Action` types, as well as the most common `Action` and `Event` sequences. This information can be used to improve the product, by suggesting the most common actions to the user, or to improve the documentation, by providing examples of common workflows.

## Parsing logic

Parses the following information:

- `Event` and `Action` types _(frequency of their occurrence)_
- `Action` and `Event` sequences _(frequency of their occurrence)_

In [1]:
from collections import Counter
from typing import Tuple
from xml.etree import ElementTree as ET

def parse_xml(xml_content: str) -> Tuple[Counter, Counter, Counter, Counter]:
    # layer 1: get all event, action, and condition tags
    event_dict = Counter()
    action_dict = Counter()
    condition_dict = Counter()
    # layer 2: get all pairs of event and action tags
    event_action_pair_dict = Counter()
    # Function to recursively parse elements and update frequencies
    def parse_element(element: ET.Element, current_event=None):
        for child in element:
            if "EventId" in child.attrib:
                event_dict[child.tag] += 1
                current_event = child.tag
            elif "ActionId" in child.attrib:
                action_dict[child.tag] += 1
                if current_event:
                    event_action_pair_dict[(current_event, child.tag)] += 1
            elif child.tag.endswith("Condition"):
                condition_dict[child.tag] += 1
            elif child.tag == "Conditions":
                if "Type" in child.attrib:
                    if child.attrib["Type"] == "OR":
                        condition_dict["CompositeConditionOr"] += 1
                    elif child.attrib["Type"] == "AND":
                        condition_dict["CompositeConditionAnd"] += 1
                parse_element(child, current_event)
            parse_element(child, current_event)
    # Parse the XML data
    root = ET.fromstring(xml_content)
    for business_object in root.findall("BusinessObject"):
        parse_element(business_object)
    return event_dict, action_dict, event_action_pair_dict, condition_dict

In [2]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import Counter
from internal_shared.parsing.workflows import get_workflows_content

events = Counter()
actions = Counter()
pairs = Counter()
conditions = Counter()

xml_contents = get_workflows_content(use_default_filter=True)

with ThreadPoolExecutor() as executor:
    futures = {executor.submit(parse_xml, content): content for content in xml_contents}
    for future in as_completed(futures):
        event_dict, action_dict, event_action_pair_dict, condition_dict = future.result()
        events.update(event_dict)
        actions.update(action_dict)
        pairs.update(event_action_pair_dict)
        conditions.update(condition_dict)

This cell converts data to dataframes and save them to CSV files.

In [3]:
import pandas as pd

df = pd.DataFrame(
    {
        "type": ["Event"] * len(events)
        + ["Action"] * len(actions)
        + ["Condition"] * len(conditions),
        "name": list(events.keys()) + list(actions.keys()) + list(conditions.keys()),
        "frequency": list(events.values())
        + list(actions.values())
        + list(conditions.values()),
    }
)

df.to_csv("/workspace/data/workflows/individual_freq.csv", index=False)

pair_df = pd.DataFrame(
    {
        "event": [pair[0] for pair in pairs.keys()],
        "action": [pair[1] for pair in pairs.keys()],
        "frequency": list(pairs.values()),
    }
)

pair_df.to_csv("/workspace/data/workflows/pair_freq.csv", index=False)

## In-depth workflow analysis

Starting here, this analyzes all the configurations, by going through the top 3 events and actions. Each possible combination is checkend and all business objects, that meet these criteria, are stored in a dictionary.
Optionally, since conditions are not required, the top conditions are added to the matching criteria aswell.

In [4]:
# from df, get top 3 most frequent events, actions, and conditions
top_events = df[df["type"] == "Event"].nlargest(3, "frequency")["name"].tolist()
top_actions = df[df["type"] == "Action"].nlargest(3, "frequency")["name"].tolist()
top_conditions = df[df["type"] == "Condition"].nlargest(3, "frequency")["name"].tolist()

top_events, top_actions, top_conditions

(['OnPropertyChanged', 'OnCustomEvent', 'OnBeforeCommit'],
 ['SetValueByConst', 'SetValueByProperty', 'SetValueByExpression'],
 ['ExpressionCondition', 'CompositeConditionAnd', 'CompositeConditionOr'])

In [25]:
import xml.etree.ElementTree as ET
from typing import List, Tuple


def find_top_combinations(
    xml_content: str,
    top_events: List[str],
    top_actions: List[str],
    top_conditions: List[str],
) -> Tuple[List[dict], int]:
    """Find and extract BusinessObjects containing top combinations of events, actions, and conditions along with their metadata."""
    extracted_objects = []

    root = ET.fromstring(xml_content)
    business_objects = root.findall("BusinessObject")
    for business_object in business_objects:
        match_data = has_top_combination(
            business_object, top_events, top_actions, top_conditions
        )
        if match_data:
            extracted_objects.append({
                "element": business_object,
                "matches": match_data
            })

    return extracted_objects, len(business_objects)

def has_top_combination(
    business_object: ET.Element,
    top_events: List[str],
    top_actions: List[str],
    top_conditions: List[str],
) -> dict:
    """Check if the BusinessObject contains any of the top event, action, and condition combinations. Returns matching combinations."""
    events = {child.tag for child in business_object.findall(".//*[@EventId]")}
    actions = {child.tag for child in business_object.findall(".//*[@ActionId]")}
    conditions = {
        child.tag
        for child in business_object.iter()
        if child.tag.endswith('Condition')
    }
    matches = {"events": [], "actions": [], "conditions": []}
    for event in top_events:
        if event in events:
            matches["events"].append(event)
    for action in top_actions:
        if action in actions:
            matches["actions"].append(action)
    for condition in top_conditions:
        if condition in conditions:
            matches["conditions"].append(condition)

    if matches["events"] and matches["actions"]:
        return matches
    return {}


def ends_with_name(element: ET.Element, suffix: str) -> bool:
    """Helper function to check if the element tag ends with a given suffix."""
    return element.tag.endswith(suffix)

In [27]:
from internal_shared.parsing.workflows import get_workflows_content

xml_contents = get_workflows_content(use_default_filter=True)

extracted_objects = []
all_business_objects = 0

for content in xml_contents:
    combination, total_for_content = find_top_combinations(
        content, top_events, top_actions, top_conditions
    )
    extracted_objects.extend(combination)
    all_business_objects += total_for_content

In [33]:
len(extracted_objects), all_business_objects, round(len(extracted_objects) / all_business_objects, 2)

(11416, 22109, 0.52)

In [35]:
results_without_conditions = [ob for ob in extracted_objects if not ob.get("matches").get("conditions")]
len(results_without_conditions)

5010

As shown in the examples above, over 10.000 configurations meet the criteria of the top 3 events and actions. This is actually makes a lot of sense, as the top 3 actions and events already make up almost 90% of all actions and events!

Thus, the results, that these combinations make up to 52% of all configurations, is not suprising. It is worth mentioning, that one workflow is able to have multiple actions and conditions, which is why we only get 52% of all configurations.

Around 5000 of these configurations have no conditions.


The overall idea of this analysis was to get a small subset of "common" workflows, that can be used as examples to let the LLM create these workflows. These results should have been obvious in the first place. Now, automatically extracting these does not make sense, as 10.000+ configurations are still too many to be used as examples.

The next idea is to simply gather these manually and try first steps with the LLM. The `workflows.md` file contains more information that has been gathered, to explain all parts of the workflows. It also describes, which parts (e.g. events, actions and conditions) are used for the first few tests, as these should only demonstrate, if it is even possible to create workflows with a LLM.

In [21]:
extracted_objects[555]

{'element': <Element 'BusinessObject' at 0x7f9117617920>,
 'matches': {'events': ['OnCustomEvent'],
  'actions': ['SetValueByConst'],
  'conditions': ['ExpressionCondition']}}

In [46]:
import xml.etree.ElementTree as ET

xml_str = ET.tostring(extracted_objects[5]["element"], encoding='unicode')
print(xml_str.strip())

<BusinessObject TypePropertyId="145000">
    <OnCustomEvent EventId="78fd925bb16c4907929245935b235ace" EventActive="False" EventDescription="Wert (RG) mit %-Satz in das Feld @Wert ILV schreiben [Sendung / Freie Aktion]" Instruction="???" IsContextMenu="True" ShowParamDialog="DoNotShow">
      <SetValueByExpression ActionId="791b24c3-00ad-4d14-ba4f-0af285fadde9" Description="zur manuellen Verarbeitung genutzt" TargetProperty="700037" Expression="TODECIMAL([700046] * 0.8)" />
      <ReActGroup Id="21311890" />
    </OnCustomEvent>
  </BusinessObject>


In [23]:
results_without_conditions[0]

{'element': <Element 'BusinessObject' at 0x7f911e0cb290>,
 'matches': {'events': ['OnPropertyChanged'],
  'actions': ['SetValueByProperty', 'SetValueByExpression'],
  'conditions': []}}

In [34]:
import xml.etree.ElementTree as ET

# Assume 'root' is your ElementTree object
tree = ET.ElementTree(results_without_conditions[0]["element"])

# Specify the file name and path
file_name = "output.xml"

# Write the tree to the file
tree.write(file_name, encoding="utf-8", xml_declaration=True)