# Creating a dataset for customer workflow configuration usage behaviour

This script analyzes all customer configurations to extract knowledge about workflow usage, such as the most common `Event` and `Action` types and the most common `Event` and `Action` sequences. This information can be used to improve the product (for example, by suggesting the most common actions to the user) or to improve the documentation (by providing examples of common workflows).

This first cell contains the general helper functions used to locate, filter, and load the relevant XML files.

In [1]:
from collections import Counter
from typing import List, Tuple
import xml.etree.ElementTree as ET
from pathlib import Path


def load_xml_file(file_path: str | Path) -> str:
    with open(file_path, "r") as file:
        return file.read()


def should_exclude_path(file_path: Path) -> bool:
    """Tell whether ``file_path`` lies in a location that must be skipped.

    A path is excluded when any of its components contains "SOLOPLAN"
    (compared case-insensitively) or when one component is exactly
    "TestSystem".

    Args:
        file_path: Path whose components are inspected.

    Returns:
        ``True`` if the path should be excluded, ``False`` otherwise.
    """
    path_parts = file_path.parts
    # Case-insensitive substring match against every component
    for component in path_parts:
        if "SOLOPLAN" in component.upper():
            return True
    # Exact (case-sensitive) match of a whole component
    return "TestSystem" in path_parts


def should_exclude_file(file_path: Path) -> bool:
    """Tell whether the file itself must be skipped, based on its name.

    A file is excluded when its name (final path component) contains
    "Options" or "Property"; the comparison is case-sensitive.

    Args:
        file_path: Path whose file name is inspected.

    Returns:
        ``True`` if the file should be excluded, ``False`` otherwise.
    """
    filename = file_path.name
    return any(marker in filename for marker in ("Options", "Property"))


# Collect every XML file found recursively below the workflows directory
all_files: List[Path] = list(
    Path("/workspace/data/customer_data/workflows").rglob("*.xml")
)

## Parsing logic

Parses the following information:

- `Event` and `Action` types _(frequency of their occurrence)_
- `Action` and `Event` sequences _(frequency of their occurrence)_

In [2]:
def parse_xml(xml_content: str) -> Tuple[Counter, Counter, Counter]:
    """Count event tags, action tags and (event, action) pairs in one document.

    Only ``BusinessObject`` elements that are direct children of the root are
    walked. Within them, an element carrying an ``EventId`` attribute is
    counted as an event and an element carrying an ``ActionId`` attribute is
    counted as an action; counts are keyed by the element's tag.

    Args:
        xml_content: The XML document as a string.

    Returns:
        A tuple ``(events, actions, pairs)`` where ``events`` and ``actions``
        map a tag to its number of occurrences and ``pairs`` maps
        ``(event_tag, action_tag)`` to its number of occurrences.
    """
    event_counts = Counter()
    action_counts = Counter()
    pair_counts = Counter()

    def walk(node: ET.Element, active_event=None):
        # ``active_event`` is the tag of the most recent event seen; it is
        # handed down to descendants and also carries over to the siblings
        # that follow the event within the same parent.
        for child in node:
            tag = child.tag
            attributes = child.attrib
            if "EventId" in attributes:
                event_counts[tag] += 1
                active_event = tag
            if "ActionId" in attributes:
                action_counts[tag] += 1
                # Actions seen before any event are counted but not paired
                if active_event:
                    pair_counts[(active_event, tag)] += 1
            walk(child, active_event)

    document_root = ET.fromstring(xml_content)
    business_objects = document_root.findall("BusinessObject")
    for business_object in business_objects:
        walk(business_object)

    return event_counts, action_counts, pair_counts


# Load only the files that will actually be analysed. The exclusion rules are
# applied *before* reading, so excluded files cost no disk I/O or memory and
# an unreadable excluded file cannot abort the run.
file_contents = {
    file: load_xml_file(file)
    for file in all_files
    if not (should_exclude_path(file) or should_exclude_file(file))
}

# Overall frequencies, accumulated across every analysed file
events = Counter()
actions = Counter()
pairs = Counter()

# Parse each file and fold its per-file counts into the overall counters
for content in file_contents.values():
    file_events, file_actions, file_pairs = parse_xml(content)
    events.update(file_events)
    actions.update(file_actions)
    pairs.update(file_pairs)

This cell converts the collected counts to dataframes and saves them to CSV files.

In [3]:
import pandas as pd

# One row per distinct tag: all events first, followed by all actions
individual_rows = [("Event", name, count) for name, count in events.items()]
individual_rows += [("Action", name, count) for name, count in actions.items()]

df = pd.DataFrame(individual_rows, columns=["type", "name", "frequency"])
df.to_csv("/workspace/data/workflows/individual_freq.csv", index=False)

# One row per distinct (event, action) combination
pair_rows = [
    (event_name, action_name, count)
    for (event_name, action_name), count in pairs.items()
]

pair_df = pd.DataFrame(pair_rows, columns=["event", "action", "frequency"])
pair_df.to_csv("/workspace/data/workflows/pair_freq.csv", index=False)