# Creating a dataset for customer expression usage behaviour

This script analyzes all customer configurations, extracting knowledge about the expression usage. The script works in combination with the `scrape_devexpress_functions` notebook, as well as the code analysis tool to get all custom functions. Compared to the initial document chunking, this now includes more metadata about the expression itself, which can be beneficial for further analysis (and document retrieval).

Currently, two tasks for the custom functions are still manual: extracting examples and mapping each function to the correct category.

This first cell contains all helper functions for the parsing and extraction tasks.

In [1]:
from typing import List
import xml.etree.ElementTree as ET
import re
from pathlib import Path


def load_xml_file(file_path: str | Path) -> str:
    with open(file_path, "r") as file:
        return file.read()


# An identifier (letters, digits, underscores; not starting with a digit)
# followed by optional whitespace and an opening parenthesis. Digits must be
# allowed inside the name, otherwise calls such as ``Log10(`` are missed.
_FUNCTION_CALL_PATTERN = re.compile(r"\b([A-Za-z_][A-Za-z0-9_]*)\s*\(")


def extract_function_names(expression: str) -> List[str]:
    """Return the names of all function-call-like tokens in ``expression``.

    A token counts as a function call when an identifier is directly followed
    (ignoring whitespace) by ``(``. Names are returned in order of appearance,
    with their original casing, and duplicates are kept so that callers can
    count occurrences.

    Note that the match is purely lexical: every identifier written directly
    before an opening parenthesis is reported, including text inside string
    literals.

    Args:
        expression: The expression text to scan.

    Returns:
        A list of matched identifiers; empty when nothing matches.
    """
    return _FUNCTION_CALL_PATTERN.findall(expression)


def parse_and_extract_functions(xml_content):
    """Collect upper-cased function names from every ``Expression`` attribute.

    The XML text is parsed, each element of the tree (root included) is
    visited, and for every element carrying a non-empty ``Expression``
    attribute the function names found by ``extract_function_names`` are
    appended in upper case. Duplicates are kept.
    """
    collected = []

    for node in ET.fromstring(xml_content).iter():
        expression_text = node.attrib.get("Expression", "")
        # Elements without the attribute (or with an empty one) contribute nothing.
        if not expression_text:
            continue
        collected += [name.upper() for name in extract_function_names(expression_text)]

    return collected

def should_exclude_path(file_path: Path):
    """Tell whether ``file_path`` belongs to the filtered-out set.

    A path is excluded when any of its components contains "SOLOPLAN"
    (compared case-insensitively), or when one of its components is exactly
    "TestSystem" (case-sensitive).
    """
    segments = file_path.parts
    mentions_soloplan = any("SOLOPLAN" in segment.upper() for segment in segments)
    return mentions_soloplan or "TestSystem" in segments

# Recursively gather every ``*.xml`` file below the customer workflow folder.
all_files: List[Path] = list(
    Path("/workspace/data/customer_data/workflows").rglob("*.xml")
)

The second cell loads the metadata files in order to get the category mappings for each function.

In [2]:
from pathlib import Path
import json
from collections import defaultdict
from typing import Dict, Set

general_metadata_file = Path("/workspace/data/functions/data.metadata.json")
custom_metadata_file = Path("/workspace/data/functions/data.custom.metadata.json")

# Load both JSON mappings; each is iterated below as key -> collection of names.
general_metadata: dict = json.loads(general_metadata_file.read_text())
custom_metadata: dict = json.loads(custom_metadata_file.read_text())

# Nested accumulator: metadata[key][source label] is a set of names that is
# created on first access.
metadata: Dict[str, Dict[str, Set[str]]] = defaultdict(lambda: defaultdict(set))

# Merge the general entries first and the custom entries second, tagging each
# with the label of the file it came from.
for source_label, mapping in (
    ("DevExpress", general_metadata),
    ("Soloplan", custom_metadata),
):
    for key, names in mapping.items():
        metadata[key][source_label].update(names)

# Freeze the nested defaultdicts into plain dicts so that later lookups of
# missing keys no longer create entries implicitly.
metadata = {key: dict(by_source) for key, by_source in metadata.items()}

The last cell creates the dataframe and saves it to a `csv` file in order to speed up any subsequent analysis.

In [3]:
import pandas as pd
from collections import Counter

# Read every collected XML file once, keyed by its path.
file_contents = {}
for xml_path in all_files:
    file_contents[xml_path] = load_xml_file(xml_path)

# Gather function names twice: across all files, and across only those files
# that should_exclude_path does not reject.
overall_function_names = []
filtered_function_names = []

for xml_path, xml_text in file_contents.items():
    names_in_file = parse_and_extract_functions(xml_text)
    overall_function_names += names_in_file
    if should_exclude_path(xml_path):
        continue
    filtered_function_names += names_in_file

# Tally how often each function name occurs in both collections.
overall_counter = Counter(overall_function_names)
filtered_counter = Counter(filtered_function_names)

# Upper-cased lookups from function name to its category and to its source.
# When a name appears more than once in the metadata, the last entry wins.
function_to_category = {}
function_to_source = {}
for category_name, by_source in metadata.items():
    for source_name, names in by_source.items():
        for upper_name in (name.upper() for name in names):
            function_to_category[upper_name] = category_name
            function_to_source[upper_name] = source_name

# One row per function name seen in any file, with its overall frequency.
df = pd.DataFrame.from_dict(overall_counter, orient="index", columns=["Frequency"])
# Names that never occur in the filtered set get a frequency of 0.
df["FilteredFrequency"] = df.index.map(filtered_counter).fillna(0).astype(int)
# Turn the index (the function names) into a regular, properly named column.
df = df.reset_index().rename(columns={"index": "FunctionName"})

# Attach category and source; names missing from the metadata get fallbacks.
df["Category"] = df["FunctionName"].map(
    lambda name: function_to_category.get(name, "unknown_functions")
)
df["Source"] = df["FunctionName"].map(
    lambda name: function_to_source.get(name, "unknown")
)

df.to_csv("/workspace/data/functions/functions.csv", index=False)