# DevExpress Criteria Language Syntax function parsing

Parsing all available DevExpress functions with their metadata

In [1]:
import requests
from bs4 import BeautifulSoup
from typing import Set, List, Tuple, Dict
import html
import re
from internal_shared.models.documents import DevExpressFunction

Define all helper functions for the parsing part

In [None]:
def get_html_document(url: str, timeout: float = 30.0) -> BeautifulSoup:
    """Download *url* and parse the response body into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout by default, so without one a stalled
            connection would block this call indefinitely.

    Returns:
        The response text parsed with the ``"html.parser"`` backend.

    Raises:
        RuntimeError: If the request fails, times out, or the server answers
            with an HTTP error status. The original ``requests`` exception is
            chained as the cause.
    """
    # Only the network calls can raise RequestException, so keep the try
    # body limited to them.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        raise RuntimeError(f"Failed to fetch the URL: {url}") from e
    return BeautifulSoup(response.text, "html.parser")


def get_category_from_previous_h3(table) -> str:
    """Derive a category slug from the closest ``<h3>`` sibling before *table*.

    The heading text is stripped and lower-cased, spaces become underscores
    and hyphens are removed. When no preceding ``<h3>`` sibling exists the
    placeholder ``"unknown_category"`` is returned.
    """
    heading = table.find_previous_sibling("h3")
    if not heading:
        return "unknown_category"

    slug = heading.get_text().strip().lower()
    return slug.replace(" ", "_").replace("-", "")


def get_table_rows(table) -> List:
    """Return the ``<tr>`` elements of *table*.

    Rows are taken from ``<tbody>`` when the table has one, otherwise from
    ``<thead>``. A table with neither section yields every ``<tr>`` found
    anywhere beneath it.
    """
    section = table.find("tbody") or table.find("thead")
    container = section if section else table
    return container.find_all("tr")


def parse_table_rows(rows, category: str) -> Set[DevExpressFunction]:
    """Convert the data rows of one table into ``DevExpressFunction`` objects.

    A row is skipped when it contains a ``<th>`` cell or has fewer than three
    ``<td>`` cells. Every remaining row is handed to ``parse_row`` together
    with *category*, and the results are collected into a set.
    """
    parsed = set()
    for row in rows:
        if row.find("th"):
            # Header row, not a function entry.
            continue
        if len(row.find_all("td")) < 3:
            # parse_row reads the first three cells, so shorter rows are skipped.
            continue
        parsed.add(parse_row(row, category))
    return parsed


def parse_row(row, category: str) -> DevExpressFunction:
    """Build a ``DevExpressFunction`` from the first three cells of *row*.

    The first ``<td>`` supplies the function signature (from which the bare
    name is extracted), the second the description and the third the example.
    The stored description is prefixed with ``"<name>: "``.
    """
    tds = row.find_all("td")

    signature = html_decode(tds[0].get_text(strip=True))
    func_name = extract_function_name(signature)
    description_text = html_decode(tds[1].get_text(strip=True))
    example_text = html_decode(tds[2].get_text(strip=True))

    return DevExpressFunction(
        name=func_name,
        description=f"{func_name}: {description_text}",
        example=example_text,
        category=category,
    )


def html_decode(text: str) -> str:
    """Replace HTML character references in *text* with their characters.

    Only entity decoding via ``html.unescape`` is performed; typographic
    Unicode characters such as curly quotes, dashes and ellipses are left
    untouched.
    """
    return html.unescape(text)


def extract_function_name(name_with_params: str) -> str:
    """Return the identifier that precedes the opening parenthesis.

    The text must start with a run of word characters, optionally followed by
    whitespace, and then ``(``. When it does not, the input is returned
    unchanged.
    """
    found = re.match(r"(\w+)\s*\(", name_with_params)
    if found is None:
        return name_with_params
    return found.group(1)

Actual parsing logic

In [None]:
def parse_functions(url: str) -> Tuple[Set[DevExpressFunction], Dict[str, List[str]]]:
    """Scrape every function table that follows the ``<h2 id="functions">`` heading.

    Args:
        url: Address of the documentation page to scrape.

    Returns:
        A ``(functions, metadata)`` tuple. ``functions`` is the set of all
        parsed ``DevExpressFunction`` objects. ``metadata`` maps each category
        slug to the names of the functions found under it; categories whose
        tables yield no functions are omitted.

    Raises:
        RuntimeError: If the page cannot be fetched (raised by
            ``get_html_document``) or contains no ``<h2 id="functions">``.
    """
    doc = get_html_document(url)
    functions_node = doc.find("h2", {"id": "functions"})
    if functions_node is None:
        # Fail with a clear message instead of an AttributeError on None when
        # the page layout changes.
        raise RuntimeError(f'No <h2 id="functions"> heading found at: {url}')

    functions = set()
    metadata: Dict[str, List[str]] = {}

    for table in functions_node.find_all_next("table"):
        category = get_category_from_previous_h3(table)
        parsed_functions = parse_table_rows(get_table_rows(table), category)
        functions.update(parsed_functions)

        if parsed_functions:
            # Use one key for both creating and appending, and sort the names
            # because set iteration order is arbitrary and would otherwise
            # make the metadata differ between runs.
            names = sorted(func.name for func in parsed_functions)
            metadata.setdefault(category, []).extend(names)

    return functions, metadata

In [None]:
import json

url = "https://docs.devexpress.com/CoreLibraries/4928/devexpress-data-library/criteria-language-syntax"
functions, metadata = parse_functions(url)

functions_list = [func.to_dict() for func in functions]

# ensure_ascii=False leaves non-ASCII characters unescaped in the JSON text,
# so both files are opened explicitly as UTF-8 rather than relying on the
# platform's default encoding, which may be unable to encode them.

# Write functions to data.json
with open("/workspace/data/functions/data.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(functions_list, indent=4, ensure_ascii=False))

# Write metadata to data.metadata.json
with open("/workspace/data/functions/data.metadata.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(metadata, indent=4, ensure_ascii=False))