In [24]:
from typing import Generator, Optional, Set
import xml.etree.ElementTree as XMLElementTree
import re
from io import StringIO


# Attribute names whose values are appended to the XPath by default.
INCLUDE_ATTRIBUTES_KEYS_VALUES = {"schemeName", "unitCode", "listName"}
# Attribute values that are never appended to the XPath by default.
EXCLUDE_ATTRIBUTES_KEYS_VALUES = {"nuts", "country", "cpv"}


def get_unique_xpath_generator(xml_content: str,
                               remove_namespaces: bool = True,
                               include_values_by_attribute_names: Optional[Set[str]] = None,
                               exclude_attribute_values: Optional[Set[str]] = None
                               ) -> Generator[str, None, None]:
    """Yield the XPath of every element and attribute found in *xml_content*.

    For each element, in document order, the generator yields one entry per
    attribute followed by the element path itself:

    * ``<xpath>@<name>=<value>`` when the attribute name is in
      ``include_values_by_attribute_names`` and its value is not in
      ``exclude_attribute_values``;
    * ``<xpath>@<name>`` for every other attribute;
    * ``<xpath>`` for the element.

    The generator does not de-duplicate: repeated elements produce repeated
    paths, so wrap the result in ``set(...)`` to obtain unique XPaths.

    :param xml_content: the XML document as a string.
    :param remove_namespaces: when true, the ``{namespace}`` prefix is dropped
        from element tags; attribute names are left as parsed.
    :param include_values_by_attribute_names: attribute names whose values are
        appended to the path; ``None`` selects
        ``INCLUDE_ATTRIBUTES_KEYS_VALUES``.
    :param exclude_attribute_values: attribute values that must not be
        appended to the path; ``None`` selects
        ``EXCLUDE_ATTRIBUTES_KEYS_VALUES``.
    :return: a generator of XPath strings.
    :raises xml.etree.ElementTree.ParseError: if *xml_content* is not
        well-formed XML (raised lazily, while iterating).
    """
    # Resolve the ``None`` defaults explicitly; a membership test against
    # ``None`` raises TypeError. ``is None`` (not ``or``) keeps an explicitly
    # passed empty set meaningful.
    if include_values_by_attribute_names is None:
        include_values_by_attribute_names = INCLUDE_ATTRIBUTES_KEYS_VALUES
    if exclude_attribute_values is None:
        exclude_attribute_values = EXCLUDE_ATTRIBUTES_KEYS_VALUES

    path = []  # tags of the currently open elements, root first
    events = XMLElementTree.iterparse(StringIO(xml_content), events=('start', 'end'))
    for event, element in events:
        if event == 'end':
            path.pop()
            # The element is fully processed: release its text, attributes
            # and children instead of keeping them until parsing finishes.
            element.clear()
            continue

        tag = element.tag
        if remove_namespaces:
            # ElementTree spells qualified names as "{namespace-uri}local".
            tag = tag.rpartition('}')[2]
        path.append(tag)

        xpath = "/" + '/'.join(path)
        for attribute_key, attribute_value in element.attrib.items():
            if (attribute_key in include_values_by_attribute_names
                    and attribute_value not in exclude_attribute_values):
                yield f"{xpath}@{attribute_key}={attribute_value}"
            else:
                yield f"{xpath}@{attribute_key}"
        yield xpath

In [25]:
from tests.test_data import EFORMS_SAMPLE_FILE_PATH

# Read the sample notice file as UTF-8 text.
xml_content = EFORMS_SAMPLE_FILE_PATH.read_text(encoding="utf-8")
# Collapse the generated XPaths into a set so repeated paths appear only once.
xpaths = {*get_unique_xpath_generator(xml_content)}
# Bare expression: lets the notebook render the resulting set as cell output.
xpaths

{'/PriorInformationNotice',
 '/PriorInformationNotice/ContractFolderID',
 '/PriorInformationNotice/ContractingParty',
 '/PriorInformationNotice/ContractingParty/BuyerProfileURI',
 '/PriorInformationNotice/ContractingParty/ContractingActivity',
 '/PriorInformationNotice/ContractingParty/ContractingActivity/ActivityTypeCode',
 '/PriorInformationNotice/ContractingParty/ContractingActivity/ActivityTypeCode@listName=authority-activity',
 '/PriorInformationNotice/ContractingParty/ContractingPartyType',
 '/PriorInformationNotice/ContractingParty/ContractingPartyType/PartyTypeCode',
 '/PriorInformationNotice/ContractingParty/ContractingPartyType/PartyTypeCode@listName=buyer-legal-type',
 '/PriorInformationNotice/ContractingParty/Party',
 '/PriorInformationNotice/ContractingParty/Party/PartyIdentification',
 '/PriorInformationNotice/ContractingParty/Party/PartyIdentification/ID',
 '/PriorInformationNotice/ContractingParty/Party/PartyIdentification/ID@schemeName=organization',
 '/PriorInformatio