In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from utils.regexp_validation import post_process_xml

In [3]:
# Sample rule XML held in a string: one <rule> with three <antipattern> blocks, one <pattern>,
# a message, a suggestion, a JSON <short> payload and four <example> elements.
# Presumably the input fixture for the XML helpers exercised in this notebook — confirm.
# NOTE(review): the text closes </rule> twice but opens <rule> only once, and the <pattern>
# token uses postag "PRP/$" where the parallel <antipattern> token uses "PRP$" — confirm
# whether these irregularities are deliberate test cases or typos.
test_xml = """
<rule id="BRIEFCATCH_51343401212486148133691753914879276798" name="BRIEFCATCH_CONCISENESS_4143">
    <antipattern>
        <token>prior</token>
        <token>to</token>
        <token postag="VB">
            <exception>can</exception>
            <exception>case</exception>
            <exception>contract</exception>
            <exception>counsel</exception>
            <exception>court</exception>
            <exception>dissent</exception>
            <exception>district</exception>
            <exception>equal</exception>
            <exception>even</exception>
            <exception>evidence</exception>
            <exception>found</exception>
            <exception>jail</exception>
            <exception>judge</exception>
            <exception>motion</exception>
            <exception>people</exception>
            <exception>respect</exception>
            <exception>source</exception>
            <exception>still</exception>
            <exception>title</exception>
            <exception>trial</exception>
            <exception>up</exception>
            <exception>view</exception>
            <exception>while</exception>
            <exception>will</exception>
        </token>
    </antipattern>
    <antipattern>
        <token>
            <exception>immediately</exception>
        </token>
        <token case_sensitive="yes">prior</token>
        <token>to</token>
        <token postag="N.*|PRP|PRP$|DT" postag_regexp="yes">
            <exception regexp="yes">all|are|being|believes|but|by|can|circuit|clear|concerning|concerns|dissent|does|due|even|fails|find|finds|get|given|having|his|hold|holds|if|left|like|long|make|may|might|must|no|note|one|or|other|prior|regarding|see|then|try|will</exception>
        </token>
        <token postag="VBG"/>
    </antipattern>
    <antipattern>
        <token>prior</token>
        <token>to</token>
        <token>the</token>
        <token regexp="yes">effective|entry|filing|hearing|interrogation</token>
    </antipattern>
    <pattern>
        <token>
            <exception>immediately</exception>
        </token>
        <token case_sensitive="yes">prior</token>
        <token>to</token>
        <token postag="N.*|PRP|PRP/$|DT" postag_regexp="yes">
            <exception regexp="yes">all|are|being|believes|but|by|can|circuit|clear|concerning|concerns|dissent|does|due|even|fails|find|finds|get|given|having|his|hold|holds|if|left|like|long|make|may|might|must|no|note|one|or|other|prior|regarding|see|then|try|will</exception>
        </token>
    </pattern>
    <message>Would using fewer words help sharpen the point?|**Example** from Chief Justice Roberts: "Forcing the officer to wait and predict whether such exigencies will occur **before entry** is in practice no different from forcing the officer to wait for these exigencies to occur."|**Example** from Justice Kagan: "It instead gives the court a brief chance to fix mistakes **before its** (single) judgment on a (single) habeas application becomes final and thereby triggers the time for appeal."|**Example** from Judge Easterbrook: "The seller does not need to conduct tests **before** asserting that this product reduces the risk of infection from cuts."|**Example** from Paul Clement: "Instead, like so many **before** them, the latest courts to take up that misguided charge cannot even agree on a single test."|**Example** from Office of Legal Counsel: "The tax committees . . . are the only congressional committees that need not obtain a resolution from the full House or Senate **before** they request tax information."</message>
    <suggestion><match no="1"/> before <match no="4"/></suggestion>
    <short>{"ruleGroup":null,"ruleGroupIdx":0,"isConsistency":false,"isStyle":true,"correctionCount":1,"priority":"4.575","WORD":true,"OUTLOOK":true}</short>
    <example correction="that before July">Sandy testified <marker>that prior to July</marker> 29, he had never heard</example>
    <example>And prior to walk,</example>
    <example>And prior to plaintiff pleading for relief.</example>
    <example>And prior to the interrogation she smiled.</example>
</rule>
</rule>
"""

In [4]:
import re


def alphabetize_operands_in_string(s):
    """
    Return ``s`` with its "|"-separated operands in sorted order.

    The operands are compared as plain strings (so uppercase sorts before
    lowercase) and re-joined with "|". A string without any "|" is handed
    back untouched, e.g.
        "will|can|case" -> "can|case|will"
        "prior"         -> "prior"
    """
    operands = s.split("|")
    # A single piece means there was no "|" to split on: nothing to reorder.
    if len(operands) == 1:
        return s
    operands.sort()
    return "|".join(operands)


def get_value_of_tag(tag_name, xml):
    """
    Collect the text found between ``<tag_name ...>`` and ``</tag_name>``.

    Matching is regex based and non-greedy. Because "." does not match a
    newline here, only elements whose opening tag, text and closing tag sit
    on one line are reported. ``tag_name`` is placed into the pattern as-is.

    Returns a list of the captured texts in order of appearance (empty when
    nothing matches).
    """
    element_pattern = re.compile(rf"<{tag_name}.*?>(.*?)</{tag_name}>")
    return element_pattern.findall(xml)


def alphabetize_operands_in_tag(tag_name, xml):
    """
    Sort the "|"-separated operands inside each single-line ``<tag_name>`` element.

    Args:
        tag_name: element name to process, e.g. "token" or "exception". It is
            interpolated into the regular expression unescaped.
        xml: the XML text to process.

    Returns:
        A copy of ``xml`` where the text of every matching element that
        contains "|" has its operands reordered with ``sorted`` (plain string
        comparison). Elements without "|" and all text outside the matched
        elements are left exactly as they were.

    Side effects:
        Prints each operand string it is about to sort.
    """
    # Same matching rule as get_value_of_tag, but the opening and closing tags
    # are captured too so the substitution can rebuild the element in place.
    element_pattern = re.compile(rf"(<{tag_name}.*?>)(.*?)(</{tag_name}>)")

    def _sort_element(match):
        opening, value, closing = match.groups()
        if "|" not in value:
            return match.group(0)
        print(value)
        # Sort the operand string itself. Passing it back through the XML-level
        # driver would find no tags in it and hand it back unsorted.
        return opening + "|".join(sorted(value.split("|"))) + closing

    # Substituting per match (rather than str.replace on the whole document)
    # keeps identical text elsewhere in the XML from being rewritten.
    return element_pattern.sub(_sort_element, xml)


def alphabetize_operands(xml):
    """
    Pass ``xml`` through ``alphabetize_operands_in_tag`` once per operand-bearing tag.

    The tags are handled in a fixed order, "token" first and then "exception",
    each pass receiving the output of the previous one. The final text is returned.
    """
    processed = xml
    for operand_tag in ("token", "exception"):
        processed = alphabetize_operands_in_tag(operand_tag, processed)
    return processed

effective|entry|filing|hearing|interrogation
all|are|being|believes|but|by|can|circuit|clear|concerning|concerns|dissent|does|due|even|fails|find|finds|get|given|having|his|hold|holds|if|left|like|long|make|may|might|must|no|note|one|or|other|prior|regarding|see|then|try|will
all|are|being|believes|but|by|can|circuit|clear|concerning|concerns|dissent|does|due|even|fails|find|finds|get|given|having|his|hold|holds|if|left|like|long|make|may|might|must|no|note|one|or|other|prior|regarding|see|then|try|will
