# summary

- `postag_regexp` can occur as part of the following tags:
  - `<match [...]>`
  - `<token [...]>`


In [1]:
# Incorrect usage of postag_regexp="yes": the <token> in the pattern combines it with
# the literal value postag="VBG", which contains no regexp syntax. (The <match> tags in
# the suggestions pair it with real patterns such as "(V.*)" and "V.*".)
rule_xml_wrong = """<rule id="BRIEFCATCH_23860985463569237141502738364142673900" name="BRIEFCATCH_PUNCHINESS_892">
    <pattern>
        <token inflected="yes">have</token>
        <token min="0" regexp="yes">some|real|great|considerable|much</token>
        <token>difficulty</token>
        <token min="0">in</token>
        <token postag_regexp="yes" postag="VBG">
            <exception regexp="yes">accepting|concerning|following|maintaining|obtaining|pending|regarding|seeing|standing</exception>
        </token>
    </pattern>
    <message>Would shorter words add punch?|**Example** from Justice Thomas: “Only later did courts **wrestle with** the language in § 230(f )(3) suggesting providers are liable for content they help develop ‘in part.’”|**Example** from Justice Barrett: “We need not **wrestle with** these questions in this case, though, because the same standard applies regardless whether *Smith* stays or goes.”</message>
    <suggestion>\1 trouble \5</suggestion>
    <suggestion><match no="1" postag="(V.*)" postag_regexp="yes" postag_replace="$1">struggle</match> to <match no="5" postag="V.*" postag_regexp="yes" postag_replace="VB"/></suggestion>
    <suggestion><match no="1" postag="(V.*)" postag_regexp="yes" postag_replace="$1">wrestle</match> with \5</suggestion>
    <short>{"ruleGroup":"BRIEFCATCH_PUNCHINESS_892","ruleGroupIdx":0,"isConsistency":false,"isStyle":true,"correctionCount":3,"priority":"5.543"}</short>
    <example correction="have trouble reaching|struggle to reach|wrestle with reaching">The experts can <marker>have difficulty reaching</marker> consensus.</example>
</rule>"""
# Correct usage of postag_regexp="yes": it appears only on the <match> tag, whose
# postag="V.*" is an actual regexp; the literal <token postag="VBG"> omits it.
rule_xml_right = """<rule id="BRIEFCATCH_117638887568771684836986824386702321799" name="BRIEFCATCH_PUNCHINESS_1433">
    <pattern>
        <token>is</token>
        <token min="0" regexp="yes">almost|also|barely|consistently|even|generally|impermissibly|improperly|inconsistently|increasingly|justifiably|largely|mainly|mostly|nearly|never|occasionally|often|partially|partly|perhaps|permissibly|possibly|practically|primarily|probably|rarely|seldom|sometimes|somewhat|sporadically|still|therefore|thus|typically|understandably|unreliably|usually</token>
        <token min="0" regexp="yes">absolutely|actually|certainly|clearly|completely|considerably|decidedly|definitely|drastically|dramatically|entirely|extremely|flatly|fully|fundamentally|greatly|highly|obviously|perfectly|plainly|quite|really|strongly|surely|totally|truly|utterly|very|wholly|widely|engaged</token>
        <token>in</token>
        <token>the</token>
        <token>business</token>
        <token>of</token>
        <token postag="VBG">
            <exception>banking</exception>
            <exception>concerning</exception>
            <exception>following</exception>
            <exception>manufacturing</exception>
            <exception>pending</exception>
            <exception>regarding</exception>
            <exception>standing</exception>
            <exception>transporting</exception>
        </token>
    </pattern>
    <message>Would using fewer words or a vivid verb help engage the reader?</message>
    <suggestion>\2 <match no="8" postag="V.*" postag_regexp="yes" postag_replace="VBZ"/></suggestion>
    <short>{"ruleGroup":"BRIEFCATCH_PUNCHINESS_1433","ruleGroupIdx":1,"isConsistency":false,"isStyle":true,"correctionCount":1,"priority":"8.1231"}</short>
    <example correction="supplies">He <marker>is in the business of supplying</marker> landscaping materials.</example>
</rule>"""
# Fake "null" example: the rule contains no postag_regexp attribute at all.
# NOTE: it reuses the id and name of rule_xml_right and keeps a blank line between the
# two <exception> elements.
rule_xml_null = """<rule id="BRIEFCATCH_117638887568771684836986824386702321799" name="BRIEFCATCH_PUNCHINESS_1433">
    <pattern>
        <token>is</token>
        <token>business</token>
        <token>of</token>
        <token postag="VBG">
            <exception>banking</exception>

            <exception>transporting</exception>
        </token>
    </pattern>
    <message>Would using fewer words or a vivid verb help engage the reader?</message>
    <short>{"ruleGroup":"BRIEFCATCH_PUNCHINESS_1433","ruleGroupIdx":1,"isConsistency":false,"isStyle":true,"correctionCount":1,"priority":"8.1231"}</short>
    <example correction="supplies">He <marker>is in the business of supplying</marker> landscaping materials.</example>
</rule>"""


# fpath = "../../briefcatch-example-rules/grammar.xml"
# with open(fpath) as f:
#     grammar = f.read()

In [3]:
import re


def get_tags_with_postag_regexp(xml):
    """Return every opening ``<match ...>`` / ``<token ...>`` tag that mentions ``postag_regexp``.

    Args:
        xml: Rule XML as a string.

    Returns:
        A list of the matching tag strings (from ``<`` up to and including the
        closing ``>``), in order of appearance; empty when there is none.
    """
    # ``[^<>]*`` keeps the match inside a single tag: it cannot run past the ``>`` of an
    # earlier tag on the same line, and it still matches tags whose attributes are
    # spread over several lines.
    tag_pattern = r"<(?:match|token)\b[^<>]*\bpostag_regexp\b[^<>]*>"
    return re.findall(tag_pattern, xml)


def get_postag_value(xml):
    """Collect the value of every ``postag="..."`` attribute found in ``xml``.

    The pattern requires ``="`` directly after ``postag``, so attributes such as
    ``postag_regexp`` or ``postag_replace`` are not picked up.

    Returns:
        A list of attribute values in order of appearance; empty when there is none.
    """
    postag_attr = re.compile(r'postag="(.*?)"')
    return [hit.group(1) for hit in postag_attr.finditer(xml)]


def is_complex_regexp(s: str, model="gpt-4"):
    """Report whether ``s`` contains one of the substrings treated as regexp syntax.

    Args:
        s: The string to inspect (the callers in this notebook pass a ``postag`` value).
        model: Not used by the body; kept so existing calls keep working.

    Returns:
        ``True`` if ``s`` contains ``.*``, ``:``, ``|`` or ``$``; ``False`` otherwise.
    """
    # Heuristic only: other regexp constructs (e.g. "?", "+", "[...]") are not detected.
    regexp_markers = (".*", ":", "|", "$")
    return any(marker in s for marker in regexp_markers)


def cleanup_postag_regexp(rule_xml):
    """Remove ``postag_regexp="yes"`` from tags whose ``postag`` is not a regexp.

    Every ``<match>``/``<token>`` tag mentioning ``postag_regexp`` is inspected. When
    its ``postag`` value is not classified as complex by ``is_complex_regexp``, the
    ``postag_regexp="yes"`` attribute is dropped from that tag everywhere it occurs
    in the rule. Prints ``"correcting tag"`` for each tag that gets rewritten.

    Args:
        rule_xml: Rule XML as a string.

    Returns:
        The rule XML with the redundant attributes removed (unchanged if there were none).

    Raises:
        AssertionError: If a single tag carries more than one ``postag`` attribute.
    """
    # 1. get relevant tags from xml (an empty list simply skips the loop)
    for tag in get_tags_with_postag_regexp(rule_xml):
        # 2. get value of postag
        postag_value = get_postag_value(tag)
        assert (
            len(postag_value) <= 1
        ), f"expected only one postag value in enclosing tag, but got {postag_value}"

        if not postag_value:
            # postag_regexp without any postag attribute: nothing to classify, leave as is
            continue

        # 3. if postag_value is not complex, remove postag_regexp
        if is_complex_regexp(postag_value[0]):
            continue

        # Strip the attribute together with the whitespace in front of it, so it is
        # removed whether it sits in the middle of the tag or right before ">" / "/>".
        corrected_tag = re.sub(r'\s+postag_regexp="yes"', "", tag)
        if corrected_tag == tag:
            # e.g. postag_regexp="no": nothing to remove
            continue

        print("correcting tag")
        # Plain string replacement: the tag is literal text, not a regexp pattern, and
        # the replacement must not be interpreted as a substitution template.
        rule_xml = rule_xml.replace(tag, corrected_tag)

    return rule_xml

In [5]:
rule_xml_test_set = [rule_xml_null, rule_xml_right, rule_xml_wrong]

# Keep every cleaned rule so the results can be inspected and checked, instead of
# rebinding (and thereby discarding) the loop variable.
rule_xml_cleaned = [cleanup_postag_regexp(rule_xml) for rule_xml in rule_xml_test_set]
cleaned_null, cleaned_right, cleaned_wrong = rule_xml_cleaned

# Rules without a redundant postag_regexp must come back untouched.
assert cleaned_null == rule_xml_null
assert cleaned_right == rule_xml_right

# Only the literal <token postag="VBG"> loses the attribute; the <match> tags keep theirs.
assert '<token postag="VBG">' in cleaned_wrong
assert (
    cleaned_wrong.count('postag_regexp="yes"')
    == rule_xml_wrong.count('postag_regexp="yes"') - 1
)

correcting tag
