In [112]:
import regex, math

In [113]:
cat_pattern = open("categories.txt").read()
cat_pattern

'(?:\\p{Grapheme_Cluster_Break=CR}\\p{Grapheme_Cluster_Break=LF}|\\p{Grapheme_Cluster_Break=CR}|\\p{Grapheme_Cluster_Break=LF})|\\p{Grapheme_Cluster_Break=Control}|\\p{Grapheme_Cluster_Break=Prepend}*(?:(?:\\p{Grapheme_Cluster_Break=L}*(?:\\p{Grapheme_Cluster_Break=V}+|\\p{Grapheme_Cluster_Break=LV}\\p{Grapheme_Cluster_Break=V}*|\\p{Grapheme_Cluster_Break=LVT})\\p{Grapheme_Cluster_Break=T}*|\\p{Grapheme_Cluster_Break=L}+|\\p{Grapheme_Cluster_Break=T}+)|(?:\\p{Grapheme_Cluster_Break=RI}\\p{Grapheme_Cluster_Break=RI})|(?:\\p{Extended_Pictographic}(?:\\p{Grapheme_Cluster_Break=Extend}*\\p{Grapheme_Cluster_Break=ZWJ}\\p{Extended_Pictographic})*)|(?:\\p{InCB=Consonant}(?:(?:\\p{InCB=Extend}|\\p{InCB=Linker})*\\p{InCB=Linker}(?:\\p{InCB=Extend}|\\p{InCB=Linker})*\\p{InCB=Consonant})+)|[^\\p{Grapheme_Cluster_Break=Control}\\p{Grapheme_Cluster_Break=CR}\\p{Grapheme_Cluster_Break=LF}])(?:\\p{Grapheme_Cluster_Break=Extend}|\\p{Grapheme_Cluster_Break=ZWJ}|\\p{Grapheme_Cluster_Break=SpacingMark})*

In [122]:
categories = [
    # this is special cased because negated sets don't play well with variable length character sets
    "[^\\p{Grapheme_Cluster_Break=Control}\\p{Grapheme_Cluster_Break=CR}\\p{Grapheme_Cluster_Break=LF}]",
    # normal classes below
    "\\p{Grapheme_Cluster_Break=CR}",
    "\\p{Grapheme_Cluster_Break=LF}",
    "\\p{Grapheme_Cluster_Break=Control}",
    "\\p{Grapheme_Cluster_Break=Prepend}",
    "\\p{Grapheme_Cluster_Break=L}",
    "\\p{Grapheme_Cluster_Break=V}",
    "\\p{Grapheme_Cluster_Break=LV}",
    "\\p{Grapheme_Cluster_Break=LVT}",
    "\\p{Grapheme_Cluster_Break=T}",
    "\\p{Grapheme_Cluster_Break=RI}",
    "\\p{Extended_Pictographic}",
    "\\p{Grapheme_Cluster_Break=Extend}",
    "\\p{Grapheme_Cluster_Break=ZWJ}",
    "\\p{InCB=Consonant}",
    "\\p{InCB=Extend}",
    "\\p{InCB=Linker}",
    "\\p{Grapheme_Cluster_Break=SpacingMark}",
]

In [115]:
cat_chars: dict[str, list[str]] = {}
for category in categories:
    chars = []
    pat = regex.compile(category)
    
    lim = 0x110000 * 1
    for i in range(lim):
        try:
            chr(i).encode("utf-8")
            if regex.match(pat, chr(i)):
                chars.append(chr(i))
        except:
            pass
    cat_chars[category] = chars

In [116]:
{key: len(value) for key, value in cat_chars.items()}

{'[^\\p{Grapheme_Cluster_Break=Control}\\p{Grapheme_Cluster_Break=CR}\\p{Grapheme_Cluster_Break=LF}]': 1108169,
 '\\p{Grapheme_Cluster_Break=CR}': 1,
 '\\p{Grapheme_Cluster_Break=LF}': 1,
 '\\p{Grapheme_Cluster_Break=Control}': 3893,
 '\\p{Grapheme_Cluster_Break=Prepend}': 27,
 '\\p{Grapheme_Cluster_Break=L}': 125,
 '\\p{Grapheme_Cluster_Break=V}': 95,
 '\\p{Grapheme_Cluster_Break=LV}': 399,
 '\\p{Grapheme_Cluster_Break=LVT}': 10773,
 '\\p{Grapheme_Cluster_Break=T}': 137,
 '\\p{Grapheme_Cluster_Break=RI}': 26,
 '\\p{Extended_Pictographic}': 3537,
 '\\p{Grapheme_Cluster_Break=Extend}': 2130,
 '\\p{Grapheme_Cluster_Break=ZWJ}': 1,
 '\\p{InCB=Consonant}': 240,
 '\\p{InCB=Extend}': 884,
 '\\p{InCB=Linker}': 6,
 '\\p{Grapheme_Cluster_Break=SpacingMark}': 395}

In [159]:
def regexify(c: str):
    h = ":" + c.encode("utf-8").hex(":")
    return h.replace(":", "\\x").encode("utf-8")

[(chr(n), regexify(chr(n))) for n in [53, 652, 9382]]

[('5', b'\\x35'), ('ʌ', b'\\xca\\x8c'), ('⒦', b'\\xe2\\x92\\xa6')]

In [217]:
import re

def form_prefix_tree(*bins: bytes):
    prefix_tree = {}
    for bin in bins:
        if bin[0] in prefix_tree and bin[1:]:
            prefix_tree[bin[0]].append(bin[1:])
        elif bin[1:]:
            prefix_tree[bin[0]] = [bin[1:]]
        else:
            prefix_tree[bin[0]] = []
    # recurse
    return {i: form_prefix_tree(*cs) for i, cs in prefix_tree.items()}

def saturate(tree: dict):
    return {re.escape(bytes([key])): saturate(value) for key, value in tree.items()}

def prefix_to_regex(tree: dict):
    if tree:
        return b'(?:' + b'|'.join(key + prefix_to_regex(val) for key, val in tree.items()) + b')'
    else:
        return b''

def chars_union(*cs: str):
    prefix_tree = form_prefix_tree(*(c.encode("utf-8") for c in cs))
    # print(prefix_tree)
    # print(saturate(prefix_tree))
    result = prefix_to_regex(saturate(prefix_tree))

    # print(result)
    # result = b"(?:" + b"|".join(regexify(c) for c in cs) + b")"
    # print(result)
    # print("initial", len(result), "final", len(prefix_to_regex(prefix_tree)), "delta", len(prefix_to_regex(prefix_tree)) - len(result))
    return result

chars_union(*"こ+え")

b'(?:\xe3(?:\x81(?:\x93|\x88))|\\+)'

In [218]:
cat_ranges = {cat.encode("utf-8"): chars_union(*chars) for cat, chars in cat_chars.items()}
{cat: len(union) for cat, union in cat_ranges.items()}

{b'[^\\p{Grapheme_Cluster_Break=Control}\\p{Grapheme_Cluster_Break=CR}\\p{Grapheme_Cluster_Break=LF}]': 2304325,
 b'\\p{Grapheme_Cluster_Break=CR}': 6,
 b'\\p{Grapheme_Cluster_Break=LF}': 6,
 b'\\p{Grapheme_Cluster_Break=Control}': 8187,
 b'\\p{Grapheme_Cluster_Break=Prepend}': 147,
 b'\\p{Grapheme_Cluster_Break=L}': 278,
 b'\\p{Grapheme_Cluster_Break=V}': 223,
 b'\\p{Grapheme_Cluster_Break=LV}': 1696,
 b'\\p{Grapheme_Cluster_Break=LVT}': 22444,
 b'\\p{Grapheme_Cluster_Break=T}': 302,
 b'\\p{Grapheme_Cluster_Break=RI}': 70,
 b'\\p{Extended_Pictographic}': 7502,
 b'\\p{Grapheme_Cluster_Break=Extend}': 5193,
 b'\\p{Grapheme_Cluster_Break=ZWJ}': 15,
 b'\\p{InCB=Consonant}': 543,
 b'\\p{InCB=Extend}': 2426,
 b'\\p{InCB=Linker}': 50,
 b'\\p{Grapheme_Cluster_Break=SpacingMark}': 1248}

In [219]:
bin_pattern = cat_pattern.encode("utf-8")
for cat, union in cat_ranges.items():
    bin_pattern = bin_pattern.replace(cat, union)

len(bin_pattern)

2371402

In [220]:
import re, grapheme

final_pattern = re.compile(bin_pattern)
s = " 😀😀😀👨‍👩‍👧‍👦👨‍👩‍👧‍👦😀😀"
len(final_pattern.findall(s.encode("utf-8"))), grapheme.length(s)

(8, 8)

In [227]:
line_width = 79
offset = len("pattern = (")
indent = len("    ")
delimiters = len("b''")

bin_remaining = ascii(bin_pattern)[2:-1]

undecorated = []
remaining_budget = line_width - indent - delimiters
initial_budget = line_width - offset - delimiters
initial_budget = remaining_budget

i = 0
while i < len(bin_remaining):
    budget = initial_budget if i == 0 else remaining_budget

    nudge = 0
    attempt = bin_remaining[i : i + budget - nudge]
    while True:
        try:
            eval("b'" + attempt + "'")
        except SyntaxError:
            nudge += 1
            attempt = bin_remaining[i : i + budget - nudge]
        else:
            break
    
    undecorated.append((attempt, nudge))
    i += len(attempt)


undecorated

[('(?:(?:\\\\\\r)(?:\\\\\\n)|(?:\\\\\\r)|(?:\\\\\\n))|(?:\\x00|\\x01|\\x02|\\x03|\\x04|\\x05|',
  0),
 ('\\x06|\\x07|\\x08|\\\\\\t|\\\\\\x0b|\\\\\\x0c|\\x0e|\\x0f|\\x10|\\x11|\\x12|\\x13|\\x14|',
  3),
 ('\\x15|\\x16|\\x17|\\x18|\\x19|\\x1a|\\x1b|\\x1c|\\x1d|\\x1e|\\x1f|\\x7f|\\xc2(?:\\x80|',
  0),
 ('\\x81|\\x82|\\x83|\\x84|\\x85|\\x86|\\x87|\\x88|\\x89|\\x8a|\\x8b|\\x8c|\\x8d|\\x8e|',
  2),
 ('\\x8f|\\x90|\\x91|\\x92|\\x93|\\x94|\\x95|\\x96|\\x97|\\x98|\\x99|\\x9a|\\x9b|\\x9c|',
  2),
 ('\\x9d|\\x9e|\\x9f|\\xad)|\\xd8(?:\\x9c)|\\xe1(?:\\xa0(?:\\x8e))|\\xe2(?:\\x80(?:',
  3),
 ('\\x8b|\\x8e|\\x8f|\\xa8|\\xa9|\\xaa|\\xab|\\xac|\\xad|\\xae)|\\x81(?:\\xa0|\\xa1|\\xa2',
  0),
 ('|\\xa3|\\xa4|\\xa5|\\xa6|\\xa7|\\xa8|\\xa9|\\xaa|\\xab|\\xac|\\xad|\\xae|\\xaf))|\\xef',
  0),
 ('(?:\\xbb(?:\\xbf)|\\xbf(?:\\xb0|\\xb1|\\xb2|\\xb3|\\xb4|\\xb5|\\xb6|\\xb7|\\xb8|\\xb9',
  0),
 ('|\\xba|\\xbb))|\\xf0(?:\\x93(?:\\x90(?:\\xb0|\\xb1|\\xb2|\\xb3|\\xb4|\\xb5|\\xb6|',
  3),
 ('\\xb7|\\xb8|\\xb9|\\xba|\\x

In [229]:
prefix = (
"""def length(string: bytes) -> int:
    '''Finds the length of a string'''
    import re
    return len(re.findall(pattern, string))

"""
)

with open("part.py", "wt") as f:
    f.write(prefix)
    f.write("pattern = (\n")
    for i, (line, nudge) in enumerate(undecorated):
        f.write("    b'")
        f.write(line)
        f.write("'")
        if nudge:
            f.write(nudge * "#")
        f.write("\n")
    f.write(")")

In [232]:
import part

part.length(" 😀😀😀👨‍👩‍👧‍👦👨‍👩‍👧‍👦😀😀".encode("utf-8"))

8