In [1]:
import regex, grapheme
# external libs used for generating the bytes regex

In [2]:
import json

# Load the grapheme-cluster pattern and its category tokens. The context
# manager closes the file handle deterministically, and the explicit encoding
# keeps the read independent of the platform's locale.
with open("categories.json", encoding="utf-8") as categories_file:
    data = json.load(categories_file)
cat_pattern = data["regex"]    # the full pattern, written with \p{...} tokens
categories = data["classes"]   # the individual category tokens used in it
cat_pattern, categories

('(?:\\p{Grapheme_Cluster_Break=CR}\\p{Grapheme_Cluster_Break=LF}|\\p{Grapheme_Cluster_Break=CR}|\\p{Grapheme_Cluster_Break=LF})|\\p{Grapheme_Cluster_Break=Control}|\\p{Grapheme_Cluster_Break=Prepend}*(?:(?:\\p{Grapheme_Cluster_Break=L}*(?:\\p{Grapheme_Cluster_Break=V}+|\\p{Grapheme_Cluster_Break=LV}\\p{Grapheme_Cluster_Break=V}*|\\p{Grapheme_Cluster_Break=LVT})\\p{Grapheme_Cluster_Break=T}*|\\p{Grapheme_Cluster_Break=L}+|\\p{Grapheme_Cluster_Break=T}+)|(?:\\p{Grapheme_Cluster_Break=RI}\\p{Grapheme_Cluster_Break=RI})|(?:\\p{Extended_Pictographic}(?:\\p{Grapheme_Cluster_Break=Extend}*\\p{Grapheme_Cluster_Break=ZWJ}\\p{Extended_Pictographic})*)|(?:\\p{InCB=Consonant}(?:(?:\\p{InCB=Extend}|\\p{InCB=Linker})*\\p{InCB=Linker}(?:\\p{InCB=Extend}|\\p{InCB=Linker})*\\p{InCB=Consonant})+)|[^\\p{Grapheme_Cluster_Break=Control}\\p{Grapheme_Cluster_Break=CR}\\p{Grapheme_Cluster_Break=LF}])(?:\\p{Grapheme_Cluster_Break=Extend}|\\p{Grapheme_Cluster_Break=ZWJ}|\\p{Grapheme_Cluster_Break=SpacingMark})

In [3]:
# For every category pattern, collect the code points it matches.
cat_chars: dict[str, list[str]] = {}
lim = 0x110000  # one past the highest Unicode code point (U+10FFFF)
for category in categories:
    chars = []
    pat = regex.compile(category)

    for i in range(lim):
        char = chr(i)
        try:
            # Lone surrogates cannot be encoded as UTF-8; they are skipped
            # because the final regex works on encoded bytes.
            char.encode("utf-8")
        except UnicodeEncodeError:
            continue
        # Only the expected encode failure is swallowed above; any other error
        # (or a keyboard interrupt during this long scan) propagates.
        if pat.match(char):
            chars.append(char)
    cat_chars[category] = chars

In [4]:
# Number of code points collected for each category pattern.
{category: len(members) for category, members in cat_chars.items()}

{'[^\\p{Grapheme_Cluster_Break=Control}\\p{Grapheme_Cluster_Break=CR}\\p{Grapheme_Cluster_Break=LF}]': 1108169,
 '\\p{Grapheme_Cluster_Break=CR}': 1,
 '\\p{Grapheme_Cluster_Break=LF}': 1,
 '\\p{Grapheme_Cluster_Break=Control}': 3893,
 '\\p{Grapheme_Cluster_Break=Prepend}': 27,
 '\\p{Grapheme_Cluster_Break=L}': 125,
 '\\p{Grapheme_Cluster_Break=V}': 95,
 '\\p{Grapheme_Cluster_Break=LV}': 399,
 '\\p{Grapheme_Cluster_Break=LVT}': 10773,
 '\\p{Grapheme_Cluster_Break=T}': 137,
 '\\p{Grapheme_Cluster_Break=RI}': 26,
 '\\p{Extended_Pictographic}': 3537,
 '\\p{Grapheme_Cluster_Break=Extend}': 2130,
 '\\p{Grapheme_Cluster_Break=ZWJ}': 1,
 '\\p{InCB=Consonant}': 240,
 '\\p{InCB=Extend}': 884,
 '\\p{InCB=Linker}': 6,
 '\\p{Grapheme_Cluster_Break=SpacingMark}': 395}

In [5]:

def find_ranges(xss: list[bytes]) -> list[range]:
    """Group the first bytes of ``xss`` into maximal runs of consecutive values.

    Only the first byte of each element is looked at. Values are de-duplicated
    on that first byte, so elements sharing a first byte contribute it once.

    Args:
        xss: byte strings; each must be non-empty.

    Returns:
        Ascending, non-overlapping half-open ``range`` objects, one per run of
        consecutive byte values. An empty input yields an empty list.

    Raises:
        IndexError: if an element of ``xss`` is empty.
    """
    firsts = sorted({x[0] for x in xss})
    if not firsts:
        return []
    ranges = []
    start = prev = firsts[0]
    for value in firsts[1:]:
        if value != prev + 1:
            # Gap found: close the current run (stop is exclusive).
            ranges.append(range(start, prev + 1))
            start = value
        prev = value
    # Close the final run.
    ranges.append(range(start, prev + 1))
    return ranges

def format_ranges(ranges: list[range]):
    """Render byte-value ranges as the body of a bytes regex character class.

    A range of three or more values becomes ``lo-hi``, a range of two values
    becomes the two bytes side by side, and anything shorter becomes its start
    byte. The result is wrapped in ``[...]`` unless it stands for at most one
    byte value. Bytes are emitted verbatim, without escaping.
    """
    pieces = []
    for span in ranges:
        low = bytes([span.start])
        if len(span) > 2:
            pieces.append(low + b'-' + bytes([span.stop - 1]))
        elif len(span) == 2:
            pieces.append(low + bytes([span.stop - 1]))
        else:
            pieces.append(low)

    # Brackets are needed as soon as more than one byte value is described.
    bracketed = len(ranges) > 1 or (len(ranges) == 1 and len(ranges[0]) > 1)
    if bracketed:
        return b'[' + b''.join(pieces) + b']'
    return b''.join(pieces)

# Quick check: a, b and c collapse into one run while e stays on its own.
x = find_ranges([b"a", b"c", b"b", b"e"])
x, format_ranges(x)

([range(97, 100), range(101, 102)], b'[a-ce]')

In [6]:
import re

def commonate_nonsingles(nonsingles: list[tuple[bytes, bytes]]):
    """Merge branches that share the same trailing sub-pattern.

    Args:
        nonsingles: ``(key, trail)`` pairs, where ``key`` is a (possibly
            escaped) leading byte and ``trail`` the regex for what follows it.

    Returns:
        One alternation (joined with ``|``) with one alternative per distinct
        trail, in first-seen order. Keys that ``re.escape`` leaves unchanged
        are folded into a character class; the others are listed explicitly.
    """
    heads_by_trail = {}
    for head, trail in nonsingles:
        if trail not in heads_by_trail:
            heads_by_trail[trail] = []
        heads_by_trail[trail].append(head)

    alternatives = []
    for trail, heads in heads_by_trail.items():
        plain = [head for head in heads if re.escape(head) == head]
        escaped = [head for head in heads if re.escape(head) != head]
        if plain and escaped:
            lead = b'(?:' + format_ranges(find_ranges(plain)) + b'|' + b'|'.join(escaped) + b')'
        elif plain:
            lead = format_ranges(find_ranges(plain))
        elif len(escaped) > 1:
            lead = b'(?:' + b'|'.join(escaped) + b')'
        else:
            # Exactly one escaped head: no grouping required.
            lead = escaped[0]
        alternatives.append(lead + trail)
    return b'|'.join(alternatives)


In [7]:
def form_prefix_tree(*bins: bytes):
    """Build a nested-dict prefix tree (trie) keyed by byte value.

    Each level maps a leading byte (an ``int``) to the tree of the remainders
    that follow it; a byte with nothing after it maps to ``{}``.

    The tree carries no end-of-string marker, so the inputs are expected to be
    prefix-free (no input is a proper prefix of another). If one is, only the
    longer input is represented, regardless of the order the inputs arrive in.

    Args:
        *bins: non-empty byte strings.

    Returns:
        ``dict[int, dict]`` nested to the depth of the longest input.

    Raises:
        IndexError: if an input is empty.
    """
    tails_by_head: dict[int, list[bytes]] = {}
    for item in bins:
        head, tail = item[0], item[1:]
        # setdefault never discards remainders already recorded under this
        # leading byte, even when a shorter input shows up later.
        tails = tails_by_head.setdefault(head, [])
        if tail:
            tails.append(tail)
    # Recurse into the remainders collected under each leading byte.
    return {head: form_prefix_tree(*tails) for head, tails in tails_by_head.items()}

def saturate(tree: dict):
    """Return a copy of ``tree`` whose int keys are replaced, at every level,
    by the ``re.escape``-d single-byte ``bytes`` for that value."""
    escaped_tree = {}
    for byte_value, subtree in tree.items():
        escaped_key = re.escape(bytes([byte_value]))
        escaped_tree[escaped_key] = saturate(subtree)
    return escaped_tree


def prefix_to_regex(tree: dict):
    """Turn an escaped prefix tree into a compact bytes regex.

    Keys with nothing after them that ``re.escape`` leaves unchanged are
    collapsed into a character class; every other branch is merged by shared
    trail through ``commonate_nonsingles``. An empty tree yields ``b''``.
    """
    plain_leaves = []
    others = []
    for key, subtree in tree.items():
        trail = prefix_to_regex(subtree)
        if trail or re.escape(key) != key:
            others.append((key, trail))
        else:
            plain_leaves.append(key)

    leaf_class = format_ranges(find_ranges(plain_leaves))
    bundle = commonate_nonsingles(others)

    if plain_leaves and others:
        return b"(?:" + leaf_class + b'|' + bundle + b")"
    if others:
        # A lone branch is returned without a surrounding group.
        return bundle if len(others) == 1 else b"(?:" + bundle + b")"
    if plain_leaves:
        return b"".join(plain_leaves) if len(plain_leaves) == 1 else leaf_class
    return b''

def prefix_to_regex_old(tree: dict):
    """Naive rendering of a prefix tree: every level becomes a non-capturing
    group that alternates over ``key + <rendering of its subtree>``."""
    alternatives = []
    for key, subtree in tree.items():
        alternatives.append(key + prefix_to_regex_old(subtree))
    return b'(?:' + b'|'.join(alternatives) + b')'

def chars_union(*cs: str):
    """Build a bytes regex matching the UTF-8 encoding of any of ``cs``.

    Prints the size of the naive rendering, the size of the compact rendering
    and their difference, then returns the compact rendering.
    """
    encoded = [c.encode("utf-8") for c in cs]
    escaped_tree = saturate(form_prefix_tree(*encoded))

    result = prefix_to_regex(escaped_tree)
    result_old = prefix_to_regex_old(escaped_tree)

    print("initial", len(result_old), "final", len(result), "delta", len(result) - len(result_old))
    return result

# Demo: each character of the string literal is passed as a separate argument.
chars_union(*"abあいうもやゆ+")

initial 73 final 26 delta -47


b'(?:[ab]|\xe3(?:[\x81\x82][\x82\x84\x86])|\\+)'

In [8]:
# Byte-level union for every category, keyed by the UTF-8 encoded category
# token so it can be substituted straight into the encoded pattern.
cat_ranges = {
    category.encode("utf-8"): chars_union(*members)
    for category, members in cat_chars.items()
}
# Size in bytes of each generated union.
{category: len(fragment) for category, fragment in cat_ranges.items()}

initial 6737001 final 502 delta -6736499
initial 10 final 2 delta -8
initial 10 final 2 delta -8
initial 23759 final 140 delta -23619
initial 255 final 73 delta -182
initial 778 final 30 delta -748
initial 603 final 41 delta -562
initial 3292 final 404 delta -2888
initial 65536 final 611 delta -64925
initial 850 final 30 delta -820
initial 174 final 8 delta -166
initial 21650 final 371 delta -21279
initial 13713 final 1477 delta -12236
initial 19 final 3 delta -16
initial 1503 final 97 delta -1406
initial 5962 final 774 delta -5188
initial 74 final 14 delta -60
initial 2828 final 630 delta -2198


{b'[^\\p{Grapheme_Cluster_Break=Control}\\p{Grapheme_Cluster_Break=CR}\\p{Grapheme_Cluster_Break=LF}]': 502,
 b'\\p{Grapheme_Cluster_Break=CR}': 2,
 b'\\p{Grapheme_Cluster_Break=LF}': 2,
 b'\\p{Grapheme_Cluster_Break=Control}': 140,
 b'\\p{Grapheme_Cluster_Break=Prepend}': 73,
 b'\\p{Grapheme_Cluster_Break=L}': 30,
 b'\\p{Grapheme_Cluster_Break=V}': 41,
 b'\\p{Grapheme_Cluster_Break=LV}': 404,
 b'\\p{Grapheme_Cluster_Break=LVT}': 611,
 b'\\p{Grapheme_Cluster_Break=T}': 30,
 b'\\p{Grapheme_Cluster_Break=RI}': 8,
 b'\\p{Extended_Pictographic}': 371,
 b'\\p{Grapheme_Cluster_Break=Extend}': 1477,
 b'\\p{Grapheme_Cluster_Break=ZWJ}': 3,
 b'\\p{InCB=Consonant}': 97,
 b'\\p{InCB=Extend}': 774,
 b'\\p{InCB=Linker}': 14,
 b'\\p{Grapheme_Cluster_Break=SpacingMark}': 630}

In [9]:
# Start from the \p{...}-based pattern as bytes, then swap every category
# token for its byte-level union.
bin_pattern = cat_pattern.encode("utf-8")
# Replacement follows cat_ranges' insertion order.
# NOTE(review): the negated [^...] class contains other category tokens as
# substrings, so it presumably has to be replaced before them; confirm the
# key order if categories.json changes.
# NOTE(review): unions are inserted verbatim. A union that is a bare
# multi-byte sequence (not wrapped in a group) would let a following
# quantifier bind to its last byte only; verify when adding categories.
for cat, union in cat_ranges.items():
    bin_pattern = bin_pattern.replace(cat, union)

# Total size in bytes of the generated pattern.
len(bin_pattern)

8149

In [10]:
import re, grapheme

# Compile the generated bytes pattern with the stdlib `re` module and compare
# its match count on a sample string against grapheme.length.
final_pattern = re.compile(bin_pattern)
s = " 😀😀😀👨‍👩‍👧‍👦👨‍👩‍👧‍👦😀😀"
# The two numbers displayed should agree.
len(final_pattern.findall(s.encode("utf-8"))), grapheme.length(s)

(8, 8)

In [11]:
# Layout parameters for emitting the pattern as a wrapped bytes literal.
line_width = 79
varname = "pattern"
offset = len(f"{varname} = (")  # width of the opening line; not used for budgeting below
indent = len("    ")
delimiters = len("b''")

# Source text of the pattern without the surrounding b'...'.
# Slicing [2:-1] assumes the repr is single-quoted.
bin_remaining = ascii(bin_pattern)[2:-1]

undecorated = []
remaining_budget = line_width - indent - delimiters
# The writer cell puts "<varname> = (" on a line of its own, so the first
# chunk gets the same budget as every later one.
initial_budget = remaining_budget

i = 0
while i < len(bin_remaining):
    budget = initial_budget if i == 0 else remaining_budget

    # Shrink the chunk one character at a time until it is a valid bytes
    # literal on its own, i.e. it does not end in the middle of an escape
    # sequence. `nudge` records how many characters were given up.
    for nudge in range(budget):
        attempt = bin_remaining[i : i + budget - nudge]
        try:
            # eval only ever sees text derived from bin_pattern above, never
            # external input.
            eval("b'" + attempt + "'")
        except SyntaxError:
            continue
        break
    else:
        # Fail loudly instead of looping forever without making progress.
        raise ValueError(f"no valid bytes-literal chunk found at offset {i}")

    undecorated.append((attempt, nudge))
    i += len(attempt)


undecorated

[('(?:\\\\\\r\\\\\\n|\\\\\\r|\\\\\\n)|(?:[\\x00-\\x08\\x0e-\\x1f\\x7f]|(?:\\\\\\t|\\\\\\x0b|\\\\\\x0c',
  0),
 (')|\\xc2[\\x80-\\x9f\\xad]|\\xd8\\x9c|\\xe1\\xa0\\x8e|\\xe2(?:\\x80[\\x8b\\x8e\\x8f\\xa8',
  0),
 ('-\\xae]|\\x81[\\xa0-\\xaf])|\\xef(?:\\xbb\\xbf|\\xbf[\\xb0-\\xbb])|\\xf0(?:\\x93\\x90',
  0),
 ('[\\xb0-\\xbf]|\\x9b\\xb2[\\xa0-\\xa3]|\\x9d\\x85[\\xb3-\\xba])|\\xf3\\xa0(?:\\x80[',
  3),
 ('\\x80-\\x9f]|[\\x82\\x83\\x88-\\xbf][\\x80-\\xbf]|\\x87[\\xb0-\\xbf]))|(?:\\xd8[\\x80',
  0),
 ('-\\x85]|\\xdb\\x9d|\\xdc\\x8f|\\xe0(?:\\xa2[\\x90\\x91]|\\xa3\\xa2|\\xb5\\x8e)|\\xf0',
  2),
 ('\\x91(?:\\x82\\xbd|\\x83\\x8d|\\x87[\\x82\\x83]|\\xa4\\xbf|\\xa5\\x81|\\xa8\\xba|\\xaa[',
  0),
 ('\\x84-\\x89]|\\xb5\\x86|\\xbc\\x82))*(?:(?:(?:\\xe1(?:\\x84[\\x80-\\xbf]|\\x85[\\x80',
  0),
 ('-\\x9f])|\\xea\\xa5[\\xa0-\\xbc])*(?:(?:\\xe1(?:\\x85[\\xa0-\\xbf]|\\x86[\\x80-\\xa7',
  0),
 ('])|\\xed(?:\\x9e[\\xb0-\\xbf]|\\x9f[\\x80-\\x86]))+|(?:\\xea(?:[\\xb0\\xb7\\xbe][',
  2),
 ('\\x80\\x9c\\xb8]|[\\

In [15]:
# When enabled, each shortened line is padded with one '#' per character that
# was given up while chunking.
nudge_enabled = False

# part.py: the pattern as a parenthesised sequence of adjacent bytes literals,
# one chunk per line.
with open("part.py", "wt") as f:
    f.write(f"{varname} = (\n")
    for line, nudge in undecorated:
        f.write("    b'")
        f.write(line)
        f.write("'")
        if nudge and nudge_enabled:
            f.write(nudge * "#")
        f.write("\n")
    # End the file with a newline, as raw.py does.
    f.write(")\n")

# raw.py: the same pattern as one single-line bytes literal.
with open("raw.py", "wt") as f:
    f.write(f"{varname} = b'")
    f.write(bin_remaining)
    f.write("'\n")
    

In [18]:
import part

def length(string: bytes) -> int:
    '''Counts the non-overlapping matches of ``part.pattern`` in ``string``.

    The notebook compares this count with ``grapheme.length`` of the decoded
    text.
    '''
    import re
    return sum(1 for _ in re.finditer(part.pattern, string))

# Sample strings used to compare the generated pattern with the `grapheme`
# library (emoji sequences, ASCII, Japanese text, stacked combining marks,
# flag pairs).
tests = [
    " 😀😀😀👨‍👩‍👧‍👦👨‍👩‍👧‍👦😀😀",
    "hello",
    "こんにちは",
    "à̀̀̀̀",
    "🇫🇮🇫🇮",
]

# Print both counts side by side; the two columns should agree on every line.
for test in tests:
    print(length(test.encode("utf-8")), grapheme.length(test))

8 8
5 5
5 5
1 1
2 2
