In [None]:
import re

# Runs of 1-3 spaces that sit directly between two non-space characters.
SHORT_SPACES = re.compile(r'(?<=\S) {1,3}(?=\S)')
# A run of 2-3 spaces at the very start of a line.
_LEADING_SPACES = re.compile(r'^ {2,3}')


def normalize_text(text: str) -> str:
    """Normalize whitespace in *text* while leaving code-like content alone.

    Rules, applied line by line:

    * ``\\r\\n`` and lone ``\\r`` line endings become ``\\n``.
    * A line whose stripped form starts with three backticks toggles the
      "inside fenced code" state; the fence line itself is emitted stripped.
    * Lines inside a fence are emitted verbatim, including runs of empty
      lines.
    * Lines starting with a tab or 4+ spaces are treated as preformatted:
      only trailing whitespace is removed.
    * On any other line, trailing whitespace is removed, runs of 1-3 spaces
      between non-space characters collapse to one space, and 2-3 leading
      spaces collapse to one leading space.
    * Outside fences, consecutive blank lines collapse to a single one.

    The result never ends in blank lines and always ends with exactly one
    ``\\n`` (an empty input therefore yields ``"\\n"``).
    """
    text = text.replace('\r\n', '\n').replace('\r', '\n')

    out = []
    in_code = False
    # True while the most recently emitted line is a collapsible blank line,
    # i.e. an empty line produced outside of a fenced block.
    prev_blank = False

    for orig in text.split('\n'):
        stripped = orig.strip()
        if stripped.startswith('```'):
            out.append(stripped)  # normalized fence line
            in_code = not in_code
            prev_blank = False
            continue

        if in_code:
            # Fenced content is kept byte-for-byte; blank lines in here must
            # not take part in blank-line collapsing.
            out.append(orig)
            prev_blank = False
            continue

        line = orig.rstrip()
        preformatted = orig.startswith('\t') or orig.startswith('    ')
        if line and not preformatted:
            line = SHORT_SPACES.sub(' ', line)
            # 4+ leading spaces never reach this point (preformatted above),
            # so only 2-3 leading spaces can be shortened here.
            line = _LEADING_SPACES.sub(' ', line)

        if line == '':
            if prev_blank:
                continue  # collapse runs of blank lines to a single one
            prev_blank = True
        else:
            prev_blank = False
        out.append(line)

    # Splitting text that ends in '\n' leaves a final '' entry; dropping
    # trailing blanks keeps the output ending in exactly one newline.
    while out and out[-1] == '':
        out.pop()

    return '\n'.join(out) + '\n'


In [None]:
from pathlib import Path

def fix_file(path):
    """Rewrite the file at *path* in place as whitespace-normalized UTF-8.

    The bytes are decoded as UTF-8 first (a leading BOM is dropped). Only if
    that fails is the content treated as cp1252. Trying UTF-8 first makes the
    function safe to run again on a file it has already converted: decoding
    UTF-8 output as cp1252 would turn every non-ASCII character into
    mojibake.

    Args:
        path: ``pathlib.Path`` or path string of the file to rewrite.
    """
    path = Path(path)
    raw = path.read_bytes()
    try:
        text = raw.decode("utf-8-sig")
    except UnicodeDecodeError:
        # Legacy input: the few bytes cp1252 leaves undefined are dropped,
        # as before.
        text = raw.decode("cp1252", errors="ignore")
    normalized_text = normalize_text(text)
    path.write_text(normalized_text, encoding="utf-8")

def fix_dir(dir_path):
    """Apply ``fix_file`` to each regular file directly inside *dir_path*.

    Sub-directories and other non-file entries are skipped; the walk is not
    recursive.
    """
    for entry in Path(dir_path).iterdir():
        if not entry.is_file():
            continue
        fix_file(entry)

# Normalize every file in the sibling "data" directory, resolved relative to
# the current working directory. Forward slashes are used because pathlib
# accepts them on Windows as well as on POSIX systems.
fix_dir("../data")

