# Algorithmic Macronizer

Testa mapp med 15 om-makroniserade Norma-texter. De saknar [] och {}, så dessa måste ignoreras. Vi testar i lowercase.

In [1]:
import os
import re
from grc_utils import lower_grc, normalize_word

chars_to_clean = r'[()\[\]{}<>⟨⟩⎡⎤\"«»\-—…|⏑⏓†×]'

macronizer_versions = "norma_macronizer"
gold_versions = "norma-syllabarum-graecarum/final"

# global counters
short_total_global = 0
long_total_global = 0
short_fails_global = 0
long_fails_global = 0
line_total_global = 0
line_matches_global = 0

def clean_line(line):
    line = lower_grc(line)
    line = normalize_word(line)
    line = re.sub(chars_to_clean, "", line)
    return line.strip()

def strip_markers(s):
    """Remove ^ and _ for comparison of base text."""
    return s.replace("^", "").replace("_", "")

# iterate over all files in gold folder
for fname in os.listdir(gold_versions):
    if not fname.endswith(".txt"):
        continue

    gold_path = os.path.join(gold_versions, fname)
    macronizer_path = os.path.join(macronizer_versions, fname)

    if not os.path.exists(macronizer_path):
        print(f"Skipping {fname}: no matching macronizer file.")
        continue

    with open(gold_path, encoding="utf-8") as g, open(macronizer_path, encoding="utf-8") as m:
        gold_lines = [clean_line(l) for l in g if l.strip()]
        macronizer_lines = [clean_line(l) for l in m if l.strip()]

    # per-file counters
    short_total = 0
    long_total = 0
    short_fails = 0
    long_fails = 0
    line_total = 0
    line_matches = 0

    # safeguard: iterate over min length
    for gold_line, macron_line in zip(gold_lines, macronizer_lines):
        gi = len(gold_line) - 1
        mi = len(macron_line) - 1

        # line-level match ignoring ^_
        line_total += 1
        if strip_markers(gold_line) == strip_markers(macron_line):
            line_matches += 1

        while gi >= 0:
            gch = gold_line[gi]

            if gch == "^":
                short_total += 1
                if mi < 0:
                    # nothing left in macronizer → assume default short, no fail
                    pass
                elif macron_line[mi] == "^":
                    mi -= 1  # matched short
                elif macron_line[mi] == "_":
                    short_fails += 1  # explicit contradiction
                    mi -= 1
                else:
                    # neither ^ nor _ → treat as implicit short, no fail
                    mi -= 1

            elif gch == "_":
                long_total += 1
                if mi < 0 or macron_line[mi] != "_":
                    long_fails += 1
                else:
                    mi -= 1  # matched long

            else:
                mi -= 1  # advance only on real characters

            gi -= 1

    # update globals
    short_total_global += short_total
    long_total_global += long_total
    short_fails_global += short_fails
    long_fails_global += long_fails
    line_total_global += line_total
    line_matches_global += line_matches

    # per-file report
    short_success = short_total - short_fails
    long_success = long_total - long_fails
    both_total = short_total + long_total
    both_success = short_success + long_success

    print(f"\n=== File: {fname} ===")
    if short_total:
        print(f"^ success: {short_success}/{short_total} = {short_success/short_total:.4f}")
    if long_total:
        print(f"_ success: {long_success}/{long_total} = {long_success/long_total:.4f}")
    if both_total:
        print(f"Both success: {both_success}/{both_total} = {both_success/both_total:.4f}")
    if line_total:
        print(f"Line matches ignoring ^_: {line_matches}/{line_total} = {line_matches/line_total:.4f}")

# global summary
short_success_global = short_total_global - short_fails_global
long_success_global = long_total_global - long_fails_global
both_total_global = short_total_global + long_total_global
both_success_global = short_success_global + long_success_global

print("\n=== Global summary ===")
if short_total_global:
    print(f"^ success: {short_success_global}/{short_total_global} = {short_success_global/short_total_global:.4f}")
if long_total_global:
    print(f"_ success: {long_success_global}/{long_total_global} = {long_success_global/long_total_global:.4f}")
if both_total_global:
    print(f"Both success: {both_success_global}/{both_total_global} = {both_success_global/both_total_global:.4f}")
if line_total_global:
    print(f"Line matches ignoring ^_: {line_matches_global}/{line_total_global} = {line_matches_global/line_total_global:.4f}")



FileNotFoundError: [Errno 2] No such file or directory: 'norma-syllabarum-graecarum/final'