Two general strategies of unifying benchmark: 
- 

In [2]:
!git clone https://github.com/Urdatorn/norma-syllabarum-graecarum.git

Klonar till "norma-syllabarum-graecarum"...
remote: Enumerating objects: 255, done.[K
remote: Counting objects: 100% (255/255), done.[K
remote: Compressing objects: 100% (167/167), done.[K
remote: Total 255 (delta 120), reused 203 (delta 72), pack-reused 0 (from 0)[K
Tar emot objekt: 100% (255/255), 172,91 KiB | 2,08 MiB/s, klart.
Analyserar delta: 100% (120/120), klart.


# Eric's original bench comparison

In [1]:
import os
import re
import torch
from collections import defaultdict
from torch.nn.functional import softmax
from transformers import PreTrainedTokenizerFast, RobertaForTokenClassification

from syllagreek_utils import preprocess_greek_line, syllabify_joined

# -------- Load Model --------
# Point to your trained RoBERTa checkpoint
model_path = "Ericu950/macronizer_mini"

tokenizer: PreTrainedTokenizerFast = PreTrainedTokenizerFast.from_pretrained(model_path)
model: RobertaForTokenClassification = RobertaForTokenClassification.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Respect model's positional limit (RoBERTa uses <s> and </s> as 2 specials)
max_len = min(
    getattr(tokenizer, "model_max_length", 512),
    getattr(model.config, "max_position_embeddings", 514) - 2
)
if max_len <= 0:
    max_len = 512  # fallback

# -------- Helper: parse gold labels in files --------
def extract_expected_pattern(line):
    """
    Input line may contain:
      [ ... ] = expected H
      { ... } = expected L
      plain   = unlabeled (ignored)
    Returns raw_input (no brackets) and sequence of expected labels for syllables only in brackets.
    """
    expected_labels = []
    tokens = []

    pattern = re.compile(r'(\[[^]]+\])|(\{[^}]+\})|([^\[\]\{\}]+)')
    for match in pattern.finditer(line):
        if match.group(1):
            content = match.group(1)[1:-1]
            tokens.append(content)
            expected_labels.append("H")
        elif match.group(2):
            content = match.group(2)[1:-1]
            tokens.append(content)
            expected_labels.append("L")
        elif match.group(3):
            content = match.group(3)
            tokens.append(content)

    raw_input = ''.join(tokens)
    return raw_input, expected_labels

# -------- Rule-based backup (unchanged) --------
def classify_syllables(syllables, clear_mask):
    definitely_heavy_set = set("ὖὗἆἇἶἷήηωώἠἡἢἣἤἥἦἧὠὡὢὣὤὥὦὧὴὼᾄᾅᾆᾇᾐᾑᾔᾕᾖᾗᾠᾤᾦᾧᾳᾴᾶᾷῂῃῄῆῇῖῦῲῳῴῶῷ")
    ambiguous_set = set("ΐάίΰαιυϊϋύἀἁἂἃἄἅἰἱἲἳἴἵὐὑὓὔὕὰῒὶὺ")
    light_set = set("έεοόἐἑἓἔἕὀὁὂὃὄὅὲὸ")

    mute_consonants = set("βγδθκπτφχ")
    nonmute_consonants = set("λρμν")
    sigma = set("σ")
    all_consonants = mute_consonants | nonmute_consonants | sigma

    def token_contains(token, char_set):
        return any(ch in char_set for ch in token)

    def get_nucleus(syl):
        nucleus_chars = [ch for token in syl for ch in token if ch not in all_consonants]
        return ''.join(nucleus_chars) if nucleus_chars else None

    def classify_single_syllable(syl, next_syl):
        full_syllable = "".join(syl)
        nucleus = get_nucleus(syl)
        if nucleus is None:
            return "light"

        if len(nucleus) >= 2:
            base_class = "heavy"
        elif token_contains(nucleus, definitely_heavy_set):
            base_class = "heavy"
        elif token_contains(nucleus, ambiguous_set):
            base_class = "ambiguous"
        elif token_contains(nucleus, light_set):
            base_class = "light"
        else:
            base_class = "light"

        final_char = syl[-1][-1]

        if base_class == "heavy":
            return "heavy"
        elif base_class == "ambiguous":
            if final_char in nonmute_consonants:
                return "heavy"
            if final_char in mute_consonants and next_syl is not None:
                next_onset = next_syl[0][0]
                if next_onset not in nonmute_consonants:
                    return "heavy"
            return "muta cum liquida"
        elif base_class == "light":
            if final_char in nonmute_consonants or final_char in sigma:
                return "heavy"
            elif final_char in mute_consonants and next_syl is not None:
                next_onset = next_syl[0][0]
                if next_onset in nonmute_consonants:
                    return "muta cum liquida"
                else:
                    return "heavy"
            else:
                return "light"

    classifications = []
    for i, syl in enumerate(syllables):
        if not clear_mask[i]:
            classifications.append(None)
            continue
        next_syl = syllables[i+1] if i < len(syllables) - 1 else None
        syl_class = classify_single_syllable(syl, next_syl)
        classifications.append(syl_class)

    return classifications

In [2]:
# -------- Inference: map predictions back to syllables --------
def predict_syllable_weights(raw_line):
    # Your utils should return preprocessed tokens and their syllabification
    tokens = preprocess_greek_line(raw_line)
    syllables = syllabify_joined(tokens)  # list[str], one syllable per element

    # Encode using syllables as pre-split "words"
    enc = tokenizer(
        syllables,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=max_len,
        padding=False  # avoid masking every sequence to max_len
    )

    # Remove token_type_ids if present (RoBERTa doesn't use them)
    enc.pop("token_type_ids", None)

    enc = {k: v.to(device) for k, v in enc.items()}

    with torch.no_grad():
        outputs = model(**enc)
        logits = outputs.logits  # [1, seq_len, num_labels]
        probs = softmax(logits, dim=-1)
        pred_ids = torch.argmax(probs, dim=-1)[0].cpu().tolist()

    # Align back to syllables via word_ids
    # For a single example, use the fast tokenizer's word_ids()
    word_ids = tokenizer(
        syllables,
        is_split_into_words=True,
        truncation=True,
        max_length=max_len,
        add_special_tokens=True,
        return_offsets_mapping=False
    ).word_ids()

    aligned_preds = []
    syllable_index = 0
    for i, wid in enumerate(word_ids):
        if wid is None:
            continue  # special tokens like <s>, </s>
        if wid != syllable_index:
            # If tokenizer ever produced more than one token per syllable (shouldn't happen with WordLevel),
            # this keeps only the first; adjust if you want a different policy (e.g., majority).
            syllable_index = wid
        if syllable_index < len(syllables):
            aligned_preds.append((syllables[syllable_index], pred_ids[i]))

    # Trim to actual syllables length (in case truncation kicked in)
    aligned_preds = aligned_preds[:len(syllables)]

    only_sylls = [s for s, _ in aligned_preds]
    labels = [l for _, l in aligned_preds]

    # 0 = "use rules" (as in your previous script), 1 = H, 2 = L  (adapt if your mapping differs)
    clear_mask = [l == 0 for l in labels]

    # For the rule-based fallback we need character-level syllables
    syllables_tokenized = [[ch for ch in syl] for syl in only_sylls]
    rule_based = classify_syllables(syllables_tokenized, clear_mask)

    # Merge: if model says 1→H, 2→L, 0→use rule
    final_labels = []
    for model_label, rule in zip(labels, rule_based):
        if model_label == 1:
            final_labels.append("H")
        elif model_label == 2:
            final_labels.append("L")
        elif model_label == 0:
            final_labels.append("H" if rule == "heavy" else "L" if rule == "light" else None)
        else:
            final_labels.append(None)

    return final_labels

# -------- Evaluation Loop --------
data_dir = "norma-syllabarum-graecarum/final"
results = defaultdict(lambda: {"H_correct": 0, "H_total": 0, "L_correct": 0, "L_total": 0, "all_correct": 0, "all_total": 0})

for filename in os.listdir(data_dir):
    if not filename.endswith(".txt"):
        continue

    filepath = os.path.join(data_dir, filename)
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            raw_input, expected = extract_expected_pattern(line)
            predicted = predict_syllable_weights(raw_input)

            for exp, pred in zip(expected, predicted):
                if pred is None:
                    continue
                results[filename]["all_total"] += 1
                results[filename][f"{exp}_total"] += 1
                if exp == pred:
                    results[filename]["all_correct"] += 1
                    results[filename][f"{exp}_correct"] += 1

# -------- Reporting --------
overall = defaultdict(int)

print(f"\n{'File':<30} {'All Acc':>8} {'H Acc':>8} {'L Acc':>8}")
print("-" * 60)
for file, data in sorted(results.items()):
    all_acc = data["all_correct"] / data["all_total"] if data["all_total"] else 0
    h_acc = data["H_correct"] / data["H_total"] if data["H_total"] else 0
    l_acc = data["L_correct"] / data["L_total"] if data["L_total"] else 0

    print(f"{file:<30} {all_acc:8.2%} {h_acc:8.2%} {l_acc:8.2%}")

    for k in data:
        overall[k] += data[k]

# -------- Overall Accuracy --------
print("\nOverall Accuracy:")
all_acc = overall["all_correct"] / overall["all_total"] if overall["all_total"] else 0
h_acc = overall["H_correct"] / overall["H_total"] if overall["H_total"] else 0
l_acc = overall["L_correct"] / overall["L_total"] if overall["L_total"] else 0
print(f"{'All':<10}: {all_acc:.2%}")
print(f"{'Heavy':<10}: {h_acc:.2%}")
print(f"{'Light':<10}: {l_acc:.2%}")


File                            All Acc    H Acc    L Acc
------------------------------------------------------------
acharnenses.txt                  95.61%   95.33%   95.92%
contracelsum.txt                 96.94%   94.91%  100.00%
cratylus.txt                     96.12%   93.37%  100.00%
cyclops.txt                      96.27%   94.37%   98.99%
dionysiaca.txt                   96.33%   93.06%   99.36%
dioscorides.txt                  93.52%   86.96%   98.39%
enchiridion.txt                  95.81%   92.93%   99.38%
insolem.txt                      93.33%   92.72%   94.03%
norma_aristophanis_canticorum.txt   94.97%   93.54%   96.74%
oedipus.txt                      95.76%   94.33%   97.89%
partheneion.txt                  87.39%   80.88%   97.67%
plutarchus.txt                   96.18%   94.12%   98.84%
quintus.txt                      92.08%   89.21%   94.51%
supplices.txt                    88.66%   88.46%   88.89%
thucydides.txt                   95.97%   92.35%  100.00%

Overal

- *All 100% L texts are prose* => Prose is easier for the macronizer (not surprising)
- Doric (`partheneion`) is predictably the hardest ("alphacizing" => more hard dichrona)

# Version that saves failing words

In [1]:
import os
import re
import torch
from tqdm import tqdm
from collections import defaultdict
from torch.nn.functional import softmax
from transformers import PreTrainedTokenizerFast, RobertaForTokenClassification

from syllagreek_utils import preprocess_greek_line, syllabify_joined

model_path = "Ericu950/macronizer_mini"

tokenizer: PreTrainedTokenizerFast = PreTrainedTokenizerFast.from_pretrained(model_path)
model: RobertaForTokenClassification = RobertaForTokenClassification.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

max_len = min(
    getattr(tokenizer, "model_max_length", 512),
    getattr(model.config, "max_position_embeddings", 514) - 2
)
if max_len <= 0:
    max_len = 512

# -------- Helper: parse gold labels in files --------
def extract_expected_pattern(line):
    expected_labels = []
    tokens = []
    pattern = re.compile(r'(\[[^]]+\])|(\{[^}]+\})|([^\[\]\{\}]+)') # 3 groups: 1. [ ... ] bracketed sections, 2. { ... } brace sections, 3. everything else in between
    for match in pattern.finditer(line):
        if match.group(1):
            content = match.group(1)[1:-1]
            tokens.append(content)
            expected_labels.append("H")
        elif match.group(2):
            content = match.group(2)[1:-1]
            tokens.append(content)
            expected_labels.append("L")
        elif match.group(3):
            tokens.append(match.group(3))
    raw_input = ''.join(tokens)
    return raw_input, expected_labels

# -------- Rule-based backup (unchanged) --------
def classify_syllables(syllables, clear_mask):
    definitely_heavy_set = set("ὖὗἆἇἶἷήηωώἠἡἢἣἤἥἦἧὠὡὢὣὤὥὦὧὴὼᾄᾅᾆᾇᾐᾑᾔᾕᾖᾗᾠᾤᾦᾧᾳᾴᾶᾷῂῃῄῆῇῖῦῲῳῴῶῷ")
    ambiguous_set = set("ΐάίΰαιυϊϋύἀἁἂἃἄἅἰἱἲἳἴἵὐὑὓὔὕὰῒὶὺ")
    light_set = set("έεοόἐἑἓἔἕὀὁὂὃὄὅὲὸ")
    mute_consonants = set("βγδθκπτφχ")
    nonmute_consonants = set("λρμν")
    sigma = set("σ")
    all_consonants = mute_consonants | nonmute_consonants | sigma

    def token_contains(token, char_set):
        return any(ch in char_set for ch in token)

    def get_nucleus(syl):
        nucleus_chars = [ch for token in syl for ch in token if ch not in all_consonants]
        return ''.join(nucleus_chars) if nucleus_chars else None

    def classify_single_syllable(syl, next_syl):
        nucleus = get_nucleus(syl)
        if nucleus is None:
            return "light"

        if len(nucleus) >= 2:
            base_class = "heavy"
        elif token_contains(nucleus, definitely_heavy_set):
            base_class = "heavy"
        elif token_contains(nucleus, ambiguous_set):
            base_class = "ambiguous"
        elif token_contains(nucleus, light_set):
            base_class = "light"
        else:
            base_class = "light"

        final_char = syl[-1][-1]

        if base_class == "heavy":
            return "heavy"
        elif base_class == "ambiguous":
            if final_char in nonmute_consonants:
                return "heavy"
            if final_char in mute_consonants and next_syl is not None:
                next_onset = next_syl[0][0]
                if next_onset not in nonmute_consonants:
                    return "heavy"
            return "muta cum liquida"
        elif base_class == "light":
            if final_char in nonmute_consonants or final_char in sigma:
                return "heavy"
            elif final_char in mute_consonants and next_syl is not None:
                next_onset = next_syl[0][0]
                if next_onset in nonmute_consonants:
                    return "muta cum liquida"
                else:
                    return "heavy"
            else:
                return "light"

    classifications = []
    for i, syl in enumerate(syllables):
        if not clear_mask[i]:
            classifications.append(None)
            continue
        next_syl = syllables[i+1] if i < len(syllables) - 1 else None
        classifications.append(classify_single_syllable(syl, next_syl))
    return classifications

# -------- Inference: map predictions back to syllables --------
def predict_syllable_weights(raw_line):
    tokens = preprocess_greek_line(raw_line)
    syllables = syllabify_joined(tokens)

    enc = tokenizer(
        syllables,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=max_len,
        padding=False
    )
    enc.pop("token_type_ids", None)
    enc = {k: v.to(device) for k, v in enc.items()}

    with torch.no_grad():
        outputs = model(**enc)
        probs = softmax(outputs.logits, dim=-1) # dim=-1 means "the last dim" of the tensor, e.g. in [[[0,1],[2,3]]] we softmax [0,1] & [2,3]
        pred_ids = torch.argmax(probs, dim=-1)[0].cpu().tolist()

    word_ids = tokenizer(
        syllables,
        is_split_into_words=True,
        truncation=True,
        max_length=max_len,
        add_special_tokens=True,
        return_offsets_mapping=False
    ).word_ids()

    aligned_preds = []
    syllable_index = 0
    for i, wid in enumerate(word_ids):
        if wid is None:
            continue
        if wid != syllable_index:
            syllable_index = wid
        if syllable_index < len(syllables):
            aligned_preds.append((syllables[syllable_index], pred_ids[i]))

    aligned_preds = aligned_preds[:len(syllables)]
    only_sylls = [s for s, _ in aligned_preds]
    labels = [l for _, l in aligned_preds]

    clear_mask = [l == 0 for l in labels]
    syllables_tokenized = [[ch for ch in syl] for syl in only_sylls]
    rule_based = classify_syllables(syllables_tokenized, clear_mask)

    final_labels = []
    for model_label, rule in zip(labels, rule_based):
        if model_label == 1:
            final_labels.append("H")
        elif model_label == 2:
            final_labels.append("L")
        elif model_label == 0:
            final_labels.append("H" if rule == "heavy" else "L" if rule == "light" else None)
        else:
            final_labels.append(None)
    return final_labels

# ---------------------------------------------------------------------------
# -------- Evaluation Loop (TSV failing sentences with line numbers) --------
# ---------------------------------------------------------------------------

gold_dir = "norma-syllabarum-graecarum/final"
pred_dir = "norma_macronizer_mini"
failed_sentences_dir = "failed_sentences"
os.makedirs(failed_sentences_dir, exist_ok=True)

results = defaultdict(lambda: {
    "H_correct":0, "H_total":0,
    "L_correct":0, "L_total":0,
    "all_correct":0, "all_total":0,
    "failed_sentences":[]  # stores tuples: (gold_line, linenr)
})

for filename in tqdm(os.listdir(gold_dir)):
    if not filename.endswith(".txt"):
        continue
    gold_path = os.path.join(gold_dir, filename)
    pred_path = os.path.join(pred_dir, filename)

    # Load predicted lines once
    if os.path.exists(pred_path):
        with open(pred_path, encoding="utf-8") as pf:
            pred_lines = [l.strip() for l in pf.readlines()]
    else:
        pred_lines = []

    with open(gold_path, encoding="utf-8") as gf:
        for linenr, line in enumerate(gf, start=1):
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            raw_input, expected = extract_expected_pattern(line)
            predicted_labels = predict_syllable_weights(raw_input)

            # Stats
            for exp, pred_lbl in zip(expected, predicted_labels):
                if pred_lbl is None:
                    continue
                results[filename]["all_total"] += 1
                results[filename][f"{exp}_total"] += 1
                if exp == pred_lbl:
                    results[filename]["all_correct"] += 1
                    results[filename][f"{exp}_correct"] += 1

            # Record failing lines
            fully_correct = all(
                (p == e or p is None) for p, e in zip(predicted_labels, expected)
            )
            if not fully_correct:
                results[filename]["failed_sentences"].append((line, linenr))

    # Write TSV
    outpath = os.path.join(failed_sentences_dir, filename.replace(".txt","_failed_sentences.tsv"))
    with open(outpath, "w", encoding="utf-8") as out:
        out.write("gold_sentence\tline_number\tpredicted_sentence\n")
        for gold_line, linenr in results[filename]["failed_sentences"]:
            if linenr-1 < len(pred_lines):
                pred_line = pred_lines[linenr-1]
            else:
                pred_line = ""
            out.write(f"{gold_line.replace('[', '').replace(']', '').replace('{', '').replace('}', '')}\t{linenr}\t{pred_line}\n")

# -------- Reporting (unchanged) --------
overall = defaultdict(int)
print(f"\n{'File':<30} {'All Acc':>8} {'H Acc':>8} {'L Acc':>8}")
print("-"*60)
for file, data in sorted(results.items()):
    all_acc = data["all_correct"]/data["all_total"] if data["all_total"] else 0
    h_acc = data["H_correct"]/data["H_total"] if data["H_total"] else 0
    l_acc = data["L_correct"]/data["L_total"] if data["L_total"] else 0
    print(f"{file:<30} {all_acc:8.2%} {h_acc:8.2%} {l_acc:8.2%}")
    for k in ["all_correct","all_total","H_correct","H_total","L_correct","L_total"]:
        overall[k] += data[k]

print("\nOverall Accuracy:")
all_acc = overall["all_correct"]/overall["all_total"] if overall["all_total"] else 0
h_acc = overall["H_correct"]/overall["H_total"] if overall["H_total"] else 0
l_acc = overall["L_correct"]/overall["L_total"] if overall["L_total"] else 0
print(f"{'All':<10}: {all_acc:.2%}")
print(f"{'Heavy':<10}: {h_acc:.2%}")
print(f"{'Light':<10}: {l_acc:.2%}")

100%|██████████| 16/16 [00:04<00:00,  3.50it/s]


File                            All Acc    H Acc    L Acc
------------------------------------------------------------
acharnenses.txt                  95.61%   95.33%   95.92%
contracelsum.txt                 96.94%   94.91%  100.00%
cratylus.txt                     96.12%   93.37%  100.00%
cyclops.txt                      96.27%   94.37%   98.99%
dionysiaca.txt                   96.33%   93.06%   99.36%
dioscorides.txt                  93.52%   86.96%   98.39%
enchiridion.txt                  95.81%   92.93%   99.38%
insolem.txt                      93.33%   92.72%   94.03%
norma_aristophanis_canticorum.txt   94.97%   93.54%   96.74%
oedipus.txt                      95.76%   94.33%   97.89%
partheneion.txt                  87.39%   80.88%   97.67%
plutarchus.txt                   96.18%   94.12%   98.84%
quintus.txt                      92.08%   89.21%   94.51%
supplices.txt                    88.66%   88.46%   88.89%
thucydides.txt                   95.97%   92.35%  100.00%

Overal




In [9]:
import os
import re
import torch
from tqdm import tqdm
from collections import defaultdict
from torch.nn.functional import softmax
from transformers import PreTrainedTokenizerFast, RobertaForTokenClassification

from syllagreek_utils import preprocess_greek_line, syllabify_joined

# -------- Load model --------
model_path = "Ericu950/macronizer_mini"

print(f"Benchmarking {model_path} on Norma Syllabarum Graecarum...")

tokenizer: PreTrainedTokenizerFast = PreTrainedTokenizerFast.from_pretrained(model_path)
model: RobertaForTokenClassification = RobertaForTokenClassification.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

max_len = min(
    getattr(tokenizer, "model_max_length", 512),
    getattr(model.config, "max_position_embeddings", 514) - 2
)
if max_len <= 0:
    max_len = 512

# -------- Helper: parse gold labels in files --------
def extract_expected_pattern(line):
    expected_labels = []
    tokens = []
    pattern = re.compile(r'(\[[^]]+\])|(\{[^}]+\})|([^\[\]\{\}]+)')  # 3 groups: [..], {..}, other
    for match in pattern.finditer(line):
        if match.group(1):
            content = match.group(1)[1:-1]
            tokens.append(content)
            expected_labels.append("H")
        elif match.group(2):
            content = match.group(2)[1:-1]
            tokens.append(content)
            expected_labels.append("L")
        elif match.group(3):
            tokens.append(match.group(3))
    raw_input = ''.join(tokens)
    return raw_input, expected_labels

# -------- Rule-based syllable classification --------
def classify_syllables(syllables, clear_mask):
    definitely_heavy_set = set("ὖὗἆἇἶἷήηωώἠἡἢἣἤἥἦἧὠὡὢὣὤὥὦὧὴὼᾄᾅᾆᾇᾐᾑᾔᾕᾖᾗᾠᾤᾦᾧᾳᾴᾶᾷῂῃῄῆῇῖῦῲῳῴῶῷ")
    ambiguous_set = set("ΐάίΰαιυϊϋύἀἁἂἃἄἅἰἱἲἳἴἵὐὑὓὔὕὰῒὶὺ")
    light_set = set("έεοόἐἑἓἔἕὀὁὂὃὄὅὲὸ")
    mute_consonants = set("βγδθκπτφχ")
    nonmute_consonants = set("λρμν")
    sigma = set("σ")
    all_consonants = mute_consonants | nonmute_consonants | sigma

    def token_contains(token, char_set):
        return any(ch in char_set for ch in token)

    def get_nucleus(syl):
        nucleus_chars = [ch for token in syl for ch in token if ch not in all_consonants]
        return ''.join(nucleus_chars) if nucleus_chars else None

    def classify_single_syllable(syl, next_syl):
        nucleus = get_nucleus(syl)
        if nucleus is None:
            return "light"

        if len(nucleus) >= 2:
            base_class = "heavy"
        elif token_contains(nucleus, definitely_heavy_set):
            base_class = "heavy"
        elif token_contains(nucleus, ambiguous_set):
            base_class = "ambiguous"
        elif token_contains(nucleus, light_set):
            base_class = "light"
        else:
            base_class = "light"

        final_char = syl[-1][-1]

        if base_class == "heavy":
            return "heavy"
        elif base_class == "ambiguous":
            if final_char in nonmute_consonants:
                return "heavy"
            if final_char in mute_consonants and next_syl is not None:
                next_onset = next_syl[0][0]
                if next_onset not in nonmute_consonants:
                    return "heavy"
            return "muta cum liquida"
        elif base_class == "light":
            if final_char in nonmute_consonants or final_char in sigma:
                return "heavy"
            elif final_char in mute_consonants and next_syl is not None:
                next_onset = next_syl[0][0]
                if next_onset in nonmute_consonants:
                    return "muta cum liquida"
                else:
                    return "heavy"
            else:
                return "light"

    classifications = []
    for i, syl in enumerate(syllables):
        if not clear_mask[i]:
            classifications.append(None)
            continue
        next_syl = syllables[i+1] if i < len(syllables) - 1 else None
        classifications.append(classify_single_syllable(syl, next_syl))
    return classifications

# -------- Predict syllable weights --------
def predict_syllable_weights(raw_line):
    tokens = preprocess_greek_line(raw_line)
    syllables = syllabify_joined(tokens)

    enc = tokenizer(
        syllables,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=max_len,
        padding=False
    )
    enc.pop("token_type_ids", None)
    enc = {k: v.to(device) for k, v in enc.items()}

    with torch.no_grad():
        outputs = model(**enc)
        probs = softmax(outputs.logits, dim=-1)
        pred_ids = torch.argmax(probs, dim=-1)[0].cpu().tolist()

    word_ids = tokenizer(
        syllables,
        is_split_into_words=True,
        truncation=True,
        max_length=max_len,
        add_special_tokens=True,
        return_offsets_mapping=False
    ).word_ids()

    aligned_preds = []
    syllable_index = 0
    for i, wid in enumerate(word_ids):
        if wid is None:
            continue
        if wid != syllable_index:
            syllable_index = wid
        if syllable_index < len(syllables):
            aligned_preds.append((syllables[syllable_index], pred_ids[i]))

    aligned_preds = aligned_preds[:len(syllables)]
    only_sylls = [s for s, _ in aligned_preds]
    labels = [l for _, l in aligned_preds]

    clear_mask = [l == 0 for l in labels]
    syllables_tokenized = [[ch for ch in syl] for syl in only_sylls]
    rule_based = classify_syllables(syllables_tokenized, clear_mask)

    final_labels = []
    for model_label, rule in zip(labels, rule_based):
        if model_label == 1:
            final_labels.append("H")
        elif model_label == 2:
            final_labels.append("L")
        elif model_label == 0:
            final_labels.append("H" if rule == "heavy" else "L" if rule == "light" else None)
        else:
            final_labels.append(None)
    return final_labels

# ---------------------------------------------------------------------------
# -------- Evaluation Loop (compute predictions on the fly) -----------------
# ---------------------------------------------------------------------------

gold_dir = "norma-syllabarum-graecarum/final"
failed_sentences_dir = "failed_sentences"
failed_words_dir = "failed_words"
os.makedirs(failed_sentences_dir, exist_ok=True)
os.makedirs(failed_words_dir, exist_ok=True)

results = defaultdict(lambda: {
    "H_correct":0, "H_total":0,
    "L_correct":0, "L_total":0,
    "all_correct":0, "all_total":0,
    "failed_sentences":[]  # stores tuples: (gold_line, linenr)
})

failed_rows = []

for filename in tqdm(os.listdir(gold_dir)):
    if not filename.endswith(".txt"):
        continue
    gold_path = os.path.join(gold_dir, filename)

    with open(gold_path, encoding="utf-8") as gf:
        for linenr, line in enumerate(gf, start=1):
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            raw_input, expected = extract_expected_pattern(line)
            predicted_labels = predict_syllable_weights(raw_input)

            # Stats
            for exp, pred_lbl in zip(expected, predicted_labels):
                if pred_lbl is None:
                    continue
                results[filename]["all_total"] += 1
                results[filename][f"{exp}_total"] += 1
                if exp == pred_lbl:
                    results[filename]["all_correct"] += 1
                    results[filename][f"{exp}_correct"] += 1

            # Record failing sentences
            fully_correct = all((p == e or p is None) for p, e in zip(predicted_labels, expected))
            if not fully_correct:
                results[filename]["failed_sentences"].append((line, linenr))

                # ---- Build TSV rows for failed words ----
                gold_pattern_str = ''.join(expected)
                pred_pattern_str = ''.join([p if p else "-" for p in predicted_labels])

                # Build syllable-to-word mapping
                syll_to_word = []
                words = re.findall(r'\S+', re.sub(r'[\[\]\{\}]', '', raw_input))
                for word in words:
                    word_sylls = syllabify_joined(preprocess_greek_line(word))
                    syll_to_word.extend([word] * len(word_sylls))

                # Find failing syllables
                failed_indices = [i for i, (g_lbl, p_lbl) in enumerate(zip(expected, predicted_labels)) if p_lbl != g_lbl and p_lbl is not None]

                # Map each failing syllable to its word (once per word)
                failed_words_seen = set()
                for idx in failed_indices:
                    if idx >= len(syll_to_word):
                        continue
                    failed_word = syll_to_word[idx]
                    if failed_word in failed_words_seen:
                        continue
                    failed_words_seen.add(failed_word)
                    failed_rows.append((failed_word, raw_input, gold_pattern_str, pred_pattern_str))

# Write TSV for failed words
outpath_words = os.path.join(failed_words_dir, "failed_words.tsv")
with open(outpath_words, "w", encoding="utf-8") as out:
    out.write("failed_word\tgold_line\tgold_pattern\tpredicted_pattern\n")
    for row in failed_rows:
        out.write("\t".join(row) + "\n")

# Write TSVs for failed sentences (line-wise)
for filename, data in results.items():
    outpath_sent = os.path.join(failed_sentences_dir, filename.replace(".txt","_failed_sentences.tsv"))
    with open(outpath_sent, "w", encoding="utf-8") as out:
        out.write("gold_sentence\tline_number\tpredicted_sentence\n")
        for gold_line, linenr in data["failed_sentences"]:
            # full predicted pattern string for reference
            pred_labels = predict_syllable_weights(extract_expected_pattern(gold_line)[0])
            pred_line_str = ''.join([p if p else "-" for p in pred_labels])
            out.write(f"{gold_line.replace('[','').replace(']','').replace('{','').replace('}','')}\t{linenr}\t{pred_line_str}\n")

# -------- Reporting (unchanged) --------
overall = defaultdict(int)
print(f"\n{'File':<30} {'All Acc':>8} {'H Acc':>8} {'L Acc':>8}")
print("-"*60)
for file, data in sorted(results.items()):
    all_acc = data["all_correct"]/data["all_total"] if data["all_total"] else 0
    h_acc = data["H_correct"]/data["H_total"] if data["H_total"] else 0
    l_acc = data["L_correct"]/data["L_total"] if data["L_total"] else 0
    print(f"{file:<30} {all_acc:8.2%} {h_acc:8.2%} {l_acc:8.2%}")
    for k in ["all_correct","all_total","H_correct","H_total","L_correct","L_total"]:
        overall[k] += data[k]

print("\nOverall Accuracy:")
all_acc = overall["all_correct"]/overall["all_total"] if overall["all_total"] else 0
h_acc = overall["H_correct"]/overall["H_total"] if overall["H_total"] else 0
l_acc = overall["L_correct"]/overall["L_total"] if overall["L_total"] else 0
print(f"{'All':<10}: {all_acc:.2%}")
print(f"{'Heavy':<10}: {h_acc:.2%}")
print(f"{'Light':<10}: {l_acc:.2%}")

Benchmarking Ericu950/macronizer_mini on Norma Syllabarum Graecarum...


100%|██████████| 16/16 [00:05<00:00,  3.17it/s]



File                            All Acc    H Acc    L Acc
------------------------------------------------------------
acharnenses.txt                  95.61%   95.33%   95.92%
contracelsum.txt                 96.94%   94.91%  100.00%
cratylus.txt                     96.12%   93.37%  100.00%
cyclops.txt                      96.27%   94.37%   98.99%
dionysiaca.txt                   96.33%   93.06%   99.36%
dioscorides.txt                  93.52%   86.96%   98.39%
enchiridion.txt                  95.81%   92.93%   99.38%
insolem.txt                      93.33%   92.72%   94.03%
norma_aristophanis_canticorum.txt   94.97%   93.54%   96.74%
oedipus.txt                      95.76%   94.33%   97.89%
partheneion.txt                  87.39%   80.88%   97.67%
plutarchus.txt                   96.18%   94.12%   98.84%
quintus.txt                      92.08%   89.21%   94.51%
supplices.txt                    88.66%   88.46%   88.89%
thucydides.txt                   95.97%   92.35%  100.00%

Overal

# New unified string-comparison bench algorithm

Ovan benchmark är stavelseviktsbaserad!! Bör vara vokallängdsbaserad. Förenkling: funktion som "skyfflar om" ^/_ i macronizer-mini:s norma till att ligga efter närmast föregående vokal, så att den kan jämföras enkelt med gold, på samma sätt som existerande macronizern.

In [None]:
import re
from grc_utils import DICHRONA

# What to not count as permutation: whitespace + punctuation
PUNCT_WHITESPACE = re.compile(r'[\s\u0387\u037e\u00b7\.,!?;:\"()\[\]{}<>«»\-—…|⏑⏓†×]')

def _is_punct(ch: str) -> bool:
    return bool(PUNCT_WHITESPACE.match(ch))

def _slide_left_over_punct(chars, i):
    """
    Move chars[i] left across any contiguous punctuation/whitespace
    by swapping in-place. Return the new index of the diacritic.
    """
    while i > 0 and _is_punct(chars[i - 1]):
        chars[i - 1], chars[i] = chars[i], chars[i - 1]
        i -= 1
    return i

def markup_after_dichrona(string: str) -> str:
    """
    Ensure every ^/_ follows a char in DICHRONA.

    Algorithm:
    - For each ^/_:
      1) Slide it left across whitespace/punctuation (free).
      2) If left neighbor ∈ DICHRONA → OK.
      3) Else perform exactly ONE real swap with that left neighbor.
      4) Then slide left across whitespace/punctuation again (free).
      5) If left neighbor ∈ DICHRONA → OK, else raise ValueError.
    """
    chars = list(string)
    i = 0
    n = len(chars)

    while i < n:
        if chars[i] in {"^", "_"}:
            # Free slide left first
            i = _slide_left_over_punct(chars, i)

            if i == 0:
                print(f"Markup error: diacritic at start in: {string!r}")

            if chars[i - 1] not in DICHRONA:
                # One real swap with the immediate (non-punct) neighbor
                chars[i], chars[i - 1] = chars[i - 1], chars[i]
                i -= 1  # diacritic moved one left

                # Free slide again after the real swap
                i = _slide_left_over_punct(chars, i)

                if i == 0 or chars[i - 1] not in DICHRONA:
                    print(f"Invalid markup: diacritic not after DICHRONA near index {i} in {string!r}")
                    return string
            # continue scanning; i now points to the diacritic's position
        i += 1

    return "".join(chars)

print(markup_after_dichrona("πατρί Κ^νοσσός"))
print(markup_after_dichrona("κα^τὰ Χ^ρι_στι^α_νῶν"))
print(markup_after_dichrona("ἄτρ^απος"))

πατρί^ Κνοσσός
κα^τὰ^ Χρι_στι^α_νῶν
Invalid markup: diacritic not after DICHRONA near index 2 in 'ἄτρ^απος'
ἄτρ^απος


In [23]:
import os
from tqdm import tqdm

directory = "norma_macronizer_mini"

files = [f for f in os.listdir(directory) if f.endswith(".txt")]

for f in tqdm(files):
    path = os.path.join(directory, f)

    with open(path, "r", encoding="utf-8") as input_file:
        lines = [markup_after_dichrona(line.strip()) for line in input_file]

    with open(path, "w", encoding="utf-8") as output_file:
        output_file.write("\n".join(lines))

100%|██████████| 15/15 [00:00<00:00, 953.41it/s]

Invalid markup: diacritic not after DICHRONA near index 20 in 'σιδήρῳκαὶἀ^πα^θὴστὸσῶ^μαπ^λασσόμενοσεἶτα^'
Invalid markup: diacritic not after DICHRONA near index 35 in 'λόγῳὅσα^ἡμέτερα^ἔργα^οὐκἐφἡμῖνδὲτὸσῶ^μα^ἡκτῆσισ^'
Invalid markup: diacritic not after DICHRONA near index 6 in 'πᾶντὸσῶ^μα^κἀ^ποσείσασθαιτὸγῆρασ^τόδε'
Invalid markup: diacritic not after DICHRONA near index 4 in 'ἱ^ερᾶ_σὑ^πὸτι_μῆσ'
Invalid markup: diacritic not after DICHRONA near index 24 in 'φρίκ^σασ_δαὐτοκόμουλοφιᾶσ_λα^σιαύχενα^χαίταν'
Invalid markup: diacritic not after DICHRONA near index 23 in 'ἡμεῖσδέγαὖδσητήσομενθρετ^τα^νελοτὸνκύκλωπα^'





huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Add Eric's preprocessing of diphthongs and final sigmas. Should also ignore longs in closed sylls! 

In [2]:
# === 2. Diphthong component sets ===
diphth_y = {'α', 'ε', 'η', 'ο'}
upsilon_forms = {'ὐ','ὔ','υ','ὑ','ύ','ὖ','ῦ','ὕ','ὗ','ὺ','ὒ','ὓ'}

diphth_i = {'α', 'ε', 'ο', 'υ'}
iota_forms = {'ἰ','ί','ι','ῖ','ἴ','ἶ','ἵ','ἱ','ἷ','ὶ','ἲ','ἳ'}

# Iota subscript/adscript combinations
adscr_i_first = {
    'α','η','ω','ἀ','ἠ','ὠ','ἁ','ἡ','ὡ','ά','ή','ώ','ὰ','ὴ','ὼ',
    'ᾶ','ῆ','ῶ','ὤ','ὥ','ὢ','ὣ','ἄ','ἅ','ἂ','ἃ','ἤ','ἥ','ἣ',
    'ἢ','ἦ','ἧ','ἆ','ἇ','ὧ','ὦ'
}
adscr_i_second = {'ι'}

# === 3. Word processor: expansion and diphthong merging ===

def process_word(word):
    """
    Expand special Greek letters and merge diphthongs.

    Args:
        word (str): A lowercase Greek word.

    Returns:
        list of str: A list of tokens (letters or diphthongs).
    """
    expanded = []

    # Step 1: Expand characters like ζ → δσ, ξ → κσ, etc.
    for char in word:
        if char == 'ζ':
            expanded.extend(['δ', 'σ'])
        elif char == 'ς':
            expanded.append('σ')
        elif char == 'ῥ':
            expanded.append('ρ')
        elif char == 'ξ':
            expanded.extend(['κ', 'σ'])
        elif char == 'ψ':
            expanded.extend(['π', 'σ'])
        else:
            expanded.append(char)

    # Step 2: Merge diphthongs and adscript combinations
    combined = []
    i = 0
    while i < len(expanded):
        a = expanded[i]
        b = expanded[i+1] if i + 1 < len(expanded) else ''

        if a in diphth_y and b in upsilon_forms:
            combined.append(a + b)
            i += 2
        elif a in diphth_i and b in iota_forms:
            combined.append(a + b)
            i += 2
        elif a in adscr_i_first and b in adscr_i_second:
            combined.append(a + b)
            i += 2
        else:
            combined.append(a)
            i += 1

    return combined

First we let it default to short like the algo macronizer:

In [5]:
import os
from grc_utils import lower_grc, normalize_word, VOWELS, CONSONANTS

greek_chars = VOWELS | CONSONANTS

macronizer_versions = "norma_macronizer_mini"
gold_versions = "norma-syllabarum-graecarum/final"

# global counters
short_total_global = 0
long_total_global = 0
short_fails_global = 0
long_fails_global = 0
line_total_global = 0
line_matches_global = 0

def clean_line(line):
    line = line.strip()
    line = lower_grc(line)
    line = normalize_word(line)

    remove = set(line) - greek_chars - set("^_")
    table = str.maketrans("", "", "".join(remove))
    line = line.translate(table)
    
    line = "".join(process_word(line))
    return line

def strip_markers(s):
    """Remove ^ and _ for comparison of base text."""
    return s.replace("^", "").replace("_", "")

# iterate over all files in gold folder
for fname in os.listdir(gold_versions):
    if not fname.endswith(".txt"):
        continue

    gold_path = os.path.join(gold_versions, fname)
    macronizer_path = os.path.join(macronizer_versions, fname)

    if not os.path.exists(macronizer_path):
        print(f"Skipping {fname}: no matching macronizer file.")
        continue

    with open(gold_path, encoding="utf-8") as g, open(macronizer_path, encoding="utf-8") as m:
        gold_lines = [clean_line(l) for l in g if l.strip()]
        macronizer_lines = [clean_line(l) for l in m if l.strip()]

    # per-file counters
    short_total = 0
    long_total = 0
    short_fails = 0
    long_fails = 0
    line_total = 0
    line_matches = 0

    # safeguard: iterate over min length
    for gold_line, macron_line in zip(gold_lines, macronizer_lines):
        gi = len(gold_line) - 1
        mi = len(macron_line) - 1

        # line-level match ignoring ^_
        line_total += 1
        if strip_markers(gold_line) == strip_markers(macron_line):
            line_matches += 1
        else:
            print(strip_markers(gold_line) + "\n≠\n" + strip_markers(macron_line) + "\n")

        while gi >= 0:
            gch = gold_line[gi]

            if gch == "^":
                short_total += 1
                if mi < 0 or macron_line[mi] != "^":
                    short_fails += 1
                    print(f"Gold:     {gold_line}")
                    print(f"Macronizer: {macron_line}")
                else:
                    mi -= 1  # matched short

            elif gch == "_":
                long_total += 1
                if mi < 0 or macron_line[mi] != "_":
                    long_fails += 1
                    print(f"Gold:     {gold_line}")
                    print(f"Macronizer: {macron_line}")
                else:
                    mi -= 1  # matched long

            else:
                mi -= 1  # advance only on real characters

            gi -= 1

    # update globals
    short_total_global += short_total
    long_total_global += long_total
    short_fails_global += short_fails
    long_fails_global += long_fails
    line_total_global += line_total
    line_matches_global += line_matches

    # per-file report
    short_success = short_total - short_fails
    long_success = long_total - long_fails
    both_total = short_total + long_total
    both_success = short_success + long_success

    print(f"\n=== File: {fname} ===")
    if short_total:
        print(f"^ success: {short_success}/{short_total} = {short_success/short_total:.4f}")
    if long_total:
        print(f"_ success: {long_success}/{long_total} = {long_success/long_total:.4f}")
    if both_total:
        print(f"Both success: {both_success}/{both_total} = {both_success/both_total:.4f}")
    if line_total:
        print(f"Line matches ignoring ^_: {line_matches}/{line_total} = {line_matches/line_total:.4f}")

# global summary
short_success_global = short_total_global - short_fails_global
long_success_global = long_total_global - long_fails_global
both_total_global = short_total_global + long_total_global
both_success_global = short_success_global + long_success_global

print("\n=== Global summary ===")
if short_total_global:
    print(f"^ success: {short_success_global}/{short_total_global} = {short_success_global/short_total_global:.4f}")
if long_total_global:
    print(f"_ success: {long_success_global}/{long_total_global} = {long_success_global/long_total_global:.4f}")
if both_total_global:
    print(f"Both success: {both_success_global}/{both_total_global} = {both_success_global/both_total_global:.4f}")
if line_total_global:
    print(f"Line matches ignoring ^_: {line_matches_global}/{line_total_global} = {line_matches_global/line_total_global:.4f}")



Gold:     ὁμὲνσωτὴρκαὶκύ_ρι^οσἡμῶνἰ_ησοῦσχριστὸσ
Macronizer: ὁμὲνσωτὴρκαὶκύ_ρι^οσἡμῶνἰ^ησοῦσχριστὸσ
Gold:     ὁμὲνσωτὴρκαὶκύ_ρι^οσἡμῶνἰ_ησοῦσχριστὸσ
Macronizer: ὁμὲνσωτὴρκαὶκύ_ρι^οσἡμῶνἰ^ησοῦσχριστὸσ
Gold:     ὁμὲνσωτὴρκαὶκύ_ρι^οσἡμῶνἰ_ησοῦσχριστὸσ
Macronizer: ὁμὲνσωτὴρκαὶκύ_ρι^οσἡμῶνἰ^ησοῦσχριστὸσ
Gold:     καὶτὰ_σἐνἰ_ουδαίοισπράκσεισκρείττουσγεγονέναιφωνῆσ
Macronizer: καὶτὰ_σἐνἰ^ουδαίοισπράκσεισκρείττουσγεγονέναιφωνῆσ
Gold:     καὶτὰ_σἐνἰ_ουδαίοισπράκσεισκρείττουσγεγονέναιφωνῆσ
Macronizer: καὶτὰ_σἐνἰ^ουδαίοισπράκσεισκρείττουσγεγονέναιφωνῆσ
Gold:     ἐλεγχούσηστὴνπσευδομαρτυ^ρί^α_νκαὶλέκσεωνἀ^πολογουμένων
Macronizer: ἐλεγχούσηστὴνπσευδομαρτυ^ρί^ανκαὶλέκσεωνἀ^πολογουμένων
Gold:     οἶδὅπωσπρὸστὰ_σκέλσουκα^τὰ^χρι_στι^α_νῶνἐνσυγγράμμα^σι^
Macronizer: οἶδὅπωσπρὸστὰ_σκέλσουκα^τὰ^χριστια_νῶνἐνσυγγράμμα^σι^
Gold:     οἶδὅπωσπρὸστὰ_σκέλσουκα^τὰ^χρι_στι^α_νῶνἐνσυγγράμμα^σι^
Macronizer: οἶδὅπωσπρὸστὰ_σκέλσουκα^τὰ^χριστια_νῶνἐνσυγγράμμα^σι^
Gold:     πσευδομαρτυ^ρί^α_σκαὶτῆσπίστεωστῶνἐκκλησι^ῶνἐ

NOTE: in article, also compare with best result possible with (finetuned/vanilla?) GPT-5?

Intressant att makronizer-mini ibland makroniserar stängda mcl: ἐνβι^βλί^ῳ

In [None]:
import grc_odycy_joint_trf

nlp = grc_odycy_joint_trf.load()

nlp.pipe(sentence_list)