In [1]:
import re
import time

from grc_utils import has_ambiguous_dichrona_in_open_syllables
from macronize import macronize, macronize_text


test_words = ['λύω', 'ὕδασιν', 'ἱστάμεθα', 'ὕδωρ']

start_time = time.time()
batch_results = macronize(test_words, ifeellucky=True)
end_time = time.time()
print(f"Batch processing time: {end_time - start_time} seconds")

for w in test_words:
    print(f"{w} -> {batch_results[w]}")



Batch processing time: 2.035094976425171 seconds
λύω -> λῡ́ω
ὕδασιν -> ῠ̔́δᾰσῐν
ἱστάμεθα -> ῐ̔στᾰ́μεθᾰ
ὕδωρ -> ὕδωρ


In [21]:
from proper_names import proper_names

def get_words(text):
    return re.findall(r'\w+', text)

def remove_proper_names(text):
    # Build a regex pattern that matches whole words from the set
    pattern = r'\b(?:' + '|'.join(re.escape(name) for name in proper_names) + r')\b'

    # Remove names, handling extra spaces that might appear
    cleaned_text = re.sub(pattern, '', text).strip()
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Normalize spaces

    return cleaned_text

def macronization_ratio_words(text, proper_names=True):
    if not proper_names:
        text = remove_proper_names(text)

    words = get_words(text)

    fully_disambiguated = 0
    for word in words:
        if has_ambiguous_dichrona_in_open_syllables(word):
            continue
        fully_disambiguated += 1

    print(f'\nDisambiguated {fully_disambiguated} words out of {len(words)}')

    ratio = fully_disambiguated / len(words)
    return ratio

sentence = 'Δαρείου καὶ ἱστάμεθα Παρυσάτιδος γίγνονται παῖδες δύο, πρεσβύτερος μὲν Ἀρταξέρξης, νεώτερος δὲ Κῦρος· ἐπεὶ δὲ ἠσθένει Δαρεῖος καὶ ὑπώπτευε τελευτὴν τοῦ βίου, ἐβούλετο τὼ παῖδε ἀμφοτέρω παρεῖναι. ὁ μὲν οὖν πρεσβύτερος παρὼν ἐτύγχανε· Κῦρον δὲ μεταπέμπεται ἀπὸ τῆς ἀρχῆς ἧς αὐτὸν σατράπην ἐποίησε, καὶ στρατηγὸν δὲ αὐτὸν ἀπέδειξε πάντων ὅσοι ἐς Καστωλοῦ πεδίον ἁθροίζονται. ἀναβαίνει οὖν ὁ Κῦρος λαβὼν Τισσαφέρνην ὡς φίλον, καὶ τῶν Ἑλλήνων ἔχων ὁπλίτας ἀνέβη τριακοσίους, ἄρχοντα δὲ αὐτῶν Ξενίαν Παρράσιον.'

macronized_sentence = macronize_text(sentence)
print(f'Macronized text:\n{macronized_sentence}')

print(f'Stil ambiguous words:\n')
for word in get_words(macronized_sentence):
    if has_ambiguous_dichrona_in_open_syllables(word):
        print(word)    

print(f'\nMacronization ratio: {macronization_ratio_words(macronized_sentence)}')
print(f'\nMacronization ratio excl. proper names: {macronization_ratio_words(macronized_sentence, proper_names=False)}')

Elapsed time: 1.96 seconds
Macronized text:
Δαρείου καὶ ῐ̔στᾰ́μεθᾰ Παρυσάτιδος γίγνονται παῖδες δῠ́ο, πρεσβῠ́τερος μὲν Ἀρταξέρξης, νεώτερος δὲ Κῦρος· ἐπεὶ δὲ ἠσθένει Δαρεῖος καὶ ῠ̔πώπτευε τελευτὴν τοῦ βῐ́ου, ἐβούλετο τὼ παῖδε ᾰ̓μφοτέρω πᾰρεῖναι. ὁ μὲν οὖν πρεσβῠ́τερος πᾰρὼν ἐτύγχᾰνε· Κῦρον δὲ μετᾰπέμπεται ἀπὸ τῆς ἀρχῆς ἧς αὐτὸν σατράπην ἐποίησε, καὶ στρατηγὸν δὲ αὐτὸν ἀπέδειξε πᾰ́ντων ὅσοι ἐς Καστωλοῦ πεδίον ἁθροίζονται. ᾰ̓νᾰβαίνει οὖν ὁ Κῦρος λᾰβὼν Τισσαφέρνην ὡς φῐ́λον, καὶ τῶν Ἑλλήνων ἔχων ὁπλῑ́τᾱς ᾰ̓νέβη τρῐᾱκοσῐ́ους, ᾰ̓́ρχοντᾰ δὲ αὐτῶν Ξενίαν Παρράσιον.
Stil ambiguous words:

Δαρείου
Παρυσάτιδος
Ἀρταξέρξης
Δαρεῖος
ἀπὸ
σατράπην
στρατηγὸν
ἀπέδειξε
Τισσαφέρνην
Παρράσιον

Disambiguated 79 words out of 89

Macronization ratio: 0.8876404494382022

Disambiguated 73 words out of 77

Macronization ratio excl. proper names: 0.948051948051948


If not double checking baytones against oxytones, I get:

> Elapsed time: 2.16 seconds
Disambiguated 52714 words out of 62296

> Macronization ratio: 0.8461859509438808


In [None]:
from anabasis import anabasis

macronized_anabasis = macronize_text(anabasis)

print(f'\nMacronization ratio: {macronization_ratio_words(macronized_anabasis)}')
print(f'\nMacronization ratio excl. proper names: {macronization_ratio_words(macronized_anabasis, proper_names=False)}')

with open('anabasis_macronized.py', 'w', encoding='utf-8') as file:
    file.write(f"anabasis_macronized = '''{macronized_anabasis}'''")


Elapsed time: 2.34 seconds
Disambiguated 53730 words out of 62803

Macronization ratio: 0.8555323790264796
Disambiguated 51952 words out of 60005

Macronization ratio excl. proper names: 0.865794517123573


In [1]:
from class_macronizer import Macronizer

input = '''ἀγαθὸς, καλὸς, ἀνήρ'''

macronizer = Macronizer(unicode=False)            
output_unicode = macronizer.macronize_text(input)
macronizer.print_evaluation(input)


#macronizer_markup = Macronizer()
#output_markup = macronizer_markup.macronize_text(input)



Elapsed time: 1.93 seconds

EVALUATION
******************

Input text:
ἀγαθὸς, καλὸς, ἀνήρ...
Elapsed time: 2.04 seconds

Macronized text:
ἀ^γα^θὸς, κα^λὸς, ἀνήρ...

Disambiguated 6 words out of 10 (including proper names)

Macronization ratio: 0.60

Disambiguated 6 words out of 10 (excluding proper names)

Macronization ratio (no proper names): 0.60
