In [None]:
!pip install matplotlib

In [None]:
from pie_extended.cli.utils import get_tagger, get_model, download
from typing import List
import os
import xml.etree.ElementTree as ET

In [None]:
from pie_extended.models.grc.imports import get_iterator_and_processor

# Name of the Pie model used for the download, the tagger and the demo below.
model_name = "grc"

# Flip to True on the first run to fetch the model files.
do_download = False
if do_download:
    # download() is only iterated for its side effect (presumably it fetches
    # the files lazily - TODO confirm); the yielded values are not needed.
    for _progress in download(model_name):
        pass

tagger = get_tagger(model_name, batch_size=256, device="cpu", model_path=None)

# Smoke test: tag one sentence and print the raw analysis.
sentences: List[str] = ["ἄνδρα μοι ἔννεπε, μοῦσα, πολύτροπον. "]
for sentence_group in sentences:
    # A fresh iterator/processor pair is built for every sentence group.
    iterator, processor = get_iterator_and_processor()
    print(tagger.tag_str(sentence_group, iterator=iterator, processor=processor))

In [None]:
!git clone https://github.com/PerseusDL/treebank_data.git

In [None]:
# Folders of the cloned treebank_data repository that hold the Greek and
# Latin treebank texts (paths are relative to the notebook's working directory).
greek_dir, latin_dir = (
    './treebank_data/v2.1/Greek/texts',
    './treebank_data/v2.1/Latin/texts',
)

In [None]:
def parse_treebank_file(file_path):
    """Read the gold annotations of every sentence in one treebank XML file.

    Prints the file's base name and the number of ``<sentence>`` elements found.

    Note: a later cell defines another function with this same name that
    returns plain sentence text instead; whichever definition ran last is the
    one in effect, so the cells must be executed in order.

    Args:
        file_path: Path of the XML file to parse.

    Returns:
        A list of ``(file_name, sentence_index, words)`` tuples. ``words`` is a
        list with one dict per ``<word>`` element, holding its ``form``,
        ``lemma``, ``postag``, ``relation`` and ``head`` attributes (``None``
        for an attribute the element does not carry).
    """
    document_root = ET.parse(file_path).getroot()
    source_name = os.path.basename(file_path)
    sentence_nodes = document_root.findall('.//sentence')

    print(source_name)
    print(len(sentence_nodes))

    wanted_attributes = ('form', 'lemma', 'postag', 'relation', 'head')
    return [
        (
            source_name,
            position,
            [
                {attribute: word_node.get(attribute) for attribute in wanted_attributes}
                for word_node in sentence_node.findall('.//word')
            ],
        )
        for position, sentence_node in enumerate(sentence_nodes)
    ]

In [None]:
def process_gold_directory(directory, filter_text):
    """Collect the parsed sentences of every matching file under ``directory``.

    Walks ``directory`` recursively, parses each file whose name contains
    ``filter_text`` with ``parse_treebank_file`` and prints a line per file.

    Args:
        directory: Root folder to walk.
        filter_text: Substring a file name must contain to be parsed.

    Returns:
        One flat list with the entries returned by ``parse_treebank_file`` for
        all matching files, in the order the files were visited.
    """
    collected = []

    for current_dir, _subdirs, filenames in os.walk(directory):
        for filename in filenames:
            if filter_text not in filename:
                continue
            full_path = os.path.join(current_dir, filename)
            collected.extend(parse_treebank_file(full_path))
            print(f"Processed {full_path}")

    return collected

In [None]:
# Gold annotations of every file whose name contains 'tlg0012'. This cell has to
# run while parse_treebank_file is still the gold-annotation variant, i.e.
# before the cell that redefines it.
treebank_sentences = process_gold_directory(directory=greek_dir, filter_text='tlg0012')

In [None]:
def parse_treebank_file(file_path):
    """Read one treebank XML file as plain sentence strings.

    Note: this replaces the earlier function of the same name (the one that
    returns per-word gold annotations) for every cell executed afterwards.

    Args:
        file_path: Path of the XML file to parse.

    Returns:
        A list of ``(file_name, sentence_index, text)`` tuples, where ``text``
        is the ``form`` attribute of the sentence's ``<word>`` elements joined
        with single spaces.
    """
    xml_root = ET.parse(file_path).getroot()
    source_name = os.path.basename(file_path)

    def surface_text(sentence_node):
        # Rebuild the raw sentence from the word forms, in document order.
        return ' '.join(token.get('form') for token in sentence_node.findall('.//word'))

    return [
        (source_name, position, surface_text(sentence_node))
        for position, sentence_node in enumerate(xml_root.findall('.//sentence'))
    ]

In [None]:
def analyze_with_pie(sentences, model_name):
    """Tag every sentence with the Pie model called ``model_name``.

    The iterator/processor factory is looked up in
    ``pie_extended.models.<model_name>.imports`` so that it always belongs to
    the same model as the tagger, instead of relying on whichever
    ``get_iterator_and_processor`` an earlier cell happened to import.

    Args:
        sentences: Iterable of ``(file_id, index, text)`` tuples.
        model_name: Pie model name, e.g. ``"grc"``.

    Returns:
        A list of ``(file_id, index, analysis)`` tuples, where ``analysis`` is
        the value returned by ``tagger.tag_str`` for the sentence text.

    Raises:
        ImportError: If no ``imports`` module exists for ``model_name``.
    """
    # Function-scope import: nothing else in the notebook needs importlib.
    import importlib

    tagger = get_tagger(model_name, batch_size=256, device="cpu", model_path=None)
    model_imports = importlib.import_module(f"pie_extended.models.{model_name}.imports")
    iterator, processor = model_imports.get_iterator_and_processor()

    return [
        (file_id, index, tagger.tag_str(sentence, iterator=iterator, processor=processor))
        for file_id, index, sentence in sentences
    ]

In [None]:
def process_directory(directory, filter_text, model_name, nb_of_sentences):
    """Parse the matching files under ``directory`` and tag their first sentences.

    Args:
        directory: Root folder to walk recursively.
        filter_text: Substring a file name must contain to be parsed.
        model_name: Model name forwarded to ``analyze_with_pie``.
        nb_of_sentences: Number of leading sentences (across all files, in
            visiting order) that are sent to the tagger.

    Returns:
        Whatever ``analyze_with_pie`` returns for the selected sentences.
    """
    collected = []
    for folder, _subfolders, filenames in os.walk(directory):
        for name in filenames:
            if filter_text not in name:
                continue
            path = os.path.join(folder, name)
            collected.extend(parse_treebank_file(path))
            print(f"Processed {path}")

    # Only the head of the list is tagged, which keeps the run short.
    subset = collected[:nb_of_sentences]
    return analyze_with_pie(subset, model_name)

In [None]:
# Number of sentences handed to the tagger.
nb_of_sentences = 100
pie_sentences = process_directory(
    directory=greek_dir,
    filter_text='tlg0012',
    model_name='grc',
    nb_of_sentences=nb_of_sentences,
)

In [None]:
def compare_results(treebank_sentences, cltk_sentences):
    """Print gold and tagged sentences side by side for visual inspection.

    The two sequences are walked in parallel. The gold file name is printed for
    every pair; the two sentences are printed only when both entries carry the
    same file name and sentence index.

    Args:
        treebank_sentences: Sequence of ``(file_id, index, sentence)`` gold entries.
        cltk_sentences: Sequence of ``(file_id, index, sentence)`` tagger entries.
    """
    for gold_entry, tagged_entry in zip(treebank_sentences, cltk_sentences):
        gold_file, gold_index, gold_sentence = gold_entry
        tagged_file, tagged_index, tagged_sentence = tagged_entry

        print(gold_file)
        if not (gold_file == tagged_file and gold_index == tagged_index):
            continue

        print("treebank : " + str(gold_sentence))
        print("cltk data : " + str(tagged_sentence))

In [None]:
# Eyeball the gold sentences next to the tagger output.
compare_results(treebank_sentences=treebank_sentences, cltk_sentences=pie_sentences)

In [None]:
import unicodedata

def normalize_unicode(text):
    """Return ``text`` in Unicode NFC form; ``None`` is passed through unchanged."""
    if text is None:
        return None
    return unicodedata.normalize('NFC', text)

def compare_annotations(tb_sentence, pie_sentence):
    """List the lemma and POS disagreements between a gold and a tagged sentence.

    Words are paired purely by position (``zip``), so the comparison stops at
    the shorter of the two sequences. Lemmas and tags are NFC-normalised before
    being compared; the gold ``postag`` is compared in full against the
    tagger's ``pos`` value.

    Args:
        tb_sentence: Gold entry whose third item is a list of word dicts with
            ``form``, ``lemma`` and ``postag`` keys.
        pie_sentence: ``(file_id, index, analysis)`` tuple, where ``analysis``
            is a sequence of dicts with ``lemma`` and ``pos`` keys.

    Returns:
        ``{'lemma_errors': [...], 'pos_errors': [...]}``; each item records the
        gold word form together with the two values that differ.
    """
    _file_id, _index, pie_analysis = pie_sentence
    gold_words = tb_sentence[2]

    lemma_mismatches = []
    pos_mismatches = []

    for gold, predicted in zip(gold_words, pie_analysis):
        if normalize_unicode(gold['lemma']) != normalize_unicode(predicted['lemma']):
            lemma_mismatches.append({
                'word': gold['form'],
                'tb_lemma': gold['lemma'],
                'pie_lemma': predicted['lemma']
            })

        if normalize_unicode(gold['postag']) != normalize_unicode(predicted['pos']):
            pos_mismatches.append({
                'word': gold['form'],
                'tb_pos': gold['postag'],
                'pie_pos': predicted['pos']
            })

    return {
        'lemma_errors': lemma_mismatches,
        'pos_errors': pos_mismatches
    }


In [None]:
# Accumulate the per-sentence disagreements into one dict for the whole corpus.
all_errors = {'lemma_errors': [], 'pos_errors': []}

for gold_sentence, tagged_sentence in zip(treebank_sentences, pie_sentences):
    sentence_errors = compare_annotations(gold_sentence, tagged_sentence)
    for error_kind in ('lemma_errors', 'pos_errors'):
        all_errors[error_kind].extend(sentence_errors[error_kind])

print("Lemma Errors:", all_errors['lemma_errors'])
print("POS Errors:", all_errors['pos_errors'])


In [None]:
from collections import Counter

# Keep only the POS errors whose gold tag is usable (neither None nor an empty
# string, which would make the [0] lookup fail) and whose first character is
# not one of the excluded groups 'g', 'd' and 'b'.
excluded_pos_groups = ('g', 'd', 'b')
filtered_pos_errors = [
    error for error in all_errors['pos_errors']
    if error['tb_pos'] and error['tb_pos'][0] not in excluded_pos_groups
]

# Count the remaining errors per first character of the gold tag.
pos_error_counts = Counter(error['tb_pos'][0] for error in filtered_pos_errors)

print(pos_error_counts)

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the POS error counts, one bar per first character of the gold tag.
pos_groups = list(pos_error_counts.keys())
# Named differently from `errors`, which the aggregation cell already uses.
error_totals = list(pos_error_counts.values())

plt.figure(figsize=(10, 6))
plt.bar(pos_groups, error_totals, color='skyblue')
plt.xlabel('Groupes de POS Tags')
plt.ylabel('Nombre d\'erreurs')
# The analyses in this notebook come from Pie, so the title names that tagger.
plt.title('Erreurs de POS Tagging par Pie par Groupe de Tags')
plt.xticks(rotation=45)
plt.show()

In [None]:
import csv
# Relative to the notebook's working directory, like the export cell further
# down (which writes to this same path and therefore replaces this file).
output_file = './results/pie_results.csv'
# open() does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Write one row per lemma error, then one row per POS error.
with open(output_file, mode='w', newline='', encoding='utf-8') as csv_handle:
    writer = csv.writer(csv_handle)
    # Header row
    writer.writerow(['Word', 'TB Lemma', 'Pie Lemma', 'TB POS', 'Pie POS'])

    # Lemma errors leave the two POS columns empty.
    writer.writerows(
        [error['word'], error['tb_lemma'], error['pie_lemma'], '', '']
        for error in all_errors['lemma_errors']
    )

    # POS errors leave the two lemma columns empty.
    writer.writerows(
        [error['word'], '', '', error['tb_pos'], error['pie_pos']]
        for error in all_errors['pos_errors']
    )

In [None]:
import unicodedata

def normalize_greek(text):
    """Return ``text`` in Unicode NFC form, or ``None`` when ``text`` is ``None``."""
    return None if text is None else unicodedata.normalize('NFC', text)


def compare_global_annotations(tb_sentence, pie_data):
    """Pair gold words with Pie tokens by form and report lemma/POS agreement.

    Each gold word is matched with the first Pie token of the same
    NFC-normalised form that has not been matched yet, so a form occurring
    several times in the sentence is paired occurrence by occurrence instead of
    always with its first Pie token. Gold words without a remaining Pie token
    of the same form are left out of the result.

    Args:
        tb_sentence: Gold entry whose third item is a list of word dicts with
            ``form`` and ``postag`` keys and an optional ``lemma`` key.
        pie_data: Sequence of Pie token dicts with a ``form`` key and optional
            ``lemma`` and ``pos`` keys.

    Returns:
        A list with one dict per matched gold word, holding ``word``,
        ``tb_lemma``, ``pie_lemma``, ``lemma_match``, ``tb_pos_short``,
        ``pie_pos_short`` and ``pos_match``. The ``*_pos_short`` values are the
        first character of the tag, or ``None`` when the tag is missing or empty.
    """
    # Queue the Pie tokens per normalised form, in sentence order, so that each
    # token can be handed out at most once.
    tokens_by_form = {}
    for candidate in pie_data:
        tokens_by_form.setdefault(normalize_greek(candidate['form']), []).append(candidate)
    remaining = {form: iter(tokens) for form, tokens in tokens_by_form.items()}

    annotations = []

    for tb_word in tb_sentence[2]:
        tb_text = normalize_greek(tb_word['form'])
        tb_lemma = normalize_greek(tb_word.get('lemma'))
        tb_pos_short = tb_word['postag'][0] if tb_word['postag'] else None

        # Next unused Pie token with the same form, if there is one left.
        pie_token = next(remaining.get(tb_text, iter(())), None)
        if pie_token is None:
            continue

        pie_lemma = normalize_greek(pie_token.get('lemma', ''))
        pie_pos_short = pie_token.get('pos', '')[0] if pie_token.get('pos') else None

        annotations.append({
            'word': tb_text,
            'tb_lemma': tb_lemma,
            'pie_lemma': pie_lemma,
            'lemma_match': tb_lemma == pie_lemma,
            'tb_pos_short': tb_pos_short,
            'pie_pos_short': pie_pos_short,
            'pos_match': tb_pos_short == pie_pos_short
        })

    return annotations


In [None]:
output_file_pie = './results/pie_results.csv'
# open() does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(output_file_pie), exist_ok=True)

# Column order used for every data row below.
annotation_fields = ['word', 'tb_lemma', 'pie_lemma', 'lemma_match',
                     'tb_pos_short', 'pie_pos_short', 'pos_match']

with open(output_file_pie, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Word', 'TB Lemma', 'Pie Lemma', 'Lemma Match', 'TB POS Short', 'Pie POS Short', 'POS Match'])

    for tb_sentence, pie_tuple in zip(treebank_sentences, pie_sentences):
        file_id_pie, index_pie, pie_data = pie_tuple
        # Only compare entries that refer to the same file and sentence index.
        if file_id_pie == tb_sentence[0] and index_pie == tb_sentence[1]:
            try:
                annotations = compare_global_annotations(tb_sentence, pie_data)
                for annotation in annotations:
                    writer.writerow([annotation[field] for field in annotation_fields])
            except ValueError as e:
                # Best effort: report the problem and move on to the next sentence.
                print(f"Erreur dans les données : {e}")
