## Ici, on représente le mislabelling à l'aide de diagrammes de Venn. Les calculs sont fait en set pour ne pas étudier la même entité deux fois

In [1]:
import os
import csv
import matplotlib.pyplot as plt
# from matplotlib.patches import Circle
from matplotlib_venn import venn2, venn2_circles
# import numpy as np
import json

### 1) comparaison des outils sans différencier les textes étudiés

In [2]:
# Récupération des données des fichiers CSV 
def process_csv_file_tool(file_path, fn_set, fp_set, vp_set, auteur):
    with open(file_path, 'r', encoding='utf-8') as file:
        reader = csv.DictReader(file, delimiter=';')
        
        for row in reader:
            token = row['Token']
            label = row['Label']
            correction = row['Correction']

            if correction and not label:
                fn_set.add(token + "_" + auteur)  # False Negative
            elif correction and label:
                fp_set.add(token + "_" + auteur)  # False Positive
            elif label and not correction:
                vp_set.add(token + "_" + auteur)  # True Positive

In [3]:
def collect_annotations_tool(base_dir):
    tool_counters = {
        'REF': {'FN': set(), 'FP': set(), 'VP': set()},
        'Tesseract': {'FN': set(), 'FP': set(), 'VP': set()},
        'Kraken': {'FN': set(), 'FP': set(), 'VP': set()}
    }

    for annotator in os.listdir(base_dir):
        annotator_dir = os.path.join(base_dir, annotator)
        if os.path.isdir(annotator_dir):
            for author in os.listdir(annotator_dir):
                author_dir = os.path.join(annotator_dir, author)
                if os.path.isdir(author_dir):
                    for file_name in os.listdir(author_dir):
                        file_path = os.path.join(author_dir, file_name)
                        if file_name.endswith('.csv'):
                            if 'REF' in file_name:
                                tool = 'REF'
                            elif 'Tesseract' in file_name:
                                tool = 'Tesseract'
                            elif 'Kraken' in file_name:
                                tool = 'Kraken'
                            else:
                                continue
                            
                            process_csv_file_tool(file_path, tool_counters[tool]['FN'], tool_counters[tool]['FP'], tool_counters[tool]['VP'], author)
    return tool_counters


In [4]:
base_dir = './ANNOTATION_ANNOTATEURICES'

affichage = collect_annotations(base_dir)
print(affichage)

{'FN': {'océans_DAUDET', 'mon_DAUDET', 'notre_CARRAUD', 'église_DAUDET', 'sauvage_DAUDET', 'La_DAUDET', 'Du_CARRAUD', 'M._DAUDET', 'bord_CARRAUD', 'rive_DAUDET', 'jeune_DAUDET', '_CARRAUD', 'suivante_CARRAUD', 'vingt-cinq_CARRAUD', 'école_DAUDET', 'cinquante_CARRAUD', 'champs_CARRAUD', 'village_CARRAUD', 'habitation_DAUDET', 'bon_CARRAUD', 'rives_DAUDET', 'chènevière_CARRAUD', 'francs_DAUDET', 'chemin_CARRAUD', 'des_CARRAUD', 'brouillards_DAUDET', 'demain_CARRAUD', 'une_CARRAUD', 'loge_DAUDET', '..._DAUDET', 'maison_CARRAUD', 'vierge_DAUDET', 'tribu_DAUDET', 'fossé_CARRAUD', 'fabrique_DAUDET', 'chœur_DAUDET', 'Maman_CARRAUD', 'pays_DAUDET', 'ruisseaux_CARRAUD', 'église_CARRAUD', 'printemps_CARRAUD', 'matelots_DAUDET', 'père_DAUDET', 'La_CARRAUD', 'déserte_DAUDET', 'portes_DAUDET', 'Petite_CARRAUD', 'grange_CARRAUD', 'dix_CARRAUD', "d'_DAUDET", 'Dieu_CARRAUD', 'ma_CARRAUD', 'Dieu_DAUDET', 'nuit_CARRAUD', 'correction_DAUDET', 'monsieur_DAUDET', 'ciel_DAUDET', 'vendredi_CARRAUD', 'la_CARR

In [4]:
def create_and_save_venn_diagram_tool(fn_set, fp_set, vp_set, output_path, tool_name):
    font2 = {'size': 20}  # Adjust the font size here for overall plot
    plt.rc('font', **font2)
    plt.rcParams['text.color'] = 'black'

    plt.figure(figsize=(14, 10))
    venn = venn2([fn_set, fp_set], set_labels=('Annotations Manuelles (FN)', 'Annotations SpaCy (FP)'), set_colors=("darkgrey", "darkblue"), alpha=0.5)
    venn2_circles([fn_set, fp_set], linestyle="dotted", linewidth=1)
    plt.title(f"Diagramme de Venn pour {tool_name}", fontsize=22)

    # Adjust the size of the subset labels (the numbers)
    for text in venn.subset_labels:
        if text:
            text.set_fontsize(48)

    # Adjust position of the set labels (FN and FP)
    for text in venn.set_labels:
        if text:
            text.set_fontsize(26)

    plt.tight_layout(pad=2)
    plt.savefig(output_path, dpi=300)
    plt.close()

In [5]:
# Chemin du répertoire contenant les fichiers CSV
base_dir = './ANNOTATION_ANNOTATEURICES'
output_dir = './venn_diagrams'

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Collecter les annotations
tool_counters = collect_annotations_tool(base_dir)

# Créer et sauvegarder les diagrammes de Venn pour chaque outil
for tool, counters in tool_counters.items():
    output_path = os.path.join(output_dir, f'venn_diagram_{tool}.png')
    create_and_save_venn_diagram_tool(counters['FN'], counters['FP'], counters['VP'], output_path, tool)
    print(f"Diagramme de Venn pour {tool} sauvegardé à {output_path}")

Diagramme de Venn pour REF sauvegardé à ./venn_diagrams\venn_diagram_REF.png
Diagramme de Venn pour Tesseract sauvegardé à ./venn_diagrams\venn_diagram_Tesseract.png
Diagramme de Venn pour Kraken sauvegardé à ./venn_diagrams\venn_diagram_Kraken.png


### 2) comparaison des outils avec différenciation des textes étudiés

In [7]:
def process_csv_file(file_path, fn_set, fp_set, vp_set):
    with open(file_path, 'r', encoding='utf-8') as file:
        reader = csv.DictReader(file, delimiter=';')
        
        for row in reader:
            token = row['Token']
            label = row['Label']
            correction = row['Correction']

            if correction and not label:
                fn_set.add(token)  # False Negative
            elif correction and label:
                fp_set.add(token)  # False Positive
            elif label and not correction:
                vp_set.add(token)  # True Positive

def collect_annotations_by_text(base_dir):
    tool_counters = {}

    for annotator in os.listdir(base_dir):
        annotator_dir = os.path.join(base_dir, annotator)
        if os.path.isdir(annotator_dir):
            for author in os.listdir(annotator_dir):
                author_dir = os.path.join(annotator_dir, author)
                if os.path.isdir(author_dir):
                    for file_name in os.listdir(author_dir):
                        file_path = os.path.join(author_dir, file_name)
                        if file_name.endswith('.csv'):
                            if 'REF' in file_name:
                                tool = 'REF'
                            elif 'Tesseract' in file_name:
                                tool = 'Tesseract'
                            elif 'Kraken' in file_name:
                                tool = 'Kraken'
                            else:
                                continue
                            
                            # Extract text name from file name
                            text_name = file_name.split('_')[2]  
                            
                            if text_name not in tool_counters:
                                tool_counters[text_name] = {
                                    'REF': {'FN': set(), 'FP': set(), 'VP': set()},
                                    'Tesseract': {'FN': set(), 'FP': set(), 'VP': set()},
                                    'Kraken': {'FN': set(), 'FP': set(), 'VP': set()}
                                }
                            
                            process_csv_file(file_path, tool_counters[text_name][tool]['FN'], tool_counters[text_name][tool]['FP'], tool_counters[text_name][tool]['VP'])

    return tool_counters

In [8]:
def create_and_save_venn_diagram(fn_set, fp_set, vp_set, output_path, tool_name, text_name):
    font2 = {'size': 16}
    plt.rc('font', **font2)
    plt.rcParams['text.color'] = 'black'

    plt.figure(figsize=(14, 10))
    venn = venn2([fn_set, fp_set], set_labels=('Annotations Manuelles (FN)', 'Annotations SpaCy (FP)'), set_colors=("darkgrey", "darkblue"), alpha=0.5)
    venn2_circles([fn_set, fp_set], linestyle="dotted", linewidth=1)
    plt.title(f"Diagramme de Venn pour {tool_name} - {text_name}")

    # Adjust position of the labels if necessary
    for text in venn.set_labels:
        if text:
            text.set_fontsize(26)
    for text in venn.subset_labels:
        if text:
            text.set_fontsize(48)

    plt.tight_layout(pad=2)
    plt.savefig(output_path, dpi=300)
    plt.close()


In [9]:
# Chemin du répertoire contenant les fichiers CSV
base_dir = './ANNOTATION_ANNOTATEURICES'
output_dir = './venn_diagrams'

# Collecter les annotations par texte
tool_counters_by_text = collect_annotations_by_text(base_dir)

# Créer et sauvegarder les diagrammes de Venn pour chaque texte et chaque outil
for text_name, tool_counters in tool_counters_by_text.items():
    for tool, counters in tool_counters.items():
        output_path = os.path.join(output_dir, f'venn_diagram_{tool}_{text_name}.png')
        create_and_save_venn_diagram(counters['FN'], counters['FP'], counters['VP'], output_path, tool, text_name)
        print(f"Diagramme de Venn pour {tool} - {text_name} sauvegardé à {output_path}")


Diagramme de Venn pour REF - petite-Jeanne sauvegardé à ./venn_diagrams\venn_diagram_REF_petite-Jeanne.png
Diagramme de Venn pour Tesseract - petite-Jeanne sauvegardé à ./venn_diagrams\venn_diagram_Tesseract_petite-Jeanne.png
Diagramme de Venn pour Kraken - petite-Jeanne sauvegardé à ./venn_diagrams\venn_diagram_Kraken_petite-Jeanne.png
Diagramme de Venn pour REF - petit-chose sauvegardé à ./venn_diagrams\venn_diagram_REF_petit-chose.png
Diagramme de Venn pour Tesseract - petit-chose sauvegardé à ./venn_diagrams\venn_diagram_Tesseract_petit-chose.png
Diagramme de Venn pour Kraken - petit-chose sauvegardé à ./venn_diagrams\venn_diagram_Kraken_petit-chose.png


In [None]:
#Test pour modifier la taille du cercle proportionnellement au nombre de token: test non concluant

In [32]:
# import os
# import csv
# import matplotlib.pyplot as plt
# import numpy as np
# from matplotlib.patches import Circle
# from matplotlib_venn import venn2, venn2_circles

# def process_csv_file(file_path, fn_set, fp_set, vp_set, author):
#     with open(file_path, 'r', encoding='utf-8') as file:
#         reader = csv.DictReader(file, delimiter=';')
        
#         for row in reader:
#             token = row['Token']
#             label = row['Label']
#             correction = row['Correction']

#             if correction and not label:
#                 fn_set.add(token + "_" + author)  # False Negative
#             elif correction and label:
#                 fp_set.add(token + "_" + author)  # False Positive
#             elif label and not correction:
#                 vp_set.add(token + "_" + author)  # True Positive

# def collect_annotations(base_dir):
#     tool_counters = {
#         'REF': {'FN': set(), 'FP': set(), 'VP': set()},
#         'Tesseract': {'FN': set(), 'FP': set(), 'VP': set()},
#         'Kraken': {'FN': set(), 'FP': set(), 'VP': set()}
#     }

#     for annotator in os.listdir(base_dir):
#         annotator_dir = os.path.join(base_dir, annotator)
#         if os.path.isdir(annotator_dir):
#             for author in os.listdir(annotator_dir):
#                 author_dir = os.path.join(annotator_dir, author)
#                 if os.path.isdir(author_dir):
#                     for file_name in os.listdir(author_dir):
#                         file_path = os.path.join(author_dir, file_name)
#                         if file_name.endswith('.csv'):
#                             if 'REF' in file_name:
#                                 tool = 'REF'
#                             elif 'Tesseract' in file_name:
#                                 tool = 'Tesseract'
#                             elif 'Kraken' in file_name:
#                                 tool = 'Kraken'
#                             else:
#                                 continue
                            
#                             process_csv_file(file_path, tool_counters[tool]['FN'], tool_counters[tool]['FP'], tool_counters[tool]['VP'], author)
#     return tool_counters

# def collect_annotations_by_text(base_dir):
#     tool_counters = {}

#     for annotator in os.listdir(base_dir):
#         annotator_dir = os.path.join(base_dir, annotator)
#         if os.path.isdir(annotator_dir):
#             for author in os.listdir(annotator_dir):
#                 author_dir = os.path.join(annotator_dir, author)
#                 if os.path.isdir(author_dir):
#                     for file_name in os.listdir(author_dir):
#                         file_path = os.path.join(author_dir, file_name)
#                         if file_name.endswith('.csv'):
#                             if 'REF' in file_name:
#                                 tool = 'REF'
#                             elif 'Tesseract' in file_name:
#                                 tool = 'Tesseract'
#                             elif 'Kraken' in file_name:
#                                 tool = 'Kraken'
#                             else:
#                                 continue
                            
#                             # Extract text name from file name
#                             text_name = file_name.split('_')[2]  # Adjust this based on your file naming convention
                            
#                             if text_name not in tool_counters:
#                                 tool_counters[text_name] = {
#                                     'REF': {'FN': set(), 'FP': set(), 'VP': set()},
#                                     'Tesseract': {'FN': set(), 'FP': set(), 'VP': set()},
#                                     'Kraken': {'FN': set(), 'FP': set(), 'VP': set()}
#                                 }
                            
#                             process_csv_file(file_path, tool_counters[text_name][tool]['FN'], tool_counters[text_name][tool]['FP'], tool_counters[text_name][tool]['VP'], author)

#     return tool_counters

# def draw_proportional_venn(fn_set, fp_set, vp_set, output_path, tool_name, title):
#     # Calcul des tailles
#     size_fn = len(fn_set)
#     size_fp = len(fp_set)
#     size_intersection = len(fn_set & fp_set)

#     # Calcul des rayons des cercles
#     radius_fn = np.sqrt(size_fn / np.pi)
#     radius_fp = np.sqrt(size_fp / np.pi)

#     # Position des centres des cercles
#     center_fn = (-radius_fn, 0)
#     center_fp = (radius_fp, 0)

#     fig, ax = plt.subplots(figsize=(14, 10))

#     # Création des cercles
#     circle_fn = Circle(center_fn, radius_fn, edgecolor='darkgrey', facecolor='darkgrey', alpha=0.5)
#     circle_fp = Circle(center_fp, radius_fp, edgecolor='darkblue', facecolor='darkblue', alpha=0.5)

#     ax.add_patch(circle_fn)
#     ax.add_patch(circle_fp)

#     # Calcul des coordonnées pour l'intersection
#     intersection_radius = np.sqrt(size_intersection / np.pi)
#     intersection_center = (0, 0)
#     if size_intersection > 0:
#         circle_intersection = Circle(intersection_center, intersection_radius, edgecolor='black', facecolor='purple', alpha=0.5)
#         ax.add_patch(circle_intersection)

#     # Ajustement de l'échelle et des limites de l'axe
#     ax.set_xlim(-radius_fn - 1, radius_fp + 1)
#     ax.set_ylim(-radius_fp - 1, radius_fp + 1)
#     ax.set_aspect('equal')

#     # Ajout des annotations
#     ax.text(center_fn[0], 0, str(size_fn), horizontalalignment='center', verticalalignment='center', fontsize=20, color='black')
#     ax.text(center_fp[0], 0, str(size_fp), horizontalalignment='center', verticalalignment='center', fontsize=20, color='black')
#     if size_intersection > 0:
#         ax.text(0, 0, str(size_intersection), horizontalalignment='center', verticalalignment='center', fontsize=20, color='black')

#     ax.set_title(title, fontsize=22)
#     ax.axis('off')
#     plt.tight_layout(pad=2)
#     plt.savefig(output_path, dpi=300)
#     plt.close()

# # Chemin du répertoire contenant les fichiers CSV
# base_dir = './ANNOTATION_ANNOTATEURICES'
# output_dir = './Diagrammes_Venn'

# if not os.path.exists(output_dir):
#     os.makedirs(output_dir)

# # Collecter les annotations par outil uniquement
# tool_counters = collect_annotations(base_dir)

# # Créer et sauvegarder les diagrammes de Venn pour chaque outil
# for tool, counters in tool_counters.items():
#     output_path = os.path.join(output_dir, f'venn_diagram_{tool}.png')
#     draw_proportional_venn(counters['FN'], counters['FP'], counters['VP'], output_path, tool, f"Diagramme de Venn pour {tool}")
#     print(f"Diagramme de Venn pour {tool} sauvegardé à {output_path}")

# # Collecter les annotations par texte et outil
# tool_counters_by_text = collect_annotations_by_text(base_dir)

# # Créer et sauvegarder les diagrammes de Venn pour chaque texte et chaque outil
# for text_name, tool_counters in tool_counters_by_text.items():
#     for tool, counters in tool_counters.items():
#         output_path = os.path.join(output_dir, f'venn_diagram_{tool}_{text_name}.png')
#         draw_proportional_venn(counters['FN'], counters['FP'], counters['VP'], output_path, tool, f"Diagramme de Venn pour {tool} - {text_name}")
#         print(f"Diagramme de Venn pour {tool} - {text_name} sauvegardé à {output_path}")


Diagramme de Venn pour REF sauvegardé à ./Diagrammes_Venn\venn_diagram_REF.png
Diagramme de Venn pour Tesseract sauvegardé à ./Diagrammes_Venn\venn_diagram_Tesseract.png
Diagramme de Venn pour Kraken sauvegardé à ./Diagrammes_Venn\venn_diagram_Kraken.png
Diagramme de Venn pour REF - petite-Jeanne sauvegardé à ./Diagrammes_Venn\venn_diagram_REF_petite-Jeanne.png
Diagramme de Venn pour Tesseract - petite-Jeanne sauvegardé à ./Diagrammes_Venn\venn_diagram_Tesseract_petite-Jeanne.png
Diagramme de Venn pour Kraken - petite-Jeanne sauvegardé à ./Diagrammes_Venn\venn_diagram_Kraken_petite-Jeanne.png
Diagramme de Venn pour REF - petit-chose sauvegardé à ./Diagrammes_Venn\venn_diagram_REF_petit-chose.png
Diagramme de Venn pour Tesseract - petit-chose sauvegardé à ./Diagrammes_Venn\venn_diagram_Tesseract_petit-chose.png
Diagramme de Venn pour Kraken - petit-chose sauvegardé à ./Diagrammes_Venn\venn_diagram_Kraken_petit-chose.png


## Calcul des métriques

### Calcul des métriques par outil

In [13]:
# Fonction pour calculer précision, rappel et f-score
def calculate_metrics(fn_set, fp_set, vp_set):
    FN = len(fn_set)
    FP = len(fp_set)
    VP = len(vp_set)
    
    precision = VP / (VP + FP) if (VP + FP) > 0 else 0
    recall = VP / (VP + FN) if (VP + FN) > 0 else 0
    f_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return {
        "precision": precision,
        "recall": recall,
        "f_score": f_score,
        "FN": FN,
        "FP": FP,
        "VP": VP
    }

# Process CSV file (for both programs)
def process_csv_file(file_path, fn_set, fp_set, vp_set, author=None):
    with open(file_path, 'r', encoding='utf-8') as file:
        reader = csv.DictReader(file, delimiter=';')
        
        for row in reader:
            token = row['Token']
            label = row['Label']
            correction = row['Correction']
            identifier = token if author is None else f"{token}_{author}"

            if correction and not label:
                fn_set.add(identifier)  # False Negative
            elif correction and label:
                fp_set.add(identifier)  # False Positive
            elif label and not correction:
                vp_set.add(identifier)  # True Positive

# Collect annotations by tool (Program 1)
def collect_annotations_tool(base_dir):
    tool_counters = {
        'REF': {'FN': set(), 'FP': set(), 'VP': set()},
        'Tesseract': {'FN': set(), 'FP': set(), 'VP': set()},
        'Kraken': {'FN': set(), 'FP': set(), 'VP': set()}
    }

    for annotator in os.listdir(base_dir):
        annotator_dir = os.path.join(base_dir, annotator)
        if os.path.isdir(annotator_dir):
            for author in os.listdir(annotator_dir):
                author_dir = os.path.join(annotator_dir, author)
                if os.path.isdir(author_dir):
                    for file_name in os.listdir(author_dir):
                        file_path = os.path.join(author_dir, file_name)
                        if file_name.endswith('.csv'):
                            if 'REF' in file_name:
                                tool = 'REF'
                            elif 'Tesseract' in file_name:
                                tool = 'Tesseract'
                            elif 'Kraken' in file_name:
                                tool = 'Kraken'
                            else:
                                continue
                            
                            process_csv_file(file_path, tool_counters[tool]['FN'], tool_counters[tool]['FP'], tool_counters[tool]['VP'], author)
    return tool_counters

# Collect annotations by text (Program 2)
def collect_annotations_by_text(base_dir):
    tool_counters = {}

    for annotator in os.listdir(base_dir):
        annotator_dir = os.path.join(base_dir, annotator)
        if os.path.isdir(annotator_dir):
            for author in os.listdir(annotator_dir):
                author_dir = os.path.join(annotator_dir, author)
                if os.path.isdir(author_dir):
                    for file_name in os.listdir(author_dir):
                        file_path = os.path.join(author_dir, file_name)
                        if file_name.endswith('.csv'):
                            if 'REF' in file_name:
                                tool = 'REF'
                            elif 'Tesseract' in file_name:
                                tool = 'Tesseract'
                            elif 'Kraken' in file_name:
                                tool = 'Kraken'
                            else:
                                continue
                            
                            text_name = file_name.split('_')[2]
                            
                            if text_name not in tool_counters:
                                tool_counters[text_name] = {
                                    'REF': {'FN': set(), 'FP': set(), 'VP': set()},
                                    'Tesseract': {'FN': set(), 'FP': set(), 'VP': set()},
                                    'Kraken': {'FN': set(), 'FP': set(), 'VP': set()}
                                }
                            
                            process_csv_file(file_path, tool_counters[text_name][tool]['FN'], tool_counters[text_name][tool]['FP'], tool_counters[text_name][tool]['VP'])

    return tool_counters

# Save metrics to JSON
def save_metrics_to_json(metrics, output_path):
    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(metrics, json_file, ensure_ascii=False, indent=4)

# Main program
base_dir = './ANNOTATION_ANNOTATEURICES'
output_dir = './results'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Process for Program 1
tool_counters_tool = collect_annotations_tool(base_dir)
metrics_tool = {tool: calculate_metrics(counters['FN'], counters['FP'], counters['VP']) for tool, counters in tool_counters_tool.items()}
save_metrics_to_json(metrics_tool, os.path.join(output_dir, 'metrics_tool.json'))

# Process for Program 2
tool_counters_text = collect_annotations_by_text(base_dir)
metrics_text = {
    text_name: {tool: calculate_metrics(counters['FN'], counters['FP'], counters['VP']) for tool, counters in tool_counters.items()}
    for text_name, tool_counters in tool_counters_text.items()
}
save_metrics_to_json(metrics_text, os.path.join(output_dir, 'metrics_text.json'))