In [13]:
import os
import glob
from nervaluate import Evaluator
import argparse

'''
This script uses nervaluate to compute the precision and recall of
automatic named-entity tagging against a gold standard.
It reads "prediction" files, each of which must contain everything to be
evaluated as four tab-separated columns:
TOKEN   PREDICTION     GOLD   VALIDITY
Deux       O             O        1
mois       O             O        1
...
Scores are computed and printed for each file; saving them to a
"<input name>_precision_rappel.csv" file is sketched at the end of the loop
but is currently disabled.
'''

# Initialize the command-line argument parser.
# ArgumentParser has to be called: without the parentheses `parser` would be
# bound to the class itself instead of a usable parser instance.
parser = argparse.ArgumentParser(
    description="Evaluate predicted named-entity tags against a gold standard."
)

def pretty_print(result, outfile=None):
    """Print an evaluation report as a tab-separated table.

    The table has one column per evaluation (the keys of ``result``) and one
    row per metric (the keys of the inner mappings). Every score is rounded
    to three decimals.

    Args:
        result: Mapping of evaluation name -> mapping of metric name ->
            numeric score.
        outfile: Writable text stream passed to ``print(file=...)``;
            ``None`` means standard output.

    Returns:
        None. The table is only printed, so callers should not print the
        return value.
    """
    evaluations = list(result)

    # Collect the metric names in first-seen order; they label the rows.
    metric_names = []
    for evaluation in evaluations:
        for metric in result[evaluation]:
            if metric not in metric_names:
                metric_names.append(metric)

    print('Measure', *evaluations, sep='\t', file=outfile)
    for metric in metric_names:
        # Look each score up by metric name rather than by position, so a
        # value always lands on its own row even when evaluations list their
        # metrics in a different order. A metric missing from an evaluation
        # is left as an empty cell.
        cells = []
        for evaluation in evaluations:
            metrics = result[evaluation]
            if metric in metrics:
                cells.append(str(round(metrics[metric], 3)))
            else:
                cells.append('')
        print(metric, *cells, sep='\t', file=outfile)
        
# Glob pattern selecting the prediction files to evaluate.
# NOTE(review): machine-specific absolute path; adapt it before running elsewhere.
INPUT_PATTERN = "/Users/alexsoares/Desktop/EHESS/dev/Savoirs_env/test_db/*"

# Evaluate every matching file. Sorting the matches makes the report order
# reproducible, since glob itself returns them in arbitrary order.
for input_file in sorted(glob.glob(INPUT_PATTERN)):
    st_annotation = []
    gold_annotation = []
    # Load the predicted and gold-standard tags.
    with open(input_file, 'r', encoding='utf-8') as fin:
        print(input_file)
        for line in fin:
            line = line.strip()
            if not line:
                continue
            try:
                # Exactly four tab-separated fields are required; the
                # validity column is read but not used afterwards.
                token, automatique, gold, validity = line.split('\t')
            except ValueError:
                # Malformed row: report it and skip it. Falling through would
                # append the previous row's values a second time (or raise
                # NameError if the very first row is malformed).
                print(line)
                continue
            automatique = automatique.upper()
            # Build the "token<TAB>tag" lines handed to the 'conll' loader.
            st_annotation.append(f"{token}\t{automatique}")
            gold_annotation.append(f"{token}\t{gold}")
    # Join the rows into the two texts passed to the evaluator.
    true = '\n'.join(gold_annotation)
    pred_st = '\n'.join(st_annotation)

    # Generate the precision and recall report.
    evaluator = Evaluator(true, pred_st, tags=['LOC', 'PER'], loader="conll")
    results, results_by_tag = evaluator.evaluate()
    # pretty_print writes the table itself and returns None, so wrapping it
    # in print() would emit a stray "None" after every report.
    pretty_print(results)

    # Disabled sketch: save the per-tag scores to a file named after the input.
    # NOTE(review): pretty_print rounds every inner value, so it needs a flat
    # evaluation -> metric -> number mapping; confirm the shape of
    # results_by_tag before enabling this.
    # output_dir = "/Users/alexsoares/Desktop/EHESS/dev/Savoirs_env/test_result/"
    # results_file = "%s%s_precision_rappel.csv" % (output_dir, os.path.splitext(os.path.basename(input_file))[0])
    # print(results_file)
    # with open(results_file, 'w', encoding="utf-8") as fpout:
    #     pretty_print(results_by_tag, outfile=fpout)

/Users/alexsoares/Desktop/EHESS/dev/Savoirs_env/test_db/BNU_01_Didier_textSpaCy_bert_prediction.tsv
Measure	ent_type	partial	strict	exact
correct	50	47	45	47
incorrect	6	0	11	9
partial	0	9	0	0
missed	3	3	3	3
spurious	31	31	31	31
possible	59	59	59	59
actual	87	87	87	87
precision	0.575	0.592	0.517	0.54
recall	0.847	0.873	0.763	0.797
f1	0.685	0.705	0.616	0.644
None
/Users/alexsoares/Desktop/EHESS/dev/Savoirs_env/test_db/BNU_02_Colin_textSpaCy_bert_prediction.tsv
Measure	ent_type	partial	strict	exact
correct	75	44	44	44
incorrect	0	0	31	31
partial	0	31	0	0
missed	177	177	177	177
spurious	460	460	460	460
possible	252	252	252	252
actual	535	535	535	535
precision	0.14	0.111	0.082	0.082
recall	0.298	0.236	0.175	0.175
f1	0.191	0.151	0.112	0.112
None
/Users/alexsoares/Desktop/EHESS/dev/Savoirs_env/test_db/BNU_03_Jacob_textSpaCy_bert_prediction.tsv
Measure	ent_type	partial	strict	exact
correct	42	43	41	43
incorrect	3	0	4	2
partial	0	2	0	0
missed	1	1	1	1
spurious	46	46	46	46
possible	46	46	46	46
a

In [None]:
# Raise Jupyter's IOPub data rate limit (the command below is kept disabled)
#!jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10