In [8]:
from typing import List, Optional, Set, Tuple
from pathlib import Path
from dataclasses import dataclass
import spacy
from pyJoules.energy_meter import measure_energy
from sklearn.metrics import classification_report
from spacy.tokens import Doc as SpacyDoc

@dataclass
class Token:
    """One corpus token together with its part-of-speech label."""

    form: str  # surface form of the token
    tag: str  # part-of-speech label (gold or predicted, depending on the corpus)
    is_oov: bool  # out-of-vocabulary flag, as computed by read_conll

@dataclass
class Sentence:
    """An identified sentence made of an ordered list of tokens."""

    sent_id: str  # sentence identifier; compute_accuracy filters subcorpora on it
    tokens: List[Token]  # tokens of the sentence, in reading order

@dataclass
class Corpus:
    """A collection of sentences, kept in the order they were read or tagged."""

    sentences: List[Sentence]  # sentences of the corpus

def read_conll(path: Path, vocabulaire: Optional[Set[str]] = None) -> Corpus:
    """Read a CoNLL-U file and return its sentences as a Corpus.

    Only the word form (column 2) and the universal POS tag (column 4) of
    each token line are kept.

    Args:
        path: Location of the tab-separated CoNLL-U file.
        vocabulaire: Set of known word forms. A token is flagged
            out-of-vocabulary when its form is not in this set. When the set
            is ``None`` or empty, every token is flagged out-of-vocabulary.

    Returns:
        A Corpus holding one Sentence per non-empty sentence block.

    Raises:
        IndexError: If a kept token line has fewer than four columns.
    """
    sentences: List[Sentence] = []
    tokens: List[Token] = []
    sid = ""
    # CoNLL-U files are UTF-8; do not rely on the platform's default encoding.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line.startswith("#"):
                if line.startswith("# sent_id ="):
                    sid = line.split(" ")[-1]
                continue
            if line == "":
                # A blank line closes the current sentence; skip empty blocks.
                if tokens:
                    sentences.append(Sentence(sent_id=sid, tokens=tokens))
                    tokens = []
                continue
            fields = line.split("\t")
            token_id = fields[0]
            # Skip multiword-token ranges ("3-4", e.g. the contraction "du")
            # and empty nodes ("8.1"): neither is a syntactic word to tag.
            if "-" in token_id or "." in token_id:
                continue
            form, tag = fields[1], fields[3]
            is_oov = form not in vocabulaire if vocabulaire else True
            tokens.append(Token(form, tag, is_oov))
    # Flush the last sentence when the file does not end with a blank line.
    if tokens:
        sentences.append(Sentence(sent_id=sid, tokens=tokens))
    return Corpus(sentences)

# Build vocabulary from the corpus
def build_vocabulaire(corpus: Corpus) -> Set[str]:
    """Return the set of distinct word forms found in ``corpus``."""
    vocabulary: Set[str] = set()
    for sentence in corpus.sentences:
        vocabulary.update(token.form for token in sentence.tokens)
    return vocabulary

# Convert a sentence to a spaCy Doc
def sentence_to_doc(sentence: Sentence, vocab) -> SpacyDoc:
    """Build a spaCy Doc from the already-tokenized forms of ``sentence``.

    ``vocab`` is the vocabulary object passed as first argument to the Doc
    constructor (the caller supplies the loaded model's ``vocab``).
    """
    forms = []
    for token in sentence.tokens:
        forms.append(token.form)
    return SpacyDoc(vocab, words=forms)

# Convert a spaCy Doc back to a sentence
def doc_to_sentence(doc: SpacyDoc, origin: Sentence) -> Sentence:
    """Turn a tagged spaCy Doc into a Sentence carrying the predicted tags.

    The sentence id and each token's out-of-vocabulary flag are copied from
    ``origin``. Tokens are paired positionally, and pairing stops at the
    shorter of the two sequences.
    """
    rebuilt = []
    for spacy_token, source_token in zip(doc, origin.tokens):
        # Prefer the coarse-grained tag; fall back to the fine-grained one
        # when the coarse tag is empty.
        predicted = spacy_token.pos_ or spacy_token.tag_
        rebuilt.append(Token(spacy_token.text, predicted, source_token.is_oov))
    return Sentence(origin.sent_id, rebuilt)

# Run the spaCy POS tagging model on every sentence of the corpus.
# NOTE: the pyJoules energy measurement is disabled because the decorator
# raises an error on MacOS.
#@measure_energy
def tag_corpus_spacy(corpus: Corpus, model_spacy) -> Corpus:
    """Tag ``corpus`` with ``model_spacy`` and return the predictions.

    Each sentence is converted to a pre-tokenized Doc, run through the model,
    and converted back. The input corpus is left untouched.
    """
    tagged = [
        doc_to_sentence(
            model_spacy(sentence_to_doc(gold_sentence, model_spacy.vocab)),
            gold_sentence,
        )
        for gold_sentence in corpus.sentences
    ]
    return Corpus(tagged)

# Compute accuracy in token level, including OOV words
def compute_accuracy(corpus_gold: Corpus, corpus_test: Corpus, subcorpus: Optional[str] = None) -> Tuple[float, float]:
    """Return ``(accuracy, oov_accuracy)`` of ``corpus_test`` against ``corpus_gold``.

    Sentences and tokens are compared positionally. When ``subcorpus`` is
    given, only sentences whose gold ``sent_id`` contains that string are
    scored. A ratio with an empty denominator is reported as ``0.0``.
    """
    total = 0
    correct = 0
    total_oov = 0
    correct_oov = 0

    for gold_sent, test_sent in zip(corpus_gold.sentences, corpus_test.sentences):
        # Skip sentences that fall outside the requested subcorpus.
        if subcorpus is not None and subcorpus not in gold_sent.sent_id:
            continue
        for gold_tok, test_tok in zip(gold_sent.tokens, test_sent.tokens):
            assert gold_tok.form == test_tok.form  # both corpora must be aligned
            same_tag = gold_tok.tag == test_tok.tag
            total += 1
            if same_tag:
                correct += 1
            if gold_tok.is_oov:
                total_oov += 1
                if same_tag:
                    correct_oov += 1

    if total > 0:
        accuracy = correct / total
    else:
        accuracy = 0.0
    if total_oov > 0:
        oov_accuracy = correct_oov / total_oov
    else:
        oov_accuracy = 0.0

    return accuracy, oov_accuracy

# Print classification report
def print_report(corpus_gold: Corpus, corpus_test: Corpus):
    """Print scikit-learn's per-tag classification report for the two corpora."""
    gold_tags = []
    for sentence in corpus_gold.sentences:
        gold_tags.extend(token.tag for token in sentence.tokens)

    predicted_tags = []
    for sentence in corpus_test.sentences:
        predicted_tags.extend(token.tag for token in sentence.tokens)

    report = classification_report(gold_tags, predicted_tags)
    print(report)

def run_evaluation(
    train_path: str,
    test_path: str,
    model_names: List[str],
    subcorpora: Tuple[str, ...] = ("annodis", "frwiki", "emea", "Europar"),
) -> None:
    """Evaluate several spaCy models on a test corpus and print the scores.

    Args:
        train_path: CoNLL-U training file, used only to build the vocabulary
            that decides which test tokens are out-of-vocabulary.
        test_path: CoNLL-U test file holding the gold tags.
        model_names: Names of the spaCy pipelines to load and evaluate.
        subcorpora: Substrings of ``sent_id`` for which a separate accuracy
            line is printed.
    """
    corpus_train = read_conll(Path(train_path))
    vocab_train = build_vocabulaire(corpus_train)
    # The gold corpus does not depend on the model, so parse it once instead
    # of re-reading the file for every model.
    corpus_gold = read_conll(Path(test_path), vocabulaire=vocab_train)

    for model_name in model_names:
        print(f"Running model: {model_name}")
        model_spacy = spacy.load(model_name)

        corpus_test = tag_corpus_spacy(corpus_gold, model_spacy)

        print("Tagging completed.")
        for subcorpus in subcorpora:
            print(subcorpus)
            accuracy, oov_accuracy = compute_accuracy(corpus_gold, corpus_test, subcorpus)
            print(f"Accuracy: {accuracy:.4f}, OOV Accuracy: {oov_accuracy:.4f}")

        print("Classification Report:")
        print_report(corpus_gold, corpus_test)

def evaluate_models():
    """Run the evaluation of the three French spaCy models on the Sequoia files."""
    run_evaluation(
        train_path="fr_sequoia-ud-train.conllu",
        test_path="fr_sequoia-ud-test.conllu",
        model_names=["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg"],
    )

# Entry point: launch the full evaluation only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    evaluate_models()


Running model: fr_core_news_sm
Tagging completed.
annodis
Accuracy: 0.9613, OOV Accuracy: 0.8586
frwiki
Accuracy: 0.9577, OOV Accuracy: 0.7281
emea
Accuracy: 0.9770, OOV Accuracy: 0.7870
Europar
Accuracy: 0.9591, OOV Accuracy: 0.7692
Classification Report:
              precision    recall  f1-score   support

         ADJ       0.90      0.90      0.90       638
         ADP       0.99      1.00      0.99      1634
         ADV       0.98      0.96      0.97       411
         AUX       0.98      0.99      0.98       345
       CCONJ       0.99      1.00      0.99       221
         DET       0.99      0.98      0.99      1492
        NOUN       0.94      0.97      0.95      2161
         NUM       0.97      0.97      0.97       243
        PRON       0.97      0.95      0.96       410
       PROPN       0.90      0.91      0.91       478
       PUNCT       1.00      1.00      1.00      1084
       SCONJ       0.93      0.94      0.94       106
         SYM       1.00      0.50      0