# CLTK Evaluation on Glosses
This notebook tests LatinCy for lemmatization and POS tagging on ~600 Latin glosses. Results are compared to the dataset's original tags.

Created by Thea Schaaf, March 2025

In [None]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
#import seaborn as sns
import time
import json
import os
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

Set os path environment for opening files
Configuration of sample names and tasks, results storage

In [7]:
notebook_path = os.path.abspath("02-stanza.ipynb")

In [34]:
# Configuration
MODEL_NAME = "Stanza"
SAMPLE_TYPES = ["medieval_charters", "glosses"]
TASKS = ["lemmatization", "pos_tagging"]


In [35]:
# Results storage
results = {
    "model_name": MODEL_NAME,
    "processing_times": {},
    "accuracy": {},
    "precision": {},
    "recall": {},
    "f1_score": {}
}

Import code

In [None]:
from cltk import NLP
from cltk.languages.utils import get_lang
from cltk.core.data_types import Doc, Word
from cltk.dependency.processes import GreekStanzaProcess, LatinStanzaProcess

# Initialize the Latin NLP pipeline
latin_nlp = NLP(language="lat", suppress_banner=True)

  from .autonotebook import tqdm as notebook_tqdm
2025-05-05 19:37:47 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 426kB [00:00, 177MB/s]                     
2025-05-05 19:37:50 INFO: Downloaded file to /Users/Thea/stanza_resources/resources.json
2025-05-05 19:37:50 INFO: Loading these models for language: la (Latin):
| Processor | Package       |
-----------------------------
| tokenize  | ittb          |
| mwt       | ittb          |
| pos       | ittb_nocharlm |
| lemma     | ittb_nocharlm |

2025-05-05 19:37:50 INFO: Using device: cpu
2025-05-05 19:37:50 INFO: Loading: tokenize
2025-05-05 19:37:53 INFO: Loading: mwt
2025-05-05 19:37:53 INFO: Loading: pos
2025-05-05 19:37:58 INFO: Loading: lemma
2025-05-05 19:37:59 INFO: Done loading 

Process each set of samples by breaking down list into words and process with model

In [None]:
for sample_type in SAMPLE_TYPES:
    print(f"Processing {sample_type}...")

    # Load gold standard data
    gold_file = os.path.join(os.path.dirname(notebook_path), f"../data/gold_standard/gs_{sample_type}.csv")

    gold_df = pd.read_csv(gold_file)


    # Extract text (assuming we need to reconstruct from words)
    # This will depend on your actual CSV structure
    sample_texts = []
    for sample_id in gold_df['sample_id'].unique():
        sample_df = fold_df[gold_df['sample_id'] == sample_id].sort_values('word_id')
        words = sample_df['word'].tolist()
        text = ' '.join(words)
        sample_texts.append((sample_id, text, sample_df))

    # Process samples and measure time
    start_time = time.time()

    processed_results = []
    for sample_id, text, sample_df in sample_texts:

        try:
            tokens = process_with_cltk(text)

            gold_words = sample_df['word'].tolist()
            for i, (gold_word, token) in enumerate(zip(gold_words, tokens)):
                if i < len(sample_df):
                    word_id = sample_df.iloc[i]['word_id']

                    processed_results.append({
                        "sample_id": sample_id,
                        "word_id": word_id,
                        "word": gold_word,  # Use original word to ensure matching
                        "lemma_pred": token["lemma"],
                        "pos_pred": token["pos"],
                        "additional_info": {
                            "upos": token["upos"],
                            "features": token["features"]
                        })

        except Exception as e:
            print(f"Error processing sample {sample_id}: {e}")
            # Add empty predictions for failed samples
            for i, row in sample_df.iterrows():
                processed_results.append({
                    "sample_id": row['sample_id'],
                    "word_id": row['word_id'],
                    "word": row['word'],
                    "lemma_pred": "ERROR",
                    "pos_pred": "ERROR",
                    "additional_info": {"error": str(e)}
                })


    processing_time = time.time() - start_time
    results["processing_times"][sample_type] = processing_time


Processing medieval_charters...
Processing glosses...


In [44]:
print(processed_results)

[{'sample_id': 'BCr.27a40', 'word_id': 'BCr.27a40_1', 'word': '.', 'lemma': '.', 'pos': 'PUNCT'}, {'sample_id': 'BCr.27a40', 'word_id': 'BCr.27a40_2', 'word': 'i', 'lemma': 'i', 'pos': 'NUM'}, {'sample_id': 'BCr.27a40', 'word_id': 'BCr.27a40_3', 'word': '.', 'lemma': '.', 'pos': 'PUNCT'}, {'sample_id': 'BCr.27a40', 'word_id': 'BCr.27a40_1', 'word': 'sidera', 'lemma': 'sidus', 'pos': 'NOUN'}, {'sample_id': 'BCr.27a40', 'word_id': 'BCr.27a40_2', 'word': 'uel', 'lemma': 'uel', 'pos': 'CCONJ'}, {'sample_id': 'BCr.27a40', 'word_id': 'BCr.27a40_3', 'word': 'spatia', 'lemma': 'spatium', 'pos': 'NOUN'}, {'sample_id': 'BVi.01a06', 'word_id': 'BVi.01a06_1', 'word': 'sideera', 'lemma': 'sideo', 'pos': 'VERB'}, {'sample_id': 'BVi.01a06', 'word_id': 'BVi.01a06_2', 'word': 'uel', 'lemma': 'uel', 'pos': 'CCONJ'}, {'sample_id': 'BVi.01a06', 'word_id': 'BVi.01a06_3', 'word': 'spatia', 'lemma': 'spatium', 'pos': 'NOUN'}, {'sample_id': 'BVi.01a07', 'word_id': 'BVi.01a07_1', 'word': 'eo', 'lemma': 'is', '

Turn results into dataframe, combine with gold standard

In [None]:
# Convert to DataFrame for easier comparison
    pred_df = pd.DataFrame(all_results)

    # Save raw predictions for inspection
    pred_df.to_csv(f"results/CLTK_{sample_type}_predictions.csv", index=False)

    # Merge with gold standard for evaluation
    # This combines based on sample_id and word_id to ensure correct alignment
    eval_df = pd.merge(
        gold_df,
        pred_df,
        on=['sample_id', 'word_id', 'word'],
        how='left'
    )

Evalutate lemmatization

In [62]:
 # Evaluate lemmatization
lemma_accuracy = accuracy_score(merged_df['lemma_gold'], merged_df['lemma_pred'])
lemma_precision, lemma_recall, lemma_f1, _ = precision_recall_fscore_support(
    merged_df['lemma_gold'] == merged_df['lemma_pred'],
    [True] * len(merged_df),
    average='binary'
)

TypeError: '<' not supported between instances of 'str' and 'float'

Evaluate POS

In [48]:
# Evaluate POS tagging
pos_accuracy = accuracy_score(merged_df['pos_gold'], merged_df['pos_pred'])
pos_precision, pos_recall, pos_f1, _ = precision_recall_fscore_support(
    merged_df['pos_gold'] == merged_df['pos_pred'],
    [True] * len(merged_df),
    average='binary'
)

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Process recall, precision

In [None]:
results["accuracy"][f"{sample_type}_lemma"] = lemma_accuracy
results["precision"][f"{sample_type}_lemma"] = lemma_precision
results["recall"][f"{sample_type}_lemma"] = lemma_recall
results["f1_score"][f"{sample_type}_lemma"] = lemma_f1

results["accuracy"][f"{sample_type}_pos"] = pos_accuracy
results["precision"][f"{sample_type}_pos"] = pos_precision
results["recall"][f"{sample_type}_pos"] = pos_recall
results["f1_score"][f"{sample_type}_pos"] = pos_f1

In [None]:
merged_df.to_csv(f"../results/{MODEL_NAME}_{sample_type}_detailed.csv", index=False)

print(f"Completed {sample_type}. Processing time: {processing_time:.2f}s")
print(f"Lemmatization accuracy: {lemma_accuracy:.4f}")
print(f"POS tagging accuracy: {pos_accuracy:.4f}")
print("-" * 50)


Completed glosses. Processing time: 4.66s
Lemmatization accuracy: nan
POS tagging accuracy: nan
--------------------------------------------------


Save results

In [52]:
# Save summary results
with open(f"../results/{MODEL_NAME}_summary.json", "w") as f:
    json.dump(results, f, indent=2)
