In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import json
import os
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [20]:
notebook_path = os.path.abspath("02-stanza.ipynb")

In [21]:
# Configuration
MODEL_NAME = "Stanza"
SAMPLE_TYPES = ["medieval_charters", "glosses"]
TASKS = ["lemmatization", "pos_tagging"]


In [8]:
import stanza
nlp = stanza.Pipeline('la', processors='tokenize,mwt,pos,lemma')

  from .autonotebook import tqdm as notebook_tqdm
2025-04-28 15:14:16 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 426kB [00:00, 211MB/s]                     
2025-04-28 15:14:17 INFO: Downloaded file to /Users/Thea/stanza_resources/resources.json
2025-04-28 15:14:17 INFO: Loading these models for language: la (Latin):
| Processor | Package       |
-----------------------------
| tokenize  | ittb          |
| mwt       | ittb          |
| pos       | ittb_nocharlm |
| lemma     | ittb_nocharlm |

2025-04-28 15:14:17 INFO: Using device: cpu
2025-04-28 15:14:17 INFO: Loading: tokenize
2025-04-28 15:14:19 INFO: Loading: mwt
2025-04-28 15:14:19 INFO: Loading: pos
2025-04-28 15:14:20 INFO: Loading: lemma
2025-04-28 15:14:20 INFO: Done loading 

In [22]:
# Results storage
results = {
    "model_name": MODEL_NAME,
    "processing_times": {},
    "accuracy": {},
    "precision": {},
    "recall": {},
    "f1_score": {}
}

In [34]:

sample_type = "medieval_charters"
sample_path = os.path.join(os.path.dirname(notebook_path), f"../data/{sample_type}.csv")
gs_path = os.path.join(os.path.dirname(notebook_path), f"../data/gold_standard/gs_{sample_type}.csv")

df = pd.read_csv(sample_path)
gold_df = pd.read_csv(gs_path)


In [36]:
# unpack sample texts
sample_texts = []
for sample_id in df['SampleID'].unique():
    sample_texts.append((sample_id, text))


In [42]:
start_time = time.time()

processed_results = []
for sample_id, text in sample_texts:
    doc = nlp(text)
    for sent in doc.sentences:
            for word in sent.words:
                processed_results.append({
                    "SampleID": sample_id,
                    "WordID": sample_id+"_"+str(word.id),
                    "Word": word.text,
                    "Lemma": word.lemma,
                    "POS": word.pos,
                })

processing_time = time.time() - start_time
results["processing_times"][sample_type] = processing_time

In [43]:
print(processed_results)

[{'SampleID': 'ch_1', 'WordID': 'ch_1_1', 'Word': 'In', 'Lemma': 'in', 'POS': 'ADP'}, {'SampleID': 'ch_1', 'WordID': 'ch_1_2', 'Word': 'Dei', 'Lemma': 'deus', 'POS': 'NOUN'}, {'SampleID': 'ch_1', 'WordID': 'ch_1_3', 'Word': 'omnipotentis', 'Lemma': 'omnipotens', 'POS': 'ADJ'}, {'SampleID': 'ch_1', 'WordID': 'ch_1_4', 'Word': 'nomine', 'Lemma': 'nomen', 'POS': 'NOUN'}, {'SampleID': 'ch_1', 'WordID': 'ch_1_5', 'Word': 'regnante', 'Lemma': 'regno', 'POS': 'VERB'}, {'SampleID': 'ch_1', 'WordID': 'ch_1_6', 'Word': 'domno', 'Lemma': 'domnum', 'POS': 'NOUN'}, {'SampleID': 'ch_1', 'WordID': 'ch_1_7', 'Word': 'nostro', 'Lemma': 'noster', 'POS': 'DET'}, {'SampleID': 'ch_1', 'WordID': 'ch_1_8', 'Word': 'Karolus', 'Lemma': 'Karolus', 'POS': 'ADJ'}, {'SampleID': 'ch_1', 'WordID': 'ch_1_9', 'Word': 'divina', 'Lemma': 'divinus', 'POS': 'ADJ'}, {'SampleID': 'ch_1', 'WordID': 'ch_1_10', 'Word': 'faventem', 'Lemma': 'favio', 'POS': 'VERB'}, {'SampleID': 'ch_1', 'WordID': 'ch_1_11', 'Word': 'clementia', 

In [38]:
pred_df = pd.DataFrame(processed_results)

In [None]:
merged_df = pd.concat(gold_df, pred_df, on=['SampleID', 'WordID', 'Word'], suffixes=('_gold', '_pred'))

TypeError: concat() takes 1 positional argument but 2 were given