In [None]:
from __future__ import annotations
from typing import Dict, Iterable, Tuple, Optional
from conllu import parse_incr, TokenList
import pandas as pd
import re
from pathlib import Path

def _rebuild_text_and_offsets(tokens: Iterable[dict]) -> Tuple[str, list[Tuple[int, int]]]:
    """
    Rebuilds the original sentence text from FORM fields, respecting the 'SpaceAfter=No' rule.
    At the same time, calculates the (start, end) character offsets for each token in the reconstructed string.

    Parameters:
        tokens (Iterable[dict]): List of token dictionaries from a UD-parsed sentence.

    Returns:
        Tuple[str, list[Tuple[int, int]]]: 
            - The reconstructed sentence text.
            - A list of (start, end) character offsets for each token.
    """
    parts = []   # will store all text fragments
    spans = []   # will store the (start, end) offsets for each token
    cursor = 0   # keeps track of the current position in the reconstructed text

    for tok in tokens:
        form = tok["form"]
        misc = tok.get("misc") or {}

        # Check if we need to insert a space before this token
        # Only insert if there is a previous token and no 'SpaceAfter=No' from the previous token
        if parts and parts[-1] != "" and spans:
            prev_no_space = tok.get("_prev_no_space", False)
        else:
            prev_no_space = False

        if parts and not prev_no_space:
            parts.append(" ")
            cursor += 1

        # Store token start and end positions
        start = cursor
        parts.append(form)
        cursor += len(form)
        end = cursor
        spans.append((start, end))

        # Mark whether this token should NOT have space after it
        tok["_this_no_space"] = (misc.get("SpaceAfter") == "No")

    # Second pass: propagate '_this_no_space' info to '_prev_no_space' of the next token
    for i in range(1, len(spans)):
        tokens[i]["_prev_no_space"] = tokens[i-1].get("_this_no_space", False)

    text = "".join(parts)
    return text, spans


def _align_forms_to_text(text: str, forms: Iterable[str]) -> list[Tuple[int, int]]:
    """Greedily locate each form in `text`, left to right, and return (start, end) spans."""
    spans = []
    cursor = 0
    for form in forms:
        pos = text.find(form, cursor)
        if pos < 0:
            # Form not present at or after the cursor: place the span at the cursor
            # so every token still gets an offset pair.
            pos = cursor
        end = pos + len(form)
        spans.append((pos, end))
        cursor = end
    return spans


def _feats_as_dict(feats) -> Dict[str, str]:
    """Return FEATS as a dict, parsing the raw 'Key=Val|Key=Val' string form if needed."""
    feats = feats or {}
    if isinstance(feats, str):
        return dict(kv.split("=", 1) for kv in feats.split("|") if "=" in kv)
    return feats


def read_ud_conllu(path: str | Path) -> pd.DataFrame:
    """
    Reads a Universal Dependencies (UD) *.conllu file and returns a tidy DataFrame with gold annotations.
    Ignores:
      - Multiword tokens (IDs like '3-4')
      - Empty nodes (IDs like '2.1')

    Character offsets refer to the sentence's `# text` metadata when it is present
    (tokens are located by a greedy left-to-right search); otherwise the text is
    rebuilt from the token forms via `_rebuild_text_and_offsets`.

    Parameters:
        path (str | Path): Path to the .conllu file.

    Returns:
        pd.DataFrame: One row per token with gold annotations, character offsets and
        empty (None) slots for spaCy/Stanza predictions. A file without tokens yields
        an empty DataFrame that still has all columns.
    """
    # Fixing the column list keeps the schema stable even when no rows were read.
    columns = [
        "sent_id", "text", "token_id", "form", "lemma",
        "gold_upos", "gold_xpos", "gold_feats", "gold_tense",
        "head", "deprel", "space_after", "char_start", "char_end",
        "spacy_upos", "spacy_feats", "spacy_tense",
        "stanza_upos", "stanza_feats", "stanza_tense",
    ]
    rows = []

    with open(path, encoding="utf-8") as f:
        for sent in parse_incr(f):
            meta = sent.metadata or {}
            sent_id = meta.get("sent_id") or meta.get("sentid") or ""

            # Keep only real tokens (integer IDs)
            ud_tokens = [t for t in sent if isinstance(t["id"], int)]

            text = meta.get("text")
            if text:
                spans = _align_forms_to_text(text, (t["form"] for t in ud_tokens))
            else:
                # No usable text metadata: reconstruct the text from the forms.
                text, spans = _rebuild_text_and_offsets(ud_tokens)

            for t, (start, end) in zip(ud_tokens, spans):
                feats = _feats_as_dict(t.get("feats"))

                rows.append({
                    "sent_id": sent_id,
                    "text": text,
                    "token_id": t["id"],
                    "form": t["form"],
                    "lemma": t.get("lemma"),
                    "gold_upos": t.get("upos"),
                    "gold_xpos": t.get("xpos"),
                    "gold_feats": feats,
                    "gold_tense": feats.get("Tense"),
                    "head": t.get("head"),
                    "deprel": t.get("deprel"),
                    "space_after": not ((t.get("misc") or {}).get("SpaceAfter") == "No"),
                    "char_start": start,
                    "char_end": end,
                    # Slots for future parser predictions
                    "spacy_upos": None,
                    "spacy_feats": None,
                    "spacy_tense": None,
                    "stanza_upos": None,
                    "stanza_feats": None,
                    "stanza_tense": None,
                })

    df = pd.DataFrame(rows, columns=columns)

    # Convert certain columns to category for efficiency
    for c in ["gold_upos", "gold_tense"]:
        df[c] = df[c].astype("category")

    return df

In [2]:
# Load the UD dev file into a token-level DataFrame and preview the first rows.
df = read_ud_conllu("data/uk_parlamint-ud-dev.conllu")
df.head()

Unnamed: 0,sent_id,text,token_id,form,lemma,gold_upos,gold_xpos,gold_feats,gold_tense,head,deprel,space_after,char_start,char_end,spacy_upos,spacy_feats,spacy_tense,stanza_upos,stanza_feats,stanza_tense
0,ParlaMint-UA_2022-01-25-m0.u90.p4.lang1.s2,"Я коли вам кажу завжди про економіку, я дивлюс...",1,Я,я,PRON,PRON,"{'Animacy': 'Anim', 'Case': 'Nom', 'Number': '...",,10,nsubj:outer,True,0,1,,,,,,
1,ParlaMint-UA_2022-01-25-m0.u90.p4.lang1.s2,"Я коли вам кажу завжди про економіку, я дивлюс...",2,коли,коли,SCONJ,ADV,{'PronType': 'Rel'},,4,mark,True,2,6,,,,,,
2,ParlaMint-UA_2022-01-25-m0.u90.p4.lang1.s2,"Я коли вам кажу завжди про економіку, я дивлюс...",3,вам,ви,PRON,PRON,"{'Animacy': 'Anim', 'Case': 'Dat', 'Number': '...",,4,iobj,True,7,10,,,,,,
3,ParlaMint-UA_2022-01-25-m0.u90.p4.lang1.s2,"Я коли вам кажу завжди про економіку, я дивлюс...",4,кажу,казати,VERB,VERB,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Si...",Pres,10,advcl,True,11,15,,,,,,
4,ParlaMint-UA_2022-01-25-m0.u90.p4.lang1.s2,"Я коли вам кажу завжди про економіку, я дивлюс...",5,завжди,завжди,ADV,ADV,{'PronType': 'Tot'},,4,advmod,True,16,22,,,,,,


In [None]:
# --- Metrics (macro-F1) ---
import spacy
import pandas as pd
from collections import Counter, defaultdict
from typing import List, Dict, Tuple, Optional, Iterable
from sklearn.metrics import f1_score, confusion_matrix, classification_report


def macro_f1(df: pd.DataFrame, y_col_true: str, y_col_pred: str,
             labels: Optional[Iterable[str]] = None) -> float:
    """
    Macro-averaged F1 between two label columns of a DataFrame.

    Both columns are compared as strings, so missing values take part in the
    comparison under their string representation.

    Parameters:
        df (pd.DataFrame): Frame holding the gold and predicted labels.
        y_col_true (str): Name of the gold-label column.
        y_col_pred (str): Name of the predicted-label column.
        labels (Optional[Iterable[str]]): Labels to average over. When None, the
            sorted union of the values found in both columns is used.

    Returns:
        float: Macro-averaged F1 score (0–1 range); classes with an undefined F1 count as 0.
    """
    gold = df[y_col_true].astype(str)
    predicted = df[y_col_pred].astype(str)

    label_set = labels
    if label_set is None:
        # Default: every label seen on either side, in a deterministic order.
        label_set = sorted(set(gold.unique()).union(predicted.unique()))

    return f1_score(gold, predicted, labels=label_set, average="macro", zero_division=0)


def macro_f1_tense(df: pd.DataFrame, pred_col: str, restrict_to_verbs: bool = True) -> float:
    """
    Calculate macro-F1 for tense prediction.
    Can optionally restrict evaluation to only verbs (gold_upos == 'VERB').

    Missing tense values (on either side) are scored as the explicit label 'None',
    so "no tense predicted" vs. "no gold tense" is a class like any other.

    Parameters:
        df (pd.DataFrame): DataFrame with `gold_upos`, `gold_tense` and the prediction column.
        pred_col (str): Column containing predicted tenses.
        restrict_to_verbs (bool): If True, only evaluate on verbs.

    Returns:
        float: Macro-averaged F1 score for tense prediction.
    """
    data = df
    if restrict_to_verbs:
        data = data[data["gold_upos"] == "VERB"]

    def norm(series: pd.Series) -> pd.Series:
        # Normalize to strings, replacing missing values with 'None'.
        # Going through object dtype handles categorical and plain columns alike,
        # without the deprecated is_categorical_dtype check and without failing
        # when 'None' is already one of the categories.
        return series.astype(object).where(series.notna(), "None").astype(str)

    y_true = norm(data["gold_tense"])
    y_pred = norm(data[pred_col])

    labels = sorted(set(y_true.unique()) | set(y_pred.unique()))
    return f1_score(y_true, y_pred, labels=labels, average="macro", zero_division=0)


# --- Character-level alignment helpers ---
def span_overlap(a: Tuple[int, int], b: Tuple[int, int]) -> int:
    """
    Length of the intersection of two character spans.

    Parameters:
        a, b (Tuple[int, int]): (start, end) character indices.

    Returns:
        int: Number of characters shared by both spans; 0 when they are disjoint
        or merely touch.
    """
    # The intersection starts at the later start and stops at the earlier end.
    left = a[0] if a[0] > b[0] else b[0]
    right = a[1] if a[1] < b[1] else b[1]
    width = right - left
    return width if width > 0 else 0


def align_by_spans(ud_spans: List[Tuple[int, int]], pred_spans: List[Tuple[int, int]]) -> Dict[int, List[int]]:
    """
    Pair every UD token with the predicted tokens whose character spans intersect it.

    A single shared character is enough for a pair; UD tokens without any
    intersecting predicted token get no entry in the result.

    Parameters:
        ud_spans (List[Tuple[int, int]]): Character spans for UD tokens.
        pred_spans (List[Tuple[int, int]]): Character spans for predicted tokens.

    Returns:
        Dict[int, List[int]]: UD token index -> indices of overlapping predicted
        tokens, in the order they appear in `pred_spans`.
    """
    pairs = defaultdict(list)
    for ud_idx, ud_span in enumerate(ud_spans):
        hits = [
            pred_idx
            for pred_idx, pred_span in enumerate(pred_spans)
            if span_overlap(ud_span, pred_span) > 0
        ]
        # Only create a key when something matched, so unmatched tokens stay absent.
        if hits:
            pairs[ud_idx].extend(hits)
    return pairs


# --- Run spaCy model and attach predictions ---
def run_spacy_and_attach(df: pd.DataFrame, model_name: str = "uk_core_news_md") -> pd.DataFrame:
    """
    Annotate each sentence of a UD DataFrame with spaCy and store UPOS/Tense per UD token.

    For every `sent_id` group the sentence text is parsed once, spaCy tokens are
    matched to UD tokens through overlapping character spans, and when a UD token
    matches several spaCy tokens the most frequent label is kept.

    Parameters:
        df (pd.DataFrame): UD DataFrame containing columns: sent_id, text, char_start, char_end.
        model_name (str): Name of the spaCy model to load.

    Returns:
        pd.DataFrame: Copy of `df` whose `spacy_upos` and `spacy_tense` columns are
        filled and converted to categorical dtype. UD tokens with no overlapping
        spaCy token get None in both columns.
    """
    pipeline = spacy.load(model_name, disable=[])  # nothing disabled: full pipeline
    out = df.copy()

    for _, sentence_rows in out.groupby("sent_id", sort=False):
        # All rows of a sentence carry the same text; parse it once.
        parsed = pipeline(sentence_rows["text"].iloc[0])

        pred_spans, pred_upos, pred_tense = [], [], []
        for tok in parsed:
            pred_spans.append((tok.idx, tok.idx + len(tok.text)))
            pred_upos.append(tok.pos_)
            tense_values = tok.morph.get("Tense")
            pred_tense.append(tense_values[0] if tense_values else None)

        ud_spans = list(zip(sentence_rows["char_start"].tolist(),
                            sentence_rows["char_end"].tolist()))
        mapping = align_by_spans(ud_spans, pred_spans)

        agg_upos, agg_tense = [], []
        for ud_idx in range(len(ud_spans)):
            hits = mapping.get(ud_idx, [])

            # Vote among the aligned spaCy tokens; missing/empty labels do not vote.
            upos_votes = Counter(pred_upos[j] for j in hits if pred_upos[j] is not None)
            tense_votes = Counter(pred_tense[j] for j in hits if pred_tense[j] not in (None, ""))

            agg_upos.append(upos_votes.most_common(1)[0][0] if upos_votes else None)
            agg_tense.append(tense_votes.most_common(1)[0][0] if tense_votes else None)

        # Write predictions back to the rows of this sentence
        out.loc[sentence_rows.index, "spacy_upos"] = agg_upos
        out.loc[sentence_rows.index, "spacy_tense"] = agg_tense

    # Convert to categorical for efficiency
    out["spacy_upos"] = out["spacy_upos"].astype("category")
    out["spacy_tense"] = out["spacy_tense"].astype("category")
    return out

from collections import Counter

def run_stanza_and_attach(df: pd.DataFrame, nlp=None) -> pd.DataFrame:
    """
    Run a Stanza pipeline on the sentence texts in a UD DataFrame and attach UPOS and Tense predictions.

    The function:
      - Parses the text of each `sent_id` group with `nlp`
      - Reduces every Stanza token to one UPOS and one Tense by majority vote over its words
      - Aligns Stanza tokens to UD tokens using character spans
      - Aggregates predictions for UD tokens that align with multiple Stanza tokens

    Parameters:
        df (pd.DataFrame): UD DataFrame containing columns: sent_id, text, char_start, char_end.
        nlp: Callable pipeline applied to each sentence text. If None, the "uk"
            resources are downloaded and a Pipeline with the processors
            "tokenize,mwt,pos,lemma" is created.

    Returns:
        pd.DataFrame: Copy of the original DataFrame with `stanza_upos` and `stanza_tense`
        columns filled and converted to categorical dtype.
    """
    import stanza
    if nlp is None:
        # No pipeline supplied: fetch the Ukrainian resources and build one.
        stanza.download("uk")
        nlp = stanza.Pipeline("uk", processors="tokenize,mwt,pos,lemma")

    out = df.copy()

    # Process each sentence separately
    for sent_id, seg in out.groupby("sent_id", sort=False):
        text = seg["text"].iloc[0]
        doc = nlp(text)

        pred_spans, pred_upos, pred_tense = [], [], []

        # The pipeline may split the text into several sentences; collect tokens from all of them.
        for sent in doc.sentences:
            for tok in sent.tokens:
                start, end = tok.start_char, tok.end_char
                # Tokens without character offsets cannot be aligned, so skip them.
                if start is None or end is None:
                    continue

                # A token can hold several words; gather each word's UPOS and parsed feats.
                upos_list = [w.upos for w in tok.words if w.upos]
                feats_dicts = []
                for w in tok.words:
                    if w.feats:
                        feats = dict(kv.split("=", 1) for kv in w.feats.split("|") if "=" in kv)
                        feats_dicts.append(feats)

                # Majority vote across the token's words (None when nothing is available).
                upos = Counter(upos_list).most_common(1)[0][0] if upos_list else None
                tenses = [fd.get("Tense") for fd in feats_dicts if fd.get("Tense")]
                tense = Counter(tenses).most_common(1)[0][0] if tenses else None

                pred_spans.append((start, end))
                pred_upos.append(upos)
                pred_tense.append(tense)

        # UD token spans (in order)
        ud_spans = list(zip(seg["char_start"].tolist(), seg["char_end"].tolist()))

        mapping = align_by_spans(ud_spans, pred_spans)

        # Aggregate predictions for each UD token
        agg_upos, agg_tense = [], []
        for i_ud in range(len(ud_spans)):
            js = mapping.get(i_ud, [])
            if not js:
                # No Stanza token overlaps this UD token.
                agg_upos.append(None)
                agg_tense.append(None)
                continue
            # Majority vote over the aligned Stanza tokens, ignoring missing labels.
            maj_upos = Counter(pred_upos[j] for j in js if pred_upos[j] is not None).most_common(1)
            pick_upos = maj_upos[0][0] if maj_upos else None
            tens = [pred_tense[j] for j in js if pred_tense[j] not in (None, "")]
            pick_tense = Counter(tens).most_common(1)[0][0] if tens else None

            agg_upos.append(pick_upos)
            agg_tense.append(pick_tense)

        # Write predictions back to the rows of this sentence
        out.loc[seg.index, "stanza_upos"] = agg_upos
        out.loc[seg.index, "stanza_tense"] = agg_tense

    # Convert to categorical for efficiency
    out["stanza_upos"] = out["stanza_upos"].astype("category")
    out["stanza_tense"] = out["stanza_tense"].astype("category")
    return out

# --- Validation helpers ---
def validate_ud_df(df: pd.DataFrame) -> None:
    """
    Validate a UD DataFrame to ensure it has correct structure and no critical errors.

    Checks (in this order, stopping at the first failure):
        - Required columns are present
        - No duplicate (sent_id, token_id) pairs
        - All spans are valid (start < end and start >= 0)
        - All sentences have non-empty text (a missing/null text counts as empty)

    Raises:
        AssertionError: If any of the checks fail; the message names the failed check.
    """
    # Check required columns, and say which ones are absent.
    required = {"sent_id", "text", "token_id", "gold_upos", "char_start", "char_end"}
    missing = required - set(df.columns)
    assert not missing, f"Missing required columns: {sorted(missing)}"

    # Ensure (sent_id, token_id) is unique
    assert df.duplicated(["sent_id", "token_id"]).sum() == 0, "Duplicate (sent_id, token_id)"

    # Ensure all spans are valid
    bad_spans = int(((df["char_end"] <= df["char_start"]) | (df["char_start"] < 0)).sum())
    assert bad_spans == 0, f"Invalid spans: {bad_spans}"

    # Ensure no sentence has empty text. A null text must be caught explicitly:
    # comparing its (NaN) length with 0 would silently pass.
    first_text = df.groupby("sent_id")["text"].first()
    is_empty = first_text.isna() | (first_text.astype(str).str.len() == 0)
    assert int(is_empty.sum()) == 0, "#text is empty"

In [4]:
# Install/upgrade spaCy and fetch uk_core_news_md, the default model of run_spacy_and_attach.
!pip3 install -U spacy
!python -m spacy download uk_core_news_md

Collecting spacy
  Downloading spacy-3.8.7-cp311-cp311-macosx_11_0_arm64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Using cached spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Using cached spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.13-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.2 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.11-cp311-cp311-macosx_11_0_arm64.whl.metadata (8.5 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.10-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.4 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.6-cp311-cp311-macosx_11_0_arm64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Using cached wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  D

In [31]:
# Install Stanza, which run_stanza_and_attach imports.
!pip3 install stanza

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading stanza-1.10.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, stanza
Successfully installed emoji-2.14.1 stanza-1.10.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [32]:
# Fetch Stanza's default resources for Ukrainian ("uk").
import stanza
stanza.download("uk")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-08-09 23:46:21 INFO: Downloaded file to /Users/pelmeshek1706/stanza_resources/resources.json
2025-08-09 23:46:21 INFO: Downloading default packages for language: uk (Ukrainian) ...


Downloading https://huggingface.co/stanfordnlp/stanza-uk/resolve/v1.10.0/models/default.zip:   0%|          | …

2025-08-09 23:46:33 INFO: Downloaded file to /Users/pelmeshek1706/stanza_resources/uk/default.zip
2025-08-09 23:46:34 INFO: Finished downloading models and saved to /Users/pelmeshek1706/stanza_resources


In [38]:
# Reload the gold annotations; the spaCy/Stanza prediction columns start out as None.
df = read_ud_conllu("data/uk_parlamint-ud-dev.conllu")

In [39]:
# Fail fast with an AssertionError if the loaded frame violates the structural checks.
validate_ud_df(df)

In [40]:
# Fill the spaCy columns, then the Stanza columns; each call returns a new frame.
df = run_spacy_and_attach(df, model_name="uk_core_news_md")
df = run_stanza_and_attach(df)
df.head()

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-08-09 23:49:46 INFO: Downloaded file to /Users/pelmeshek1706/stanza_resources/resources.json
2025-08-09 23:49:46 INFO: Downloading default packages for language: uk (Ukrainian) ...
2025-08-09 23:49:47 INFO: File exists: /Users/pelmeshek1706/stanza_resources/uk/default.zip
2025-08-09 23:49:49 INFO: Finished downloading models and saved to /Users/pelmeshek1706/stanza_resources
2025-08-09 23:49:49 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-08-09 23:49:49 INFO: Downloaded file to /Users/pelmeshek1706/stanza_resources/resources.json
2025-08-09 23:49:49 INFO: Loading these models for language: uk (Ukrainian):
| Processor | Package     |
---------------------------
| tokenize  | iu          |
| mwt       | iu          |
| pos       | iu_charlm   |
| lemma     | iu_nocharlm |

2025-08-09 23:49:49 INFO: Using device: cpu
2025-08-09 23:49:49 INFO: Loading: tokenize
2025-08-09 23:49:49 INFO: Loading: mwt
2025-08-09 23:49:49 INFO: Loading: pos
2025-08-09 23:49:51 INFO: Loading: lemma
2025-08-09 23:49:51 INFO: Done loading processors!


Unnamed: 0,sent_id,text,token_id,form,lemma,gold_upos,gold_xpos,gold_feats,gold_tense,head,deprel,space_after,char_start,char_end,spacy_upos,spacy_feats,spacy_tense,stanza_upos,stanza_feats,stanza_tense
0,ParlaMint-UA_2022-01-25-m0.u90.p4.lang1.s2,"Я коли вам кажу завжди про економіку, я дивлюс...",1,Я,я,PRON,PRON,"{'Animacy': 'Anim', 'Case': 'Nom', 'Number': '...",,10,nsubj:outer,True,0,1,PRON,,,PRON,,
1,ParlaMint-UA_2022-01-25-m0.u90.p4.lang1.s2,"Я коли вам кажу завжди про економіку, я дивлюс...",2,коли,коли,SCONJ,ADV,{'PronType': 'Rel'},,4,mark,True,2,6,ADV,,,ADV,,
2,ParlaMint-UA_2022-01-25-m0.u90.p4.lang1.s2,"Я коли вам кажу завжди про економіку, я дивлюс...",3,вам,ви,PRON,PRON,"{'Animacy': 'Anim', 'Case': 'Dat', 'Number': '...",,4,iobj,True,7,10,PRON,,,PRON,,
3,ParlaMint-UA_2022-01-25-m0.u90.p4.lang1.s2,"Я коли вам кажу завжди про економіку, я дивлюс...",4,кажу,казати,VERB,VERB,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Si...",Pres,10,advcl,True,11,15,VERB,,Pres,VERB,,Pres
4,ParlaMint-UA_2022-01-25-m0.u90.p4.lang1.s2,"Я коли вам кажу завжди про економіку, я дивлюс...",5,завжди,завжди,ADV,ADV,{'PronType': 'Tot'},,4,advmod,True,16,22,ADV,,,ADV,,


In [44]:
# The ten UPOS classes the comparison is restricted to. Defined here so the cell
# does not depend on a variable left over from an earlier kernel session.
upos10 = ["ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "NOUN", "PRON", "PROPN", "VERB"]

# Macro-F1 over those classes for each parser; the gap is Stanza minus spaCy.
f1_spacy_upos = macro_f1(df, "gold_upos", "spacy_upos", labels=upos10)
f1_stanza_upos = macro_f1(df, "gold_upos", "stanza_upos", labels=upos10)
gap = f1_stanza_upos - f1_spacy_upos

In [45]:
# Report both UPOS macro-F1 scores; the gap is printed as a percentage.
print(f"spaCy UPOS F1: {f1_spacy_upos:.3f}")
print(f"Stanza UPOS F1: {f1_stanza_upos:.3f}")
print(f"GAP: {gap:.3%}")

spaCy UPOS F1: 0.972
Stanza UPOS F1: 0.975
GAP: 0.285%


In [46]:
# Tense macro-F1 on tokens whose gold UPOS is VERB; the gap is Stanza minus spaCy.
f1_spacy_tense_verbs  = macro_f1_tense(df, "spacy_tense",  restrict_to_verbs=True)
f1_stanza_tense_verbs = macro_f1_tense(df, "stanza_tense", restrict_to_verbs=True)
gap_tense_verbs = f1_stanza_tense_verbs - f1_spacy_tense_verbs

print(f"spaCy Tense F1 (VERB only):   {f1_spacy_tense_verbs:.3f}")
print(f"Stanza Tense F1 (VERB only):  {f1_stanza_tense_verbs:.3f}")
print(f"Tense gap (VERB only):        {gap_tense_verbs:.2%}")

spaCy Tense F1 (VERB only):   0.972
Stanza Tense F1 (VERB only):  0.929
Tense gap (VERB only):        -4.32%


  if pd.api.types.is_categorical_dtype(series):
  if pd.api.types.is_categorical_dtype(series):
  if pd.api.types.is_categorical_dtype(series):
  if pd.api.types.is_categorical_dtype(series):


In [47]:
# Tense macro-F1 on every token that has a gold Tense value, whatever its UPOS
# (restrict_to_verbs=False); the gap is Stanza minus spaCy.
mask_has_tense = df["gold_tense"].notna()
f1_spacy_tense_all  = macro_f1_tense(df[mask_has_tense], "spacy_tense",  restrict_to_verbs=False)
f1_stanza_tense_all = macro_f1_tense(df[mask_has_tense], "stanza_tense", restrict_to_verbs=False)
gap_tense_all = f1_stanza_tense_all - f1_spacy_tense_all

print(f"spaCy Tense F1 (has gold Tense):   {f1_spacy_tense_all:.3f}")
print(f"Stanza Tense F1 (has gold Tense):  {f1_stanza_tense_all:.3f}")
print(f"Tense gap (has gold Tense):        {gap_tense_all:.2%}")

spaCy Tense F1 (has gold Tense):   0.981
Stanza Tense F1 (has gold Tense):  0.706
Tense gap (has gold Tense):        -27.46%


  if pd.api.types.is_categorical_dtype(series):
  if pd.api.types.is_categorical_dtype(series):
  if pd.api.types.is_categorical_dtype(series):
  if pd.api.types.is_categorical_dtype(series):


In [16]:
# spaCy tense macro-F1 on VERB tokens, displayed as the unrounded float.
f1_spacy_tense = macro_f1_tense(df, "spacy_tense", restrict_to_verbs=True)
f1_spacy_tense

  if pd.api.types.is_categorical_dtype(series):
  if pd.api.types.is_categorical_dtype(series):


0.971756372227338

In [14]:
# Per-class precision/recall/F1 for spaCy UPOS, limited to the labels in upos10.
# NOTE(review): upos10 is not defined in this cell — confirm it is set before this runs.
print(classification_report(df["gold_upos"].astype(str),
                            df["spacy_upos"].astype(str),
                            labels=upos10, zero_division=0))

              precision    recall  f1-score   support

         ADJ       0.99      0.96      0.98      1062
         ADP       0.99      1.00      1.00       929
         ADV       0.94      0.97      0.95       536
         AUX       0.97      0.93      0.95        99
       CCONJ       0.98      0.98      0.98       320
         DET       0.96      0.98      0.97       396
        NOUN       0.98      1.00      0.99      2647
        PRON       0.99      0.98      0.98       521
       PROPN       0.97      0.92      0.94       455
        VERB       0.99      1.00      0.99      1202

   micro avg       0.98      0.98      0.98      8167
   macro avg       0.97      0.97      0.97      8167
weighted avg       0.98      0.98      0.98      8167



# Without local files (reading the CoNLL-U data from a URL)

In [None]:
import requests
from io import StringIO

def _align_forms_to_text(text: str, forms: Iterable[str]) -> list[Tuple[int, int]]:
    """Locate each token form in `text`, scanning left to right, and return (start, end) offsets."""
    spans = []
    cursor = 0
    for form in forms:
        pos = text.find(form, cursor)
        if pos < 0:
            # Form not found after the cursor (e.g. the surface text differs from FORM):
            # fall back to the current cursor so later tokens keep advancing.
            pos = cursor
        end = pos + len(form)
        spans.append((pos, end))
        cursor = end
    return spans


def read_ud_conllu(path_or_url: str | Path) -> pd.DataFrame:
    """
    Reads a UD CoNLL-U treebank from a local file or an HTTP(S) URL into a token-level DataFrame.

    Only tokens with an integer ID are kept (multiword-token ranges and empty nodes are skipped).
    Character offsets are computed against the sentence's `# text` metadata when present;
    otherwise the text is rebuilt from the FORM fields via `_rebuild_text_and_offsets`.

    Parameters:
        path_or_url (str | Path): Local path, or a URL starting with 'http://' or 'https://'.

    Returns:
        pd.DataFrame: One row per token with gold annotations (UPOS, XPOS, FEATS, Tense, head,
        deprel), character offsets, and empty 'spacy_*' / 'stanza_*' prediction columns.
        'gold_upos' and 'gold_tense' are categorical. An input without sentences yields an
        empty frame that still has all columns.

    Raises:
        requests.HTTPError: If the URL responds with an error status.
        OSError: If the local file cannot be opened.
    """
    columns = [
        "sent_id", "text", "token_id", "form", "lemma",
        "gold_upos", "gold_xpos", "gold_feats", "gold_tense",
        "head", "deprel", "space_after", "char_start", "char_end",
        "spacy_upos", "spacy_feats", "spacy_tense",
        "stanza_upos", "stanza_feats", "stanza_tense",
    ]

    source = str(path_or_url)
    # Require a real scheme so local paths such as 'http_cache/x.conllu' are not treated as URLs.
    if source.lower().startswith(("http://", "https://")):
        # Without a timeout a stalled connection would block the notebook indefinitely.
        r = requests.get(source, timeout=60)
        r.raise_for_status()
        # CoNLL-U files are UTF-8 by specification; do not rely on the encoding guessed
        # from the HTTP headers (it defaults to ISO-8859-1 for text/* without a charset).
        r.encoding = "utf-8"
        file_obj = StringIO(r.text)
    else:
        file_obj = open(path_or_url, encoding="utf-8")

    rows = []
    with file_obj as f:
        for sent in parse_incr(f):
            meta = sent.metadata or {}
            sent_id = meta.get("sent_id") or meta.get("sentid") or ""
            ud_tokens = [t for t in sent if isinstance(t["id"], int)]

            text_from_meta = meta.get("text")
            if text_from_meta:
                text = text_from_meta
                spans = _align_forms_to_text(text, (t["form"] for t in ud_tokens))
            else:
                text, spans = _rebuild_text_and_offsets(ud_tokens)

            for t, (start, end) in zip(ud_tokens, spans):
                feats = t.get("feats") or {}
                if isinstance(feats, str):
                    # Unparsed FEATS string ("A=b|C=d"): convert to a dict.
                    feats = dict(kv.split("=", 1) for kv in feats.split("|") if "=" in kv)
                misc = t.get("misc") or {}

                rows.append({
                    "sent_id": sent_id,
                    "text": text,
                    "token_id": t["id"],
                    "form": t["form"],
                    "lemma": t.get("lemma"),
                    "gold_upos": t.get("upos"),
                    "gold_xpos": t.get("xpos"),
                    "gold_feats": feats,
                    "gold_tense": feats.get("Tense"),
                    "head": t.get("head"),
                    "deprel": t.get("deprel"),
                    "space_after": misc.get("SpaceAfter") != "No",
                    "char_start": start,
                    "char_end": end,
                    # Prediction columns are filled in later by the tagger cells.
                    "spacy_upos": None,
                    "spacy_feats": None,
                    "spacy_tense": None,
                    "stanza_upos": None,
                    "stanza_feats": None,
                    "stanza_tense": None,
                })

    # Passing `columns` keeps the schema stable even when no rows were read,
    # so the categorical casts below cannot raise KeyError on an empty input.
    df = pd.DataFrame(rows, columns=columns)
    for c in ["gold_upos", "gold_tense"]:
        df[c] = df[c].astype("category")
    return df

## test file

In [48]:
# Load the UD_Ukrainian-ParlaMint test split straight from GitHub.
# NOTE(review): the URL points at the `master` branch, so the data (and every
# score computed from it) can change when the treebank is updated; consider
# pinning a tag or commit for reproducibility.
url = "https://raw.githubusercontent.com/UniversalDependencies/UD_Ukrainian-ParlaMint/master/uk_parlamint-ud-test.conllu"
# Rebinds the notebook-global `df`; the cells that follow operate on this frame.
df = read_ud_conllu(url)
df.head()

Unnamed: 0,sent_id,text,token_id,form,lemma,gold_upos,gold_xpos,gold_feats,gold_tense,head,deprel,space_after,char_start,char_end,spacy_upos,spacy_feats,spacy_tense,stanza_upos,stanza_feats,stanza_tense
0,ParlaMint-UA_2022-01-25-m0.u185.p3.lang1.s1,За - 116,1,За,за,ADP,ADP,{'Case': 'Acc'},,0,root,True,0,2,,,,,,
1,ParlaMint-UA_2022-01-25-m0.u185.p3.lang1.s1,За - 116,2,-,-,PUNCT,PUNCT,{},,3,punct,True,3,4,,,,,,
2,ParlaMint-UA_2022-01-25-m0.u185.p3.lang1.s1,За - 116,3,116,116,NUM,NUM,"{'Case': 'Nom', 'NumType': 'Card'}",,1,orphan,True,5,8,,,,,,
3,ParlaMint-UA_2022-01-25-m0.u185.p4.lang1.s1,Рішення не прийнято.,1,Рішення,рішення,NOUN,NOUN,"{'Animacy': 'Inan', 'Case': 'Acc', 'Gender': '...",,3,obj,True,0,7,,,,,,
4,ParlaMint-UA_2022-01-25-m0.u185.p4.lang1.s1,Рішення не прийнято.,2,не,не,PART,PART,{'Polarity': 'Neg'},,3,advmod:neg,True,8,10,,,,,,


In [49]:
# Run both taggers over the loaded split and store their predictions in `df`.
# run_spacy_and_attach / run_stanza_and_attach are defined elsewhere in the
# notebook; presumably they fill the spacy_* and stanza_* columns — verify
# against their definitions.
df = run_spacy_and_attach(df, model_name="uk_core_news_md")
df = run_stanza_and_attach(df)
df.head()

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-08-10 00:08:30 INFO: Downloaded file to /Users/pelmeshek1706/stanza_resources/resources.json
2025-08-10 00:08:30 INFO: Downloading default packages for language: uk (Ukrainian) ...
2025-08-10 00:08:30 INFO: File exists: /Users/pelmeshek1706/stanza_resources/uk/default.zip
2025-08-10 00:08:32 INFO: Finished downloading models and saved to /Users/pelmeshek1706/stanza_resources
2025-08-10 00:08:32 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-08-10 00:08:32 INFO: Downloaded file to /Users/pelmeshek1706/stanza_resources/resources.json
2025-08-10 00:08:33 INFO: Loading these models for language: uk (Ukrainian):
| Processor | Package     |
---------------------------
| tokenize  | iu          |
| mwt       | iu          |
| pos       | iu_charlm   |
| lemma     | iu_nocharlm |

2025-08-10 00:08:33 INFO: Using device: cpu
2025-08-10 00:08:33 INFO: Loading: tokenize
2025-08-10 00:08:33 INFO: Loading: mwt
2025-08-10 00:08:33 INFO: Loading: pos
2025-08-10 00:08:34 INFO: Loading: lemma
2025-08-10 00:08:34 INFO: Done loading processors!


Unnamed: 0,sent_id,text,token_id,form,lemma,gold_upos,gold_xpos,gold_feats,gold_tense,head,deprel,space_after,char_start,char_end,spacy_upos,spacy_feats,spacy_tense,stanza_upos,stanza_feats,stanza_tense
0,ParlaMint-UA_2022-01-25-m0.u185.p3.lang1.s1,За - 116,1,За,за,ADP,ADP,{'Case': 'Acc'},,0,root,True,0,2,ADP,,,ADP,,
1,ParlaMint-UA_2022-01-25-m0.u185.p3.lang1.s1,За - 116,2,-,-,PUNCT,PUNCT,{},,3,punct,True,3,4,PUNCT,,,PUNCT,,
2,ParlaMint-UA_2022-01-25-m0.u185.p3.lang1.s1,За - 116,3,116,116,NUM,NUM,"{'Case': 'Nom', 'NumType': 'Card'}",,1,orphan,True,5,8,NUM,,,NUM,,
3,ParlaMint-UA_2022-01-25-m0.u185.p4.lang1.s1,Рішення не прийнято.,1,Рішення,рішення,NOUN,NOUN,"{'Animacy': 'Inan', 'Case': 'Acc', 'Gender': '...",,3,obj,True,0,7,NOUN,,,NOUN,,
4,ParlaMint-UA_2022-01-25-m0.u185.p4.lang1.s1,Рішення не прийнято.,2,не,не,PART,PART,{'Polarity': 'Neg'},,3,advmod:neg,True,8,10,PART,,,PART,,


In [50]:
# The ten UPOS tags passed as `labels` to the scorer; tags outside this list
# (e.g. PUNCT, NUM, PART) are not among the scored labels.
upos10 = ["ADJ","ADP","ADV","AUX","CCONJ","DET","NOUN","PRON","PROPN","VERB"]

# macro_f1 is defined elsewhere in the notebook; presumably a macro-averaged F1
# of the prediction column against the gold column — verify against its definition.
f1_spacy_upos = macro_f1(df, "gold_upos", "spacy_upos", labels=upos10)
f1_stanza_upos = macro_f1(df, "gold_upos", "stanza_upos", labels=upos10)
# Stanza minus spaCy: a negative gap means spaCy scored higher.
gap = f1_stanza_upos - f1_spacy_upos

print(f"spaCy UPOS F1: {f1_spacy_upos:.3f}")
print(f"Stanza UPOS F1: {f1_stanza_upos:.3f}")
# The % format multiplies the absolute F1 difference by 100 (percentage points).
print(f"GAP: {gap:.3%}")

spaCy UPOS F1: 0.973
Stanza UPOS F1: 0.972
GAP: -0.112%


In [51]:
# Tense scores for both taggers with restrict_to_verbs=True.
# macro_f1_tense is defined elsewhere in the notebook; presumably it limits the
# evaluation to verb tokens when this flag is set — verify against its definition.
f1_spacy_tense_verbs  = macro_f1_tense(df, "spacy_tense",  restrict_to_verbs=True)
f1_stanza_tense_verbs = macro_f1_tense(df, "stanza_tense", restrict_to_verbs=True)
# Stanza minus spaCy: a negative gap means spaCy scored higher.
gap_tense_verbs = f1_stanza_tense_verbs - f1_spacy_tense_verbs

print(f"spaCy Tense F1 (VERB only):   {f1_spacy_tense_verbs:.3f}")
print(f"Stanza Tense F1 (VERB only):  {f1_stanza_tense_verbs:.3f}")
# The % format multiplies the absolute F1 difference by 100 (percentage points).
print(f"Tense gap (VERB only):        {gap_tense_verbs:.2%}")

spaCy Tense F1 (VERB only):   0.959
Stanza Tense F1 (VERB only):  0.926
Tense gap (VERB only):        -3.31%


  if pd.api.types.is_categorical_dtype(series):
  if pd.api.types.is_categorical_dtype(series):
  if pd.api.types.is_categorical_dtype(series):
  if pd.api.types.is_categorical_dtype(series):


In [52]:
# Keep only the tokens whose gold annotation carries a Tense value.
mask_has_tense = df["gold_tense"].notna()
# Score both taggers on that subset, with restrict_to_verbs=False
# (macro_f1_tense is defined elsewhere in the notebook).
f1_spacy_tense_all  = macro_f1_tense(df[mask_has_tense], "spacy_tense",  restrict_to_verbs=False)
f1_stanza_tense_all = macro_f1_tense(df[mask_has_tense], "stanza_tense", restrict_to_verbs=False)
# Stanza minus spaCy: a negative gap means spaCy scored higher.
gap_tense_all = f1_stanza_tense_all - f1_spacy_tense_all

print(f"spaCy Tense F1 (has gold Tense):   {f1_spacy_tense_all:.3f}")
print(f"Stanza Tense F1 (has gold Tense):  {f1_stanza_tense_all:.3f}")
# The % format multiplies the absolute F1 difference by 100 (percentage points).
print(f"Tense gap (has gold Tense):        {gap_tense_all:.2%}")

spaCy Tense F1 (has gold Tense):   0.729
Stanza Tense F1 (has gold Tense):  0.708
Tense gap (has gold Tense):        -2.09%


  if pd.api.types.is_categorical_dtype(series):
  if pd.api.types.is_categorical_dtype(series):
  if pd.api.types.is_categorical_dtype(series):
  if pd.api.types.is_categorical_dtype(series):


In [24]:
# Per-tag precision / recall / F1 of the spaCy UPOS predictions against gold,
# limited to the tags listed in upos10.
# Both columns are cast to str so categorical values and missing predictions
# are compared as plain strings.
# classification_report is imported elsewhere in the notebook (presumably
# sklearn.metrics.classification_report — TODO confirm).
print(classification_report(df["gold_upos"].astype(str),
                            df["spacy_upos"].astype(str),
                            labels=upos10, zero_division=0))

              precision    recall  f1-score   support

         ADJ       0.98      0.95      0.96      1043
         ADP       0.99      0.99      0.99       928
         ADV       0.94      0.93      0.94       533
         AUX       0.98      0.97      0.98       122
       CCONJ       0.98      0.98      0.98       322
         DET       0.96      0.99      0.98       440
        NOUN       0.98      0.99      0.99      2721
        PRON       0.99      0.97      0.98       563
       PROPN       0.96      0.93      0.95       379
        VERB       0.98      1.00      0.99      1207

   micro avg       0.98      0.98      0.98      8258
   macro avg       0.98      0.97      0.97      8258
weighted avg       0.98      0.98      0.98      8258



## train file

In [53]:
# Load the UD_Ukrainian-ParlaMint train split straight from GitHub.
# NOTE(review): the URL points at the `master` branch, so the data (and every
# score computed from it) can change when the treebank is updated; consider
# pinning a tag or commit for reproducibility.
url = "https://raw.githubusercontent.com/UniversalDependencies/UD_Ukrainian-ParlaMint/master/uk_parlamint-ud-train.conllu"
# Rebinds the notebook-global `df` (and `url`), replacing any previously loaded
# split; the cells that follow operate on this frame.
df = read_ud_conllu(url)
df.head()

Unnamed: 0,sent_id,text,token_id,form,lemma,gold_upos,gold_xpos,gold_feats,gold_tense,head,deprel,space_after,char_start,char_end,spacy_upos,spacy_feats,spacy_tense,stanza_upos,stanza_feats,stanza_tense
0,ParlaMint-UA_2003-10-14-m0.u1.p1.lang1.s1,"Доброго ранку, шановні народні депутати, запро...",1,Доброго,добрий,ADJ,ADJ,"{'Case': 'Gen', 'Degree': 'Pos', 'Gender': 'Ma...",,2,amod,True,0,7,,,,,,
1,ParlaMint-UA_2003-10-14-m0.u1.p1.lang1.s1,"Доброго ранку, шановні народні депутати, запро...",2,ранку,ранок,NOUN,NOUN,"{'Animacy': 'Inan', 'Case': 'Gen', 'Gender': '...",,0,root,False,8,13,,,,,,
2,ParlaMint-UA_2003-10-14-m0.u1.p1.lang1.s1,"Доброго ранку, шановні народні депутати, запро...",3,",",",",PUNCT,PUNCT,{},,6,punct,True,13,14,,,,,,
3,ParlaMint-UA_2003-10-14-m0.u1.p1.lang1.s1,"Доброго ранку, шановні народні депутати, запро...",4,шановні,шановний,ADJ,ADJ,"{'Case': 'Voc', 'Degree': 'Pos', 'Number': 'Pl...",,6,amod,True,15,22,,,,,,
4,ParlaMint-UA_2003-10-14-m0.u1.p1.lang1.s1,"Доброго ранку, шановні народні депутати, запро...",5,народні,народний,ADJ,ADJ,"{'Case': 'Voc', 'Number': 'Plur'}",,6,amod,True,23,30,,,,,,


In [54]:
# Run both taggers over the loaded split and store their predictions in `df`.
# run_spacy_and_attach / run_stanza_and_attach are defined elsewhere in the
# notebook; presumably they fill the spacy_* and stanza_* columns — verify
# against their definitions.
df = run_spacy_and_attach(df, model_name="uk_core_news_md")
df = run_stanza_and_attach(df)
df.head()

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-08-10 00:11:12 INFO: Downloaded file to /Users/pelmeshek1706/stanza_resources/resources.json
2025-08-10 00:11:12 INFO: Downloading default packages for language: uk (Ukrainian) ...
2025-08-10 00:11:13 INFO: File exists: /Users/pelmeshek1706/stanza_resources/uk/default.zip
2025-08-10 00:11:15 INFO: Finished downloading models and saved to /Users/pelmeshek1706/stanza_resources
2025-08-10 00:11:15 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-08-10 00:11:15 INFO: Downloaded file to /Users/pelmeshek1706/stanza_resources/resources.json
2025-08-10 00:11:15 INFO: Loading these models for language: uk (Ukrainian):
| Processor | Package     |
---------------------------
| tokenize  | iu          |
| mwt       | iu          |
| pos       | iu_charlm   |
| lemma     | iu_nocharlm |

2025-08-10 00:11:15 INFO: Using device: cpu
2025-08-10 00:11:15 INFO: Loading: tokenize
2025-08-10 00:11:15 INFO: Loading: mwt
2025-08-10 00:11:15 INFO: Loading: pos
2025-08-10 00:11:16 INFO: Loading: lemma
2025-08-10 00:11:17 INFO: Done loading processors!


Unnamed: 0,sent_id,text,token_id,form,lemma,gold_upos,gold_xpos,gold_feats,gold_tense,head,deprel,space_after,char_start,char_end,spacy_upos,spacy_feats,spacy_tense,stanza_upos,stanza_feats,stanza_tense
0,ParlaMint-UA_2003-10-14-m0.u1.p1.lang1.s1,"Доброго ранку, шановні народні депутати, запро...",1,Доброго,добрий,ADJ,ADJ,"{'Case': 'Gen', 'Degree': 'Pos', 'Gender': 'Ma...",,2,amod,True,0,7,ADJ,,,ADJ,,
1,ParlaMint-UA_2003-10-14-m0.u1.p1.lang1.s1,"Доброго ранку, шановні народні депутати, запро...",2,ранку,ранок,NOUN,NOUN,"{'Animacy': 'Inan', 'Case': 'Gen', 'Gender': '...",,0,root,False,8,13,NOUN,,,NOUN,,
2,ParlaMint-UA_2003-10-14-m0.u1.p1.lang1.s1,"Доброго ранку, шановні народні депутати, запро...",3,",",",",PUNCT,PUNCT,{},,6,punct,True,13,14,PUNCT,,,PUNCT,,
3,ParlaMint-UA_2003-10-14-m0.u1.p1.lang1.s1,"Доброго ранку, шановні народні депутати, запро...",4,шановні,шановний,ADJ,ADJ,"{'Case': 'Voc', 'Degree': 'Pos', 'Number': 'Pl...",,6,amod,True,15,22,ADJ,,,ADJ,,
4,ParlaMint-UA_2003-10-14-m0.u1.p1.lang1.s1,"Доброго ранку, шановні народні депутати, запро...",5,народні,народний,ADJ,ADJ,"{'Case': 'Voc', 'Number': 'Plur'}",,6,amod,True,23,30,ADJ,,,ADJ,,


In [55]:
# The ten UPOS tags passed as `labels` to the scorer; tags outside this list
# (e.g. PUNCT, NUM, PART) are not among the scored labels.
upos10 = ["ADJ","ADP","ADV","AUX","CCONJ","DET","NOUN","PRON","PROPN","VERB"]

# macro_f1 is defined elsewhere in the notebook; presumably a macro-averaged F1
# of the prediction column against the gold column — verify against its definition.
f1_spacy_upos = macro_f1(df, "gold_upos", "spacy_upos", labels=upos10)
f1_stanza_upos = macro_f1(df, "gold_upos", "stanza_upos", labels=upos10)
# Stanza minus spaCy: a positive gap means Stanza scored higher.
gap = f1_stanza_upos - f1_spacy_upos

print(f"spaCy UPOS F1: {f1_spacy_upos:.3f}")
print(f"Stanza UPOS F1: {f1_stanza_upos:.3f}")
# The % format multiplies the absolute F1 difference by 100 (percentage points).
print(f"GAP: {gap:.3%}")

spaCy UPOS F1: 0.974
Stanza UPOS F1: 0.974
GAP: 0.071%


In [56]:
# Tense scores for both taggers with restrict_to_verbs=True.
# macro_f1_tense is defined elsewhere in the notebook; presumably it limits the
# evaluation to verb tokens when this flag is set — verify against its definition.
f1_spacy_tense_verbs  = macro_f1_tense(df, "spacy_tense",  restrict_to_verbs=True)
f1_stanza_tense_verbs = macro_f1_tense(df, "stanza_tense", restrict_to_verbs=True)
# Stanza minus spaCy: a negative gap means spaCy scored higher.
gap_tense_verbs = f1_stanza_tense_verbs - f1_spacy_tense_verbs

print(f"spaCy Tense F1 (VERB only):   {f1_spacy_tense_verbs:.3f}")
print(f"Stanza Tense F1 (VERB only):  {f1_stanza_tense_verbs:.3f}")
# The % format multiplies the absolute F1 difference by 100 (percentage points).
print(f"Tense gap (VERB only):        {gap_tense_verbs:.2%}")

spaCy Tense F1 (VERB only):   0.967
Stanza Tense F1 (VERB only):  0.952
Tense gap (VERB only):        -1.50%


  if pd.api.types.is_categorical_dtype(series):
  if pd.api.types.is_categorical_dtype(series):
  if pd.api.types.is_categorical_dtype(series):
  if pd.api.types.is_categorical_dtype(series):


In [57]:
# Keep only the tokens whose gold annotation carries a Tense value.
mask_has_tense = df["gold_tense"].notna()
# Score both taggers on that subset, with restrict_to_verbs=False
# (macro_f1_tense is defined elsewhere in the notebook).
f1_spacy_tense_all  = macro_f1_tense(df[mask_has_tense], "spacy_tense",  restrict_to_verbs=False)
f1_stanza_tense_all = macro_f1_tense(df[mask_has_tense], "stanza_tense", restrict_to_verbs=False)
# Stanza minus spaCy: a negative gap means spaCy scored higher.
gap_tense_all = f1_stanza_tense_all - f1_spacy_tense_all

print(f"spaCy Tense F1 (has gold Tense):   {f1_spacy_tense_all:.3f}")
print(f"Stanza Tense F1 (has gold Tense):  {f1_stanza_tense_all:.3f}")
# The % format multiplies the absolute F1 difference by 100 (percentage points).
print(f"Tense gap (has gold Tense):        {gap_tense_all:.2%}")

spaCy Tense F1 (has gold Tense):   0.731
Stanza Tense F1 (has gold Tense):  0.719
Tense gap (has gold Tense):        -1.27%


  if pd.api.types.is_categorical_dtype(series):
  if pd.api.types.is_categorical_dtype(series):
  if pd.api.types.is_categorical_dtype(series):
  if pd.api.types.is_categorical_dtype(series):


In [29]:
# Per-tag precision / recall / F1 of the spaCy UPOS predictions against gold,
# limited to the tags listed in upos10.
# Both columns are cast to str so categorical values and missing predictions
# are compared as plain strings.
# classification_report is imported elsewhere in the notebook (presumably
# sklearn.metrics.classification_report — TODO confirm).
print(classification_report(df["gold_upos"].astype(str),
                            df["spacy_upos"].astype(str),
                            labels=upos10, zero_division=0))

              precision    recall  f1-score   support

         ADJ       0.98      0.99      0.98      6649
         ADP       1.00      1.00      1.00      5440
         ADV       0.95      0.94      0.95      3188
         AUX       0.95      0.95      0.95       623
       CCONJ       0.98      0.98      0.98      2254
         DET       0.96      0.99      0.97      2676
        NOUN       0.98      0.99      0.98     16050
        PRON       0.98      0.97      0.98      3117
       PROPN       0.95      0.96      0.95      2191
        VERB       0.98      0.99      0.99      6543

   micro avg       0.98      0.98      0.98     48731
   macro avg       0.97      0.98      0.97     48731
weighted avg       0.98      0.98      0.98     48731

