In [32]:
from openai import OpenAI

import os

base_url = "https://llm.hpc.rug.nl/"
model_name = "mistralai/Mistral-Small-3.2-24B-Instruct-2506"

api_key = os.getenv("RUGLLM_API_KEY")

client = OpenAI(base_url=base_url, api_key=api_key)


In [71]:
# test UG LLM compatibilty with response_format

from pydantic import BaseModel

class CalendarEvent(BaseModel):
    name: str
    date: str
    participants: list[str]


completion = client.beta.chat.completions.parse(
    model=model_name,
    messages=[
        {"role": "system", "content": "Extract the event information."},
        {
            "role": "user",
            "content": "Alice and Bob are going to a science fair on Friday.",
        },
    ],
    response_format=CalendarEvent,
)

completion.choices[0].message.parsed

CalendarEvent(name='science fair', date='Friday', participants=['Alice', 'Bob'])

👍 in UG llm logs:

```
 mistral_small 	INFO 08-29 02:31:51 [logger.py:43] Received request chatcmpl-7ed491e616e948f6a011cd3438bfacf6: prompt: None, params: SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.7, top_p=0.95, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=131050, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=GuidedDecodingParams(json={'properties': {'name': {'title': 'Name', 'type': 'string'}, 'date': {'title': 'Date', 'type': 'string'}, 'participants': {'items': {'type': 'string'}, 'title': 'Participants', 'type': 'array'}}, 'required': ['name', 'date', 'participants'], 'title': 'CalendarEvent', 'type': 'object', 'additionalProperties': False}, regex=None, choice=None, grammar=None, json_object=None, backend=None, backend_was_auto=False, disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, whitespace_pattern=None, structural_tag=None), extra_args=None), prompt_token_ids: [1, 17, 91108, 1278, 3349, 3686, 1046, 18, 3, 66899, 1321, 12382, 1584, 4670, 1317, 1261, 12470, 13177, 1408, 27533, 1046, 4], prompt_embeds shape: None, lora_request: None, prompt_adapter_request: None.
```

In [34]:
import pandas as pd

df_annotations = pd.read_json("data/examples_from_annotations.jsonl", lines=True)

In [35]:
df_annotations.head()

Unnamed: 0,text,labels,source
0,Zesde Brief.,[],Zesde Brief (met annotaties).docx
1,Op de Bloksberg des morgens van den 15 July 1...,"[{'span': 'Bloksberg', 'types': ['E53 Place', ...",Zesde Brief (met annotaties).docx
2,Waardste Vriend!,[],Zesde Brief (met annotaties).docx
3,U te melden wat wy deeze dagen op onze reis do...,"[{'span': 'Harz', 'types': ['E53 Place']}]",Zesde Brief (met annotaties).docx
4,Hoe zoude ik toch schriftelyk kunnen vermelden...,"[{'span': 'eenige honderd voet diep', 'types':...",Zesde Brief (met annotaties).docx


In [36]:
# example adapted from intfloat/multilingual-e5-base readme

import torch.nn.functional as F
import torch
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
from typing import List


def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


def get_embeddings_batched(texts: List[str], model, tokenizer, batch_size=8):
    """Process texts in batches and return embeddings."""
    all_embeddings = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i : i + batch_size]

        # Tokenize the batch
        batch_dict = tokenizer(
            batch_texts,
            max_length=512,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )

        # Get embeddings for the batch
        with torch.no_grad():  # Disable gradient calculation for inference
            outputs = model(**batch_dict)
            batch_embeddings = average_pool(
                outputs.last_hidden_state, batch_dict["attention_mask"]
            )
            batch_embeddings = F.normalize(batch_embeddings, p=2, dim=1)

        all_embeddings.append(batch_embeddings)

    # Concatenate all batch embeddings
    return torch.cat(all_embeddings, dim=0)


# Each input text should start with "query: " or "passage: ", even for non-English texts.
input_texts = [
    "query: how much protein should a female eat",
    "query: 南瓜的家常做法",
    "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
    "passage: 1.清炒南瓜丝 原料:嫩南瓜半个 调料:葱、盐、白糖、鸡精 做法: 1、南瓜用刀薄薄的削去表面一层皮,用勺子刮去瓤 2、擦成细丝(没有擦菜板就用刀慢慢切成细丝) 3、锅烧热放油,入葱花煸出香味 4、入南瓜丝快速翻炒一分钟左右,放盐、一点白糖和鸡精调味出锅 2.香葱炒南瓜 原料:南瓜1只 调料:香葱、蒜末、橄榄油、盐 做法: 1、将南瓜去皮,切成片 2、油锅8成热后,将蒜末放入爆香 3、爆香后,将南瓜片放入,翻炒 4、在翻炒的同时,可以不时地往锅里加水,但不要太多 5、放入盐,炒匀 6、南瓜差不多软和绵了之后,就可以关火 7、撒入香葱,即可出锅",
]

tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-base")
model = AutoModel.from_pretrained("intfloat/multilingual-e5-base")

# Split into queries and passages
queries = [text for text in input_texts if text.startswith("query:")]
passages = [text for text in input_texts if text.startswith("passage:")]

# Get embeddings in batches
batch_size = 8  # Adjust based on your memory constraints
query_embeddings = get_embeddings_batched(queries, model, tokenizer, batch_size)
passage_embeddings = get_embeddings_batched(passages, model, tokenizer, batch_size)

# Calculate similarity scores
scores = (query_embeddings @ passage_embeddings.T) * 100
print(scores.tolist())

[[90.79100799560547, 71.17613220214844], [72.53498077392578, 90.14604949951172]]


In [37]:
from pathlib import Path
import json

datadir = Path("data")
all_examples_file = datadir.joinpath("examples_cleaned.jsonl")

with all_examples_file.open("r", encoding="utf-8") as f:
    all_examples = [json.loads(line) for line in f]

sentences_to_embed = [f"query: {ex['text']}" for ex in all_examples]

embeddings = get_embeddings_batched(sentences_to_embed, model, tokenizer, batch_size)

In [38]:
for ex, emb in zip(all_examples, embeddings.numpy()):
    ex["text_embedding"] = emb

df = pd.DataFrame(all_examples)
test_file = "Ontology-based Annotation.docx"
train_file = "Zesde Brief (met annotaties).docx"
df_test = df.query(f"source == '{test_file}'")
df_train = df.query(f"source == '{train_file}'")

In [80]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

embeddings_train = np.stack(df_train["text_embedding"])
embeddings_test = np.stack(df_test["text_embedding"])


n_fewshot_examples = 5
nn = NearestNeighbors(n_neighbors=n_fewshot_examples, metric="cosine").fit(embeddings_train)

distances, indices = nn.kneighbors(embeddings_test)

In [81]:
test_index = 3
print("Test example:")
print(df_test.iloc[test_index])
print("Most similar train examples::")
print(df_train.iloc[indices[test_index]])

Test example:
text              Wilhelmshöhe, voorheen Weissenstein genoemd en...
labels            [{'span': 'Wilhelmshöhe', 'types': ['E53 Place...
source                               Ontology-based Annotation.docx
text_embedding    [0.014872815, 0.052165806, -0.04137797, 0.0222...
Name: 110, dtype: object
Most similar train examples::
                                                 text  \
21  Hiermede gingen wy   [43] die naar het naburig...   
57  Hier hielden wy ons weder een oogenblik op en ...   
75  Nu hadden wy echter het schoone van de weg op ...   
98  Gegend von Wernigerode von den 12 Morgen gesehen]   
48  De verandering van temperatuur was hier aanmer...   

                                               labels  \
21  [{'span': 'gingen wy', 'types': ['E9 Move']}, ...   
57  [{'span': 'geruime tyd', 'types': ['E54 Dimens...   
75  [{'span': '3 uur', 'types': ['E52 Time-Span']}...   
98                                                 []   
48  [{'span': 'hoogte van 293 t

In [82]:
unique_labels = (
    df_train["labels"]
    .explode()
    .dropna()
    .apply(lambda x: x["types"])
    .explode()
    .value_counts()
    .index.to_list()
)

In [83]:
system_prompt_template = f"""
You are an expert annotator of old Dutch texts. You always return annotations as valid JSON.

You output the following labels: {', '.join(unique_labels)}

A single span can be assigned multiple labels.

Here are a few examples:
{{}}
""".strip()

In [84]:
type(df_train["labels"].iloc[1][0])

dict

In [85]:
unique_labels

['E53 Place',
 'E19 Physical Thing',
 'E21 Person',
 'E52 Time-Span',
 'E54 Dimension',
 'E86 Leaving',
 'E74 Group',
 'E9 Move']

In [86]:
def format_examples(examples: pd.DataFrame):
    return "\n\n".join(
        [
            f"Sentence:\n{ex['text']}\nAnnotation:\n{json.dumps({'entities': ex['labels']}, ensure_ascii=False)}"
            for ex in examples.to_records()
        ]
    )


print(format_examples(df_train.iloc[indices[test_index]]))

Sentence:
Hiermede gingen wy   [43] die naar het naburig Zellerfeld  was gewandelt) onder geleide van een gids naar de voornaamste ingang der mynen ongeveer ¾ uur  van de stad gelegen: 
Annotation:
{"entities": [{"span": "gingen wy", "types": ["E9 Move"]}, {"span": "Zellerfeld", "types": ["E53 Place"]}, {"span": "¾ uur", "types": ["E54 Dimension"]}]}

Sentence:
Hier hielden wy ons weder een oogenblik op en bestegen toen weder den berg alwaar wy op een smal pad een geruime tyd  moesten wagten wyl ons hier eene menigte ezels tegenkwamen met lange zakken op den rug die juist de breedte van de weg besloegen zoodat wy niet passeren konden: hier ruischte een helder beekje waarvan het yskoude water heerlyk smaakte: na nog anderhalf uur  gewandelt te hebben kregen wy de stad Andreasberg  in het oog, dezelve is nog hoger als Clausthal  en wel 1817 voet  boven de Oostzee verheven, weldra waren wy in dezelve & het schoone logement binnen getreden. 
Annotation:
{"entities": [{"span": "geruime tyd"

In [87]:
from typing import List, Literal
from pydantic import BaseModel

# Define the allowed entity types using Literal for type validation
EntityType = Literal[
    'E53 Place',
    'E19 Physical Thing',
    'E21 Person',
    'E52 Time-Span',
    'E54 Dimension',
    'E86 Leaving',
    'E74 Group',
    'E9 Move'
]

class Entity(BaseModel):
    span: str
    types: List[EntityType]

class EntityOutput(BaseModel):
    entities: List[Entity]

In [88]:
from tqdm import trange

responses = []

for idx in trange(len(df_test)):

    system_prompt = system_prompt_template.format(
        format_examples(df_train.iloc[indices[idx]])
    )

    completion = client.beta.chat.completions.parse(
        model=model_name,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": df_test.iloc[idx]["text"]},
        ],
        temperature=0.15,
        response_format=EntityOutput,
    )

    responses.append(completion.choices[0].message.parsed)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 223/223 [08:49<00:00,  2.38s/it]


In [89]:
responses

[EntityOutput(entities=[]),
 EntityOutput(entities=[Entity(span='Göttingen', types=['E53 Place']), Entity(span='9 July 1816', types=['E52 Time-Span'])]),
 EntityOutput(entities=[]),
 EntityOutput(entities=[Entity(span='Wilhelmshöhe', types=['E53 Place']), Entity(span='Weissenstein', types=['E53 Place']), Entity(span='Napoleonshöhe', types=['E53 Place']), Entity(span='Keurvorst van Hessen Cassel', types=['E74 Group']), Entity(span='groot uur', types=['E54 Dimension'])]),
 EntityOutput(entities=[Entity(span='landgraven Karel', types=['E74 Group']), Entity(span='Frederik de II', types=['E21 Person']), Entity(span='Keurvorst Willem de IX', types=['E21 Person'])]),
 EntityOutput(entities=[]),
 EntityOutput(entities=[Entity(span='Zondag ll', types=['E52 Time-Span']), Entity(span='Weissenstein', types=['E53 Place']), Entity(span='groote gewoel der wandelaars zoo wel inwoners der stad als vreemdelingen', types=['E74 Group'])]),
 EntityOutput(entities=[Entity(span='Casernes', types=['E19 Physic

In [99]:
from tqdm import trange

responses_oai = []

client_oai = OpenAI()
oai_model = "gpt-4o-mini"

for idx in trange(len(df_test)):

    system_prompt = system_prompt_template.format(
        format_examples(df_train.iloc[indices[idx]])
    )

    completion = client_oai.beta.chat.completions.parse(
        model=oai_model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": df_test.iloc[idx]["text"]},
        ],
        response_format=EntityOutput,
    )

    responses_oai.append(completion.choices[0].message.parsed)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 223/223 [04:47<00:00,  1.29s/it]


In [100]:
responses_oai

[EntityOutput(entities=[]),
 EntityOutput(entities=[Entity(span='Göttingen', types=['E53 Place']), Entity(span='den 9 July 1816', types=['E52 Time-Span'])]),
 EntityOutput(entities=[]),
 EntityOutput(entities=[Entity(span='Wilhelmshöhe', types=['E53 Place']), Entity(span='Weissenstein', types=['E53 Place']), Entity(span='een groot uur', types=['E52 Time-Span']), Entity(span='de stad', types=['E53 Place'])]),
 EntityOutput(entities=[Entity(span='Karel', types=['E21 Person']), Entity(span='Frederik de II', types=['E21 Person']), Entity(span='Willem de IX', types=['E21 Person'])]),
 EntityOutput(entities=[]),
 EntityOutput(entities=[Entity(span='Zondag ll', types=['E52 Time-Span']), Entity(span='Weissenstein', types=['E53 Place']), Entity(span='waterwerken', types=['E19 Physical Thing']), Entity(span='inwoners der stad', types=['E74 Group']), Entity(span='vreemdelingen', types=['E74 Group'])]),
 EntityOutput(entities=[Entity(span='Casernes', types=['E19 Physical Thing']), Entity(span='30,

In [101]:
response = responses[0]

In [104]:
response.model_dump()["entities"]

[]

In [105]:
df_test

Unnamed: 0,text,labels,source,text_embedding
107,Vierde Brief.,[],Ontology-based Annotation.docx,"[0.038164765, 0.029106557, 0.0037032517, 0.026..."
108,Göttingen den 9 July 1816 .,"[{'span': 'Göttingen', 'types': ['E53 Place']}...",Ontology-based Annotation.docx,"[-0.027040392, 0.053370032, -0.0045714774, 0.0..."
109,Waardste Vriend!,[],Ontology-based Annotation.docx,"[0.03379706, 0.020902578, 0.012958331, 0.01300..."
110,"Wilhelmshöhe, voorheen Weissenstein genoemd en...","[{'span': 'Wilhelmshöhe', 'types': ['E53 Place...",Ontology-based Annotation.docx,"[0.014872815, 0.052165806, -0.04137797, 0.0222..."
111,de landgraven Karel en Frederik de II hebben h...,"[{'span': 'Karel', 'types': ['E21 Person']}, {...",Ontology-based Annotation.docx,"[0.031060653, 0.05399668, -0.017590867, 0.0279..."
...,...,...,...,...
325,ik zal deeze brief medenemen en dezelve te Elb...,"[{'span': 'Elbrugerode', 'types': ['E53 Place']}]",Ontology-based Annotation.docx,"[0.02856872, 0.03850489, -0.038084652, 0.01974..."
326,Van Maagdeburg hoop ik u te schryven en de dr...,[],Ontology-based Annotation.docx,"[0.0014612004, 0.05482568, -0.01458141, 0.0023..."
327,[op linkerblad:,[],Ontology-based Annotation.docx,"[0.026018828, 0.0623048, -0.016845644, 0.01691..."
328,NB. Alleen van de zyde van Wernigerode aan de...,[],Ontology-based Annotation.docx,"[0.006645154, 0.06859083, -0.0057803346, 0.035..."


In [111]:
df_out = df_test.copy()

In [113]:
df_out["mistral_small_3.2_output"] = [res.model_dump()["entities"] for res in responses]
df_out["gpt_4o_mini_output"] = [res.model_dump()["entities"] for res in responses_oai]

In [116]:
df_out[["text", "labels", "mistral_small_3.2_output", "gpt_4o_mini_output"]].to_json(
    "data/llm_predictions.jsonl", lines=True, orient="records", index=False
)