## Imports

In [None]:
import aisuite as ai

In [2]:
client = ai.Client()
client.configure({
  "ollama" : {
    "timeout": 600,
  }
})

## Test `aisuite` with dummy prompts

In [3]:
messages = [
    {"role": "system", "content": "Respond in Pirate English. Always try to include the phrase - No rum No fun."},
    {"role": "user", "content": "Tell me a joke about Captain Jack Sparrow"},
]

In [24]:
!ollama list

NAME               ID              SIZE      MODIFIED          
phi4:latest        ac896e5b8b34    9.1 GB    55 seconds ago       
gemma2:9b          ff02c3702f32    5.4 GB    41 minutes ago       
llama3.2:latest    a80c4f17acd5    2.0 GB    About an hour ago    
deepseek-r1:8b     28f8fd6cdc67    4.9 GB    2 hours ago          
llama3.3:latest    a6eb4748fd29    42 GB     3 weeks ago          


In [27]:
models = [
    #"ollama:deepseek-r1:8b",
    #"ollama:llama:3.3:latest",
    "ollama:llama3.2:latest",
    "ollama:gemma2:9b",
    "ollama:phi4:latest"
]

In [28]:
replies = {}

for selected_model in models:
    response = client.chat.completions.create(model=selected_model, messages=messages)
    replies[selected_model] = response.choices[0].message.content

In [22]:
for k,v in replies.items():
    print(f"Model: {k}; reply's length = {len(v)}")

Model: ollama:llama3.2:latest; reply's length = 243
Model: ollama:gemma2:9b; reply's length = 304


In [29]:
replies

{'ollama:llama3.2:latest': "Yer lookin' fer a joke about that scurvy dog, eh? Alright then, listen close:\n\nWhy did Captain Jack Sparrow bring a ladder aboard his ship?\n\nBecause he heard the drinks were on the house! Arrr, no rum, no fun!",
 'ollama:gemma2:9b': "Ahoy, matey! Ye want a tale 'bout ol' Jack Sparrow? \n\nGather 'round and listen close:\n\nWhy did Captain Jack Sparrow always carry two compasses? \n\nTo be sure he wasn't lost at sea...and to have one to point the way to the nearest grog stash! No rum, no fun, ye hear?  🍻💀\n\n\n",
 'ollama:phi4:latest': 'Ahoy there, matey! Gather \'round for a tale o\' ol\' Cap\'n Jack Sparrow!\n\nSo, what happens when you mix Captain Jack Sparrow with a chicken?\n\nYou get... "Cluckin\' up the wrong ship!"\n\nArrr, no rum, no fun! But remember, ye never know where yer adventure will take ye next!'}

## Query LLMs with real TextEnt data

- for each document, load the pre-generated summary
- based on the summary, for each doc generate 3 prompts (metadata, metadata + incipit, metadata + summary)
- iterate over doc, iterate over prompts per doc, iterate over models, then query with triples (docu, model, prompt)

- start with a spacy document
- load the corresponding pre-generated summary
- define a `build_prompts` function that takes a `spacy_doc` as input and returns a list of tuples `('prompt-id', 'prompt-message')` 

In [98]:
import random
from pathlib import Path
from textentlib.prompting import pre_generate_prompts
from textentlib.utils import load_or_create_corpus, nlp_model_fr

In [99]:
SPACY_CORPUS_SERIALIZED_PATH = Path("../data/corpus_24022025.spacy")
PRE_GENERATED_PROMPTS_PATH = Path("../data/prompts/pregenerated")    
SAMPLE_SIZE = 50

In [100]:
spacy_corpus = load_or_create_corpus(SPACY_CORPUS_SERIALIZED_PATH)

Loaded serialize spacy corpus from ../data/corpus_24022025.spacy
Number of documents in the corpus: 594
Number of entities in the corpus: 287389
Number of tokens in the corpus: 12885306


In [101]:
docs = spacy_corpus.get_docs(nlp_model_fr.vocab)
docs = list(docs)

In [102]:
# TODO:
# - we may want to exclude documents in the validation set
# - we may want to exclude documents that are very long (> 150k tokens)
sampled_docs = random.sample(docs, SAMPLE_SIZE)

In [103]:
len(sampled_docs)

50

In [104]:
len(sampled_docs)

50

In [105]:
pre_generate_prompts(sampled_docs, PRE_GENERATED_PROMPTS_PATH)

Pre-generating prompts: 100%|██████████| 50/50 [00:01<00:00, 30.27it/s]







In [9]:
from pathlib import Path
from dataclasses import dataclass

@dataclass
class LLMrequest:
    prompt_id: str
    document_id: str
    prompt_path: Path
    prompt: str

@dataclass
class LLMresponse:
    document_id: str
    prompt_id: str
    prompt: str
    model_name: str
    response: str

In [10]:
import aisuite as ai

client = ai.Client()
client.configure({
  "ollama" : {
    "timeout": 600,
  }
})

In [26]:
from typing import List, Dict, Tuple

def serialize_llm_responses(responses: List[LLMresponse], output_path: Path) -> None:

    for response in responses:

        output_dir = Path(output_path / response.document_id)

        if not output_dir.exists():
            output_dir.mkdir(parents=True, exist_ok=True)

        filename = f"{response.document_id}_{response.prompt_id}_{response.model_name.replace(':', '-')}.txt"
        filepath = output_path / response.document_id / filename
        response_trimmed = response.response.replace('```json', '').replace('```', '').strip()

        with filepath.open("w", encoding="utf-8") as file:
            file.write(response_trimmed)
    return

def query_llm(model: str, requests: List[LLMrequest], output_path: Path) -> List[LLMresponse]:
    # pass over the requests to a given model and gather the responses
    responses = []
    for request in requests:
        # Avoid asking the model, if an answer file already exists
        filename = f"{request.document_id}_{request.prompt_id}_{model.replace(':', '-')}.txt"
        filepath = output_path / request.document_id / filename
        if filepath.exists():
            #print(f"Found file {filepath} for document {request.document_id}")
            print(f"Skipping request for document {request.document_id}[{request.prompt_id}] using model {model} as it already exists")
            continue
            
        print(f"Processing prompt {request.prompt_id} for document {request.document_id} using model {model}")
        response = client.chat.completions.create(model=model, messages=[{"role": "user", "content": request.prompt}])
        llm_response = LLMresponse(
            document_id=request.document_id,
            prompt_id=request.prompt_id,
            prompt=request.prompt,
            model_name=model,
            response=response.choices[0].message.content
        )
        responses.append(llm_response)
        serialize_llm_responses([llm_response], output_path)
    return responses

In [86]:
import re
import json
import pandas as pd
from typing import Dict

import contextlib
import re
import json

JSON_PATTERN = re.compile(r"```json\n(.*?)```", re.DOTALL)
DIRECT_JSON_PATTERN = re.compile(r"\{[^}]*\}", re.DOTALL)


def try_extract_json_from_text(text: str) -> tuple[str, dict | None]:
    # function taken from https://danielvanstrien.xyz/posts/2025/deepseek/distil-deepseek-modernbert.html
    if match := JSON_PATTERN.search(text):
        json_results = match.group(1)
        with contextlib.suppress(json.JSONDecodeError):
            return text, json.loads(json_results)
    if match := DIRECT_JSON_PATTERN.search(text):
        json_text = match.group(0)
        with contextlib.suppress(json.JSONDecodeError):
            return text, json.loads(json_text)
    return text, None

def process_json_response(response_raw: str) -> Dict:
    # is response empty?
    # is response valid JSON?
    
    output_dict = {}
    output_dict['is_response_empty'] = True if response_raw.strip() == '' else False

    try:
        response_json = json.loads(response_raw)
        output_dict['is_response_valid_json'] = True
        output_dict.update(response_json)
    except json.JSONDecodeError:
        output_dict['is_response_valid_json'] = False
        _, response_json = try_extract_json_from_text(response_raw)
        if response_json:
            output_dict.update(response_json)
    return output_dict

def process_llm_responses(llm_responses_path: Path) -> pd.DataFrame:
    # each sub-folder contains the responses for a given document
    # we need to group the responses by model so that separate dataframes can be generated
    responses = []
    all_response_files = list(llm_responses_path.glob('*/*.txt'))
    for file_path in all_response_files:
        doc_id, prompt_id, model_id = file_path.name.replace('.txt', '').split('_')
        with file_path.open("r", encoding="utf-8") as file:
            response_raw = file.read()
        response = {
            "document_id": doc_id,
            "prompt_id": prompt_id,
            "model_id": model_id,
            "response_raw": response_raw
        }

        response_content = process_json_response(response_raw)
        response.update(response_content)
        responses.append(response)
    return pd.DataFrame(responses)

In [87]:
llm_responses_path = Path('../data/llm_responses')
data = process_llm_responses(llm_responses_path)

In [89]:
data[data['is_response_valid_json']==True]

Unnamed: 0,document_id,prompt_id,model_id,response_raw,is_response_empty,is_response_valid_json,period,period_reasoning,timeframe_start,timeframe_end,location,location_reasoning,location_qid
0,bpt6k15110748,prompt-excerpt,anthropic-claude-3-7-sonnet-20250219,"{\n ""period"": ""Ancient Greece or Persia"",\n...",False,True,Ancient Greece or Persia,The excerpt mentions a character with a royal ...,0550-01-01,0330-01-01,Ancient Persia (Achaemenid Empire),"The play is titled 'Panthée', which likely ref...",Q47246
1,bpt6k15110748,prompt-metadata,anthropic-claude-3-7-sonnet-20250219,"{\n ""period"": ""Antiquity, Achaemenid Empire...",False,True,"Antiquity, Achaemenid Empire",Panthée is likely based on the story of Panthe...,0550-01-01,0530-01-01,Ancient Persia (modern-day Iran),The story of Panthea takes place in the Persia...,Q794
4,bpt6k15110748,prompt-summary,anthropic-claude-3-7-sonnet-20250219,"{\n ""period"": ""Ancient Persian Empire (Acha...",False,True,Ancient Persian Empire (Achaemenid Dynasty),"The play prominently features Cyrus, who appea...",0559-01-01,0530-01-01,Persia (Ancient Persian Empire),Among the top 5 places mentioned are 'Perse' (...,Q47246
5,bpt6k15110748,prompt-excerpt,ollama-gemma2-9b,"{\n ""period"": ""16th-18th century Europe"",\n ...",False,True,16th-18th century Europe,The play is a French tragedy from the XVII cen...,1500-01-01,1800-12-31,France or a European kingdom influenced by Fre...,"The author is Tristan L'Hermite, a French play...",
7,bpt6k15110748,prompt-metadata,ollama-phi4-latest,"{\n ""period"": ""Ancient Greece"",\n ""perio...",False,True,Ancient Greece,"Tristan L'Hermite's play 'Panthée, tragédie de...",-800-01-01,-146-12-31,Greece,Given that the play is a tragedy and likely in...,Q48
9,bpt6k15110748,prompt-summary,ollama-gemma2-9b,"{\n ""period"": ""17th century"",\n ""period_...",False,True,17th century,The metadata indicates the play was published ...,1600-01-01,1700-12-31,Perse,The text mentions Cyrus and the play likely dr...,Q895
11,bpt6k15110748,prompt-metadata,ollama-gemma2-9b,"{\n ""period"": ""Classical antiquity"",\n ""...",False,True,Classical antiquity,"The title 'Panthée' directly refers to Pan, a ...",1000-01-01,500-01-01,Ancient Greece,"The mention of Pan, a Greek god, strongly sugg...",Q394
12,bpt6k8569801,prompt-metadata,ollama-gemma2-9b,"{\n ""period"": ""17th century"",\n ""period_reas...",False,True,17th century,"The publication date is 1699, placing it withi...",1600-01-01,1700-01-01,Lyon,"The title of the play is 'Le Carnaval de Lyon,...",Q894
13,bpt6k8569801,prompt-excerpt,ollama-gemma2-9b,"{\n ""period"": ""17th century"",\n ""period_...",False,True,17th century,The metadata indicates the play was published ...,1600-01-01,1700-01-01,Lyon,The title of the play is 'Le Carnaval de Lyon'...,Q24683
15,bpt6k8569801,prompt-metadata,ollama-phi4-latest,"{\n ""period"": ""Late 17th century France"",\n...",False,True,Late 17th century France,"The play was published in 1699, and it is titl...",1698-01-01,1700-12-31,"Lyon, France",The title 'Le Carnaval de Lyon' directly sugge...,Q1524


In [80]:
data[(data['document_id'] == 'bpt6k15110748') & (data['prompt_id'] == 'prompt-summary')]

Unnamed: 0,document_id,prompt_id,model_id,response_raw,is_response_empty,is_response_valid_json,period,period_reasoning,timeframe_start,timeframe_end,location,location_reasoning,location_qid
3,bpt6k15110748,prompt-summary,ollama-llama3.2-latest,,True,False,,,,,,,
4,bpt6k15110748,prompt-summary,anthropic-claude-3-7-sonnet-20250219,"{\n ""period"": ""Ancient Persian Empire (Acha...",False,True,Ancient Persian Empire (Achaemenid Dynasty),"The play prominently features Cyrus, who appea...",0559-01-01,0530-01-01,Persia (Ancient Persian Empire),Among the top 5 places mentioned are 'Perse' (...,Q47246
9,bpt6k15110748,prompt-summary,ollama-gemma2-9b,"{\n ""period"": ""17th century"",\n ""period_...",False,True,17th century,The metadata indicates the play was published ...,1600-01-01,1700-12-31,Perse,The text mentions Cyrus and the play likely dr...,Q895
10,bpt6k15110748,prompt-summary,ollama-phi4-latest,"{\n ""period"": ""Achaemenid Empire, particula...",False,False,,,,,,,


In [69]:
data[(data['is_response_empty'] == False) & (data['is_response_valid_json'] == False)]

Unnamed: 0,document_id,prompt_id,model_id,response_raw,is_response_empty,is_response_valid_json,period,period_reasoning,timeframe_start,timeframe_end,location,location_reasoning,location_qid
6,bpt6k15110748,prompt-excerpt,ollama-phi4-latest,"{\n ""period"": ""17th century France"",\n ""...",False,False,,,,,,,
10,bpt6k15110748,prompt-summary,ollama-phi4-latest,"{\n ""period"": ""Achaemenid Empire, particula...",False,False,,,,,,,
14,bpt6k8569801,prompt-excerpt,ollama-phi4-latest,"{\n ""period"": ""Late 17th century France"",\n...",False,False,,,,,,,
27,btv1b8622118r,prompt-summary,ollama-phi4-latest,"{\n ""period"": ""17th century Europe"",\n ""...",False,False,,,,,,,


In [15]:
llm_requests = []

for subdir in Path('../data/prompts/pregenerated').iterdir():
    for file in subdir.iterdir():
        doc_id, prompt_id = file.name.split('_')
        prompt_id = prompt_id.split('.')[0]
        prompt = file.read_text()
        llm_requests.append(LLMrequest(prompt_id, doc_id, file, prompt))

In [16]:
print(len(llm_requests))

147


In [27]:
llm_responses = []
llm_responses_path = Path('../data/llm_responses')

models = [
    #"ollama:deepseek-r1:8b",
    #"ollama:llama:3.3:latest",
    #"deepseek:deepseek-reasoner",
    "anthropic:claude-3-7-sonnet-20250219",
    #"ollama:llama3.2:latest",
    #"ollama:gemma2:9b",
    #"ollama:phi4:latest"
]

for model in models:
    llm_responses += query_llm(model, llm_requests[:3], llm_responses_path)

serialize_llm_responses(llm_responses, Path(llm_responses_path))

Processing prompt prompt-summary for document bpt6k15110748 using model anthropic:claude-3-7-sonnet-20250219
Processing prompt prompt-metadata for document bpt6k15110748 using model anthropic:claude-3-7-sonnet-20250219
Processing prompt prompt-excerpt for document bpt6k15110748 using model anthropic:claude-3-7-sonnet-20250219


In [28]:
for r in llm_responses:
    print(f'Model: {r.model_name}; Prompt: {r.prompt}')
    print(f'Response: {r.response}')

Model: anthropic:claude-3-7-sonnet-20250219; Prompt: Look at the following JSON object describing a theatre play in French (XVII century); the `metadata` property contains basic information about the play (author, title, publication date), while the `context` property contains information about the people and places that are most frequently mentioned in the play (such as label, mention frequency, and salient sentences where it appears).

INPUT:
```json
{
  "metadata": {
    "author": "Tristan L'Hermite",
    "title": "Panthée, tragédie de M. de Tristan",
    "publication_date": "1639",
    "document_id": "bpt6k15110748"
  },
  "context": {
    "people": {
      "top_1_person": {
        "entity": {
          "label": "CYRUS",
          "frequency": 3
        },
        "related_sentences": [
          "PANTHÉE\n Seigneur, votre bonté s'est acquise Abradate J'ai dépêché des miens pour lui faire savoir Qu'elles sont vos vertus, et quel est son devoir: S'il n'a changé d'esprit j'ose bien 

## `astropy` detour

In [148]:
import numpy as np
from astropy.time import Time

In [240]:
Range = namedtuple('Range', ['start', 'end'])

In [199]:
r1 = Range(start=Time(-480, format='jyear'), end=Time(-430, format='jyear'))

In [244]:
r2 = Range(start=Time(-300, format='jyear'), end=Time(210, format='jyear'))

In [245]:
latest_start = max(r1.start, r2.start)
earliest_end = min(r1.end, r2.end)
delta = (earliest_end - latest_start)

In [246]:
delta.jd // 365

np.float64(-131.0)

In [243]:
len(np.arange(r1.start.jd, r1.end.jd)) // 365

50

In [247]:
len(np.arange(r2.start.jd, r2.end.jd)) // 365

510