## Imports

In [1]:
import aisuite as ai

In [2]:
# Create the aisuite client and raise the timeout for the local Ollama provider
# (600 is presumably seconds -- confirm against the aisuite provider docs).
client = ai.Client()
client.configure({"ollama": {"timeout": 600}})

## Test `aisuite` with dummy prompts

In [3]:
# Dummy conversation used to smoke-test every model: a system instruction that
# fixes the tone, followed by a single user request.
messages = [
    dict(
        role="system",
        content="Respond in Pirate English. Always try to include the phrase - No rum No fun.",
    ),
    dict(
        role="user",
        content="Tell me a joke about Captain Jack Sparrow",
    ),
]

In [24]:
!ollama list

NAME               ID              SIZE      MODIFIED          
phi4:latest        ac896e5b8b34    9.1 GB    55 seconds ago       
gemma2:9b          ff02c3702f32    5.4 GB    41 minutes ago       
llama3.2:latest    a80c4f17acd5    2.0 GB    About an hour ago    
deepseek-r1:8b     28f8fd6cdc67    4.9 GB    2 hours ago          
llama3.3:latest    a6eb4748fd29    42 GB     3 weeks ago          


In [27]:
# Models to compare. Each identifier is "ollama:" followed by a model tag as
# printed by `ollama list`.
models = [
    "ollama:llama3.2:latest",
    "ollama:gemma2:9b",
    "ollama:phi4:latest",
    # Currently disabled; uncomment to include them in the comparison.
    # "ollama:deepseek-r1:8b",
    # "ollama:llama3.3:latest",
]

In [28]:
# Send the same conversation to every model and keep the text of the first
# choice, keyed by model identifier. Filling the dict inside the loop means the
# replies collected so far are kept if a later model fails.
replies = {}

for model_id in models:
    completion = client.chat.completions.create(
        messages=messages,
        model=model_id,
    )
    first_choice = completion.choices[0]
    replies[model_id] = first_choice.message.content

In [22]:
# One line per model with the character count of its reply.
for model_id, reply_text in replies.items():
    print(f"Model: {model_id}; reply's length = {len(reply_text)}")

Model: ollama:llama3.2:latest; reply's length = 243
Model: ollama:gemma2:9b; reply's length = 304


In [29]:
# Display the full reply text collected for each model.
replies

{'ollama:llama3.2:latest': "Yer lookin' fer a joke about that scurvy dog, eh? Alright then, listen close:\n\nWhy did Captain Jack Sparrow bring a ladder aboard his ship?\n\nBecause he heard the drinks were on the house! Arrr, no rum, no fun!",
 'ollama:gemma2:9b': "Ahoy, matey! Ye want a tale 'bout ol' Jack Sparrow? \n\nGather 'round and listen close:\n\nWhy did Captain Jack Sparrow always carry two compasses? \n\nTo be sure he wasn't lost at sea...and to have one to point the way to the nearest grog stash! No rum, no fun, ye hear?  🍻💀\n\n\n",
 'ollama:phi4:latest': 'Ahoy there, matey! Gather \'round for a tale o\' ol\' Cap\'n Jack Sparrow!\n\nSo, what happens when you mix Captain Jack Sparrow with a chicken?\n\nYou get... "Cluckin\' up the wrong ship!"\n\nArrr, no rum, no fun! But remember, ye never know where yer adventure will take ye next!'}

## Query LLMs with real TextEnt data

- for each document, load the pre-generated summary
- based on the summary, for each doc generate 3 prompts (metadata, metadata + incipit, metadata + summary)
- iterate over the documents, then over each document's prompts, then over the models, and query with each (document, model, prompt) triple

- start with a spacy document
- load the corresponding pre-generated summary
- define a `build_prompts` function that takes a `spacy_doc` as input and returns a list of tuples `('prompt-id', 'prompt-message')` 

In [78]:
import json
from tqdm import tqdm
from pathlib import Path
from typing import List, Dict, Tuple
from spacy.tokens import Doc

def build_summary_prompt(spacy_doc: Doc) -> str:
    """
    Builds the summary-based prompt for a spaCy document.

    The base prompt template is read from "../data/prompts/summary_prompt.txt" and
    the document's pre-computed summary from
    "../data/summaries/<document_id>_summary.json". The summary is pretty-printed as
    JSON and substituted for the template's ``{document_summary}`` placeholder.

    Args:
        spacy_doc (Doc): A spaCy document whose ``user_data['document_id']`` identifies
            the summary file to load.

    Returns:
        str: The base prompt with the document summary filled in.

    Raises:
        KeyError: If ``user_data`` has no 'document_id' entry.
        FileNotFoundError: If the prompt template or the summary file does not exist.
    """
    base_prompt_path = Path("../data/prompts/summary_prompt.txt")
    summaries_path = Path("../data/summaries")
    doc_summary_path = summaries_path / f"{spacy_doc.user_data['document_id']}_summary.json"

    # Read the template explicitly as UTF-8, like the summary below, so that the
    # result does not depend on the platform's default text encoding.
    base_prompt = base_prompt_path.read_text(encoding="utf-8")

    # load the pre-computed summary from its JSON file
    with doc_summary_path.open('r', encoding='utf-8') as file:
        summary = json.load(file)

    # JSON to pretty string; ensure_ascii=False keeps non-ASCII characters as-is
    # instead of \uXXXX escapes
    summary_as_string = json.dumps(summary, indent=2, ensure_ascii=False)
    return base_prompt.format(document_summary=summary_as_string)

def build_incipit_prompt(spacy_doc: Doc) -> str:
    """
    Placeholder for the incipit-based prompt builder.

    Not implemented yet: the argument is ignored and the function returns ``None``
    rather than the ``str`` its annotation promises.
    """
    return None

def build_prompts(spacy_doc: Doc) -> List[Tuple[str, str]]:
    """
    Builds prompts based on a spaCy document.

    Every registered builder is called with the document, in the order listed below.
    A builder that returns ``None`` (i.e. one that is still an unimplemented stub) is
    skipped, so the result only contains prompts that can actually be written or sent.

    Args:
        spacy_doc (Doc): A spaCy document object containing the text and metadata.

    Returns:
        List[Tuple[str, str]]: A list of tuples where each tuple contains a prompt ID and its text.
    """
    # (prompt ID, builder) pairs. The previous code passed both tuples to a single
    # list.append() call, which raises TypeError because append() takes one argument.
    prompt_builders = (
        ('prompt-w-summary', build_summary_prompt),
        ('prompt-incipit', build_incipit_prompt),
    )

    prompts = []
    for prompt_id, builder in prompt_builders:
        prompt_text = builder(spacy_doc)
        if prompt_text is None:
            # builder not implemented yet: nothing to emit for this prompt ID
            continue
        prompts.append((prompt_id, prompt_text))
    return prompts

In [73]:
def pre_generate_prompts(spacy_docs: List[Doc], output_path: Path) -> None:
    """
    Builds the prompts of each document and writes them to disk.

    Every prompt returned by ``build_prompts`` is written to
    ``<output_path>/<doc_id>/<doc_id>_<prompt_id>.txt``, where ``doc_id`` is the
    document's ``user_data["document_id"]``. Files that already exist are overwritten.

    Args:
        spacy_docs (List[Doc]): The spaCy documents to generate prompts for.
        output_path (Path): Root directory under which one sub-directory per
            document is created.

    Returns:
        None
    """
    # Accept plain strings as well as Path objects.
    output_path = Path(output_path)

    for spacy_doc in tqdm(spacy_docs, desc="Pre-generating prompts"):
        doc_id = spacy_doc.user_data["document_id"]
        prompts = build_prompts(spacy_doc)

        # exist_ok=True already covers the "directory is there" case, so no
        # separate existence check is needed.
        directory_path = output_path / doc_id
        directory_path.mkdir(parents=True, exist_ok=True)

        for prompt_id, prompt in prompts:
            print(f"Writing prompt {prompt_id} for document {doc_id}")
            prompt_path = directory_path / f"{doc_id}_{prompt_id}.txt"
            # Write explicitly as UTF-8: prompts embed summaries dumped with
            # ensure_ascii=False, and the platform default encoding may not be
            # able to represent those characters.
            prompt_path.write_text(prompt, encoding="utf-8")

In [8]:
import random
from textentlib.utils import load_or_create_corpus, nlp_model_fr

In [74]:
# Input/output locations, relative to the notebook's working directory.
SPACY_CORPUS_SERIALIZED_PATH = Path("..") / "data" / "corpus_24012025.spacy"
PRE_GENERATED_PROMPTS_PATH = Path("..") / "data" / "prompts" / "pregenerated"

# Number of documents drawn from the corpus for prompt generation.
SAMPLE_SIZE = 10

In [19]:
# Load the corpus from its serialized file. NOTE(review): the helper's name suggests
# it builds the corpus when the file is missing -- confirm in textentlib.utils.
spacy_corpus = load_or_create_corpus(SPACY_CORPUS_SERIALIZED_PATH)

Loaded serialize spacy corpus from ../data/corpus_24012025.spacy
Number of documents in the corpus: 594
Number of entities in the corpus: 287389
Number of tokens in the corpus: 12885306


In [20]:
# Collect the corpus documents into a list so they can be counted and sampled.
docs = list(spacy_corpus.get_docs(nlp_model_fr.vocab))

In [75]:
# TODO:
# - we may want to exclude documents in the validation set
# - we may want to exclude documents that are very long (> 150k tokens)

# Fixed seed so that re-running the notebook selects the same documents (and
# therefore regenerates the same prompt files). A dedicated Random instance is
# used so the global random state is left untouched.
SAMPLE_SEED = 42
sampled_docs = random.Random(SAMPLE_SEED).sample(docs, SAMPLE_SIZE)

In [76]:
# Sanity check: the number of sampled documents should equal SAMPLE_SIZE.
len(sampled_docs)

10

In [79]:
# Write the prompts of every sampled document under PRE_GENERATED_PROMPTS_PATH.
pre_generate_prompts(sampled_docs, PRE_GENERATED_PROMPTS_PATH)

Pre-generating prompts: 100%|██████████| 10/10 [00:00<00:00, 258.25it/s]

Writing prompt prompt-w-summary for document bpt6k1090087k
Writing prompt prompt-w-summary for document bpt6k10900457
Writing prompt prompt-w-summary for document bpt6k1280456m
Writing prompt prompt-w-summary for document bpt6k1090099s
Writing prompt prompt-w-summary for document bpt6k1280403g
Writing prompt prompt-w-summary for document bpt6k1090071q
Writing prompt prompt-w-summary for document bpt6k56266087
Writing prompt prompt-w-summary for document bpt6k10901460
Writing prompt prompt-w-summary for document bpt6k1521653f
Writing prompt prompt-w-summary for document bpt6k56285481



