# Run validation of LLM-judge evaluation

## Imports

In [None]:
import aisuite as ai
from pathlib import Path
from textentlib.utils import read_configuration
from textentlib.llm_utils import fetch_prompts
from textentlib.llm_utils import query_llm, serialize_llm_responses, LLMrequest, LLMresponse, gt_metadata_to_dataframe

## Read configuration

In [11]:
# Load the project configuration. The relative path assumes the kernel's
# working directory is the folder that contains this notebook.
config = read_configuration(Path('../data/config.yaml'))
# Model identifiers to validate, read from the `validation.models` config entry.
llms = config['validation']['models']
# Repository root, derived from the same '..' used to locate the config file
# rather than from a machine-specific absolute path, so the notebook also runs
# on other checkouts.
# NOTE(review): assumes this notebook lives one level below the repository root — confirm.
base_path = Path('..').resolve()
# Both locations are stored in the config relative to the repository root.
gt_path = base_path / config['validation']['groundtruth_path']
pregen_prompts_path = base_path / config['validation']['pregenerated_prompts_path']

In [12]:
# Print the configured model identifiers, one per line.
print('\n'.join(llms))

ollama:phi4-mini:latest
ollama:gemma2:9b
openai:o1-mini
openai:gpt-4o
deepseek:deepseek-reasoner
anthropic:claude-3-7-sonnet-20250219


## Get validation docs and related prompts

In [4]:
# Load the ground-truth metadata (presumably one row per document — confirm
# against gt_metadata_to_dataframe).
df_gt_metadata = gt_metadata_to_dataframe(Path(gt_path))
columns_to_keep = [
    'author',
    'title',
    'publication_date',
    'document_length',
    'keep_fine_tuning',
]
# Keep only rows with exclude == 0 and annotated == 1, restricted to the
# columns listed above.
df_annotated_docs = df_gt_metadata.loc[
    (df_gt_metadata['exclude'] == 0) & (df_gt_metadata['annotated'] == 1),
    columns_to_keep,
]

In [5]:
# Validation documents: annotated rows whose keep_fine_tuning flag equals 1.
df_validation_docs = df_annotated_docs.loc[df_annotated_docs['keep_fine_tuning'] == 1]
# The dataframe index is used as the list of document identifiers.
validation_doc_ids = df_validation_docs.index.tolist()

In [6]:
# Display the identifiers of the selected validation documents.
validation_doc_ids

['bpt6k10901623',
 'bpt6k9807756q',
 'bpt6k852913n',
 'bpt6k5772699f',
 'bpt6k1090242p']

In [7]:
# Fetch the pregenerated prompts for the selected validation documents from
# the pregenerated-prompts directory.
llm_requests = fetch_prompts(Path(pregen_prompts_path), validation_doc_ids)

In [8]:
# Inspect the fetched requests.
# NOTE(review): this displays every request, including the full prompt text;
# consider showing only a small slice to keep the saved output compact.
llm_requests

[LLMrequest(prompt_id='prompt-excerpt.txt', document_id='bpt6k10901623', prompt_path=PosixPath('/Users/mromanel/Documents/UniGe-TextEnt/chrono-spatial-processing/data/prompts/pregenerated/bpt6k10901623/bpt6k10901623_prompt-excerpt.txt'), prompt='Look at the following JSON object describing a theatre play in French (XVII century); the `metadata` property contains basic information about the play (author, title, publication date), while the `excerpt` property contains an excerpt of 400 words sampled from around the middle of the document.\n\nINPUT:\n```json\n{\n  "metadata": {\n    "author": "Boisrobert, François de",\n    "title": "Théodore, Reyne de Hongrie, tragi-comédie",\n    "publication_date": "1658",\n    "document_id": "bpt6k10901623"\n  },\n  "excerpt": "re; Oui j\'ai pitié de vous, Prince, et je vous promets, Si vous vous repentez, de n\'y penser jamais, Je me reprocherai cette ardeur enragée, Comme si on l\'avais bizarrement songée, Revenez donc à vous, ouvrez, ouurez les yeu

In [13]:
# Directory passed to query_llm alongside the requests
# (presumably where the raw responses get written — confirm in textentlib.llm_utils).
llm_responses_path = Path('../data/validation/llm_responses')
# Accumulates whatever query_llm returns for each queried model.
llm_responses = []

# aisuite client; the ollama provider gets a timeout of 600
# (presumably seconds — TODO confirm against the aisuite provider).
client = ai.Client()
client.configure({
    "ollama": {"timeout": 600},
})

# NOTE(review): the slice restricts this run to the first configured model only;
# widen it deliberately when the remaining (remote) models should be queried.
for model in llms[:1]:
    llm_responses.extend(
        query_llm(client, model, llm_requests, llm_responses_path)
    )

Processing prompt prompt-excerpt.txt for document bpt6k10901623 using model ollama:phi4-mini:latest
Time taken to get response: 48.05 seconds. Total tokens: None
Processing prompt prompt-metadata.txt for document bpt6k10901623 using model ollama:phi4-mini:latest
Time taken to get response: 38.98 seconds. Total tokens: None
Processing prompt prompt-summary.txt for document bpt6k10901623 using model ollama:phi4-mini:latest
Time taken to get response: 72.21 seconds. Total tokens: None
Processing prompt prompt-excerpt.txt for document bpt6k1090242p using model ollama:phi4-mini:latest
Time taken to get response: 42.82 seconds. Total tokens: None
Processing prompt prompt-metadata.txt for document bpt6k1090242p using model ollama:phi4-mini:latest
Time taken to get response: 46.73 seconds. Total tokens: None
Processing prompt prompt-summary.txt for document bpt6k1090242p using model ollama:phi4-mini:latest
Time taken to get response: 64.53 seconds. Total tokens: None
Processing prompt prompt-s

KeyboardInterrupt: 

## Prepare data for scoring

## Ask LLM-judge

- read LLM responses, read GT, and combine the two
- select the samples (n=5) we set aside from the GT dataset
- fetch the scoring criteria (from a file?)
- prepare a prompt with: prediction, reference (GT), and scoring criteria
- ask the LLM and store the answer (scores + reasoning) (`data/validation/llm_judge`)
- do this on all responses for the validation documents
    - produce scores dataframe, to be then compared with human scoring

## Compute inter-annotator agreement (IAA) between scorers

- read in all the scores on the validation docs (human + LLM-judge)
- reshape the dataframe so that, for each document and each score type, all the assigned scores are grouped together
- for each score type compute the IAA
- compute an average IAA across all score types