# Run evaluation

## Imports

In [1]:
import pandas as pd
import shutil
import aisuite as ai
from pathlib import Path
from textentlib.utils import read_configuration
from textentlib.llm_utils import fetch_prompts, try_extract_json_from_text
from textentlib.llm_utils import prepare_evaluation_dataframe, query_llm, query_llm_judge

## Read configuration

In [2]:
config = read_configuration(Path('../data/config.yaml'))
eval_config = config['evaluation']
base_path = Path('/Users/mromanel/Documents/UniGe-TextEnt/chrono-spatial-processing/')
gt_path = base_path / eval_config['groundtruth_path']
scores_path = base_path / eval_config['scores_output_path']
pregen_prompts_path = base_path / eval_config['pregenerated_prompts_path']

In [107]:
df_scores_raw = pd.read_csv(list(scores_path.glob('*scores.tsv'))[0], sep='\t', index_col=0).drop(columns=['total_tokens'])

In [108]:
df_scores_raw.head(5)

Unnamed: 0,score_period_string,score_period_interval,score_location_string,score_location_qid,score_reasons,response_id,evaluator
0,0.0,0.0,0.0,0.0,The predicted period '17th century France' doe...,btv1b8607044w$prompt-excerpt.txt$ollama:mistra...,openai:o1-mini
1,0.0,0.0,0.5,0.0,The predicted period differs from the referenc...,btv1b8607044w$prompt-summary.txt$anthropic:cla...,openai:o1-mini
2,0.0,0.0,0.0,0.0,All prediction fields are missing or invalid (...,btv1b8607044w$prompt-summary.txt$ollama:phi4-m...,openai:o1-mini
3,0.5,0.5,0.5,0.0,The predicted period 'Classical Antiquity' is ...,btv1b8607044w$prompt-excerpt.txt$anthropic:cla...,openai:o1-mini
4,0.5,0.5,0.0,0.0,The predicted period 'Classical Antiquity' is ...,btv1b8607044w$prompt-summary.txt$deepseek:deep...,openai:o1-mini


In [109]:
def unpack_response_id(row):
    return {
        'document_id': row.split('$')[0],
        'prompt_id': row.split('$')[1],
        'model_name': row.split('$')[2],
    }

In [110]:
df_scores = pd.concat(
    [
        df_scores_raw.apply(lambda x: unpack_response_id(x.response_id), axis='columns', result_type='expand'),
        df_scores_raw,
    ],
    axis='columns',
)


In [111]:
df_scores.tail()

Unnamed: 0,document_id,prompt_id,model_name,score_period_string,score_period_interval,score_location_string,score_location_qid,score_reasons,response_id,evaluator
1561,bpt6k852919x,prompt-metadata.txt,openai:o1-mini,0.5,0.5,0.5,0.5,The predicted period '17th century' is narrowe...,bpt6k852919x$prompt-metadata.txt$openai:o1-mini,openai:o1-mini
1562,bpt6k852919x,prompt-summary.txt,ollama:deepseek-r1:14b,0.0,0.0,1.0,0.0,The predicted period 'Pre-Roman Gaul' does not...,bpt6k852919x$prompt-summary.txt$ollama:deepsee...,openai:o1-mini
1563,bpt6k852919x,prompt-excerpt.txt,ollama:deepseek-r1:14b,0.5,0.5,0.5,0.5,The predicted period '17th Century' is narrowe...,bpt6k852919x$prompt-excerpt.txt$ollama:deepsee...,openai:o1-mini
1564,bpt6k852919x,prompt-metadata.txt,deepseek:deepseek-reasoner,0.5,0.5,0.5,0.5,The period '17th century' is narrower than 'Ea...,bpt6k852919x$prompt-metadata.txt$deepseek:deep...,openai:o1-mini
1565,bpt6k852919x,prompt-summary.txt,ollama:gemma3:12b,0.5,0.5,1.0,0.0,The period 'XVII century' is more specific tha...,bpt6k852919x$prompt-summary.txt$ollama:gemma3:12b,openai:o1-mini


In [112]:
len(df_scores.document_id.unique())

58

In [124]:
df_scores['prompt_id'] = pd.Categorical(df_scores['prompt_id'], ['prompt-metadata.txt', 'prompt-excerpt.txt', 'prompt-summary.txt'])

In [159]:
df_scores['prompt_id'] = df_scores['prompt_id'].cat.rename_categories({
    'prompt-metadata.txt': 'metadata',
    'prompt-excerpt.txt': 'excerpt',
    'prompt-summary.txt': 'summary',
})

In [143]:
df_scores['model_name'] = pd.Categorical(
    df_scores['model_name'], 
    [
        'ollama:phi4-mini:latest',
        'ollama:gemma3:12b',
        'ollama:mistral-small:24b',
        'ollama:deepseek-r1:14b',
        'ollama:deepseek-r1:32b',
        'openai:gpt-4o',
        'openai:o1-mini',
        'deepseek:deepseek-reasoner',
        'anthropic:claude-3-7-sonnet-20250219'
    ]
)

In [162]:
df_scores['model_name'] = df_scores['model_name'].cat.rename_categories({
    'ollama:phi4-mini:latest': 'Phi4 Mini',
    'ollama:gemma3:12b': 'Gemma3',
    'ollama:mistral-small:24b': 'Mistral Small 24B',
    'ollama:deepseek-r1:14b': 'DeepSeek R1 (14B)',
    'ollama:deepseek-r1:32b': 'DeepSeek R1 (32B)',
    'openai:gpt-4o': 'GPT-4O',
    'openai:o1-mini': 'O1 Mini',
    'deepseek:deepseek-reasoner': 'DeepSeek R1 (671B)',
    'anthropic:claude-3-7-sonnet-20250219': 'Claude 3.7 Sonnet'
})

In [172]:
def is_correct(row, accuracy='strict'):
    result = {
        'accuracy': accuracy,
    }
    scores = ['score_period_string', 'score_period_interval', 'score_location_string', 'score_location_qid']
    for score in scores:
        if accuracy == 'strict':
            result[f'is_correct_{score}'] = row[score] == 1
        elif accuracy == 'lenient':
            result[f'is_correct_{score}'] = row[score] >= 0.5
        else:
            raise ValueError(f'Unknown accuracy: {accuracy}')
    return result

In [173]:
df_accuracy_strict = pd.concat(
    [
        df_scores.apply(lambda x: is_correct(x), axis='columns', result_type='expand'),
        df_scores
    ], axis='columns'
)

In [174]:
df_accuracy_lenient = pd.concat(
    [
        df_scores.apply(lambda x: is_correct(x, accuracy='lenient'), axis='columns', result_type='expand'),
        df_scores
    ], axis='columns'
)

In [175]:
df_accuracy_strict.head(2)

Unnamed: 0,accuracy,is_correct_score_period_string,is_correct_score_period_interval,is_correct_score_location_string,is_correct_score_location_qid,document_id,prompt_id,model_name,score_period_string,score_period_interval,score_location_string,score_location_qid,score_reasons,response_id,evaluator
0,strict,False,False,False,False,btv1b8607044w,excerpt,Mistral Small 24B,0.0,0.0,0.0,0.0,The predicted period '17th century France' doe...,btv1b8607044w$prompt-excerpt.txt$ollama:mistra...,openai:o1-mini
1,strict,False,False,False,False,btv1b8607044w,summary,Claude 3.7 Sonnet,0.0,0.0,0.5,0.0,The predicted period differs from the referenc...,btv1b8607044w$prompt-summary.txt$anthropic:cla...,openai:o1-mini


In [176]:
df_accuracy_strict.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1566 entries, 0 to 1565
Data columns (total 15 columns):
 #   Column                            Non-Null Count  Dtype   
---  ------                            --------------  -----   
 0   accuracy                          1566 non-null   object  
 1   is_correct_score_period_string    1566 non-null   bool    
 2   is_correct_score_period_interval  1566 non-null   bool    
 3   is_correct_score_location_string  1566 non-null   bool    
 4   is_correct_score_location_qid     1566 non-null   bool    
 5   document_id                       1566 non-null   object  
 6   prompt_id                         1566 non-null   category
 7   model_name                        1566 non-null   category
 8   score_period_string               1566 non-null   float64 
 9   score_period_interval             1566 non-null   float64 
 10  score_location_string             1566 non-null   float64 
 11  score_location_qid                1566 non-null   float64 
 1

In [184]:
df_accuracy_strict_pct = pd.pivot_table(
    df_accuracy_strict,
    index=['model_name', 'prompt_id'],
    aggfunc='sum',
    columns=['accuracy'],
    values=['is_correct_score_period_string', 'is_correct_score_period_interval', 'is_correct_score_location_string', 'is_correct_score_location_qid'],
    observed=False
).apply(lambda x: round(x*100/len(df_accuracy_strict.document_id.unique()),2)).reorder_levels(order=[1,0], axis=1)

In [185]:
df_accuracy_strict_pct

Unnamed: 0_level_0,accuracy,strict,strict,strict,strict
Unnamed: 0_level_1,Unnamed: 1_level_1,is_correct_score_location_qid,is_correct_score_location_string,is_correct_score_period_interval,is_correct_score_period_string
model_name,prompt_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Phi4 Mini,metadata,1.72,0.0,0.0,0.0
Phi4 Mini,excerpt,1.72,0.0,0.0,0.0
Phi4 Mini,summary,1.72,1.72,0.0,1.72
Gemma3,metadata,3.45,8.62,1.72,6.9
Gemma3,excerpt,1.72,8.62,1.72,13.79
Gemma3,summary,22.41,37.93,0.0,6.9
Mistral Small 24B,metadata,8.62,18.97,0.0,17.24
Mistral Small 24B,excerpt,5.17,12.07,0.0,10.34
Mistral Small 24B,summary,13.79,37.93,1.72,24.14
DeepSeek R1 (14B),metadata,1.72,17.24,0.0,6.9


In [187]:
df_accuracy_lenient_pct = pd.pivot_table(
    df_accuracy_lenient,
    index=['model_name', 'prompt_id'],
    aggfunc='sum',
    columns=['accuracy'],
    values=['is_correct_score_period_string', 'is_correct_score_period_interval', 'is_correct_score_location_string', 'is_correct_score_location_qid'],
    observed=False,
).apply(lambda x: round(x*100/len(df_accuracy_lenient.document_id.unique()),2)).reorder_levels(order=[1,0], axis=1)

In [188]:
df_accuracy_lenient_pct

Unnamed: 0_level_0,accuracy,lenient,lenient,lenient,lenient
Unnamed: 0_level_1,Unnamed: 1_level_1,is_correct_score_location_qid,is_correct_score_location_string,is_correct_score_period_interval,is_correct_score_period_string
model_name,prompt_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Phi4 Mini,metadata,1.72,0.0,1.72,1.72
Phi4 Mini,excerpt,1.72,0.0,3.45,0.0
Phi4 Mini,summary,1.72,3.45,1.72,3.45
Gemma3,metadata,3.45,55.17,79.31,74.14
Gemma3,excerpt,3.45,62.07,77.59,79.31
Gemma3,summary,22.41,60.34,74.14,75.86
Mistral Small 24B,metadata,29.31,53.45,81.03,81.03
Mistral Small 24B,excerpt,20.69,39.66,72.41,68.97
Mistral Small 24B,summary,32.76,67.24,81.03,84.48
DeepSeek R1 (14B),metadata,5.17,31.03,55.17,51.72


In [190]:
df_accuracy_pct = pd.concat(
    [
        df_accuracy_strict_pct,
        df_accuracy_lenient_pct
    ],
    axis='columns',
)

In [191]:
df_accuracy_pct

Unnamed: 0_level_0,accuracy,strict,strict,strict,strict,lenient,lenient,lenient,lenient
Unnamed: 0_level_1,Unnamed: 1_level_1,is_correct_score_location_qid,is_correct_score_location_string,is_correct_score_period_interval,is_correct_score_period_string,is_correct_score_location_qid,is_correct_score_location_string,is_correct_score_period_interval,is_correct_score_period_string
model_name,prompt_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Phi4 Mini,metadata,1.72,0.0,0.0,0.0,1.72,0.0,1.72,1.72
Phi4 Mini,excerpt,1.72,0.0,0.0,0.0,1.72,0.0,3.45,0.0
Phi4 Mini,summary,1.72,1.72,0.0,1.72,1.72,3.45,1.72,3.45
Gemma3,metadata,3.45,8.62,1.72,6.9,3.45,55.17,79.31,74.14
Gemma3,excerpt,1.72,8.62,1.72,13.79,3.45,62.07,77.59,79.31
Gemma3,summary,22.41,37.93,0.0,6.9,22.41,60.34,74.14,75.86
Mistral Small 24B,metadata,8.62,18.97,0.0,17.24,29.31,53.45,81.03,81.03
Mistral Small 24B,excerpt,5.17,12.07,0.0,10.34,20.69,39.66,72.41,68.97
Mistral Small 24B,summary,13.79,37.93,1.72,24.14,32.76,67.24,81.03,84.48
DeepSeek R1 (14B),metadata,1.72,17.24,0.0,6.9,5.17,31.03,55.17,51.72


In [196]:
df_accuracy_pct.columns

MultiIndex([( 'strict',    'is_correct_score_location_qid'),
            ( 'strict', 'is_correct_score_location_string'),
            ( 'strict', 'is_correct_score_period_interval'),
            ( 'strict',   'is_correct_score_period_string'),
            ('lenient',    'is_correct_score_location_qid'),
            ('lenient', 'is_correct_score_location_string'),
            ('lenient', 'is_correct_score_period_interval'),
            ('lenient',   'is_correct_score_period_string')],
           names=['accuracy', None])

In [199]:
strict_accuracy_label = 'Strict'
lenient_accuracy_label = 'Lenient'


In [200]:
print(df_accuracy_pct.to_latex(
    float_format="%.2f",
    multicolumn=True,
    header=[
        (strict_accuracy_label, 'Loc. QID'),
        (strict_accuracy_label, 'Loc. Str.'),
        (strict_accuracy_label, 'Per. Str.'),
        (strict_accuracy_label, 'Per. Interv.'),
        (lenient_accuracy_label, 'Loc. QID'),
        (lenient_accuracy_label, 'Loc. Str.'),
        (lenient_accuracy_label, 'Per. Str.'),
        (lenient_accuracy_label, 'Per. Interv.')
    ],
    caption="Accuracy of the models' predictions on the test set, with strict and lenient accuracy.",
    label="tab:accuracy",
))

\begin{table}
\caption{Accuracy of the models' predictions on the test set, with strict and lenient accuracy.}
\label{tab:accuracy}
\begin{tabular}{llrrrrrrrr}
\toprule
 & accuracy & \multicolumn{4}{r}{Strict} & \multicolumn{4}{r}{Lenient} \\
 &  & Loc. QID & Loc. Str. & Per. Str. & Per. Interv. & Loc. QID & Loc. Str. & Per. Str. & Per. Interv. \\
model_name & prompt_id &  &  &  &  &  &  &  &  \\
\midrule
\multirow[t]{3}{*}{Phi4 Mini} & metadata & 1.72 & 0.00 & 0.00 & 0.00 & 1.72 & 0.00 & 1.72 & 1.72 \\
 & excerpt & 1.72 & 0.00 & 0.00 & 0.00 & 1.72 & 0.00 & 3.45 & 0.00 \\
 & summary & 1.72 & 1.72 & 0.00 & 1.72 & 1.72 & 3.45 & 1.72 & 3.45 \\
\cline{1-10}
\multirow[t]{3}{*}{Gemma3} & metadata & 3.45 & 8.62 & 1.72 & 6.90 & 3.45 & 55.17 & 79.31 & 74.14 \\
 & excerpt & 1.72 & 8.62 & 1.72 & 13.79 & 3.45 & 62.07 & 77.59 & 79.31 \\
 & summary & 22.41 & 37.93 & 0.00 & 6.90 & 22.41 & 60.34 & 74.14 & 75.86 \\
\cline{1-10}
\multirow[t]{3}{*}{Mistral Small 24B} & metadata & 8.62 & 18.97 & 0.00 & 17

## Data inspection

Write all predictions + groundtruth + scoring to a Markdown file, where some interesting cases can be inspected. 

In [35]:
from textentlib.llm_utils import llm_responses_to_dataframe
from textentlib.llm_utils import gt_annotations_to_dataframe, gt_metadata_to_dataframe

In [102]:
df_predictions = llm_responses_to_dataframe(Path(base_path / eval_config['responses_path']))
#df_predictions.head(2)

In [103]:
df_scores_raw = pd.read_csv(list(scores_path.glob('*scores.tsv'))[0], sep='\t', index_col=0).drop(columns=['total_tokens']).set_index('response_id')
#df_scores_raw.head(2)

In [104]:
df_metadata = gt_metadata_to_dataframe(gt_path)
#df_metadata.head(2)

In [105]:
df_gt = gt_annotations_to_dataframe(gt_path)
#df_gt.head(2)

"['Unnamed: 11', 'Unnamed: 12'] not found in axis"


In [96]:
from typing import List

def write_predictions_markdown(
        markdown_path: Path,
        models: List[str],
        prompts: List[str],
        predictions_df: pd.DataFrame,
        metadata_df: pd.DataFrame,
        gt_df: pd.DataFrame,
        scores_df: pd.DataFrame,
) -> None:
    
    markdown_output = ""

    document_markdown_template = """
# Document ID: {document_id}

- **author**: {author}
- **title**: {title}
- **publication date**: {publication_date}

{predictions}
    """

    prediction_markdown_template = """
## {model_name}-{prompt_id}

### Period

**Model reasoning**: {period_reasoning}

#### Period string
- **GT**: {gt_period_string}
- **Prediction**: {prediction_period_string}
- **Score**: {score_period_string} 

#### Period interval
- **GT**: {gt_period_interval}
- **Prediction**: {prediction_period_interval}
- **Score**: {score_period_interval}

### Location

**Model reasoning**: {location_reasoning}

#### Location string
- **GT**: {gt_location_string}
- **Prediction**: {prediction_location_string}
- **Score**: {score_location_string}

#### Location QID
- **GT**: {gt_location_qid}
- **Prediction**: {prediction_location_qid}
- **Score**: {score_location_qid}

#### Score reasoning
{score_reasoning}
    """

    subset = predictions_df[(predictions_df.model_name.isin(models)) & (predictions_df.prompt_id.isin(prompts))]
    document_ids = subset.document_id.unique()
    
    for document_id in document_ids:

        predictions_md = []
        for idx, row in subset[subset.document_id == document_id].sort_values(by='model_name').iterrows():
            score_row = scores_df.loc[idx]
            gt_row = gt_df.loc[document_id]

            predictions_md .append(prediction_markdown_template.format(
                model_name=row.model_name,
                prompt_id=row.prompt_id,
                # period information
                period_reasoning=row.pred_period_reasoning,
                gt_period_string=gt_row.gt_period,
                prediction_period_string=row.pred_period,
                score_period_string=score_row.score_period_string,
                gt_period_interval=gt_row.gt_timeframe,
                prediction_period_interval=row.pred_timeframe,
                score_period_interval=score_row.score_period_interval,
                score_reasoning=score_row.score_reasons,
                # location information
                location_reasoning=row.pred_location_reasoning,
                gt_location_string=gt_row.gt_preferred_location,
                prediction_location_string=row.pred_location,
                score_location_string=score_row.score_location_string,
                gt_location_qid=gt_row.gt_preferred_location_QID,
                prediction_location_qid=row.pred_location_qid,
                score_location_qid=score_row.score_location_qid,
            ))

        markdown_output += document_markdown_template.format(
            document_id=document_id,
            author=metadata_df.loc[document_id].author,
            title=metadata_df.loc[document_id].title,
            publication_date=metadata_df.loc[document_id].publication_date,
            predictions='\n'.join(predictions_md),
        )

    with open(markdown_path, 'w') as f:
        f.write(markdown_output)
    return

In [97]:
write_predictions_markdown(
    markdown_path=base_path / 'evaluation_report.md',
    models=[
        'anthropic:claude-3-7-sonnet-20250219',
        'deepseek:deepseek-reasoner',
        'ollama:mistral-small:24b'
    ],
    prompts=[
        'prompt-summary.txt',
        'prompt-excerpt.txt'
    ],
    predictions_df=df_predictions,
    scores_df=df_scores_raw,
    metadata_df=df_metadata,
    gt_df=df_gt,
)

## Error analysis

1. **Identify documents that are challenging for all models.**
Group scores by document, then take the max aggregated score for each group.
For which document is this score below a certain threshold, indicating challenging documents?

2. **Find cases where Mistral's prediction on time is better than Claude's or DeepSeek's.** 
Group scores by document. Consider only time-related scores (string, interval). For any document, check if Mistral's time-related scores are higher than the others; then check manually those cases.