### This notebook contains code to:
1. Generate candidate solutions for a set of tasks
2. Evaluate a set of candidate solutions in one file or in a set of files
3. Get evaluation stats by label or by usage of specific tools (functions): the share of the match score and its confidence interval (CI, Wald formula)

Note that the reference solutions (ground truth solutions) need to be in the set you send to the evaluation.

**Task_editor GUI** allows you to edit the task text, generate a draft ground truth solution for the task, manually correct it, and leave comments for the evaluator on which steps are optional and which arguments are acceptable.

##### Imports

In [None]:
# !pip install -r requirements.txt

In [7]:
import os
import json
from tqdm import tqdm

In [2]:
# Each geobenchx module is reloaded before names are imported from it, so that
# edits to the package source are picked up without restarting the kernel.
import importlib

import geobenchx.utils
importlib.reload(geobenchx.utils)
from geobenchx.utils import generate_timestamp_id, get_dataframe_info, get_solution_code

import geobenchx.dataclasses
importlib.reload(geobenchx.dataclasses)
from geobenchx.dataclasses import TaskSet


import geobenchx.agent
importlib.reload(geobenchx.agent)
from geobenchx.agent import execute_task

import geobenchx.evaluation
importlib.reload(geobenchx.evaluation)
from geobenchx.evaluation import score_solutions_set, generate_eval_stats, get_eval_stats_by_subsets

# NOTE: geobenchx.constants is not reloaded here; changes to it need a kernel
# restart (or an explicit reload) to take effect.
from geobenchx.constants import DATA_FOLDER, RESULTS_FOLDER, MODEL_CLAUDE, MODEL_GEMINI, MODEL_GPT_41, MODEL_GEMINI_ADV, MODEL_CLAUDE_mini, MODEL_O3, MODEL_O4, MODEL_GPT_mini, MODEL_CLAUDE_ADV4

import geobenchx.generate_solutions
importlib.reload(geobenchx.generate_solutions)
from geobenchx.generate_solutions import generate_solutions

In [3]:
# Model and temperature for this run. Both are embedded in the output file name
# and passed to generate_solutions and score_solutions_set in the cells below.
model = MODEL_GEMINI
temperature = 0

## Generate solutions

In [4]:
# Select the file with the task set to solve (it is read from DATA_FOLDER in the next cell)

tasks_filename = r"tasks_and_reference_solutions.json"

In [5]:
# Load the tasks from that file, looked up in DATA_FOLDER

tasks = TaskSet.read_from_file(tasks_filename, DATA_FOLDER)


In [6]:
len(tasks)

202

In [7]:
# Check the set composition
tasks.get_labels_counts()

{<TaskLabels.CONTROL: 'Control question'>: 3,
 <TaskLabels.SPATIAL_OPERATIONS: 'Spatial operations'>: 53,
 <TaskLabels.TASK_SET_03: 'Task Set 03'>: 53,
 <TaskLabels.HEATMAPS_CONTOUR_LINES: 'Heatmaps, Contour Lines'>: 54,
 <TaskLabels.TASK_SET_04: 'Task Set 04'>: 54,
 <TaskLabels.VAGUE: 'Vague'>: 4,
 <TaskLabels.MERGE_VISUALIZE: 'Merge, Visualize'>: 36,
 <TaskLabels.TASK_SET_01: 'Task Set 01'>: 36,
 <TaskLabels.PROCESS_MERGE_VISUALIZE: 'Process, Merge, Visualize'>: 56,
 <TaskLabels.TASK_SET_02: 'Task Set 02'>: 56}

In [8]:
# Create a standardized output file name from the model and the temperature.
# Each generation run overwrites the candidate solutions and metadata, so if you
# reuse one output file name, that file is overwritten every time you run the code.

output_tasks_filename = f'generated_solutions_{model}_temp_{temperature}.json'

In [9]:
output_tasks_filename


'generated_solutions_gemini-2.0-flash-001_temp_0.json'

In [None]:
# Generate candidate solutions for the loaded tasks.
# Double-check the output file name above before running this cell.
# Only the first of the three returned values is kept.

tasks_solved, _, _ = generate_solutions(
    tasks,
    model=model,
    temperature=temperature,
    output_filename=output_tasks_filename,
    max_steps=25,
    skip_solved=False,
    capture_history=True,
)

## Evaluate solutions (single file, single LLM) and get evaluation stats

In [None]:
# Name of the file (inside RESULTS_FOLDER) with tasks and candidate solutions to score.
# It is empty by default: fill it in before running the cells below.

taskset_to_evaluate = ''

In [None]:
# Score the solutions in the selected file. The function reads the tasks itself,
# so it is given the file name and folder rather than a loaded task set.

score_solutions_set(
    tasks_filename=taskset_to_evaluate,
    folder=RESULTS_FOLDER,
    model=model,
    temperature=temperature,
    skip_scored=False,
)

In [None]:
# Generate evaluation stats for all tasks. generate_eval_stats works on a loaded
# task set, so the file is read first.
# The loaded set gets its own name: `taskset_to_evaluate` must keep holding the
# file name, so that this cell can be re-run and other cells that expect the
# file name still receive it.

scored_taskset = TaskSet.read_from_file(taskset_to_evaluate, RESULTS_FOLDER)

eval_stats = generate_eval_stats(scored_taskset)
eval_stats

In [None]:
# Get evaluation stats for solvable tasks and unsolvable tasks (reject_task in reference solutions)
# You can use this function to get stats by task label if you added labels while creating tasks
# NOTE(review): elsewhere in this notebook this function receives a file name as its
# first argument. Make sure `taskset_to_evaluate` holds the file name (not a loaded
# TaskSet) when this cell runs.

get_eval_stats_by_subsets(taskset_to_evaluate, RESULTS_FOLDER, functions_names=['reject_task'])

## Batch final evaluation (multiple files, evaluated by multiple LLMs)


In [None]:
files_to_evaluate = [
# input the names of the files (located in RESULTS_FOLDER) with candidate solutions, as a Python list of strings
]

In [None]:
# Models used as evaluators: each file in files_to_evaluate is scored once per model listed here
models = [
    MODEL_CLAUDE,
    MODEL_GPT_41,
    MODEL_GEMINI_ADV
]

In [None]:
# Score every file once per evaluator model. For each (file, model) pair a copy
# named 'eval_by_<model>_<filename>' is saved to RESULTS_FOLDER, and that copy's
# name is what gets passed to score_solutions_set.
for filename in files_to_evaluate:
    # The source file does not change between models, so read it once per file.
    tasks_to_score = TaskSet.read_from_file(filename, RESULTS_FOLDER)
    # The loop variable is `eval_model`, not `model`, so the notebook-level `model`
    # selected at the top of the notebook is not silently replaced by this cell.
    for eval_model in models:
        new_filename = 'eval_by_' + eval_model + '_' + filename
        # Only the write matters here; the return value of save_to_file is not used.
        TaskSet.save_to_file(tasks_to_score, new_filename, RESULTS_FOLDER)
        score_solutions_set(
            tasks_filename=new_filename,
            folder=RESULTS_FOLDER,
            model=eval_model,
            temperature=0,
            skip_scored=False,
        )

## Calculating final stats

In [None]:
filenames = [
# names of the files (located in RESULTS_FOLDER) with scored solutions to include in the final stats, as a Python list
]

In [33]:
# Load previously saved benchmark results so that new stats are added to them.
# The results file is only created by the saving cell at the end of the notebook,
# so on a first run it does not exist yet: start from an empty dictionary then.
benchmarking_result_filename = os.path.join(RESULTS_FOLDER, 'benchmark_results.json')
try:
    # JSON text is UTF-8 by specification; do not depend on the platform default.
    with open(benchmarking_result_filename, 'r', encoding='utf-8') as f:
        benchmarking_res_dict = json.load(f)
except FileNotFoundError:
    benchmarking_res_dict = {}

In [None]:
# Overall stats (all tasks) for each scored file, stored under the file name.
# Assigning to benchmarking_res_dict[filename] replaces any value already stored under that key.

for filename in filenames:
    scored_tasks = TaskSet.read_from_file(filename, RESULTS_FOLDER)
    benchmarking_res_dict[filename] = generate_eval_stats(scored_tasks)


In [None]:
# Per-label stats for each scored file, stored under the file name.
# Assigning to benchmarking_res_dict[filename] replaces any value already stored under that key.

for filename in filenames:
    benchmarking_res_dict[filename] = get_eval_stats_by_subsets(
        filename,
        RESULTS_FOLDER,
        labels=['Task Set 01', 'Task Set 02', 'Task Set 03', 'Task Set 04', 'Control question'],
    )

In [None]:
# Stats split by usage of the reject_task function (solvable vs. not solvable tasks)
# for each scored file, stored under the file name.
# Assigning to benchmarking_res_dict[filename] replaces any value already stored under that key.

for filename in filenames:
    benchmarking_res_dict[filename] = get_eval_stats_by_subsets(
        filename,
        RESULTS_FOLDER,
        functions_names=['reject_task'],
    )

In [None]:
# Saving the results
# Serialize before opening the file: opening in 'w' mode truncates it at once, so a
# serialization error raised afterwards would leave the results file empty or
# partially written. Building the JSON text first keeps the existing file intact
# if serialization fails.

results_json = json.dumps(benchmarking_res_dict, indent=4)
with open(benchmarking_result_filename, 'w') as f:
    f.write(results_json)