# Retrieve and Parse Model Responses

In [None]:
import json
file_path = 'results_metadata.json'
# Read explicitly as UTF-8: this notebook later rewrites the same file with
# encoding="utf-8" and ensure_ascii=False, so relying on the platform default
# codec here could mis-decode non-ASCII text on the next load.
with open(file_path, 'r', encoding='utf-8') as file:
    # Load the JSON data from the file into a Python dictionary
    metadata = json.load(file)

metadata

# Retrieve Batch Responses

## Gemini

### Initialize

In [1]:
%pip install -q -U "google-genai>=1.0.0"

Note: you may need to restart the kernel to use updated packages.


  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain 0.2.5 requires tenacity<9.0.0,>=8.1.0, but you have tenacity 9.1.2 which is incompatible.
langchain-community 0.2.5 requires tenacity<9.0.0,>=8.1.0, but you have tenacity 9.1.2 which is incompatible.
langchain-core 0.2.9 requires tenacity!=8.4.0,<9.0.0,>=8.1.0, but you have tenacity 9.1.2 which is incompatible.
ollama 0.2.1 requires httpx<0.28.0,>=0.27.0, but you have httpx 0.28.1 which is incompatible.
streamlit 1.30.0 requires packaging<24,>=16.8, but you have packaging 24.1 which is incompatible.
streamlit 1.30.0 requires tenacity<9,>=8.1.0, but you have tenacity 9.1.2 which is incompatible.

[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [75]:
import os
from google import genai
from google.genai import types

# Read the Gemini API key from the environment instead of hard-coding it.
key = os.environ.get("GOOGLE_API_KEY")

# Build the Gemini client used by the cells below, requesting API version v1alpha.
client = genai.Client(
    api_key=key,
    http_options={'api_version': 'v1alpha'},
)

### Save Results

In [76]:
def save_job_content(batch_job, model_name, dataset_name):
    """Download a succeeded Gemini batch job's result file and store it as JSON.

    The result file is fetched through the module-level ``client``, decoded as
    UTF-8, parsed line by line (JSONL), and written as a single indented JSON
    array to ``Raw Results/Gemini/<model with '.' replaced by '-'>/``.

    Args:
        batch_job: Batch job object; ``state.name`` and ``dest.file_name`` are read.
        model_name: Model identifier, used for the folder and file names.
        dataset_name: Dataset identifier, used as the file-name prefix.

    Returns:
        None. If the job state is not ``JOB_STATE_SUCCEEDED`` a message is
        printed and nothing is downloaded or written.
    """
    state_name = batch_job.state.name
    if state_name != 'JOB_STATE_SUCCEEDED':
        print(f"Job did not succeed. Final state: {state_name}")
        return

    result_file_name = batch_job.dest.file_name
    print(f"         Results are in file: {result_file_name}")

    print("         Downloading and saving result file content...")
    file_content = client.files.download(file=result_file_name).decode('utf-8')

    # One folder per model; dots in the model name become dashes.
    folder_name = 'Raw Results/Gemini/' + model_name.replace(".", "-")
    os.makedirs(folder_name, exist_ok=True)
    save_path = os.path.join(folder_name, f"{dataset_name}_{model_name}.json")

    # Each non-empty line of the downloaded content is one JSON document.
    all_results = [json.loads(line) for line in file_content.splitlines() if line]

    with open(save_path, "w") as f:
        json.dump(all_results, f, indent=2)

    print(f"        ✅ Saved results to {save_path}")


In [77]:
gem_models = ["gemini-2.5-flash", "gemini-2.5-pro"]

# For every dataset in the metadata, fetch and save the batch results of the
# Gemini models listed above, then note the output file name in the metadata.
for dataset_name, dataset_info in metadata.items():
    model_info = dataset_info['models']
    print(f"{dataset_name}:")
    for model_name, model_data in model_info.items():
        if model_name not in gem_models:
            continue  # not one of the Gemini models handled by this cell

        model_batch = model_data['batch_id']
        print(f'    {model_name}:   {model_batch}')

        batch_job = client.batches.get(name=model_batch)
        save_job_content(batch_job, model_name, dataset_name)

        # model_data is the same dict object as model_info[model_name].
        key = f"{dataset_name}_{model_name}.json"
        model_data['output_filename'] = key

    print()

boolq_valid:
    gemini-2.5-pro:   batches/40v60ylnpg26z17kst8c2tjbpb5lmrhn9rrf
         Results are in file: files/batch-40v60ylnpg26z17kst8c2tjbpb5lmrhn9rrf
         Downloading and saving result file content...
        ✅ Saved results to Raw Results/Gemini/gemini-2-5-pro\boolq_valid_gemini-2.5-pro.json
    gemini-2.5-flash:   batches/b6oo1vivedngmdlk1ahnluu326o9x81b9vci
         Results are in file: files/batch-b6oo1vivedngmdlk1ahnluu326o9x81b9vci
         Downloading and saving result file content...
        ✅ Saved results to Raw Results/Gemini/gemini-2-5-flash\boolq_valid_gemini-2.5-flash.json

lsat_ar_test:
    gemini-2.5-pro:   batches/sklakmhs91rg1lcir2gbufjoewm4lharjbdz
         Results are in file: files/batch-sklakmhs91rg1lcir2gbufjoewm4lharjbdz
         Downloading and saving result file content...
        ✅ Saved results to Raw Results/Gemini/gemini-2-5-pro\lsat_ar_test_gemini-2.5-pro.json
    gemini-2.5-flash:   batches/r1o5lbc3p95sfgnu8zy3obihdhw17evmdo6x
         Resul

## ChatGPT

### Initialize

In [None]:
%pip install -U openai

In [78]:
from openai import OpenAI
# Read the OpenAI API key from the environment and build the client with it.
# Note that this rebinds the notebook-level `client` name for the cells below.
key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=key)

### Get Responses

In [80]:
def save_job_content(batch_job, model_name, dataset_name):
    """Save a completed OpenAI batch job's output file as JSONL.

    NOTE: this redefines ``save_job_content`` from the Gemini section of this
    notebook and uses the module-level ``client``; run the cells top to bottom
    so the matching helper/client pair is active.

    The folder ``Raw Results/GPT/<model with '.' replaced by '-'>/`` is created
    in every case. The output is written to
    ``<dataset_name>_<model_name>.jsonl`` inside it, with a trailing newline
    guaranteed.

    Args:
        batch_job: Batch object; ``status`` and ``output_file_id`` are read.
        model_name: Model identifier, used for the folder and file names.
        dataset_name: Dataset identifier, used as the file-name prefix.

    Returns:
        None. Nothing is downloaded or written when the job is not
        ``completed`` or when it has no output file.
    """
    folder_name = 'Raw Results/GPT/' + model_name.replace(".", "-")
    os.makedirs(folder_name, exist_ok=True)
    status = batch_job.status

    if status != 'completed':
        print(f"Job did not succeed. Final state: {status}")
        return

    output_file_id = batch_job.output_file_id
    if not output_file_id:
        # A completed batch may carry no output file (presumably when no
        # request succeeded). Report it instead of passing None to the files
        # API, which would abort the whole retrieval loop.
        error_file_id = getattr(batch_job, "error_file_id", None)
        print(f"Job completed without an output file. Error file ID: {error_file_id}")
        return

    resp = client.files.content(output_file_id)
    # Prefer the response's text attribute; fall back to decoding raw bytes.
    text = getattr(resp, "text", None) or resp.content.decode("utf-8")
    print(f'        OutputFile ID: {output_file_id}')

    # Save output as a JSONL
    file_name = f"{dataset_name}_{model_name}.jsonl"
    save_path = os.path.join(folder_name, file_name)
    with open(save_path, "w", encoding="utf-8") as f:
        f.write(text if text.endswith("\n") else text + "\n")
    


In [81]:
openai_models = ["gpt-4o"]

# For every dataset in the metadata, fetch and save the batch output of the
# OpenAI models listed above, then note the output file name in the metadata.
for dataset_name, dataset_info in metadata.items():

    model_info = dataset_info['models']
    print(f"{dataset_name}:")

    for model_name, model_data in model_info.items():

        if model_name not in openai_models:
            continue  # not one of the OpenAI models handled by this cell

        model_batch = model_data['batch_id']
        print(f'    {model_name}:   {model_batch}')

        batch = client.batches.retrieve(model_batch)
        save_job_content(batch, model_name, dataset_name)

        # The OpenAI save helper writes "<dataset>_<model>.jsonl", so record
        # that exact name; a ".json" suffix would point at a file that is never
        # created. A dedicated variable also avoids overwriting the API-key
        # variable `key`.
        output_filename = f"{dataset_name}_{model_name}.jsonl"
        model_data['output_filename'] = output_filename
    print()

boolq_valid:
    gpt-4o:   batch_68885a713de48190b5c616795507d7f4
        OutputFile ID: file-7cT9kMESsjkHeAweoJ6G3M

lsat_ar_test:
    gpt-4o:   batch_6882b9edb8888190a7273726ffd6f3ba
        OutputFile ID: file-BDcxDoD6Y8SNEVWGeTUz61

sciq_test:
    gpt-4o:   batch_6882b9f6ed908190ab2f6acea76f3b12
        OutputFile ID: file-EFXvvbpomZvVecmLByHHz7

life_eval:
    gpt-4o:   batch_6882b9ec28508190b161285d2f77395d
        OutputFile ID: file-CqJyuePrsh2LByvkUCdhry

halu_eval_qa:
    gpt-4o:   batch_68885dc2a5988190b9a1bb1e6e75c0d7
        OutputFile ID: file-Xoz5tg27geR2dcEMELD3QH

sat_en:
    gpt-4o:   batch_6882b9f307888190ac4769c47dac1eda
        OutputFile ID: file-Y9Pbosz7ZRXjtE7aw8Qysy



## Claude

### Initialize

In [49]:
%pip install -U pip -U Anthropic

Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Collecting Anthropic
  Downloading anthropic-0.62.0-py3-none-any.whl.metadata (27 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ---------------------------------------- 1.8/1.8 MB 19.1 MB/s eta 0:00:00
Downloading anthropic-0.62.0-py3-none-any.whl (296 kB)
Installing collected packages: pip, Anthropic
  Attempting uninstall: pip
    Found existing installation: pip 24.3.1
    Uninstalling pip-24.3.1:
      Successfully uninstalled pip-24.3.1
Successfully installed Anthropic-0.62.0 pip-25.2
Note: you may need to restart the kernel to use updated packages.


In [82]:
import anthropic
from anthropic import Anthropic, AsyncAnthropic, APIError

# Read the Anthropic API key from the environment and build the client with it.
# Note that this rebinds the notebook-level `client` name for the cells below.
key = os.environ.get("ANTHROPIC_API_KEY")
client = Anthropic(api_key=key)

### Get Responses

In [83]:
def save_claude_batch_jsonl(batch_id, model_name, dataset_name):
    """Fetch a Claude message batch's results and save them as JSONL.

    Results are requested through the module-level ``client`` and written, one
    JSON object per line, to
    ``Raw Results/Claude/<model with '.' replaced by '-'>/<dataset_name>_<model_name>.jsonl``.

    Args:
        batch_id: Identifier of the message batch to fetch.
        model_name: Model identifier, used for the folder and file names.
        dataset_name: Dataset identifier, used as the file-name prefix.

    Returns:
        None. Any exception raised by the SDK call propagates to the caller.
    """
    folder_name = os.path.join("Raw Results", "Claude", model_name.replace(".", "-"))
    os.makedirs(folder_name, exist_ok=True)

    save_path = os.path.join(folder_name, f"{dataset_name}_{model_name}.jsonl")

    # Request the results BEFORE opening the output file. Opening with "w"
    # truncates, so if this call raises (e.g. the batch has no results yet),
    # a file saved by an earlier run is left intact rather than emptied.
    results = client.messages.batches.results(batch_id)

    with open(save_path, "w", encoding="utf-8") as f:
        for result in results:
            # Convert the SDK result object to a plain dict so json can serialize it.
            result_dict = result.model_dump()
            f.write(json.dumps(result_dict) + "\n")

    print(f"✅ Saved Claude batch results to {save_path}")

In [90]:
claude_models = ['claude-3-haiku-20240307', 'claude-3-7-sonnet-20250219', "claude-sonnet-4-20250514"]

# For every dataset in the metadata, save the batch results of the Claude
# models listed above, then note the output file name in the metadata.
for dataset_name, dataset_info in metadata.items():

    model_info = dataset_info['models']
    print(f"{dataset_name}:")

    for model_name, model_data in model_info.items():

        if model_name not in claude_models:
            continue  # not one of the Claude models handled by this cell

        batch_id = model_data['batch_id']
        print(f'    {model_name}:   {batch_id}')
        save_claude_batch_jsonl(batch_id, model_name, dataset_name)

        # model_data is the same dict object as model_info[model_name].
        key = f"{dataset_name}_{model_name}.jsonl"
        model_data['output_filename'] = key
    print()

boolq_valid:
    claude-3-haiku-20240307:   msgbatch_01Eik9sN8Ek6cBd9haYdQoD9
✅ Saved Claude batch results to Raw Results\Claude\claude-3-haiku-20240307\boolq_valid_claude-3-haiku-20240307.jsonl
    claude-3-7-sonnet-20250219:   msgbatch_01G2yAtxwzZ22CiE7XmZTvQR
✅ Saved Claude batch results to Raw Results\Claude\claude-3-7-sonnet-20250219\boolq_valid_claude-3-7-sonnet-20250219.jsonl
    claude-sonnet-4-20250514:   msgbatch_01UdsJtGrwuHrV9xv7kUPwVo
✅ Saved Claude batch results to Raw Results\Claude\claude-sonnet-4-20250514\boolq_valid_claude-sonnet-4-20250514.jsonl

lsat_ar_test:
    claude-3-7-sonnet-20250219:   msgbatch_01NQkyt5AQQnoRgJu1KSY15c
✅ Saved Claude batch results to Raw Results\Claude\claude-3-7-sonnet-20250219\lsat_ar_test_claude-3-7-sonnet-20250219.jsonl
    claude-3-haiku-20240307:   msgbatch_011VWy4skZeUZ5o3HcLG7hTJ
✅ Saved Claude batch results to Raw Results\Claude\claude-3-haiku-20240307\lsat_ar_test_claude-3-haiku-20240307.jsonl
    claude-sonnet-4-20250514:   msgbatc

In [91]:
# Update results_metadata.json with the output file names recorded above.
# Serialize first: if the metadata cannot be serialized, the error is raised
# before the file is opened with "w" (which truncates it), so the existing
# metadata file is not destroyed.
serialized_metadata = json.dumps(metadata, indent=2, ensure_ascii=False)
with open(file_path, "w", encoding="utf-8") as f:
    f.write(serialized_metadata)

## DeepSeek

# Parse Results

## GPT