# Retrieve and Parse Model Responses

In [137]:
import json
# Bookkeeping file that maps every dataset to its expected fields and, per
# model, the batch id and output filename.
file_path = 'results_metadata.json'

# Decode explicitly as UTF-8: the cells that rewrite this file use
# encoding="utf-8" together with ensure_ascii=False, so relying on the platform
# default encoding could mis-decode non-ASCII text the next time it is loaded.
with open(file_path, 'r', encoding='utf-8') as file:
    # Load the JSON data from the file into a Python dictionary
    metadata = json.load(file)

metadata

{'boolq_valid': {'fields': ['Reasoning', 'Answer', 'Confidence'],
  'models': {'gpt-4o': {'file_id': 'file-12Pr3mz95PN2sGpZnCioLh',
    'batch_id': 'batch_6882b9e8c0b88190a6078d1259b0bbac',
    'output_filename': 'boolq_valid_gpt-4o.json',
    'old_batch': 'batch_68885a713de48190b5c616795507d7f4'},
   'claude-3-haiku-20240307': {'file_id': 'boolq_valid',
    'batch_id': 'msgbatch_01Eik9sN8Ek6cBd9haYdQoD9',
    'output_filename': 'boolq_valid_claude-3-haiku-20240307.jsonl'},
   'gemini-2.5-pro': {'file_id': 'none',
    'batch_id': 'batches/40v60ylnpg26z17kst8c2tjbpb5lmrhn9rrf',
    'output_filename': 'boolq_valid_gemini-2.5-pro.json'},
   'claude-3-7-sonnet-20250219': {'file_id': 'boolq_valid',
    'batch_id': 'msgbatch_01G2yAtxwzZ22CiE7XmZTvQR',
    'output_filename': 'boolq_valid_claude-3-7-sonnet-20250219.jsonl'},
   'gemini-2.5-flash': {'file_id': 'none',
    'batch_id': 'batches/b6oo1vivedngmdlk1ahnluu326o9x81b9vci',
    'output_filename': 'boolq_valid_gemini-2.5-flash.json'},
   '

# Retrieve Batch Responses

## Gemini

### Initialize

In [1]:
%pip install -q -U "google-genai>=1.0.0"

Note: you may need to restart the kernel to use updated packages.


  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain 0.2.5 requires tenacity<9.0.0,>=8.1.0, but you have tenacity 9.1.2 which is incompatible.
langchain-community 0.2.5 requires tenacity<9.0.0,>=8.1.0, but you have tenacity 9.1.2 which is incompatible.
langchain-core 0.2.9 requires tenacity!=8.4.0,<9.0.0,>=8.1.0, but you have tenacity 9.1.2 which is incompatible.
ollama 0.2.1 requires httpx<0.28.0,>=0.27.0, but you have httpx 0.28.1 which is incompatible.
streamlit 1.30.0 requires packaging<24,>=16.8, but you have packaging 24.1 which is incompatible.
streamlit 1.30.0 requires tenacity<9,>=8.1.0, but you have tenacity 9.1.2 which is incompatible.

[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [75]:
import os
from google import genai
from google.genai import types

# Build the Gemini client from the GOOGLE_API_KEY environment variable,
# pinning the request API version to v1alpha.
key = os.environ.get("GOOGLE_API_KEY")
client = genai.Client(
    api_key=key,
    http_options={'api_version': 'v1alpha'},
)

### Save Results

In [76]:
def save_job_content(batch_job, model_name, dataset_name):
    """Download a finished Gemini batch job and store it as a JSON array.

    Args:
        batch_job: Batch job object; ``state.name`` and ``dest.file_name`` are read.
        model_name: Model identifier. Dots are replaced by dashes for the
            folder name; the file name keeps the identifier unchanged.
        dataset_name: Dataset identifier used as the file name prefix.

    Returns:
        None. On success the parsed result lines are written to
        ``Raw Results/Gemini/<model>/<dataset>_<model>.json``; otherwise only a
        message with the final job state is printed.
    """
    state_name = batch_job.state.name
    if state_name != 'JOB_STATE_SUCCEEDED':
        print(f"Job did not succeed. Final state: {state_name}")
        return

    remote_name = batch_job.dest.file_name
    print(f"         Results are in file: {remote_name}")

    print("         Downloading and saving result file content...")
    raw_text = client.files.download(file=remote_name).decode('utf-8')

    # One folder per model
    out_dir = 'Raw Results/Gemini/' + model_name.replace(".", "-")
    os.makedirs(out_dir, exist_ok=True)
    save_path = os.path.join(out_dir, f"{dataset_name}_{model_name}.json")

    # The download is JSONL: decode every non-empty line into one object
    records = [json.loads(row) for row in raw_text.splitlines() if row]

    with open(save_path, "w") as out_file:
        json.dump(records, out_file, indent=2)

    print(f"        ✅ Saved results to {save_path}")


In [77]:
gem_models = ["gemini-2.5-flash", "gemini-2.5-pro"]

# Fetch every Gemini batch job listed in the metadata, save its results, and
# note the resulting file name back into the metadata.
for dataset_name, dataset_info in metadata.items():
    model_info = dataset_info['models']
    print(f"{dataset_name}:")
    for model_name, model_data in model_info.items():
        if model_name not in gem_models:
            continue

        model_batch = model_data['batch_id']
        print(f'    {model_name}:   {model_batch}')
        batch_job = client.batches.get(name=model_batch)
        save_job_content(batch_job, model_name, dataset_name)

        # save_job_content only writes a file when the job state is
        # JOB_STATE_SUCCEEDED, so only then is there an output file to record.
        # The name is assigned directly rather than through `key`, which holds
        # the API key in the initialisation cell.
        if batch_job.state.name == 'JOB_STATE_SUCCEEDED':
            model_data['output_filename'] = f"{dataset_name}_{model_name}.json"

    print()

boolq_valid:
    gemini-2.5-pro:   batches/40v60ylnpg26z17kst8c2tjbpb5lmrhn9rrf
         Results are in file: files/batch-40v60ylnpg26z17kst8c2tjbpb5lmrhn9rrf
         Downloading and saving result file content...
        ✅ Saved results to Raw Results/Gemini/gemini-2-5-pro\boolq_valid_gemini-2.5-pro.json
    gemini-2.5-flash:   batches/b6oo1vivedngmdlk1ahnluu326o9x81b9vci
         Results are in file: files/batch-b6oo1vivedngmdlk1ahnluu326o9x81b9vci
         Downloading and saving result file content...
        ✅ Saved results to Raw Results/Gemini/gemini-2-5-flash\boolq_valid_gemini-2.5-flash.json

lsat_ar_test:
    gemini-2.5-pro:   batches/sklakmhs91rg1lcir2gbufjoewm4lharjbdz
         Results are in file: files/batch-sklakmhs91rg1lcir2gbufjoewm4lharjbdz
         Downloading and saving result file content...
        ✅ Saved results to Raw Results/Gemini/gemini-2-5-pro\lsat_ar_test_gemini-2.5-pro.json
    gemini-2.5-flash:   batches/r1o5lbc3p95sfgnu8zy3obihdhw17evmdo6x
         Resul

## ChatGPT

### Initialize

In [None]:
%pip install -U openai

In [103]:
from openai import OpenAI
# Build the OpenAI client from the OPENAI_API_KEY environment variable.
key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=key)

### Get Responses

In [104]:
def save_job_content(batch_job, model_name, dataset_name):
    """Download the output of a completed OpenAI batch and store it as JSONL.

    Args:
        batch_job: Batch object; ``status`` and ``output_file_id`` are read.
        model_name: Model identifier. Dots are replaced by dashes for the
            folder name; the file name keeps the identifier unchanged.
        dataset_name: Dataset identifier used as the file name prefix.

    Returns:
        None. The model folder is created in every case. For a ``completed``
        batch the output text is written to
        ``Raw Results/GPT/<model>/<dataset>_<model>.jsonl`` (newline-terminated);
        otherwise only a message with the final status is printed.
    """
    out_dir = 'Raw Results/GPT/' + model_name.replace(".", "-")
    os.makedirs(out_dir, exist_ok=True)

    job_status = batch_job.status
    if job_status != 'completed':
        print(f"Job did not succeed. Final state: {job_status}")
        return

    output_file_id = batch_job.output_file_id
    resp = client.files.content(output_file_id)
    # Prefer the decoded text attribute; fall back to decoding the raw bytes
    text = getattr(resp, "text", None)
    if not text:
        text = resp.content.decode("utf-8")
    print(f'        OutputFile ID: {output_file_id}')

    # Make sure the last record is newline-terminated
    if not text.endswith("\n"):
        text = text + "\n"

    save_path = os.path.join(out_dir, f"{dataset_name}_{model_name}.jsonl")
    with open(save_path, "w", encoding="utf-8") as out_file:
        out_file.write(text)
    


In [109]:
openai_models = ["gpt-4o"]

# Fetch every OpenAI batch listed in the metadata, save its output, and note
# the resulting file name back into the metadata.
for dataset_name, dataset_info in metadata.items():

    model_info = dataset_info['models']
    print(f"{dataset_name}:")

    for model_name, model_data in model_info.items():
        if model_name not in openai_models:
            continue

        model_batch = model_data['batch_id']
        print(f'    {model_name}:   {model_batch}')

        batch = client.batches.retrieve(model_batch)
        save_job_content(batch, model_name, dataset_name)

        # save_job_content writes "<dataset>_<model>.jsonl" and only for a
        # completed batch, so record exactly that name and only in that case
        # (the previous ".json" suffix pointed at a file that is never created).
        # The name is assigned directly rather than through `key`, which holds
        # the API key in the initialisation cell.
        if batch.status == 'completed':
            model_data['output_filename'] = f"{dataset_name}_{model_name}.jsonl"
    print()

boolq_valid:
    gpt-4o:   batch_6882b9e8c0b88190a6078d1259b0bbac
        OutputFile ID: file-M4SPm8TMc6Sjrf2Nunr8Yy

lsat_ar_test:
    gpt-4o:   batch_6882b9edb8888190a7273726ffd6f3ba
        OutputFile ID: file-BDcxDoD6Y8SNEVWGeTUz61

sciq_test:
    gpt-4o:   batch_6882b9f6ed908190ab2f6acea76f3b12
        OutputFile ID: file-EFXvvbpomZvVecmLByHHz7

life_eval:
    gpt-4o:   batch_6882b9ec28508190b161285d2f77395d
        OutputFile ID: file-CqJyuePrsh2LByvkUCdhry

halu_eval_qa:
    gpt-4o:   batch_68885dc2a5988190b9a1bb1e6e75c0d7
        OutputFile ID: file-Xoz5tg27geR2dcEMELD3QH

sat_en:
    gpt-4o:   batch_6882b9f307888190ac4769c47dac1eda
        OutputFile ID: file-Y9Pbosz7ZRXjtE7aw8Qysy



In [None]:
# Persist the in-memory `metadata` dictionary to the path held in `file_path`,
# as indented UTF-8 JSON with non-ASCII characters left unescaped.
with open(file_path, mode="w", encoding="utf-8") as metadata_file:
    metadata_file.write(json.dumps(metadata, indent=2, ensure_ascii=False))

## Claude

### Initialize

In [49]:
%pip install -U pip -U Anthropic

Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Collecting Anthropic
  Downloading anthropic-0.62.0-py3-none-any.whl.metadata (27 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ---------------------------------------- 1.8/1.8 MB 19.1 MB/s eta 0:00:00
Downloading anthropic-0.62.0-py3-none-any.whl (296 kB)
Installing collected packages: pip, Anthropic
  Attempting uninstall: pip
    Found existing installation: pip 24.3.1
    Uninstalling pip-24.3.1:
      Successfully uninstalled pip-24.3.1
Successfully installed Anthropic-0.62.0 pip-25.2
Note: you may need to restart the kernel to use updated packages.


In [82]:
import anthropic
from anthropic import Anthropic, AsyncAnthropic, APIError

# Build the Anthropic client from the ANTHROPIC_API_KEY environment variable.
key = os.environ.get("ANTHROPIC_API_KEY")
client = Anthropic(api_key=key)

### Get Responses

In [83]:
def save_claude_batch_jsonl(batch_id, model_name, dataset_name):
    """Stream the results of a Claude message batch into a JSONL file.

    Args:
        batch_id: Identifier passed to ``client.messages.batches.results``.
        model_name: Model identifier. Dots are replaced by dashes for the
            folder name; the file name keeps the identifier unchanged.
        dataset_name: Dataset identifier used as the file name prefix.

    Returns:
        None. Each result is converted with ``model_dump()`` and written as one
        JSON line to ``Raw Results/Claude/<model>/<dataset>_<model>.jsonl``.
    """
    out_dir = os.path.join("Raw Results", "Claude", model_name.replace(".", "-"))
    os.makedirs(out_dir, exist_ok=True)
    save_path = os.path.join(out_dir, f"{dataset_name}_{model_name}.jsonl")

    with open(save_path, "w", encoding="utf-8") as sink:
        # model_dump() turns each result object into a plain dict that
        # json.dumps can serialise
        sink.writelines(
            json.dumps(item.model_dump()) + "\n"
            for item in client.messages.batches.results(batch_id)
        )

    print(f"✅ Saved Claude batch results to {save_path}")

In [90]:
claude_models = ['claude-3-haiku-20240307', 'claude-3-7-sonnet-20250219', "claude-sonnet-4-20250514"]

# Save the results of every Claude batch listed in the metadata and note the
# resulting file name back into the metadata.
for dataset_name, dataset_info in metadata.items():
    model_info = dataset_info['models']
    print(f"{dataset_name}:")

    for model_name, model_data in model_info.items():
        # Skip models that are not handled in this section
        if model_name not in claude_models:
            continue

        batch_id = model_data['batch_id']
        print(f'    {model_name}:   {batch_id}')
        save_claude_batch_jsonl(batch_id, model_name, dataset_name)

        key = f"{dataset_name}_{model_name}.jsonl"
        model_data['output_filename'] = key
    print()

boolq_valid:
    claude-3-haiku-20240307:   msgbatch_01Eik9sN8Ek6cBd9haYdQoD9
✅ Saved Claude batch results to Raw Results\Claude\claude-3-haiku-20240307\boolq_valid_claude-3-haiku-20240307.jsonl
    claude-3-7-sonnet-20250219:   msgbatch_01G2yAtxwzZ22CiE7XmZTvQR
✅ Saved Claude batch results to Raw Results\Claude\claude-3-7-sonnet-20250219\boolq_valid_claude-3-7-sonnet-20250219.jsonl
    claude-sonnet-4-20250514:   msgbatch_01UdsJtGrwuHrV9xv7kUPwVo
✅ Saved Claude batch results to Raw Results\Claude\claude-sonnet-4-20250514\boolq_valid_claude-sonnet-4-20250514.jsonl

lsat_ar_test:
    claude-3-7-sonnet-20250219:   msgbatch_01NQkyt5AQQnoRgJu1KSY15c
✅ Saved Claude batch results to Raw Results\Claude\claude-3-7-sonnet-20250219\lsat_ar_test_claude-3-7-sonnet-20250219.jsonl
    claude-3-haiku-20240307:   msgbatch_011VWy4skZeUZ5o3HcLG7hTJ
✅ Saved Claude batch results to Raw Results\Claude\claude-3-haiku-20240307\lsat_ar_test_claude-3-haiku-20240307.jsonl
    claude-sonnet-4-20250514:   msgbatc

In [110]:
# Write the `metadata` dictionary (now carrying the Claude output file names)
# to the path held in `file_path`, as indented UTF-8 JSON with non-ASCII
# characters left unescaped.
serialized_metadata = json.dumps(metadata, indent=2, ensure_ascii=False)
with open(file_path, "w", encoding="utf-8") as metadata_file:
    metadata_file.write(serialized_metadata)

# Parse Results

In [123]:
import pandas as pd
import numpy as np
import ast
import re

# Create the output folder for parsed results. exist_ok=True keeps the cell
# re-runnable: plain os.mkdir raises FileExistsError once the folder exists.
os.makedirs('Parsed Results', exist_ok=True)

## GPT

In [None]:
# GPT PARSER
def _gpt_extract_answer(raw_content):
    """Pull the 'Answer' value out of one message, repairing the text if needed.

    Returns a tuple ``(answer, content, correct_format, coerce)``:
    the message parsed as-is gives ``(answer, raw_content, True, True)``;
    a message that parsed only after repair gives ``(answer, repaired, False, True)``;
    anything else gives ``(None, raw_content, False, False)``.
    """
    try:
        return ast.literal_eval(raw_content)['Answer'], raw_content, True, True
    except Exception:
        pass  # not a clean literal with an 'Answer' key; try to repair it below

    try:
        # Keep only the outermost {...} span and undo known formatting slips
        open_bracket_index = raw_content.index('{')
        closed_bracket_index = raw_content.rfind('}')
        repaired = (
            raw_content[open_bracket_index:closed_bracket_index + 1]
            .replace('Response:', '')
            .replace(':"', '":')  ## Update this for new format with "
            .strip()
        )
        return ast.literal_eval(repaired)['Answer'], repaired, False, True
    except Exception:
        return None, raw_content, False, False


def _gpt_answer_byte_offset(raw_content, content, pattern, match):
    """Return the UTF-8 byte offset of the quoted answer within the raw message.

    The logprob tokens describe the raw message and are measured in bytes, while
    `match` is a character position in `content`, which may be a repaired
    substring of the raw message. When `content` was repaired, the answer is
    located again in the raw message (from its first '{' onwards); if it cannot
    be found there, the position inside `content` is used as a fallback.
    """
    source, located = content, match
    if content != raw_content:
        raw_match = re.compile(pattern).search(raw_content, max(raw_content.find('{'), 0))
        if raw_match is not None:
            source, located = raw_content, raw_match
    # surrogatepass: lone surrogates must not abort the length computation
    return len(source[:located.start()].encode('utf-8', errors='surrogatepass'))


def _gpt_token_at(response_tokens, byte_offset):
    """Return the logprob entry reached after consuming `byte_offset` bytes, or None.

    Tokens are consumed until their cumulative byte length reaches the offset;
    the entry that follows is returned. None means the token list ended first.
    """
    position = 0
    consumed = 0
    while consumed < byte_offset and position < len(response_tokens):
        token_info = response_tokens[position]
        token_bytes = token_info.get('bytes')
        if token_bytes is None:
            # No byte list supplied for this token: measure its text instead
            token_bytes = token_info['token'].encode('utf-8', errors='surrogatepass')
        consumed += len(token_bytes)
        position += 1
    if position >= len(response_tokens) or consumed < byte_offset:
        return None
    return response_tokens[position]


def parse_gpt_response(entries: list, fields, stats = False):
    """Parse GPT batch output entries into a DataFrame with answer-token probabilities.

    Args:
        entries: Decoded batch output lines. Each entry must provide
            ``custom_id``, ``response.body.choices[0].message.content`` and
            ``response.body.choices[0].logprobs.content``.
        fields: Keys to extract from each parsed message into their own columns.
        stats: When True, print the length and, where the values are numeric,
            the mean of every column.

    Returns:
        pandas.DataFrame with one row per entry and the columns
        ``Question ID, content, answer, token_index, token, t1, t1_prob, ...,
        t5, t5_prob, correct_format, coerce`` followed by one column per field.
        ``token_index`` is the character position of the quoted answer inside
        ``content``; ``t1..t5`` / ``t*_prob`` are the alternatives listed for
        the answer token and their probabilities. Values that cannot be
        determined are None.
    """
    top_k = 5

    data = {
        'Question ID': [],
        'content': [],
        'answer': [],
        'token_index': [],
        'token': [],
    }
    for rank in range(1, top_k + 1):
        data[f't{rank}'] = []
        data[f't{rank}_prob'] = []
    data['correct_format'] = []
    data['coerce'] = []

    contents = data['content']

    for entry in entries:
        choice = entry['response']['body']['choices'][0]
        response_tokens = choice['logprobs']['content']
        raw_content = choice['message']['content']
        data['Question ID'].append(entry['custom_id'])

        answer, content, well_formed, coerced = _gpt_extract_answer(raw_content)
        contents.append(content)
        data['answer'].append(answer)
        data['correct_format'].append(well_formed)
        data['coerce'].append(coerced)

        token_index = None
        answer_token = None
        top_tokens = []
        top_probs = []

        if coerced:
            # str(): the answer may be a bool or a number, which re.escape rejects
            pattern = r'"(' + re.escape(str(answer)) + r')"'
            match = re.search(pattern, content)
            if match is None:
                print(content)
            else:
                token_index = match.start()
                byte_offset = _gpt_answer_byte_offset(raw_content, content, pattern, match)
                token_info = _gpt_token_at(response_tokens, byte_offset)
                if token_info is None:
                    print(f"Warning: Answer index {token_index} out of bounds for response tokens.")
                else:
                    answer_token = token_info['token']
                    logprobs = []
                    for candidate in token_info['top_logprobs']:
                        top_tokens.append(candidate['token'])
                        if 'logprob' in candidate:
                            logprobs.append(candidate['logprob'])
                        else:
                            # Kept from the original: a missing logprob is
                            # reported and counted as 0 (probability 1)
                            print(candidate)
                            logprobs.append(0)
                    top_probs = np.exp(logprobs)

        data['token_index'].append(token_index)
        data['token'].append(answer_token)
        # Pad to exactly top_k alternatives per row
        for rank in range(top_k):
            data[f't{rank + 1}'].append(top_tokens[rank] if rank < len(top_tokens) else None)
            data[f't{rank + 1}_prob'].append(top_probs[rank] if rank < len(top_probs) else None)

    # Parse every stored content once, then read all requested fields from it
    parsed_contents = []
    for response_content in contents:
        try:
            parsed_contents.append(ast.literal_eval(response_content))
        except Exception:
            parsed_contents.append(None)

    for field in fields:
        data[field] = [
            parsed.get(field, None) if isinstance(parsed, dict) else None
            for parsed in parsed_contents
        ]

    if stats:
        print(f'{"Category":<15}| {"Length":<5} |  Mean')
        print('-' * 42)

        for category in data:
            try:
                # Attempt to convert to float and calculate mean, skipping non-numeric categories
                numeric_data = [float(x) for x in data[category] if x is not None]
                if numeric_data:  # Only calculate mean if there are numeric values
                    print(f'{category:<15}| {len(data[category]):<5}  | {np.mean(numeric_data):.5}')
                else:
                    print(f'{category:<15}| {len(data[category]):<5}  |')
            except (ValueError, TypeError):
                # Handle cases where conversion to float fails (non-numeric data)
                print(f'{category:<15}| {len(data[category]):<5}  |')

    return pd.DataFrame(data)



In [None]:
import time
from IPython.display import display, clear_output
model_name = 'gpt-4o'

# Folder the raw GPT results were saved to. Built with os.path.join because the
# former "Raw Results\GPT\gpt-4o" literal relied on invalid escape sequences
# and only resolved on Windows.
folder_path = os.path.join("Raw Results", "GPT", model_name.replace(".", "-"))

result_folder_path = f"Parsed Results/GPT/{model_name}"
os.makedirs(result_folder_path, exist_ok=True)

for benchmark in metadata:
    filename = f'{benchmark}_{model_name}.jsonl'
    # Deliberately not named `file_path`: that global holds the metadata file
    # location used by the "Update results_metadata.json" cells.
    raw_file_path = os.path.join(folder_path, filename)
    print(raw_file_path)

    fields = metadata[benchmark]['fields']
    print(fields)

    # Load all lines from the file. The raw file is written as UTF-8, so it is
    # read as UTF-8 instead of with the platform default encoding.
    entries = []
    with open(raw_file_path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                entries.append(json.loads(line))
            except json.JSONDecodeError:
                print("Skipped a line")

    df = parse_gpt_response(entries, fields=fields, stats=True)

    save_file_name = f'{benchmark}_{model_name}.csv'
    save_file_path = os.path.join(result_folder_path, save_file_name)
    df.to_csv(save_file_path, index=False)

    # Show the frame briefly, then clear it before the next benchmark
    display(df)
    time.sleep(1)
    clear_output(wait=True)



## Gemini

In [174]:
def _gemini_response_text(entry):
    """Return the text of the first part of the first candidate, or None if absent."""
    try:
        return entry['response']['candidates'][0]['content']['parts'][0]['text']
    except (KeyError, IndexError, TypeError):
        # No usable text in this entry; the caller records it as uncoerceable
        return None


def _gemini_extract_answer(raw_content):
    """Pull the 'Answer' value out of one response text, repairing it if needed.

    Returns a tuple ``(answer, content, correct_format, coerce)``:
    the text parsed as-is gives ``(answer, raw_content, True, True)``;
    a text that parsed only after repair gives ``(answer, repaired, False, True)``;
    anything else gives ``(None, raw_content, False, False)``.
    """
    try:
        return ast.literal_eval(raw_content)['Answer'], raw_content, True, True
    except Exception:
        pass  # not a clean literal with an 'Answer' key; try to repair it below

    try:
        # Keep only the outermost {...} span and undo known formatting slips
        open_bracket_index = raw_content.index('{')
        closed_bracket_index = raw_content.rfind('}')
        repaired = (
            raw_content[open_bracket_index:closed_bracket_index + 1]
            .replace('Response:', '')
            .replace(':"', '":')  # Update this for new format with "
            .strip()
        )
        return ast.literal_eval(repaired)['Answer'], repaired, False, True
    except Exception:
        return None, raw_content, False, False


def parse_gemini_response(entries, fields, stats=False):
    """Parse Gemini batch output entries into a DataFrame.

    Args:
        entries: Decoded batch result objects. Each entry must provide ``key``;
            the text is read from
            ``response.candidates[0].content.parts[0].text`` when present.
        fields: Keys to extract from each parsed response into their own columns.
        stats: When True, print the length and, where the values are numeric,
            the mean of every column.

    Returns:
        pandas.DataFrame with one row per entry and the columns
        ``Question ID, content, answer, correct_format, coerce`` followed by one
        column per field. Entries without text, or whose text cannot be parsed
        even after repair, get ``coerce`` False and None for the answer.
    """
    data = {
        'Question ID': [],
        'content': [],
        'answer': [],
        'correct_format': [],
        'coerce': [],
    }
    contents = data['content']

    for entry in entries:
        data['Question ID'].append(entry['key'])
        answer, content, well_formed, coerced = _gemini_extract_answer(
            _gemini_response_text(entry)
        )
        contents.append(content)
        data['answer'].append(answer)
        data['correct_format'].append(well_formed)
        data['coerce'].append(coerced)

    # Parse every stored content once, then read all requested fields from it
    parsed_contents = []
    for response_content in contents:
        try:
            parsed_contents.append(ast.literal_eval(response_content))
        except Exception:
            parsed_contents.append(None)

    for field in fields:
        data[field] = [
            parsed.get(field, None) if isinstance(parsed, dict) else None
            for parsed in parsed_contents
        ]

    if stats:
        print(f'{"Category":<15}| {"Length":<5} |  Mean')
        print('-' * 42)

        for category in data:
            try:
                # Attempt to convert to float and calculate mean, skipping non-numeric categories
                numeric_data = [float(x) for x in data[category] if x is not None]
                if numeric_data:  # Only calculate mean if there are numeric values
                    print(f'{category:<15}| {len(data[category]):<5}  | {np.mean(numeric_data):.5}')
                else:
                    print(f'{category:<15}| {len(data[category]):<5}  |')
            except (ValueError, TypeError):
                # Handle cases where conversion to float fails (non-numeric data)
                print(f'{category:<15}| {len(data[category]):<5}  |')

    return pd.DataFrame(data)


In [185]:
import time
from IPython.display import display, clear_output

models = ["gemini-2.5-flash", "gemini-2.5-pro"]

for model_name in models:
    # Folder the raw Gemini results were saved to (dots in the model name are
    # replaced by dashes). Built with os.path.join because the former
    # backslash f-string relied on invalid escape sequences and only resolved
    # on Windows.
    folder_path = os.path.join("Raw Results", "Gemini", model_name.replace('.', '-'))

    result_folder_path = f"Parsed Results/Gemini/{model_name}"
    os.makedirs(result_folder_path, exist_ok=True)

    for benchmark in metadata:
        filename = f'{benchmark}_{model_name}.json'
        # Deliberately not named `file_path`: that global holds the metadata
        # file location used by the "Update results_metadata.json" cells.
        raw_file_path = os.path.join(folder_path, filename)
        print(raw_file_path)

        fields = metadata[benchmark]['fields']

        # The raw file holds a single JSON array with one object per response
        with open(raw_file_path, "r", encoding="utf-8") as f:
            entries = json.load(f)
        df = parse_gemini_response(entries=entries, fields=fields, stats=True)

        save_file_name = f'{benchmark}_{model_name}.csv'
        save_file_path = os.path.join(result_folder_path, save_file_name)
        df.to_csv(save_file_path, index=False)

Raw Results\Gemini\gemini-2-5-flash\boolq_valid_gemini-2.5-flash.json
Category       | Length |  Mean
------------------------------------------
Question ID    | 3270   | 1634.5
content        | 3270   |
answer         | 3270   |
correct_format | 3270   | 0.01315
coerce         | 3270   | 0.99969
Reasoning      | 3270   |
Answer         | 3270   |
Confidence     | 3270   | 0.98942
Raw Results\Gemini\gemini-2-5-flash\lsat_ar_test_gemini-2.5-flash.json
Category       | Length |  Mean
------------------------------------------
Question ID    | 230    | 114.5
content        | 230    |
answer         | 230    |
correct_format | 230    | 0.46522
coerce         | 230    | 0.76957
Reasoning      | 230    |
Answer         | 230    |
A              | 230    | 0.21554
B              | 230    | 0.20621
C              | 230    | 0.21828
D              | 230    | 0.15398
E              | 230    | 0.20486
Raw Results\Gemini\gemini-2-5-flash\sciq_test_gemini-2.5-flash.json
Category       | Length |  M

## Claude