In [1]:
import os
import re
import pandas as pd
import json

In [12]:
# Collect the final "<answer>,<confidence>" reply from every agent log that
# belongs to a mas_safety* or mas_metacognition* experiment folder.
# Resulting shape:
#   all_responses[folder][log_name_without_ext] = {'answer': int, 'confidence': int}
all_responses = {}
titan_path = "/mnt/bulk-titan/sanddhya/agents_edge/Agent_Edge_Benchmarking/out"

# Dashed rule on which each log is split into sections.
section_delimiter = '----------------------------------------'
# A usable reply is exactly one digit, a comma, and a confidence from 1 to 5
# (whitespace is stripped before matching). Anything else is silently ignored.
response_pattern = re.compile(r'^([0-9]),([1-5])$')
log_suffix = '.txt'

for folder in os.listdir(titan_path):
    if not folder.startswith(('mas_safety', 'mas_metacognition')):
        continue

    print(f'Processing folder: {folder}')
    parsed_responses = {}
    logs_dir = os.path.join(titan_path, folder, 'logs')

    for f in os.listdir(logs_dir):
        if not f.endswith(log_suffix):
            continue

        # The context manager closes the handle; no explicit close() is needed.
        with open(os.path.join(logs_dir, f), 'r') as file:
            content = file.read()

        vals = content.split(section_delimiter)
        if len(vals) < 3:
            print(f"Skipping {f} due to insufficient data.")
            continue

        # The reply is taken from the second-to-last section: keep whatever
        # follows the last 'Content:' marker and drop all newlines and spaces.
        response = vals[-2].split('Content:')[-1].replace('\n', ' ').replace(' ', '')
        match = response_pattern.search(response)
        if match:
            # Strip only the trailing extension so a name that happens to
            # contain '.txt' elsewhere is not mangled.
            parsed_responses[f[:-len(log_suffix)]] = {
                'answer': int(match.group(1)),
                'confidence': int(match.group(2)),
            }

    print(f'Parsed {len(parsed_responses)} responses from {folder}')
    all_responses[folder] = parsed_responses


Processing folder: mas_safety_gpt4o_mmlu_safety
Parsed 1533 responses from mas_safety_gpt4o_mmlu_safety
Processing folder: mas_metacognition_gpt4o_medcalc_metacognition
Parsed 411 responses from mas_metacognition_gpt4o_medcalc_metacognition
Processing folder: mas_metacognition_gpt4o_pubmedqa_metacognition
Parsed 1000 responses from mas_metacognition_gpt4o_pubmedqa_metacognition
Processing folder: mas_metacognition_gpt4o_medbullets_metacognition
Parsed 308 responses from mas_metacognition_gpt4o_medbullets_metacognition
Processing folder: mas_safety_gpt4o_mmlupro_safety
Parsed 1101 responses from mas_safety_gpt4o_mmlupro_safety
Processing folder: mas_safety_gpt4o_casehold_safety
Parsed 403 responses from mas_safety_gpt4o_casehold_safety
Processing folder: mas_safety_gpt4o_bbq_safety
Parsed 871 responses from mas_safety_gpt4o_bbq_safety
Processing folder: mas_metacognition_gpt4o_metamedqa_metacognition
Parsed 1371 responses from mas_metacognition_gpt4o_metamedqa_metacognition
Processing f

In [27]:
def generate_summary(results_df, experiment_name):
    """Write accuracy statistics for one experiment to ``{experiment_name}_SUMMARY.txt``.

    The report contains the number of rows, the number of correct rows, the
    overall accuracy in percent, and the accuracy per confidence level.

    Args:
        results_df: DataFrame with one row per answered question. The
            ``is_correct`` column is summed and averaged, and the
            ``confidence`` column is used as the grouping key. An empty
            frame is accepted even when it has no columns at all; the report
            then shows zero totals and no per-confidence lines.
        experiment_name: Prefix of the output file name. The file is created
            (or overwritten) relative to the current working directory unless
            the prefix itself contains a directory.

    Returns:
        None. The only effect is the file that is written.
    """
    total = len(results_df)

    if total > 0:
        corrects = results_df['is_correct'].sum()
        accuracy = (corrects / total) * 100
        acc_by_conf = results_df.groupby('confidence')['is_correct'].mean() * 100
    else:
        # pd.DataFrame([]) has no columns, so touching 'is_correct' or
        # 'confidence' would raise KeyError; report zeros instead.
        corrects = 0
        accuracy = 0
        acc_by_conf = pd.Series(dtype=float)

    with open(f"{experiment_name}_SUMMARY.txt", 'w') as f:
        f.write(f"Total questions: {total}\n")
        f.write(f"Correct: {corrects}\n")
        f.write(f"Overall Accuracy: {accuracy:.2f}%\n\n")
        f.write("Accuracy by confidence level:\n")
        for conf, acc in acc_by_conf.items():
            if pd.notna(conf):
                f.write(f"  {int(conf)}: {acc:.2f}%\n")

In [28]:
# Generate the parsed-response CSV and the accuracy summary for every experiment.
# Column order of the CSV; also guarantees a header row when nothing was parsed.
result_columns = ['id', 'model_answer', 'confidence', 'correct_answer', 'is_correct']

for experiment_name, responses in all_responses.items():
    # Experiment folders are named like "mas_<category>_gpt4o_<benchmark>";
    # the benchmark file lives at ../../benchmarks/<category>/<benchmark>.json.
    category = experiment_name.split("_")[1]
    benchmark_name = experiment_name.split("gpt4o_")[1]
    benchmark_path = f'../../benchmarks/{category}/{benchmark_name}.json'
    with open(benchmark_path, 'r') as benchmark_file:
        benchmark_df = pd.DataFrame(json.load(benchmark_file))

    # Build the id -> target lookup once instead of masking the whole frame
    # for every response. setdefault keeps the first row when an id repeats.
    target_by_id = {}
    for benchmark_id, target in zip(benchmark_df['id'].tolist(), benchmark_df['target'].tolist()):
        target_by_id.setdefault(benchmark_id, target)

    results = []
    for raw_id, response in responses.items():
        question_id = int(raw_id)
        if question_id not in target_by_id:
            raise KeyError(
                f"id {question_id} from experiment {experiment_name!r} "
                f"not found in {benchmark_path}"
            )

        answer = int(response['answer'])
        confidence = int(response['confidence'])
        correct = int(target_by_id[question_id])

        results.append({
            'id': question_id,
            'model_answer': answer,
            'confidence': confidence,
            'correct_answer': correct,
            'is_correct': int(answer == correct),
        })

    results_df = pd.DataFrame(results, columns=result_columns)
    generate_summary(results_df, experiment_name)
    results_df.to_csv(f'{experiment_name}_RESPONSES.csv', index=False)

In [None]:
# summary files for all mas_safety and mas_metacognition
# bbq summary for eval optim and response file
# bbq summary for zero shot and response file 