In [1]:
import json
import os
import re
import shutil
import subprocess
from difflib import get_close_matches

import pandas as pd
import spacy
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

# Question Filtering by Num Times Incorrect

In [6]:
# For every MMLU category: question id -> number of response files in which that
# question was marked incorrect, accumulated over the three result directories below.
mmlu_counts = {'ethics': {}, 'safety': {}, 'metacognition': {}}

# (results directory, position of the category token in the '_'-split file name).
# The position differs per directory because each one uses its own file naming scheme.
_response_dirs = [
    ('../../out/gpt4o/mas/', 4),
    ('../../out/gpt4o/evaloptim/', 1),
    ('../../out/gpt4o/zeroshot/', 2),
]


def _tally_incorrect(results_dir, category_field, counts):
    """Add the incorrect answers found in one results directory to ``counts``.

    Only files whose name contains both 'RESPONSE' and 'mmlu_' are read. Each
    file is loaded as CSV and indexed by its 'id' column; every row whose
    'is_correct' value equals 0 increments ``counts[category][id]``.

    Parameters
    ----------
    results_dir : str
        Directory to scan; it is concatenated directly with the file name, so
        it must end with a path separator.
    category_field : int
        Index of the category token after splitting the file name on '_'.
    counts : dict
        Mapping category -> {question id -> count}; updated in place.

    Raises
    ------
    FileNotFoundError
        If ``results_dir`` does not exist.
    KeyError
        If a file with incorrect rows maps to a category missing from ``counts``.
    """
    for fname in os.listdir(results_dir):
        if 'RESPONSE' not in fname or 'mmlu_' not in fname:
            continue
        responses = pd.read_csv(results_dir + fname).set_index('id')
        category = fname.split('_')[category_field]
        # Boolean mask instead of a row-by-row iterrows() scan.
        for question_id in responses.index[responses['is_correct'] == 0]:
            counts[category][question_id] = counts[category].get(question_id, 0) + 1


for _results_dir, _category_field in _response_dirs:
    _tally_incorrect(_results_dir, _category_field, mmlu_counts)




In [7]:
# Minimum number of incorrect answers for a question to be kept.
threshold = 3

# Per category: sorted ids of the questions that reached the threshold.
thresh_ids = {'ethics': [], 'safety': [], 'metacognition': []}
for category, counts in mmlu_counts.items():
    print(f"Category: {category}")
    thresh_ids[category].extend(
        question_id for question_id, n_incorrect in counts.items() if n_incorrect >= threshold
    )
    print(f"{len(thresh_ids[category])} IDs with at least {threshold}")
    thresh_ids[category].sort()
    print()  # Add a newline for better readability between categories

Category: ethics
0 IDs with at least 3

Category: safety
0 IDs with at least 3

Category: metacognition
0 IDs with at least 3



In [None]:
# Persisting thresh_ids is currently disabled; the following cell loads this same file instead.
# NOTE(review): the run shown above reported 0 IDs for every category, so re-enabling this line
# in that state would overwrite the saved file with empty lists — confirm before uncommenting.
#json.dump(thresh_ids, open('../../out/gpt4o/mmlu_incorrect_threshold_3.json', 'w'))

In [9]:
# Load the saved per-category id lists; this rebinds thresh_ids, replacing any value
# computed earlier in the session. The context manager closes the file handle, which
# the bare open() call previously left open.
with open('../../out/gpt4o/mmlu_incorrect_threshold_3.json', 'r') as thresh_file:
    thresh_ids = json.load(thresh_file)

# Log Parsing for "Difficult Questions" 

In [22]:
# Root directory holding one sub-directory per benchmark run (absolute path on the analysis host).
_runs_root = '/mnt/bulk-titan/sanddhya/agents_edge/Agent_Edge_Benchmarking/out'

# workflow -> category -> name of the run directory under _runs_root.
# NOTE(review): 'evaloptimizer_mmlu_ethics' has no 'gpt4o' segment, unlike the other
# evaloptimizer runs — confirm this matches the directory name on disk.
_run_dirs = {
    'mas': {
        'ethics': 'mas_ethics_gpt4o_mmlu_ethics',
        'safety': 'mas_safety_gpt4o_mmlu_safety',
        'metacognition': 'mas_metacognition_gpt4o_mmlu_metacognition',
    },
    'evaloptim': {
        'ethics': 'evaloptimizer_mmlu_ethics',
        'safety': 'evaloptimizer_gpt4o_mmlu_safety',
        'metacognition': 'evaloptimizer_gpt4o_mmlu_metacognition',
    },
}

# workflow -> category -> directory containing the per-question log files.
log_paths = {
    workflow: {category: f'{_runs_root}/{run_dir}/logs' for category, run_dir in runs.items()}
    for workflow, runs in _run_dirs.items()
}

In [27]:
# Copy the log file of every thresholded question into
# <analysis root>/<workflow>/<category>/, printing one line per copied file.
_thresh_analysis_root = '/mnt/bulk-titan/sanddhya/agents_edge/Agent_Edge_Benchmarking/out/gpt4o/mmlu_thresh_analysis'

# log_paths is keyed workflow -> category, so the loop variables are named accordingly.
for workflow, category_log_dirs in log_paths.items():
    for category, log_dir in category_log_dirs.items():
        print(workflow.upper(), category.upper())
        wanted_ids = set(thresh_ids[category])  # set: constant-time membership per file
        dest_dir = os.path.join(_thresh_analysis_root, workflow, category)
        # Create the destination up front; copying into a missing directory would fail.
        os.makedirs(dest_dir, exist_ok=True)
        for logfile in os.listdir(log_dir):
            stem, extension = os.path.splitext(logfile)
            # Log files are named '<question id>.txt'; skip any other file instead of
            # crashing in int() on a non-numeric name.
            if extension != '.txt' or not stem.isdecimal():
                continue
            if int(stem) not in wanted_ids:
                continue
            print(logfile, workflow, log_dir)
            # shutil.copy raises on failure, whereas the unchecked `cp` subprocess
            # silently ignored a non-zero exit status.
            shutil.copy(os.path.join(log_dir, logfile), os.path.join(dest_dir, logfile))
        print()

MAS ETHICS
25.txt mas /mnt/bulk-titan/sanddhya/agents_edge/Agent_Edge_Benchmarking/out/mas_ethics_gpt4o_mmlu_ethics/logs
612.txt mas /mnt/bulk-titan/sanddhya/agents_edge/Agent_Edge_Benchmarking/out/mas_ethics_gpt4o_mmlu_ethics/logs
118.txt mas /mnt/bulk-titan/sanddhya/agents_edge/Agent_Edge_Benchmarking/out/mas_ethics_gpt4o_mmlu_ethics/logs
854.txt mas /mnt/bulk-titan/sanddhya/agents_edge/Agent_Edge_Benchmarking/out/mas_ethics_gpt4o_mmlu_ethics/logs
885.txt mas /mnt/bulk-titan/sanddhya/agents_edge/Agent_Edge_Benchmarking/out/mas_ethics_gpt4o_mmlu_ethics/logs
350.txt mas /mnt/bulk-titan/sanddhya/agents_edge/Agent_Edge_Benchmarking/out/mas_ethics_gpt4o_mmlu_ethics/logs
22.txt mas /mnt/bulk-titan/sanddhya/agents_edge/Agent_Edge_Benchmarking/out/mas_ethics_gpt4o_mmlu_ethics/logs
163.txt mas /mnt/bulk-titan/sanddhya/agents_edge/Agent_Edge_Benchmarking/out/mas_ethics_gpt4o_mmlu_ethics/logs
502.txt mas /mnt/bulk-titan/sanddhya/agents_edge/Agent_Edge_Benchmarking/out/mas_ethics_gpt4o_mmlu_ethi

# Identifying medically relevant questions from the "difficult subset" using clinical relevance scoring approaches

mmlu_ethics_thresh_filtered = pd.DataFrame

In [70]:
def _load_thresh_questions(benchmark_path, question_ids):
    """Return the 'question' column of the benchmark rows whose id is in ``question_ids``.

    The benchmark JSON is loaded into a DataFrame indexed by its 'id' column and
    the rows are selected by label with ``.loc``. Positional ``.iloc`` selection
    only returns the same rows when the ids happen to equal the row positions.

    Parameters
    ----------
    benchmark_path : str
        Path of the benchmark JSON file.
    question_ids : list
        Question ids to keep, in the order the rows should appear.

    Returns
    -------
    pandas.DataFrame
        Single-column ('question') frame indexed by 'id'.

    Raises
    ------
    KeyError
        If any id in ``question_ids`` is absent from the benchmark.
    """
    with open(benchmark_path, 'r') as benchmark_file:
        benchmark = pd.DataFrame(json.load(benchmark_file)).set_index('id')
    return benchmark.loc[question_ids, ['question']]


mmlu_ethics_thresh_filtered = _load_thresh_questions('../../benchmarks/ethics/mmlu_ethics.json', thresh_ids['ethics'])
mmlu_metacognition_thresh_filtered = _load_thresh_questions('../../benchmarks/metacognition/mmlu_metacognition.json', thresh_ids['metacognition'])
mmlu_safety_thresh_filtered = _load_thresh_questions('../../benchmarks/safety/mmlu_safety.json', thresh_ids['safety'])


In [79]:
# ----------------------------
# 1. Load models and resources
# ----------------------------

# Hugging Face checkpoint used for both the tokenizer and the encoder
_biobert_checkpoint = "dmis-lab/biobert-base-cased-v1.1"

# Load BioBERT model
print("Loading BioBERT...")
tokenizer = AutoTokenizer.from_pretrained(_biobert_checkpoint)
biobert = AutoModel.from_pretrained(_biobert_checkpoint)

# Load SciSpacy model (the load itself is commented out; only the message is printed)
print("Loading SciSpacy...")
#nlp = spacy.load("en_core_sci_md")

# Load UMLS/ontology terms from file: one term per line, stripped and lower-cased,
# blank lines dropped. A missing file leaves the set empty.
print("Loading medical ontology terms...")
try:
    with open("umls_terms.txt", "r", encoding="utf-8") as terms_file:
        medical_terms = {term.lower() for term in map(str.strip, terms_file) if term}
except FileNotFoundError:
    print("Warning: umls_terms.txt not found. Ontology matching will be skipped.")
    medical_terms = set()

# ----------------------------
# 2. Define scoring functions
# ----------------------------

def get_embedding(text):
    """Embed ``text`` with the module-level ``tokenizer`` and ``biobert`` model.

    The text is tokenized with truncation at 128 tokens, the model is run with
    gradient tracking disabled, and the hidden state at sequence position 0 is
    returned as a NumPy array.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    with torch.no_grad():
        hidden_states = biobert(**encoded).last_hidden_state
    first_token_state = hidden_states[:, 0, :]  # CLS token
    return first_token_state.numpy()

# Embedding of a fixed list of clinical words; similarity_score compares every text against it.
# The backslash-newline pairs sit inside the string literal, so they join the lines and the
# leading indentation of each continuation line becomes part of the text passed to get_embedding.
# NOTE(review): commas appear between only some of the words — confirm whether that is intended.
reference_embedding = get_embedding("diagnosis treatment prognosis sign therapy disease infection scan cancer \
                                     patient surgery imaging clinical symptom prescription \
                                        hospital doctor pain referral, consultation, follow-up, admission, history, discharge, \
                                    inflammation fever fatigue swelling nausea cough dizziness, appetite, weak temperature \
                                    pulse blood pressure oxygen height weight examination test injection dose antibiotic")

def similarity_score(text):
    """Return the cosine similarity between ``text`` and ``reference_embedding`` as a float.

    ``text`` is embedded with ``get_embedding`` and the first entry of the
    resulting similarity matrix is returned.
    """
    similarities = cosine_similarity(get_embedding(text), reference_embedding)
    first_row = similarities[0]
    return float(first_row[0])

#def ner_score(text):
#    doc = nlp(text)
#    return len([ent for ent in doc.ents if ent.label_])

# Lower-case clinical vocabulary used by keyword_score. Entries containing a space
# ("blood pressure") are matched as whole phrases, all others as single tokens.
clinical_keywords = {
    "diagnosis", "treatment", "prognosis", "sign", "therapy", "disease", "infection", "scan", "cancer",
    "patient", "surgery", "imaging", "clinical", "symptom", "prescription",
    "hospital", "doctor", "pain", "referral", "consultation", "follow-up", "admission", "history", "discharge",
    "inflammation", "fever", "fatigue", "swelling", "nausea", "cough", "dizziness", "appetite", "weak", "temperature",
    "pulse", "blood pressure", "oxygen", "height", "weight", "examination", "test", "injection", "dose", "antibiotic",
}

# A token is a run of ASCII letters/digits, optionally joined by single hyphens ("follow-up").
_keyword_token_pattern = re.compile(r"[a-z0-9]+(?:-[a-z0-9]+)*")


def keyword_score(text):
    """Count how many distinct entries of ``clinical_keywords`` occur in ``text``.

    The text is lower-cased and tokenized on anything that is not a letter,
    digit or inner hyphen, so a keyword followed by punctuation ("fever.",
    "pain?") is still recognised. Splitting on whitespace alone left the
    punctuation attached and missed those words. Keywords containing a space
    are looked up as whole-token phrases in the normalised text, because they
    can never equal a single token.

    Parameters
    ----------
    text : str
        Text to score.

    Returns
    -------
    int
        Number of distinct keywords found; each keyword counts at most once.
    """
    tokens = _keyword_token_pattern.findall(text.lower())
    unique_tokens = set(tokens)
    # Pad with spaces so phrase lookups only match on token boundaries.
    padded_text = ' ' + ' '.join(tokens) + ' '
    return sum(
        (f' {keyword} ' in padded_text) if ' ' in keyword else (keyword in unique_tokens)
        for keyword in clinical_keywords
    )

def ontology_score(text):
    """Return 1.0 if the lower-cased ``text`` closely matches a term in ``medical_terms``, else 0.0.

    Matching uses ``difflib.get_close_matches`` with a cutoff of 0.85. An empty
    ``medical_terms`` collection yields 0.0 without any matching.

    NOTE(review): the whole input text is compared against each term, so long
    questions seem unlikely to reach the cutoff — confirm this is intended.
    """
    if medical_terms:
        closest = get_close_matches(text.lower(), medical_terms, n=1, cutoff=0.85)
        if closest:
            return 1.0
    return 0.0

def composite_score(text):
    """Combine the similarity, keyword and ontology scores of ``text`` into one number.

    The result is 0.7 * similarity_score + 0.3 * keyword_score + 0.2 * ontology_score.
    The NER component is disabled.

    NOTE(review): the weights sum to 1.2 and keyword_score is an unbounded
    count, so the result is not limited to [0, 1] — confirm this is intended.
    """
    semantic_part = 0.7 * similarity_score(text)
    #ner_part = 0.3 * ner_score(text)
    keyword_part = 0.3 * keyword_score(text)
    ontology_part = 0.2 * ontology_score(text)
    return semantic_part + keyword_part + ontology_part


Loading BioBERT...
Loading SciSpacy...
Loading medical ontology terms...


In [80]:
# Print the id and composite score of every question in the filtered metacognition subset.
for idx, question in mmlu_metacognition_thresh_filtered['question'].items():
    print(idx, composite_score(question))

0 0.5964570343494415
3 0.544698965549469
4 0.5790937781333922
26 0.5806000709533691
48 0.5501837790012359
51 0.5570122480392455
57 0.5418634116649628
60 0.5783970832824706
77 0.5863012194633483
78 0.5341133594512939
81 0.5364590823650359
96 0.5966560125350951
108 0.5356849491596222
112 0.5713735163211822
115 0.5891846299171447
174 0.8846885323524474
221 2.3889678359031676
257 2.3794330716133114
278 1.7943316519260406
341 1.4791878223419188
