# Comparing Calibration Data Across All Models / Question Sets

In [None]:
%pip install -r requirements.txt

In [6]:
# Initialize
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from pathlib import Path
import json


## Combine All Results Into Composite CSV

First, we read the `Parsed Results` folder and build a nested dictionary that mirrors its directory structure.

In [20]:
# Read the folder and create a dictionary to model the structure of the files

def folder_tree_dict(root, *, include_files=True, follow_symlinks=False, ignore_hidden=True):
    """Return a nested dict describing the directory tree under ``root``.

    Parameters
    ----------
    root : str or Path
        Directory to scan.
    include_files : bool
        When True, non-directory entries are recorded with the value ``None``.
    follow_symlinks : bool
        When False, a symlinked directory is not descended into; it is handled
        like any other non-directory entry.
    ignore_hidden : bool
        When True, entries whose name starts with "." are skipped.

    Returns
    -------
    dict
        ``{root_name: subtree}`` where ``root_name`` is the final component of
        ``root`` and each subtree maps entry names to a nested dict
        (directory), ``None`` (file) or ``"<permission-denied>"``.
    """

    def _walk(directory):
        # Directories sort ahead of files; names compare case-insensitively.
        children = list(directory.iterdir())
        children.sort(key=lambda child: (child.is_file(), child.name.lower()))

        tree = {}
        for child in children:
            name = child.name
            if ignore_hidden and name.startswith("."):
                continue
            try:
                descend = child.is_dir() and (follow_symlinks or not child.is_symlink())
                if descend:
                    tree[name] = _walk(child)
                elif include_files:
                    tree[name] = None  # or {"size": child.stat().st_size}
            except PermissionError:
                # Also reached when listing a sub-directory inside _walk fails.
                tree[name] = "<permission-denied>"
        return tree

    top = Path(root)
    return {top.name: _walk(top)}


folder_path = r"Parsed Results"

# folder_tree_dict keys its result by the final path component of the root
# (Path(root).name), not by the string that was passed in. Index with the same
# expression so the lookup keeps working if folder_path is ever given as a
# nested path or with a trailing separator.
folder_abstraction_dict = folder_tree_dict(folder_path)[Path(folder_path).name]

# Top-level entries of the results folder (displayed as the cell output).
folder_abstraction_dict.keys()

dict_keys(['Claude', 'Deepseek', 'Gemini', 'GPT', 'Llama'])

### Combined CSV
We want to build a well-formed CSV for future analysis. It will have the following columns:

```text
Question Set (str) ---------------- Required: The display name of the question set
Question ID (str) ----------------- Required: The Question ID
Model (str) ----------------------- Required: The model that provided the response (e.g. Llama-3.1-8B-Instruct)
Model Type (str) ------------------ Required: The family of models to which the model belongs (e.g. Llama)
Coerce (Bool) --------------------- Required: Whether the parser was able to understand the response

Question (str) -------------------- Required: The question posed to the model
Correct Answer (str) -------------- Optional: Depends on Question Set (LifeEval is different than others)
Content (str) --------------------- Optional: Depends on Coerce value (NA if Coerce == False)
Reasoning (str) ------------------- Optional: Depends on Coerce value (NA if Coerce == False)
Answer (str) ---------------------- Optional: Depends on Coerce value (NA if Coerce == False)
Score (float) --------------------- Optional: Depends on Coerce value (NA if Coerce == False)

Stated Confidence Answer (float) -- Optional: Depends on Question Set (NA if not available)
Stated Confidence A (float) ------- Optional: Depends on Question Set (NA if not available)
Stated Confidence B (float) ------- Optional: Depends on Question Set (NA if not available)
Stated Confidence C (float) ------- Optional: Depends on Question Set (NA if not available)
Stated Confidence D (float) ------- Optional: Depends on Question Set (NA if not available)
Stated Confidence E (float) ------- Optional: Depends on Question Set (NA if not available)

Token Probability Answer (float) -- Optional: Depends on Model Type (NA if not available)
Token Probability A (float) ------- Optional: Depends on Model Type (NA if not available)
Token Probability B (float) ------- Optional: Depends on Model Type (NA if not available)
Token Probability C (float) ------- Optional: Depends on Model Type (NA if not available)
Token Probability D (float) ------- Optional: Depends on Model Type (NA if not available)
Token Probability E (float) ------- Optional: Depends on Model Type (NA if not available)

```

In [392]:
# Start from empty frames so the names exist even if no results file is found.
combined_df = pd.DataFrame()
clean_df = pd.DataFrame()  # not referenced by the cells visible in this notebook


# Question sets answered with lettered options (A-E); used by later cells.
mcq_qsets = ['LSAT-AR', 'SAT-EN', 'SciQ']


# Collect one frame per results file and concatenate once at the end;
# concatenating inside the loop re-copies every previously loaded row on each
# iteration.
source_frames = []

for model_type, models in folder_abstraction_dict.items():
    # Only directories map to a dict. Files sitting directly in the results
    # folder (for example the combined CSVs written by the save cell) map to
    # None and must be skipped, otherwise `.items()` fails on a fresh re-run.
    if not isinstance(models, dict):
        continue
    model_type_path = folder_path + f"/{model_type}"
    print(model_type_path)
    for model_name, qsets in models.items():
        if not isinstance(qsets, dict):
            continue
        model_path = model_type_path + f"/{model_name}"
        print(f"    {model_path}")
        for qset_file_name, qset_entry in qsets.items():
            # Files are recorded as None; anything else is a sub-folder or a
            # permission-denied marker and cannot be read as a CSV.
            if qset_entry is not None:
                continue
            # File names look like "<question set>_<model name>.csv"; the part
            # before "_<model name>" is the raw question-set name.
            splitter = f"_{model_name}"
            qset_name = qset_file_name.split(splitter)[0]
            qset_path = model_path + f"/{qset_file_name}"
            print(f"        {qset_name}:    {qset_path}")
            #--------- Write a function to spit out a dataframe w/ model_name, qset_name, true_answer and concat it

            source_df = pd.read_csv(qset_path)
            # Tag every row with where it came from.
            source_df["Model"] = model_name
            source_df["Model Type"] = model_type
            source_df["Question Set"] = qset_name
            # IDs are kept as strings so they can be joined into text keys later.
            source_df["Question ID"] = source_df["Question ID"].astype(str)
            source_frames.append(source_df)

if source_frames:
    combined_df = pd.concat(source_frames, ignore_index=True)

# Drop the stray columns carried over from the source CSVs. errors="ignore"
# keeps the cell runnable when a column is absent (e.g. nothing was loaded).
combined_df = combined_df.drop(columns=["Unnamed: 0", "Question ID.1"], errors="ignore")

## Still need to add correct answer and score

# Translate the raw column headers into the names used by the combined-CSV schema.
_answer_letters = ("A", "B", "C", "D", "E")

col_rename_map = {}

# Metadata (these four already carry their final names; listed for completeness)
col_rename_map.update(
    {name: name for name in ("Question Set", "Question ID", "Model", "Model Type")}
)
col_rename_map["coerce"] = "Coerce"

# Model Response
col_rename_map.update({"content": "Content", "Reasoning": "Reasoning", "Answer": "Answer"})

# Stated Confidence: one column for the chosen answer plus one per option letter
col_rename_map["Confidence"] = "Stated Confidence Answer"
col_rename_map.update(
    {letter: f"Stated Confidence {letter}" for letter in _answer_letters}
)

# Token Probability: True/False, chosen answer, then one per option letter
col_rename_map.update(
    {
        "True_prob": "Token Probability True",
        "False_prob": "Token Probability False",
        "Answer_prob": "Token Probability Answer",
    }
)
col_rename_map.update(
    {f"{letter}_prob": f"Token Probability {letter}" for letter in _answer_letters}
)

combined_df = combined_df.rename(columns=col_rename_map)

# Raw question-set name (taken from the file name) -> display name.
qset_rename = dict(
    boolq_valid="BoolQ",
    halu_eval_qa="HaluEval",
    life_eval="LifeEval",
    lsat_ar_test="LSAT-AR",
    sat_en="SAT-EN",
    sciq_test="SciQ",
)

# Series.map with a dict turns any name missing from the mapping into NaN.
qset_display_names = combined_df["Question Set"].map(qset_rename)
combined_df["Question Set"] = qset_display_names

# Preview the first rows with every column shown (no column truncation).
# 'display.width' and 'display.max_colwidth' are intentionally left at their defaults.
with pd.option_context("display.max_columns", None):
    display(combined_df.head(5))

Parsed Results/Claude
    Parsed Results/Claude/claude-3-7-sonnet-20250219
        boolq_valid:    Parsed Results/Claude/claude-3-7-sonnet-20250219/boolq_valid_claude-3-7-sonnet-20250219.csv
        halu_eval_qa:    Parsed Results/Claude/claude-3-7-sonnet-20250219/halu_eval_qa_claude-3-7-sonnet-20250219.csv
        life_eval:    Parsed Results/Claude/claude-3-7-sonnet-20250219/life_eval_claude-3-7-sonnet-20250219.csv
        lsat_ar_test:    Parsed Results/Claude/claude-3-7-sonnet-20250219/lsat_ar_test_claude-3-7-sonnet-20250219.csv
        sat_en:    Parsed Results/Claude/claude-3-7-sonnet-20250219/sat_en_claude-3-7-sonnet-20250219.csv
        sciq_test:    Parsed Results/Claude/claude-3-7-sonnet-20250219/sciq_test_claude-3-7-sonnet-20250219.csv
    Parsed Results/Claude/claude-3-haiku-20240307
        boolq_valid:    Parsed Results/Claude/claude-3-haiku-20240307/boolq_valid_claude-3-haiku-20240307.csv
        halu_eval_qa:    Parsed Results/Claude/claude-3-haiku-20240307/halu_eval_qa

Unnamed: 0,Reasoning,Answer,Stated Confidence Answer,Coerce,Content,Question ID,Model,Model Type,Question Set,Stated Confidence A,Stated Confidence B,Stated Confidence C,Stated Confidence D,Stated Confidence E,Token Probability True,Token Probability False,Token Probability Answer,Token Probability A,Token Probability B,Token Probability C,Token Probability D,Token Probability E
0,The question is asking whether the production ...,False,0.6,True,"{\n ""Reasoning"": ""The question is asking wh...",0,claude-3-7-sonnet-20250219,Claude,BoolQ,,,,,,,,,,,,,
1,House tax and property tax are often used inte...,True,0.85,True,"{\n ""Reasoning"": ""House tax and property ta...",1,claude-3-7-sonnet-20250219,Claude,BoolQ,,,,,,,,,,,,,
2,This question is asking about phantom limb pai...,True,0.95,True,"{\n ""Reasoning"": ""This question is asking a...",2,claude-3-7-sonnet-20250219,Claude,BoolQ,,,,,,,,,,,,,
3,Harry Potter and the Escape from Gringotts is ...,True,0.98,True,"{\n ""Reasoning"": ""Harry Potter and the Esca...",3,claude-3-7-sonnet-20250219,Claude,BoolQ,,,,,,,,,,,,,
4,Hydroxyzine HCl (hydrochloride) and hydroxyzin...,True,0.98,True,"{\n ""Reasoning"": ""Hydroxyzine HCl (hydrochl...",4,claude-3-7-sonnet-20250219,Claude,BoolQ,,,,,,,,,,,,,


In [387]:
# A question is identified across models by "<Question ID>_<Question Set>".
question_key = combined_df["Question ID"] + "_" + combined_df["Question Set"]

# Rows whose response the parser could not coerce.
coerce_failed = combined_df["Coerce"] == False

# this gets the qid and qset where coerce is false
bad_qid_qset = combined_df.loc[coerce_failed, ["Question ID", "Question Set"]]

# Keys of every question that failed to parse for at least one model.
bad = set(question_key[coerce_failed])

# Work on a copy that carries the key as an extra column.
combined_clean = combined_df.assign(combined_name=question_key)

# Drop such a question for all models, not only for the model that failed.
mask = ~combined_clean["combined_name"].isin(bad)   # True = keep
combined_clean = combined_clean[mask]

In [391]:
## Clean up MCQ Question Sets
letter_confidence_cols = [f"Stated Confidence {letter}" for letter in "ABCDE"]

is_mcq = combined_clean["Question Set"].isin(mcq_qsets)
cc = combined_clean[is_mcq]
cc_letters = cc[letter_confidence_cols].copy()

# Row-wise total of the per-letter confidences. sum() skips missing values, so
# a row with no letter confidence at all also totals 0.
sum_confidence = cc_letters.sum(axis = 1)

# Keys of the MCQ questions having at least one response with a zero total.
con_mask = cc.loc[sum_confidence == 0.0, "combined_name"]

# Keep only the rows whose key is not in con_mask
keep_rows = ~combined_clean["combined_name"].isin(con_mask)
combined_clean = combined_clean.loc[keep_rows]

#---------------------------------------------------------------------------------------------
## Make sure we also eliminate QIDs that are missing for some model
he = combined_clean[combined_clean["Question Set"] == "HaluEval"]  # not used further in this cell

# Every question should have one row per model. Derive the model count from the
# raw data instead of hard-coding it, so the filter stays correct when a model
# is added or removed.
n_models = combined_df["Model"].nunique()

# Remainder of rows-per-question modulo the model count; a non-zero remainder
# means the question is not present a whole number of times per model.
# NOTE(review): the original author flagged that this "seems to get rid of too
# much" - the modulo test itself is unchanged here and still needs confirming.
s = combined_clean["combined_name"].value_counts() % n_models
bad_qid = s.index[s.ne(0)].tolist()

combined_clean = combined_clean[~combined_clean["combined_name"].isin(bad_qid)]


## Special mask for LifeEval:
# Keep only LifeEval questions whose stated confidence parses as a number.

is_life_eval = combined_clean["Question Set"] == "LifeEval"
le_df = combined_clean.loc[is_life_eval]

# errors='coerce' turns anything non-numeric into NaN, so notna() marks the
# rows with a usable numeric confidence.
le_confidence = pd.to_numeric(le_df['Stated Confidence Answer'], errors='coerce')
con_isnum = le_confidence.notna()

# Keys of the questions with at least one non-numeric confidence.
le_bad_qid = le_df.loc[~con_isnum, "combined_name"]

le_keep = ~combined_clean["combined_name"].isin(le_bad_qid)
combined_clean = combined_clean[le_keep]


# Preview the filtered frame; the shape is the cell's output value.
display(combined_clean.head())
combined_clean.shape



Unnamed: 0,Reasoning,Answer,Stated Confidence Answer,Coerce,Content,Question ID,Model,Model Type,Question Set,Stated Confidence A,...,Stated Confidence E,Token Probability True,Token Probability False,Token Probability Answer,Token Probability A,Token Probability B,Token Probability C,Token Probability D,Token Probability E,combined_name
0,The question is asking whether the production ...,False,0.6,True,"{\n ""Reasoning"": ""The question is asking wh...",0,claude-3-7-sonnet-20250219,Claude,BoolQ,,...,,,,,,,,,,0_BoolQ
2,This question is asking about phantom limb pai...,True,0.95,True,"{\n ""Reasoning"": ""This question is asking a...",2,claude-3-7-sonnet-20250219,Claude,BoolQ,,...,,,,,,,,,,2_BoolQ
3,Harry Potter and the Escape from Gringotts is ...,True,0.98,True,"{\n ""Reasoning"": ""Harry Potter and the Esca...",3,claude-3-7-sonnet-20250219,Claude,BoolQ,,...,,,,,,,,,,3_BoolQ
4,Hydroxyzine HCl (hydrochloride) and hydroxyzin...,True,0.98,True,"{\n ""Reasoning"": ""Hydroxyzine HCl (hydrochl...",4,claude-3-7-sonnet-20250219,Claude,BoolQ,,...,,,,,,,,,,4_BoolQ
5,Barq's Root Beer is not a Pepsi product. It is...,False,0.98,True,"{\n ""Reasoning"": ""Barq's Root Beer is not a...",5,claude-3-7-sonnet-20250219,Claude,BoolQ,,...,,,,,,,,,,5_BoolQ


(69278, 23)

## Summary Info

Here we see the Raw versus Filtered counts for all the datasets.

In [None]:
# Number of models in the raw data; dividing row counts by it gives the number
# of questions per question set. Derived rather than hard-coded so the table
# stays correct if the set of models changes.
n_models = combined_df["Model"].nunique()

raw_rows = combined_df["Question Set"].value_counts()
filtered_rows = combined_clean["Question Set"].value_counts()

counts = pd.DataFrame({
    # Rounded because the raw row count need not be an exact multiple of the
    # model count (author's note: this is slightly below 2000 for Llama because
    # we had to drop 3 duplicates).
    "Raw": (raw_rows / n_models).round(),
    "Filtered": filtered_rows / n_models,
})

counts["Prop. Kept"] = counts["Filtered"] / counts["Raw"]
counts
# TODO: Need to look at LSAT-AR (-1)??? and SciQ (-1) - am I getting rid of too much?

Unnamed: 0_level_0,Raw,Filtered,Prop. Kept
Question Set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BoolQ,3270.0,2503.0,0.765443
HaluEval,1999.0,1790.0,0.895448
LSAT-AR,230.0,86.0,0.373913
LifeEval,808.0,751.0,0.929455
SAT-EN,206.0,173.0,0.839806
SciQ,1000.0,995.0,0.995


Jacob's cleaned version of SciQ has 996 rows, while mine has 995. Investigating each Question ID deleted in my version shows that every one of them has exactly one instance where the answer could not be coerced. Does Jacob's version contain an extra question that shouldn't be there?

In [354]:


# SciQ rows before and after filtering. The filters never reset the index, so
# row labels are comparable between the two frames.
sciq_raw = combined_df.loc[combined_df["Question Set"] == "SciQ"]
sciq_filtered = combined_clean.loc[combined_clean["Question Set"] == "SciQ"]

sciq_filtered_ids = sciq_filtered.index

# Raw SciQ rows that did not survive the filtering.
was_kept = sciq_raw.index.isin(sciq_filtered_ids)
deleted = sciq_raw.loc[~was_kept]
deleted["Question ID"].unique()

# Deleted QID in SciQ are ['13', '40', '295', '699', '705'] after visual inspection they all seem to have False

# Per deleted question: how many responses failed to coerce, as a DataFrame.
false_counts = (
    (deleted['Coerce'] == False)
    .groupby(deleted['Question ID'])
    .sum()
    .rename('false_count')
    .reset_index()
)
false_counts


Unnamed: 0,Question ID,false_count
0,13,1
1,295,1
2,40,1
3,699,1
4,705,1


In [128]:
# Number of successfully coerced responses per model (rows) and question set (columns).
coerced_rows = combined_df.loc[combined_df["Coerce"] == True]
coerced_rows.pivot_table(
    index="Model",
    columns="Question Set",
    aggfunc="size",
    fill_value=0,
)

Question Set,BoolQ,HaluEval,LSAT-AR,LifeEval,SAT-EN,SciQ
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Meta-Llama-3.1-70B-Instruct,3241,1960,208,807,202,1000
Meta-Llama-3.1-8B-Instruct,3200,1997,190,800,202,997
claude-3-7-sonnet-20250219,3267,2000,228,808,206,999
claude-3-haiku-20240307,3036,1855,230,808,181,999
claude-sonnet-4-20250514,3258,2000,188,808,205,1000
deepseek-r1,3214,2000,228,808,206,1000
deepseek-v3,2898,2000,228,808,204,1000
gemini-2.5-flash,3261,2000,177,808,206,1000
gemini-2.5-pro,3190,1984,188,808,204,1000
gpt-4o,3247,2000,230,808,206,1000


### Additional EDA:

Don was curious about some missing fields. For BoolQ we can see that all models have 2503 rows.

In [351]:
# Rows per model in the filtered BoolQ data.
combined_clean.loc[combined_clean["Question Set"] == "BoolQ", "Model"].value_counts()

Model
claude-3-7-sonnet-20250219     2503
claude-3-haiku-20240307        2503
claude-sonnet-4-20250514       2503
deepseek-r1                    2503
deepseek-v3                    2503
gemini-2.5-flash               2503
gemini-2.5-pro                 2503
gpt-4o                         2503
o3-2025-04-16                  2503
Meta-Llama-3.1-70B-Instruct    2503
Meta-Llama-3.1-8B-Instruct     2503
Name: count, dtype: int64

In [361]:
# First filtered BoolQ rows from the Gemini model family.
is_boolq = combined_clean["Question Set"] == "BoolQ"
is_gemini = combined_clean["Model Type"] == "Gemini"
combined_clean[is_boolq & is_gemini].head()

Unnamed: 0,Reasoning,Answer,Stated Confidence Answer,Coerce,Content,Question ID,Model,Model Type,Question Set,Stated Confidence A,...,Stated Confidence E,Token Probability True,Token Probability False,Token Probability Answer,Token Probability A,Token Probability B,Token Probability C,Token Probability D,Token Probability E,combined_name
37570,The television series 'The Tudors' aired on Sh...,False,1.0,True,"```json\n{\n ""Reasoning"": ""The television s...",3000,gemini-2.5-flash,Gemini,BoolQ,,...,,,,,,,,,,3000_BoolQ
37571,The question asks if the character Marley dies...,True,1.0,True,"```json\n{\n ""Reasoning"": ""The question ask...",3001,gemini-2.5-flash,Gemini,BoolQ,,...,,,,,,,,,,3001_BoolQ
37572,Canada is a prominent country with a well-deve...,True,1.0,True,"```json\n{\n ""Reasoning"": ""Canada is a prom...",3002,gemini-2.5-flash,Gemini,BoolQ,,...,,,,,,,,,,3002_BoolQ
37574,The term 'Pit Bull' commonly refers to a type ...,False,1.0,True,"```json\n{\n ""Reasoning"": ""The term 'Pit Bu...",3004,gemini-2.5-flash,Gemini,BoolQ,,...,,,,,,,,,,3004_BoolQ
37575,A 'postal code' is a general term used interna...,False,1.0,True,"Response: {\n ""Reasoning"": ""A 'postal code'...",3005,gemini-2.5-flash,Gemini,BoolQ,,...,,,,,,,,,,3005_BoolQ


## Save both CSVs



In [390]:

# Both outputs are written into the results folder itself, i.e. the same folder
# that the directory scan at the top of the notebook reads.
output_dir = Path("Parsed Results")
raw_path = output_dir / "combined_raw.csv"
clean_path = output_dir / "combined_clean.csv"

# write (without the index; combined_clean still carries its helper key column)
for frame, destination in ((combined_df, raw_path), (combined_clean, clean_path)):
    frame.to_csv(destination, index=False, encoding="utf-8")
