In [47]:
from pathlib import Path
import json
import pandas as pd

In [48]:
root_path = Path.cwd().parent / "results"

In [49]:
def is_not_hallucination(response, model):
    gpt_model = "I'm sorry, but" in response and (
            "Examples" in response or "example" in response or "examples" in response)

    gemini_model_part = "provided context" in response or "provided text" in response or "information" in response
    gemini_model = ("does not mention" in response and gemini_model_part) or (
            "does not contain" in response and gemini_model_part) or (
                           "not available" in response and gemini_model_part) or (
                           "could not extract" in response and gemini_model_part) or (
                           "i cannot answer" in response and gemini_model_part) or (
                           "cannot be found" in response and gemini_model_part)
    claude_model = ("I apologize, but I don't have access to" in response) or (
            "I'm afraid I don't have enough context" in response) or (
                           "Unfortunately I do not have enough context" in response) or (
                           "Unfortunately, without having access to" in response) or (
                           "Unfortunately, I cannot provide a definitive" in response)

    if model in ["gpt-3.5-turbo", "gpt-4-turbo"]:
        return gpt_model
    if model in ["gemini-pro-text", "gemini-pro-chat"]:
        return gemini_model
    if model in ["claude-2.1", "claude-3-haiku", 'claude-3-sonnet', 'claude-3-opus']:
        return claude_model

    return gpt_model or gemini_model or claude_model

In [50]:
tasks = ["anli", "aqua", "gsm8k", "race-h", "race-m", "strategyqa", "triviaqa", "winogrande", "halueval-dialogue",
         "halueval-general", "halueeval-qa", "halueval-summarization", "math-algebra", "math-count-prob",
         "math-geometry", "math-int-algebra", "math-number", "math-pre-algebra", "math-pre-calc"]

In [51]:
def analyse(task, model):
    task_path = model / task
    null_shot_path = task_path / "null-shot"
    zero_shot_path = task_path / "zero-shot"
    null_shot_files = sorted(list(null_shot_path.glob("*.json")))
    null_shot_files = null_shot_files[:-1]
    baseline_files = sorted(list(zero_shot_path.glob("*.json")))
    baseline_files = baseline_files[:-1]
    files = list(zip(null_shot_files, baseline_files))

    output_path = Path.cwd().parent / Path("analysis_results") / "hallucination" / model
    output_path.mkdir(parents=True, exist_ok=True)

    all_count = 0
    improve_only_count = 0
    all_total = 0
    improve_only_total = 0
    for idx in range(len(files)):
        null_shot_file = files[idx][0]
        baseline_file = files[idx][1]
        with open(null_shot_file, 'r') as f:
            null_shot_data = json.load(f)
            null_shot_response = null_shot_data["response"]
            null_shot_is_correct = null_shot_data["is_correct"]
            all_total += 1
            with open(baseline_file, 'r') as f:
                baseline_data = json.load(f)
                baseline_is_correct = baseline_data["is_correct"]
            if not baseline_is_correct and null_shot_is_correct:
                improve_only_total += 1
        if (not is_not_hallucination(null_shot_response, model)) and ('"Examples"' in null_shot_response or "examples section" in null_shot_response):
            all_count += 1
            if not baseline_is_correct and null_shot_is_correct:
                improve_only_count += 1
            
                
    print(f"hallucination count: {count}/{total}")
    return all_count, all_total, improve_only_count, improve_only_total

In [52]:
models = [model for model in root_path.iterdir() if
          model.is_dir() and not model.stem.startswith("pythia") and not model.stem.startswith(
              "qwen") and not model.stem.startswith("llama")]

In [53]:
report = {
    "model": [],
    "task": [],
    "hallucination_count": [],
    "total_count": [],
    "hallucination_improve_only_count": [],
    "total_improve_only_count": []
}

for t in tasks:
    for model in models:
        m = model.parts[-1]
        print(f"== task: {t}, model: {m} ==")
        a_count, a_total, i_count, i_total = analyse(t, model)
        report["model"].append(m)
        report["task"].append(t)
        report["hallucination_count"].append(a_count)
        report["total_count"].append(a_total)
        report["hallucination_improve_only_count"].append(i_count)
        report["total_improve_only_count"].append(i_total)

== task: anli, model: gemini-pro-chat ==
hallucination count: 69/546
== task: anli, model: palm-2-chat ==
hallucination count: 69/546
== task: anli, model: claude-3-opus ==
hallucination count: 69/546
== task: anli, model: claude-2.1 ==
hallucination count: 69/546
== task: anli, model: palm-2-text ==
hallucination count: 69/546
== task: anli, model: claude-3-haiku ==
hallucination count: 69/546
== task: anli, model: gemini-pro-text ==
hallucination count: 69/546
== task: anli, model: gpt-4-turbo ==
hallucination count: 69/546
== task: anli, model: gpt-3.5-turbo ==
hallucination count: 69/546
== task: anli, model: claude-3-sonnet ==
hallucination count: 69/546
== task: aqua, model: gemini-pro-chat ==
hallucination count: 69/546
== task: aqua, model: palm-2-chat ==
hallucination count: 69/546
== task: aqua, model: claude-3-opus ==
hallucination count: 69/546
== task: aqua, model: claude-2.1 ==
hallucination count: 69/546
== task: aqua, model: palm-2-text ==
hallucination count: 69/546
==

In [54]:
df = pd.DataFrame(report)
df = df[df["total_count"] != 0]

In [55]:
df['all_percentage'] = (df['hallucination_count'] / df['total_count']) * 100
df['improve_only_percentage'] = (df['hallucination_improve_only_count'] / df['total_improve_only_count']) * 100

In [56]:
print(df['all_percentage'].mean(), df['all_percentage'].std())
print(df['improve_only_percentage'].mean(), df['improve_only_percentage'].std())

15.95252755638885 27.925553789974636
16.12400431701438 28.63227225443099


In [57]:
df.to_csv(Path.cwd().parent / "analysis_results" / "hallucination_count.csv", index=False)

In [59]:
df = df.sort_values(by='all_percentage', ascending=False)

In [62]:
df.groupby('task').agg({'hallucination_count': 'sum', 'total_count': 'sum', 'hallucination_improve_only_count': 'sum', 'total_improve_only_count': 'sum'}).sort_values(by='hallucination_count', ascending=False)

Unnamed: 0_level_0,hallucination_count,total_count,hallucination_improve_only_count,total_improve_only_count
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
strategyqa,6216,22900,417,1428
race-h,4292,34980,397,2574
gsm8k,3140,13190,234,1208
winogrande,2908,12670,131,804
triviaqa,2788,10000,17,181
race-m,2077,14360,175,906
anli,1695,12000,252,949
math-algebra,1687,11870,155,1005
math-int-algebra,1529,9030,104,513
math-pre-algebra,1325,8710,97,657


In [63]:
df.groupby('model').agg({'hallucination_count': 'sum', 'total_count': 'sum', 'hallucination_improve_only_count': 'sum', 'total_improve_only_count': 'sum'}).sort_values(by='hallucination_count', ascending=False)

Unnamed: 0_level_0,hallucination_count,total_count,hallucination_improve_only_count,total_improve_only_count
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
claude-3-haiku,14293,41771,1187,3145
claude-3-sonnet,7831,41771,515,3536
claude-3-opus,6815,41771,269,2968
gpt-3.5-turbo,2204,41771,254,3347
claude-2.1,1063,41771,61,3092
gemini-pro-chat,326,41771,71,1166
gemini-pro-text,309,41771,62,1233
palm-2-chat,205,41771,9,1620
gpt-4-turbo,73,41771,2,1564
palm-2-text,0,41771,0,1705
