In [1]:
from pathlib import Path
import json
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Error Analysis

## Data Preparation

In [2]:
root_path = Path.cwd().parent / "results"
output_path = Path.cwd().parent / "analysis_results" / "error_analysis"
output_path.mkdir(exist_ok=True)

In [3]:
models = [m for m in root_path.iterdir() if m.is_dir()]

In [4]:
summary = {}

In [5]:
for model in tqdm(models):
    tasks = [t for t in model.iterdir() if t.is_dir()]
    if summary.get(model.name) is None:
        summary[model.name] = {}
    for task in tqdm(tasks):
        if summary[model.name].get(task.name) is None:
            summary[model.name][task.name] = {}
        prompt_engineering = [p for p in task.iterdir() if p.is_dir()]
        for prompt in tqdm(prompt_engineering):
            if summary[model.name][task.name].get(prompt.name) is None:
                summary[model.name][task.name][prompt.name] = {}
            prompt_name = prompt.name
            outputs = [f for f in prompt.iterdir() if f.is_file() and f.suffix == ".json"]
            for output in tqdm(outputs):
                with open(output, "r") as f:
                    data = json.load(f)
                    if summary[model.name][task.name][prompt_name].get("total") is None:
                        summary[model.name][task.name][prompt_name]["total"] = 0
                    if summary[model.name][task.name][prompt_name].get("correct") is None:
                        summary[model.name][task.name][prompt_name]["correct"] = 0
                    if summary[model.name][task.name][prompt_name].get("incorrect") is None:
                        summary[model.name][task.name][prompt_name]["incorrect"] = 0

                    if not data.get("is_correct"):
                        if summary[model.name][task.name][prompt_name].get("incorrect_files") is None:
                            summary[model.name][task.name][prompt_name]["incorrect_files"] = []
                        summary[model.name][task.name][prompt_name]["incorrect_files"].append(output)
                        summary[model.name][task.name][prompt_name]["incorrect"] += 1
                    else:
                        summary[model.name][task.name][prompt_name]["correct"] += 1
                    summary[model.name][task.name][prompt_name]["total"] += 1

  0%|          | 0/23 [00:00<?, ?it/s]
  0%|          | 0/15 [00:00<?, ?it/s][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A


100%|██████████| 541/541 [00:00<00:00, 6784.63it/s]



100%|██████████| 541/541 [00:00<00:00, 6479.73it/s]


100%|██████████| 2/2 [00:00<00:00, 11.54it/s][A[A

  7%|▋         | 1/15 [00:00<00:02,  5.72it/s][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A


  0%|          | 0/1188 [00:00<?, ?it/s][A[A[A


100%|██████████| 1188/1188 [00:00<00:00, 6375.20it/s][A[A[A


 50%|█████     | 1/2 [00:00<00:00,  4.96it/s][A[A


  0%|          | 0/1188 [00:00<?, ?it/s][A[A[A


 50%|█████     | 594/1188 [00:00<00:00, 5935.32it/s][A[A[A


100%|██████████| 1188/1188 [00:00<00:00, 5879.04it/s][A[A[A


100%|██████████| 2/2 [00:00<00:00,  4.72it/s][A[A

 13%|█▎        | 2/15 [00:00<00:04,  3.10it/s][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A


  0%|          | 0/1201 [00:00<?, ?it/s][A[A[A


 40%|████      | 484/1201 [00:00<00:00, 4833.62it/s][A[A[

In [6]:
summary_str = {}
for model in summary:
    summary_str[model] = {}
    for task in summary[model]:
        summary_str[model][task] = {}
        for prompt in summary[model][task]:
            summary_str[model][task][prompt] = {}
            for key in summary[model][task][prompt]:
                if isinstance(summary[model][task][prompt][key], list):
                    summary_str[model][task][prompt][key] = [str(p) for p in summary[model][task][prompt][key]]
                else:
                    summary_str[model][task][prompt][key] = summary[model][task][prompt][key]

In [7]:
with open(output_path / "error_summary.json", "w") as f:
    json.dump(summary_str, f, indent=2)

## Categorize the errors

In [8]:
with open(output_path / "error_summary.json", "r") as f:
    error_summary = json.load(f)

In [9]:
for model in error_summary:
    for task in error_summary[model]:
        for prompt in error_summary[model][task]:
            if error_summary[model][task][prompt].get("incorrect_files") is not None:
                error_summary[model][task][prompt]["incorrect_files"] = [Path(p) for p in
                                                                         error_summary[model][task][prompt][
                                                                             "incorrect_files"]]

In [10]:
error_categories = {}

Three error categories:
1. Empty response: The model did not return any response.
2. Incorrect response: The model returned a response but it was incorrect.
3. Error response: The model returned an error message.

In [11]:
for model in tqdm(error_summary):
    # if model in ["llama-2-7b", "llama-2-chat-7b"]:  # Skip ablation studies
    #     continue

    if error_categories.get(model) is None:
        error_categories[model] = {}
    for task in tqdm(error_summary[model]):
        if error_categories[model].get(task) is None:
            error_categories[model][task] = {}
        for pe in tqdm(error_summary[model][task]):
            if pe in ["null-shot-after", "null-shot-v1", "null-shot-v2", "null-shot-v3"]:  # Skip ablation studies
                continue

            if error_categories[model][task].get(pe) is None:
                error_categories[model][task][pe] = {}

            if error_summary[model][task][pe].get("incorrect_files") is not None:
                error_categories[model][task][pe]["empty_response"] = []
                error_categories[model][task][pe]["incorrect_response"] = []
                error_categories[model][task][pe]["error_response"] = []
                error_categories[model][task][pe]["total"] = error_summary[model][task][pe]["total"]
                error_categories[model][task][pe]["correct"] = error_summary[model][task][pe]["correct"]

                for file in tqdm(error_summary[model][task][pe]["incorrect_files"]):
                    with open(file, "r") as f:
                        data = json.load(f)
                        if data.get("response") is None:
                            continue
                        elif data.get("response") == "":
                            error_categories[model][task][pe]["empty_response"].append(file)
                        elif not data.get("is_correct"):
                            error_categories[model][task][pe]["incorrect_response"].append(file)
                        elif "ERROR:" in data.get("response"):
                            error_categories[model][task][pe]["error_response"].append(file)

  0%|          | 0/23 [00:00<?, ?it/s]
  0%|          | 0/15 [00:00<?, ?it/s][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A


100%|██████████| 541/541 [00:00<00:00, 6290.91it/s]



100%|██████████| 541/541 [00:00<00:00, 6259.83it/s]


100%|██████████| 2/2 [00:00<00:00, 11.43it/s][A[A

  7%|▋         | 1/15 [00:00<00:02,  5.67it/s][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A


  0%|          | 0/1188 [00:00<?, ?it/s][A[A[A


100%|██████████| 1188/1188 [00:00<00:00, 6097.16it/s][A[A[A


 50%|█████     | 1/2 [00:00<00:00,  5.11it/s][A[A


  0%|          | 0/1188 [00:00<?, ?it/s][A[A[A


 48%|████▊     | 575/1188 [00:00<00:00, 5749.83it/s][A[A[A


100%|██████████| 1188/1188 [00:00<00:00, 5809.77it/s][A[A[A


100%|██████████| 2/2 [00:00<00:00,  4.98it/s][A[A

 13%|█▎        | 2/15 [00:00<00:04,  3.23it/s][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A


  0%|          | 0/1111 [00:00<?, ?it/s][A[A[A


 45%|████▌     | 502/1111 [00:00<00:00, 5015.74it/s][A[A[

In [12]:
summarized_error_categories = {}
for model in error_categories:
    summarized_error_categories[model] = {}
    for task in error_categories[model]:
        summarized_error_categories[model][task] = {}
        for pe in error_categories[model][task]:
            summarized_error_categories[model][task][pe] = {
                "total": error_categories[model][task][pe]["total"],
                "correct": error_categories[model][task][pe]["correct"],
            }
            for category in error_categories[model][task][pe]:
                if category in ["total", "correct"]:
                    continue
                summarized_error_categories[model][task][pe][category] = len(
                    error_categories[model][task][pe][category])

Are there any pairs that has non-zero error?

In [13]:
count = 0
for model in summarized_error_categories:
    for task in summarized_error_categories[model]:
        for pe in summarized_error_categories[model][task]:
            if summarized_error_categories[model][task][pe]["error_response"] != 0:
                count += 1
                
print(count)

0


What is the average `empty_response` rate for each pair?

In [14]:
temp = []
for model in summarized_error_categories:
    for task in summarized_error_categories[model]:
        for pe in summarized_error_categories[model][task]:
            temp.append({
                "model": model,
                "task": task,
                "pe": pe,
                "empty_response": summarized_error_categories[model][task][pe]["empty_response"] /
                                  summarized_error_categories[model][task][pe]["total"]
            })

temp = sorted(temp, key=lambda x: x["empty_response"], reverse=True)
# Group by model
for model in summarized_error_categories:
    count = 0
    total = 0
    for task in summarized_error_categories[model]:
        for pe in summarized_error_categories[model][task]:
            count += summarized_error_categories[model][task][pe]["empty_response"]
            total += summarized_error_categories[model][task][pe]["total"]
    print(f"{model}: {count / total}")

pythia-160m: 0.0
qwen-1.5-0.5b-chat: 0.0
gemini-pro-chat: 0.0179168657429527
qwen-1.5-4b-chat: 0.0
pythia-1b: 0.0
pythia-410m: 0.0
pythia-1.4b: 0.0
palm-2-chat: 0.0835179313440183
claude-3-opus: 0.0
pythia-31m: 0.0
qwen-1.5-7b-chat: 0.0
pythia-14m: 0.0
llama-2-chat-7b: 0.0
claude-2.1: 0.0
pythia-70m: 0.0
palm-2-text: 0.10145651449999277
claude-3-haiku: 0.0
qwen-1.5-1.8b-chat: 0.0
llama-2-7b: 0.0
gemini-pro-text: 0.018083365909452864
gpt-4-turbo: 0.0
gpt-3.5-turbo: 0.0
claude-3-sonnet: 0.0


In [15]:
count = 0
for model in summarized_error_categories:
    for task in summarized_error_categories[model]:
        for pe in summarized_error_categories[model][task]:
            if model in ["gpt-3.5-turbo", "gpt-4-turbo"]:
                if summarized_error_categories[model][task][pe]["empty_response"] != 0:
                    count += 1

print(count)

0


In [16]:
df = pd.DataFrame(temp)
df.head()

Unnamed: 0,model,task,pe,empty_response
0,palm-2-chat,math-number,zero-shot,0.739372
1,palm-2-chat,math-number,zero-shot-cot,0.735675
2,palm-2-chat,math-algebra,zero-shot,0.686027
3,palm-2-chat,math-algebra,zero-shot-cot,0.679293
4,palm-2-chat,math-pre-algebra,zero-shot,0.634174


### Visualize

In [17]:
sns.set(style="whitegrid")

sns.color_palette("hls", 8)

In [18]:
import warnings

warnings.filterwarnings("ignore")

In [19]:
models_with_zero_empty_response = ["llama-2-7b", "llama-2-chat-7b", "gpt-4-turbo", "gpt-3.5-turbo"]

plot_data = {}
for model in summarized_error_categories:
    if model in models_with_zero_empty_response:
        continue
    for task in summarized_error_categories[model]:
        for pe in summarized_error_categories[model][task]:
            if plot_data.get(model) is None:
                plot_data[model] = {}
            if plot_data[model].get(task) is None:
                plot_data[model][task] = {}
            if plot_data[model][task].get(pe) is None:
                plot_data[model][task][pe] = {}
            for category in summarized_error_categories[model][task][pe]:
                plot_data[model][task][pe] = {
                    "Correct": summarized_error_categories[model][task][pe]["correct"],
                    "Incorrect": summarized_error_categories[model][task][pe]["incorrect_response"],
                    "Empty": summarized_error_categories[model][task][pe]["empty_response"]
                }

In [20]:
model_names = {
    "gpt-3.5-turbo": "GPT-3.5 Turbo",
    "gpt-4-turbo": "GPT-4 Turbo",
    "palm-2-text": "PaLM 2",
    "palm-2-chat": "PaLM 2 for Chat",
    "llama-2-7b": "Llama 2 - 7B",
    "llama-2-chat-7b": "Llama 2 Chat - 7B",
    "gemini-pro-text": "Gemini Pro",
    "gemini-pro-chat": "Gemini Pro (Chat)",
}

task_names = {
    "anli": "ANLI",
    "aqua": "AQuA-RAT",
    "gsm8k": "GSM8K",
    "race-m": "RACE (Middle school)",
    "race-h": "RACE (High school)",
    "strategyqa": "StrategyQA",
    "triviaqa": "TriviaQA",
    "winogrande": "WinoGrande",
}

pe_names = {
    "zero-shot": "Zero-Shot",
    "null-shot": "Null-Shot",
    "zero-shot-cot": "Zero-Shot CoT",
    "null-shot-cot": "Null-Shot CoT",
}

In [21]:
for model in plot_data:
    for task in plot_data[model]:
        data = {
            "zero-shot": plot_data[model][task]["zero-shot"]["Empty"],
            "null-shot": plot_data[model][task]["null-shot"]["Empty"],
            "zero-shot-cot": plot_data[model][task]["zero-shot-cot"]["Empty"],
            "null-shot-cot": plot_data[model][task]["null-shot-cot"]["Empty"]
        }
        
        fig, ax = plt.subplots(1, 1, figsize=(5, 5))
        sns.barplot(x=list(data.keys()), y=list(data.values()), ax=ax)
        ax.set_xlabel("Prompt Engineering")
        ax.set_ylabel("Empty Response")
        ax.set_xticklabels([pe_names[pe] for pe in data.keys()])
        ax.tick_params(axis='x', labelsize=10)
        
        ax.set_ylim(0, summarized_error_categories[model][task]["zero-shot"]["total"] * 0.2)

        for p, pe in zip(ax.patches, data.keys()):
            percentage = '{:.2f}%'.format(100 * p.get_height() / summarized_error_categories[model][task][pe][
                "total"])
            
            if p.get_height() > 0:
                if p.get_height() / summarized_error_categories[model][task][pe]["total"] * 0.2 > 0.15:
                    ax.annotate(f"{int(p.get_height())}\n({percentage})",
                                (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center',
                                xytext=(0, -15), textcoords='offset points', fontsize=10, color="white")
                    continue
            
            ax.annotate(f"{int(p.get_height())}\n({percentage})",
                        (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center',
                        xytext=(0, 15), textcoords='offset points', fontsize=10)
        
        
        ax.set_title(f"{model_names[model]} - {task_names[task]}")
        plt.show()

KeyError: 'zero-shot-cot'

In [None]:
colors = ["#88AB8E", "#C84361", "#8ACDD7"]
sns.color_palette(colors)

In [None]:
for model in plot_data:
    for task in plot_data[model]:
        fig, ax = plt.subplots(1, 4, figsize=(15, 5))

        for i, pe in enumerate(plot_data[model][task]):
            xs = plot_data[model][task][pe].keys()
            ys = plot_data[model][task][pe].values()

            sns.barplot(x=xs, y=ys, ax=ax[i], hue=colors, legend=False, palette=colors)

            ax[i].set_ylim(0, summarized_error_categories[model][task][pe][
                "total"])  # Set the y-axis limit to the total number of records in each task

            ax[i].set_title(pe_names[pe])

            for p in ax[i].patches:
                percentage = '{:.2f}%'.format(
                    100 * p.get_height() / summarized_error_categories[model][task][pe]["total"])

                # Nice formatting for annotations
                if p.get_height() > 0:
                    if p.get_height() / summarized_error_categories[model][task][pe]["total"] > 0.15:
                        ax[i].annotate(f"{int(p.get_height())}\n({percentage})",
                                       (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center',
                                       xytext=(0, -15), textcoords='offset points', fontsize=10, color="white")
                        continue

                ax[i].annotate(f"{int(p.get_height())}\n({percentage})",
                               (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center',
                               xytext=(0, 15), textcoords='offset points', fontsize=10)

        fig.suptitle(f"{model_names[model]} - {task_names[task]}")
        Path("error_analysis").mkdir(exist_ok=True)
        plt.savefig(f"error_analysis/{model}_{task}.png", bbox_inches='tight')
        plt.show()

In [22]:
plot_data_df = {
    "model": [],
    "task": [],
    "method": [],
    "Correct": [],
    "Incorrect": [],
    "Empty": []
}

for model in plot_data:
    for task in plot_data[model]:
        for method in plot_data[model][task]:
            plot_data_df["model"].append(model)
            plot_data_df["task"].append(task)
            plot_data_df["method"].append(method)
            plot_data_df["Correct"].append(plot_data[model][task][method]["Correct"])
            plot_data_df["Incorrect"].append(plot_data[model][task][method]["Incorrect"])
            plot_data_df["Empty"].append(plot_data[model][task][method]["Empty"])

plot_data_df = pd.DataFrame(plot_data_df)

In [23]:
plot_data_df.head()

Unnamed: 0,model,task,method,Correct,Incorrect,Empty
0,pythia-160m,math-number,zero-shot,0,540,0
1,pythia-160m,math-number,null-shot,0,540,0
2,pythia-160m,math-algebra,zero-shot,0,1187,0
3,pythia-160m,math-algebra,null-shot,0,1187,0
4,pythia-160m,anli,zero-shot,90,1110,0


In [24]:
plot_data_df.to_csv(Path.cwd().parent / "analysis_results" /    "error_analysis.csv", index=False)