In [1]:
import os
import json
from pathlib import Path
import pandas as pd

# Read Experiment Result Files

In [2]:
# Experiment outputs live in "experiment_results" next to the current working
# directory (i.e. under its parent); the summarized JSON sits in "parsed_outputs".
experiment_results_path = Path.cwd().parent.joinpath("experiment_results")
summarized_results_path = experiment_results_path.joinpath(
    "parsed_outputs", "summarized_results.json"
)

In [3]:
# Load the list stored under the top-level "results" key of the summary file.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default.
with open(summarized_results_path, "r", encoding="utf-8") as f:
    summarized_results = json.load(f).get("results")

# Without this check a missing "results" key would leave None here and only
# surface later as an opaque "'NoneType' object is not iterable" error.
if summarized_results is None:
    raise ValueError(f"No 'results' entry found in {summarized_results_path}")

## Group by Model, Task, and Output Format

In [4]:
# Bucket every result under its "model" value; results without that key are
# collected under None. Keys appear in first-seen order.
summarized_result_grouped_by_model = {}
for result in summarized_results:
    model = result.get("model")
    summarized_result_grouped_by_model.setdefault(model, []).append(result)

In [5]:
# Build the nested {model: {task: {output_format: [results]}}} mapping in a
# single pass over the flat result list.
#
# The mapping is rebuilt from `summarized_results` rather than by rewriting the
# per-model lists in place, so this cell is idempotent: running it again yields
# the same structure instead of failing on values it already converted to dicts.
# A missing "model", "task" or "output_format" key is grouped under None.
summarized_result_grouped_by_model = {}
for result in summarized_results:
    model = result.get("model")
    task = result.get("task")
    output_format = result.get("output_format")
    (
        summarized_result_grouped_by_model
        .setdefault(model, {})
        .setdefault(task, {})
        .setdefault(output_format, [])
        .append(result)
    )

# Model names in first-seen order; later cells iterate over this list.
llms = list(summarized_result_grouped_by_model)

# Data Analysis

## Error Rate

In [6]:
# For every (model, task, output_format) group, count the results whose "error"
# field is present and not None. Despite the variable name, the stored values
# are raw counts, not ratios.
llms_error_rate = {}
for model in llms:
    task_counts = {}
    for task, results_by_format in summarized_result_grouped_by_model[model].items():
        format_counts = {}
        for output_format, results in results_by_format.items():
            errored = 0
            for result in results:
                if result.get("error") is None:
                    continue
                errored += 1
            format_counts[output_format] = errored
        task_counts[task] = format_counts
    llms_error_rate[model] = task_counts

In [7]:
# Display the nested {model: {task: {output_format: error count}}} mapping.
llms_error_rate

{'mpt': {'character': {'xml': 76, 'json': 45, 'yaml': 65},
  'enemy': {'xml': 76, 'json': 41, 'yaml': 51},
  'dialogue': {'xml': 86, 'json': 84, 'yaml': 86},
  'quest': {'xml': 42, 'json': 32, 'yaml': 37},
  'story': {'xml': 49, 'json': 51, 'yaml': 67}},
 'llama-2': {'character': {'xml': 86, 'json': 89, 'yaml': 81},
  'enemy': {'xml': 85, 'json': 94, 'yaml': 91},
  'dialogue': {'xml': 85, 'json': 94, 'yaml': 83},
  'quest': {'xml': 89, 'json': 97, 'yaml': 88},
  'story': {'xml': 94, 'json': 97, 'yaml': 94}},
 'gemini-pro': {'character': {'xml': 0, 'json': 0, 'yaml': 2},
  'enemy': {'xml': 2, 'json': 1, 'yaml': 1},
  'dialogue': {'xml': 1, 'json': 0, 'yaml': 7},
  'quest': {'xml': 0, 'json': 1, 'yaml': 15},
  'story': {'xml': 5, 'json': 4, 'yaml': 13}},
 'gpt-4': {'character': {'xml': 1, 'json': 0, 'yaml': 1},
  'enemy': {'xml': 12, 'json': 11, 'yaml': 12},
  'dialogue': {'xml': 0, 'json': 0, 'yaml': 11},
  'quest': {'xml': 1, 'json': 0, 'yaml': 8},
  'story': {'xml': 4, 'json': 3, 'yam

In [8]:
# Flatten the nested error counts into one row per (model, task, output_format)
# and write them to main_results.csv inside the experiment results directory.
data_for_df = {"model": [], "task": [], "output_format": [], "error_rate": []}

for model, per_task in llms_error_rate.items():
    for task, per_format in per_task.items():
        for output_format, count in per_format.items():
            row = {
                "model": model,
                "task": task,
                "output_format": output_format,
                "error_rate": count,
            }
            # Append each cell of the row to its column list.
            for column, value in row.items():
                data_for_df[column].append(value)

df = pd.DataFrame(data_for_df)
df.to_csv(experiment_results_path.joinpath("main_results.csv"), index=False)

In [9]:
# Total error count per model, summed over every task and output format.
llms_error_rate_by_model = {}
for model, per_task in llms_error_rate.items():
    model_total = 0
    for task, per_format in per_task.items():
        for output_format, count in per_format.items():
            model_total += count
    llms_error_rate_by_model[model] = model_total
llms_error_rate_by_model

{'mpt': 888,
 'llama-2': 1347,
 'gemini-pro': 52,
 'gpt-4': 98,
 'falcon': 1297,
 'gpt-3.5-turbo': 275,
 'palm': 1337}

In [10]:
# Total error count per task, summed over every model and output format.
llms_error_rate_by_task = {}
for model, per_task in llms_error_rate.items():
    for task, per_format in per_task.items():
        # Start from the total accumulated so far (0 the first time a task is seen).
        running_total = llms_error_rate_by_task.setdefault(task, 0)
        for output_format, count in per_format.items():
            running_total += count
        llms_error_rate_by_task[task] = running_total
llms_error_rate_by_task

{'character': 1023,
 'enemy': 1052,
 'dialogue': 966,
 'quest': 1054,
 'story': 1199}

In [11]:
# Total error count per output format, summed over every model and task.
llms_error_rate_by_output_format = {}
for model, per_task in llms_error_rate.items():
    for task, per_format in per_task.items():
        for output_format, count in per_format.items():
            previous = llms_error_rate_by_output_format.get(output_format, 0)
            llms_error_rate_by_output_format[output_format] = previous + count
llms_error_rate_by_output_format

{'xml': 1739, 'json': 1762, 'yaml': 1793}

In [12]:
# Total error count per (model, task) pair, summed over output formats.
llms_error_rate_by_model_task = {}
for model, per_task in llms_error_rate.items():
    for task, per_format in per_task.items():
        pair = (model, task)
        # Register the pair even when it has no output formats to add.
        llms_error_rate_by_model_task.setdefault(pair, 0)
        for output_format, count in per_format.items():
            llms_error_rate_by_model_task[pair] += count
llms_error_rate_by_model_task

{('mpt', 'character'): 186,
 ('mpt', 'enemy'): 168,
 ('mpt', 'dialogue'): 256,
 ('mpt', 'quest'): 111,
 ('mpt', 'story'): 167,
 ('llama-2', 'character'): 256,
 ('llama-2', 'enemy'): 270,
 ('llama-2', 'dialogue'): 262,
 ('llama-2', 'quest'): 274,
 ('llama-2', 'story'): 285,
 ('gemini-pro', 'character'): 2,
 ('gemini-pro', 'enemy'): 4,
 ('gemini-pro', 'dialogue'): 8,
 ('gemini-pro', 'quest'): 16,
 ('gemini-pro', 'story'): 22,
 ('gpt-4', 'character'): 2,
 ('gpt-4', 'enemy'): 35,
 ('gpt-4', 'dialogue'): 11,
 ('gpt-4', 'quest'): 9,
 ('gpt-4', 'story'): 41,
 ('falcon', 'character'): 256,
 ('falcon', 'enemy'): 274,
 ('falcon', 'dialogue'): 236,
 ('falcon', 'quest'): 270,
 ('falcon', 'story'): 261,
 ('gpt-3.5-turbo', 'character'): 21,
 ('gpt-3.5-turbo', 'enemy'): 1,
 ('gpt-3.5-turbo', 'dialogue'): 56,
 ('gpt-3.5-turbo', 'quest'): 74,
 ('gpt-3.5-turbo', 'story'): 123,
 ('palm', 'character'): 300,
 ('palm', 'enemy'): 300,
 ('palm', 'dialogue'): 137,
 ('palm', 'quest'): 300,
 ('palm', 'story'): 3

In [13]:
# Total error count per (model, output_format) pair, summed over tasks.
llms_error_rate_by_model_format = {}
for model, per_task in llms_error_rate.items():
    for task, per_format in per_task.items():
        for output_format, count in per_format.items():
            pair = (model, output_format)
            llms_error_rate_by_model_format[pair] = (
                llms_error_rate_by_model_format.get(pair, 0) + count
            )
llms_error_rate_by_model_format

{('mpt', 'xml'): 329,
 ('mpt', 'json'): 253,
 ('mpt', 'yaml'): 306,
 ('llama-2', 'xml'): 439,
 ('llama-2', 'json'): 471,
 ('llama-2', 'yaml'): 437,
 ('gemini-pro', 'xml'): 8,
 ('gemini-pro', 'json'): 6,
 ('gemini-pro', 'yaml'): 38,
 ('gpt-4', 'xml'): 18,
 ('gpt-4', 'json'): 14,
 ('gpt-4', 'yaml'): 66,
 ('falcon', 'xml'): 423,
 ('falcon', 'json'): 456,
 ('falcon', 'yaml'): 418,
 ('gpt-3.5-turbo', 'xml'): 118,
 ('gpt-3.5-turbo', 'json'): 129,
 ('gpt-3.5-turbo', 'yaml'): 28,
 ('palm', 'xml'): 404,
 ('palm', 'json'): 433,
 ('palm', 'yaml'): 500}