In [8]:
import os
import json
from pathlib import Path

# Read Experiment Result Files

In [45]:
# All result folders live under "experiment_results", located next to the
# current working directory (i.e. inside the working directory's parent).
experiment_results_path = Path.cwd().parent / "experiment_results"
commercial_llms_path = experiment_results_path.joinpath("commercial-llms")
open_source_llms_path = experiment_results_path.joinpath("open-source-llms")

In [46]:
# Each model family keeps its summary at <family dir>/parsed_outputs/summarized_results.json.
commercial_llms_summarized_result_file_path = commercial_llms_path.joinpath(
    "parsed_outputs", "summarized_results.json"
)
open_source_llms_summarized_result_file_path = open_source_llms_path.joinpath(
    "parsed_outputs", "summarized_results.json"
)

In [47]:
# Parse the commercial-LLM summary file and keep only its "results" entry
# (None when the key is absent).
with commercial_llms_summarized_result_file_path.open("r") as f:
    commercial_llms_summary_document = json.load(f)
commercial_llms_summarized_result = commercial_llms_summary_document.get("results")

In [48]:
# Parse the open-source-LLM summary file and keep only its "results" entry
# (None when the key is absent).
with open_source_llms_summarized_result_file_path.open("r") as f:
    open_source_llms_summary_document = json.load(f)
open_source_llms_summarized_result = open_source_llms_summary_document.get("results")

## Group by Model, Task, and Output Format

In [49]:
# Bucket the flat commercial result list by each entry's "model" value,
# preserving the order in which models and results first appear.
commercial_llms_summarized_result_grouped_by_model = {}
for result in commercial_llms_summarized_result:
    model = result.get("model")
    commercial_llms_summarized_result_grouped_by_model.setdefault(model, []).append(result)

In [50]:
# Bucket the flat open-source result list by each entry's "model" value,
# preserving the order in which models and results first appear.
open_source_llms_summarized_result_grouped_by_model = {}
for result in open_source_llms_summarized_result:
    model = result.get("model")
    open_source_llms_summarized_result_grouped_by_model.setdefault(model, []).append(result)

In [51]:
# Build the nested mapping
#     model -> task -> output_format -> [result, ...]
# for the commercial LLMs.
#
# The mapping is rebuilt from the flat result list on every run instead of
# rewriting the existing per-model lists in place. The in-place version turned
# each list into a dict, so running the cell a second time iterated over task
# names (strings) and failed on `result.get`. Rebuilding from the flat list
# yields the same structure and the same key order (first appearance in the
# flat list) while keeping the cell safe to re-run.
commercial_llms_summarized_result_grouped_by_model = {}
for result in commercial_llms_summarized_result:
    model = result.get("model", None)
    task = result.get("task", None)
    output_format = result.get("output_format", None)
    (
        commercial_llms_summarized_result_grouped_by_model
        .setdefault(model, {})
        .setdefault(task, {})
        .setdefault(output_format, [])
        .append(result)
    )

# Model names in order of first appearance; used by later cells.
commercial_llms = list(commercial_llms_summarized_result_grouped_by_model.keys())

In [52]:
# Build the nested mapping
#     model -> task -> output_format -> [result, ...]
# for the open-source LLMs.
#
# The mapping is rebuilt from the flat result list on every run instead of
# rewriting the existing per-model lists in place. The in-place version turned
# each list into a dict, so running the cell a second time iterated over task
# names (strings) and failed on `result.get`. Rebuilding from the flat list
# yields the same structure and the same key order (first appearance in the
# flat list) while keeping the cell safe to re-run.
open_source_llms_summarized_result_grouped_by_model = {}
for result in open_source_llms_summarized_result:
    model = result.get("model", None)
    task = result.get("task", None)
    output_format = result.get("output_format", None)
    (
        open_source_llms_summarized_result_grouped_by_model
        .setdefault(model, {})
        .setdefault(task, {})
        .setdefault(output_format, [])
        .append(result)
    )

# Model names in order of first appearance; used by later cells.
open_source_llms = list(open_source_llms_summarized_result_grouped_by_model.keys())

In [55]:
# Combine both model families into one model -> grouped-results mapping.
llms = [*commercial_llms, *open_source_llms]
llms_summarized_result = {}
for model in llms:
    # If a model name appears in both families, the open-source entry wins.
    if model in open_source_llms:
        llms_summarized_result[model] = open_source_llms_summarized_result_grouped_by_model[model]
    elif model in commercial_llms:
        llms_summarized_result[model] = commercial_llms_summarized_result_grouped_by_model[model]
    else:
        llms_summarized_result[model] = {}

# Data Analysis

## Error Rate

In [68]:
# For every model / task / output format, count how many results carry a
# non-None "error" entry. Note that these are raw counts, not ratios.
llms_error_rate = {}
for model in llms:
    error_counts_by_task = {}
    for task, results_by_format in llms_summarized_result[model].items():
        error_counts_by_format = {}
        for output_format, format_results in results_by_format.items():
            error_count = 0
            for result in format_results:
                if result.get("error") is not None:
                    error_count += 1
            error_counts_by_format[output_format] = error_count
        error_counts_by_task[task] = error_counts_by_format
    llms_error_rate[model] = error_counts_by_task

# PaLM is missing some "yaml" results for every task, so its "yaml" count is
# overridden with a fixed value for each task.
# NOTE(review): 100 is presumably the number of samples per task/format - confirm.
for palm_task in ("character", "enemy", "dialogue", "quest", "story"):
    llms_error_rate["palm"][palm_task]["yaml"] = 100

In [69]:
# Show the nested model -> task -> output format error counts.
llms_error_rate

{'gpt-4': {'character': {'xml': 1, 'json': 0, 'yaml': 1},
  'enemy': {'xml': 13, 'json': 11, 'yaml': 0},
  'dialogue': {'xml': 0, 'json': 3, 'yaml': 11},
  'quest': {'xml': 6, 'json': 3, 'yaml': 6},
  'story': {'xml': 11, 'json': 8, 'yaml': 32}},
 'gpt-3.5-turbo': {'character': {'xml': 0, 'json': 0, 'yaml': 0},
  'enemy': {'xml': 0, 'json': 1, 'yaml': 1},
  'dialogue': {'xml': 2, 'json': 0, 'yaml': 1},
  'quest': {'xml': 3, 'json': 5, 'yaml': 0},
  'story': {'xml': 3, 'json': 4, 'yaml': 8}},
 'palm': {'character': {'xml': 100, 'json': 100, 'yaml': 100},
  'enemy': {'xml': 100, 'json': 100, 'yaml': 100},
  'dialogue': {'xml': 89, 'json': 90, 'yaml': 100},
  'quest': {'xml': 100, 'json': 100, 'yaml': 100},
  'story': {'xml': 100, 'json': 100, 'yaml': 100}},
 'mpt': {'character': {'xml': 87, 'json': 57, 'yaml': 29},
  'enemy': {'xml': 70, 'json': 43, 'yaml': 15},
  'dialogue': {'xml': 92, 'json': 86, 'yaml': 34},
  'quest': {'xml': 81, 'json': 49, 'yaml': 26},
  'story': {'xml': 84, 'json

In [70]:
# Total error count per model, summed over all of its tasks and output formats.
llms_error_rate_by_model = {}
for model, counts_by_task in llms_error_rate.items():
    model_total = 0
    for task, counts_by_format in counts_by_task.items():
        for output_format, error_count in counts_by_format.items():
            model_total += error_count
    llms_error_rate_by_model[model] = model_total
llms_error_rate_by_model

{'gpt-4': 106,
 'gpt-3.5-turbo': 28,
 'palm': 1479,
 'mpt': 848,
 'llama-2': 1241,
 'falcon': 1229}

In [71]:
# Total error count per task, summed over all models and output formats.
llms_error_rate_by_task = {}
for model, counts_by_task in llms_error_rate.items():
    for task, counts_by_format in counts_by_task.items():
        task_total = llms_error_rate_by_task.get(task, 0)
        for output_format, error_count in counts_by_format.items():
            task_total += error_count
        llms_error_rate_by_task[task] = task_total
llms_error_rate_by_task

{'character': 977, 'enemy': 950, 'dialogue': 1010, 'quest': 960, 'story': 1034}

In [72]:
# Total error count per output format, summed over all models and tasks.
llms_error_rate_by_output_format = {}
for model, counts_by_task in llms_error_rate.items():
    for task, counts_by_format in counts_by_task.items():
        for output_format, error_count in counts_by_format.items():
            format_total = llms_error_rate_by_output_format.get(output_format, 0)
            llms_error_rate_by_output_format[output_format] = format_total + error_count
llms_error_rate_by_output_format

{'xml': 1910, 'json': 1803, 'yaml': 1218}

In [73]:
# Total error count per (model, task) pair, summed over output formats.
llms_error_rate_by_model_task = {}
for model, counts_by_task in llms_error_rate.items():
    for task, counts_by_format in counts_by_task.items():
        model_task_key = (model, task)
        llms_error_rate_by_model_task.setdefault(model_task_key, 0)
        for output_format, error_count in counts_by_format.items():
            llms_error_rate_by_model_task[model_task_key] += error_count
llms_error_rate_by_model_task

{('gpt-4', 'character'): 2,
 ('gpt-4', 'enemy'): 24,
 ('gpt-4', 'dialogue'): 14,
 ('gpt-4', 'quest'): 15,
 ('gpt-4', 'story'): 51,
 ('gpt-3.5-turbo', 'character'): 0,
 ('gpt-3.5-turbo', 'enemy'): 2,
 ('gpt-3.5-turbo', 'dialogue'): 3,
 ('gpt-3.5-turbo', 'quest'): 8,
 ('gpt-3.5-turbo', 'story'): 15,
 ('palm', 'character'): 300,
 ('palm', 'enemy'): 300,
 ('palm', 'dialogue'): 279,
 ('palm', 'quest'): 300,
 ('palm', 'story'): 300,
 ('mpt', 'character'): 173,
 ('mpt', 'enemy'): 128,
 ('mpt', 'dialogue'): 212,
 ('mpt', 'quest'): 156,
 ('mpt', 'story'): 179,
 ('llama-2', 'character'): 253,
 ('llama-2', 'enemy'): 250,
 ('llama-2', 'dialogue'): 248,
 ('llama-2', 'quest'): 246,
 ('llama-2', 'story'): 244,
 ('falcon', 'character'): 249,
 ('falcon', 'enemy'): 246,
 ('falcon', 'dialogue'): 254,
 ('falcon', 'quest'): 235,
 ('falcon', 'story'): 245}

In [74]:
# Total error count per (model, output format) pair, summed over tasks.
llms_error_rate_by_model_format = {}
for model, counts_by_task in llms_error_rate.items():
    for task, counts_by_format in counts_by_task.items():
        for output_format, error_count in counts_by_format.items():
            model_format_key = (model, output_format)
            previous_total = llms_error_rate_by_model_format.get(model_format_key, 0)
            llms_error_rate_by_model_format[model_format_key] = previous_total + error_count
llms_error_rate_by_model_format

{('gpt-4', 'xml'): 31,
 ('gpt-4', 'json'): 25,
 ('gpt-4', 'yaml'): 50,
 ('gpt-3.5-turbo', 'xml'): 8,
 ('gpt-3.5-turbo', 'json'): 10,
 ('gpt-3.5-turbo', 'yaml'): 10,
 ('palm', 'xml'): 489,
 ('palm', 'json'): 490,
 ('palm', 'yaml'): 500,
 ('mpt', 'xml'): 414,
 ('mpt', 'json'): 294,
 ('mpt', 'yaml'): 140,
 ('llama-2', 'xml'): 500,
 ('llama-2', 'json'): 500,
 ('llama-2', 'yaml'): 241,
 ('falcon', 'xml'): 468,
 ('falcon', 'json'): 484,
 ('falcon', 'yaml'): 277}