In [2]:
from pathlib import Path
import pandas as pd
import json

# Exploratory Data Analysis (EDA)

In [3]:
# Base directory holding the experiment artefacts (relative to the working directory).
root_path = Path("../experiment_results")

# Sub-directories derived from the base directory.
llms_path_outputs = root_path.joinpath("outputs")
llms_path_parsed_outputs = root_path.joinpath("parsed_outputs")

In [4]:
# Load the summarised parse results. The encoding is pinned to UTF-8 so the
# read does not depend on the platform's default locale encoding.
with open(llms_path_parsed_outputs / "summarized_results.json", "r", encoding="utf-8") as f:
    parsed_jsons = json.load(f)

# None when the document has no top-level "results" key.
summarized_results = parsed_jsons.get("results")

# Error records that the grouping cells below aggregate.
error_df = pd.read_csv(llms_path_outputs / "error_categories.csv")

## Data Integrity Checking

In [5]:
# Number of *.txt outputs every <task>/<model>/<format> directory should hold.
EXPECTED_FILES_PER_FORMAT = 100

# Walk outputs/<task>/<model>/<format>/ and report each directory whose
# *.txt file count differs from the expected value. Directories are sorted
# because Path.iterdir() yields entries in arbitrary order; sorting keeps the
# report reproducible between runs and machines.
task_subfolders = sorted(folder for folder in llms_path_outputs.iterdir() if folder.is_dir())
for task_path in task_subfolders:
    model_subfolders = sorted(folder for folder in task_path.iterdir() if folder.is_dir())
    for model_path in model_subfolders:
        format_subfolders = sorted(folder for folder in model_path.iterdir() if folder.is_dir())
        for format_path in format_subfolders:
            files = list(format_path.glob("*.txt"))
            if len(files) != EXPECTED_FILES_PER_FORMAT:
                # No output at all means every directory passed the check.
                print(str(format_path), len(files))

## Error Overview

### What happened with PaLM 2?

In [6]:
# Keep only the rows whose model is "palm", then count the non-null
# error_message values for every (task, error) pair.
df_palm = (
    error_df.loc[error_df["model"] == "palm"]
    .groupby(["task", "error"])
    .agg({"error_message": "count"})
    .reset_index()
)
df_palm

Unnamed: 0,task,error,error_message
0,character,empty_response,300
1,dialogue,empty_response,134
2,dialogue,incorrect_syntax,3
3,enemy,empty_response,300
4,quest,empty_response,300
5,story,empty_response,300


### Category Count

In [7]:
# Tally the errors of the "palm" model per error category, across all tasks.
(
    error_df.loc[error_df["model"] == "palm"]
    .groupby("error")
    .agg({"error_message": "count"})
    .reset_index()
)

Unnamed: 0,error,error_message
0,empty_response,1334
1,incorrect_syntax,3


In [8]:
# Count the non-null error_message values per error category over all models.
(
    error_df
    .groupby("error")
    .agg({"error_message": "count"})
    .reset_index()
)

Unnamed: 0,error,error_message
0,empty_response,3994
1,incorrect_syntax,1300
2,key_completeness,129


## Grouping

### Grouped by Model

In [9]:
# Error counts for every (model, error) pair, flattened back to plain columns.
df_grouped_by_model_error = (
    error_df
    .groupby(["model", "error"])
    .agg({"error_message": "count"})
    .reset_index()
)
df_grouped_by_model_error

Unnamed: 0,model,error,error_message
0,falcon,empty_response,964
1,falcon,incorrect_syntax,333
2,falcon,key_completeness,96
3,gemini-pro,empty_response,26
4,gemini-pro,incorrect_syntax,26
5,gemini-pro,key_completeness,16
6,gpt-3.5-turbo,empty_response,134
7,gpt-3.5-turbo,incorrect_syntax,141
8,gpt-4,empty_response,46
9,gpt-4,incorrect_syntax,52


In [10]:
# Write the per-model error counts into the outputs directory;
# index=False keeps the row index out of the file.
df_grouped_by_model_error.to_csv(
    path_or_buf=llms_path_outputs / "grouped_by_model_error.csv",
    index=False,
)

### Grouped by Task

In [11]:
# Error counts for every (task, error) pair, all models included.
df_grouped_by_task_error = (
    error_df
    .groupby(["task", "error"])
    .agg({"error_message": "count"})
    .reset_index()
)
df_grouped_by_task_error

Unnamed: 0,task,error,error_message
0,character,empty_response,787
1,character,incorrect_syntax,236
2,character,key_completeness,26
3,dialogue,empty_response,709
4,dialogue,incorrect_syntax,257
5,dialogue,key_completeness,22
6,enemy,empty_response,814
7,enemy,incorrect_syntax,238
8,enemy,key_completeness,12
9,quest,empty_response,794


In [14]:
# Same (task, error) breakdown, with the rows of the "palm" model left out.
df_grouped_by_task_error_excl_palm_2 = (
    error_df.loc[error_df['model'] != 'palm']
    .groupby(["task", "error"])
    .agg({"error_message": "count"})
    .reset_index()
)
df_grouped_by_task_error_excl_palm_2

Unnamed: 0,task,error,error_message
0,character,empty_response,487
1,character,incorrect_syntax,236
2,character,key_completeness,26
3,dialogue,empty_response,575
4,dialogue,incorrect_syntax,254
5,dialogue,key_completeness,22
6,enemy,empty_response,514
7,enemy,incorrect_syntax,238
8,enemy,key_completeness,12
9,quest,empty_response,494


### Grouped by Format

In [11]:
# Error counts for every (format, error) pair, all models included.
df_grouped_by_format_error = (
    error_df
    .groupby(["format", "error"])
    .agg({"error_message": "count"})
    .reset_index()
)
df_grouped_by_format_error

Unnamed: 0,format,error,error_message
0,json,empty_response,1762
1,json,key_completeness,28
2,xml,empty_response,1056
3,xml,incorrect_syntax,683
4,xml,key_completeness,43
5,yaml,empty_response,1176
6,yaml,incorrect_syntax,617
7,yaml,key_completeness,58


In [13]:
# Same (format, error) breakdown, with the rows of the "palm" model left out.
df_grouped_by_format_error_excl_palm_2 = (
    error_df.loc[error_df['model'] != 'palm']
    .groupby(["format", "error"])
    .agg({"error_message": "count"})
    .reset_index()
)
df_grouped_by_format_error_excl_palm_2

Unnamed: 0,format,error,error_message
0,json,empty_response,1329
1,json,key_completeness,28
2,xml,empty_response,655
3,xml,incorrect_syntax,680
4,xml,key_completeness,43
5,yaml,empty_response,676
6,yaml,incorrect_syntax,617
7,yaml,key_completeness,58
