In [1]:
import os
import json
import pandas as pd
from pathlib import Path

In [2]:
result_path = Path(os.getcwd()) / "results"

In [3]:
result_summary = {}
summary_file_name = "summary.json"

In [4]:
for llm in os.listdir(result_path):
    if os.path.isdir(result_path / llm):
        result_summary[llm] = {}
        for task in os.listdir(result_path / llm):
            if os.path.isdir(result_path / llm / task):
                result_summary[llm][task] = {}
                for pe in os.listdir(result_path / llm / task):
                    if os.path.isdir(result_path / llm / task / pe):
                        result_summary[llm][task][pe] = {}
                        files = [file for file in os.listdir(result_path / llm / task / pe) if
                                 file != summary_file_name and file != ".DS_Store"]
                        with open(result_path / llm / task / pe / summary_file_name, "r") as f:
                            result_summary[llm][task][pe] = json.load(f)

In [5]:
result_summary

{'palm-2-chat': {'anli': {'zero-shot': {'model': 'palm-2-chat',
    'task': 'anli',
    'pe_technique': 'zero-shot',
    'current_index': 1200,
    'is_processing': False,
    'correct': 512,
    'total': 1200,
    'accuracy': 42.66666666666667,
    'created_at': '2023-12-29-12-54-46'},
   'null-shot-cot': {'model': 'palm-2-chat',
    'task': 'anli',
    'pe_technique': 'null-shot-cot',
    'current_index': 1200,
    'is_processing': False,
    'correct': 509,
    'total': 1200,
    'accuracy': 42.41666666666667,
    'created_at': '2023-12-29-12-54-49'},
   'zero-shot-cot': {'model': 'palm-2-chat',
    'task': 'anli',
    'pe_technique': 'zero-shot-cot',
    'current_index': 1200,
    'is_processing': False,
    'correct': 522,
    'total': 1200,
    'accuracy': 43.5,
    'created_at': '2023-12-29-12-54-48'},
   'null-shot': {'model': 'palm-2-chat',
    'task': 'anli',
    'pe_technique': 'null-shot',
    'current_index': 1200,
    'is_processing': False,
    'correct': 520,
    'total

In [6]:
for model in result_summary.keys():
    for task in result_summary[model].keys():
        for pe in result_summary[model][task].keys():
            lengths = []
            time_takens = []
            files = [file for file in os.listdir(result_path / model / task / pe) if file != summary_file_name and file != ".DS_Store"]
            for file in files:
                with open(Path(result_path) / model / task / pe / file) as f:
                    data = json.load(f)
                    lengths.append(data["length"])
                    time_takens.append(data["time_taken"])
            mean_length = sum(lengths) / len(lengths)
            mean_time_taken = sum(time_takens) / len(time_takens)
            std_length = (sum([(length - mean_length) ** 2 for length in lengths]) / len(lengths)) ** 0.5
            std_time_taken = (sum([(time_taken - mean_time_taken) ** 2 for time_taken in time_takens]) / len(time_takens)) ** 0.5
            min_length = min(lengths)
            min_time_taken = min(time_takens)
            max_length = max(lengths)
            max_time_taken = max(time_takens)
            median_length = sorted(lengths)[len(lengths) // 2]
            median_time_taken = sorted(time_takens)[len(time_takens) // 2]
            total_time_taken = sum(time_takens)
            result_summary[model][task][pe]["mean_length"] = mean_length
            result_summary[model][task][pe]["median_length"] = median_length
            result_summary[model][task][pe]["std_length"] = std_length
            result_summary[model][task][pe]["min_length"] = min_length
            result_summary[model][task][pe]["max_length"] = max_length
            result_summary[model][task][pe]["mean_time_taken"] = mean_time_taken
            result_summary[model][task][pe]["median_time_taken"] = median_time_taken
            result_summary[model][task][pe]["std_time_taken"] = std_time_taken
            result_summary[model][task][pe]["min_time_taken"] = min_time_taken
            result_summary[model][task][pe]["max_time_taken"] = max_time_taken
            result_summary[model][task][pe]["total_time_taken"] = total_time_taken            

In [7]:
df = pd.DataFrame.from_dict({(i, j, k): result_summary[i][j][k]
                             for i in result_summary.keys()
                             for j in result_summary[i].keys()
                             for k in result_summary[i][j].keys()},
                            orient='index')

In [8]:
df = df.drop(columns=["current_index", "is_processing"])

In [9]:
df

Unnamed: 0,Unnamed: 1,Unnamed: 2,model,task,pe_technique,correct,total,accuracy,created_at,mean_length,median_length,std_length,min_length,max_length,mean_time_taken,median_time_taken,std_time_taken,min_time_taken,max_time_taken,total_time_taken
palm-2-chat,anli,zero-shot,palm-2-chat,anli,zero-shot,512,1200,42.666667,2023-12-29-12-54-46,940.503333,893,364.215984,0,3673,3.856965,3.707466,1.160879,1.379333,12.517058,4628.357446
palm-2-chat,anli,null-shot-cot,palm-2-chat,anli,null-shot-cot,509,1200,42.416667,2023-12-29-12-54-49,1287.180833,1200,494.813151,0,4658,5.207829,4.887037,1.767292,2.317173,18.855802,6249.395060
palm-2-chat,anli,zero-shot-cot,palm-2-chat,anli,zero-shot-cot,522,1200,43.500000,2023-12-29-12-54-48,1037.376667,977,375.734924,0,3745,4.174589,3.947784,1.317973,1.929132,19.959722,5009.506384
palm-2-chat,anli,null-shot,palm-2-chat,anli,null-shot,520,1200,43.333333,2023-12-29-12-54-47,1038.062500,965,424.794947,0,4658,4.202485,3.980942,1.371685,1.479593,16.875456,5042.981712
palm-2-chat,triviaqa,zero-shot,palm-2-chat,triviaqa,zero-shot,702,1000,70.200000,2023-12-29-12-55-16,997.997000,1001,618.858208,0,3962,4.738361,4.574121,2.300055,0.943068,17.262771,4738.361188
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
gpt-3.5-turbo,race-h,null-shot-after,gpt-3.5-turbo,race-h,null-shot-after,2727,3498,77.958834,2023-12-29-12-56-59,232.633505,223,196.103293,4,1253,1.726554,1.528745,2.004366,0.329291,35.706460,6039.486307
gpt-3.5-turbo,race-h,null-shot-v2,gpt-3.5-turbo,race-h,null-shot-v2,2836,3498,81.074900,2023-12-29-12-57-05,40.795597,39,22.834433,4,413,1.051681,0.762450,1.229380,0.326367,33.271666,3678.781846
gpt-3.5-turbo,race-h,zero-shot-cot,gpt-3.5-turbo,race-h,zero-shot-cot,2717,3498,77.672956,2023-12-29-12-53-52,419.522584,367,224.473467,25,1741,3.448154,1.969786,6.890552,0.419034,125.843588,12061.642372
gpt-3.5-turbo,race-h,null-shot,gpt-3.5-turbo,race-h,null-shot,2825,3498,80.760435,2023-12-29-12-53-50,83.260720,44,111.424351,4,857,4.916380,1.700208,10.119005,0.334305,99.133077,17197.498657
