In [1]:
import os
import json
import pandas as pd
from pathlib import Path

In [2]:
result_path = Path(os.getcwd()) / "results"

In [3]:
result_summary = {}
summary_file_name = "summary.json"

In [4]:
for llm in os.listdir(result_path):
    if os.path.isdir(result_path / llm):
        result_summary[llm] = {}
        for task in os.listdir(result_path / llm):
            if os.path.isdir(result_path / llm / task):
                result_summary[llm][task] = {}
                for pe in os.listdir(result_path / llm / task):
                    if os.path.isdir(result_path / llm / task / pe):
                        result_summary[llm][task][pe] = {}
                        files = [file for file in os.listdir(result_path / llm / task / pe) if
                                 file != summary_file_name and file != ".DS_Store"]
                        with open(result_path / llm / task / pe / summary_file_name, "r") as f:
                            result_summary[llm][task][pe] = json.load(f)

In [5]:
result_summary

{'gemini-pro-chat': {'anli': {'zero-shot': {'model': 'gemini-pro-chat',
    'task': 'anli',
    'pe_technique': 'zero-shot',
    'current_index': 1200,
    'is_processing': False,
    'correct': 613,
    'total': 1200,
    'accuracy': 51.083333333333336,
    'created_at': '2024-02-11-20-55-25'},
   'null-shot-cot': {'model': 'gemini-pro-chat',
    'task': 'anli',
    'pe_technique': 'null-shot-cot',
    'current_index': 1200,
    'is_processing': False,
    'correct': 540,
    'total': 1200,
    'accuracy': 45.0,
    'created_at': '2024-02-11-20-55-27'},
   'zero-shot-cot': {'model': 'gemini-pro-chat',
    'task': 'anli',
    'pe_technique': 'zero-shot-cot',
    'current_index': 1200,
    'is_processing': False,
    'correct': 587,
    'total': 1200,
    'accuracy': 48.91666666666667,
    'created_at': '2024-02-11-20-55-26'},
   'null-shot': {'model': 'gemini-pro-chat',
    'task': 'anli',
    'pe_technique': 'null-shot',
    'current_index': 1200,
    'is_processing': False,
    'corr

In [11]:
for model in result_summary.keys():
    for task in result_summary[model].keys():
        for pe in result_summary[model][task].keys():
            lengths = []
            time_takens = []
            files = [file for file in os.listdir(result_path / model / task / pe) if file != summary_file_name and file != ".DS_Store"]
            for file in files:
                with open(Path(result_path) / model / task / pe / file) as f:
                    data = json.load(f)
                    lengths.append(data["length"])
                    time_takens.append(data["time_taken"])
            mean_length = sum(lengths) / len(lengths)
            mean_time_taken = sum(time_takens) / len(time_takens)
            std_length = (sum([(length - mean_length) ** 2 for length in lengths]) / len(lengths)) ** 0.5
            std_time_taken = (sum([(time_taken - mean_time_taken) ** 2 for time_taken in time_takens]) / len(time_takens)) ** 0.5
            min_length = min(lengths)
            min_time_taken = min(time_takens)
            max_length = max(lengths)
            max_time_taken = max(time_takens)
            median_length = sorted(lengths)[len(lengths) // 2]
            median_time_taken = sorted(time_takens)[len(time_takens) // 2]
            total_time_taken = sum(time_takens)
            result_summary[model][task][pe]["mean_length"] = mean_length
            result_summary[model][task][pe]["median_length"] = median_length
            result_summary[model][task][pe]["std_length"] = std_length
            result_summary[model][task][pe]["min_length"] = min_length
            result_summary[model][task][pe]["max_length"] = max_length
            result_summary[model][task][pe]["mean_time_taken"] = mean_time_taken
            result_summary[model][task][pe]["median_time_taken"] = median_time_taken
            result_summary[model][task][pe]["std_time_taken"] = std_time_taken
            result_summary[model][task][pe]["min_time_taken"] = min_time_taken
            result_summary[model][task][pe]["max_time_taken"] = max_time_taken
            result_summary[model][task][pe]["total_time_taken"] = total_time_taken            

In [7]:
df = pd.DataFrame.from_dict({(i, j, k): result_summary[i][j][k]
                             for i in result_summary.keys()
                             for j in result_summary[i].keys()
                             for k in result_summary[i][j].keys()},
                            orient='index')

In [8]:
df = df.drop(columns=["current_index", "is_processing"])

In [9]:
df

Unnamed: 0,Unnamed: 1,Unnamed: 2,model,task,pe_technique,correct,total,accuracy,created_at,mean_length,median_length,std_length,min_length,max_length,mean_time_taken,median_time_taken,std_time_taken,min_time_taken,max_time_taken,total_time_taken
gemini-pro-chat,anli,zero-shot,gemini-pro-chat,anli,zero-shot,613,1200,51.083333,2024-02-11-20-55-25,13.236667,13,3.081773,0,16,1.276951,1.072749,0.639031,0.246920,8.278853,1532.340873
gemini-pro-chat,anli,null-shot-cot,gemini-pro-chat,anli,null-shot-cot,540,1200,45.000000,2024-02-11-20-55-27,80.192500,16,143.966560,0,875,1.732134,1.320717,1.045407,0.235197,10.327671,2078.561143
gemini-pro-chat,anli,zero-shot-cot,gemini-pro-chat,anli,zero-shot-cot,587,1200,48.916667,2024-02-11-20-55-26,14.296667,16,2.788725,0,16,1.427989,1.114105,2.008976,0.254749,66.187193,1713.587162
gemini-pro-chat,anli,null-shot,gemini-pro-chat,anli,null-shot,623,1200,51.916667,2024-02-11-20-55-25,14.009167,13,2.414004,0,16,1.286978,1.099193,0.551863,0.297095,5.648444,1544.373709
gemini-pro-chat,triviaqa,zero-shot,gemini-pro-chat,triviaqa,zero-shot,705,1000,70.500000,2024-02-11-20-55-34,13.327000,9,61.193595,0,1779,1.558215,1.099253,1.154269,0.781255,12.064900,1558.214798
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
gpt-3.5-turbo,race-h,null-shot-after,gpt-3.5-turbo,race-h,null-shot-after,2727,3498,77.958834,2024-01-03-13-17-11,232.633505,223,196.103293,4,1253,1.726554,1.528745,2.004366,0.329291,35.706460,6039.486307
gpt-3.5-turbo,race-h,null-shot-v2,gpt-3.5-turbo,race-h,null-shot-v2,2836,3498,81.074900,2024-01-03-13-17-17,40.795597,39,22.834433,4,413,1.051681,0.762450,1.229380,0.326367,33.271666,3678.781846
gpt-3.5-turbo,race-h,zero-shot-cot,gpt-3.5-turbo,race-h,zero-shot-cot,2717,3498,77.672956,2024-01-03-13-18-42,419.522584,367,224.473467,25,1741,3.448154,1.969786,6.890552,0.419034,125.843588,12061.642372
gpt-3.5-turbo,race-h,null-shot,gpt-3.5-turbo,race-h,null-shot,2825,3498,80.760435,2024-01-03-13-18-38,83.260720,44,111.424351,4,857,4.916380,1.700208,10.119005,0.334305,99.133077,17197.498657


In [10]:
df.to_csv("summary.csv")