## Print the evaluation results

In [None]:
import os
from lm_eval.utils import make_table
import json
import pandas as pd

# Root directory that holds every evaluation output.
eval_root_path = "eval_results"

# Benchmarks whose scores are looked up in the result JSON files.
TASK_LISTS = [
    'mmlu',
    'bbh',
    'gsm8k',
    'truthfulqa_mc2',
    "arc_challenge",
    "piqa",
    "hellaswag",
    "openbookqa",
    "triviaqa",
    'sciq',
    'arc_easy',
    'logiqa',
    'boolq',
    'winogrande',
]

# Base model identifier; only its last path component is used in result paths.
# Other values used with this notebook:
#   "meta-llama/Llama-3.1-8B", "mistralai/Mistral-7B-v0.3"
base_model = "meta-llama/Llama-3.2-3B"

# Data proportion; appears as a path component and inside result directory names.
data_prop = 0.6

# Tags of the evaluated models, one result row per tag.
model_tags = ["ds2-50k-self-evolving"]


# Metric key read from each task's entry in a result JSON file.
# A task listed in TASK_LISTS but missing here raises KeyError (naming the
# task) instead of silently reusing the metric of the previously seen task.
TASK_METRICS = {
    task: 'acc,none'
    for task in (
        'hellaswag', 'piqa', 'openbookqa', 'arc_challenge', 'mmlu',
        'truthfulqa_mc2', 'sciq', 'arc_easy', 'logiqa', 'boolq', 'winogrande',
    )
}
TASK_METRICS.update({
    'gsm8k': 'exact_match,strict-match',
    'triviaqa': 'exact_match,remove_whitespace',
    'bbh': 'exact_match,get-answer',
})


def _find_log_dir(eval_result_path, model_tag, base_model, data_prop):
    """Return the name of the sub-directory of *eval_result_path* that holds the result JSONs.

    For every tag except ``'base'`` the directory name must contain both
    ``str(data_prop)`` and the base model's basename; the last match (in
    sorted order) is used. For ``'base'`` the first sub-directory is used.

    Only directories are considered, because the caller lists the returned
    entry; plain files such as ``metrics.json`` living next to it are ignored.

    Raises:
        FileNotFoundError: if no suitable sub-directory exists.
    """
    # Sorted so the choice does not depend on the arbitrary os.listdir order.
    candidates = sorted(
        name for name in os.listdir(eval_result_path)
        if os.path.isdir(os.path.join(eval_result_path, name))
    )
    if model_tag != 'base':
        model_name = os.path.basename(base_model)
        candidates = [
            name for name in candidates
            if str(data_prop) in name and model_name in name
        ]
    if not candidates:
        raise FileNotFoundError(
            f"No result directory found for model tag {model_tag!r} under {eval_result_path!r}"
        )
    return candidates[0] if model_tag == 'base' else candidates[-1]


# Maps model tag -> {task name: score}.
results_all = {}
for model_tag in model_tags:
    eval_result_path = f"{eval_root_path}/{os.path.basename(base_model)}/{data_prop}/{model_tag}/"

    # Resolved afresh for every tag, so a tag without results fails loudly
    # instead of reusing the directory found for the previous tag.
    log_path = _find_log_dir(eval_result_path, model_tag, base_model, data_prop)
    log_dir = os.path.join(eval_result_path, log_path)

    # Only *.json files are parsed; other entries in the directory (for
    # example *.jsonl logs) are not single JSON documents.
    # Sorted so that, when several files report the same task, the one that
    # sorts last wins deterministically (presumably the newest if the names
    # carry timestamps -- TODO confirm).
    json_files = sorted(name for name in os.listdir(log_dir) if name.endswith('.json'))

    results = {}
    for file in json_files:
        with open(os.path.join(log_dir, file), 'r', encoding='utf-8') as f:
            temp = json.load(f)
        task_results = temp['results']
        for task in TASK_LISTS:
            if task in task_results:
                results[task] = task_results[task][TASK_METRICS[task]]

    ## load tydiqa result
    # Optional: metrics.json sits directly in the tag directory. Its average
    # F1 is divided by 100 so it is on the same scale as the other scores.
    tydiqa_result_file = os.path.join(eval_result_path, "metrics.json")
    if os.path.exists(tydiqa_result_file):
        with open(tydiqa_result_file, 'r', encoding='utf-8') as f:
            results['tydiqa'] = round(json.load(f)['average']['f1'] / 100, 4)

    results_all[model_tag] = results


# One row per model tag, one column per task that produced a score.
results_df = pd.DataFrame.from_dict(results_all, orient='index')

# NOTE: this rebinds TASK_LISTS to the tasks (and column order) shown in the report.
TASK_LISTS=["truthfulqa_mc2", "tydiqa", 'logiqa', 'mmlu',  "hellaswag", "arc_challenge", "boolq"]

# reindex instead of results_df[TASK_LISTS]: a report task with no score for
# any model (e.g. tydiqa when metrics.json is absent) becomes a NaN column
# rather than raising KeyError.
results_df = results_df.reindex(columns=TASK_LISTS)

# Convert scores to percentages with two decimals, leaving missing values untouched.
# DataFrame.map requires pandas >= 2.1 (older versions call it applymap).
results_df = results_df.map(lambda x: round(100*x, 2) if pd.notnull(x) else x)

# Row-wise mean of the displayed percentages; mean() skips NaN entries, so a
# model with missing tasks is averaged over the tasks it does have.
results_df['Average'] = results_df.mean(axis=1).round(1)

print("\nResults DataFrame (Reordered with Average, Percentage Format):\n")
# Present rows in the order the model tags were configured.
results_df = results_df.reindex(model_tags)
print(results_df.to_string(line_width=1000))


