## Print the evaluation results

In [2]:
import os
from lm_eval.utils import make_table
import json
import pandas as pd

############# base model #########
# Model whose evaluation results are looked up; the commented lines are
# alternative choices kept for quick switching.
base_model = "meta-llama/Llama-3.2-3B"
# base_model = "meta-llama/Llama-3.1-8B"
# base_model = "mistralai/Mistral-7B-v0.3"
# base_model = "meta-llama/Llama-3.1-8B-Instruct"

########### train data tag ###############
# Exactly one dataset tag is active at a time.
# dataset_name = "filtered-cured-50k-rho-baseline-global-llama3b-41"
# dataset_name = "filtered-cured-50k-rho-baseline-global-llama3b-43"
# dataset_name = "filtered-cured-50k-rho-baseline-llama3b-global-low-ppl-0.86"
dataset_name = "filtered-cured-50k-rho-baseline-llama3b-global-high-ppl"

# Tags of the result folders to read (here: only the active dataset tag).
model_tags = [dataset_name]

# Sub-folder of token_selection_results/ that is searched, and the substring
# a run directory name must contain.
data_prop = 0.6

# Tasks whose scores are extracted from the result files.
TASK_LISTS = [
    'mmlu',
    'bbh',
    'gsm8k',
    'truthfulqa_mc2',
    "arc_challenge",
    "piqa",
    "hellaswag",
    "openbookqa",
    "triviaqa",
    'sciq',
    'arc_easy',
    'logiqa',
    'boolq',
    'winogrande',
]

# Metric key read from each task's entry in a result file.
_TASK_METRICS = {
    task: 'acc,none'
    for task in (
        'hellaswag', 'piqa', 'openbookqa', 'arc_challenge', 'mmlu', 'truthfulqa_mc2',
        'sciq', 'arc_easy', 'logiqa', 'boolq', 'winogrande',
    )
}
_TASK_METRICS.update({
    'gsm8k': 'exact_match,strict-match',
    'triviaqa': "exact_match,remove_whitespace",
    'bbh': 'exact_match,get-answer',
})


def _find_run_dir(tag_dir, model_tag, data_prop, base_model):
    """Return the path of the result directory to read inside ``tag_dir``.

    For the ``'base'`` tag the first listed entry is used.  For any other tag
    the entry name must contain both ``str(data_prop)`` and the basename of
    ``base_model``; if several entries match, the last one listed wins (the
    order returned by ``os.listdir`` is not guaranteed).

    Raises:
        FileNotFoundError: if no entry qualifies.  Failing here prevents the
            script from continuing with an undefined or stale directory name.
    """
    entries = os.listdir(tag_dir)
    if model_tag == 'base':
        candidates = entries[:1]
    else:
        model_name = os.path.basename(base_model)
        candidates = [name for name in entries if str(data_prop) in name and model_name in name]
    if not candidates:
        raise FileNotFoundError(
            f"No result directory for data_prop={data_prop} and base model "
            f"{os.path.basename(base_model)!r} under {tag_dir}"
        )
    return os.path.join(tag_dir, candidates[-1])


def _collect_results(model_tag, data_prop, base_model, tasks):
    """Return ``{task: score}`` for every task of ``tasks`` found for ``model_tag``.

    Every file of the selected run directory is parsed as JSON and its
    ``'results'`` mapping is scanned for the requested tasks.  If a
    ``metrics.json`` file sits next to the run directory, its average F1 is
    added under the key ``'tydiqa'``.

    Raises:
        FileNotFoundError: if no run directory matches (see ``_find_run_dir``).
        KeyError: if a requested task has no entry in ``_TASK_METRICS``.
    """
    tag_dir = f"token_selection_results/{data_prop}/{model_tag}/"
    run_dir = _find_run_dir(tag_dir, model_tag, data_prop, base_model)

    scores = {}
    for file_name in os.listdir(run_dir):
        with open(os.path.join(run_dir, file_name), 'r') as f:
            task_results = json.load(f)['results']
        for task in tasks:
            if task in task_results:
                scores[task] = task_results[task][_TASK_METRICS[task]]

    ####### tydiqa file #########
    metrics_path = f"token_selection_results/{data_prop}/{model_tag}/metrics.json"
    if os.path.exists(metrics_path):
        with open(metrics_path, 'r') as f:
            tydiqa_metrics = json.load(f)
        # Divided by 100, presumably to match the 0-1 scale of the other
        # scores (everything is multiplied by 100 again for the report).
        scores['tydiqa'] = round(tydiqa_metrics['average']['f1'] / 100, 4)
    ###########################
    return scores


def _build_report(results_all, model_tags, columns):
    """Build the percentage table with one row per tag and an ``Average`` column.

    ``columns`` selects and orders the reported tasks.  A task without any
    collected score becomes a NaN column instead of raising ``KeyError``;
    NaN cells are skipped by the row average.
    """
    report = pd.DataFrame.from_dict(results_all, orient='index')
    report = report.reindex(columns=columns)
    report = report.map(lambda x: round(100*x, 2) if pd.notnull(x) else x)
    report['Average'] = report.mean(axis=1).round(1)
    report = report.reindex(model_tags)
    report.index = report.index.str.replace('_', '-', regex=False)
    return report


results_all = {}
for model_tag in model_tags:
    results_all[model_tag] = _collect_results(model_tag, data_prop, base_model, TASK_LISTS)

# Tasks shown in the final table, in display order.
REPORT_TASKS = ["truthfulqa_mc2", "tydiqa", 'logiqa', 'mmlu',  "hellaswag", "arc_challenge", "boolq"]
results_df = _build_report(results_all, model_tags, REPORT_TASKS)

print("\nResults DataFrame (Reordered with Average, Percentage Format):\n")
print(results_df.to_string(line_width=1000))

latex_table = results_df.to_latex(index=True, caption="模型评估结果", label="tab:results", float_format="%.2f")

# Print the LaTeX table to the console
print("\n" +"#" * 80 + "\nLaTeX Form:\n" + "#" * 80 )
print(latex_table)


Results DataFrame (Reordered with Average, Percentage Format):

                                                         truthfulqa_mc2  tydiqa  logiqa   mmlu  hellaswag  arc_challenge  boolq  Average
filtered-cured-50k-rho-baseline-llama3b-global-high-ppl           44.28   38.98   23.26  57.01      55.28          44.79  73.63     48.2

################################################################################
LaTeX Form:
################################################################################
\begin{table}
\caption{模型评估结果}
\label{tab:results}
\begin{tabular}{lrrrrrrrr}
\toprule
 & truthfulqa_mc2 & tydiqa & logiqa & mmlu & hellaswag & arc_challenge & boolq & Average \\
\midrule
filtered-cured-50k-rho-baseline-llama3b-global-high-ppl & 44.28 & 38.98 & 23.26 & 57.01 & 55.28 & 44.79 & 73.63 & 48.20 \\
\bottomrule
\end{tabular}
\end{table}



## Calculate the overall average result

In [None]:
import os
from lm_eval.utils import make_table
import json
import pandas as pd

############# base model #########
base_model="meta-llama/Llama-3.2-3B"
# base_model="meta-llama/Llama-3.1-8B"
# base_model="mistralai/Mistral-7B-v0.3"
# base_model="meta-llama/Llama-3.1-8B-Instruct"

########### train data tag ###############

# One inner list per training configuration; each inner list holds the dataset
# tags of its three runs (suffix -41, no suffix, suffix -43 -- presumably the
# random seeds 41/42/43, TODO confirm).
# The inner lists must be separated by commas: without them Python parses
# `[...] [...]` as an indexing expression and raises TypeError.
Train_DATASET_LIST = [
    ["filtered-cured-10k-warmup-llama3b-41", "filtered-cured-10k-warmup-llama3b", "filtered-cured-10k-warmup-llama3b-43"],
    ["filtered-cured-50k-full-baseline-41", "filtered-cured-50k-full-baseline", "filtered-cured-50k-full-baseline-43"],
    ["filtered-cured-50k-random-baseline-41", "filtered-cured-50k-random-baseline", "filtered-cured-50k-random-baseline-43"],
    ["filtered-cured-50k-rho-baseline-sample-llama3b-41", "filtered-cured-50k-rho-baseline-sample-llama3b", "filtered-cured-50k-rho-baseline-sample-llama3b-43"],
    ["filtered-cured-50k-rho-baseline-global-llama3b-41", "filtered-cured-50k-rho-baseline-global-llama3b", "filtered-cured-50k-rho-baseline-global-llama3b-43"],
    ["filtered-cured-50k-iter-split-global_data_prop_0.6_llama3b-non-filtered-fixed-base-model-41_4", "filtered-cured-50k-iter-split-global_data_prop_0.6_llama3b-non-filtered-fixed-base-model_4", "filtered-cured-50k-iter-split-global_data_prop_0.6_llama3b-non-filtered-fixed-base-model-43_4"],
]

# Metric key read from each task's entry in a result file.
_TASK_METRICS = {
    task: 'acc,none'
    for task in (
        'hellaswag', 'piqa', 'openbookqa', 'arc_challenge', 'mmlu', 'truthfulqa_mc2',
        'sciq', 'arc_easy', 'logiqa', 'boolq', 'winogrande',
    )
}
_TASK_METRICS.update({
    'gsm8k': 'exact_match,strict-match',
    'triviaqa': "exact_match,remove_whitespace",
    'bbh': 'exact_match,get-answer',
})


def _find_run_dir(tag_dir, model_tag, data_prop, base_model):
    """Return the path of the result directory to read inside ``tag_dir``.

    For the ``'base'`` tag the first listed entry is used.  For any other tag
    the entry name must contain both ``str(data_prop)`` and the basename of
    ``base_model``; if several entries match, the last one listed wins (the
    order returned by ``os.listdir`` is not guaranteed).

    Raises:
        FileNotFoundError: if no entry qualifies.  Failing here prevents the
            loop from silently reusing a value left over from the previous
            dataset.
    """
    entries = os.listdir(tag_dir)
    if model_tag == 'base':
        candidates = entries[:1]
    else:
        model_name = os.path.basename(base_model)
        candidates = [name for name in entries if str(data_prop) in name and model_name in name]
    if not candidates:
        raise FileNotFoundError(
            f"No result directory for data_prop={data_prop} and base model "
            f"{os.path.basename(base_model)!r} under {tag_dir}"
        )
    return os.path.join(tag_dir, candidates[-1])


def _collect_results(model_tag, data_prop, base_model, tasks):
    """Return ``{task: score}`` for every task of ``tasks`` found for ``model_tag``.

    Every file of the selected run directory is parsed as JSON and its
    ``'results'`` mapping is scanned for the requested tasks.  If a
    ``metrics.json`` file sits next to the run directory, its average F1 is
    added under the key ``'tydiqa'``.

    Raises:
        FileNotFoundError: if no run directory matches (see ``_find_run_dir``).
        KeyError: if a requested task has no entry in ``_TASK_METRICS``.
    """
    tag_dir = f"token_selection_results/{data_prop}/{model_tag}/"
    run_dir = _find_run_dir(tag_dir, model_tag, data_prop, base_model)

    scores = {}
    for file_name in os.listdir(run_dir):
        with open(os.path.join(run_dir, file_name), 'r') as f:
            task_results = json.load(f)['results']
        for task in tasks:
            if task in task_results:
                scores[task] = task_results[task][_TASK_METRICS[task]]

    ####### tydiqa file #########
    metrics_path = f"token_selection_results/{data_prop}/{model_tag}/metrics.json"
    if os.path.exists(metrics_path):
        with open(metrics_path, 'r') as f:
            tydiqa_metrics = json.load(f)
        # Divided by 100, presumably to match the 0-1 scale of the other
        # scores (everything is multiplied by 100 again for the report).
        scores['tydiqa'] = round(tydiqa_metrics['average']['f1'] / 100, 4)
    ###########################
    return scores


def _build_report(results_all, model_tags, columns):
    """Build the percentage table with one row per tag and an ``Average`` column.

    ``columns`` selects and orders the reported tasks.  A task without any
    collected score becomes a NaN column instead of raising ``KeyError``;
    NaN cells are skipped by the row average.
    """
    report = pd.DataFrame.from_dict(results_all, orient='index')
    report = report.reindex(columns=columns)
    report = report.map(lambda x: round(100*x, 2) if pd.notnull(x) else x)
    report['Average'] = report.mean(axis=1).round(1)
    report = report.reindex(model_tags)
    report.index = report.index.str.replace('_', '-', regex=False)
    return report


# Loop-invariant settings, defined once instead of on every iteration.
data_prop = 0.6
# Tasks whose scores are extracted from the result files.
TASK_LISTS = ['mmlu', 'bbh', 'gsm8k', 'truthfulqa_mc2', "arc_challenge", "piqa", "hellaswag", "openbookqa", "triviaqa", 'sciq', 'arc_easy', 'logiqa', 'boolq', 'winogrande']
# Tasks shown in the final table, in display order.
REPORT_TASKS = ["truthfulqa_mc2", "tydiqa", 'logiqa', 'mmlu',  "hellaswag", "arc_challenge", "boolq"]

for dataset_names in Train_DATASET_LIST:
    for dataset_name in dataset_names:
        # Each dataset is reported in its own single-row table.
        model_tags = [dataset_name]

        results_all = {
            model_tag: _collect_results(model_tag, data_prop, base_model, TASK_LISTS)
            for model_tag in model_tags
        }
        results_df = _build_report(results_all, model_tags, REPORT_TASKS)

        print("\nResults DataFrame (Reordered with Average, Percentage Format):\n")
        print(results_df.to_string(line_width=1000))

        latex_table = results_df.to_latex(index=True, caption="模型评估结果", label="tab:results", float_format="%.2f")

        # Print the LaTeX table to the console
        print("\n" +"#" * 80 + "\nLaTeX Form:\n" + "#" * 80 )
        print(latex_table)

# Store the results in CSV files

In [8]:
import os
import json
import pandas as pd

# Base model settings


### llama3b
# base_model="meta-llama/Llama-3.2-3B"
# Train_DATASET_LIST = [
#     ["filtered-cured-10k-warmup-llama3b-41", "filtered-cured-10k-warmup-llama3b", "filtered-cured-10k-warmup-llama3b-43"],
#     ["filtered-cured-50k-full-baseline-41", "filtered-cured-50k-full-baseline", "filtered-cured-50k-full-baseline-43"],
#     ["filtered-cured-50k-random-baseline-41", "filtered-cured-50k-random-baseline", "filtered-cured-50k-random-baseline-43"],
#     ["filtered-cured-50k-rho-baseline-sample-llama3b-41", "filtered-cured-50k-rho-baseline-sample", "filtered-cured-50k-rho-baseline-sample-llama3b-43"],
#     ["filtered-cured-50k-rho-baseline-global-llama3b-41", "filtered-cured-50k-rho-baseline-global", "filtered-cured-50k-rho-baseline-global-llama3b-43"],
#     ["filtered-cured-50k-iter-split-global_data_prop_0.6_llama3b-non-filtered-fixed-base-model-41_4","filtered-cured-50k-iter-split-global_data_prop_0.6_llama3b-non-filtered-fixed-base-model_4", "filtered-cured-50k-iter-split-global_data_prop_0.6_llama3b-non-filtered-fixed-base-model-43_4"]
# ]

## mistral
# base_model="mistralai/Mistral-7B-v0.3"
# Train_DATASET_LIST=[
#     ["filtered-cured-50k-rho-baseline-mistral-41", "filtered-cured-50k-rho-baseline", "filtered-cured-50k-rho-baseline-mistral-43" ],
#     ["filtered-cured-50k-random-baseline-41", "filtered-cured-50k-shuffle-random-baseline" ,"filtered-cured-50k-random-baseline-43"],
#     ["filtered-cured-50k-full-baseline-41", "filtered-cured-50k-shuffle-full-baseline", "filtered-cured-50k-full-baseline-43"],
#     ["filtered-cured-10k-warmup-mistral-41", "filtered-cured-10k-warmup-mistral", "filtered-cured-10k-warmup-mistral-43"],
#     ["filtered-cured-50k-rho-baseline-mistral-global-41", "filtered-cured-50k-rho-baseline-mistral-global" ,"filtered-cured-50k-rho-baseline-mistral-global-43"],
#     ["filtered-cured-50k-iter-split-global_data_prop_0.6_mistral-non-filtered-fixed-base-model-41_4", "filtered-cured-50k-iter-split-global_data_prop_0.6_mistral-non-filtered-fixed-base-model_4", "filtered-cured-50k-iter-split-global_data_prop_0.6_mistral-non-filtered-fixed-base-model-43_4"],
#     ]


# Active configuration: Mistral-7B runs.
base_model = "mistralai/Mistral-7B-v0.3"

# Full set of "-new" run groups (two runs per group, suffixes -41 and -43).
Train_DATASET_LIST = [
    [
        "filtered-cured-10k-warmup-mistral-new-41",
        "filtered-cured-10k-warmup-mistral-new-43",
    ],
    [
        "filtered-cured-50k-full-baseline-mistral-41",
        "filtered-cured-50k-full-baseline-mistral-43",
    ],
    [
        "filtered-cured-50k-fixed-model-cleaning-mistral-41",
        "filtered-cured-50k-fixed-model-cleaning-mistral-43",
    ],
    [
        "filtered-cured-50k-rho-baseline-mistral-new-41",
        "filtered-cured-50k-rho-baseline-mistral-new-43",
    ],
    [
        "filtered-cured-50k-iter-split-global_data_prop_0.6_mistral-non-filtered-fixed-base-model-new-41_4",
        "filtered-cured-50k-iter-split-global_data_prop_0.6_mistral-non-filtered-fixed-base-model-new-43_4",
    ],
]

# NOTE: this second assignment replaces the list above, so only this single
# dataset is processed when the cell runs.
Train_DATASET_LIST = [
    [
        "filtered-cured-50k-iter-split-global_data_prop_0.6_mistral-non-filtered-fixed-base-model_4",
    ],
]


#### llama8b
# base_model="meta-llama/Llama-3.1-8B"
# Train_DATASET_LIST=(
#     ["filtered-cured-50k-rho-baseline-llama8b-41", "filtered-cured-50k-rho-baseline","filtered-cured-50k-rho-baseline-llama8b-43" ],
#     ["filtered-cured-50k-random-baseline-41", "filtered-cured-50k-shuffle-random-baseline" , "filtered-cured-50k-random-baseline-43"],
#     ["filtered-cured-50k-full-baseline-41", "filtered-cured-50k-shuffle-full-baseline", "filtered-cured-50k-full-baseline-43"],
#     ["filtered-cured-10k-warmup-llama8b-41", "filtered-cured-10k-warmup-llama8b", "filtered-cured-10k-warmup-llama8b-43"],
#     ["filtered-cured-50k-rho-baseline-llama8b-global-41", "filtered-cured-50k-rho-baseline-llama8b-global", "filtered-cured-50k-rho-baseline-llama8b-global-43"],
#     ["filtered-cured-50k-iter-split-global_data_prop_0.6_llama8b-non-filtered-fixed-base-model-41_4", "filtered-cured-50k-iter-split-global_data_prop_0.6_llama8b-non-filtered-fixed-base-model_4", "filtered-cured-50k-iter-split-global_data_prop_0.6_llama8b-non-filtered-fixed-base-model-43_4"],
#     ) 

# Metric key read from each task's entry in a result file.
_TASK_METRICS = {
    task: 'acc,none'
    for task in (
        'hellaswag', 'piqa', 'openbookqa', 'arc_challenge', 'mmlu', 'truthfulqa_mc2',
        'sciq', 'arc_easy', 'logiqa', 'boolq', 'winogrande',
    )
}
_TASK_METRICS.update({
    'gsm8k': 'exact_match,strict-match',
    'triviaqa': "exact_match,remove_whitespace",
    'bbh': 'exact_match,get-answer',
})


def _find_run_dir(tag_dir, model_tag, data_prop, base_model):
    """Return the path of the result directory to read inside ``tag_dir``.

    For the ``'base'`` tag the first listed entry is used.  For any other tag
    the entry name must contain both ``str(data_prop)`` and the basename of
    ``base_model``; if several entries match, the last one listed wins (the
    order returned by ``os.listdir`` is not guaranteed).

    Raises:
        FileNotFoundError: if no entry qualifies.  Failing here prevents the
            loop from silently reusing a value left over from the previous
            dataset and writing a wrong CSV.
    """
    entries = os.listdir(tag_dir)
    if model_tag == 'base':
        candidates = entries[:1]
    else:
        model_name = os.path.basename(base_model)
        candidates = [name for name in entries if str(data_prop) in name and model_name in name]
    if not candidates:
        raise FileNotFoundError(
            f"No result directory for data_prop={data_prop} and base model "
            f"{os.path.basename(base_model)!r} under {tag_dir}"
        )
    return os.path.join(tag_dir, candidates[-1])


def _collect_results(model_tag, data_prop, base_model, tasks):
    """Return ``{task: score}`` for every task of ``tasks`` found for ``model_tag``.

    Every file of the selected run directory is parsed as JSON and its
    ``'results'`` mapping is scanned for the requested tasks.  If a
    ``metrics.json`` file sits next to the run directory, its average F1 is
    added under the key ``'tydiqa'``.

    Raises:
        FileNotFoundError: if no run directory matches (see ``_find_run_dir``).
        KeyError: if a requested task has no entry in ``_TASK_METRICS``.
    """
    tag_dir = f"token_selection_results/{data_prop}/{model_tag}/"
    run_dir = _find_run_dir(tag_dir, model_tag, data_prop, base_model)

    scores = {}
    for file_name in os.listdir(run_dir):
        with open(os.path.join(run_dir, file_name), 'r') as f:
            task_results = json.load(f)['results']
        for task in tasks:
            if task in task_results:
                scores[task] = task_results[task][_TASK_METRICS[task]]

    ####### tydiqa file #########
    metrics_path = f"token_selection_results/{data_prop}/{model_tag}/metrics.json"
    if os.path.exists(metrics_path):
        with open(metrics_path, 'r') as f:
            tydiqa_metrics = json.load(f)
        # Divided by 100, presumably to match the 0-1 scale of the other
        # scores (everything is multiplied by 100 again for the report).
        scores['tydiqa'] = round(tydiqa_metrics['average']['f1'] / 100, 4)
    ###########################
    return scores


def _build_report(results_all, model_tags, columns):
    """Build the percentage table with one row per tag and an ``Average`` column.

    ``columns`` selects and orders the reported tasks.  A task without any
    collected score becomes a NaN column instead of raising ``KeyError``;
    NaN cells are skipped by the row average.
    """
    report = pd.DataFrame.from_dict(results_all, orient='index')
    report = report.reindex(columns=columns)
    report = report.map(lambda x: round(100*x, 2) if pd.notnull(x) else x)
    report['Average'] = report.mean(axis=1).round(1)
    report = report.reindex(model_tags)
    report.index = report.index.str.replace('_', '-', regex=False)
    return report


# Dictionary meant to hold the results of all dataset groups.
# NOTE(review): nothing is ever stored in it; `dataset_group_results` is
# rebuilt for every group and then dropped -- confirm the intended key.
all_datasets_results = {}

# Loop-invariant settings, defined once instead of on every iteration.
data_prop = 0.6
# Tasks whose scores are extracted from the result files.
TASK_LISTS = ['mmlu', 'bbh', 'gsm8k', 'truthfulqa_mc2', "arc_challenge", "piqa", "hellaswag", "openbookqa", "triviaqa", 'sciq', 'arc_easy', 'logiqa', 'boolq', 'winogrande']
# Tasks written to the CSV, in column order.
REPORT_TASKS = ["truthfulqa_mc2", "tydiqa", 'logiqa', 'mmlu',  "hellaswag", "arc_challenge", "boolq"]

# Make sure the per-model output folder exists before the first to_csv call.
os.makedirs(f'csv_results/{os.path.basename(base_model)}', exist_ok=True)

# Iterate over all dataset groups
for dataset_names in Train_DATASET_LIST:
    dataset_group_results = {}  # results of the current dataset group

    for dataset_name in dataset_names:
        # Each dataset is exported as its own single-row CSV.
        model_tags = [dataset_name]

        results_all = {
            model_tag: _collect_results(model_tag, data_prop, base_model, TASK_LISTS)
            for model_tag in model_tags
        }
        results_df = _build_report(results_all, model_tags, REPORT_TASKS)

        results_df.to_csv(f'csv_results/{os.path.basename(base_model)}/{dataset_name}_results.csv')
        dataset_group_results[dataset_name] = results_all
    


### Summarize the average results for each base model

In [9]:
import os
import json
import pandas as pd
import numpy as np

# Base model settings
# base_model = "meta-llama/Llama-3.2-3B"
# Train_DATASET_LIST = [
#     ["filtered-cured-10k-warmup-llama3b-41", "filtered-cured-10k-warmup-llama3b", "filtered-cured-10k-warmup-llama3b-43"],
#     ["filtered-cured-50k-full-baseline-41", "filtered-cured-50k-full-baseline", "filtered-cured-50k-full-baseline-43"],
#     ["filtered-cured-50k-random-baseline-41", "filtered-cured-50k-random-baseline", "filtered-cured-50k-random-baseline-43"],
#     ["filtered-cured-50k-rho-baseline-sample-llama3b-41", "filtered-cured-50k-rho-baseline-sample", "filtered-cured-50k-rho-baseline-sample-llama3b-43"],
#     ["filtered-cured-50k-rho-baseline-global-llama3b-41", "filtered-cured-50k-rho-baseline-global", "filtered-cured-50k-rho-baseline-global-llama3b-43"],
#     ["filtered-cured-50k-iter-split-global_data_prop_0.6_llama3b-non-filtered-fixed-base-model-41_4","filtered-cured-50k-iter-split-global_data_prop_0.6_llama3b-non-filtered-fixed-base-model_4", "filtered-cured-50k-iter-split-global_data_prop_0.6_llama3b-non-filtered-fixed-base-model-43_4"]
# ]


# base_model="mistralai/Mistral-7B-v0.3"
# Train_DATASET_LIST=[
#     ["filtered-cured-10k-warmup-mistral-41", "filtered-cured-10k-warmup-mistral", "filtered-cured-10k-warmup-mistral-43"],
#     ["filtered-cured-50k-full-baseline-41", "filtered-cured-50k-shuffle-full-baseline", "filtered-cured-50k-full-baseline-43"],
#     ["filtered-cured-50k-random-baseline-41", "filtered-cured-50k-shuffle-random-baseline" ,"filtered-cured-50k-random-baseline-43"],
#     ["filtered-cured-50k-rho-baseline-mistral-41", "filtered-cured-50k-rho-baseline", "filtered-cured-50k-rho-baseline-mistral-43" ],
#     ["filtered-cured-50k-rho-baseline-mistral-global-41", "filtered-cured-50k-rho-baseline-mistral-global" ,"filtered-cured-50k-rho-baseline-mistral-global-43"],
#     ["filtered-cured-50k-iter-split-global_data_prop_0.6_mistral-non-filtered-fixed-base-model-41_4", "filtered-cured-50k-iter-split-global_data_prop_0.6_mistral-non-filtered-fixed-base-model_4", "filtered-cured-50k-iter-split-global_data_prop_0.6_mistral-non-filtered-fixed-base-model-43_4"],
#     ]

# Active configuration: Mistral-7B runs, three runs per group.
base_model = "mistralai/Mistral-7B-v0.3"

# One inner list per method; the order must match the row labels assigned to
# the summary table further down in this cell.
Train_DATASET_LIST = [
    [
        "filtered-cured-10k-warmup-mistral-new-41",
        "filtered-cured-10k-warmup-mistral",
        "filtered-cured-10k-warmup-mistral-new-43",
    ],
    [
        "filtered-cured-50k-full-baseline-mistral-41",
        "filtered-cured-50k-shuffle-full-baseline",
        "filtered-cured-50k-full-baseline-mistral-43",
    ],
    [
        "filtered-cured-50k-random-baseline-41",
        "filtered-cured-50k-shuffle-random-baseline",
        "filtered-cured-50k-random-baseline-43",
    ],
    [
        "filtered-cured-50k-rho-baseline-mistral-new-41",
        "filtered-cured-50k-rho-baseline",
        "filtered-cured-50k-rho-baseline-mistral-new-43",
    ],
    [
        "filtered-cured-50k-fixed-model-cleaning-mistral-41",
        "filtered-cured-50k-rho-baseline-mistral-global",
        "filtered-cured-50k-fixed-model-cleaning-mistral-43",
    ],
    [
        "filtered-cured-50k-iter-split-global_data_prop_0.6_mistral-non-filtered-fixed-base-model-new-41_4",
        "filtered-cured-50k-iter-split-global_data_prop_0.6_mistral-non-filtered-fixed-base-model_4",
        "filtered-cured-50k-iter-split-global_data_prop_0.6_mistral-non-filtered-fixed-base-model-new-43_4",
    ],
]



# base_model="meta-llama/Llama-3.1-8B"
# Train_DATASET_LIST=[
#     ["filtered-cured-10k-warmup-llama8b-41", "filtered-cured-10k-warmup-llama8b", "filtered-cured-10k-warmup-llama8b-43"],
#     ["filtered-cured-50k-full-baseline-41", "filtered-cured-50k-shuffle-full-baseline", "filtered-cured-50k-full-baseline-43"],
#     ["filtered-cured-50k-random-baseline-41", "filtered-cured-50k-shuffle-random-baseline" ,"filtered-cured-50k-random-baseline-43"],
#     ["filtered-cured-50k-rho-baseline-llama8b-41", "filtered-cured-50k-rho-baseline", "filtered-cured-50k-rho-baseline-llama8b-43" ],
#     ["filtered-cured-50k-rho-baseline-llama8b-global-41", "filtered-cured-50k-rho-baseline-llama8b-global" ,"filtered-cured-50k-rho-baseline-llama8b-global-43"],
#     ["filtered-cured-50k-iter-split-global_data_prop_0.6_llama8b-non-filtered-fixed-base-model-41_4", "filtered-cured-50k-iter-split-global_data_prop_0.6_llama8b-non-filtered-fixed-base-model_4", "filtered-cured-50k-iter-split-global_data_prop_0.6_llama8b-non-filtered-fixed-base-model-43_4"],
#     ]


# Dictionary reserved for the results of all dataset groups (not filled in this cell)
all_datasets_results = {}


root_path = "csv_results"
overall_average_results = []

# Row labels of the summary table, one per group of Train_DATASET_LIST and in
# the same order.
row_labels = ['ds2', 'full_token', 'random', 'rho', 'fixed_model_cleaning','self_evolving_cleaning']
if len(row_labels) != len(Train_DATASET_LIST):
    # Fail before any file is read rather than at the final index assignment.
    raise ValueError(
        f"Expected {len(row_labels)} dataset groups to match the row labels, "
        f"got {len(Train_DATASET_LIST)}"
    )

for dataset_names in Train_DATASET_LIST:
    # Read the CSV file of every dataset in the group
    data_frames = [
        pd.read_csv(os.path.join(root_path, os.path.basename(base_model), f'{dataset_name}_results.csv'))
        for dataset_name in dataset_names
    ]

    concatenated_data = pd.concat(data_frames, axis=0)
    # Skip the first CSV column (presumably the saved row label -- TODO
    # confirm against the cell that writes these files); the rest are scores.
    data_for_analysis = concatenated_data.iloc[:, 1:]

    # Mean and standard deviation across the runs, rounded to two decimals
    mean_values = data_for_analysis.mean().round(2)
    std_dev_values = data_for_analysis.std().round(2)

    # Fixed two-decimal formatting keeps trailing zeros ("0.60", not "0.6").
    # The temporary row label is replaced by `row_labels` after the loop, so
    # the first name is used to stay valid for single-run groups as well.
    combined_df = pd.DataFrame(
        {col: f"{mean:.2f} ± {std_dev:.2f}" for col, mean, std_dev in zip(mean_values.index, mean_values, std_dev_values)},
        index=[f'{dataset_names[0]}'],
    )
    overall_average_results.append(combined_df)


combined_df_all = pd.concat(overall_average_results, axis=0)
combined_df_all.index = row_labels
print(combined_df_all.to_string(line_width=1000))

combined_df_all.to_csv(f'csv_results/{os.path.basename(base_model)}/final_results.csv')


# latex_table = combined_df_all.to_latex(index=True, caption="模型评估结果", label="tab:results", float_format="%.2f")
# print("\n" +"#" * 80 + "\nLaTeX Form:\n" + "#" * 80 )
# print(latex_table)

                       truthfulqa_mc2         tydiqa        logiqa          mmlu     hellaswag arc_challenge         boolq       Average
ds2                      43.49 ± 0.65    55.12 ± 0.6   27.24 ± 1.7  62.57 ± 0.05  61.36 ± 0.24  51.39 ± 0.81  85.06 ± 1.35   55.2 ± 0.44
full_token                43.63 ± 0.1   56.55 ± 0.85  26.15 ± 0.77   62.64 ± 0.2  61.12 ± 0.04  50.88 ± 0.35  83.66 ± 0.16  54.93 ± 0.29
random                   43.91 ± 0.11   54.09 ± 1.39   26.1 ± 1.12  62.65 ± 0.17   61.1 ± 0.09   50.5 ± 0.48   83.4 ± 0.32  54.53 ± 0.06
rho                      43.89 ± 0.08   56.16 ± 1.43   27.5 ± 1.89  62.46 ± 0.29  61.32 ± 0.04   51.54 ± 0.4  82.64 ± 0.57  55.07 ± 0.51
fixed_model_cleaning     44.02 ± 0.45   59.15 ± 0.45  26.15 ± 0.09  60.99 ± 1.19  61.49 ± 0.15  51.94 ± 0.26  83.42 ± 1.71    55.3 ± 0.1
self_evolving_cleaning   43.54 ± 1.65  50.86 ± 10.77  27.85 ± 1.29  59.91 ± 2.27  61.36 ± 0.08   52.0 ± 1.17    83.0 ± 1.5   54.07 ± 1.5


## Generate independent results for each random seed

In [9]:
import os
import json
import pandas as pd
import numpy as np



# base_model = "meta-llama/Llama-3.2-3B"
# Train_DATASET_LIST = [
#     ["filtered-cured-10k-warmup-llama3b-41", "filtered-cured-10k-warmup-llama3b", "filtered-cured-10k-warmup-llama3b-43"],
#     ["filtered-cured-50k-full-baseline-41", "filtered-cured-50k-full-baseline", "filtered-cured-50k-full-baseline-43"],
#     ["filtered-cured-50k-random-baseline-41", "filtered-cured-50k-random-baseline", "filtered-cured-50k-random-baseline-43"],
#     ["filtered-cured-50k-rho-baseline-sample-llama3b-41", "filtered-cured-50k-rho-baseline-sample", "filtered-cured-50k-rho-baseline-sample-llama3b-43"],
#     ["filtered-cured-50k-rho-baseline-global-llama3b-41", "filtered-cured-50k-rho-baseline-global", "filtered-cured-50k-rho-baseline-global-llama3b-43"],
#     ["filtered-cured-50k-iter-split-global_data_prop_0.6_llama3b-non-filtered-fixed-base-model-41_4","filtered-cured-50k-iter-split-global_data_prop_0.6_llama3b-non-filtered-fixed-base-model_4", "filtered-cured-50k-iter-split-global_data_prop_0.6_llama3b-non-filtered-fixed-base-model-43_4"]
# ]

# Active configuration: Llama-3.1-8B runs, three runs per group.
base_model = "meta-llama/Llama-3.1-8B"

# One inner list per method; position i inside each list is read for the
# i-th per-seed table built below.
Train_DATASET_LIST = [
    [
        "filtered-cured-10k-warmup-llama8b-41",
        "filtered-cured-10k-warmup-llama8b",
        "filtered-cured-10k-warmup-llama8b-43",
    ],
    [
        "filtered-cured-50k-full-baseline-41",
        "filtered-cured-50k-shuffle-full-baseline",
        "filtered-cured-50k-full-baseline-43",
    ],
    [
        "filtered-cured-50k-random-baseline-41",
        "filtered-cured-50k-shuffle-random-baseline",
        "filtered-cured-50k-random-baseline-43",
    ],
    [
        "filtered-cured-50k-rho-baseline-llama8b-41",
        "filtered-cured-50k-rho-baseline",
        "filtered-cured-50k-rho-baseline-llama8b-43",
    ],
    [
        "filtered-cured-50k-rho-baseline-llama8b-global-41",
        "filtered-cured-50k-rho-baseline-llama8b-global",
        "filtered-cured-50k-rho-baseline-llama8b-global-43",
    ],
    [
        "filtered-cured-50k-iter-split-global_data_prop_0.6_llama8b-non-filtered-fixed-base-model-41_4",
        "filtered-cured-50k-iter-split-global_data_prop_0.6_llama8b-non-filtered-fixed-base-model_4",
        "filtered-cured-50k-iter-split-global_data_prop_0.6_llama8b-non-filtered-fixed-base-model-43_4",
    ],
]


# Containers kept from the summary cell; they are not filled here.
all_datasets_results, overall_average_results = {}, []
combined_df_all = None

root_path = "csv_results"

# Row labels of every per-seed table, one per group of Train_DATASET_LIST.
new_row_names = ['ds2', 'full_token', 'random', 'rho', 'fixed_model_cleaning','self_evolving_cleaning']

# One table per position inside the groups (three runs per group).
for random_seed_idx in range(3):
    data_frames = []

    for dataset_names in Train_DATASET_LIST:
        csv_path = os.path.join(
            root_path,
            os.path.basename(base_model),
            f'{dataset_names[random_seed_idx]}_results.csv',
        )
        frame = pd.read_csv(csv_path)
        # Use the first CSV column as the row index.
        data_frames.append(frame.set_index(frame.columns[0]))

    concatenated_data = pd.concat(data_frames, axis=0)
    concatenated_data.index = new_row_names

    print(concatenated_data.to_string(line_width=1000))

    # Position 0/1/2 is written to the file suffixed 41/42/43.
    concatenated_data.to_csv(f'csv_results/{os.path.basename(base_model)}/independent_results_4{1+random_seed_idx}.csv')


                        truthfulqa_mc2  tydiqa  logiqa   mmlu  hellaswag  arc_challenge  boolq  Average
ds2                              47.74   54.33   26.98  65.79      60.57          53.32  83.38     56.0
full_token                       49.84   51.76   28.06  65.75      60.28          54.01  83.17     56.1
random                           50.31   53.26   27.44  65.90      60.31          54.44  83.38     56.4
rho                              56.42   60.97   27.13  65.76      61.95          55.21  82.52     58.6
fixed_model_cleaning             55.94   62.13   28.22  65.61      62.00          55.12  82.71     58.8
self_evolving_cleaning           59.78   63.63   26.51  65.18      62.62          54.52  82.49     59.2
                        truthfulqa_mc2  tydiqa  logiqa   mmlu  hellaswag  arc_challenge  boolq  Average
ds2                              49.57   50.66   27.44  65.80      60.37          53.57  83.38     55.8
full_token                       47.51   55.62   28.53  65.78   