In [2]:
import pandas as pd
import json
import os
import glob
import re

In [3]:
# Root folders scanned recursively for evaluation result files (*.json).
# Commented-out entries are alternative locations; a leading "~" is expanded below.
input_file_paths = [
    '/data-server/datasets/evaluation/evaluation_output/benchmarks',
    # '/home/lriviere/EVAL_INSTRUCT/benchmarks',
    # '/lustre/fsn1/projects/rech/qgz/commun/evaluation_output/benchmarks',
    # '~/Lucie-Training/evaluation/out',
]

# Collect every JSON file found under each root folder, at any depth.
file_paths = []
for input_file_path in input_file_paths:
    root_folder = os.path.expanduser(input_file_path)
    json_pattern = os.path.join(root_folder, '**/*.json')
    file_paths.extend(glob.glob(json_pattern, recursive=True))

# Destination of the two CSV tables (Lucie learning curve / baseline models).
folder_output_table = "../"
output_file_eval_lucie = os.path.join(folder_output_table, "evaluation_learning_curve_lucie.csv")
output_file_eval_baselines = os.path.join(folder_output_table, "evaluation_baselines.csv")

In [4]:
# Metric keys looked up in each benchmark's result dict, in priority order:
# the first key that is present is reported as the benchmark's main score.
PERF_NAMES = [
    "acc_norm,none",
    "acc,none",
    "rouge1_acc,none",
    "rouge1,none",
    "exact_match,flexible-extract",
    "exact_match,none",
    "prompt_level_loose_acc,none",
]

In [5]:
def lucie_training_step_total(training_steps, training_phase=None):
    """Convert a step count inside a training phase into a cumulative step count.

    Args:
        training_steps: Number of steps done within ``training_phase``.
        training_phase: Falsy for the main pretraining (steps are returned
            unchanged), ``"extension"``, ``"annealing"``, or any string
            starting with ``"instruct"``.

    Returns:
        ``training_steps`` plus the number of steps hard-coded for all the
        phases that precede ``training_phase``.

    Raises:
        ValueError: If ``training_phase`` is not one of the known phases.
    """
    if not training_phase:
        return training_steps

    if training_phase == "extension":
        # Steps accounted for before the extension phase starts
        steps_before_phase = 753851
    elif training_phase == "annealing":
        # The extension phase is counted as 1220 steps
        steps_before_phase = lucie_training_step_total(1220, "extension")
    elif training_phase.startswith("instruct"):
        # TODO after annealing is completed
        steps_before_phase = lucie_training_step_total(10, "annealing")
    else:
        raise ValueError(f"Unknown training phase {training_phase}")

    return steps_before_phase + training_steps

def lucie_training_step_to_tokens(training_steps, training_phase=None, cumulate=True):
    """Convert a number of training steps into a number of training tokens.

    Args:
        training_steps: Number of steps done within ``training_phase`` (for the
            main pretraining, the number of steps since the start of training).
        training_phase: Falsy for the main pretraining, otherwise a phase name
            whose prefix before the first ``"-"`` is ``"extension"``,
            ``"annealing"`` or ``"instruct"``.
        cumulate: When true (default), the tokens of all the previous phases
            are added to the tokens of the current phase. Ignored for the
            main pretraining.

    Returns:
        The number of tokens, as an integer.

    Raises:
        AssertionError: For the main pretraining, if ``training_steps`` is
            below 20000 and is not one of the tabulated ramp-up checkpoints.
        ValueError: If ``training_phase`` is not a known phase.
    """
    if not training_phase:  # 1. main pretraining
        # Batch size ramp-up: token counts are tabulated for these checkpoints
        ramp_up_tokens = {
            0: 0,
            5000: 5700059136,
            10000: 13884194816,
            15000: 26008354816,
            22818: 55567450112,
        }
        training_tokens = ramp_up_tokens.get(training_steps)

        if training_tokens is None:
            assert training_steps >= 20000, f"Cannot infer number of tokens for {training_steps=}"
            # From step 20000 on: a constant 20971520000 tokens every 5000 steps
            training_tokens = round(43747901440 + ((training_steps - 20000) / 5000) * 20971520000)

        return training_tokens

    training_phase_meta = {
        "extension": {
            "previous_phase": None,
            "tokens_by_batch": 4096000,
        },
        "annealing": {
            "previous_phase": "extension",
            "tokens_by_batch": 4096 * 128,
        },
        "instruct": {
            "previous_phase": "annealing",
            "tokens_by_batch": 4096 * 128,
        },
    }.get(training_phase.split("-")[0])
    if not training_phase_meta:
        raise ValueError(f"Unknown training phase {training_phase}")

    tokens_in_phase = training_steps * training_phase_meta["tokens_by_batch"]
    if not cumulate:
        return tokens_in_phase

    previous_training_phase = training_phase_meta["previous_phase"]

    # lucie_training_step_total() returns a step count cumulated over ALL the
    # previous phases, whereas this function expects steps counted from the
    # start of the given phase. Subtract the steps done before the previous
    # phase, otherwise the earlier phases would be counted twice.
    num_steps_before = lucie_training_step_total(0, training_phase)
    num_steps_in_previous_phase = num_steps_before - lucie_training_step_total(0, previous_training_phase)
    num_tokens_before = lucie_training_step_to_tokens(num_steps_in_previous_phase, previous_training_phase)

    return tokens_in_phase + num_tokens_before

In [6]:
def read_json(path):
    """Load and return the JSON document stored in the file at ``path``.

    The file is explicitly decoded as UTF-8, so that the result does not
    depend on the locale of the machine running the notebook.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The decoded JSON content (as returned by ``json.load``).
    """
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)

def process_results(json_data):
    """Extract one summary record per scored benchmark from an evaluation file.

    For each entry of ``json_data["results"]``, the main score is the first
    metric of ``PERF_NAMES`` present in the entry. Entries with none of these
    metrics are skipped. ``json_data`` and its nested dicts are left untouched.

    Args:
        json_data: Decoded evaluation file. Must contain a ``"results"``
            mapping (benchmark name -> metrics dict). May contain a
            ``"configs"`` mapping (benchmark name -> config dict) from which
            ``num_fewshot`` is read.

    Returns:
        A list of dicts with the keys ``dataset``, ``score_name``, ``score``,
        ``stderr`` (``None`` when the stderr metric is absent), ``num_fewshot``
        (``None`` when unknown), followed by all the metrics of the benchmark
        except ``alias``.
    """
    results = json_data["results"]
    configs = json_data.get("configs") or {}

    out = []
    for benchmark, result_dict in results.items():
        score_name = next((name for name in PERF_NAMES if name in result_dict), None)
        if score_name is None:
            continue

        # "metric,filter" -> "metric_stderr,filter"
        metric, _, filter_name = score_name.partition(",")
        stderr_name = f"{metric}_stderr,{filter_name}"

        # Work on a copy so that the caller's data is not modified
        metrics = {key: value for key, value in result_dict.items() if key != "alias"}

        out.append(
            {
                "dataset": benchmark,
                "score_name": score_name,
                "score": metrics[score_name],
                "stderr": metrics.get(stderr_name),
                "num_fewshot": configs.get(benchmark, {}).get("num_fewshot"),
                **metrics,
            }
        )
    return out

In [7]:
# Number of training tokens assigned to each known baseline model, keyed by a
# substring of the model name. Entries are tried in order; the first match wins.
_BASELINE_TRAINING_TOKENS = (
    ('Meta-Llama-3.1-8B', 15*10**12),
    ('Mistral-7B', 7*10**12),
    ('bloom-7b1', 0.35*10**12),
    ('CroissantLLM', 3*10**12),
    ('falcon-7b', 3*10**12),
    ('pythia-6.9b', 299892736000),
)


def _extract_mix(model_name):
    """Return the integer following ``mix_`` in ``model_name`` (1 if absent)."""
    match = re.search(r'mix_(\d+)', model_name)
    return int(match.group(1)) if match else 1


def _lucie_phase(model_name):
    """Return ``(phase_type, phase_str, expe_name)`` for a Lucie model name.

    The order of the checks matters: the first matching keyword wins.
    """
    if 'extension' in model_name:
        expe_name = 'rope20M' if 'extension_rope20M' in model_name else 'rope500k'
        return 'extension', '2_extension', expe_name

    if ('annealing' in model_name) or ('stage2' in model_name):
        size = '5B_tokens' if 'stage2' in model_name else '40M_tokens'
        return 'annealing', '3_annealing', f"{size}-mix{_extract_mix(model_name)}"

    if 'instruction_lora' in model_name:
        match = (
            re.search(r'instruction_lora/Lucie/human/(.*)_global_step', model_name)
            or re.search(r'instruction_lora/Lucie/(.*)/merged', model_name)
        )
        expe_name = match.group(1) if match else '????'
        return 'instruct', '4_instruct_lora', expe_name

    if 'instruction_assistant_only' in model_name:
        return 'instruct', '4_instruct_full', f"mix{_extract_mix(model_name)}"

    if 'instruction' in model_name:
        return 'instruct', '4_instruct_full_deprecated', f"mix{_extract_mix(model_name)}"

    if 'pretrained' in model_name:
        return None, '1_main', 'pretraining'

    print(model_name)
    raise NotImplementedError


def process_name(json_data):
    """Describe the evaluated model from the ``model_name`` of an evaluation file.

    Args:
        json_data: Decoded evaluation file; only ``json_data['model_name']``
            is read.

    Returns:
        For a Lucie model (``'Lucie'`` in the model name), a dict with the keys
        ``training_phase``, ``model_name``, ``expe_name``,
        ``intermediate_checkpoint``, ``training_steps``, ``training_tokens``,
        ``training_steps_in_phase`` and ``training_tokens_in_phase``.
        For any other model, a dict with the keys ``training_phase``,
        ``model_name`` (last component of the path) and ``training_tokens``
        (``None`` when the model is not a known baseline).

    Raises:
        NotImplementedError: If a Lucie model name matches no known phase.
    """
    model_name = json_data['model_name']

    if 'Lucie' in model_name:  # Lucie model
        # Step within the phase, read from "global_step<N>" (0 when absent)
        match_step = re.search(r'global_step(\d+)', model_name)
        training_steps_in_phase = int(match_step.group(1)) if match_step else 0

        phase_type, phase_str, expe_name = _lucie_phase(model_name)

        return {
            "training_phase": phase_str,
            "model_name": 'Lucie-7B',
            "expe_name": expe_name,
            "intermediate_checkpoint": False,
            "training_steps": lucie_training_step_total(training_steps_in_phase, phase_type),
            "training_tokens": lucie_training_step_to_tokens(training_steps_in_phase, phase_type),
            "training_steps_in_phase": training_steps_in_phase,
            "training_tokens_in_phase": lucie_training_step_to_tokens(
                training_steps_in_phase, phase_type, cumulate=False
            ),
        }

    # Baseline model
    name = model_name.split('/')[-1]
    training_tokens = next(
        (tokens for marker, tokens in _BASELINE_TRAINING_TOKENS if marker in name),
        None,
    )
    if training_tokens is None:
        # Unknown baseline: report it and leave the token count empty, instead
        # of failing on an unassigned variable.
        print(model_name)

    is_instruct = ('Instruct' in name) or ('Chat' in name)
    return {
        "training_phase": 'instruction' if is_instruct else 'main',
        "model_name": name,
        "training_tokens": training_tokens,
    }

In [9]:
# The benchmark suite is read from the name of a folder in the file path.
benchmark_pattern = re.compile(
    r'/(fr_leaderboard|french_bench|french_bench_gen|leaderboard|okapi|multilingual|openllm)/'
)

# One record per evaluation file; its 'results' entry is the list of
# per-dataset score dicts.
out = []
for file_path in file_paths:
    json_data = read_json(file_path)

    match = benchmark_pattern.search(file_path)
    benchmark = '???' if match is None else match.group(1)

    record = {'file_path': file_path, 'benchmark': benchmark}
    record.update(process_name(json_data))
    record['chat_template'] = json_data['chat_template'] is not None
    record['fewshot_as_multiturn'] = json_data['fewshot_as_multiturn']
    record['add_bos_token'] = 'add_bos_token=True' in json_data['config']['model_args']
    record['results'] = process_results(json_data)
    out.append(record)

# One row per (file, dataset): the list in 'results' is exploded into rows, then
# each score dict is expanded into columns placed next to the file-level columns.
df = pd.DataFrame(out).explode('results')
file_level_columns = df.drop(columns=['results']).reset_index(drop=True)
score_columns = pd.json_normalize(df['results']).reset_index(drop=True)
df = pd.concat([file_level_columns, score_columns], axis=1)

In [10]:
# Columns that must not take the same combination of values in two different rows.
duplicate_key_columns = [
    'benchmark', 'training_phase', 'model_name', "expe_name",
    'intermediate_checkpoint', 'training_steps', 'training_tokens',
    'training_steps_in_phase', 'training_tokens_in_phase', 'chat_template',
    'fewshot_as_multiturn', 'add_bos_token', 'dataset', 'score_name',
    'num_fewshot',
    # 'doc_to_target'
]

# keep=False flags every row of a duplicated group, not only the repeats
is_duplicated = df.duplicated(subset=duplicate_key_columns, keep=False)
duplicated_rows = df[is_duplicated]

files_to_check = list(duplicated_rows['file_path'].unique())

if files_to_check:
    print('Please clean your duplicated files...\n')
    print(*files_to_check, sep='\n')
else:
    print('No duplicate')

No duplicate


In [11]:
# Do not export anything while duplicated evaluation files remain.
assert len(files_to_check) == 0

# Fixed schema (and column order) of the exported tables.
output_columns = [
    'model_name', 'expe_name', 'training_phase', 'intermediate_checkpoint',
    'add_bos_token', 'fewshot_as_multiturn', 'chat_template',
    'training_steps', 'training_tokens', 'training_steps_in_phase', 'training_tokens_in_phase',
    'benchmark', 'dataset', 'score', 'stderr', 'score_name',
    'num_fewshot',
    # 'doc_to_target',
    "acc,none", "acc_stderr,none", "acc_norm,none", "acc_norm_stderr,none",
    "exact_match,strict-match", "exact_match_stderr,strict-match", "exact_match,flexible-extract", "exact_match_stderr,flexible-extract",
    "bleu_max,none", "bleu_max_stderr,none", "bleu_acc,none", "bleu_acc_stderr,none", "bleu_diff,none", "bleu_diff_stderr,none",
    "rouge1_max,none", "rouge1_max_stderr,none", "rouge1_acc,none", "rouge1_acc_stderr,none", "rouge1_diff,none", "rouge1_diff_stderr,none",
    "rouge2_max,none", "rouge2_max_stderr,none", "rouge2_acc,none", "rouge2_acc_stderr,none", "rouge2_diff,none", "rouge2_diff_stderr,none",
    "rougeL_max,none", "rougeL_max_stderr,none", "rougeL_acc,none", "rougeL_acc_stderr,none", "rougeL_diff,none", "rougeL_diff_stderr,none",
    "rouge1,none", "rouge1_stderr,none", "f1,none", "f1_stderr,none",
    "exact,none", "exact_stderr,none",
    "is_included,none", "is_included_stderr,none",
    "inst_level_loose_acc,none", "inst_level_loose_acc_stderr,none", "inst_level_strict_acc,none", "inst_level_strict_acc_stderr,none",
    "prompt_level_loose_acc,none", "prompt_level_loose_acc_stderr,none",
    "exact_match,none", "exact_match_stderr,none",
    "prompt_level_strict_acc,none", "prompt_level_strict_acc_stderr,none",
]

# reindex() rather than df[[...]]: a column that is absent from the collected
# results (e.g. a metric that no evaluated benchmark produced) becomes an
# empty column instead of raising a KeyError.
df = df.reindex(columns=output_columns)

# Split the rows between Lucie models and baseline models
is_lucie = df['model_name'].str.contains('Lucie', regex=False, na=False)
lucie_df = df[is_lucie]
baseline_df = df[~is_lucie]

lucie_df \
    .sort_values(by=["model_name", "training_tokens", "training_phase", "benchmark"], ascending=True) \
    .to_csv(output_file_eval_lucie, index=False)

baseline_df \
    .sort_values(by=["model_name", "training_tokens", "benchmark"], ascending=True) \
    .to_csv(output_file_eval_baselines, index=False)