In [1]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import os
import glob
import re

In [2]:
def create_lucie_steps_to_tokens():
    lucie_steps_to_tokens = {
        753851: 3121737891840,
        5000: 5700059136,
        10000: 13884194816,
        15000: 26008354816,
        20000: 43747901440,
    }

    for i in range(1000):
        lucie_steps_to_tokens[20000 + i*5000] = 43747901440 + i*20971520000
    return lucie_steps_to_tokens

lucie_steps_to_tokens = create_lucie_steps_to_tokens()


In [3]:
def read_json(path):
    with open(path, 'r') as file:
        data = json.load(file)
    return data

def process_results(data):
    out = []
    for benchmark, results in data.items():
        metric = None
        if ("arc" == benchmark[:3]) or ("hellaswag" in benchmark):
            metric = "acc_norm,none"
        elif (
            ("mmlu" == benchmark)
            or ("mmlu_continuation" == benchmark)
            or ("winogrande" == benchmark)
            or ("m_mmlu" in benchmark)
            or ("french_bench_grammar" == benchmark)
            or ("french_bench_vocab" == benchmark)
            or ("truthfulqa_mc2" == benchmark)
        ):
            metric = "acc,none"
        elif "gsm8k" == benchmark:
            metric = "exact_match,strict-match"
        elif (
            ("french_bench_fquadv2_genq" == benchmark)
            or ("french_bench_fquadv2_hasAns" == benchmark)
            or ("french_bench_multifquad" == benchmark)
            or ("french_bench_orangesum_abstract" == benchmark)
            or ("french_bench_trivia" == benchmark)
        ):
            metric = "rouge1,none"
        # 
        if metric is not None:
            out.append({"benchmark": benchmark, "metric": metric, "score": results[metric]})

    return {"results": out}

def process_name(data):
    model_name = data['model_name']
    out = {}
    if 'Lucie' in model_name: # Lucie model
        out['name'] = 'lucie-7b'
        out['checkpoint'] = False
        if 'pretrained' in model_name:
            out['model_type'] = 'pretraining'
            out['global_step'] = int(re.search(r'global_step(\d+)', model_name).group(1))
            out['num_tokens'] = lucie_steps_to_tokens[out['global_step']]
            if out['global_step'] != 753851:
                out['checkpoint'] = True
        elif 'extension' in model_name:
            out['name'] += '_extension' 
            out['model_type'] = 'extension'
            out['num_tokens'] = lucie_steps_to_tokens[753851] 
        elif 'annealing' in model_name:
            out['name'] += '_annealing' 
            out['model_type'] = 'annealing'
            out['num_tokens'] = lucie_steps_to_tokens[753851] 
        elif 'instruction' in model_name:
            out['name'] += '_instruction' 
            out['model_type'] = 'instruction'
            out['num_tokens'] = lucie_steps_to_tokens[753851] 
        elif 'stage2' in model_name:
            out['name'] += '_stage2' 
            out['model_type'] = 'stage2'
            out['num_tokens'] = lucie_steps_to_tokens[753851] 
    else:
        name = model_name.split('/')[-1]
        out['name'] = name
        out['checkpoint'] = False
        if 'Meta-Llama-3.1-8B' in name:
            out['num_tokens'] = 15*10**12    
        elif name == 'Mistral-7B-v0.1':
            pass
        elif name == 'bloom-7b1':
            out['num_tokens'] = 0.35*10**12
        elif name == 'CroissantLLM':
            out['num_tokens'] = 3*10**12 
        elif name == 'falcon-7b':
            out['num_tokens'] = 3*10**12 
        elif name == 'pythia-6.9b':
            out['num_tokens'] = 299892736000
        if ('Instruct' in name) or ('Chat' in name):
            out['model_type'] = 'instruction'
        else:
            out['model_type'] = 'pretraining'
    return out

file_paths = glob.glob('out/**/*.json', recursive=True)

out = []

for file_path in file_paths:
    data = read_json(file_path)
    results = process_results(data.pop("results"))
    eval_metadata = {
        'model_name': data['model_name'],
        'chat_template': data['chat_template'] is not None,
        'fewshot_as_multiturn': data['fewshot_as_multiturn'],
        'add_bos': 'add_bos_token=True' in data['config']['model_args'],
        }
    out.append({**eval_metadata, **results})

df = pd.json_normalize(out)

df = df.groupby(['model_name', 'add_bos', 'chat_template', 'fewshot_as_multiturn'])['results'].agg(sum).reset_index()

df = pd.concat([df, df.apply(lambda x: process_name(x), axis=1, result_type = "expand")], axis=1)

  df = df.groupby(['model_name', 'add_bos', 'chat_template', 'fewshot_as_multiturn'])['results'].agg(sum).reset_index()


In [4]:
def flatten_df(df):
    # Flatten the results column
    def flatten_results(row):
        flattened = {}
        model_name = row['model_name']
        for res in row["results"]:
            benchmark = res["benchmark"]
            if benchmark in flattened:
                print(f"Model: {model_name}\n    -- Duplicate benchmark found: {benchmark}")
            flattened[benchmark] = res["score"]
        return flattened

    # Apply flatten_results to the 'results' column and create new columns for each benchmark
    results_flattened = df.apply(flatten_results, axis=1)

    # Convert the resulting series of dictionaries into a DataFrame
    results_df = pd.DataFrame(results_flattened.tolist())
    results_df = results_df.round(2)

    # Concatenate the original DataFrame with the flattened results
    final_df = pd.concat([df.drop(columns=["results"]), results_df], axis=1)
    return final_df

df = flatten_df(df).fillna(-1)

df = df.sort_values(['model_name', 'model_type', 'global_step'])

Model: /gpfsdswork/dataset/HuggingFace_Models/mistralai/Mistral-7B-Instruct-v0.2
    -- Duplicate benchmark found: french_bench_grammar
Model: /gpfsdswork/dataset/HuggingFace_Models/mistralai/Mistral-7B-Instruct-v0.2
    -- Duplicate benchmark found: french_bench_hellaswag
Model: /gpfsdswork/dataset/HuggingFace_Models/mistralai/Mistral-7B-Instruct-v0.2
    -- Duplicate benchmark found: french_bench_vocab
Model: /lustre/fsn1/projects/rech/qgz/commun/trained_models/Lucie/pretrained/transformers_checkpoints/global_step753851
    -- Duplicate benchmark found: french_bench_fquadv2_genq
Model: /lustre/fsn1/projects/rech/qgz/commun/trained_models/Lucie/pretrained/transformers_checkpoints/global_step753851
    -- Duplicate benchmark found: french_bench_fquadv2_hasAns
Model: /lustre/fsn1/projects/rech/qgz/commun/trained_models/Lucie/pretrained/transformers_checkpoints/global_step753851
    -- Duplicate benchmark found: french_bench_multifquad
Model: /lustre/fsn1/projects/rech/qgz/commun/trained

In [5]:
df.to_csv('all_results.csv')

In [6]:
df

Unnamed: 0,model_name,add_bos,chat_template,fewshot_as_multiturn,name,checkpoint,num_tokens,model_type,global_step,arc_challenge,...,french_bench_orangesum_abstract,french_bench_trivia,arc_de,arc_es,arc_fr,arc_it,m_mmlu_de,m_mmlu_es,m_mmlu_fr,m_mmlu_it
0,/gpfsdswork/dataset/HuggingFace_Models/meta-ll...,False,False,False,Meta-Llama-3.1-8B-Instruct,False,1.500000e+13,instruction,-1.0,0.59,...,0.34,0.73,0.48,0.53,0.52,0.54,0.58,0.61,0.60,0.59
1,/gpfsdswork/dataset/HuggingFace_Models/mistral...,False,False,False,Mistral-7B-Instruct-v0.2,False,-1.000000e+00,instruction,-1.0,0.64,...,0.33,0.68,-1.00,-1.00,-1.00,-1.00,-1.00,-1.00,-1.00,-1.00
2,/lustre/fsn1/projects/rech/qgz/commun/instruct...,False,False,False,lucie-7b_instruction,False,3.121738e+12,instruction,-1.0,0.52,...,-1.00,-1.00,-1.00,-1.00,-1.00,-1.00,-1.00,-1.00,-1.00,-1.00
3,/lustre/fsn1/projects/rech/qgz/commun/instruct...,False,False,False,lucie-7b_instruction,False,3.121738e+12,instruction,-1.0,0.52,...,-1.00,-1.00,-1.00,-1.00,-1.00,-1.00,-1.00,-1.00,-1.00,-1.00
4,/lustre/fsn1/projects/rech/qgz/commun/instruct...,False,True,False,lucie-7b_instruction,False,3.121738e+12,instruction,-1.0,-1.00,...,-1.00,-1.00,-1.00,-1.00,-1.00,-1.00,-1.00,-1.00,-1.00,-1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,/lustre/fsn1/projects/rech/qgz/uzq54wg/Hugging...,False,False,False,falcon-7b,False,3.000000e+12,pretraining,-1.0,0.47,...,-1.00,-1.00,0.31,0.38,0.41,0.39,0.25,0.25,0.25,0.25
57,EleutherAI/pythia-6.9b,False,False,False,pythia-6.9b,False,2.998927e+11,pretraining,-1.0,0.39,...,-1.00,-1.00,0.27,0.29,0.30,0.27,0.26,0.26,0.25,0.26
58,bigscience/bloom-7b1,False,False,False,bloom-7b1,False,3.500000e+11,pretraining,-1.0,-1.00,...,0.23,0.55,-1.00,-1.00,-1.00,-1.00,-1.00,-1.00,-1.00,-1.00
59,croissantllm/CroissantLLMBase,False,False,False,CroissantLLMBase,False,-1.000000e+00,pretraining,-1.0,0.32,...,-1.00,-1.00,0.26,0.26,0.31,0.26,0.25,0.24,0.26,0.24
