In [1]:
import os
import sys
import typing
import json
import pandas as pd


In [2]:
def remove_header(segment):
    if segment.startswith('model_args:'):
        return segment[len('model_args:'):]
    elif segment.startswith('task_args:'):
        return segment[len('task_args:'):]
    else:
        return segment

def parse_segment(segment) -> typing.List[str]:
    segment = remove_header(segment)
    kwargs = [kwarg for kwarg in segment.split(',') if kwarg]
    args = {}
    for kwarg_str in kwargs:
        k,v = kwarg_str.split('=')
        args[k] = v if v != 'None' else None
    return args

def parse_fname(fname: str) -> typing.Dict:    
    args = {}
    for segment in fname.split('|'):
        args.update(parse_segment(segment))
    return args

def parse_file(fpath: str) -> typing.Dict:
    mtime = os.stat(fpath).st_mtime
    with open(fpath, 'rt') as f:
        o = json.load(f)
    task_version = o['versions']
    d = {'mtime': mtime}
    for k, v in o['config'].items():
        if isinstance(v, str) and '=' in v:
            d.update(parse_segment(v))
        elif not v and k in ['model_args', 'task_args']:
            continue
        else:
            d[k] = v
    for task_name, results in o['results'].items():
        task_name_out = task_name[:-len('_d')] if task_name.endswith('_d') else task_name
        for k, v in results.items():
            d[f'{task_name_out}_v{task_version[task_name]}:{k}'] = v
    return d

def parse_dir(dirpath: str) -> pd.DataFrame:
    # pd.DataFrame([parse_fname(fname) for fname in os.listdir('lmeval_results')])
    fnames, mtimes = zip(*[(fentry.name, fentry.stat().st_mtime) for fentry in os.scandir(dirpath) if fentry.is_file() and fentry.name.endswith('.json')])
    return pd.DataFrame([parse_file(f'{dirpath}/{fname}') for fname in fnames])


In [3]:
for fname in os.listdir('../lmeval_results'):
    if not fname.endswith('.json'):
        fname_new = fname + '.json'
        os.rename(src=f'../lmeval_results/{fname}', dst=f'../lmeval_results/{fname_new}')

## Hellaswag

In [18]:
df = parse_dir('../lmeval_results')
df = df[[col for col in df.columns if col not in ['batch_size', 'device', 'no_cache', 'bootstrap_iters', 'description_dict']]]
df = df.sort_values(by=['model', 'num_fewshot', 'mtime'])[df.limit.isna()].drop(columns='limit')
# if 'NORM' in df:
#     df = df.assign(NORM=df.NORM.fillna('None'))
df

  df = df.sort_values(by=['model', 'num_fewshot', 'mtime'])[df.limit.isna()].drop(columns='limit')


Unnamed: 0,mtime,model,WORD_AGG_SCHEME,SIMILARITY_FUNC,NORM,encoding_scheme,num_fewshot,hellaswag_v0:acc,hellaswag_v0:acc_stderr,hellaswag_v0:acc_norm,hellaswag_v0:acc_norm_stderr,hellaswag_v0:rand_acc,hellaswag_v0:rand_acc_stderr,pretrained
30,1661432000.0,dist_lm,,,,cross_encoding,0,0.289185,0.004525,0.311392,0.004621,0.25,0.0,
13,1661502000.0,dist_lm,last,cosine_sim,L2,concat_all_examples,0,0.255029,0.00435,0.252738,0.004337,0.25,0.0,
6,1661503000.0,dist_lm,mean,cosine_sim,L2,concat_all_examples,0,0.249452,0.004318,0.253933,0.004344,0.25,0.0,
8,1661506000.0,dist_lm,mean,cosine_sim,layer,concat_all_examples,0,0.249552,0.004319,0.253933,0.004344,0.25,0.0,
16,1661507000.0,dist_lm,last,cosine_sim,layer,concat_all_examples,0,0.255029,0.00435,0.252738,0.004337,0.25,0.0,
4,1661508000.0,dist_lm,last,cosine_sim,,concat_all_examples,0,0.255029,0.00435,0.252738,0.004337,0.25,0.0,
17,1661513000.0,dist_lm,last,cosine_sim,layer,concat_each_example,0,0.255029,0.00435,0.252738,0.004337,0.25,0.0,
26,1661513000.0,dist_lm,last,cosine_sim,L2,concat_all_examples,0,0.280522,0.004483,0.263394,0.004396,0.25,0.0,EleutherAI/gpt-neo-1.3B
32,1661518000.0,dist_lm,last,cosine_sim,layer,concat_all_examples,0,0.280522,0.004483,0.263394,0.004396,0.25,0.0,EleutherAI/gpt-neo-1.3B
29,1661519000.0,dist_lm,last,cosine_sim,layer,cross_encoding,0,0.289185,0.004525,0.311392,0.004621,0.25,0.0,


In [19]:
df[~df.pretrained.isna()][['mtime', 'pretrained', 'model', 'WORD_AGG_SCHEME', 'SIMILARITY_FUNC', 'NORM', 'encoding_scheme', 'num_fewshot', 'hellaswag_v0:acc', 'hellaswag_v0:acc_stderr', 'hellaswag_v0:acc_norm', 'hellaswag_v0:acc_norm_stderr', 'hellaswag_v0:rand_acc']]

Unnamed: 0,mtime,pretrained,model,WORD_AGG_SCHEME,SIMILARITY_FUNC,NORM,encoding_scheme,num_fewshot,hellaswag_v0:acc,hellaswag_v0:acc_stderr,hellaswag_v0:acc_norm,hellaswag_v0:acc_norm_stderr,hellaswag_v0:rand_acc
26,1661513000.0,EleutherAI/gpt-neo-1.3B,dist_lm,last,cosine_sim,L2,concat_all_examples,0,0.280522,0.004483,0.263394,0.004396,0.25
32,1661518000.0,EleutherAI/gpt-neo-1.3B,dist_lm,last,cosine_sim,layer,concat_all_examples,0,0.280522,0.004483,0.263394,0.004396,0.25
10,1661751000.0,EleutherAI/gpt-neo-1.3B,dist_lm,last,dot_product,layer,concat_all_examples,0,0.280522,0.004483,0.263394,0.004396,0.25
27,1661752000.0,EleutherAI/gpt-neo-1.3B,dist_lm,mean,dot_product,layer,concat_all_examples,0,0.290181,0.004529,0.259311,0.004374,0.25
12,1661514000.0,EleutherAI/gpt-neo-1.3B,dist_lm,last,cosine_sim,L2,concat_all_examples,5,0.272456,0.004443,0.262398,0.00439,0.25
15,1661753000.0,EleutherAI/gpt-neo-1.3B,dist_lm,mean,dot_product,layer,concat_all_examples,5,0.236905,0.004243,0.254033,0.004344,0.25
7,1661753000.0,EleutherAI/gpt-neo-1.3B,dist_lm,last,dot_product,layer,concat_all_examples,5,0.272456,0.004443,0.262398,0.00439,0.25
19,1661520000.0,EleutherAI/gpt-neo-1.3B,gpt2,,,,,0,0.386576,0.00486,0.489345,0.004989,
11,1661527000.0,EleutherAI/gpt-neo-1.3B,gpt2,,,,,5,0.386477,0.004859,0.48576,0.004988,


In [20]:
import plotly as pt
import plotly.graph_objects as go
import plotly.express as px
