In [1]:
import os
import sys
import typing
import json
import re
import pandas as pd


In [11]:
def remove_header(segment):
    if segment.startswith('model_args:'):
        return segment[len('model_args:'):]
    elif segment.startswith('task_args:'):
        return segment[len('task_args:'):]
    else:
        return segment

def parse_segment(segment) -> typing.List[str]:
    segment = remove_header(segment)
    kwargs = [kwarg for kwarg in segment.split(',') if kwarg]
    args = {}
    for kwarg_str in kwargs:
        k,v = kwarg_str.split('=')
        args[k] = v if v != 'None' else None
    return args

def parse_fname(fname: str) -> typing.Dict:    
    args = {}
    for segment in fname.split('|'):
        args.update(parse_segment(segment))
    return args

def parse_file(fpath: str) -> typing.Dict:
    mtime = os.stat(fpath).st_mtime
    with open(fpath, 'rt') as f:
        o = json.load(f)
    task_version = o['versions']
    d = {'mtime': mtime}
    for k, v in o['config'].items():
        if isinstance(v, str) and '=' in v:
            d.update(parse_segment(v))
        elif not v and k in ['model_args', 'task_args']:
            continue
        else:
            d[k] = v
    for task_name, results in o['results'].items():
        if task_name.endswith('_d'):
            task_name_out = task_name[:-len('_d')]
        elif task_name.endswith('_dg'):
            task_name_out = task_name[:-len('_dg')]
        else:
            task_name_out = task_name
        for k, v in results.items():
            # d[f'{task_name_out}_v{task_version[task_name]}:{k}'] = v
            d[f'{task_name_out}:{k}'] = v
    return d

def parse_dir(dirpath: str) -> pd.DataFrame:
    # pd.DataFrame([parse_fname(fname) for fname in os.listdir('lmeval_results')])
    fnames, mtimes = zip(*[(fentry.name, fentry.stat().st_mtime) for fentry in os.scandir(dirpath) if fentry.is_file() and fentry.name.endswith('.json')])
    return pd.DataFrame([parse_file(f'{dirpath}/{fname}') for fname in fnames])



## Evaluation Results

In [12]:
def read_results(dir: str = '../lmeval_results') -> pd.DataFrame:
    df = parse_dir(dir)
    df = df[[col for col in df.columns if col not in ['batch_size', 'device', 'no_cache', 'bootstrap_iters', 'description_dict']]]
    df = df.drop(columns='limit').assign(pretrained=df.pretrained.fillna('GPT2'))
    df = df.assign(model_type=df.model.map(lambda model: 'autoregressive' if model == 'gpt2' else (model))).drop(columns='model')
    return df

def task_metrics(results_df: pd.DataFrame, tasks: typing.List[str]) -> pd.DataFrame:
    metrics = tasks
    metrics_re = re.compile(r'^(' + r'|'.join([f'({m})' for m in metrics]) + ').*' )
    model_cols = {'model_type', 'pretrained', 'WORD_AGG_SCHEME', 'SIMILARITY_FUNC', 'NORM'}
    task_cols = {'num_fewshot', 'encoding_scheme'}
    metric_cols = {col for col in df.columns if metrics_re.fullmatch(col) is not None}
    selected_cols = model_cols | task_cols | metric_cols | {'mtime'}
    if (selected_cols) < set(df.columns):
        print(f'Following columns will be dropped: {set(results_df.columns) - selected_cols}')
    groupby_cols = (model_cols | task_cols)
    def take_last(_df: pd.DataFrame) -> pd.DataFrame:
        _df = _df.sort_values(by='mtime', ascending=False)
        return pd.Series({col: _df[col].dropna().iloc[0] if _df[col].dropna().shape[0] >=1 else None for col in _df.columns if col in metric_cols})
    df2 = results_df[list(selected_cols)].groupby(list(groupby_cols), dropna=False).aggregate(take_last).dropna(how='all')
    return df2.reset_index().dropna(axis=1, how='all').sort_values(by=['num_fewshot', 'pretrained', 'model_type'])
df = read_results()

In [13]:
df.assign(date=pd.to_datetime(df.mtime, unit='s', origin='unix', utc=True)).sort_values(by='mtime', ascending=False).iloc[:10]

Unnamed: 0,mtime,WORD_AGG_SCHEME,SIMILARITY_FUNC,NORM,encoding_scheme,num_fewshot,hellaswag:acc,hellaswag:acc_stderr,hellaswag:acc_norm,hellaswag:acc_norm_stderr,...,hellaswag:rand_acc_stderr,webqs:acc,webqs:acc_stderr,pretrained,hellaswag:em,hellaswag:em_stderr,EXAMPLE_AGG_SCHEME,SEGMENT_AGG_SCHEME,model_type,date
34,1662539000.0,last,dot_product,L2,concat_all_examples,5,0.2,0.2,,,...,0.0,,,EleutherAI/gpt-neo-1.3B,,,mean,mean,dist_sim,2022-09-07 08:20:47.886404608+00:00
91,1662539000.0,last,dot_product,L2,concat_all_examples,0,0.0,0.0,,,...,0.0,,,EleutherAI/gpt-neo-1.3B,,,mean,mean,dist_sim,2022-09-07 08:20:41.578464768+00:00
22,1662539000.0,last,dot_product,L2,concat_all_examples,5,0.2,0.2,,,...,0.0,,,GPT2,,,mean,mean,dist_sim,2022-09-07 08:20:28.518589696+00:00
7,1662539000.0,last,dot_product,L2,concat_all_examples,0,0.0,0.0,,,...,0.0,,,GPT2,,,mean,mean,dist_sim,2022-09-07 08:20:22.822644224+00:00
97,1662525000.0,last,dot_product,L2,concat_all_examples,5,0.238697,0.004254,,,...,0.0,,,GPT2,,,mean,mean,dist_sim,2022-09-07 04:33:11.255335168+00:00
27,1662525000.0,last,dot_product,L2,concat_all_examples,0,0.236805,0.004243,,,...,0.0,,,GPT2,,,mean,mean,dist_sim,2022-09-07 04:31:14.464437248+00:00
33,1662388000.0,last,dot_product,L2,segment_each_example,5,0.232125,0.004213,,,...,0.0,,,GPT2,,,mean,mean,dist_sim,2022-09-05 14:28:11.936980992+00:00
18,1662388000.0,mean,dot_product,L2,segment_each_example,5,0.244971,0.004292,,,...,0.0,,,GPT2,,,mean,mean,dist_sim,2022-09-05 14:27:52.345164800+00:00
69,1662388000.0,mean,dot_product,layer,segment_each_example,5,0.24278,0.004279,,,...,0.0,,,GPT2,,,mean,mean,dist_sim,2022-09-05 14:27:42.605256192+00:00
11,1662388000.0,last,dot_product,layer,segment_each_example,5,0.232723,0.004217,,,...,0.0,,,GPT2,,,mean,mean,dist_sim,2022-09-05 14:27:39.265287424+00:00


## WebQs

In [14]:
df_webqs = task_metrics(df, ['webqs'])
df_webqs

Following columns will be dropped: {'hellaswag:acc_norm', 'SEGMENT_AGG_SCHEME', 'hellaswag:em_stderr', 'hellaswag:rand_acc', 'hellaswag:rand_acc_stderr', 'hellaswag:acc', 'hellaswag:acc_stderr', 'hellaswag:em', 'EXAMPLE_AGG_SCHEME', 'hellaswag:acc_norm_stderr'}


  df2 = results_df[list(selected_cols)].groupby(list(groupby_cols), dropna=False).aggregate(take_last).dropna(how='all')


Unnamed: 0,num_fewshot,pretrained,WORD_AGG_SCHEME,encoding_scheme,NORM,model_type,webqs:acc,webqs:acc_stderr
5,0,EleutherAI/gpt-neo-1.3B,,,,autoregressive,0.016732,0.002846
0,0,EleutherAI/gpt-neo-1.3B,last,merge_all_segments,layer,dist_gen,0.0,0.0
1,0,EleutherAI/gpt-neo-1.3B,last,segment_each_example,layer,dist_gen,0.0,0.0
2,0,EleutherAI/gpt-neo-1.3B,mean,merge_all_segments,layer,dist_gen,0.0,0.0
3,0,EleutherAI/gpt-neo-1.3B,mean,segment_each_example,layer,dist_gen,0.0,0.0
4,0,EleutherAI/gpt-neo-1.3B,,cross_encoding,,dist_gen,0.016732,0.002846
11,0,GPT2,,,,autoregressive,0.002953,0.001204
6,0,GPT2,last,merge_all_segments,layer,dist_gen,0.0,0.0
7,0,GPT2,last,segment_each_example,layer,dist_gen,0.0,0.0
8,0,GPT2,mean,merge_all_segments,layer,dist_gen,0.0,0.0


## Hellaswag

In [15]:
pd.set_option('display.max_rows', 100)
df_hellaswag = task_metrics(df, ['hellaswag:acc'])
df_hellaswag = df_hellaswag.sort_values(by=['num_fewshot', 'pretrained', 'model_type', 'hellaswag:acc'], ascending=[True, True, True, False])
df_hellaswag[df_hellaswag['pretrained'] == 'EleutherAI/gpt-neo-1.3B']

Following columns will be dropped: {'SEGMENT_AGG_SCHEME', 'hellaswag:rand_acc', 'hellaswag:em_stderr', 'hellaswag:rand_acc_stderr', 'EXAMPLE_AGG_SCHEME', 'hellaswag:em', 'webqs:acc', 'webqs:acc_stderr'}


  df2 = results_df[list(selected_cols)].groupby(list(groupby_cols), dropna=False).aggregate(take_last).dropna(how='all')


Unnamed: 0,num_fewshot,SIMILARITY_FUNC,pretrained,WORD_AGG_SCHEME,encoding_scheme,NORM,model_type,hellaswag:acc_norm,hellaswag:acc,hellaswag:acc_stderr,hellaswag:acc_norm_stderr
30,0,,EleutherAI/gpt-neo-1.3B,,,,autoregressive,0.489345,0.386576,0.00486,0.004989
29,0,,EleutherAI/gpt-neo-1.3B,,cross_encoding,,dist_gen,0.489345,0.386576,0.00486,0.004989
0,0,cosine_sim,EleutherAI/gpt-neo-1.3B,last,concat_all_examples,L2,dist_sim,0.1,0.3,0.152753,0.1
4,0,cosine_sim,EleutherAI/gpt-neo-1.3B,mean,concat_each_example,layer,dist_sim,0.259311,0.290181,0.004529,0.004374
5,0,cosine_sim,EleutherAI/gpt-neo-1.3B,mean,segment_each_example,layer,dist_sim,0.259311,0.290181,0.004529,0.004374
17,0,dot_product,EleutherAI/gpt-neo-1.3B,mean,concat_all_examples,layer,dist_sim,0.259311,0.290181,0.004529,0.004374
18,0,dot_product,EleutherAI/gpt-neo-1.3B,mean,concat_each_example,layer,dist_sim,0.259311,0.290181,0.004529,0.004374
19,0,dot_product,EleutherAI/gpt-neo-1.3B,mean,segment_each_example,layer,dist_sim,0.259311,0.290181,0.004529,0.004374
1,0,cosine_sim,EleutherAI/gpt-neo-1.3B,last,concat_all_examples,layer,dist_sim,0.263394,0.280522,0.004483,0.004396
2,0,cosine_sim,EleutherAI/gpt-neo-1.3B,last,concat_each_example,layer,dist_sim,0.263394,0.280522,0.004483,0.004396


In [7]:
_df = df[(df['pretrained'] == 'EleutherAI/gpt-neo-1.3B') & (df['encoding_scheme'] == 'concat_all_examples') & (df.num_fewshot == 0)]
_df = _df.assign(date=pd.to_datetime(df.mtime, unit='s', origin='unix', utc=True))
_df

Unnamed: 0,mtime,WORD_AGG_SCHEME,EXAMPLE_AGG_SCHEME,SEGMENT_AGG_SCHEME,NORM,SIMILARITY_FUNC,pretrained,encoding_scheme,num_fewshot,hellaswag:acc,hellaswag:acc_stderr,hellaswag:rand_acc,hellaswag:rand_acc_stderr,model_type,date
0,1662540000.0,last,mean,mean,L2,dot_product,EleutherAI/gpt-neo-1.3B,concat_all_examples,0,0.272954,0.004446,0.25,0.0,dist_sim,2022-09-07 08:44:24.025124096+00:00


## All

In [16]:
task_metrics(df, ['hellaswag:acc', 'webqs:acc'])

Following columns will be dropped: {'SEGMENT_AGG_SCHEME', 'hellaswag:rand_acc', 'hellaswag:em_stderr', 'hellaswag:rand_acc_stderr', 'EXAMPLE_AGG_SCHEME', 'hellaswag:em'}


  df2 = results_df[list(selected_cols)].groupby(list(groupby_cols), dropna=False).aggregate(take_last).dropna(how='all')


Unnamed: 0,num_fewshot,SIMILARITY_FUNC,pretrained,WORD_AGG_SCHEME,encoding_scheme,NORM,model_type,hellaswag:acc_norm,hellaswag:acc_norm_stderr,hellaswag:acc,hellaswag:acc_stderr,webqs:acc,webqs:acc_stderr
34,0,,EleutherAI/gpt-neo-1.3B,,,,autoregressive,0.489345,0.004989,0.386576,0.00486,0.016732,0.002846
29,0,,EleutherAI/gpt-neo-1.3B,last,merge_all_segments,layer,dist_gen,,,,,0.0,0.0
30,0,,EleutherAI/gpt-neo-1.3B,last,segment_each_example,layer,dist_gen,,,,,0.0,0.0
31,0,,EleutherAI/gpt-neo-1.3B,mean,merge_all_segments,layer,dist_gen,,,,,0.0,0.0
32,0,,EleutherAI/gpt-neo-1.3B,mean,segment_each_example,layer,dist_gen,,,,,0.0,0.0
33,0,,EleutherAI/gpt-neo-1.3B,,cross_encoding,,dist_gen,0.489345,0.004989,0.386576,0.00486,0.016732,0.002846
0,0,cosine_sim,EleutherAI/gpt-neo-1.3B,last,concat_all_examples,L2,dist_sim,0.1,0.1,0.3,0.152753,,
1,0,cosine_sim,EleutherAI/gpt-neo-1.3B,last,concat_all_examples,layer,dist_sim,0.263394,0.004396,0.280522,0.004483,,
2,0,cosine_sim,EleutherAI/gpt-neo-1.3B,last,concat_each_example,layer,dist_sim,0.263394,0.004396,0.280522,0.004483,,
3,0,cosine_sim,EleutherAI/gpt-neo-1.3B,last,segment_each_example,layer,dist_sim,0.263394,0.004396,0.280522,0.004483,,
