In [1]:
import os
import sys
import typing
import json
import re
import pandas as pd


In [2]:
def remove_header(segment):
    if segment.startswith('model_args:'):
        return segment[len('model_args:'):]
    elif segment.startswith('task_args:'):
        return segment[len('task_args:'):]
    else:
        return segment

def parse_segment(segment) -> typing.List[str]:
    segment = remove_header(segment)
    kwargs = [kwarg for kwarg in segment.split(',') if kwarg]
    args = {}
    for kwarg_str in kwargs:
        k,v = kwarg_str.split('=')
        args[k] = v if v != 'None' else None
    return args

def parse_fname(fname: str) -> typing.Dict:    
    args = {}
    for segment in fname.split('|'):
        args.update(parse_segment(segment))
    return args

def parse_file(fpath: str) -> typing.Dict:
    mtime = os.stat(fpath).st_mtime
    with open(fpath, 'rt') as f:
        o = json.load(f)
    task_version = o['versions']
    d = {'mtime': mtime}
    for k, v in o['config'].items():
        if isinstance(v, str) and '=' in v:
            d.update(parse_segment(v))
        elif not v and k in ['model_args', 'task_args']:
            continue
        else:
            d[k] = v
    for task_name, results in o['results'].items():
        if task_name.endswith('_d'):
            task_name_out = task_name[:-len('_d')]
        elif task_name.endswith('_dg'):
            task_name_out = task_name[:-len('_dg')]
        else:
            task_name_out = task_name
        for k, v in results.items():
            # d[f'{task_name_out}_v{task_version[task_name]}:{k}'] = v
            d[f'{task_name_out}:{k}'] = v
    return d

def parse_dir(dirpath: str) -> pd.DataFrame:
    # pd.DataFrame([parse_fname(fname) for fname in os.listdir('lmeval_results')])
    fnames, mtimes = zip(*[(fentry.name, fentry.stat().st_mtime) for fentry in os.scandir(dirpath) if fentry.is_file() and fentry.name.endswith('.json')])
    return pd.DataFrame([parse_file(f'{dirpath}/{fname}') for fname in fnames])



## Evaluation Results

In [9]:
def read_results(dir: str = '../lmeval_results') -> pd.DataFrame:
    df = parse_dir(dir)
    df = df[[col for col in df.columns if col not in ['batch_size', 'device', 'no_cache', 'bootstrap_iters', 'description_dict']]]
    df = df.drop(columns='limit').assign(pretrained=df.pretrained.fillna('GPT2'))
    df = df.assign(model_type=df.model.map(lambda model: 'autoregressive' if model == 'gpt2' else (model))).drop(columns='model')
    return df

def task_metrics(results_df: pd.DataFrame, tasks: typing.List[str]) -> pd.DataFrame:
    metrics = tasks
    metrics_re = re.compile(r'^(' + r'|'.join([f'({m})' for m in metrics]) + ').*' )
    model_cols = {'model_type', 'pretrained', 'WORD_AGG_SCHEME', 'SIMILARITY_FUNC', 'NORM'}
    task_cols = {'num_fewshot', 'encoding_scheme'}
    metric_cols = {col for col in df.columns if metrics_re.fullmatch(col) is not None}
    selected_cols = model_cols | task_cols | metric_cols | {'mtime'}
    if (selected_cols) < set(df.columns):
        f'Following columns were unaccounted: {set(results_df.columns) - selected_cols}'
    groupby_cols = (model_cols | task_cols)
    def take_last(_df: pd.DataFrame) -> pd.DataFrame:
        _df = _df.sort_values(by='mtime', ascending=False)
        return pd.Series({col: _df[col].dropna().iloc[0] if _df[col].dropna().shape[0] >=1 else None for col in _df.columns if col in metric_cols})
    df2 = results_df[list(selected_cols)].groupby(list(groupby_cols), dropna=False).aggregate(take_last).dropna(how='all')
    return df2.reset_index().dropna(axis=1, how='all')

In [13]:
df = read_results()
df_webqs = task_metrics(df, ['webqs'])
df_webqs

  df2 = results_df[list(selected_cols)].groupby(list(groupby_cols), dropna=False).aggregate(take_last).dropna(how='all')


Unnamed: 0,encoding_scheme,model_type,num_fewshot,pretrained,webqs:acc_stderr,webqs:acc
0,cross_encoding,dist_gen,0,EleutherAI/gpt-neo-1.3B,0.002846,0.016732
1,cross_encoding,dist_gen,0,GPT2,0.001204,0.002953
2,cross_encoding,dist_gen,5,EleutherAI/gpt-neo-1.3B,0.005601,0.068406
3,cross_encoding,dist_gen,5,GPT2,0.003301,0.022638
4,cross_encoding,dist_gen,25,GPT2,0.003537,0.026083
5,,autoregressive,0,EleutherAI/gpt-neo-1.3B,0.002846,0.016732
6,,autoregressive,0,GPT2,0.001204,0.002953
7,,autoregressive,5,EleutherAI/gpt-neo-1.3B,0.005601,0.068406
8,,autoregressive,5,GPT2,0.003301,0.022638
9,,autoregressive,25,EleutherAI/gpt-neo-1.3B,0.006521,0.095472


In [15]:
df_webqs[(df_webqs.pretrained == 'EleutherAI/gpt-neo-1.3B')]

Unnamed: 0,encoding_scheme,model_type,num_fewshot,pretrained,webqs:acc_stderr,webqs:acc
0,cross_encoding,dist_gen,0,EleutherAI/gpt-neo-1.3B,0.002846,0.016732
2,cross_encoding,dist_gen,5,EleutherAI/gpt-neo-1.3B,0.005601,0.068406
5,,autoregressive,0,EleutherAI/gpt-neo-1.3B,0.002846,0.016732
7,,autoregressive,5,EleutherAI/gpt-neo-1.3B,0.005601,0.068406
9,,autoregressive,25,EleutherAI/gpt-neo-1.3B,0.006521,0.095472
