In [15]:
import os
import sys
import typing
import json
import re
import pandas as pd


In [16]:
def remove_header(segment):
    if segment.startswith('model_args:'):
        return segment[len('model_args:'):]
    elif segment.startswith('task_args:'):
        return segment[len('task_args:'):]
    else:
        return segment

def parse_segment(segment) -> typing.List[str]:
    segment = remove_header(segment)
    kwargs = [kwarg for kwarg in segment.split(',') if kwarg]
    args = {}
    for kwarg_str in kwargs:
        k,v = kwarg_str.split('=')
        args[k] = v if v != 'None' else None
    return args

def parse_fname(fname: str) -> typing.Dict:    
    args = {}
    for segment in fname.split('|'):
        args.update(parse_segment(segment))
    return args

def parse_file(fpath: str) -> typing.Dict:
    mtime = os.stat(fpath).st_mtime
    with open(fpath, 'rt') as f:
        o = json.load(f)
    task_version = o['versions']
    d = {'mtime': mtime}
    for k, v in o['config'].items():
        if isinstance(v, str) and '=' in v:
            d.update(parse_segment(v))
        elif not v and k in ['model_args', 'task_args']:
            continue
        else:
            d[k] = v
    for task_name, results in o['results'].items():
        if task_name.endswith('_d'):
            task_name_out = task_name[:-len('_d')]
        elif task_name.endswith('_dg'):
            task_name_out = task_name[:-len('_dg')]
        else:
            task_name_out = task_name
        for k, v in results.items():
            # d[f'{task_name_out}_v{task_version[task_name]}:{k}'] = v
            d[f'{task_name_out}:{k}'] = v
    return d

def parse_dir(dirpath: str) -> pd.DataFrame:
    # pd.DataFrame([parse_fname(fname) for fname in os.listdir('lmeval_results')])
    fnames, mtimes = zip(*[(fentry.name, fentry.stat().st_mtime) for fentry in os.scandir(dirpath) if fentry.is_file() and fentry.name.endswith('.json')])
    return pd.DataFrame([parse_file(f'{dirpath}/{fname}') for fname in fnames])



## Evaluation Results

In [42]:
def read_results(dir: str = '../lmeval_results2') -> pd.DataFrame:
    df = parse_dir(dir)
    df = df[[col for col in df.columns if col not in ['batch_size', 'device', 'no_cache', 'bootstrap_iters', 'description_dict']]]
    df = df.drop(columns='limit').assign(pretrained=df.pretrained.fillna('GPT2'))
    df = df.assign(model_type=df.model.map(lambda model: 'autoregressive' if model == 'gpt2' else (model))).drop(columns='model')
    return df

def task_metrics(results_df: pd.DataFrame, tasks: typing.List[str]) -> pd.DataFrame:
    metrics = tasks
    metrics_re = re.compile(r'^(' + r'|'.join([f'({m})' for m in metrics]) + ').*' )
    print(f'metric cols regexp = {metrics_re}')
    model_cols = {'model_type', 'pretrained', 'WORD_AGG_SCHEME', 'SEGMENT_AGG_SCHEME', 'EXAMPLE_AGG_SCHEME', 'SIMILARITY_FUNC', 'NORM'}
    task_cols = {'num_fewshot', 'encoding_scheme'}
    # metric_cols = {col for col in df.columns if metrics_re.fullmatch(col) is not None}
    task_metric_cols = {task: [col for col in df.columns if re.fullmatch(f'^{task}.*', col)] for task in tasks}
    metric_cols = {col for cols in task_metric_cols.values() for col in cols}
    selected_cols = model_cols | task_cols | metric_cols | {'mtime'}
    if (selected_cols) < set(df.columns):
        print(f'Following columns will be dropped: {set(results_df.columns) - selected_cols}')
    groupby_cols = (model_cols | task_cols)
    def take_last(_df: pd.DataFrame) -> pd.DataFrame:
        _df = _df.sort_values(by='mtime', ascending=False)
        # return pd.Series({col: _df[col].dropna().iloc[0] if _df[col].dropna().shape[0] >=1 else None for col in _df.columns if col in metric_cols})
        return pd.concat([_df[task_metric_cols[task]].dropna().iloc[0] for task in tasks])
    df2 = results_df[list(selected_cols)].groupby(list(groupby_cols), dropna=False).aggregate(take_last).dropna(how='all')
    return df2.reset_index().dropna(axis=1, how='all').sort_values(by=['num_fewshot', 'pretrained', 'model_type'])
df = read_results()

## Show All Files Individually Without Merging

In [43]:
all_df = df.assign(date=pd.to_datetime(df.mtime, unit='s', origin='unix', utc=True)).sort_values(by='mtime', ascending=False)
all_df[all_df.mtime > 1.662555e+09][all_df.encoding_scheme.isin(['concat_all_examples', 'merge_all_segments'])]

  all_df[all_df.mtime > 1.662555e+09][all_df.encoding_scheme.isin(['concat_all_examples', 'merge_all_segments'])]


Unnamed: 0,mtime,WORD_AGG_SCHEME,EXAMPLE_AGG_SCHEME,SEGMENT_AGG_SCHEME,NORM,SIMILARITY_FUNC,pretrained,encoding_scheme,num_fewshot,hellaswag:acc,hellaswag:acc_stderr,hellaswag:rand_acc,hellaswag:rand_acc_stderr,model_type,date
10,1668117000.0,mean,,,layer,cosine_sim,EleutherAI/gpt-neo-1.3B,merge_all_segments,5,0.282215,0.004492,0.25,0.0,dist_sim,2022-11-10 21:52:17.123336192+00:00
23,1668117000.0,mean,,,layer,dot_product,EleutherAI/gpt-neo-1.3B,merge_all_segments,5,0.282115,0.004491,0.25,0.0,dist_sim,2022-11-10 21:51:48.251612928+00:00
3,1668116000.0,mean,,,layer,dot_product,EleutherAI/gpt-neo-1.3B,concat_all_examples,5,0.235909,0.004237,0.25,0.0,dist_sim,2022-11-10 21:39:22.438825728+00:00
0,1668116000.0,mean,,,layer,cosine_sim,EleutherAI/gpt-neo-1.3B,concat_all_examples,5,0.235909,0.004237,0.25,0.0,dist_sim,2022-11-10 21:38:50.371140864+00:00
18,1668115000.0,mean,,,layer,dot_product,EleutherAI/gpt-neo-1.3B,concat_all_examples,0,0.289882,0.004528,0.25,0.0,dist_sim,2022-11-10 21:19:48.818486784+00:00
15,1668115000.0,mean,,,layer,cosine_sim,EleutherAI/gpt-neo-1.3B,concat_all_examples,0,0.289882,0.004528,0.25,0.0,dist_sim,2022-11-10 21:19:26.346697728+00:00
7,1668115000.0,mean,,,layer,dot_product,EleutherAI/gpt-neo-1.3B,merge_all_segments,0,0.289882,0.004528,0.25,0.0,dist_sim,2022-11-10 21:19:19.782759424+00:00
8,1668115000.0,mean,,,layer,cosine_sim,EleutherAI/gpt-neo-1.3B,merge_all_segments,0,0.289882,0.004528,0.25,0.0,dist_sim,2022-11-10 21:19:15.446800128+00:00


## WebQs

In [44]:
# df_webqs = task_metrics(df, ['webqs'])
# df_webqs

## Hellaswag

In [45]:
pd.set_option('display.max_rows', 100)
df_hellaswag = task_metrics(df, ['hellaswag:acc'])
df_hellaswag = df_hellaswag.sort_values(by=['num_fewshot', 'pretrained', 'model_type', 'hellaswag:acc'], ascending=[True, True, True, False])
df_hellaswag[df_hellaswag['pretrained'] == 'EleutherAI/gpt-neo-1.3B']

metric cols regexp = re.compile('^((hellaswag:acc)).*')
Following columns will be dropped: {'hellaswag:rand_acc', 'hellaswag:rand_acc_stderr'}


  df2 = results_df[list(selected_cols)].groupby(list(groupby_cols), dropna=False).aggregate(take_last).dropna(how='all')


Unnamed: 0,SEGMENT_AGG_SCHEME,WORD_AGG_SCHEME,EXAMPLE_AGG_SCHEME,NORM,SIMILARITY_FUNC,encoding_scheme,num_fewshot,model_type,pretrained,hellaswag:acc,hellaswag:acc_stderr
4,mean,mean,mean,L2,dot_product,concat_all_examples,0,dist_sim,EleutherAI/gpt-neo-1.3B,0.289882,0.004528
6,mean,mean,mean,layer,dot_product,concat_all_examples,0,dist_sim,EleutherAI/gpt-neo-1.3B,0.289882,0.004528
8,,mean,,layer,cosine_sim,concat_all_examples,0,dist_sim,EleutherAI/gpt-neo-1.3B,0.289882,0.004528
10,,mean,,layer,cosine_sim,concat_each_example,0,dist_sim,EleutherAI/gpt-neo-1.3B,0.289882,0.004528
12,,mean,,layer,cosine_sim,merge_all_segments,0,dist_sim,EleutherAI/gpt-neo-1.3B,0.289882,0.004528
14,,mean,,layer,cosine_sim,segment_each_example,0,dist_sim,EleutherAI/gpt-neo-1.3B,0.289882,0.004528
16,,mean,,layer,dot_product,concat_all_examples,0,dist_sim,EleutherAI/gpt-neo-1.3B,0.289882,0.004528
18,,mean,,layer,dot_product,concat_each_example,0,dist_sim,EleutherAI/gpt-neo-1.3B,0.289882,0.004528
20,,mean,,layer,dot_product,merge_all_segments,0,dist_sim,EleutherAI/gpt-neo-1.3B,0.289882,0.004528
22,,mean,,layer,dot_product,segment_each_example,0,dist_sim,EleutherAI/gpt-neo-1.3B,0.289882,0.004528


## All

In [46]:
task_metrics(df, ['hellaswag:acc', 'webqs:acc'])

metric cols regexp = re.compile('^((hellaswag:acc)|(webqs:acc)).*')
Following columns will be dropped: {'hellaswag:rand_acc', 'hellaswag:rand_acc_stderr'}


  df2 = results_df[list(selected_cols)].groupby(list(groupby_cols), dropna=False).aggregate(take_last).dropna(how='all')


Unnamed: 0,SEGMENT_AGG_SCHEME,WORD_AGG_SCHEME,EXAMPLE_AGG_SCHEME,NORM,SIMILARITY_FUNC,encoding_scheme,num_fewshot,model_type,pretrained,hellaswag:acc,hellaswag:acc_stderr
0,mean,last,mean,L2,cosine_sim,concat_all_examples,0,dist_sim,EleutherAI/gpt-neo-1.3B,0.272954,0.004446
2,mean,last,mean,L2,dot_product,concat_all_examples,0,dist_sim,EleutherAI/gpt-neo-1.3B,0.272954,0.004446
4,mean,mean,mean,L2,dot_product,concat_all_examples,0,dist_sim,EleutherAI/gpt-neo-1.3B,0.289882,0.004528
6,mean,mean,mean,layer,dot_product,concat_all_examples,0,dist_sim,EleutherAI/gpt-neo-1.3B,0.289882,0.004528
8,,mean,,layer,cosine_sim,concat_all_examples,0,dist_sim,EleutherAI/gpt-neo-1.3B,0.289882,0.004528
10,,mean,,layer,cosine_sim,concat_each_example,0,dist_sim,EleutherAI/gpt-neo-1.3B,0.289882,0.004528
12,,mean,,layer,cosine_sim,merge_all_segments,0,dist_sim,EleutherAI/gpt-neo-1.3B,0.289882,0.004528
14,,mean,,layer,cosine_sim,segment_each_example,0,dist_sim,EleutherAI/gpt-neo-1.3B,0.289882,0.004528
16,,mean,,layer,dot_product,concat_all_examples,0,dist_sim,EleutherAI/gpt-neo-1.3B,0.289882,0.004528
18,,mean,,layer,dot_product,concat_each_example,0,dist_sim,EleutherAI/gpt-neo-1.3B,0.289882,0.004528
