In [1]:
# Evaluating OpenQA Evaluation
import pandas as pd

In [2]:
# Load the annotation records from the JSON file into a DataFrame.
nli_json_path = 'data/NQ-nli-gpt35.json'
df = pd.read_json(nli_json_path)


In [3]:
# Display every row that has a missing value in at least one column.
has_missing = df.isna().any(axis=1)
df[has_missing]

Unnamed: 0,qid,question,golden_answer,golden_statement,system,system_answer,golden_judge,a2astar,astar2a,pred_a2astar,pred_astar2a,system_judge,system_statement,id,asup,ainf


In [4]:
# Turn each textual verdict column into a 0/1 flag: 1 when the verdict text
# contains the word "entailment" (case-insensitive substring match), else 0.
for flag_col, verdict_col in (('asup', 'a2astar'), ('ainf', 'astar2a')):
    contains_entailment = df[verdict_col].map(lambda verdict: 'entailment' in verdict.lower())
    df[flag_col] = contains_entailment.astype(int)

In [5]:
# Ainf | Asup: bitwise OR of the two integer flag columns, stored as int.
either_flag = df['asup'] | df['ainf']
df['system_judge'] = either_flag.astype(int)

# Ainf & Asup
#df['system_judge'] = (df['pred_a2astar'] & df['pred_astar2a']).astype(int)

# Asup - Ainf
#df['system_judge'] = (df['pred_a2astar'] & ~df['pred_astar2a']).astype(int)

#df = df.drop(columns=['pred_a2astar', 'pred_astar2a'])
#df.sort_values(by='qid', ascending=True, inplace=True)
#df.to_json('data/NQ-nli-gpt35.json', orient='records', indent=2)

In [6]:
def fscore(tp, fp, fn):
    """Compute precision, recall and F1 from confusion-matrix counts.

    Args:
        tp: number of true positives.
        fp: number of false positives.
        fn: number of false negatives.

    Returns:
        Tuple ``(prec, rec, f1)``. A ratio whose denominator is zero is
        undefined; it is reported as 0.0 instead of raising
        ZeroDivisionError.
    """
    predicted_pos = tp + fp
    actual_pos = tp + fn
    # Guard each division: no predicted positives -> precision undefined,
    # no actual positives -> recall undefined.
    prec = tp / predicted_pos if predicted_pos else 0.0
    rec = tp / actual_pos if actual_pos else 0.0
    # prec + rec is zero exactly when tp is zero; F1 is then reported as 0.0.
    f1 = 2 * prec * rec / (prec + rec) if (prec + rec) else 0.0
    return prec, rec, f1

def eval_df(df):
    """Print accuracy and positive-class precision/recall/F1 for a frame.

    Compares the ``system_judge`` column against the ``golden_judge`` column.
    Only rows where both columns are exactly 0 or 1 are counted.

    Args:
        df: DataFrame with ``golden_judge`` and ``system_judge`` columns.

    Returns:
        None. Two lines of metrics are printed, or a one-line notice when
        there is no countable row.
    """
    gold_pos = df['golden_judge'] == 1
    gold_neg = df['golden_judge'] == 0
    sys_pos = df['system_judge'] == 1
    sys_neg = df['system_judge'] == 0

    # Confusion-matrix cells, counted from the boolean masks.
    tp = int((gold_pos & sys_pos).sum())
    fp = int((gold_neg & sys_pos).sum())
    tn = int((gold_neg & sys_neg).sum())
    fn = int((gold_pos & sys_neg).sum())

    total = tp + tn + fp + fn
    if total == 0:
        # Nothing to score (e.g. a system name that is absent from the data);
        # avoid dividing by zero below.
        print('No rows to evaluate.')
        return

    # Only the positive-class scores are reported, so the negative-class
    # scores are not computed.
    pos_prec, pos_rec, pos_f1 = fscore(tp, fp, fn)

    acc = (tp + tn) / total
    print(f'Acc: {round(acc, 3)}, Pos F1: {round(pos_f1, 3)}')
    print(f'Pos Prec: {round(pos_prec, 3)}, Pos Rec: {round(pos_rec, 3)}')

In [7]:
# Report the evaluation metrics separately for each system.
systems = ['fid', 'gpt35', 'chatgpt', 'gpt4', 'newbing']
# The loop variable is `system` rather than `sys`: a top-level loop variable
# stays bound after the loop, and `sys` would shadow the standard-library
# module name in the kernel namespace.
for system in systems:
    subdf = df[df['system'] == system]
    print(system)
    eval_df(subdf)

fid
Acc: 0.934, Pos F1: 0.953
Pos Prec: 0.937, Pos Rec: 0.97
gpt35
Acc: 0.888, Pos F1: 0.915
Pos Prec: 0.908, Pos Rec: 0.923
chatgpt
Acc: 0.876, Pos F1: 0.916
Pos Prec: 0.906, Pos Rec: 0.926
gpt4
Acc: 0.875, Pos F1: 0.921
Pos Prec: 0.922, Pos Rec: 0.919
newbing
Acc: 0.843, Pos F1: 0.9
Pos Prec: 0.919, Pos Rec: 0.882


In [8]:
# Share of rows per system whose system_judge is 1.
for system in systems:
    print(system)
    #print(df[df['system'] == system]['golden_judge'].value_counts())

    # Filter once, then count matches directly: value_counts()[1] raises
    # KeyError when a system has no row with system_judge == 1.
    judged = df.loc[df['system'] == system, 'system_judge']
    print((judged == 1).sum() / len(judged) if len(judged) else float('nan'))

fid
0.7278145695364239
gpt35
0.6874172185430464
chatgpt
0.7605960264900662
gpt4
0.8109271523178808
newbing
0.8049668874172186
