In [1]:
import json
from sklearn.metrics import accuracy_score, f1_score
import re


def match_choice(text):
    """Extract the final multiple-choice answer (letters A-D) from ``text``.

    A candidate answer is a run of the capital letters ``A``-``D``, optionally
    continued by further runs that are separated only by ``、``, ``,`` or
    spaces (for example ``"A、C"`` or ``"B, D"``). When ``text`` contains
    several candidates, the last one is used.

    Args:
        text: String to search.

    Returns:
        The letters of the last candidate with the separators removed
        (e.g. ``"AC"``), or ``''`` when ``text`` contains no letter A-D.
    """
    # No leading `.*?` here: findall already scans forward for every match,
    # and a lazy prefix only forces the engine to rescan the rest of the line
    # from each position after the last answer (quadratic on long tails).
    candidates = re.findall(r'[A-D]+(?:[、, ]+[A-D]+)*', text)
    if not candidates:
        return ''
    # Strip the separators so "A、C" and "A, C" both become "AC".
    return re.sub(r'[、, ]+', '', candidates[-1])


def accuracy(path):
    """Compute accuracy over a JSON Lines prediction file.

    Every non-blank line of the file is parsed as a JSON object. Its
    ``'label'`` and ``'predict'`` values are each reduced to answer letters
    with ``match_choice`` before the two lists are compared.

    Args:
        path: Path of the JSON Lines file to read.

    Returns:
        The result of ``accuracy_score(y_true, y_pred)``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``'label'`` or ``'predict'`` key.
    """
    y_true = []
    y_pred = []
    # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
    # platform default, which can mis-decode non-ASCII text on some systems.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            # Skip whitespace-only lines (e.g. a trailing blank line), which
            # json.loads would otherwise reject.
            if not line.strip():
                continue
            record = json.loads(line)
            y_true.append(match_choice(record['label']))
            y_pred.append(match_choice(record['predict']))
    return accuracy_score(y_true, y_pred)


def score(path):
    """Compute the macro-averaged F1 score over a JSON Lines prediction file.

    Every non-blank line of the file is parsed as a JSON object. Its
    ``'label'`` and ``'predict'`` values are each reduced to answer letters
    with ``match_choice`` before scoring.

    Args:
        path: Path of the JSON Lines file to read.

    Returns:
        The result of ``f1_score(y_true, y_pred, average='macro')``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``'label'`` or ``'predict'`` key.
    """
    y_true = []
    y_pred = []
    # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
    # platform default, which can mis-decode non-ASCII text on some systems.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            # Skip whitespace-only lines (e.g. a trailing blank line), which
            # json.loads would otherwise reject.
            if not line.strip():
                continue
            record = json.loads(line)
            y_true.append(match_choice(record['label']))
            y_pred.append(match_choice(record['predict']))
    return f1_score(y_true, y_pred, average='macro')


def out(model_name, dataset):
    """Print the accuracy and F1 of one model on one dataset.

    The predictions are read from
    ``{model_name}/{dataset}/generated_predictions.jsonl``. A header line
    naming the model and dataset is printed first, followed by one line with
    both metrics rounded to two decimal places.

    Args:
        model_name: Directory name of the model's outputs.
        dataset: Sub-directory name of the dataset.
    """
    path = f'{model_name}/{dataset}/generated_predictions.jsonl'
    # The header goes out before any file is read, so it is shown even if
    # scoring fails afterwards.
    print(f'-----{model_name}: {dataset}')
    acc = round(accuracy(path), 2)
    f1 = round(score(path), 2)
    print(f'acc: {acc}      f1: {f1}')

### igpt_v0: insuranceQA + sentiment + travel_insurance

### igpt_v1: insuranceQA


In [2]:
# Dataset names; the evaluation loop passes each one to out() as `dataset`.
data = [
    'travel_insurance_test',
    'travel_insurance_4_label_test',
    'sentiment_test',
]
# Model names; the evaluation loop passes each one to out() as `model_name`.
model = [
    'igpt_v1_rlhf_travel_insurance',
    'igpt_v1_rlhf',
    'igpt_v3_correct_rlhf',
    'igpt_v0',
    'igpt_v1',
    'igpt_v3_correct',
    'igpt_v0_label',
    'GPT-3.5',
    'llama3',
]

In [3]:
# Report every model on every dataset, one out() call per (model, dataset) pair.
for m in model:
    for d in data:
        out(m, d)
    # Blank line to separate one model's results from the next in the output.
    print()

-----igpt_v1_rlhf_travel_insurance: travel_insurance_test
acc: 0.75      f1: 0.69
-----igpt_v1_rlhf_travel_insurance: travel_insurance_4_label_test
acc: 0.58      f1: 0.37
-----igpt_v1_rlhf_travel_insurance: sentiment_test
acc: 0.82      f1: 0.73

-----igpt_v1_rlhf: travel_insurance_test
acc: 0.35      f1: 0.26
-----igpt_v1_rlhf: travel_insurance_4_label_test


acc: 0.66      f1: 0.31
-----igpt_v1_rlhf: sentiment_test
acc: 0.73      f1: 0.18

-----igpt_v3_correct_rlhf: travel_insurance_test
acc: 0.72      f1: 0.7
-----igpt_v3_correct_rlhf: travel_insurance_4_label_test
acc: 0.08      f1: 0.05
-----igpt_v3_correct_rlhf: sentiment_test
acc: 0.83      f1: 0.79

-----igpt_v0: travel_insurance_test
acc: 0.38      f1: 0.31
-----igpt_v0: travel_insurance_4_label_test
acc: 0.17      f1: 0.16
-----igpt_v0: sentiment_test
acc: 0.87      f1: 0.82

-----igpt_v1: travel_insurance_test
acc: 0.4      f1: 0.37
-----igpt_v1: travel_insurance_4_label_test
acc: 0.42      f1: 0.25
-----igpt_v1: sentiment_test
acc: 0.69      f1: 0.14

-----igpt_v3_correct: travel_insurance_test
acc: 0.81      f1: 0.76
-----igpt_v3_correct: travel_insurance_4_label_test
acc: 0.05      f1: 0.02
-----igpt_v3_correct: sentiment_test
acc: 0.86      f1: 0.81

-----igpt_v0_label: travel_insurance_test
acc: 0.35      f1: 0.26
-----igpt_v0_label: travel_insurance_4_label_test
acc: 0.75   