In [4]:
import json
from sklearn.metrics import accuracy_score, f1_score
import re


# Characters that may separate option letters: ideographic comma, ASCII comma, space.
_CHOICE_SEP = r'[、, ]+'
_CHOICE_SEP_RE = re.compile(_CHOICE_SEP)
# One answer = a run of option letters, optionally followed by more runs that
# are joined by separators (e.g. "A", "AB", "A、B, C").
_CHOICE_RE = re.compile(rf'[A-D]+(?:{_CHOICE_SEP}[A-D]+)*')


def match_choice(text):
    """Return the last multiple-choice answer found in ``text``, compacted.

    An answer is a run of uppercase letters ``A``-``D``, optionally continued
    by further runs separated by ``、``, ``,`` or spaces. Separators are
    stripped from the result, so ``'A、B, C'`` becomes ``'ABC'``.

    Note that no word boundaries are enforced: any uppercase ``A``-``D``
    counts, including one that is part of an ordinary word.

    Args:
        text: String to search.

    Returns:
        The option letters of the last answer in ``text``, or ``''`` when the
        text contains no uppercase ``A``-``D`` at all.
    """
    # findall already scans every start position, so no lazy ``.*?`` prefix is
    # needed; omitting it avoids quadratic rescanning on text without options.
    answers = _CHOICE_RE.findall(text)
    if not answers:
        return ''
    return _CHOICE_SEP_RE.sub('', answers[-1])


def _load_choices(path):
    """Read a JSONL predictions file and return ``(y_true, y_pred)``.

    Every non-blank line is parsed as a JSON object; its ``label`` and
    ``predict`` values are each passed through ``match_choice`` and collected
    in file order.

    Args:
        path: Path of the JSONL file to read.

    Returns:
        A tuple of two equally long lists: extracted labels and extracted
        predictions.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``label`` or ``predict``.
    """
    y_true = []
    y_pred = []
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # Tolerate blank lines, e.g. an extra newline at end of file.
                continue
            data = json.loads(line)
            y_true.append(match_choice(data['label']))
            y_pred.append(match_choice(data['predict']))
    return y_true, y_pred


def accuracy(path):
    """Return the accuracy of the predictions stored in the JSONL file at ``path``."""
    y_true, y_pred = _load_choices(path)
    return accuracy_score(y_true, y_pred)


def score(path):
    """Return the macro-averaged F1 of the predictions stored in the JSONL file at ``path``."""
    y_true, y_pred = _load_choices(path)
    return f1_score(y_true, y_pred, average='macro')


def out(model_name, dataset):
    """Print a header plus accuracy and F1 for one model/dataset pair.

    The predictions are read from
    ``{model_name}/{dataset}/generated_predictions.jsonl``; both metrics are
    rounded to three decimals before printing.
    """
    path = f'{model_name}/{dataset}/generated_predictions.jsonl'
    # The header goes out first so it is visible even if reading the file fails.
    print(f'-----{model_name}: {dataset}')
    acc = round(accuracy(path), 3)
    f1 = round(score(path), 3)
    print(f'acc: {acc}      f1: {f1}')

### igpt_v0: insuranceQA + sentiment + travel_insurance

### igpt_v1: insuranceQA


In [5]:
# Test sets to report on, one result line per (model, dataset) pair.
data = [
    'travel_insurance_test',
    'travel_insurance_4_label_test',
    'sentiment_test',
]
# Model output directories compared in this cell.
model = [
    'igpt_v1_rlhf_travel_insurance',
    'igpt_v3_correct',
]
for m in model:
    for d in data:
        out(m, d)
    # Blank line between models keeps the printed report readable.
    print()

-----igpt_v1_rlhf_travel_insurance: travel_insurance_test
acc: 0.751      f1: 0.705
-----igpt_v1_rlhf_travel_insurance: travel_insurance_4_label_test
acc: 0.63      f1: 0.367
-----igpt_v1_rlhf_travel_insurance: sentiment_test
acc: 0.806      f1: 0.726

-----igpt_v3_correct: travel_insurance_test
acc: 0.807      f1: 0.757
-----igpt_v3_correct: travel_insurance_4_label_test
acc: 0.05      f1: 0.024
-----igpt_v3_correct: sentiment_test
acc: 0.857      f1: 0.806

acc: 0.857      f1: 0.806



In [6]:
# Same test sets as the previous evaluation cell, run for another pair of models.
data = [
    'travel_insurance_test',
    'travel_insurance_4_label_test',
    'sentiment_test',
]
# Model output directories compared in this cell.
model = [
    'GPT-3.5',
    'llama3',
]
for m in model:
    for d in data:
        out(m, d)
    # Blank line between models keeps the printed report readable.
    print()

-----GPT-3.5: travel_insurance_test
acc: 0.58      f1: 0.231
-----GPT-3.5: travel_insurance_4_label_test


acc: 0.4      f1: 0.184
-----GPT-3.5: sentiment_test
acc: 0.73      f1: 0.424

-----llama3: travel_insurance_test
acc: 0.608      f1: 0.205
-----llama3: travel_insurance_4_label_test
acc: 0.09      f1: 0.069
-----llama3: sentiment_test
acc: 0.67      f1: 0.528

