In [13]:
import json
import pandas as pd
model_path_list = ['/root/.cache/modelscope/hub/qwen/Qwen1___5-7B-Chat', '/root/.cache/modelscope/hub/LLM-Research/Meta-Llama-3-8B-Instruct',
                   '/root/.cache/modelscope/hub/qwen/Qwen1___5-14B-Chat', '/root/.cache/modelscope/hub/ZhipuAI/chatglm3-6b', 
                   '/root/.cache/modelscope/hub/Shanghai_AI_Laboratory/internlm2-chat-7b', '/root/.cache/modelscope/hub/deepseek-ai/deepseek-llm-7b-chat',
                   '/root/.cache/modelscope/hub/lockonlvange/autoj-13b-fp16',
                   '/root/.cache/huggingface/hub/models--WeOpenML--PandaLM-7B-v1/snapshots/PandaLM',
                   '/root/finetuned/qwen1.5-7b-chat_autoj_trainset_qlora_epoch_5']
model_name_or_path = model_path_list[-1]
model_name = model_name_or_path.rsplit('/', 1)[-1]
evaluation_out_path = '../result/' + model_name + 'autoj_prompt.xlsx'
evaluation_result = {}

In [14]:
def evaluation(result, total):
    if sum(result[2]) != 0:
        accuracy = (result[0][0] + result[1][1] + result[2][2])*100/total
        p0 = result[0][0]/(result[0][0]+result[1][0]+result[2][0])
        p1 = result[1][1]/(result[0][1]+result[1][1]+result[2][1])
        p2 = result[2][2]/(result[0][2]+result[1][2]+result[2][2])
        p = (p0+p1+p2)/3
        r0 = result[0][0]/(sum(result[0]))
        r1 = result[1][1]/(sum(result[1]))
        r2 = result[2][2]/(sum(result[2]))
        r = (r0+r1+r2)/3
    else:
        accuracy = (result[0][0] + result[1][1])*100/total
        p1 = result[0][0]/(result[0][0]+result[1][0])
        p2 = result[1][1]/(result[0][1]+result[1][1])
        p = (p1+p2)/3
        r1 = result[0][0]/(sum(result[0]))
        r2 = result[1][1]/(sum(result[1]))
        r = (r1+r2)/3
    return accuracy, p, r

In [15]:
# autoj
dataset_name = 'autoj'
response_file_path = f'../datasets/AutoJ/{model_name}/testdata_pairwise_response.json'
with open(response_file_path, 'r') as f:
    allin = f.readlines()
    total = 0
    format_error = 0
    result = [[0 for _ in range(3)] for _ in range(3)]
    for line in allin:
        line_json = json.loads(line)
        label = line_json['label']
        pred = line_json['pred_label']
        if pred == -1:
            format_error += 1
            continue
        result[label][pred] += 1
        total += 1
    accuracy, p, r = evaluation(result, total)
    print(f'total: {len(allin)}, format error: {format_error}')
    print("%s accuracy: %.2f, F1 score: %.2f"%(dataset_name, accuracy, 2*p*r*100/(p+r)))
    evaluation_result[dataset_name] = {'accuracy': accuracy, 'f1 score': 2*p*r*100/(p+r), 'total': total, 'format error': format_error}

total: 1392, format error: 1
autoj accuracy: 57.30, F1 score: 55.72


In [16]:
# pandalm
dataset_name = 'pandalm'
response_file_path = f'../datasets/PandaLM/{model_name}/testset-v1_response.json'
with open(response_file_path, 'r') as f:
    allin = f.readlines()
    correct = 0
    format_error = 0
    for line in allin:
        line_json = json.loads(line)
        lable_list = [0, 0, 0]
        for i in range(3):
            lable_list[line_json[f'annotator{i+1}']] += 1
        if(max(lable_list) == 1):
            correct += 1
        else:
            label = (lable_list.index(max(lable_list)) + 2) % 3
        pred = line_json['pred_label']
        if pred == -1:
            format_error += 1
            continue
        result[label][pred] += 1
        total += 1
    accuracy, p, r = evaluation(result, total)
    print(f'total: {len(allin)}, format error: {format_error}')
    print("%s accuracy: %.2f, F1 score: %.2f"%(dataset_name, accuracy, 2*p*r*100/(p+r)))
    evaluation_result[dataset_name] = {'accuracy': accuracy, 'f1 score': 2*p*r*100/(p+r), 'total': total, 'format error': format_error}

total: 999, format error: 0
pandalm accuracy: 62.68, F1 score: 55.17


In [17]:
# llmbar_natural
dataset_name = 'llmbar_natural'
response_file_path = f'../datasets/LLMBar/Natural/{model_name}/dataset_response.json'
with open(response_file_path, 'r') as f:
    allin = f.readlines()
    total = 0
    format_error = 0
    result = [[0 for _ in range(3)] for _ in range(3)]
    for line in allin:
        line_json = json.loads(line)
        label = line_json['label'] - 1
        pred = line_json['pred_label']
        if pred == -1:
            format_error += 1
            continue
        result[label][pred] += 1
        total += 1
    accuracy, p, r = evaluation(result, total)
    print(f'total: {len(allin)}, format error: {format_error}')
    print("%s accuracy: %.2f, F1 score: %.2f"%(dataset_name, accuracy, 2*p*r*100/(p+r)))
    evaluation_result[dataset_name] = {'accuracy': accuracy, 'f1 score': 2*p*r*100/(p+r), 'total': total, 'format error': format_error}

total: 100, format error: 0
llmbar_natural accuracy: 71.00, F1 score: 47.82


In [18]:
# llmbar_neighbor
dataset_name = 'llmbar_neighbor'
response_file_path = f'../datasets/LLMBar/Adversarial/Neighbor/{model_name}/dataset_response.json'
with open(response_file_path, 'r') as f:
    allin = f.readlines()
    total = 0
    format_error = 0
    result = [[0 for _ in range(3)] for _ in range(3)]
    for line in allin:
        line_json = json.loads(line)
        label = line_json['label'] - 1
        pred = line_json['pred_label']
        if pred == -1:
            format_error += 1
            continue
        result[label][pred] += 1
        total += 1
    accuracy, p, r = evaluation(result, total)
    print(f'total: {len(allin)}, format error: {format_error}')
    print("%s accuracy: %.2f, F1 score: %.2f"%(dataset_name, accuracy, 2*p*r*100/(p+r)))
    evaluation_result[dataset_name] = {'accuracy': accuracy, 'f1 score': 2*p*r*100/(p+r), 'total': total, 'format error': format_error}

total: 134, format error: 0
llmbar_neighbor accuracy: 19.40, F1 score: 13.37


In [19]:
# llmbar_gptinst
dataset_name = 'llmbar_gptinst'
response_file_path = f'../datasets/LLMBar/Adversarial/GPTInst/{model_name}/dataset_response.json'
with open(response_file_path, 'r') as f:
    allin = f.readlines()
    total = 0
    format_error = 0
    result = [[0 for _ in range(3)] for _ in range(3)]
    for line in allin:
        line_json = json.loads(line)
        label = line_json['label'] - 1
        pred = line_json['pred_label']
        if pred == -1:
            format_error += 1
            continue
        result[label][pred] += 1
        total += 1
    accuracy, p, r = evaluation(result, total)
    print(f'total: {len(allin)}, format error: {format_error}')
    print("%s accuracy: %.2f, F1 score: %.2f"%(dataset_name, accuracy, 2*p*r*100/(p+r)))
    evaluation_result[dataset_name] = {'accuracy': accuracy, 'f1 score': 2*p*r*100/(p+r), 'total': total, 'format error': format_error}

total: 92, format error: 0
llmbar_gptinst accuracy: 27.17, F1 score: 18.12


In [20]:
# llmbar_gptout
dataset_name = 'llmbar_gptout'
response_file_path = f'../datasets/LLMBar/Adversarial/GPTOut/{model_name}/dataset_response.json'
with open(response_file_path, 'r') as f:
    allin = f.readlines()
    total = 0
    format_error = 0
    result = [[0 for _ in range(3)] for _ in range(3)]
    for line in allin:
        line_json = json.loads(line)
        label = line_json['label'] - 1
        pred = line_json['pred_label']
        if pred == -1:
            format_error += 1
            continue
        result[label][pred] += 1
        total += 1
    accuracy, p, r = evaluation(result, total)
    print(f'total: {len(allin)}, format error: {format_error}')
    print("%s accuracy: %.2f, F1 score: %.2f"%(dataset_name, accuracy, 2*p*r*100/(p+r)))
    evaluation_result[dataset_name] = {'accuracy': accuracy, 'f1 score': 2*p*r*100/(p+r), 'total': total, 'format error': format_error}

total: 47, format error: 0
llmbar_gptout accuracy: 44.68, F1 score: 31.28


In [21]:
# llmbar_manual
dataset_name = 'llmbar_manual'
response_file_path = f'../datasets/LLMBar/Adversarial/Manual/{model_name}/dataset_response.json'
with open(response_file_path, 'r') as f:
    allin = f.readlines()
    total = 0
    format_error = 0
    result = [[0 for _ in range(3)] for _ in range(3)]
    for line in allin:
        line_json = json.loads(line)
        label = line_json['label'] - 1
        pred = line_json['pred_label']
        if pred == -1:
            format_error += 1
            continue
        result[label][pred] += 1
        total += 1
    accuracy, p, r = evaluation(result, total)
    print(f'total: {len(allin)}, format error: {format_error}')
    print("%s accuracy: %.2f, F1 score: %.2f"%(dataset_name, accuracy, 2*p*r*100/(p+r)))
    evaluation_result[dataset_name] = {'accuracy': accuracy, 'f1 score': 2*p*r*100/(p+r), 'total': total, 'format error': format_error}

total: 46, format error: 0
llmbar_manual accuracy: 32.61, F1 score: 21.90


In [22]:
# mt-bench
dataset_name = 'mt-bench'
response_file_path = f'../datasets/MTBench/{model_name}/mt_bench_human_response_type1.json'
with open(response_file_path, 'r') as f:
    allin = f.readlines()
    correct = 0
    format_error = 0
    for line in allin:
        line_json = json.loads(line)
        winner = line_json['winner']
        if winner == 'model_a':
            label = 0
        elif winner == 'model_b':
            label = 1
        elif winner == 'tie':
            label = 2
        else:
            print('error')
        pred = line_json['pred_label']
        if pred == -1:
            format_error += 1
            continue
        result[label][pred] += 1
        total += 1
    accuracy, p, r = evaluation(result, total)
    print(f'total: {len(allin)}, format error: {format_error}')
    print("%s accuracy: %.2f, F1 score: %.2f"%(dataset_name, accuracy, 2*p*r*100/(p+r)))
    evaluation_result[dataset_name] = {'accuracy': accuracy, 'f1 score': 2*p*r*100/(p+r), 'total': total, 'format error': format_error}

total: 3355, format error: 0
mt-bench accuracy: 56.28, F1 score: 49.56


In [23]:
average = {'f1 score' : 0, 'accuracy': 0, 'total': '', 'format error': 0}
for key in evaluation_result:
    average['f1 score'] += evaluation_result[key]['f1 score']
    average['accuracy'] += evaluation_result[key]['accuracy']
    average['format error'] += evaluation_result[key]['format error']
average['f1 score'] /= 8
average['accuracy'] /= 8
average['format error'] /= 8
evaluation_result['average'] = average
result_df = pd.DataFrame(evaluation_result)
result_df.to_excel(evaluation_out_path)