In [1]:
import statistics

import eval


def evaluate(result_array, metric_names=None):
    """Summarise per-prediction metric rows into averages plus accuracy.

    Args:
        result_array: Sized iterable of mappings. Every row must contain
            an ``'ed'`` entry and one entry per name in ``metric_names``.
        metric_names: Keys to average over all rows. When omitted, the
            module-level ``metrics`` list is used (looked up at call time).

    Returns:
        A dict mapping each metric name to its mean, plus ``'accuracy'``:
        the fraction of rows whose ``'ed'`` value equals 0.

    Raises:
        statistics.StatisticsError: If ``result_array`` is empty.

    Side effects:
        Prints a tab-indented "<correct>/<total> correct predictions" line.
    """
    if metric_names is None:
        # Resolved here rather than in the signature because the global
        # list is defined further down in the file than this function.
        metric_names = metrics

    total = len(result_array)
    if total == 0:
        # Fail with a clear message instead of an opaque error from mean()
        # or a ZeroDivisionError in the accuracy computation below.
        raise statistics.StatisticsError(
            'evaluate() requires at least one result row'
        )

    stats_dict = {
        name: statistics.mean(row[name] for row in result_array)
        for name in metric_names
    }

    # A row counts as a correct prediction when its 'ed' value is 0.
    nr_correct = sum(1 for row in result_array if row['ed'] == 0)
    stats_dict['accuracy'] = nr_correct / total
    print(f'\t{nr_correct}/{total} correct predictions')

    return stats_dict


# Keys of the per-prediction values that evaluate() averages.
metrics = [
    'ed',
    'red',
    'gleu',
    'meteor',
]
# Order used when printing a report: the derived 'accuracy' entry first,
# followed by every averaged metric.
metrics1 = ['accuracy', *metrics]

# Baseline


def _report_baseline(title, path, dataset):
    """Print the report for one baseline and return its data.

    Prints ``title``, loads the results via ``eval.evaluate(path, dataset)``,
    keeps the first 1000 entries, summarises them with ``evaluate`` and
    prints one tab-indented line per entry of ``metrics1``.

    Returns:
        A ``(result, stats)`` tuple: the truncated result list and the
        summary dict produced by ``evaluate``.
    """
    # The title is printed before loading so any output emitted while
    # loading appears underneath its heading.
    print(title)
    result = eval.evaluate(path, dataset)[:1000]
    stats = evaluate(result)
    for metric_ in metrics1:
        print(f'\t{metric_.capitalize()}: {stats[metric_]}')
    return result, stats


## CUP (First 1000)
cup_result, cup_d = _report_baseline(
    'CUP (First 1000)',
    '../result/baseline/CUP_first1000.jsonl',
    'CUP',
)

## HebCup (First 1000)
hebcup_result, hebcup_d = _report_baseline(
    'HebCup (First 1000)',
    '../result/baseline/HebCup_first1000.jsonl',
    'HebCup',
)

# LLM
# Keep only the first 1000 entries, as the baseline reports do, so that the
# "First 1000" heading printed below is accurate and the numbers are
# computed over the same number of samples.
result = eval.evaluate('../result/candidates/candidates-20240328_154708.jsonl')[:1000]

# Number of evaluated entries (not used in this cell).
count = len(result)

print('LLM (First 1000)')

# Top-1: the first candidate of every entry.
simple_top1 = [candidates[0] for candidates in result]
d1 = evaluate(simple_top1)

# Best case: for every entry, the candidate with the highest 'gleu' value.
# max() returns the first maximal element, which is the same element the
# stable descending sort used previously would place first.
theoretical_best = [
    max(candidates, key=lambda cand: cand['gleu'])
    for candidates in result
]
d2 = evaluate(theoretical_best)

print(f'\tAccuracy: {d1["accuracy"]}')
# Reported as Recall@5: accuracy of the best-GLEU candidate per entry
# (presumably five candidates per entry -- confirm against the data file).
print(f'\tRecall@5: {d2["accuracy"]}')

print('Top1:')
for metric_ in metrics:
    print(f'\t{metric_.capitalize()}: {d1[metric_]:.3f}')

print('Theoretical best in top5:')
for metric_ in metrics:
    print(f'\t{metric_.capitalize()}: {d2[metric_]:.3f}')




CUP (First 1000)
	166/1000 correct predictions
	Accuracy: 0.166
	Ed: 2.592
	Red: 0.9276436507936507
	Gleu: 65.49282583663754
	Meteor: 80.01465187784005
HebCup (First 1000)
	121/1000 correct predictions
	Accuracy: 0.121
	Ed: 2.974
	Red: 1.1776805194805195
	Gleu: 61.455580947970724
	Meteor: 80.31157918256051
LLM (First 1000)
	177/1000 correct predictions
	255/1000 correct predictions
	Accuracy: 0.177
	Recall@5: 0.255
Top1:
	Ed: 8.298
	Red: 4.435
	Gleu: 50.430
	Meteor: 71.512
Theoretical best in top5:
	Ed: 4.518
	Red: 2.225
	Gleu: 62.952
	Meteor: 80.964
