In [6]:
import statistics

import eval


def evaluate(result_array, metric_names=None):
    """Aggregate per-prediction metrics into summary statistics.

    Each row of ``result_array`` is indexed by metric name and must also
    provide an ``'ed'`` entry; a row counts as a correct prediction when its
    ``'ed'`` value equals 0 (presumably an edit distance -- confirm against
    the producer of the rows).

    Args:
        result_array: Sized, re-iterable collection of rows (e.g. a list of
            dicts).
        metric_names: Keys to average over all rows. Defaults to the
            module-level ``metrics`` list when omitted.

    Returns:
        A dict mapping every requested metric name to its arithmetic mean,
        plus ``'accuracy'``: the fraction of rows whose ``'ed'`` is 0.

    Raises:
        statistics.StatisticsError: If ``result_array`` is empty.

    Side effects:
        Prints a "<correct>/<total> correct predictions" line to stdout.
    """
    # Resolve the default lazily so the module-level list may be defined
    # after this function.
    metric_names = metrics if metric_names is None else metric_names

    total = len(result_array)
    if total == 0:
        # Same exception type statistics.mean() raises for empty data, but
        # raised up front so the failure does not depend on metric_names.
        raise statistics.StatisticsError(
            'evaluate() requires at least one result row')

    stats_dict = {
        name: statistics.mean(row[name] for row in result_array)
        for name in metric_names
    }

    nr_correct = sum(1 for row in result_array if row['ed'] == 0)
    stats_dict['accuracy'] = nr_correct / total
    print(f'\t{nr_correct}/{total} correct predictions')

    return stats_dict


# Per-row metric keys that evaluate() averages over a result set.
metrics = ['ed', 'red', 'gleu', 'meteor']
# Keys in report order: the derived 'accuracy' first, then the averaged metrics.
metrics1 = ['accuracy', *metrics]

# Baseline
def _report_baseline(dataset, path, limit=1000):
    """Print the summary statistics of one baseline and return its data.

    Loads the rows for ``dataset`` from ``path`` via ``eval.evaluate``, keeps
    the first ``limit`` of them, prints a heading followed by one line per key
    in ``metrics1``, and returns ``(rows, stats)``.

    The heading and the slice share ``limit`` so the printed label cannot
    drift away from the number of rows actually requested.
    """
    print(f'{dataset} (First {limit})')
    rows = eval.evaluate(path, dataset)[:limit]
    stats = evaluate(rows)
    for name in metrics1:
        print(f'\t{name.capitalize()}: {stats[name]}')
    return rows, stats


## CUP (First 1000)
cup_result, cup_d = _report_baseline(
    'CUP', '../result/baseline/CUP_first1000.jsonl')

## HebCup (First 1000)
hebcup_result, hebcup_d = _report_baseline(
    'HebCup', '../result/baseline/HebCup_first1000.jsonl')

# LLM
# Each entry of `result` is used below as an ordered list of candidate rows
# for one sample (indexed with [0] and scanned by its 'meteor' value).
result = eval.evaluate('../result/candidates/candidates-20240328_154708.jsonl')

count = len(result)

# Report the number of samples actually loaded instead of a hard-coded figure,
# so the heading stays truthful if the candidates file changes size.
print(f'LLM (First {count})')

# Top-1: the first candidate listed for every sample.
simple_top1 = [candidates[0] for candidates in result]
d1 = evaluate(simple_top1)

# Oracle pick: per sample, the candidate with the highest 'meteor' value.
# max() returns the first maximal item, which is the same element a stable
# descending sort would place at index 0.
theoretical_best = [
    max(candidates, key=lambda cand: cand['meteor']) for candidates in result
]
d2 = evaluate(theoretical_best)

# NOTE(review): the "Recall@5" figure is the accuracy of the best-'meteor'
# candidate. It equals true recall only if a candidate with ed == 0 always
# has the top 'meteor' value in its list, and the "5" assumes five candidates
# per sample -- confirm both against the candidate generator.
print(f'\tAccuracy: {d1["accuracy"]}')
print(f'\tRecall@5: {d2["accuracy"]}')

for section_title, section_stats in (
    ('Top1:', d1),
    ('Theoretical best in top5:', d2),
):
    print(section_title)
    for metric_ in metrics:
        print(f'\t{metric_.capitalize()}: {section_stats[metric_]:.3f}')




CUP (First 1000)
	163/1000 correct predictions
	Accuracy: 0.163
	Ed: 2.845
	Red: 0.9499214285714286
	Gleu: 68.11386381064486
	Meteor: 83.7002437317202
HebCup (First 1000)
	155/1000 correct predictions
	Accuracy: 0.155
	Ed: 2.95
	Red: 0.9766881313131313
	Gleu: 66.7704118548645
	Meteor: 84.84403325038214
LLM (First 1000)
	158/1000 correct predictions
	226/1000 correct predictions
	Accuracy: 0.158
	Recall@5: 0.226
Top1:
	Ed: 9.432
	Red: 4.585
	Gleu: 52.644
	Meteor: 74.794
Theoretical best in top5:
	Ed: 5.497
	Red: 2.560
	Gleu: 63.788
	Meteor: 83.964
