In [1]:
from sorter import *
import statistics

import eval


def evaluate(result_array):
    """Summarise a collection of scored predictions.

    For every key in the module-level ``metrics`` list, the arithmetic mean
    of that key over all rows of ``result_array`` is stored under the same
    key. An additional ``'accuracy'`` entry holds the fraction of rows whose
    ``'ed'`` value equals 0. The number of such rows is also printed.

    Args:
        result_array: sized iterable of mappings; each row is indexed with
            every name in ``metrics`` and with ``'ed'``.

    Returns:
        dict mapping each metric name (followed by ``'accuracy'``) to its
        aggregated value.
    """
    # Means are computed first, one metric at a time, in ``metrics`` order.
    summary = {
        name: statistics.mean([row[name] for row in result_array])
        for name in metrics
    }

    # A prediction counts as correct when its 'ed' value is exactly zero.
    exact_matches = sum(1 for row in result_array if row['ed'] == 0)
    total = len(result_array)

    summary['accuracy'] = exact_matches / total
    print(f'\t{exact_matches}/{total} correct predictions')

    return summary

def print_all_metrics(d):
    """Print one tab-indented line per entry of the module-level ``metrics1``.

    Each line shows the capitalised metric name followed by the matching
    value from ``d`` formatted with three decimal places.

    Args:
        d: mapping that contains a numeric value for every name in
            ``metrics1``.
    """
    for name in metrics1:
        label = name.capitalize()
        value = d[name]
        print('\t{}: {:.3f}'.format(label, value))


# Keys that evaluate() averages over all result rows; every row is indexed
# with each of these names.
metrics = ['ed', 'red', 'gleu', 'meteor', 'rouge-recall', 'rouge-f1']
# Keys printed by print_all_metrics(), in display order: the derived
# 'accuracy' entry first, followed by the averaged metrics.
metrics1 = ['accuracy'] + metrics

# Baseline
#
# NOTE: `eval` below is the module imported at the top of this file (it
# shadows the builtin of the same name), and eval.evaluate() is a different
# function from the local evaluate() defined above.

## CUP (First 1000)
print('CUP (First 1000)')
# Load the CUP baseline results and keep at most the first 1000 entries.
# Presumably each entry is a mapping holding the metric keys listed in
# `metrics` -- TODO confirm against eval.evaluate.
cup_result = eval.evaluate('../result/baseline/CUP_first1000.jsonl', 'CUP')[:1000]
# Aggregate the per-entry scores, then print them in `metrics1` order.
cup_d = evaluate(cup_result)
print_all_metrics(cup_d)

## HebCup (First 1000)
print('HebCup (First 1000)')
# Same procedure as for CUP, applied to the HebCup baseline result file.
hebcup_result = eval.evaluate('../result/baseline/HebCup_first1000.jsonl', 'HebCup')[:1000]
hebcup_d = evaluate(hebcup_result)
print_all_metrics(hebcup_d)

# LLM
def get_first_candidates(l: list, sorter) -> list:
    """Pick the top-ranked element of every entry in ``l``.

    Args:
        l: list of entries; each entry is handed to ``sorter`` unchanged.
        sorter: callable that receives one entry and returns an indexable
            sequence; its element at index 0 is taken as the pick.

    Returns:
        A new list with one picked element per entry, in input order.
    """
    return [sorter(candidates)[0] for candidates in l]

# Load the LLM candidate results. Unlike the baseline calls, no second
# argument is passed and the result is not sliced with [:1000].
# NOTE(review): the heading printed below says "First 1000" -- confirm the
# file itself holds exactly 1000 entries.
# Presumably every entry is a sequence of scored candidates for one sample
# (each candidate carrying the keys in `metrics`) -- TODO confirm.
result = eval.evaluate('../result/candidates/candidates-20240328_154708.jsonl')

# NOTE(review): `count` is not read anywhere in the visible part of this file.
count = len(result)

# NOTE(review): this f-string has no placeholders; a plain string would do.
print(f'LLM (First 1000)')

# Top1: the identity "sorter" keeps the stored candidate order, so the
# candidate at index 0 of every entry is evaluated.
print('> Top1:')
simple_top1 = get_first_candidates(result, lambda x:x)
d1 = evaluate(simple_top1)
print_all_metrics(d1)

# Optimal Top5: take the first candidate after sort_by_evaluation_metric
# orders each entry by 'gleu'.
# Presumably this is an oracle upper bound that uses the reference-based gleu
# score, with five candidates per entry; the meaning of the `True` flag is
# not visible here -- TODO confirm in the sorter module.
print('> Optimal Top5:')
theoretical_best = get_first_candidates(result,  lambda x: sort_by_evaluation_metric(x, 'gleu', True))
d2 = evaluate(theoretical_best)
print_all_metrics(d2)

# The remaining sections re-rank each entry with a different sort_by_* helper
# and evaluate the candidate each helper places first.
# `rouge` is rebound for each of the three rouge variants below.
# Presumably 'r', 'p' and 'f' select recall, precision and F1 (matching the
# printed headings) -- TODO confirm against sort_by_rouge.
print('> Sort by rouge recall')
rouge = get_first_candidates(result, lambda  x: sort_by_rouge(x, 'r'))
print_all_metrics(evaluate(rouge))

print('> Sort by rouge precision')
rouge = get_first_candidates(result, lambda  x: sort_by_rouge(x, 'p'))
print_all_metrics(evaluate(rouge))

print('> Sort by rouge ƒ1')
rouge = get_first_candidates(result, lambda  x: sort_by_rouge(x, 'f'))
print_all_metrics(evaluate(rouge))

print('> Sort by levenshtein distance')
edd = get_first_candidates(result, lambda  x: sort_by_levenshtein_distance(x))
print_all_metrics(evaluate(edd))

print('> Sort by gleu')
gleu = get_first_candidates(result, lambda  x: sort_by_gleu(x))
print_all_metrics(evaluate(gleu))



CUP (First 1000)
	166/1000 correct predictions
	Accuracy: 0.166
	Ed: 2.592
	Red: 0.928
	Gleu: 65.493
	Meteor: 80.015
	Rouge-recall: 81.819
	Rouge-f1: 81.247
HebCup (First 1000)
	121/1000 correct predictions
	Accuracy: 0.121
	Ed: 2.974
	Red: 1.178
	Gleu: 61.456
	Meteor: 80.312
	Rouge-recall: 79.885
	Rouge-f1: 77.971
LLM (First 1000)
> Top1:
	177/1000 correct predictions
	Accuracy: 0.177
	Ed: 8.298
	Red: 4.435
	Gleu: 50.430
	Meteor: 71.512
	Rouge-recall: 75.822
	Rouge-f1: 66.370
> Optimal Top5:
	255/1000 correct predictions
	Accuracy: 0.255
	Ed: 4.518
	Red: 2.225
	Gleu: 62.952
	Meteor: 80.964
	Rouge-recall: 83.780
	Rouge-f1: 77.216
> Sort by rouge recall
	190/1000 correct predictions
	Accuracy: 0.190
	Ed: 7.188
	Red: 3.649
	Gleu: 55.793
	Meteor: 77.579
	Rouge-recall: 83.019
	Rouge-f1: 72.160
> Sort by rouge precision
	218/1000 correct predictions
	Accuracy: 0.218
	Ed: 4.735
	Red: 2.329
	Gleu: 60.338
	Meteor: 78.831
	Rouge-recall: 81.740
	Rouge-f1: 75.886
> Sort by rouge ƒ1
	216/1000 corr