In [1]:
from sorter import *
import statistics

import eval


def evaluate(result_array):
    """Aggregate per-prediction scores into summary statistics.

    For every name in the module-level ``metrics`` list, the arithmetic
    mean of that key over all rows is computed. A row counts as a correct
    prediction when its ``'ed'`` value equals 0; the number of such rows
    is printed and their share is stored under ``'accuracy'``.

    Args:
        result_array: Sized collection of mappings. Each mapping must
            provide every key listed in ``metrics`` as well as ``'ed'``.

    Returns:
        A dict holding one mean per metric (in ``metrics`` order) followed
        by the ``'accuracy'`` ratio.

    Raises:
        statistics.StatisticsError: If ``result_array`` is empty while
            ``metrics`` is not (``statistics.mean`` rejects empty data).
        ZeroDivisionError: If both ``result_array`` and ``metrics`` are
            empty.
    """
    # Metric means go in first so 'accuracy' ends up as the last key.
    summary = {
        name: statistics.mean(row[name] for row in result_array)
        for name in metrics
    }

    total = len(result_array)
    exact_matches = sum(1 for row in result_array if row['ed'] == 0)
    summary['accuracy'] = exact_matches / total
    print(f'\t{exact_matches}/{total} correct predictions')

    return summary

def print_all_metrics(d):
    """Print one tab-indented ``Name: value`` line per reported metric.

    The metric names and their order come from the module-level
    ``metrics1`` list; each name is capitalized for display.

    Args:
        d: Mapping from metric name to value. It must contain every name
            listed in ``metrics1``.
    """
    for name in metrics1:
        label = name.capitalize()
        value = d[name]
        print(f'\t{label}: {value}')


# Per-prediction score keys that evaluate() averages over all rows.
metrics = ['ed', 'red', 'gleu', 'meteor']
# Display order used by print_all_metrics(): accuracy first, then the means.
metrics1 = ['accuracy', *metrics]

# Baseline (the CUP / HebCup runs below are currently commented out)

# ## CUP (First 1000)
# print('CUP (First 1000)')
# cup_result = eval.evaluate('../result/baseline/CUP_first1000.jsonl', 'CUP')[:1000]
# cup_d = evaluate(cup_result)
# print_all_metrics(cup_d)
# 
# ## HebCup (First 1000)
# print('HebCup (First 1000)')
# hebcup_result = eval.evaluate('../result/baseline/HebCup_first1000.jsonl', 'HebCup')[:1000]
# hebcup_d = evaluate(hebcup_result)
# print_all_metrics(hebcup_d)

# LLM
def get_first_candidates(l: list, sorter) -> list:
    """Pick the top-ranked element of every entry in ``l``.

    Args:
        l: List of entries; each entry is handed to ``sorter`` unchanged.
        sorter: Callable that takes one entry and returns an indexable
            sequence. Element 0 of that sequence is kept.

    Returns:
        A new list with one selected element per entry, in input order.

    Raises:
        IndexError: If ``sorter`` returns an empty sequence for an entry.
    """
    return [sorter(entry)[0] for entry in l]

# Load the LLM candidate results. `eval` here is the project module imported
# above (it shadows the builtin of the same name). Each entry of `result` is
# passed to a sorter and indexed with [0] below, so it is presumably a list of
# scored candidates for one sample -- TODO confirm against eval.evaluate.
result = eval.evaluate('../result/candidates/candidates-20240328_154708.jsonl')

# NOTE(review): `count` is not used anywhere in the visible part of this cell.
count = len(result)

# NOTE(review): the header hard-codes 1000 and `result` is not sliced here
# (the commented-out baseline slices with [:1000]) -- confirm the file really
# holds exactly the first 1000 samples.
print(f'LLM (First 1000)')

# Identity sorter: keep each entry's stored order and score its first candidate.
print('> Top1:')
simple_top1 = get_first_candidates(result, lambda x:x)
d1 = evaluate(simple_top1)
print_all_metrics(d1)

# Rank by the 'gleu' evaluation metric itself; presumably an oracle upper
# bound for any re-ranking strategy -- verify sort_by_evaluation_metric and
# the meaning of its third argument in sorter.py.
print('> Optimal Top5:')
theoretical_best = get_first_candidates(result,  lambda x: sort_by_evaluation_metric(x, 'gleu', True))
d2 = evaluate(theoretical_best)
print_all_metrics(d2)

# The three ROUGE variants below rebind the same `rouge` name; only the last
# selection (mode 'f') is still bound after this cell has run.
print('> Sort by rouge recall')
rouge = get_first_candidates(result, lambda  x: sort_by_rouge(x, 'r'))
print_all_metrics(evaluate(rouge))

print('> Sort by rouge precision')
rouge = get_first_candidates(result, lambda  x: sort_by_rouge(x, 'p'))
print_all_metrics(evaluate(rouge))

print('> Sort by rouge ƒ1')
rouge = get_first_candidates(result, lambda  x: sort_by_rouge(x, 'f'))
print_all_metrics(evaluate(rouge))

# Re-rank with the Levenshtein-distance sorter from sorter.py.
print('> Sort by levenshtein distance')
edd = get_first_candidates(result, lambda  x: sort_by_levenshtein_distance(x))
print_all_metrics(evaluate(edd))

# Re-rank with the GLEU sorter from sorter.py (distinct from the
# sort_by_evaluation_metric call above -- see sorter.py for the difference).
print('> Sort by gleu')
gleu = get_first_candidates(result, lambda  x: sort_by_gleu(x))
print_all_metrics(evaluate(gleu))



LLM (First 1000)
> Top1:
	177/1000 correct predictions
	Accuracy: 0.177
	Ed: 8.298
	Red: 4.434797763347763
	Gleu: 50.43027101817904
	Meteor: 71.51160847113778
> Optimal Top5:
	255/1000 correct predictions
	Accuracy: 0.255
	Ed: 4.518
	Red: 2.224820562770563
	Gleu: 62.95210913858119
	Meteor: 80.96390942368838
> Sort by rouge recall
	190/1000 correct predictions
	Accuracy: 0.19
	Ed: 7.188
	Red: 3.6491228354978356
	Gleu: 55.793363608427356
	Meteor: 77.5791608100043
> Sort by rouge precision
	218/1000 correct predictions
	Accuracy: 0.218
	Ed: 4.735
	Red: 2.329260461760462
	Gleu: 60.33835548338917
	Meteor: 78.83147036027287
> Sort by rouge ƒ1
	216/1000 correct predictions
	Accuracy: 0.216
	Ed: 4.878
	Red: 2.392048556998557
	Gleu: 60.31815696840139
	Meteor: 79.21078173363045
> Sort by levenshtein distance
	224/1000 correct predictions
	Accuracy: 0.224
	Ed: 4.276
	Red: 1.9742834054834055
	Gleu: 60.65087004693572
	Meteor: 78.41884282500656
> Sort by gleu
	223/1000 correct predictions
	Accuracy: