In [6]:
import statistics

import pandas

import eval
from sorter import *


def evaluate(result_array):
    """Summarise per-sample scores into a dict of display strings.

    For every name in the module-level ``metrics`` list, the arithmetic mean
    of that key over ``result_array`` is stored, formatted to three decimals.
    Two further entries are added afterwards: ``'accuracy'`` (share of
    entries whose ``'ed'`` value equals 0) and ``'correct/total'`` (the raw
    counts, prefixed with a tab character).

    Args:
        result_array: Sized, re-iterable collection of mappings. Each entry
            must provide every key listed in ``metrics`` as well as ``'ed'``.

    Returns:
        dict mapping each metric name (plus ``'accuracy'`` and
        ``'correct/total'``) to its formatted string.

    Note:
        An empty ``result_array`` is not supported: the mean and the
        accuracy division both fail on it.
    """
    summary = {}
    for name in metrics:
        values = [entry[name] for entry in result_array]
        summary[name] = format(statistics.mean(values), '.3f')

    # An entry counts as correct when its 'ed' value is exactly 0.
    exact_matches = sum(1 for entry in result_array if entry['ed'] == 0)
    total = len(result_array)
    summary['accuracy'] = f'{exact_matches / total:.3f}'
    summary['correct/total'] = f'\t{exact_matches}/{total}'

    return summary


def print_all_metrics(d):
    """Print one tab-indented ``Name: value`` line per entry of ``metrics1``.

    Args:
        d: Mapping that provides every key listed in the module-level
            ``metrics1``. Values may be numbers or already-formatted strings.

    Raises:
        KeyError: if ``d`` lacks one of the keys in ``metrics1``.
    """
    for metric in metrics1:
        value = d[metric]
        # The dicts built by evaluate() hold pre-formatted strings; applying
        # the ':.3f' format spec to a str raises ValueError, so strings are
        # printed unchanged and only numeric values are formatted here.
        text = value if isinstance(value, str) else f'{value:.3f}'
        print(f'\t{metric.capitalize()}: {text}')


class ResultTable:
    """Accumulates one row of metric values per method and renders them as Markdown."""

    def __init__(self):
        # Each row is a dict: 'Method' plus one capitalized column per metric.
        self.rows = []

    def add(self, title: str, d: dict):
        """Append a row labelled ``title`` built from the ``metrics1`` keys of ``d``.

        Column names are the metric names passed through ``str.capitalize``.
        """
        entry = {'Method': title}
        for name in metrics1:
            entry[name.capitalize()] = d[name]
        self.rows.append(entry)

    def to_markdown(self):
        """Return the collected rows as a Markdown table without the index column."""
        return pandas.DataFrame(self.rows).to_markdown(index=False)


# Per-sample metric keys that evaluate() averages over the result entries.
metrics = ['ed', 'red', 'gleu', 'meteor', 'rouge-recall', 'rouge-f1']
# Keys read by print_all_metrics() and ResultTable.add(), in display order:
# the two summary entries added by evaluate() followed by the averaged metrics.
metrics1 = ['correct/total', 'accuracy'] + metrics
# Shared table that collects one row per evaluated method in this script.
rt = ResultTable()

# Baseline
# Each baseline result file is loaded through eval.evaluate(), aggregated with
# the local evaluate(), and added to the shared table as one row.
# The second argument to eval.evaluate() presumably selects the baseline's
# output format — confirm in the eval module.
# NOTE(review): the file names say "first1000", but the recorded output of
# this cell lists 9204 samples for both baselines — confirm which subset
# these files actually contain.

## CUP (First 1000)
cup_result = eval.evaluate('../result/baseline/CUP_first1000.jsonl', 'CUP')
cup_d = evaluate(cup_result)
rt.add('CUP', cup_d)

## HebCup (First 1000)
hebcup_result = eval.evaluate('../result/baseline/HebCup_first1000.jsonl', 'HebCup')
hebcup_d = evaluate(hebcup_result)
rt.add('HebCup', hebcup_d)


# LLM
def get_first_candidates(l: list, sorter) -> list:
    """Return, for each element of ``l``, the first item of ``sorter(element)``.

    Args:
        l: List of candidate collections, one per sample.
        sorter: Callable applied to each element; its result must be
            indexable and non-empty.

    Returns:
        A list with one selected item per element of ``l``.
    """
    return [sorter(candidates)[0] for candidates in l]


# Candidate files from other runs, kept so the evaluated model can be switched
# by (un)commenting a single line.
# result = eval.evaluate('../result/candidates/candidates-20240407_154640.jsonl') # openorca
# result = eval.evaluate('../result/candidates/candidates-20240408_151702.jsonl') # gemma
result = eval.evaluate('../result/candidates/candidates-20240408_173014.jsonl') # llama2

# Number of loaded result entries (not used further in this cell).
count = len(result)

# Every variant below reorders each entry of `result` with a different sorter,
# keeps only the first item via get_first_candidates(), aggregates the picks
# with evaluate(), and adds one row to the shared table `rt`.
# The sort_by_* helpers are not defined in this cell; presumably they come
# from `from sorter import *` — confirm their ranking semantics there.

# Identity sorter: take the first candidate in the stored order.
simple_top1 = get_first_candidates(result, lambda x: x)
d1 = evaluate(simple_top1)
rt.add('LLM Top1', d1)

# NOTE(review): presumably an oracle upper bound that ranks candidates by their
# actual 'gleu' score — confirm the meaning of the 'gleu'/True arguments.
theoretical_best = get_first_candidates(result, lambda x: sort_by_evaluation_metric(x, 'gleu', True))
d2 = evaluate(theoretical_best)
rt.add('LLM Optimal Top5', d2)

# ROUGE-based selection; per the row labels 'r' / 'p' / 'f' stand for
# recall / precision / F1. The `rouge` variable is reused for each variant.
rouge = get_first_candidates(result, lambda x: sort_by_rouge(x, 'r'))
rt.add('LLM rouge recall', evaluate(rouge))

rouge = get_first_candidates(result, lambda x: sort_by_rouge(x, 'p'))
rt.add('LLM rouge precision', evaluate(rouge))

rouge = get_first_candidates(result, lambda x: sort_by_rouge(x, 'f'))
rt.add('LLM rouge f1', evaluate(rouge))

# Selection through the Levenshtein-distance sorter.
edd = get_first_candidates(result, lambda x: sort_by_levenshtein_distance(x))
rt.add('LLM levenshtein distance', evaluate(edd))

# Selection through the GLEU sorter.
gleu = get_first_candidates(result, lambda x: sort_by_gleu(x))
rt.add('LLM gleu', evaluate(gleu))

# Render all collected rows as a single Markdown table.
print(rt.to_markdown())


| Method                   | Correct/total   |   Accuracy |    Ed |   Red |   Gleu |   Meteor |   Rouge-recall |   Rouge-f1 |
|:-------------------------|:----------------|-----------:|------:|------:|-------:|---------:|---------------:|-----------:|
| CUP                      | 1474/9204       |      0.16  | 2.509 | 0.931 | 65.209 |   79.947 |         81.584 |     81.092 |
| HebCup                   | 1129/9204       |      0.123 | 2.912 | 1.199 | 60.976 |   80.057 |         79.463 |     77.802 |
| LLM Top1                 | 85/1000         |      0.085 | 7.896 | 4.526 | 42.503 |   61.718 |         64.729 |     59.114 |
| LLM Optimal Top5         | 129/1000        |      0.129 | 5.022 | 2.666 | 55.1   |   73.581 |         76.172 |     71.845 |
| LLM rouge recall         | 104/1000        |      0.104 | 6.615 | 3.612 | 50.881 |   71.522 |         75.529 |     68.622 |
| LLM rouge precision      | 117/1000        |      0.117 | 5.118 | 2.72  | 53.424 |   72.171 |         74.792 |     7