In [1]:
import pandas
import statistics

import eval
from sorter import *
from py_markdown_table.markdown_table import markdown_table


def evaluate(result_array):
    """Summarise per-sample results as formatted strings.

    For every name in the module-level ``metrics`` list, the mean of that
    field over ``result_array`` is stored as a three-decimal string.  Two
    aggregate entries are appended afterwards:

    * ``'accuracy'``      - share of rows whose ``'ed'`` value equals 0,
      formatted to three decimals.
    * ``'correct/total'`` - ``"<correct>/<total>"`` prefixed with a tab.

    Every value in the returned dict is a ``str``, not a number.
    """
    # Means first, in the order given by ``metrics``, so the resulting dict
    # keeps the same key order as the metric list.
    summary = {
        name: f'{statistics.mean(entry[name] for entry in result_array):.3f}'
        for name in metrics
    }

    # A row counts as correct when its 'ed' field is exactly 0.
    exact_matches = sum(1 for entry in result_array if entry['ed'] == 0)
    total = len(result_array)

    summary['accuracy'] = f'{exact_matches / total:.3f}'
    summary['correct/total'] = f'\t{exact_matches}/{total}'
    return summary


def print_all_metrics(d):
    """Print one tab-indented ``Name: value`` line per entry of ``metrics1``.

    ``d`` maps each name in the module-level ``metrics1`` list to its value.
    Values that accept the ``.3f`` format spec are printed with three
    decimals.  Anything else is printed via ``str()``; this covers the
    already-formatted strings that ``evaluate()`` stores, which would raise
    ``ValueError`` under a float format spec.

    Raises ``KeyError`` if ``d`` lacks one of the names in ``metrics1``.
    """
    for metric in metrics1:
        value = d[metric]
        try:
            rendered = format(value, '.3f')
        except (TypeError, ValueError):
            # Not float-formattable (e.g. a pre-formatted string such as
            # '0.166' or '\t166/1000'): show it unchanged.
            rendered = str(value)
        print(f'\t{metric.capitalize()}: {rendered}')

class ResultTable:
    """Collects one row of metric values per method and renders them as Markdown."""

    def __init__(self):
        # Each row is a dict: {'Method': <title>, <Capitalised metric>: <value>, ...}
        self.rows = []

    def add(self, title: str, d: dict):
        """Append a row labelled ``title`` with the values of ``d``.

        Only the keys listed in the module-level ``metrics1`` are copied, in
        that order; each column name is the metric name passed through
        ``str.capitalize``.
        """
        entry = {'Method': title}
        for name in metrics1:
            entry[name.capitalize()] = d[name]
        self.rows.append(entry)

    def to_markdown(self):
        """Return the collected rows as a Markdown table string without the index column."""
        frame = pandas.DataFrame(self.rows)
        return frame.to_markdown(index=False)
        


# Per-sample metric keys; evaluate() averages each of these over the result rows.
metrics = ['ed', 'red', 'gleu', 'meteor', 'rouge-recall', 'rouge-f1']
# Reporting order used by print_all_metrics() and ResultTable.add(): the two
# aggregate keys that evaluate() adds, followed by the per-sample metrics.
metrics1 = ['correct/total', 'accuracy'] + metrics
# Shared table that receives one row per evaluated method below.
rt = ResultTable()

# Baseline
# NOTE: `eval` below is the project module imported at the top of the file
# (it shadows the builtin), while the bare `evaluate` is the local summary helper.

## CUP (First 1000)
# Load the CUP baseline results and keep at most the first 1000 entries.
cup_result = eval.evaluate('../result/baseline/CUP_first1000.jsonl', 'CUP')[:1000]
# Summarise them and record the summary as the 'CUP' row.
cup_d = evaluate(cup_result)
rt.add('CUP', cup_d)

## HebCup (First 1000)
# Same procedure for the HebCup baseline file.
hebcup_result = eval.evaluate('../result/baseline/HebCup_first1000.jsonl', 'HebCup')[:1000]
hebcup_d = evaluate(hebcup_result)
rt.add('HebCup', hebcup_d)


# LLM
def get_first_candidates(l: list, sorter) -> list:
    """Return, for each element of ``l``, the first item of ``sorter(element)``.

    ``sorter`` is called once per element and must return something
    indexable; an empty result raises ``IndexError``.
    """
    return [sorter(candidates)[0] for candidates in l]


# Load the LLM candidate results via the project `eval` module. Each element is
# later passed to a sorter and indexed with [0], so it is presumably a
# sequence of candidate results per sample - TODO confirm against eval.evaluate.
result = eval.evaluate('../result/candidates/candidates-20240328_154708.jsonl')

# Number of loaded samples (not referenced again in the code visible here).
count = len(result)


# Identity sorter: take the first candidate of every sample in its stored order.
simple_top1 = get_first_candidates(result, lambda x: x)
d1 = evaluate(simple_top1)
rt.add('LLM Top1', d1)

# Pick the first candidate after ordering by the 'gleu' evaluation metric.
# NOTE(review): presumably an oracle upper bound that uses the reference-based
# score; the meaning of the third argument (True) is not visible here - confirm in sorter.
theoretical_best = get_first_candidates(result, lambda x: sort_by_evaluation_metric(x, 'gleu', True))
d2 = evaluate(theoretical_best)
rt.add('LLM Optimal Top5', d2)

# Three sort_by_rouge selections, differing only in the mode flag ('r', 'p', 'f');
# the `rouge` name is reused for each in turn. The row titles label them as
# recall, precision and f1 respectively.
rouge = get_first_candidates(result, lambda x: sort_by_rouge(x, 'r'))
rt.add('LLM rouge recall', evaluate(rouge))

rouge = get_first_candidates(result, lambda x: sort_by_rouge(x, 'p'))
rt.add('LLM rouge precision', evaluate(rouge))

rouge = get_first_candidates(result, lambda x: sort_by_rouge(x, 'f'))
rt.add('LLM rouge f1', evaluate(rouge))

# Selection using the sorter module's Levenshtein-distance ordering.
edd = get_first_candidates(result, lambda x: sort_by_levenshtein_distance(x))
rt.add('LLM levenshtein distance', evaluate(edd))

# Selection using the sorter module's GLEU ordering.
gleu = get_first_candidates(result, lambda x: sort_by_gleu(x))
rt.add('LLM gleu', evaluate(gleu))

# Emit the accumulated comparison table as Markdown.
print(rt.to_markdown())


| title                    | Correct/total   |   Accuracy |    Ed |   Red |   Gleu |   Meteor |   Rouge-recall |   Rouge-f1 |
|:-------------------------|:----------------|-----------:|------:|------:|-------:|---------:|---------------:|-----------:|
| CUP                      | 166/1000        |      0.166 | 2.592 | 0.928 | 65.493 |   80.015 |         81.819 |     81.247 |
| HebCup                   | 121/1000        |      0.121 | 2.974 | 1.178 | 61.456 |   80.312 |         79.885 |     77.971 |
| LLM Top1                 | 177/1000        |      0.177 | 8.298 | 4.435 | 50.43  |   71.512 |         75.822 |     66.37  |
| LLM Optimal Top5         | 255/1000        |      0.255 | 4.518 | 2.225 | 62.952 |   80.964 |         83.78  |     77.216 |
| LLM rouge recall         | 190/1000        |      0.19  | 7.188 | 3.649 | 55.793 |   77.579 |         83.019 |     72.16  |
| LLM rouge precision      | 218/1000        |      0.218 | 4.735 | 2.329 | 60.338 |   78.831 |         81.74  |     7