In [2]:
import statistics

import pandas

import eval
from sorter import *


def evaluate(result_array):
    """Summarise per-sample scores into a dict of display-ready strings.

    Args:
        result_array: Sized collection of per-sample mappings. Every mapping
            must provide each key listed in the module-level ``metrics`` as
            well as ``'ed'``.

    Returns:
        dict: For each name in ``metrics``, its mean over all samples
        formatted to three decimals; ``'accuracy'``, the share of samples
        whose ``'ed'`` equals 0, formatted to three decimals; and
        ``'correct/total'``, a tab-prefixed ``"<correct>/<total>"`` string.
    """
    # Mean of every configured metric, already rendered as text.
    summary = {
        name: format(statistics.mean([sample[name] for sample in result_array]), '.3f')
        for name in metrics
    }

    # A sample counts as correct when its 'ed' value is exactly 0.
    nr_correct = sum(1 for sample in result_array if sample['ed'] == 0)
    total = len(result_array)

    summary['accuracy'] = format(nr_correct / total, '.3f')
    summary['correct/total'] = f'\t{nr_correct}/{total}'
    return summary


def print_all_metrics(d, names=None):
    """Print one tab-indented ``Name: value`` line per metric.

    Numeric values are shown with three decimals. Values that do not accept
    the ``.3f`` format (for example the pre-formatted strings returned by
    ``evaluate``) are printed unchanged instead of raising ``ValueError``.

    Args:
        d: Mapping from metric name to its value.
        names: Metric names to print, in order. Defaults to the module-level
            ``metrics1`` list.

    Raises:
        KeyError: If a requested metric name is missing from ``d``.
    """
    if names is None:
        names = metrics1
    for metric in names:
        value = d[metric]
        try:
            text = format(value, '.3f')
        except (TypeError, ValueError):
            # str (and other non-numeric values) reject the 'f' format code;
            # fall back to their plain text so already-formatted entries work.
            text = str(value)
        print(f'\t{metric.capitalize()}: {text}')

class ResultTable:
    """Accumulates one row of metric values per method and renders a table."""

    def __init__(self):
        # One dict per added method, kept in insertion order.
        self.rows = []

    def add(self, title: str, d: dict):
        """Append a row for ``title`` built from the values in ``d``.

        The row starts with a ``'Method'`` column holding ``title``; each name
        in the module-level ``metrics1`` then becomes a column whose header is
        the capitalized metric name and whose value is ``d[name]``.

        Raises:
            KeyError: If ``d`` lacks one of the names in ``metrics1``; in that
                case no row is added.
        """
        row = {'Method': title}
        for metric in metrics1:
            row[metric.capitalize()] = d[metric]
        self.rows.append(row)

    def to_markdown(self):
        """Return all collected rows as a Markdown table without an index column."""
        return pandas.DataFrame(self.rows).to_markdown(index=False)


# Per-sample score keys that evaluate() averages.
metrics = [
    'ed',
    'red',
    'gleu',
    'meteor',
    'rouge-recall',
    'rouge-f1',
]
# Column order for reports: the two summary entries first, then every metric.
metrics1 = ['correct/total', 'accuracy', *metrics]
# Table that collects one row per evaluated method in this cell.
rt = ResultTable()

# Baseline
# The baseline runs below are disabled; uncomment a group to add its row to
# the result table.

# ## CUP (First 1000)
# cup_result = eval.evaluate('../result/baseline/CUP.jsonl', 'CUP')
# cup_d = evaluate(cup_result)
# rt.add('CUP', cup_d)
#
# ## HebCup (First 1000)
# hebcup_result = eval.evaluate('../result/baseline/HebCup.jsonl', 'HebCup')
# hebcup_d = evaluate(hebcup_result)
# rt.add('HebCup', hebcup_d)


# LLM
# Only the uncommented `result = ...` line is used; the commented ones are
# other candidate files (the trailing comment names the model/setting).
# NOTE: `eval` here is the project module imported at the top of the file,
# not the Python builtin.

# result = eval.evaluate('../result/candidates/candidates-mistral-openorca-latest-9204-20240407_154640.jsonl') # openorca
# result = eval.evaluate('../result/candidates/candidates-gemma-7b-1000-20240408_151702.jsonl') # gemma
# result = eval.evaluate('../result/candidates/candidates-llama2-7b-1000-20240408_173014.jsonl') # llama2
# result = eval.evaluate('../result/candidates/candidates-dolphin-mistral-latest-1000-20240408_194308.jsonl') # dolphin-mistral:latest
# result = eval.evaluate('../result/candidates/candidates-mistral-instruct-1000-20240409_002542.jsonl') # mistral:instruct
# result = eval.evaluate('../result/candidates/candidates-openhermes-7b-v2.5-1000-20240409_152922.jsonl') # openhermes:7b-v2.5
# result = eval.evaluate('../result/candidates/candidates-solar-10.7b-1000-20240409_172221.jsonl') # solar:10.7b
# result = eval.evaluate('../result/candidates/candidates-llama3-instruct-1000-20240425_161220.jsonl') # llama3
# result = eval.evaluate('../result/candidates/candidates-llama3-instruct-9204-20240425_175633.jsonl') # llama3 all
# result = eval.evaluate('../result/candidates/candidates-llama3-temp1.0-instruct-1000-20240427_105849.jsonl') # llama3 temp 1.0
result = eval.evaluate('../result/candidates/candidates-llama3-temp1.5-instruct-1000-20240427_135945.jsonl') # llama3 temp 1.5


# Number of entries returned for the active candidate file.
count = len(result)

# Every section below follows the same pattern: pick candidates with
# get_first_candidates under a given ordering function, score the selection
# with evaluate(), and add the scores to the table under a label.
# NOTE(review): get_first_candidates and the sort_by_* helpers come from the
# star import of `sorter`; presumably the ordering function re-ranks each
# entry's candidates and the first one is kept -- confirm against sorter.

# Identity ordering: candidates are passed through unchanged.
simple_top1 = get_first_candidates(result, lambda x: x)
d1 = evaluate(simple_top1)
rt.add('LLM Top1', d1)

# Ordering by the 'gleu' evaluation metric itself.
# NOTE(review): presumably an oracle upper bound over the candidates, as the
# label 'Optimal' suggests; meaning of the `True` flag is not visible here.
theoretical_best = get_first_candidates(result, lambda x: sort_by_evaluation_metric(x, 'gleu', True))
d2 = evaluate(theoretical_best)
rt.add('LLM Optimal Top5', d2)

# ROUGE-based orderings; 'r', 'p' and 'f' are reported below as recall,
# precision and f1 respectively.
rouge = get_first_candidates(result, lambda x: sort_by_rouge(x, 'r'))
rt.add('LLM rouge recall', evaluate(rouge))

rouge = get_first_candidates(result, lambda x: sort_by_rouge(x, 'p'))
rt.add('LLM rouge precision', evaluate(rouge))

rouge = get_first_candidates(result, lambda x: sort_by_rouge(x, 'f'))
rt.add('LLM rouge f1', evaluate(rouge))

# Ordering via sort_by_levenshtein_distance.
edd = get_first_candidates(result, lambda x: sort_by_levenshtein_distance(x))
rt.add('LLM levenshtein distance', evaluate(edd))

# Ordering via sort_by_gleu.
gleu = get_first_candidates(result, lambda x: sort_by_gleu(x))
rt.add('LLM gleu', evaluate(gleu))

# Emit the collected rows as a Markdown table.
print(rt.to_markdown())


| Method                   | Correct/total   |   Accuracy |    Ed |   Red |   Gleu |   Meteor |   Rouge-recall |   Rouge-f1 |
|:-------------------------|:----------------|-----------:|------:|------:|-------:|---------:|---------------:|-----------:|
| LLM Top1                 | 145/1000        |      0.145 | 8.252 | 4.59  | 47.547 |   71.578 |         75.874 |     64.951 |
| LLM Optimal Top5         | 252/1000        |      0.252 | 4.01  | 1.841 | 63.318 |   80.981 |         83.433 |     78.039 |
| LLM rouge recall         | 173/1000        |      0.173 | 7.685 | 4.009 | 52.375 |   76.995 |         83.148 |     70.233 |
| LLM rouge precision      | 227/1000        |      0.227 | 4.041 | 1.832 | 60.884 |   78.544 |         81.015 |     76.783 |
| LLM rouge f1             | 223/1000        |      0.223 | 4.299 | 1.969 | 60.476 |   79.304 |         82.249 |     76.646 |
| LLM levenshtein distance | 229/1000        |      0.229 | 3.904 | 1.728 | 60.844 |   78.476 |         80.535 |     7