In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm
import glob
import os

from jiwer import cer, wer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from pythainlp.tokenize import word_tokenize


def evaluate(reference: str, hypothesis: str) -> float:
    chencherry = SmoothingFunction()

    ref_tokens = [word_tokenize(reference, engine="newmm")] # Needs to be a list of lists
    hyp_tokens = word_tokenize(hypothesis, engine="newmm")

    bleu = sentence_bleu(ref_tokens, hyp_tokens, smoothing_function=chencherry.method1)
    meteor = meteor_score(ref_tokens, hyp_tokens)

    wer_score = wer(reference, hypothesis)
    cer_score = cer(reference, hypothesis)

    return wer_score, cer_score, bleu, meteor


def benchmark(fname, test_df):
    df = pd.read_csv(fname).fillna("")
    df['THA'] = test_df['THA'].tolist()
    
    per_file_metrics = {"wer": [], "cer": [], "bleu": [], "meteor": []}
    for _, row in df.iterrows():
        wer, cer, bleu, meteor = evaluate(row['THA'], row['PRED_cleaned'])
        
        per_file_metrics["wer"].append(wer)
        per_file_metrics["cer"].append(cer)
        per_file_metrics["bleu"].append(bleu)
        per_file_metrics["meteor"].append(meteor)
    
    # Average per file
    per_file_metrics = {k: round(np.mean(v), 4) for k, v in per_file_metrics.items()}

    return per_file_metrics
        

# Example usage:
test_df = pd.read_excel('/project/lt200304-dipmt/paweekorn/data/test_set.xlsx', index_col='ID')
files = np.sort(glob.glob('/project/lt200304-dipmt/paweekorn/results/*.csv'))

result = []
for file in tqdm(files):
    fname = file.split('/')[-1].replace('.csv', '')
    name, method = fname.split('_')
    
    metrics = benchmark(file, test_df)
    metrics['fname'] = name;  metrics['type'] = method;
    result.append(metrics)

result_df = pd.DataFrame(result)
display(result_df)

result_df.to_csv('/project/lt200304-dipmt/paweekorn/benchmark.csv', index=False)

100%|██████████| 12/12 [01:51<00:00,  9.29s/it]


Unnamed: 0,wer,cer,bleu,meteor,fname,type
0,1.1244,0.5533,0.1622,0.4227,gemma3-12b-it,base
1,1.1306,0.5082,0.1803,0.4593,gemma3-12b-pt,FT
2,2.7735,1.146,0.0113,0.0291,gemma3-12b-pt,base
3,25.555,13.0989,0.0113,0.1558,llama3.1-8b-it,base
4,2.0221,1.3691,0.0042,0.0277,llama3.1-8b,FT
5,1.2474,0.6805,0.1387,0.4135,qwen3-14b,FT
6,1.2234,0.6233,0.1466,0.4248,qwen3-14b,base
7,1.2033,0.6192,0.1465,0.4148,qwen3-8b,FT
8,47.8593,11.6084,0.0126,0.1953,qwen3-8b,base
9,1.1546,0.6187,0.1569,0.4202,seedx-7b-it,FT
