In [1]:
import csv
import warnings

import jiwer
import pandas as pd

In [None]:
# read lines from a given path, strip for whitespace
# each line is treated as one hypothesis or reference sentence
def read_sentences_from_file(path):
    with open(path, encoding = "utf-8") as f:
        lines = [line.strip() for line in f if line.strip()]
    return lines

In [None]:
def weighted_average(results, transform = None):
    if transform is None:
        transform = jiwer.Compose([  # TODO: potentially change this if it is stripping AAE features from text
            jiwer.ToLowerCase(),
            jiwer.RemovePunctuation(),
            jiwer.Strip()
        ])

    total_err = 0.0
    total_ref_words = 0
    for idx, ref, hyp, wer_val in results:
        ref_words = len(transform(ref).split())
        total_err += wer_val * ref_words
        total_ref_words += ref_words
    weighted_avg = total_err / total_ref_words if total_ref_words > 0 else 0.0
    #print("Weighted average WER:", weighted_avg)
    return weighted_avg

In [None]:
# determine WER for each pair (reference, hypothesis)
# returns a list of tuples in the format (index, reference, hypothesis, WER)
def compute_wer_per_sentence(references_input, hypotheses_input, transform = None):
    results = []
    n = min(len(references_input), len(hypotheses_input))

    if transform is None:
        transform = jiwer.Compose([  # TODO: potentially change this if it is stripping AAE features from text
            jiwer.ToLowerCase(),
            jiwer.RemovePunctuation(),
            jiwer.Strip()
        ])

    # average WER across entire reference/hypothesis set passed in
    # global_ref = " ".join(references)
    # global_hyp = " ".join(hypotheses)
    # global_wer = jiwer.wer(global_ref, global_hyp, truth_transform = transform, hypothesis_transform = transform)
    # print("Global WER:", global_wer)


    for i in range(n):
        # TODO: assumes reference and hypothesis are at the same index in corresponding documents
        ref = references_input[i]
        hyp = hypotheses_input[i]
        # TODO: a github thread suggests WER can be higher than 1.0, potentially set these aside
        wer_val = jiwer.wer(ref, hyp, truth_transform = transform, hypothesis_transform = transform)
        results.append({
            "index": i,
            "reference": ref,
            "hypothesis": hyp,
            "wer": wer_val
        })
    return results

In [None]:
# Load the aligned reference/hypothesis files, score every pair, and rank
# the sentences from worst to best WER.
ref_file = r"data/reference/..."
hyp_file = r"data/hypothesis/..."

references = read_sentences_from_file(ref_file)
hypotheses = read_sentences_from_file(hyp_file)

result = compute_wer_per_sentence(references, hypotheses)
df = pd.DataFrame(result)
df_sorted = df.sort_values(by = "wer", ascending = False).reset_index(drop = True)  # sort by WER value, highest first

# Hand weighted_average (index, reference, hypothesis, wer) tuples, the entry
# shape it unpacks, instead of the per-sentence dicts.
result_tuples = [
    (row["index"], row["reference"], row["hypothesis"], row["wer"])
    for row in result
]
print("Weighted Average WER:", weighted_average(result_tuples))

# Keep the table as the cell's last expression so the notebook renders it.
df_sorted.head(10)  # display the first 10

In [None]:
# Write the ranked per-sentence results to wer_report.csv.
report_columns = ["index", "wer", "reference", "hypothesis"]
with open("wer_report.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(report_columns)
    # Iterating a DataFrame directly yields its column names, not its rows,
    # so walk the rows as plain tuples in the same order as the header.
    for idx, wer_val, ref, hyp in df_sorted[report_columns].itertuples(index=False, name=None):
        writer.writerow([idx, f"{wer_val:.6f}", ref, hyp])