In [1]:
import jiwer
import pandas as pd
import csv
import json
import re
from pathlib import Path

In [2]:
def read_reference_from_json(json_path):
    """
    Read the reference transcript stored under the "text" key of a JSON file.

    Parameters
    ----------
    json_path : str or Path
        Location of the JSON document. It is decoded as UTF-8 and
        undecodable bytes are dropped.

    Returns
    -------
    list
        A one-element list holding the value of "text" as stored (no
        whitespace normalisation is applied); the element is "" when the
        key is missing.
    """
    with Path(json_path).open("r", encoding="utf-8", errors="ignore") as handle:
        payload = json.load(handle)

    # Wrapped in a list so the result has the same shape as the other
    # reader in this notebook (a sequence of transcript strings).
    return [payload.get("text", "")]

In [3]:
def read_hypothesis_from_tsv(txt_path):
    """
    Flatten a tab-separated transcript file into one continuous string.

    Processing steps:
      - blank lines are ignored
      - if the first non-blank line is a 'speaker<TAB>text' header
        (case-insensitive), it is dropped
      - for each remaining line the second tab-separated column is used,
        or the whole line when there is no tab
      - a leading tag of 1-8 ASCII letters (either case) followed by a
        colon, e.g. 'WI:' / 'WE:', is stripped
      - all pieces are joined with spaces and whitespace runs collapsed

    Parameters
    ----------
    txt_path : str or Path
        Transcript file, decoded as UTF-8 with undecodable bytes dropped.

    Returns
    -------
    list of str
        A one-element list containing the flattened transcript.
    """
    raw = Path(txt_path).read_text(encoding="utf-8", errors="ignore")
    speaker_tag = re.compile(r"^\s*[A-Za-z]{1,8}:\s*")

    non_blank_rows = [row for row in raw.splitlines() if row.strip()]

    pieces = []
    for position, row in enumerate(non_blank_rows):
        columns = row.split("\t")
        has_text_column = len(columns) >= 2

        # Only the very first non-blank row is ever treated as a header.
        if (
            position == 0
            and has_text_column
            and columns[0].strip().lower() == "speaker"
            and columns[1].strip().lower() == "text"
        ):
            continue

        utterance = columns[1] if has_text_column else row
        utterance = speaker_tag.sub("", utterance)

        # Empty strings are dropped; whitespace-only ones survive here but
        # vanish in the final collapse below.
        if utterance:
            pieces.append(utterance)

    flattened = re.sub(r"\s+", " ", " ".join(pieces)).strip()
    return [flattened]

In [4]:
def weighted_average(results, transform = None):
    """
    Average per-pair error rates, weighting each by its reference word count.

    Parameters
    ----------
    results : iterable of (index, reference, hypothesis, error_rate)
        Tuples in the shape produced by compute_wer_per_sentence.
    transform : callable, optional
        Applied to each reference before its words are counted. It may
        return a string, a list of strings, or a list of word lists.
        Defaults to a jiwer pipeline ending in ReduceToListOfListOfWords.

    Returns
    -------
    float
        sum(error_rate * n_ref_words) / sum(n_ref_words), or 0.0 when the
        references contain no words at all (including empty `results`).
    """
    if transform is None:
        transform = jiwer.Compose([
            jiwer.ReduceToListOfListOfWords()
        ])

    total_err = 0.0
    total_ref_words = 0
    for _idx, ref, _hyp, wer_val in results:
        # The default transform yields nested word lists rather than a
        # string, so a plain `.split()` on its result would fail.
        ref_words = _count_words(transform(ref))
        total_err += wer_val * ref_words
        total_ref_words += ref_words

    return total_err / total_ref_words if total_ref_words > 0 else 0.0


def _count_words(transformed):
    """Count whitespace-separated words in a string or arbitrarily nested lists of strings."""
    if isinstance(transformed, str):
        return len(transformed.split())
    return sum(_count_words(item) for item in transformed)

In [5]:
def compute_wer_per_sentence(references_input, hypotheses_input, transform = None):
    """
    Score each (reference, hypothesis) pair with jiwer's word error rate.

    Pairs are matched purely by position; if the two inputs differ in
    length, the surplus items of the longer one are ignored.

    Parameters
    ----------
    references_input, hypotheses_input : sequence of str
        Reference and hypothesis texts, aligned by index.
    transform : optional
        jiwer transform applied to both sides. Defaults to a pipeline
        consisting only of ReduceToListOfListOfWords (no lower-casing,
        punctuation removal or stripping).

    Returns
    -------
    list of tuple
        (index, reference, hypothesis, WER) for every scored pair.
    """
    pair_count = min(len(references_input), len(hypotheses_input))

    if transform is None:
        transform = jiwer.Compose([jiwer.ReduceToListOfListOfWords()])

    def _score(position):
        # TODO: assumes reference and hypothesis sit at the same index in
        # their respective documents.
        reference = references_input[position]
        hypothesis = hypotheses_input[position]
        # TODO: a github thread suggests WER can exceed 1.0; such values
        # may need to be set aside.
        rate = jiwer.wer(
            reference,
            hypothesis,
            reference_transform=transform,
            hypothesis_transform=transform,
        )
        return (position, reference, hypothesis, rate)

    return [_score(position) for position in range(pair_count)]


In [6]:
def compute_cer_per_sentence(references_input, hypotheses_input, transform = None):
    """
    Score each (reference, hypothesis) pair with jiwer's character error rate.

    Pairs are matched purely by position; if the two inputs differ in
    length, the surplus items of the longer one are ignored.

    Parameters
    ----------
    references_input, hypotheses_input : sequence of str
        Reference and hypothesis texts, aligned by index.
    transform : optional
        jiwer transform applied to both sides. Defaults to a pipeline
        consisting only of ReduceToListOfListOfChars (no lower-casing,
        punctuation removal or stripping).

    Returns
    -------
    list of tuple
        (index, reference, hypothesis, CER) for every scored pair.
    """
    pair_count = min(len(references_input), len(hypotheses_input))

    if transform is None:
        transform = jiwer.Compose([jiwer.ReduceToListOfListOfChars()])

    def _score(position):
        reference = references_input[position]
        hypothesis = hypotheses_input[position]
        rate = jiwer.cer(
            reference,
            hypothesis,
            reference_transform=transform,
            hypothesis_transform=transform,
        )
        return (position, reference, hypothesis, rate)

    return [_score(position) for position in range(pair_count)]


In [8]:
# Input transcripts for this comparison -- edit these to point at your own files.
ref_json = r"C:\Users\pryce\OneDrive\Desktop\Lost in Transcription\CW Transcriptions\Checked\AAHP 259 Ella Mae Driskell 3-8-2012_transcription.json"
hyp_txt  = r"C:\Users\pryce\OneDrive\Desktop\Lost in Transcription\Text Inputs\Human Transcriptions\AAHP 259 Ella Mae Driskell 3-8-2012ufdc.txt"

# Each reader returns a one-element list holding the whole transcript,
# so the scoring below yields a single row per metric.
references = read_reference_from_json(ref_json)
hypotheses = read_hypothesis_from_tsv(hyp_txt)

# Word- and character-level error rates for every (reference, hypothesis) pair.
result_w = compute_wer_per_sentence(references, hypotheses)
result_c = compute_cer_per_sentence(references, hypotheses)

# One row per scored pair, worst score first.
dfw = pd.DataFrame(
    result_w,
    columns=["index", "reference", "hypothesis", "wer"],
)
dfw_sorted = dfw.sort_values("wer", ascending=False).reset_index(drop=True)

dfc = pd.DataFrame(
    result_c,
    columns=["index", "reference", "hypothesis", "cer"],
)
dfc_sorted = dfc.sort_values("cer", ascending=False).reset_index(drop=True)

display(dfw_sorted, dfc_sorted)


Unnamed: 0,index,reference,hypothesis,wer
0,0,"Today's date is March eighth, two thousand tw...","Today's date is March 8, 2012. Can you please ...",0.44829


Unnamed: 0,index,reference,hypothesis,cer
0,0,"Today's date is March eighth, two thousand tw...","Today's date is March 8, 2012. Can you please ...",0.30925


In [30]:
# Spot check on one short passage: the machine transcript is passed as the
# first (reference) argument and the human transcript as the second.
machine = "You think things rough now but the thing was rough in Hoover time.  That's the best I can say cause  my folk we just, you know, you could go to the store. Long in that time, you go to the store and you could buy five c cent bag of peas for 10 cent."
human = "You think things rough now but things were rough in Hoover time. That's the best I could say 'cause— you know you could go to the store and you could get you a bag of peas for ten cents."

# No transforms are passed, so jiwer's defaults apply to both metrics.
wer = jiwer.wer(machine, human)
cer = jiwer.cer(machine, human)

# Show both texts, separated by a blank line, then the two scores.
for label, passage in (("Human:\n", human), ("Machine:\n", machine)):
    print(label, passage)
    if passage is human:
        print()
print("---------------------------")
for line in (f"WER: {wer:.5f}", f"CER: {cer:.5f}"):
    print(line)

Human:
 You think things rough now but things were rough in Hoover time. That's the best I could say 'cause— you know you could go to the store and you could get you a bag of peas for ten cents.

Machine:
 You think things rough now but the thing was rough in Hoover time.  That's the best I can say cause  my folk we just, you know, you could go to the store. Long in that time, you go to the store and you could buy five c cent bag of peas for 10 cent.
---------------------------
WER: 0.46296
CER: 0.36290


In [None]:
# Destination of the WER report written by this cell.
# NOTE(review): this file name refers to "AAHP_005A_Mattie_Williams", while the
# transcripts loaded earlier in the notebook are for "AAHP 259 Ella Mae
# Driskell" -- confirm this is the intended target before running.
wer_report = r"C:\Users\pryce\PycharmProjects\LostInTranscription\data\WER_reports\AAHP_005A_Mattie_Williams_WER_report.csv"

# Write to the configured path (previously the literal "wer_report.csv" in the
# current working directory was opened and `wer_report` was never used).
report_path = Path(wer_report)
report_path.parent.mkdir(parents=True, exist_ok=True)

# newline="" lets the csv module control line endings itself.
with report_path.open("w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["index", "wer", "reference", "hypothesis"])
    # Iterate the columns directly instead of via itertuples(): a namedtuple
    # field called "index" shares its name with the built-in tuple.index method.
    for pair_index, wer_val, ref, hyp in zip(
        dfw_sorted["index"],
        dfw_sorted["wer"],
        dfw_sorted["reference"],
        dfw_sorted["hypothesis"],
    ):
        writer.writerow([pair_index, f"{wer_val:.6f}", ref, hyp])