In [3]:
# !pip install confidence_intervals
from confidence_intervals import evaluate_with_conf_int

# 2. Sample data loader

In [9]:
from jiwer import wer
import numpy as np

In [4]:
psst_gt_path = '/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/mllm_baselines/ConfidenceIntervals/data/psst_phoneme_baseline_gt_0.97.txt'
psst_pred_path = '/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/mllm_baselines/ConfidenceIntervals/data/psst_phoneme_baseline_pred_0.97.txt'

# load gt file
with open(psst_gt_path, "r", encoding="utf-8") as gt_file:
    ground_truth = gt_file.readlines()

# load pred file
with open(psst_pred_path, "r", encoding="utf-8") as pred_file:
    predictions = pred_file.readlines()

print("GT len:", len(ground_truth))
print("Pred len:", len(predictions))
print("Ground Truth Sample:", ground_truth[:5])
print("Predictions Sample:", predictions[:5])

GT len: 652
Pred len: 652
Ground Truth Sample: ['ACWT01a-BNT01-house\tEY HH AW S\n', 'ACWT01a-BNT02-comb\tK OW M\n', 'ACWT01a-BNT03-toothbrush\tB R AH SH\n', 'ACWT01a-BNT04-octopus\tAA G T AH P UH S\n', 'ACWT01a-BNT05-bench\tCH EY R\n']
Predictions Sample: ['ACWT01a-BNT01-house\tHH AE M AH K\n', 'ACWT01a-BNT02-comb\tK OW M\n', 'ACWT01a-BNT03-toothbrush\tG R AH SH\n', 'ACWT01a-BNT04-octopus\tAA K T AH P UH S\n', 'ACWT01a-BNT05-bench\tCH EY R\n']


# 3. WER Calculation per utterance (line?) jiwer just for testing

In [7]:
# parse ground truth and predictions
def parse_phoneme_data(lines):
    """
    Extracts utterance IDs and phoneme sequences from the input file.
    """
    phoneme_dict = {}
    for line in lines:
        parts = line.strip().split("\t")
        if len(parts) == 2:
            utterance_id, phoneme_seq = parts
            phoneme_dict[utterance_id] = phoneme_seq
    return phoneme_dict

# convert raw lines into dictionaries
ground_truth_dict = parse_phoneme_data(ground_truth)
predictions_dict = parse_phoneme_data(predictions)


print("GT len:", len(list(ground_truth_dict.items())))
print("Pred len:", len(list(predictions_dict.items())))

common_utterances = set(ground_truth_dict.keys()) & set(predictions_dict.keys())

print("Intersec len", len(common_utterances))

# compute PER per utterance
per_values = []
for utt in common_utterances:
    gt_phonemes = ground_truth_dict[utt]
    pred_phonemes = predictions_dict[utt]

    # compute Phoneme Error Rate (PER)
    per = wer(gt_phonemes, pred_phonemes)
    per_values.append(per)

print(f"First 10 PER values: {per_values[:10]}")

GT len: 652
Pred len: 652
Intersec len 652
First 10 PER values: [0.16666666666666666, 0.5, 0.25, 1.0, 0.0, 1.0, 0.4, 0.3333333333333333, 1.6666666666666667, 0.4]


# 4. Confidence Internal Calculation

In [10]:
confidence_interval = evaluate_with_conf_int(np.array(per_values), np.mean, num_bootstraps=1000, alpha=5)

print(f"Full dataset: {confidence_interval[0]:.4f}")
print(f"lower & upper bound: ({confidence_interval[1][0]:.4f}, {confidence_interval[1][1]:.4f})")

Full dataset: 0.7468
lower & upper bound: (0.6151, 0.8902)
