In [None]:
from statsmodels.stats.proportion import proportion_confint

# Number of observations behind both error counts. Named once so the two
# intervals below are guaranteed to use the same sample size.
N_OBS = 3033

# Wilson score confidence intervals for the two error proportions. Each call
# returns a (lower, upper) pair at statsmodels' default alpha of 0.05.
# NOTE(review): errors_before / errors_after are not defined in this cell —
# presumably error counts out of N_OBS computed elsewhere; confirm they exist
# on a fresh kernel before this cell runs.
ci_before = proportion_confint(errors_before, N_OBS, method='wilson')
ci_after = proportion_confint(errors_after, N_OBS, method='wilson')

In [1]:
import numpy as np
from jiwer import wer

def bootstrap_ci(data, num_bootstrap=1000, ci=95, seed=None):
    """
    Compute a percentile-bootstrap confidence interval for the mean.

    The data are resampled with replacement ``num_bootstrap`` times, the mean
    of every resample is recorded, and the interval is read off the empirical
    percentiles of those means.

    Parameters:
        data (list or array): 1-D collection of metric values
            (e.g., WER for each sample).
        num_bootstrap (int): Number of bootstrap iterations (default: 1000).
        ci (float): Confidence level as a percentage in [0, 100]
            (default: 95 for a 95% CI).
        seed (int, numpy.random.Generator or None): Source of randomness.
            With the default ``None`` the global ``np.random`` state is used,
            exactly as before; any other value is passed to
            ``np.random.default_rng`` so the interval is reproducible.

    Returns:
        lower_bound, upper_bound: Lower and upper bounds of the confidence interval.

    Raises:
        ValueError: If ``data`` is empty, ``num_bootstrap`` is smaller than 1,
            or ``ci`` lies outside [0, 100].
    """
    # Convert once up front; otherwise every resampling call would repeat the
    # list -> array conversion internally.
    values = np.asarray(data)
    if values.size == 0:
        raise ValueError("data must contain at least one value")
    if num_bootstrap < 1:
        raise ValueError("num_bootstrap must be at least 1")
    if not 0 <= ci <= 100:
        raise ValueError("ci must be a percentage between 0 and 100")

    # Fall back to the legacy global RNG when no seed is given so that callers
    # relying on np.random.seed(...) keep working unchanged.
    rng = np.random if seed is None else np.random.default_rng(seed)

    n = len(values)
    # One resample per iteration keeps peak memory at O(n) rather than
    # O(num_bootstrap * n).
    boot_means = [
        np.mean(rng.choice(values, size=n, replace=True))
        for _ in range(num_bootstrap)
    ]

    # Split the excluded probability mass evenly between the two tails.
    tail = (100 - ci) / 2
    lower_bound, upper_bound = np.percentile(boot_means, [tail, 100 - tail])
    return lower_bound, upper_bound

# Toy ASR evaluation: recogniser hypotheses paired index-by-index with their
# reference transcripts. Only the first pair differs (the hypothesis has an
# extra "a").
predictions = [
    "this is a test sentence",
    "another test sentence",
    "hello world",
    "speech recognition is challenging",
    "openai chatgpt is great"
]

ground_truths = [
    "this is test sentence",
    "another test sentence",
    "hello world",
    "speech recognition is challenging",
    "openai chatgpt is great"
]

# Per-utterance WER via jiwer; the reference is the first argument and the
# hypothesis the second.
wer_list = list(map(wer, ground_truths, predictions))

# Point estimate (unweighted mean of the per-utterance scores) and its
# bootstrap interval.
mean_wer = np.mean(wer_list)
ci_lower, ci_upper = bootstrap_ci(wer_list, num_bootstrap=10000, ci=95)

print(f"Mean WER: {mean_wer:.4f}")
print(f"95% Confidence Interval: ({ci_lower:.4f}, {ci_upper:.4f})")


Mean WER: 0.0500
95% Confidence Interval: (0.0000, 0.1500)


In [4]:
import numpy as np
import scipy.stats as stats

def calculate_confidence_interval(data, confidence_level=0.95):
    """
    Compute the mean of a sample and the half-width of its Student-t
    confidence interval.

    Parameters:
        data (list or array): Numeric observations; at least two are required
            because the sample standard deviation uses ddof=1.
        confidence_level (float): Two-sided confidence level as a fraction
            strictly between 0 and 1 (default: 0.95).

    Returns:
        mean, margin_of_error: The sample mean and the margin such that the
        interval is ``mean ± margin_of_error``.

    Raises:
        ValueError: If fewer than two observations are given or
            ``confidence_level`` is not strictly between 0 and 1.
    """
    values = np.asarray(data, dtype=float)
    n = values.size

    # With n < 2 the ddof=1 standard deviation and the t quantile (df = 0) are
    # both undefined; fail loudly instead of returning NaN with warnings.
    if n < 2:
        raise ValueError("at least two observations are required")
    if not 0 < confidence_level < 1:
        raise ValueError("confidence_level must be strictly between 0 and 1")

    mean = np.mean(values)

    # Standard error of the mean from the sample (ddof=1) standard deviation.
    standard_error = np.std(values, ddof=1) / np.sqrt(n)

    # Two-sided critical value: the upper tail holds (1 - confidence_level) / 2.
    t_critical = stats.t.ppf((1 + confidence_level) / 2, df=n - 1)

    margin_of_error = t_critical * standard_error
    return mean, margin_of_error

# Two experimental results used as example input.
data = [9.61, 16.06]  # Example data

# Mean and t-based margin of error at the 95% level, reported as "mean ± margin".
mean, margin_of_error = calculate_confidence_interval(data, confidence_level=0.95)
print(f"Result: {mean:.2f} ± {margin_of_error:.2f}")

Result: 12.83 ± 40.98
