In [11]:
import os
import numpy as np
import speech_recognition as sr



In [7]:
def compute_wer(reference_text, hypothesis_text):
    """
    Computes Word Error Rate (WER) between reference and hypothesis text.

    Args:
        reference_text (str): The reference transcription.
        hypothesis_text (str): The transcription from synthesized audio.

    Returns:
        float: The Word Error Rate.
    """
    reference_words = reference_text.split()
    hypothesis_words = hypothesis_text.split()

    # Initialize the error counts
    S = D = I = 0

    # Using the Levenshtein distance algorithm
    dp = np.zeros((len(reference_words) + 1, len(hypothesis_words) + 1))

    for i in range(len(reference_words) + 1):
        dp[i][0] = i
    for j in range(len(hypothesis_words) + 1):
        dp[0][j] = j

    for i in range(1, len(reference_words) + 1):
        for j in range(1, len(hypothesis_words) + 1):
            if reference_words[i - 1] == hypothesis_words[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]
            else:
                dp[i][j] = min(
                    dp[i - 1][j] + 1,  # Deletion
                    dp[i][j - 1] + 1,  # Insertion
                    dp[i - 1][j - 1] + 1  # Substitution
                )

    # S, D, and I can be derived from the dp matrix
    S = dp[len(reference_words)][len(hypothesis_words)]
    D = len(reference_words) - (len(hypothesis_words) - S)
    I = len(hypothesis_words) - (len(reference_words) - S)

    N = len(reference_words)

    if N == 0:
        return float('inf')  # Avoid division by zero

    wer = (S + D + I) / N
    return wer

In [13]:
# Directory where your audio files are stored
generated_audio_dir = 'generated_audio/'  # Directory for generated audio files

# List of generated audio files (sample_1.wav to sample_19.wav)
generated_audios = [os.path.join(generated_audio_dir, f'sample_{i}.wav') for i in range(1, 20)]


# Initialize recognizer
recognizer = sr.Recognizer()

# Store transcriptions
generated_texts = []

# Transcribe each audio file
for audio_file in generated_audios:
    try:
        with sr.AudioFile(audio_file) as source:
            audio_data = recognizer.record(source)  # Read the entire audio file
            text = recognizer.recognize_google(audio_data)  # Use Google Web Speech API
            generated_texts.append(text)
            print(f"Transcription for {os.path.basename(audio_file)}: {text}")
    except sr.UnknownValueError:
        print(f"Could not understand audio in {audio_file}")
        generated_texts.append("")  # Append empty string if transcription fails
    except sr.RequestError as e:
        print(f"Could not request results from Google Speech Recognition service; {e}")
        generated_texts.append("")  # Append empty string on error

# Output the transcribed texts
print("\nGenerated Texts:")
for i, text in enumerate(generated_texts, start=1):
    print(f"sample_{i}.wav: {text}")

Transcription for sample_1.wav: but not a word did he reply
Transcription for sample_2.wav: I would rather feel your spine than your skull whoever you are
Transcription for sample_3.wav: now as the lightning rod to Aspire on Shore is intended to carry off the perilous fluid into the soil so the kinds of Rod which at Sea some ships carry to each Mast is intended to conducted into the water
Transcription for sample_4.wav: these are the times when in his whale boat the Rover selfie feels a certain feeling confident land like feeling towards the sea that she regards it as so much flowery Earth and the distance ship revealing only the tops of her Mass seems struggling forward not through High rolling waves but through the Tall Grass of rolling PR
Transcription for sample_5.wav: only cries
Transcription for sample_6.wav: was he thought of death itself there is no
Transcription for sample_7.wav: yes did I we have just signed the articles
Transcription for sample_8.wav: Turkish mutes bowstring

In [14]:

reference_texts = [    
    "But not a word did he reply.",
    
    "I would rather feel your spine than your skull, whoever you are.",
    
    "Now, as the lightning rod to a spire on shore is intended to carry off the perilous fluid into the soil; so the kindred rod which at sea some ships carry to each mast, is intended to conduct it into the water.",
    
    "These are the times, when in his whale-boat the rover softly feels a certain filial, confident, land-like feeling towards the sea; that he regards it as so much flowery earth; and the distant ship revealing only the tops of her masts, seems struggling forward, not through high rolling waves, but through the tall grass of a rolling prairie: as when the western emigrants’ horses only show their erected ears, while their hidden bodies widely wade through the amazing verdure.",
    
    "(_Right Whale_).—In.",
    
    "What he thought of death itself, there is no telling.",
    
    "Yes, said I, we have just signed the articles.",
    
    "Ahab stooped to clear it; he did clear it; but the flying turn caught him round the neck, and voicelessly as Turkish mutes bowstring their victim, he was shot out of the boat, ere the crew knew he was gone.",
    
    "D’ye feel brave men, brave? As fearless fire, cried Stubb.",
    
    "Like dislodged trucks, the heads of the harpooneers aloft shook on their bull-like necks.",
    
    "Espied by some timid man-of-war or blundering discovery-vessel from afar, when the distance obscuring the swarming fowls, nevertheless still shows the white mass floating in the sun, and the white spray heaving high against it; straightway the whale’s unharming corpse, with trembling fingers is set down in the log—_shoals, rocks, and breakers hereabouts: beware!.",
    
    "With the landless gull, that at sunset folds her wings and is rocked to sleep between billows; so at nightfall, the Nantucketer, out of sight of land, furls his sails, and lays him to his rest, while under his very pillow rush herds of walruses and whales.",
    
    "tear yourselves! TASHTEGO.",
    
    "The Hyena.",
    
    "he must have more than that. Seven hundred and seventy-seventh, again said Bildad, without lifting his eyes; and then went on mumbling—for where your treasure is, there will your heart be also.",
    
    "Small sword, or broad sword, in all its exercises boasts nothing like it.",
    
    "he cried sharply.",
    
    "We were clear from the carcase; sail had been made; the wind was freshening; the wild ocean darkness was intense.",
    
    "Ship ahoy!"
]


# Store WER scores for each generated audio
wer_scores = {}

# Calculate WER for each generated audio against its corresponding reference audio
for i, generated_text in enumerate(generated_texts):
    reference_text = reference_texts[i % len(reference_texts)]  # Cycle through reference texts if needed
    wer_score = compute_wer(reference_text, generated_text)
    wer_scores[f'sample_{i + 1}.wav'] = wer_score

# Print average WER for each generated audio
for audio, wer in wer_scores.items():
    print(f"WER for {audio}: {wer:.2f}")

# Optionally, calculate the overall average WER across all generated audios
overall_average_wer = np.mean(list(wer_scores.values()))
print(f"\nOverall Average WER for all generated audios: {overall_average_wer:.2f}")

WER for sample_1.wav: 0.86
WER for sample_2.wav: 0.50
WER for sample_3.wav: 0.93
WER for sample_4.wav: 1.63
WER for sample_5.wav: 3.00
WER for sample_6.wav: 0.90
WER for sample_7.wav: 1.33
WER for sample_8.wav: 1.88
WER for sample_9.wav: 2.10
WER for sample_10.wav: 1.93
WER for sample_11.wav: 1.72
WER for sample_12.wav: 0.77
WER for sample_13.wav: 2.00
WER for sample_14.wav: 3.00
WER for sample_15.wav: 1.88
WER for sample_16.wav: 1.38
WER for sample_17.wav: 3.00
WER for sample_18.wav: 1.35
WER for sample_19.wav: 3.00

Overall Average WER for all generated audios: 1.74
