In [None]:
import torchaudio
import Levenshtein
import itertools

from wav2vec2decoder import Wav2Vec2Decoder

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Evaluation set: a list of (WAV path, upper-case reference transcription)
# tuples. The clips are numbered consecutively starting at 1, so each path is
# derived from the position of its transcription in the tuple below.
test_samples = [
    (f"examples/sample{clip_number}.wav", reference_text)
    for clip_number, reference_text in enumerate(
        (
            "IF YOU ARE GENEROUS HERE IS A FITTING OPPORTUNITY FOR THE EXERCISE OF YOUR MAGNANIMITY IF YOU ARE PROUD HERE AM I YOUR RIVAL READY TO ACKNOWLEDGE MYSELF YOUR DEBTOR FOR AN ACT OF THE MOST NOBLE FORBEARANCE",
            "AND IF ANY OF THE OTHER COPS HAD PRIVATE RACKETS OF THEIR OWN IZZY WAS UNDOUBTEDLY THE MAN TO FIND IT OUT AND USE THE INFORMATION WITH A BEAT SUCH AS THAT EVEN GOING HALVES AND WITH ALL THE GRAFT TO THE UPPER BRACKETS HE'D STILL BE ABLE TO MAKE HIS PILE IN A MATTER OF MONTHS",
            "GUESS A MAN GETS USED TO ANYTHING HELL MAYBE I CAN HIRE SOME BUMS TO SIT AROUND AND WHOOP IT UP WHEN THE SHIPS COME IN AND BILL THIS AS A REAL OLD MARTIAN DEN OF SIN",
            "IT WAS A TUNE THEY HAD ALL HEARD HUNDREDS OF TIMES SO THERE WAS NO DIFFICULTY IN TURNING OUT A PASSABLE IMITATION OF IT TO THE IMPROVISED STRAINS OF I DIDN'T WANT TO DO IT THE PRISONER STRODE FORTH TO FREEDOM",
            "MARGUERITE TIRED OUT WITH THIS LONG CONFESSION THREW HERSELF BACK ON THE SOFA AND TO STIFLE A SLIGHT COUGH PUT UP HER HANDKERCHIEF TO HER LIPS AND FROM THAT TO HER EYES",
            "AT THIS TIME ALL PARTICIPANTS ARE IN A LISTEN ONLY MODE",
            "THE INCREASE WAS MAINLY ATTRIBUTABLE TO THE NET INCREASE IN THE AVERAGE SIZE OF OUR FLEETS",
            "OPERATING SURPLUS IS A NON CAP FINANCIAL MEASURE WHICH IS DEFINED AS FULLY IN OUR PRESS RELEASE",
        ),
        start=1,
    )
]

In [29]:
# Hyperparameter grids iterated by the sweep cells that follow.
beam_widths = [5, 10]
betas = [0.5, 5]
alphas = [0.5, 1.0, 3]

# Decoder built around the pruned 3-gram ARPA language model file.
decoder = Wav2Vec2Decoder(
    lm_model_path="lm/3-gram.pruned.1e-7.arpa",
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Sweep the beam width only, decoding every clip with "beam_lm_rescore" and
# reporting the per-clip and mean Levenshtein distance to the reference text.
# Any other decoder setting keeps whatever value it has when this cell runs.
#
# The checkpoint named in this notebook's load warning
# (facebook/wav2vec2-base-960h) is documented as expecting 16 kHz input, and
# decode() is never told the file's sample rate. Clips stored at another rate
# are therefore resampled here instead of being passed through unchanged;
# clips already at 16 kHz are untouched.
TARGET_SAMPLE_RATE = 16_000

for beam_width in beam_widths:
    decoder.beam_width = beam_width

    distances = []
    for audio_path, true_transcription in test_samples:
        audio_input, sr = torchaudio.load(audio_path)
        if sr != TARGET_SAMPLE_RATE:
            audio_input = torchaudio.functional.resample(
                audio_input, orig_freq=sr, new_freq=TARGET_SAMPLE_RATE
            )

        transcript = decoder.decode(audio_input, method="beam_lm_rescore").strip()

        # Edit distance between reference and hypothesis strings (lower is better).
        dist = Levenshtein.distance(true_transcription, transcript)
        distances.append(dist)
    avg_dist = sum(distances) / len(distances)
    print("=" * 80)
    print(
        f"Results | beam_width={beam_width} | distances = {distances}, avg_distance = {avg_dist}"
    )

Results | beam_width=5 | distances = [10, 5, 5, 8, 2, 16, 19, 15], avg_distance = 10.0
Results | beam_width=10 | distances = [10, 5, 5, 8, 2, 17, 19, 15], avg_distance = 10.125


In [30]:
# Grid sweep over (alpha, beta, beam_width) with the "beam_lm_rescore" method,
# reporting the per-clip and mean Levenshtein distance to the reference text.
# NOTE(review): alpha/beta are presumably the LM weight and word-insertion
# bonus — confirm against Wav2Vec2Decoder.
#
# The checkpoint named in this notebook's load warning
# (facebook/wav2vec2-base-960h) is documented as expecting 16 kHz input, and
# decode() is never told the file's sample rate. Clips stored at another rate
# are therefore resampled here instead of being passed through unchanged;
# clips already at 16 kHz are untouched.
TARGET_SAMPLE_RATE = 16_000

for alpha, beta, beam_width in itertools.product(alphas, betas, beam_widths):
    decoder.alpha = alpha
    decoder.beta = beta
    decoder.beam_width = beam_width

    distances = []
    for audio_path, true_transcription in test_samples:
        audio_input, sr = torchaudio.load(audio_path)
        if sr != TARGET_SAMPLE_RATE:
            audio_input = torchaudio.functional.resample(
                audio_input, orig_freq=sr, new_freq=TARGET_SAMPLE_RATE
            )

        transcript = decoder.decode(audio_input, method="beam_lm_rescore").strip()

        # Edit distance between reference and hypothesis strings (lower is better).
        dist = Levenshtein.distance(true_transcription, transcript)
        distances.append(dist)
    avg_dist = sum(distances) / len(distances)
    print("=" * 80)
    print(
        f"Results | α={alpha}, β={beta}, beam_width={beam_width} | distances = {distances}, avg_distance = {avg_dist}"
    )

Results | α=0.5, β=0.5, beam_width=5 | distances = [10, 5, 5, 8, 2, 16, 19, 15], avg_distance = 10.0
Results | α=0.5, β=0.5, beam_width=10 | distances = [10, 5, 5, 8, 2, 17, 19, 15], avg_distance = 10.125
Results | α=0.5, β=5, beam_width=5 | distances = [10, 5, 5, 8, 2, 15, 19, 15], avg_distance = 9.875
Results | α=0.5, β=5, beam_width=10 | distances = [10, 5, 5, 8, 2, 15, 19, 15], avg_distance = 9.875
Results | α=1.0, β=0.5, beam_width=5 | distances = [10, 5, 5, 8, 2, 16, 19, 15], avg_distance = 10.0
Results | α=1.0, β=0.5, beam_width=10 | distances = [10, 5, 5, 8, 2, 17, 19, 15], avg_distance = 10.125
Results | α=1.0, β=5, beam_width=5 | distances = [10, 5, 5, 8, 2, 16, 19, 15], avg_distance = 10.0
Results | α=1.0, β=5, beam_width=10 | distances = [10, 5, 5, 8, 2, 15, 19, 15], avg_distance = 9.875
Results | α=3, β=0.5, beam_width=5 | distances = [10, 5, 5, 8, 2, 16, 19, 15], avg_distance = 10.0
Results | α=3, β=0.5, beam_width=10 | distances = [10, 5, 5, 8, 2, 17, 19, 15], avg_distan

In [31]:
# Grid sweep over (alpha, beta, beam_width) with the "beam_lm" method,
# reporting the per-clip and mean Levenshtein distance to the reference text.
# NOTE(review): alpha/beta are presumably the LM weight and word-insertion
# bonus — confirm against Wav2Vec2Decoder.
#
# The checkpoint named in this notebook's load warning
# (facebook/wav2vec2-base-960h) is documented as expecting 16 kHz input, and
# decode() is never told the file's sample rate. Clips stored at another rate
# are therefore resampled here instead of being passed through unchanged;
# clips already at 16 kHz are untouched.
TARGET_SAMPLE_RATE = 16_000

for alpha, beta, beam_width in itertools.product(alphas, betas, beam_widths):
    decoder.alpha = alpha
    decoder.beta = beta
    decoder.beam_width = beam_width

    distances = []
    for audio_path, true_transcription in test_samples:
        audio_input, sr = torchaudio.load(audio_path)
        if sr != TARGET_SAMPLE_RATE:
            audio_input = torchaudio.functional.resample(
                audio_input, orig_freq=sr, new_freq=TARGET_SAMPLE_RATE
            )

        transcript = decoder.decode(audio_input, method="beam_lm").strip()

        # Edit distance between reference and hypothesis strings (lower is better).
        dist = Levenshtein.distance(true_transcription, transcript)
        distances.append(dist)
    avg_dist = sum(distances) / len(distances)
    print("=" * 80)
    print(
        f"Results | α={alpha}, β={beta}, beam_width={beam_width} | distances = {distances}, avg_distance = {avg_dist}"
    )

Results | α=0.5, β=0.5, beam_width=5 | distances = [11, 6, 6, 8, 2, 20, 25, 20], avg_distance = 12.25
Results | α=0.5, β=0.5, beam_width=10 | distances = [12, 9, 6, 7, 2, 20, 25, 21], avg_distance = 12.75
Results | α=0.5, β=5, beam_width=5 | distances = [11, 11, 7, 12, 3, 20, 17, 17], avg_distance = 12.25
Results | α=0.5, β=5, beam_width=10 | distances = [15, 36, 8, 12, 3, 25, 17, 25], avg_distance = 17.625
Results | α=1.0, β=0.5, beam_width=5 | distances = [18, 20, 19, 15, 7, 23, 24, 24], avg_distance = 18.75
Results | α=1.0, β=0.5, beam_width=10 | distances = [31, 22, 28, 16, 8, 35, 33, 26], avg_distance = 24.875
Results | α=1.0, β=5, beam_width=5 | distances = [13, 12, 10, 12, 5, 17, 21, 21], avg_distance = 13.875
Results | α=1.0, β=5, beam_width=10 | distances = [13, 31, 10, 15, 5, 17, 21, 25], avg_distance = 17.125
Results | α=3, β=0.5, beam_width=5 | distances = [83, 153, 104, 92, 89, 45, 68, 75], avg_distance = 88.625
Results | α=3, β=0.5, beam_width=10 | distances = [138, 206, 

In [32]:
# Rebuild the decoder around the 4-gram ARPA language model file; the sweep
# cells that follow reuse the same `decoder` name.
decoder = Wav2Vec2Decoder(
    lm_model_path="lm/4-gram.arpa",
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
# Grid sweep over (alpha, beta, beam_width) with the "beam_lm_rescore" method,
# using whichever `decoder` is currently bound, and reporting the per-clip and
# mean Levenshtein distance to the reference text.
# NOTE(review): alpha/beta are presumably the LM weight and word-insertion
# bonus — confirm against Wav2Vec2Decoder.
#
# The checkpoint named in this notebook's load warning
# (facebook/wav2vec2-base-960h) is documented as expecting 16 kHz input, and
# decode() is never told the file's sample rate. Clips stored at another rate
# are therefore resampled here instead of being passed through unchanged;
# clips already at 16 kHz are untouched.
TARGET_SAMPLE_RATE = 16_000

for alpha, beta, beam_width in itertools.product(alphas, betas, beam_widths):
    decoder.alpha = alpha
    decoder.beta = beta
    decoder.beam_width = beam_width

    distances = []
    for audio_path, true_transcription in test_samples:
        audio_input, sr = torchaudio.load(audio_path)
        if sr != TARGET_SAMPLE_RATE:
            audio_input = torchaudio.functional.resample(
                audio_input, orig_freq=sr, new_freq=TARGET_SAMPLE_RATE
            )

        transcript = decoder.decode(audio_input, method="beam_lm_rescore").strip()

        # Edit distance between reference and hypothesis strings (lower is better).
        dist = Levenshtein.distance(true_transcription, transcript)
        distances.append(dist)
    avg_dist = sum(distances) / len(distances)
    print("=" * 80)
    print(
        f"Results | α={alpha}, β={beta}, beam_width={beam_width} | distances = {distances}, avg_distance = {avg_dist}"
    )

Results | α=0.5, β=0.5, beam_width=5 | distances = [10, 5, 5, 8, 2, 16, 19, 15], avg_distance = 10.0
Results | α=0.5, β=0.5, beam_width=10 | distances = [10, 5, 5, 8, 2, 17, 19, 15], avg_distance = 10.125
Results | α=0.5, β=5, beam_width=5 | distances = [10, 5, 5, 8, 2, 15, 19, 15], avg_distance = 9.875
Results | α=0.5, β=5, beam_width=10 | distances = [10, 5, 5, 8, 2, 15, 19, 15], avg_distance = 9.875
Results | α=1.0, β=0.5, beam_width=5 | distances = [10, 5, 5, 8, 2, 16, 19, 15], avg_distance = 10.0
Results | α=1.0, β=0.5, beam_width=10 | distances = [10, 5, 5, 8, 2, 17, 19, 15], avg_distance = 10.125
Results | α=1.0, β=5, beam_width=5 | distances = [10, 5, 5, 8, 2, 16, 19, 15], avg_distance = 10.0
Results | α=1.0, β=5, beam_width=10 | distances = [10, 5, 5, 8, 2, 15, 19, 15], avg_distance = 9.875
Results | α=3, β=0.5, beam_width=5 | distances = [10, 5, 5, 8, 2, 16, 19, 15], avg_distance = 10.0
Results | α=3, β=0.5, beam_width=10 | distances = [10, 5, 5, 8, 2, 17, 19, 15], avg_distan

In [34]:
# Grid sweep over (alpha, beta, beam_width) with the "beam_lm" method, using
# whichever `decoder` is currently bound, and reporting the per-clip and mean
# Levenshtein distance to the reference text.
# NOTE(review): alpha/beta are presumably the LM weight and word-insertion
# bonus — confirm against Wav2Vec2Decoder.
#
# The checkpoint named in this notebook's load warning
# (facebook/wav2vec2-base-960h) is documented as expecting 16 kHz input, and
# decode() is never told the file's sample rate. Clips stored at another rate
# are therefore resampled here instead of being passed through unchanged;
# clips already at 16 kHz are untouched.
TARGET_SAMPLE_RATE = 16_000

for alpha, beta, beam_width in itertools.product(alphas, betas, beam_widths):
    decoder.alpha = alpha
    decoder.beta = beta
    decoder.beam_width = beam_width

    distances = []
    for audio_path, true_transcription in test_samples:
        audio_input, sr = torchaudio.load(audio_path)
        if sr != TARGET_SAMPLE_RATE:
            audio_input = torchaudio.functional.resample(
                audio_input, orig_freq=sr, new_freq=TARGET_SAMPLE_RATE
            )

        transcript = decoder.decode(audio_input, method="beam_lm").strip()

        # Edit distance between reference and hypothesis strings (lower is better).
        dist = Levenshtein.distance(true_transcription, transcript)
        distances.append(dist)
    avg_dist = sum(distances) / len(distances)
    print("=" * 80)
    print(
        f"Results | α={alpha}, β={beta}, beam_width={beam_width} | distances = {distances}, avg_distance = {avg_dist}"
    )

Results | α=0.5, β=0.5, beam_width=5 | distances = [12, 7, 6, 8, 2, 20, 25, 21], avg_distance = 12.625
Results | α=0.5, β=0.5, beam_width=10 | distances = [12, 10, 8, 10, 2, 20, 26, 21], avg_distance = 13.625
Results | α=0.5, β=5, beam_width=5 | distances = [11, 10, 7, 12, 3, 21, 20, 18], avg_distance = 12.75
Results | α=0.5, β=5, beam_width=10 | distances = [11, 18, 15, 13, 3, 20, 23, 26], avg_distance = 16.125
Results | α=1.0, β=0.5, beam_width=5 | distances = [21, 21, 26, 16, 8, 26, 29, 22], avg_distance = 21.125
Results | α=1.0, β=0.5, beam_width=10 | distances = [21, 21, 43, 22, 9, 47, 37, 23], avg_distance = 27.875
Results | α=1.0, β=5, beam_width=5 | distances = [15, 20, 9, 15, 6, 17, 22, 21], avg_distance = 15.625
Results | α=1.0, β=5, beam_width=10 | distances = [15, 19, 11, 16, 6, 24, 24, 21], avg_distance = 17.0
Results | α=3, β=0.5, beam_width=5 | distances = [81, 148, 97, 98, 104, 43, 63, 58], avg_distance = 86.5
Results | α=3, β=0.5, beam_width=10 | distances = [132, 190,