## HTX xData Test cv-train-4 Python Notebook

This notebook compares the transcriptions of the fine-tuned model from task 3 against the pretrained-model transcriptions produced in task 2a, using the cv-valid-dev MP3 dataset.

In [1]:
import pandas as pd
import os

CV_DIRECTORY = "../asr/"
CV_DATASET = "cv-valid-dev"

# Metadata CSV for the dev split; its "generated_text" column holds the
# transcriptions that later cells treat as the pretrained-model baseline.
cv_dev_metadata = pd.read_csv(os.path.join(CV_DIRECTORY, CV_DATASET + '.csv'))

# An empty transcription is read back from CSV as NaN. Replace it with "" before
# casting: a bare astype(str) would turn NaN into the literal word "nan", which
# would then be displayed and scored as if the model had produced that word.
cv_dev_metadata["generated_text"] = cv_dev_metadata["generated_text"].fillna("").astype(str)

cv_dev_metadata.head(5)

Unnamed: 0.1,Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration,generated_text
0,0,cv-valid-dev/sample-000000.mp3,be careful with your prognostications said the...,1,0,,,,,BE CAREFUL WITH YOUR PROGNOSTICATIONS SAID THE...
1,1,cv-valid-dev/sample-000001.mp3,then why should they be surprised when they se...,2,0,,,,,THEN WHY SHOULD THEY BE SURPRISED WHEN THEY SE...
2,2,cv-valid-dev/sample-000002.mp3,a young arab also loaded down with baggage ent...,2,0,,,,,A YOUNG ARAB ALSO LOADED DOWN WITH BAGGAGE ENT...
3,3,cv-valid-dev/sample-000003.mp3,i thought that everything i owned would be des...,3,0,,,,,I FELT THAT EVERYTHING I OWNED WOULD BE DESTROYED
4,4,cv-valid-dev/sample-000004.mp3,he moved about invisible but everyone could he...,1,0,fourties,female,england,,HE MOVED ABOUT INVISIBLE BUT EVERY ONE COULD H...


In [2]:
from transformers import Wav2Vec2ForCTC
import torch

MODEL_FOLDER = "wav2vec2-large-960h-cv"

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load the fine-tuned CTC checkpoint from the local folder and move it to the
# chosen device. The bare `.to(device)` expression stays last so the cell still
# displays the module summary.
finetuned_model = Wav2Vec2ForCTC.from_pretrained(MODEL_FOLDER)
finetuned_model.to(device)

  from .autonotebook import tqdm as notebook_tqdm


Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=1024, bias=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder

In [8]:
from datasets import load_dataset

# Root of the local Common Voice download. The CV_DB_DIRECTORY environment
# variable overrides the default so the notebook can run on another machine.
CV_DB_DIRECTORY = os.environ.get("CV_DB_DIRECTORY", "/home/zchin/common_voice/")

# The metadata "filename" values already start with the split name, so the
# resolved path contains it twice (see the printed audio path below).
data_files = [os.path.join(CV_DB_DIRECTORY, CV_DATASET, filename) for filename in cv_dev_metadata["filename"]]

cv_dev_dataset = load_dataset("audiofolder", data_files=data_files, drop_metadata=True)
cv_dev_dataset = cv_dev_dataset["train"]

# The metadata columns are attached positionally, so the audio rows and the
# metadata rows must at least agree in number.
assert len(cv_dev_dataset) == len(cv_dev_metadata), (
    "Loaded {} audio files but the metadata has {} rows.".format(len(cv_dev_dataset), len(cv_dev_metadata))
)

# (dataset column, metadata column) pairs; "generated_text" is exposed as "pretrained_text".
for dataset_column, metadata_column in (
    ("file", "filename"),
    ("text", "text"),
    ("up_votes", "up_votes"),
    ("down_votes", "down_votes"),
    ("pretrained_text", "generated_text"),
):
    cv_dev_dataset = cv_dev_dataset.add_column(dataset_column, cv_dev_metadata[metadata_column])

print(cv_dev_dataset)

# Fetch the first row once: every `cv_dev_dataset[0]` access loads the audio again.
first_example = cv_dev_dataset[0]
print("File:            {}".format(first_example["file"]))
print("Audio Path:      {}".format(first_example["audio"]))
print("Text:            {}".format(first_example["text"]))
print("Pretrained Text: {}".format(first_example["pretrained_text"]))
print("Upvotes:         {}".format(first_example["up_votes"]))
print("Downvotes:       {}".format(first_example["down_votes"]))

Dataset({
    features: ['audio', 'file', 'text', 'up_votes', 'down_votes', 'pretrained_text'],
    num_rows: 4076
})
File:            cv-valid-dev/sample-000000.mp3
Audio Path:      {'path': '/home/zchin/common_voice/cv-valid-dev/cv-valid-dev/sample-000000.mp3', 'array': array([ 0.00000000e+00,  6.02674067e-17,  6.65033890e-17, ...,
       -2.24995560e-07, -1.15900491e-07,  1.59164927e-07]), 'sampling_rate': 48000}
Text:            be careful with your prognostications said the stranger
Pretrained Text: BE CAREFUL WITH YOUR PROGNOSTICATIONS SAID THE STRANGER
Upvotes:         1
Downvotes:       0


In [9]:
from datasets import Audio
import re
from transformers import Wav2Vec2Processor

# Processor of the base checkpoint; the cells below use it both to prepare the
# model inputs/labels and to decode predictions.
processor = Wav2Vec2Processor.from_pretrained(
    pretrained_model_name_or_path="facebook/wav2vec2-large-960h",
)

# Sampling rate the processor's feature extractor expects; the audio column is
# resampled to this rate before preparation.
SAMPLING_RATE = processor.feature_extractor.sampling_rate

# Punctuation stripped from the reference transcripts (apostrophes are kept).
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"]'

def remove_special_characters(batch):
    """Normalise the reference transcript of one example.

    Removes every character matched by `chars_to_ignore_regex` from
    `batch["text"]` and upper-cases the remainder. The example is modified in
    place and also returned, as `Dataset.map` expects.
    """
    without_punctuation = re.sub(chars_to_ignore_regex, '', batch["text"])
    batch["text"] = without_punctuation.upper()
    return batch

def prepare_dataset(batch):
    """Add model inputs and label ids to one example.

    Reads `batch["audio"]` (its "array" and "sampling_rate" entries) and
    `batch["text"]`, and stores the processor outputs under
    `batch["input_values"]` and `batch["labels"]`. Returns the same example.
    """
    waveform = batch["audio"]["array"]
    rate = batch["audio"]["sampling_rate"]

    # The processor returns a batched result; take element 0 to "un-batch" it
    # so each example maps to exactly one input sequence.
    features = processor(audio=waveform, sampling_rate=rate)
    batch["input_values"] = features.input_values[0]

    encoded_text = processor(text=batch["text"])
    batch["labels"] = encoded_text.input_ids

    return batch

# Normalise the reference text first, then have the audio column resampled to
# the rate the processor expects.
cv_dev_ds = (
    cv_dev_dataset
    .map(remove_special_characters)
    .cast_column("audio", Audio(sampling_rate=SAMPLING_RATE))
)

# Compute the model inputs and label ids for every example.
dev_prepared_ds = cv_dev_ds.map(prepare_dataset)

print(dev_prepared_ds)

Map: 100%|██████████| 4076/4076 [00:00<00:00, 34517.47 examples/s]
Map: 100%|██████████| 4076/4076 [02:05<00:00, 32.43 examples/s] 

Dataset({
    features: ['audio', 'file', 'text', 'up_votes', 'down_votes', 'pretrained_text', 'input_values', 'labels'],
    num_rows: 4076
})





In [10]:
def map_finetuned_to_result(batch):
  """Transcribe one prepared example with the fine-tuned model.

  Feeds `batch["input_values"]` through `finetuned_model` as a batch of one,
  takes the arg-max token id per frame and stores the decoded string in
  `batch["finetuned_text"]`. `batch["text"]` is replaced by the label ids
  decoded without token grouping. Returns the same example.
  """
  waveform = torch.tensor(batch["input_values"], device=device)

  with torch.no_grad():
    # unsqueeze(0) adds the leading batch dimension for the single example
    ft_logits = finetuned_model(waveform.unsqueeze(0)).logits
    ft_pred_ids = ft_logits.argmax(dim=-1)

  decoded_predictions = processor.batch_decode(ft_pred_ids)
  batch["finetuned_text"] = decoded_predictions[0]
  batch["text"] = processor.decode(batch["labels"], group_tokens=False)

  return batch

# Transcribe every prepared example with the fine-tuned model (one example per call).
results = dev_prepared_ds.map(function=map_finetuned_to_result)

Map: 100%|██████████| 4076/4076 [02:30<00:00, 27.16 examples/s]


In [11]:
from datasets import load_metric

# NOTE(review): `load_metric` is deprecated in `datasets` (see the warning in
# this cell's output); migrating needs a dependency this notebook does not import.
wer_metric = load_metric("wer")

# Both models are scored against the same normalised reference transcripts.
reference_texts = results["text"]
pretrained_wer = wer_metric.compute(predictions=results["pretrained_text"], references=reference_texts)
finetuned_wer = wer_metric.compute(predictions=results["finetuned_text"], references=reference_texts)

print("Pretrained WER: {:.3f}".format(pretrained_wer))
print("Finetuned WER: {:.3f}".format(finetuned_wer))

  wer_metric = load_metric("wer")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Pretrained WER: 0.108
Finetuned WER: 0.038


In [14]:
import random
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Args:
        dataset: Object supporting `len()` and indexing with a list of integer
            positions, where the indexed result can be passed to `pd.DataFrame`.
        num_examples: Number of distinct rows to show; must not exceed `len(dataset)`.

    Raises:
        AssertionError: If `num_examples` is larger than the dataset.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."

    # random.sample draws without replacement in a single call. This replaces a
    # retry loop that re-rolled on every duplicate and slowed down as
    # num_examples approached len(dataset).
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

# Keep only the three transcript columns so the random preview stays readable.
transcript_columns = ["finetuned_text", "pretrained_text", "text"]
show_random_elements(results.remove_columns([name for name in results.features if name not in transcript_columns]))

Unnamed: 0,text,pretrained_text,finetuned_text
0,IT WAS STARLIGHT AND I EXPLAINED THE SIGNS OF THE ZODIAC TO HER,IT WAS STARLIGHT AND I EXPLAINED THE SIGNS OF THE ZODIAC TO HER,IT WAS STARLIGHT AND I EXPLAINED THE SIGNS OF THE ZODIAC TO HER
1,LOTS OF PLACES SELL TEA AROUND HERE THE MERCHANT SAID,LOTS OF PLACES SELL TEA AROUND HERE EMAGIN SAID,LOTS OF PLACES SELL TEA AROUND HERE THE MERCHANT SAID
2,MAYBE BECAUSE THAT WASN'T REALLY HIS DREAM,MAYBE BECAUSE THAT WASN'T DERILLY HIS DREAM,MAYBE BECAUSE THAT WASN'T REALLY HIS DREAM
3,THEY WERE KNOWN AS SEERS AND THEY WERE HELD IN FEAR BY WOMEN AND THE ELDERLY,DAVEA KNOWN AS SIARS AND DAVA HELD IN FEAR BY WOMEN AND THE ELDERLY,THEY WERE KNOWN AS SEERS AND THEY WERE HELD IN FEAR BY WOMEN AND THE ELDERLY
4,HE APPROACHED THE MASS AND WAS SURPRISED AT THE SIZE AND THE SHAPE,HE APPROACHED THE MASS AND WAS SURPRISED AT THE SIZE AND THE SHAPE,HE APPROACHED THE MASS AND WAS SURPRISED AT THE SIZE AND THE SHAPE
5,HE PUT A SIGN ON THE DOOR AND THEY WENT TO A SMALL CAFE NEARBY,HE PUT A SIGN OF THE DOOR AND THEY WENT TO A SMALL CAFE NEAR BY,HE PUT A SIGN ON THE DOOR AND THEY WENT TO A SMALL CAFE NEARBY
6,HE APPROACHED THE MASS AND WAS SURPRISED AT THE SIZE AND THE SHAPE,HE APPROACHED THE MASS AND WAS SURPRISED AT THE SIZE AND SHAPE,HE APPROACHED THE MASS AND WAS SURPRISED AT THE SIZE AND THE SHAPE
7,WHAT WAS WRITTEN ON THE EMERALD TABLET THE BOY WANTED TO KNOW,WHAT WAS WRITTEN ON THE EMERALD TABLET THE BOY WANTED TO KNOW,WHAT WAS WRITTEN ON THE EMERALD TABLET THE BOY WANTED TO KNOW
8,THE BOY WAS SAD AS HE LEFT HER THAT DAY,THE BOY WAS SAD AS HE LEFT HER THAT DAY,THE BOY WAS SAD AS HE LEFT HER THAT DAY
9,WHY ARE YOU CARRYING MONEY ASKED THE TRIBESMAN WHEN HE HAD SEARCHED THE BOY'S BAG,WHY ARE YOU CARRYING MONEY ASKED THE TRIBESMAN WHEN HE HAD SEARCHED A BOY'S BED,WHY ARE YOU CARRYING MONEY ASKED THE TRIBESMAN WHEN HE HAD SEARCHED THE BOY'S BED


In [16]:
# Attach the fine-tuned transcriptions to the metadata (row order follows the
# dataset built from it) and write the combined table to the working directory.
finetuned_transcripts = results["finetuned_text"]
cv_dev_metadata["finetuned_text"] = finetuned_transcripts
cv_dev_metadata.to_csv(path_or_buf="cv-valid-dev.csv", index=False)

In [19]:
# Side-by-side frame of the normalised reference and both model transcriptions;
# the column order matches the list below.
comparison_columns = ["text", "finetuned_text", "pretrained_text"]
cv_dev_results = pd.DataFrame({column: results[column] for column in comparison_columns})

cv_dev_results.head(10)

Unnamed: 0,text,finetuned_text,pretrained_text
0,BE CAREFUL WITH YOUR PROGNOSTICATIONS SAID THE...,BE CAREFUL WITH YOUR PROGNOSTICATIONS SAID THE...,BE CAREFUL WITH YOUR PROGNOSTICATIONS SAID THE...
1,THEN WHY SHOULD THEY BE SURPRISED WHEN THEY SE...,THEN WHY SHOULD THEY BE SURPRISED WHEN THEY SE...,THEN WHY SHOULD THEY BE SURPRISED WHEN THEY SE...
2,A YOUNG ARAB ALSO LOADED DOWN WITH BAGGAGE ENT...,A YOUNG ARAB ALSO LOADED DOWN WITH BAGGAGE ENT...,A YOUNG ARAB ALSO LOADED DOWN WITH BAGGAGE ENT...
3,I THOUGHT THAT EVERYTHING I OWNED WOULD BE DES...,I THOUGHT THAT EVERYTHING I OWNED WOULD BE DES...,I FELT THAT EVERYTHING I OWNED WOULD BE DESTROYED
4,HE MOVED ABOUT INVISIBLE BUT EVERYONE COULD HE...,HE MOVED ABOUT INVISIBLE BUT EVERYONE COULD HE...,HE MOVED ABOUT INVISIBLE BUT EVERY ONE COULD H...
5,BUT EVERYTHING HAD CHANGED,BUT EVERYTHING HAD CHANGED,BUT EVERYTHING HAD CHANGED
6,ARE YOU SURE THIS IS CLAIRE,ARE YOU SURE THIS IS CLAIRE,ARE YOU SURE DIS IS CLAIRE
7,IT HAD TOLD HIM TO DIG WHERE HIS TEARS FELL,IT HAD TOLD HIM TO DIG WHERE HIS TEARS FELL,IT HAD TOLD HIM TO DIG WHERE HIS TEARS FELL
8,THE SHOP FOLKS WERE TAKING DOWN THEIR SHUTTERS...,THE SHOP FOLKS WERE TAKING DOWN THEIR SHUTTERS...,THE SHOP FOLKS WERE TAKING DOWN THEIR SHUTTERS...
9,THE TEACHER THOUGHT THAT HE'D TAUGHT HIMSELF A...,THE TEACHER TOUGHT THAT HE'D TOUGHT HIMSELF AL...,THE TEACHER THOUGHT THAT HE TAUGHT HIMSELF ALL...


In [32]:
# Regressions: the fine-tuned output differs from the reference although the
# pretrained output matches it exactly.
is_finetuned_wrong = cv_dev_results["text"] != cv_dev_results["finetuned_text"]
is_pretrained_right = cv_dev_results["text"] == cv_dev_results["pretrained_text"]
cv_finetuned_bad_df = cv_dev_results[is_finetuned_wrong & is_pretrained_right]

print("Number of samples where finetuned is wrong but pretrained is correct: {}".format(len(cv_finetuned_bad_df)))

# Cap the sample size: DataFrame.sample raises ValueError when asked for more
# rows than exist, which would break the cell if fewer than 10 rows qualify.
display(HTML(cv_finetuned_bad_df[["finetuned_text", "text"]].sample(min(10, len(cv_finetuned_bad_df))).to_html()))

Number of samples where finetuned is wrong but pretrained is correct: 84


Unnamed: 0,finetuned_text,text
3600,BUT THE ENGLISHMAN APPEARED NOT TO ATTAUCH ANY IMPORTANCE TO IT,BUT THE ENGLISHMAN APPEARED NOT TO ATTACH ANY IMPORTANCE TO IT
2832,THE REFERENCES ARE GOOD IN THE VERY NARROW AREA OF HISTOGRAMH METHODS,THE REFERENCES ARE GOOD IN THE VERY NARROW AREA OF HISTOGRAM METHODS
3215,AT WASN'T MUCH ANYWAY,IT WASN'T MUCH ANYWAY
413,A LITTLE LEMONGRASS SHOULD FRESTEN IT UP,A LITTLE LEMONGRASS SHOULD FRESHEN IT UP
3517,THE THE BOY AWOKE AS THE SUN ROSE,THE BOY AWOKE AS THE SUN ROSE
2041,THE BOY WATCHED AS HIS COMPANION WENT TO HIS HORSE AND WITHDREW A CIMITAR,THE BOY WATCHED AS HIS COMPANION WENT TO HIS HORSE AND WITHDREW A SCIMITAR
427,BUT SHE'S GONE,BUT SHE IS GONE
2998,RED YOU WOULD HAVE TO HAVE BEEN BORN AN ARAB TO UNDERSTAND HE ANSWERED,YOU WOULD HAVE TO HAVE BEEN BORN AN ARAB TO UNDERSTAND HE ANSWERED
616,THE HORIZON WAS TINGED WITH RED AND SUDDENLY THE SIGN APPEARED,THE HORIZON WAS TINGED WITH RED AND SUDDENLY THE SUN APPEARED
1779,GIME ME MY ROBE,GIVE ME MY ROBE


In [35]:
# Improvements: the fine-tuned output matches the reference exactly while the
# pretrained output does not.
is_finetuned_right = cv_dev_results["text"] == cv_dev_results["finetuned_text"]
is_pretrained_wrong = cv_dev_results["text"] != cv_dev_results["pretrained_text"]
cv_pretrained_bad_df = cv_dev_results[is_finetuned_right & is_pretrained_wrong]

print("Number of samples where pretrained is wrong but finetuned is correct: {}".format(len(cv_pretrained_bad_df)))

# Cap the sample size: DataFrame.sample raises ValueError when asked for more
# rows than exist, which would break the cell if fewer than 10 rows qualify.
display(HTML(cv_pretrained_bad_df[["pretrained_text", "text"]].sample(min(10, len(cv_pretrained_bad_df))).to_html()))

Number of samples where pretrained is wrong but finetuned is correct: 1145


Unnamed: 0,pretrained_text,text
469,ARE YOU GOING TO TALK OR WANT YOU,ARE YOU GOING TO TALK OR AREN'T YOU
2760,HE MOVED ABOUT INVISIBLE BUT EVERY ONE COULD HEAL HIM,HE MOVED ABOUT INVISIBLE BUT EVERYONE COULD HEAR HIM
2902,WE COULD GET TO THE PYRAMIDS BY TO MORROW SAID THE OTHER TAKING THE MONEY,WE COULD GET TO THE PYRAMIDS BY TOMORROW SAID THE OTHER TAKING THE MONEY
2090,I'M TRYING TO THINK OF SOMETHING BEFORE THOSE REPORTES GETBACK,I'M TRYING TO THINK OF SOMETHING BEFORE THOSE REPORTERS GET BACK
713,WE DON'T HAVE TO GIVE UP OUR CLAP,WE DON'T HAVE TO GIVE UP OUR CLUB
3059,CAN I GET A WHAT WHAT,CAN I GET A WOOT WOOT
2298,PEOPLE SAW ME COMING AND WELCOMED ME HE SOUGHED,PEOPLE SAW ME COMING AND WELCOMED ME HE THOUGHT
3159,THE SIMMON BLUE THAT DAY AS IT HAD NEVER BLOWN BEFORE,THE SIMUM BLEW THAT DAY AS IT HAD NEVER BLOWN BEFORE
3232,THE FOLLOWING NIGHT THE BOY APPEARED AT THE ALCHEMIS'S TENT WITH THE HORSE,THE FOLLOWING NIGHT THE BOY APPEARED AT THE ALCHEMIST'S TENT WITH A HORSE
1146,SAYS FERACHE TO GO THE LINET,SAYS FOR US TO GO THE LIMIT


In [36]:
# Shared failures: neither transcription matches the reference exactly.
is_finetuned_wrong = cv_dev_results["text"] != cv_dev_results["finetuned_text"]
is_pretrained_wrong = cv_dev_results["text"] != cv_dev_results["pretrained_text"]
cv_both_bad_df = cv_dev_results[is_finetuned_wrong & is_pretrained_wrong]

print("Number of samples where both are wrong: {}".format(len(cv_both_bad_df)))

# Cap the sample size: DataFrame.sample raises ValueError when asked for more
# rows than exist, which would break the cell if fewer than 10 rows qualify.
display(HTML(cv_both_bad_df.sample(min(10, len(cv_both_bad_df))).to_html()))

Number of samples where both are wrong: 690


Unnamed: 0,text,finetuned_text,pretrained_text
122,AT OTHER TIMES AT A CRUCIAL MOMENT I MAKE IT EASIER FOR THINGS TO HAPPEN,AT OTHER TIMES AT A CRUCIAL MOMENT YOU MAKE IT EASIER FOR THINGS TO HAPPEN,AT OTHER TIMES AT A CRUISING MOMENT YOU MAKE IT EASIER FOR THINGS TO HAPPEN
3372,AND THEN THEY WANT THE PERSON TO CHANGE,AND THEN THEY WANTED THE PERSON TO CHANGE,AND THEN THEY WANTED THE PERSON TO CHANGE
564,THE BASKETBALL BOUNCED OFF HIS SHIELD OF TITANIUM,THE BASKETBALL BOUNCED ON THE SHIELS OF TITANIUM,THE BASKETBALL BOUNCED OF HIS SHIELD OF TITANIUM
2277,GOTTA BE GENTLE TO SUIT ME,THET GOTTA BE GENTLE TO SEE ME,THET GOINTO BE GENTLE TO SIDME
2412,SUPPOSE THERE WAS A SHANE YORK AND HE WALKED INTO THIS OFFICE,SUPPOSE THERE WAS A SHANEYOK AND HE WALKED INTO THE LITE OFFICE,SUPPOSE THERE WAS A SHAME YOG AND HE WALKED INTO POLISE OFFICE
1182,I NEED YOU TO BE SPONTANEOUS HE ASKED ME OUT TO DIN DIN,I NEED YOU TO BE SPONTANEOUS HE HAS KAD ME OUT YOUR DIN DIN,I NEED YOU TO PEA SPONTENEOUS HE HAS CATTED ME OUT YOUR DINGDING
3747,THE BOY LOOKED AROUND FOR THE OVENS AND OTHER APPARATUS USED IN ALCHEMY BUT SAW NONE,THE BOY LOOKED AROUND FOR THE OVENS AND OTHER APPARATUS USED IN ALCHEMY BUT SAW NOUNG,THE BOY LOOKED AROUND FOR THE OVENS AND OTHER APPARATUS USED IN ALCEMY TUT SONAN
2954,THEY'RE FORMING CLUBS,THEY 'RE FORMING CLUBS,THEY ARE FORMING CLUBS
2368,THIS MORNING I FOUND A CALCULATOR TAPED TO MY WII,THIS MORNING I FOUND A CALCULATOR TAPED TO MY WIIT,THIS MORNING I FOUND A CALCULATOR TAKE TO MY WEAT
198,ON TOP OF ALL THAT THE WEEDS KEEP GROWING AND THE GARBAGE HAS TO BE TAKEN OUT,ON TOP OF ALL THAT THE WIED'S KEEP GROWING AND THE GARBIT HAS TO BE TAKEN OUT,ON TOP OF ALL THAT THE WEEDS KEEP GROWING AND THE GABBIT HAS TO BE TAKEN OUT
