# Notebook to prototype evaluation script for different decoders

In [3]:
import librosa
import torch
import torchaudio
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import re

# Finnish Common Voice test split and the WER metric used for scoring.
test_dataset = load_dataset("common_voice", "fi", split="test")
wer = load_metric("wer")

# Processor and CTC model are loaded from the same fine-tuned checkpoint.
processor = Wav2Vec2Processor.from_pretrained("aapot/wav2vec2-large-xlsr-53-finnish")
model = Wav2Vec2ForCTC.from_pretrained("aapot/wav2vec2-large-xlsr-53-finnish")
model.to("cuda")

# Characters stripped from the reference transcripts before scoring.
# Raw string: same character class, but without relying on invalid string
# escapes such as "\," that newer Python versions warn about.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\...\…\–\é]'


def resampler(sr, y):
    """Resample the waveform tensor `y` from `sr` Hz to 16 kHz.

    `y` is converted with `.numpy().squeeze()` before resampling, so the
    result is whatever array `librosa.resample` returns for that input.
    """
    # Sample rates are passed by keyword: librosa >= 0.10 no longer accepts
    # them positionally, while older releases accept both forms.
    return librosa.resample(y.numpy().squeeze(), orig_sr=sr, target_sr=16_000)


def speech_file_to_array_fn(batch):
    """Prepare one example: normalise its transcript and attach 16 kHz audio.

    The "sentence" entry has every character matched by
    `chars_to_ignore_regex` removed and is lower-cased. The audio file at
    "path" is loaded, passed through `resampler`, squeezed, and stored under
    "speech". The (mutated) batch is returned.
    """
    stripped_sentence = re.sub(chars_to_ignore_regex, '', batch["sentence"])
    batch["sentence"] = stripped_sentence.lower()

    waveform, source_rate = torchaudio.load(batch["path"])
    resampled = resampler(source_rate, waveform)
    batch["speech"] = resampled.squeeze()
    return batch

def evaluate(batch):
    """Transcribe a batch of waveforms with the CTC model using greedy decoding.

    Args:
        batch: Mapping whose "speech" entry holds the waveforms to transcribe;
            they are passed to the processor with a 16 kHz sampling rate.

    Returns:
        The same batch with "pred_strings" set to the decoded predictions.
    """
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

    # Follow the model's device instead of hard-coding "cuda", so the function
    # keeps working when the model lives on the CPU or on a specific GPU.
    device = model.device

    with torch.no_grad():
        logits = model(
            inputs.input_values.to(device),
            attention_mask=inputs.attention_mask.to(device),
        ).logits

    # Greedy decoding: take the highest-scoring token id along the last axis,
    # then let the processor turn the id sequences into strings.
    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    return batch


# Prepare every example (transcript clean-up + audio loading), then transcribe
# the prepared examples in batches of 8.
test_dataset = test_dataset.map(speech_file_to_array_fn)
result = test_dataset.map(evaluate, batched=True, batch_size=8)

# The metric value is scaled by 100 before printing. ".2f" prints two
# decimals; a bare "2f" would only set a minimum field width of 2 and fall
# back to six decimals.
word_error_rate = 100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])
print("WER: {:.2f}".format(word_error_rate))

Reusing dataset common_voice (/home/sampo/.cache/huggingface/datasets/common_voice/fi/6.1.0/0041e06ab061b91d0a23234a2221e87970a19cf3a81b20901474cffffeb7869f)
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Loading cached processed dataset at /home/sampo/.cache/huggingface/datasets/common_voice/fi/6.1.0/0041e06ab061b91d0a23234a2221e87970a19cf3a81b20901474cffffeb7869f/cache-01e1ce8e6f9f2ae9.arrow


HBox(children=(FloatProgress(value=0.0, max=54.0), HTML(value='')))


WER: 32.378771


In [2]:
import torch

# --- what a module's __init__ would set up ---
ic, oc, kernel = 1, 3, 5

conv = torch.nn.Conv2d(in_channels=ic, out_channels=oc, kernel_size=kernel)
# No backward pass has run yet, so there is no gradient to show here.
print(conv.weight.grad)

# --- what a module's forward would compute ---
# All-ones mask with the weight's shape; the convolution below is given the
# element-wise product of weight and mask (no bias is passed).
mask = torch.ones_like(conv.weight.data)
x = torch.randn(1, 1, 28, 28)
masked_weight = conv.weight * mask
out = torch.nn.functional.conv2d(x, masked_weight)

loss = torch.mean(out)
loss.backward()

# After backward, the gradient has reached conv.weight through the product.
print(conv.weight.grad)

None
tensor([[[[-0.0062, -0.0110, -0.0154, -0.0117, -0.0147],
          [-0.0037, -0.0086, -0.0123, -0.0088, -0.0119],
          [-0.0024, -0.0066, -0.0102, -0.0053, -0.0065],
          [-0.0049, -0.0079, -0.0096, -0.0049, -0.0080],
          [-0.0067, -0.0088, -0.0089, -0.0035, -0.0075]]],


        [[[-0.0062, -0.0110, -0.0154, -0.0117, -0.0147],
          [-0.0037, -0.0086, -0.0123, -0.0088, -0.0119],
          [-0.0024, -0.0066, -0.0102, -0.0053, -0.0065],
          [-0.0049, -0.0079, -0.0096, -0.0049, -0.0080],
          [-0.0067, -0.0088, -0.0089, -0.0035, -0.0075]]],


        [[[-0.0062, -0.0110, -0.0154, -0.0117, -0.0147],
          [-0.0037, -0.0086, -0.0123, -0.0088, -0.0119],
          [-0.0024, -0.0066, -0.0102, -0.0053, -0.0065],
          [-0.0049, -0.0079, -0.0096, -0.0049, -0.0080],
          [-0.0067, -0.0088, -0.0089, -0.0035, -0.0075]]]])
