In [1]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch

In [2]:
model = Wav2Vec2ForCTC.from_pretrained("monideep2255/finetuning-xlsr-53-PSST_V7")
processor = Wav2Vec2Processor.from_pretrained("monideep2255/finetuning-xlsr-53-PSST_V7")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
processor.decode

<bound method Wav2Vec2Processor.decode of Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

- tokenizer: Wav2Vec2CTCTokenizer(name_or_path='monideep2255/finetuning-xlsr-53-PSST_V7', vocab_size=46, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<UNK>', 'pad_token': '<PAD>', 'additional_special_tokens': [AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True)]}, clean_up_tokenization_spaces=True)>

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (2): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (3): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elemen

In [5]:
from datasets import load_dataset, load_metric, DatasetDict, Dataset, Audio

# Load the datasets and observe the structure
dataset_dict = load_dataset('csv', data_files={
    "test": '/work/van-speech-nlp/psst-csv/test_utterances_excel.csv',
})

# review the datasets
test_inferences = dataset_dict["test"]

print(test_inferences)

Found cached dataset csv (/home/lewis.jor/.cache/huggingface/datasets/csv/default-c026370f45f2f2db/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['utterance_id', 'session', 'test', 'prompt', 'transcript', 'correctness', 'aq_index', 'duration_frames', 'filename_old', 'filename_new'],
    num_rows: 652
})


In [6]:
# remove columns that we do not need
test_inferences = test_inferences.remove_columns(["aq_index", "test", "duration_frames","filename_old"])

# print to verify
print(test_inferences)

Dataset({
    features: ['utterance_id', 'session', 'prompt', 'transcript', 'correctness', 'filename_new'],
    num_rows: 652
})


In [7]:
test_inferences = test_inferences.cast_column("filename_new", Audio(sampling_rate=16000))

In [8]:
test_inferences["filename_new"][5]

{'path': '/work/van-speech-nlp/psst-data/psst-data-2022-03-02-full/test/audio/bnt/ACWT01a/ACWT01a-BNT06-volcano.wav',
 'array': array([-0.00097656,  0.00195312,  0.01193237, ..., -0.00048828,
         0.00024414,  0.00213623]),
 'sampling_rate': 16000}

In [13]:
sample_inference_data = test_inferences[:5]
#sample_inference_data['input_values'][0]
print(len(test_inferences['input_values']))

652


In [10]:
def prepare_references_dataset(batch):
    # load the audio data into batch
    audio = batch["filename_new"]

    # extract the values from the audio files
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])

    # encode the transcript to the label ids
    with processor.as_target_processor():
        batch["labels"] = processor(batch["transcript"]).input_ids
    
    # remove all columns except for 'transcript'
    batch = {key: batch[key] for key in batch.keys() if key == 'transcript'}
    
    return batch

test_inferences = test_inferences.map(prepare_references_dataset, num_proc=4)

Loading cached processed dataset at /home/lewis.jor/.cache/huggingface/datasets/csv/default-c026370f45f2f2db/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-1faface4f8c75806_*_of_00004.arrow


In [11]:
processor.tokenizer.decoder = {24: '<???>',
 3: '<PAD>',
 2: '<SIL>',
 18: '<SPN>',
 19: '<UNK>',
 1: ' AA ',
 8: '  AE',
 6: ' AH ',
 36: ' AO ',
 33: ' AW ',
 17: ' AY ',
 20: ' B ',
 43: ' CH ',
 35: ' D ',
 42: ' DH ',
 10: ' DX ',
 7: ' EH ',
 12: ' ER ',
 44: ' EY ',
 27: ' F ',
 40: ' G ',
 9: ' HH ',
 41: ' IH ',
 14: ' IY ',
 28: ' JH ',
 21: ' K ',
 22: ' L ',
 37: ' M ',
 0: ' N ',
 25: ' NG ',
 16: ' OW ',
 15: ' OY ',
 32: ' P ',
 45: ' R ',
 38: ' S ',
 29: ' SH ',
 5: ' T ',
 31: ' TH ',
 11: ' UH ',
 4: ' UW ',
 34: ' V ',
 30: ' W ',
 39: ' Y ',
 13: ' Z ',
 26: ' ZH ',
 23: '|'}

In [12]:
import librosa
import numpy as np

# Generate predictions for each sample
for i in range(len(sample_inference_data['input_values'])):
    input_values = np.array(sample_inference_data['input_values'][i])
    sampling_rate = sample_inference_data['input_length'][i]

    # Resample the input speech to match the model's sampling rate
    input_values = librosa.resample(input_values, orig_sr=sampling_rate, target_sr=16000)

    input_values = processor(input_values, sampling_rate=16000, return_tensors="pt").input_values
    input_values = input_values.to(device)  # Move input to the same device as the model
    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    #print(predicted_ids)
    transcription = processor.decode(predicted_ids[0],clean_up_tokenization_spaces=False)

    reference_transcription = sample_inference_data['transcript'][i]

    print("Reference:", reference_transcription)
    print("Prediction:", transcription.lstrip().rstrip().replace('  ',' ').replace('\t',' '))
    print("---")

KeyError: 'input_values'

In [None]:
processor.tokenizer.decoder

In [None]:
processor.tokenizer.encoder['AA']

In [None]:
processor.tokenizer.decoder = {24: '<???>',
 3: '<PAD>',
 2: '<SIL>',
 18: '<SPN>',
 19: '<UNK>',
 1: ' AA ',
 8: '  AE',
 6: ' AH ',
 36: ' AO ',
 33: ' AW ',
 17: ' AY ',
 20: ' B ',
 43: ' CH ',
 35: ' D ',
 42: ' DH ',
 10: ' DX ',
 7: ' EH ',
 12: ' ER ',
 44: ' EY ',
 27: ' F ',
 40: ' G ',
 9: ' HH ',
 41: ' IH ',
 14: ' IY ',
 28: ' JH ',
 21: ' K ',
 22: ' L ',
 37: ' M ',
 0: ' N ',
 25: ' NG ',
 16: ' OW ',
 15: ' OY ',
 32: ' P ',
 45: ' R ',
 38: ' S ',
 29: ' SH ',
 5: ' T ',
 31: ' TH ',
 11: ' UH ',
 4: ' UW ',
 34: ' V ',
 30: ' W ',
 39: ' Y ',
 13: ' Z ',
 26: ' ZH ',
 23: '|'}