# **Fine-tuning Wav2Vec2 for PSST Dataset with 🤗 Transformers**

### **Ensure that GPU and RAM is set up: will be needed for training purpose**

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Wed Jul 19 17:46:00 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K40m          Off  | 00000000:0B:00.0 Off |                    0 |
| N/A   33C    P0    65W / 235W |      0MiB / 11441MiB |     97%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# ensure enough memory present so that training does not stop
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 134.3 gigabytes of available RAM

You are using a high-RAM runtime!


### **Packages/libraries needed:** <br>
`datasets`: to transform the dataset <br>
`transformers`: upgraded version of RNN (allows to process a large quantity of text) <br>
`librosa`: needed for the audio files <br>
`jiwer`: PER and FER metric

In [None]:
# Install required libraries
!pip install datasets
!pip install transformers==4.28.0
!pip install accelerate
!pip install jiwer
!pip install huggingface_hub
!pip install librosa
!pip install torch

[0mCollecting transformers==4.28.0
  Using cached transformers-4.28.0-py3-none-any.whl (7.0 MB)
[0mInstalling collected packages: transformers
  Attempting uninstall: transformers
[0m    Found existing installation: transformers 4.23.1
    Uninstalling transformers-4.23.1:
      Successfully uninstalled transformers-4.23.1
Successfully installed transformers-4.28.0
[0m

In [None]:
#! pip install torchaudio

In [None]:
# import the libraries
import torch
#import torchaudio
import pandas as pd
from datasets import load_dataset
#from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2CTCTrainer

In [None]:
import sys
sys.exec_prefix

### **Login to hugging Face to save the model**

In [None]:
from huggingface_hub import notebook_login

notebook_login()

### **Upload/Assign the path for the PSST dataset that will be finetuned against the state of the art model**

In [8]:
# load the datasets and observe the structure
from datasets import load_dataset, load_metric, DatasetDict, Dataset, Audio

# Load the datasets and observe the structure
dataset_dict = load_dataset('csv', data_files={
    "train": '/work/van-speech-nlp/PSST-experiments/psst-csv/train_utterances_excel.csv',
    "valid": '/work/van-speech-nlp/PSST-experiments/psst-csv/valid_utterances_excel.csv',
    "test": '/work/van-speech-nlp/PSST-experiments/psst-csv/test_utterances_excel.csv',
})

# review the datasets
train_csv = dataset_dict["train"]
valid_csv = dataset_dict["valid"]
test_csv = dataset_dict["test"]

print(train_csv)
print(valid_csv)
print(test_csv)

Downloading and preparing dataset csv/default to /home/chakraborti.m/.cache/huggingface/datasets/csv/default-82722730b4e84c1a/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating valid split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /home/chakraborti.m/.cache/huggingface/datasets/csv/default-82722730b4e84c1a/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Dataset({
    features: ['utterance_id', 'session', 'test', 'prompt', 'transcript', 'correctness', 'aq_index', 'duration_frames', 'filename_old', 'filename_new'],
    num_rows: 2298
})
Dataset({
    features: ['utterance_id', 'session', 'test', 'prompt', 'transcript', 'correctness', 'aq_index', 'duration_frames', 'filename_old', 'filename_new'],
    num_rows: 341
})
Dataset({
    features: ['utterance_id', 'session', 'test', 'prompt', 'transcript', 'correctness', 'aq_index', 'duration_frames', 'filename_old', 'filename_new'],
    num_rows: 652
})


### **Preprocessing the Data and prepare the Tokenizer and Feature Extractor**

ASR models transcribe speech to text which leads to the requirement of a feature extractor and tokenizer: <br>
`feature extractor`: processes speech signal to the required input format: audio processing: feature vector <br>
`tokenizer`: converts the model's output to text format <br>
`wave2vec2` has the following tokenizer: `wave2vec2CTCTokenizer` and feature extractor: `wave2vec2FeatureExtractor`

### **Preprocess the Data**

In [9]:
# remove columns that we do not need
train_dataset = train_csv.remove_columns(["aq_index", "test", "duration_frames", "filename_old"])
valid_dataset = valid_csv.remove_columns(["aq_index", "test", "duration_frames","filename_old"])
test_dataset = test_csv.remove_columns(["aq_index", "test", "duration_frames","filename_old"])

# print to verify
print(train_dataset)
print(valid_dataset)
print(test_dataset)

Dataset({
    features: ['utterance_id', 'session', 'prompt', 'transcript', 'correctness', 'filename_new'],
    num_rows: 2298
})
Dataset({
    features: ['utterance_id', 'session', 'prompt', 'transcript', 'correctness', 'filename_new'],
    num_rows: 341
})
Dataset({
    features: ['utterance_id', 'session', 'prompt', 'transcript', 'correctness', 'filename_new'],
    num_rows: 652
})


In [10]:
# Print samples from the train dataset
print("Train Dataset:")
for i in range(2):
    sample = train_csv[i]
    print(f"Sample {i+1}:")
    print("Utterance ID:", sample["utterance_id"])
    print("Session:", sample["session"])
    print("Prompt:", sample["prompt"])
    print("Transcript:", sample["transcript"])
    print("Correctness:", sample["correctness"])
    print("Filename:", sample["filename_new"])
    print()

# Print samples from the validation dataset
print("Validation Dataset:")
for i in range(2):
    sample = valid_csv[i]
    print(f"Sample {i+1}:")
    print("Utterance ID:", sample["utterance_id"])
    print("Session:", sample["session"])
    print("Prompt:", sample["prompt"])
    print("Transcript:", sample["transcript"])
    print("Correctness:", sample["correctness"])
    print("Filename:", sample["filename_new"])
    print()

# Print samples from the test dataset
print("Test Dataset:")
for i in range(2):
    sample = test_csv[i]
    print(f"Sample {i+1}:")
    print("Utterance ID:", sample["utterance_id"])
    print("Session:", sample["session"])
    print("Prompt:", sample["prompt"])
    print("Transcript:", sample["transcript"])
    print("Correctness:", sample["correctness"])
    print("Filename:", sample["filename_new"])
    print()


Train Dataset:
Sample 1:
Utterance ID: ACWT02a-BNT01-house
Session: ACWT02a
Prompt: house
Transcript: HH AW S
Correctness: True
Filename: /work/van-speech-nlp/PSST-experiments/psst-data/psst-data-2022-03-02-full/train/audio/bnt/ACWT02a/ACWT02a-BNT01-house.wav

Sample 2:
Utterance ID: ACWT02a-BNT02-comb
Session: ACWT02a
Prompt: comb
Transcript: K OW M
Correctness: True
Filename: /work/van-speech-nlp/PSST-experiments/psst-data/psst-data-2022-03-02-full/train/audio/bnt/ACWT02a/ACWT02a-BNT02-comb.wav

Validation Dataset:
Sample 1:
Utterance ID: BU01a-BNT01-house
Session: BU01a
Prompt: house
Transcript: HH AW S
Correctness: True
Filename: /work/van-speech-nlp/PSST-experiments/psst-data/psst-data-2022-03-02-full/valid/audio/bnt/BU01a/BU01a-BNT01-house.wav

Sample 2:
Utterance ID: BU01a-BNT02-comb
Session: BU01a
Prompt: comb
Transcript: K OW M
Correctness: True
Filename: /work/van-speech-nlp/PSST-experiments/psst-data/psst-data-2022-03-02-full/valid/audio/bnt/BU01a/BU01a-BNT02-comb.wav

Test 

### **Assign Huggingface repo**

In [13]:
repo_name = "finetuning-xlsr-53-PSST_V7"

In [14]:
repo_name

'finetuning-xlsr-53-PSST_V7'

In [15]:
# ignore special characters: with no language model hard to classify them
# also convert all the text into lowercase: makes life much more easier
import re
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"]'

def remove_special_characters(batch):
    batch["transcript"] = re.sub(chars_to_ignore_regex, '', batch["transcript"]) + " "
    return batch

In [16]:
# use map function to carry out the process/transformation

train_dataset = train_dataset.map(remove_special_characters)
valid_dataset = valid_dataset.map(remove_special_characters)
test_dataset = test_dataset.map(remove_special_characters)

Loading cached processed dataset at /home/chakraborti.m/.cache/huggingface/datasets/csv/default-f3078a164edca294/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-cda3d5ce03515385.arrow
Loading cached processed dataset at /home/chakraborti.m/.cache/huggingface/datasets/csv/default-f3078a164edca294/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-0a2dc8c99a87661f.arrow
Loading cached processed dataset at /home/chakraborti.m/.cache/huggingface/datasets/csv/default-f3078a164edca294/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-d0ae3049458e5309.arrow


In [17]:
# Print some sample outputs
num_samples = 5  # Number of samples to print
sample_indices = range(num_samples)

for idx in sample_indices:
    sample = train_dataset[idx]
    print(f"Sample {idx + 1}:")
    print("Original Transcript:", train_dataset[idx]["transcript"])
    print("Processed Transcript:", sample["transcript"])
    print()

Sample 1:
Original Transcript: HH AW S 
Processed Transcript: HH AW S 

Sample 2:
Original Transcript: K OW M 
Processed Transcript: K OW M 

Sample 3:
Original Transcript: T UW TH B R AH SH 
Processed Transcript: T UW TH B R AH SH 

Sample 4:
Original Transcript: AA S AH P R OW G P UH S 
Processed Transcript: AA S AH P R OW G P UH S 

Sample 5:
Original Transcript: B EH N CH 
Processed Transcript: B EH N CH 



In [27]:
# write a function that will first concatenate all the transcriptions to one single transcription and them we map them to characters
# In short: creating tokens: determine the length of array

# Provided with the PSST dataset
def extract_all_chars(batch):
    all_transcript = " ".join(batch["transcript"])
    vocab = {
        "<pad>": 0,
        "AA": 1,
        "AE": 2,
        "AH": 3,
        "AO": 4,
        "AW": 5,
        "AY": 6,
        "B": 7,
        "CH": 8,
        "D": 9,
        "DH": 10,
        "DX": 11,
        "EH": 12,
        "ER": 13,
        "EY": 14,
        "F": 15,
        "G": 16,
        "HH": 17,
        "IH": 18,
        "IY": 19,
        "JH": 20,
        "K": 21,
        "L": 22,
        "M": 23,
        "N": 24,
        "NG": 25,
        "OW": 26,
        "OY": 27,
        "P": 28,
        "R": 29,
        "S": 30,
        "SH": 31,
        "T": 32,
        "TH": 33,
        "UH": 34,
        "UW": 35,
        "V": 36,
        "W": 37,
        "Y": 38,
        "Z": 39,
        "ZH": 40,
        "<sil>": 41,
        "<spn>": 42,
        "<unk>": 43,
        " ": 44,
        "<???>":45
    }
    return {"vocab": [vocab], "all_transcript": [all_transcript]}


# done for training and validation datasets
train_vocabs = train_dataset.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=train_dataset.column_names)
valid_vocabs = valid_dataset.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=valid_dataset.column_names)

Map:   0%|          | 0/2298 [00:00<?, ? examples/s]

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

In [28]:
# we create the union of all distinct letters in the training dataset and test dataset and convert the resulting list into
# an enumerated dictionary

# Create the union of all distinct letters in the training and validation datasets
vocab_list = list(set(train_vocabs["vocab"][0]) | set(valid_vocabs["vocab"][0]))

# Convert the resulting list into an enumerated dictionary
vocab_dict = {v: k for k, v in enumerate(vocab_list)}

vocab_dict

{'N': 0,
 'AA': 1,
 '<sil>': 2,
 '<pad>': 3,
 'UW': 4,
 'T': 5,
 'AH': 6,
 'EH': 7,
 'AE': 8,
 'HH': 9,
 'DX': 10,
 'UH': 11,
 'ER': 12,
 'Z': 13,
 'IY': 14,
 'OY': 15,
 'OW': 16,
 'AY': 17,
 '<spn>': 18,
 '<unk>': 19,
 'B': 20,
 'K': 21,
 'L': 22,
 ' ': 23,
 '<???>': 24,
 'NG': 25,
 'ZH': 26,
 'F': 27,
 'JH': 28,
 'SH': 29,
 'W': 30,
 'TH': 31,
 'P': 32,
 'AW': 33,
 'V': 34,
 'D': 35,
 'AO': 36,
 'M': 37,
 'S': 38,
 'Y': 39,
 'G': 40,
 'IH': 41,
 'DH': 42,
 'CH': 43,
 'EY': 44,
 'R': 45}

In [29]:
# from the above tokens: we capitalize the silence, spoken noise, padding and unknown tokens
# this maintains consistency
vocab_dict["<SIL>"] = vocab_dict["<sil>"]
vocab_dict["<PAD>"] = vocab_dict["<pad>"]
vocab_dict["<SPN>"] = vocab_dict["<spn>"]
vocab_dict["<UNK>"] = vocab_dict["<unk>"]

# from the above tokens: we given the space token visibility by using the symbol (|)
vocab_dict["|"] = vocab_dict[" "]

del vocab_dict["<sil>"]
del vocab_dict["<pad>"]
del vocab_dict["<spn>"]
del vocab_dict["<unk>"]
del vocab_dict[" "]
vocab_dict

{'N': 0,
 'AA': 1,
 'UW': 4,
 'T': 5,
 'AH': 6,
 'EH': 7,
 'AE': 8,
 'HH': 9,
 'DX': 10,
 'UH': 11,
 'ER': 12,
 'Z': 13,
 'IY': 14,
 'OY': 15,
 'OW': 16,
 'AY': 17,
 'B': 20,
 'K': 21,
 'L': 22,
 '<???>': 24,
 'NG': 25,
 'ZH': 26,
 'F': 27,
 'JH': 28,
 'SH': 29,
 'W': 30,
 'TH': 31,
 'P': 32,
 'AW': 33,
 'V': 34,
 'D': 35,
 'AO': 36,
 'M': 37,
 'S': 38,
 'Y': 39,
 'G': 40,
 'IH': 41,
 'DH': 42,
 'CH': 43,
 'EY': 44,
 'R': 45,
 '<SIL>': 2,
 '<PAD>': 3,
 '<SPN>': 18,
 '<UNK>': 19,
 '|': 23}

In [30]:
print(len(vocab_dict))

46


In [31]:
vocab_dict

{'N': 0,
 'AA': 1,
 'UW': 4,
 'T': 5,
 'AH': 6,
 'EH': 7,
 'AE': 8,
 'HH': 9,
 'DX': 10,
 'UH': 11,
 'ER': 12,
 'Z': 13,
 'IY': 14,
 'OY': 15,
 'OW': 16,
 'AY': 17,
 'B': 20,
 'K': 21,
 'L': 22,
 '<???>': 24,
 'NG': 25,
 'ZH': 26,
 'F': 27,
 'JH': 28,
 'SH': 29,
 'W': 30,
 'TH': 31,
 'P': 32,
 'AW': 33,
 'V': 34,
 'D': 35,
 'AO': 36,
 'M': 37,
 'S': 38,
 'Y': 39,
 'G': 40,
 'IH': 41,
 'DH': 42,
 'CH': 43,
 'EY': 44,
 'R': 45,
 '<SIL>': 2,
 '<PAD>': 3,
 '<SPN>': 18,
 '<UNK>': 19,
 '|': 23}

In [33]:
# Save the vocabulary dictionary as a JSON file
import json

with open('psst_vocab_V7.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

### **Tokenizer**

In [34]:
# instantiate an object of the tokenizer class
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer("./psst_vocab_V7.json", unk_token="<UNK>", pad_token="<PAD>", silence_token="<SIL>", spoken_noise_token="<SPN>", word_delimiter_token="|", other_special_token="<???>", vocab_size=46)

In [35]:
# push it to Hugging face to use it later
tokenizer.push_to_hub(repo_name)

CommitInfo(commit_url='https://huggingface.co/monideep2255/finetuning-xlsr-53-PSST_V7/commit/8b410c828852e28e9362c6d7db070eba47b659a9', commit_message='Upload tokenizer', commit_description='', oid='8b410c828852e28e9362c6d7db070eba47b659a9', pr_url=None, pr_revision=None, pr_num=None)

### **Feature Extractor**

To convert speech to text: it has to first discretized: create individual units: called **sampling**

A higher sampling rate leads to a better approximation of the real speech signal but also necessitates more values per second

A Wav2Vec2 feature extractor object requires the following parameters to be instantiated:

- `feature_size`: Speech models take a sequence of feature vectors as an input. While the length of this sequence obviously varies, the feature size should not. In the case of Wav2Vec2, the feature size is 1 because the model was trained on the raw speech signal ${}^2$.
- `sampling_rate`: The sampling rate at which the model is trained on.
- `padding_value`: For batched inference, shorter inputs need to be padded with a specific value
- `do_normalize`: Whether the input should be *zero-mean-unit-variance* normalized or not. Usually, speech models perform better when normalizing the input
- `return_attention_mask`: Whether the model should make use of an `attention_mask` for batched inference. In general, models should **always** make use of the `attention_mask` to mask padded tokens. However, due to a very specific design choice of `Wav2Vec2`'s "base" checkpoint, better results are achieved when using no `attention_mask`.

In [36]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)

In [37]:
# wrap the feature extractor and tokenizer into a single processor class: when testing will only need model and processor object
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

### **Prepare the dataset as expected by the model**

1. load and resample the audio data: call batch["audio"]
2. extract values from the loaded audio file
3. encode the transcriptions to label ids

In [38]:
# Cast the "filename" column to Audio type for the train dataset
train_dataset = train_dataset.cast_column("filename_new", Audio(sampling_rate=16000))

# Cast the "filename" column to Audio type for the valid dataset
valid_dataset = valid_dataset.cast_column("filename_new", Audio(sampling_rate=16000))

In [39]:
# verification
train_dataset["filename_new"][5]

{'path': '/work/van-speech-nlp/psst-data/psst-data-2022-03-02-full/train/audio/bnt/ACWT02a/ACWT02a-BNT06-volcano.wav',
 'array': array([-0.00079346, -0.00128174,  0.00097656, ...,  0.00228882,
         0.00500488,  0.00778198]),
 'sampling_rate': 16000}

In [40]:
valid_dataset["filename_new"][2]

{'path': '/work/van-speech-nlp/psst-data/psst-data-2022-03-02-full/valid/audio/bnt/BU01a/BU01a-BNT03-toothbrush.wav',
 'array': array([-0.03439331, -0.04388428, -0.05801392, ...,  0.02001953,
         0.02304077,  0.01464844]),
 'sampling_rate': 16000}

In [41]:
# testing out sample audio files that have been loaded
import IPython.display as ipd
import numpy as np
import random

rand_int = random.randint(0, len(train_dataset))

print(train_dataset[rand_int]["transcript"])
print(train_dataset[rand_int]["prompt"])
ipd.Audio(data=np.asarray(train_dataset[rand_int]["filename_new"]["array"]), autoplay=True, rate=16000)

OW L UH K AH DH AE T Y AE Y AE <sil> Y AE HH IY Z G UH D AH DH AH N G UH D AH DH AH DH AH P AE D AY G EH S AY N OW 
throw


In [42]:
# verify the column names
print(train_dataset.column_names)
print(valid_dataset.column_names)

['utterance_id', 'session', 'prompt', 'transcript', 'correctness', 'filename_new']
['utterance_id', 'session', 'prompt', 'transcript', 'correctness', 'filename_new']


In [43]:
def prepare_dataset(batch):
    # load the the audio data into batch
    audio = batch["filename_new"]

    # extract the values from the audio files
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])
    
    # encode it to the label ids
    with processor.as_target_processor():
        batch["labels"] = processor(batch["transcript"]).input_ids
    return batch

In [44]:
train_dataset = train_dataset.map(prepare_dataset,remove_columns=train_dataset.column_names,num_proc=4)

Map (num_proc=4):   0%|          | 0/2298 [00:00<?, ? examples/s]



In [45]:
# verify the column names
print(train_dataset.column_names)

['input_values', 'input_length', 'labels']


In [46]:
valid_dataset = valid_dataset.map(prepare_dataset,remove_columns=valid_dataset.column_names,num_proc=4)

Map (num_proc=4):   0%|          | 0/341 [00:00<?, ? examples/s]



In [47]:
# verify the column names
print(valid_dataset.column_names)

['input_values', 'input_length', 'labels']


In [48]:
train_dataset

Dataset({
    features: ['input_values', 'input_length', 'labels'],
    num_rows: 2298
})

In [49]:
train_dataset["input_length"][5]

46128

In [50]:
valid_dataset

Dataset({
    features: ['input_values', 'input_length', 'labels'],
    num_rows: 341
})

Long input sequences require a lot of memory. Since `Wav2Vec2` is based on `self-attention` the memory requirement scales quadratically with the input length for long input sequences.

In [51]:
max_input_length_in_sec = 9.0
min_input_length_in_sec = 1.0

# Filter the training dataset
train_dataset = train_dataset.filter(
    lambda example: example["input_length"] < max_input_length_in_sec * processor.feature_extractor.sampling_rate and
                    example["input_length"] > min_input_length_in_sec * processor.feature_extractor.sampling_rate
)

# Filter the validation dataset
valid_dataset = valid_dataset.filter(
    lambda example: example["input_length"] < max_input_length_in_sec * processor.feature_extractor.sampling_rate and
                    example["input_length"] > min_input_length_in_sec * processor.feature_extractor.sampling_rate
)

Filter:   0%|          | 0/2298 [00:00<?, ? examples/s]

Filter:   0%|          | 0/341 [00:00<?, ? examples/s]

In [52]:
train_dataset

Dataset({
    features: ['input_values', 'input_length', 'labels'],
    num_rows: 1558
})

In [53]:
valid_dataset

Dataset({
    features: ['input_values', 'input_length', 'labels'],
    num_rows: 208
})

In [54]:
for i in range(2):
    sample = valid_dataset[i]
    print("Input values: ", sample["input_values"][0:1])
    print("Input lengths: ", sample["input_length"])
    print("Labels: ",sample["labels"])
    print()

Input values:  [-0.6333456635475159]
Input lengths:  20031
Labels:  [5, 4, 31, 20, 23, 45, 6, 29]

Input values:  [-0.04003889486193657]
Input lengths:  20576
Labels:  [1, 21, 23, 5, 6, 32, 11, 38, 23]



### **Training**

**Need for a  data collabtor** <br>
wave2vec2 has a much larger input length as compared to the output length. For the input size, it is efficient to pad training batches to the longest sample in the batch (not overall sample)

In [55]:
# verification of train and valid dataset lengths
train_dataset

Dataset({
    features: ['input_values', 'input_length', 'labels'],
    num_rows: 1558
})

In [56]:
valid_dataset

Dataset({
    features: ['input_values', 'input_length', 'labels'],
    num_rows: 208
})

In [57]:
'''
import torch
from typing import Any, Dict, List, Optional, Union
from transformers import Wav2Vec2Processor

class DataCollatorCTCWithPadding:
    def __init__(
        self,
        processor: Wav2Vec2Processor,
        padding: Union[bool, str] = True,
        max_length: Optional[int] = None,
        max_length_labels: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
    ):
        self.processor = processor
        self.padding = padding
        self.max_length = max_length
        self.max_length_labels = max_length_labels
        self.pad_to_multiple_of = pad_to_multiple_of

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": [feature["input_values"]]} for feature in features]
        label_features = [{"input_ids": [feature["labels"]]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
            truncation=True,
        )

        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of,
                return_tensors="pt",
                truncation=True,
            )

        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels

        return batch
'''

'\nimport torch\nfrom typing import Any, Dict, List, Optional, Union\nfrom transformers import Wav2Vec2Processor\n\nclass DataCollatorCTCWithPadding:\n    def __init__(\n        self,\n        processor: Wav2Vec2Processor,\n        padding: Union[bool, str] = True,\n        max_length: Optional[int] = None,\n        max_length_labels: Optional[int] = None,\n        pad_to_multiple_of: Optional[int] = None,\n    ):\n        self.processor = processor\n        self.padding = padding\n        self.max_length = max_length\n        self.max_length_labels = max_length_labels\n        self.pad_to_multiple_of = pad_to_multiple_of\n\n    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:\n        input_features = [{"input_values": [feature["input_values"]]} for feature in features]\n        label_features = [{"input_ids": [feature["labels"]]} for feature in features]\n\n        batch = self.processor.pad(\n            input_features,\n     

In [58]:
# verifying the keys to be passed to the data collator
print(train_dataset[0].keys())

dict_keys(['input_values', 'input_length', 'labels'])


In [59]:
print(valid_dataset[0].keys())

dict_keys(['input_values', 'input_length', 'labels'])


In [60]:
# data collator

import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [61]:
'''
# Determine the maximum lengths
max_input_length = max(len(feature["input_values"]) for feature in train_dataset)
max_label_length = max(len(feature["labels"]) for feature in train_dataset)

# Create the data collator
data_collator = DataCollatorCTCWithPadding(
    processor=processor,
    padding=True,
    max_length=max_input_length,
    max_length_labels=max_label_length,
)
'''

'\n# Determine the maximum lengths\nmax_input_length = max(len(feature["input_values"]) for feature in train_dataset)\nmax_label_length = max(len(feature["labels"]) for feature in train_dataset)\n\n# Create the data collator\ndata_collator = DataCollatorCTCWithPadding(\n    processor=processor,\n    padding=True,\n    max_length=max_input_length,\n    max_length_labels=max_label_length,\n)\n'

In [62]:
#data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
#data_collator = DataCollatorCTCWithPadding(padding=True, max_length=max_length)
# Create an instance of the modified data collator
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [63]:
# verification
train_dataset

Dataset({
    features: ['input_values', 'input_length', 'labels'],
    num_rows: 1558
})

In [64]:
valid_dataset

Dataset({
    features: ['input_values', 'input_length', 'labels'],
    num_rows: 208
})

- Evaluation metric. During training, the model should be evaluated on the phoneme error rate. We should define a `compute_metrics` function accordingly

- Load a pretrained checkpoint. We need to load a pretrained checkpoint and configure it correctly for training.

- Define the training configuration.

After having fine-tuned the model, we will correctly evaluate it on the test data and verify that it has indeed learned to correctly transcribe speech.

In [65]:
'''
import numpy as np 

def compute_per(predictions, labels):
    # Compute PER (Phoneme Error Rate)
    return np.sum(predictions != labels) / labels.size

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    per = compute_per(predictions=pred_ids, labels=pred.label_ids)

    return {"per": per}
'''

'\nimport numpy as np \n\ndef compute_per(predictions, labels):\n    # Compute PER (Phoneme Error Rate)\n    return np.sum(predictions != labels) / labels.size\n\ndef compute_metrics(pred):\n    pred_logits = pred.predictions\n    pred_ids = np.argmax(pred_logits, axis=-1)\n\n    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id\n\n    per = compute_per(predictions=pred_ids, labels=pred.label_ids)\n\n    return {"per": per}\n'

### **Assign the model**

In [66]:
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    gradient_checkpointing=True,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

Some weights of the model checkpoint at facebook/wav2vec2-large-xlsr-53 were not used when initializing Wav2Vec2ForCTC: ['quantizer.weight_proj.bias', 'quantizer.codevectors', 'project_hid.weight', 'project_hid.bias', 'project_q.weight', 'project_q.bias', 'quantizer.weight_proj.weight']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to u

The first component of wav2vec2 has acoustic features from the raw speech signal. This portion has been pretrained sufficiently and does not need to be pretrained anymore and hence freezed.

In [68]:
# Verification of vocabulary dictionary length

import json
from transformers import Wav2Vec2CTCTokenizer

# Verify the vocab size
with open('psst_vocab_V7.json', 'r') as vocab_file:
    vocab_dict = json.load(vocab_file)
vocab_size = len(vocab_dict)

print(vocab_size)

46


In [69]:
len(processor.tokenizer)

46

In [70]:
train_dataset

Dataset({
    features: ['input_values', 'input_length', 'labels'],
    num_rows: 1558
})

In [71]:
valid_dataset

Dataset({
    features: ['input_values', 'input_length', 'labels'],
    num_rows: 208
})

### **Define the parameters that are related to model training**


To give more explanation on some of the parameters:
- `group_by_length` makes training more efficient by grouping training samples of similar input length into one batch. This can significantly speed up training time by heavily reducing the overall number of useless padding tokens that are passed through the model
- `learning_rate` and `weight_decay` were heuristically tuned until fine-tuning has become stable. Note that those parameters strongly depend on the Timit dataset and might be suboptimal for other speech datasets.

In [72]:
model.freeze_feature_encoder()

In [73]:
# clear out cuda memory
import torch
torch.cuda.empty_cache()

In [74]:
import torch

# Initialize CUDA context
torch.cuda.init()

# Check if CUDA is available and get the current device
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using CUDA device: {torch.cuda.current_device()}")
else:
    device = torch.device("cpu")
    print("CUDA is not available, falling back to CPU.")

# Allocate some GPU memory
tensor = torch.tensor([0], device=device)

# Check the current GPU memory allocated
allocated_memory = torch.cuda.memory_allocated(device)
print(f"Current GPU memory allocated: {allocated_memory / 1024 ** 2} MB")

# Check the peak GPU memory allocated
peak_allocated_memory = torch.cuda.max_memory_allocated(device)
print(f"Peak GPU memory allocated: {peak_allocated_memory / 1024 ** 2} MB")

Using CUDA device: 0
Current GPU memory allocated: 0.00048828125 MB
Peak GPU memory allocated: 0.00048828125 MB


In [75]:
! nvidia-smi

Tue Jul 11 19:16:12 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB           Off| 00000000:C1:00.0 Off |                    0 |
| N/A   33C    P0               67W / 500W|   6033MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [76]:
torch.cuda.is_available()

True

In [77]:
torch.cuda.current_device()

0

In [78]:
! nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Mon_Nov_30_19:08:53_PST_2020
Cuda compilation tools, release 11.2, V11.2.67
Build cuda_11.2.r11.2/compiler.29373293_0


In [79]:
import torch

print(torch.__version__)

2.0.1+cu117


In [80]:
# Define the training configuration

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=repo_name,
    group_by_length=True,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    evaluation_strategy="steps",
    num_train_epochs=30,
    fp16=False,
    gradient_checkpointing=True,
    save_steps=400,
    eval_steps=400,
    logging_steps=400,
    learning_rate=3e-4,
    weight_decay=0.005,
    warmup_steps=500,
    save_total_limit=2,
    push_to_hub=True,
)

In [81]:
# pass all instances to the trainer as the final step before training
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=processor.feature_extractor,
)

Cloning https://huggingface.co/monideep2255/finetuning-xlsr-53-PSST_V7 into local empty directory.


In [82]:
trainer.train()



Step,Training Loss,Validation Loss
400,9.5902,3.593055
800,2.8519,1.639793
1200,1.3002,1.305981
1600,0.7496,1.279029
2000,0.5284,1.345209
2400,0.3743,1.363931
2800,0.2814,1.469079




TrainOutput(global_step=2910, training_loss=2.163539655757524, metrics={'train_runtime': 2329.8325, 'train_samples_per_second': 20.062, 'train_steps_per_second': 1.249, 'total_flos': 4.171062962014814e+18, 'train_loss': 2.163539655757524, 'epoch': 29.85})

In [83]:
# push to trained model to huggingface
trainer.push_to_hub(repo_name)

Upload file pytorch_model.bin:   0%|          | 1.00/1.18G [00:00<?, ?B/s]

To https://user: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@huggingface.co/monideep2255/finetuning-xlsr-53-PSST_V7
   39ace47..952423a  main -> main

To https://user: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@huggingface.co/monideep2255/finetuning-xlsr-53-PSST_V7
   952423a..af05244  main -> main



'https://huggingface.co/monideep2255/finetuning-xlsr-53-PSST_V7/commit/952423a4f5e8412b0764f2333aade0cefbe9de52'

### **Generating inference and adding spaces between phoneme tokens**

As a final check, let's load the model and verify that it indeed has learned to transcribe aphasic speech.

Let's first load the pretrained checkpoint.

In [37]:
# load the model and processor
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch

model = Wav2Vec2ForCTC.from_pretrained("monideep2255/finetuning-xlsr-53-PSST_V7")
processor = Wav2Vec2Processor.from_pretrained("monideep2255/finetuning-xlsr-53-PSST_V7")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [38]:
processor.decode

<bound method Wav2Vec2Processor.decode of Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

- tokenizer: Wav2Vec2CTCTokenizer(name_or_path='monideep2255/finetuning-xlsr-53-PSST_V7', vocab_size=46, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<UNK>', 'pad_token': '<PAD>', 'additional_special_tokens': [AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True)]}, clean_up_tokenization_spaces=True)>

In [39]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (2): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (3): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elemen

### **Method 1: Generate predictions with the references (transcript column)**

In [40]:
# load the datasets and observe the structure
from datasets import load_dataset, load_metric, DatasetDict, Dataset, Audio

# Load the datasets and observe the structure
dataset_dict = load_dataset('csv', data_files={
    "test": '/work/van-speech-nlp/psst-csv/test_utterances_excel.csv',
})

# review the datasets
test_inferences = dataset_dict["test"]

print(test_inferences)

Found cached dataset csv (/home/chakraborti.m/.cache/huggingface/datasets/csv/default-c026370f45f2f2db/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['utterance_id', 'session', 'test', 'prompt', 'transcript', 'correctness', 'aq_index', 'duration_frames', 'filename_old', 'filename_new'],
    num_rows: 652
})


In [41]:
# remove columns that we do not need
test_inferences = test_inferences.remove_columns(["aq_index", "test", "duration_frames","filename_old"])

# print to verify
print(test_inferences)

Dataset({
    features: ['utterance_id', 'session', 'prompt', 'transcript', 'correctness', 'filename_new'],
    num_rows: 652
})


In [42]:
# Cast the "filename" column to Audio type for the valid dataset
test_inferences = test_inferences.cast_column("filename_new", Audio(sampling_rate=16000))

In [43]:
test_inferences

Dataset({
    features: ['utterance_id', 'session', 'prompt', 'transcript', 'correctness', 'filename_new'],
    num_rows: 652
})

In [44]:
# verification
test_inferences["filename_new"][5]

{'path': '/work/van-speech-nlp/psst-data/psst-data-2022-03-02-full/test/audio/bnt/ACWT01a/ACWT01a-BNT06-volcano.wav',
 'array': array([-0.00097656,  0.00195312,  0.01193237, ..., -0.00048828,
         0.00024414,  0.00213623]),
 'sampling_rate': 16000}

In [45]:
# prepare the dataset
def prepare_references_dataset(batch):
    # load the audio data into batch
    audio = batch["filename_new"]

    # extract the values from the audio files
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])

    # encode the transcript to the label ids
    with processor.as_target_processor():
        batch["labels"] = processor(batch["transcript"]).input_ids
    
    # remove all columns except for 'transcript'
    batch = {key: batch[key] for key in batch.keys() if key == 'transcript'}
    
    return batch

test_inferences = test_inferences.map(prepare_references_dataset, num_proc=4)

Loading cached processed dataset at /home/chakraborti.m/.cache/huggingface/datasets/csv/default-c026370f45f2f2db/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-ba09cd7b730404ac_*_of_00004.arrow


In [46]:
test_inferences

Dataset({
    features: ['utterance_id', 'session', 'prompt', 'transcript', 'correctness', 'filename_new', 'input_values', 'input_length', 'labels'],
    num_rows: 652
})

In [47]:
# Sample test data
sample_inference_data = test_inferences[:5]

In [48]:
processor.tokenizer.decoder = {24: '<???>',
 3: '<PAD>',
 2: '<SIL>',
 18: '<SPN>',
 19: '<UNK>',
 1: ' AA ',
 8: '  AE',
 6: ' AH ',
 36: ' AO ',
 33: ' AW ',
 17: ' AY ',
 20: ' B ',
 43: ' CH ',
 35: ' D ',
 42: ' DH ',
 10: ' DX ',
 7: ' EH ',
 12: ' ER ',
 44: ' EY ',
 27: ' F ',
 40: ' G ',
 9: ' HH ',
 41: ' IH ',
 14: ' IY ',
 28: ' JH ',
 21: ' K ',
 22: ' L ',
 37: ' M ',
 0: ' N ',
 25: ' NG ',
 16: ' OW ',
 15: ' OY ',
 32: ' P ',
 45: ' R ',
 38: ' S ',
 29: ' SH ',
 5: ' T ',
 31: ' TH ',
 11: ' UH ',
 4: ' UW ',
 34: ' V ',
 30: ' W ',
 39: ' Y ',
 13: ' Z ',
 26: ' ZH ',
 23: '|'}

In [49]:
import librosa
import numpy as np

# Generate predictions for each sample
for i in range(len(sample_inference_data['input_values'])):
    input_values = np.array(sample_inference_data['input_values'][i])
    sampling_rate = sample_inference_data['input_length'][i]

    # Resample the input speech to match the model's sampling rate
    input_values = librosa.resample(input_values, orig_sr=sampling_rate, target_sr=16000)

    input_values = processor(input_values, sampling_rate=16000, return_tensors="pt").input_values
    input_values = input_values.to(device)  # Move input to the same device as the model
    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    #print(predicted_ids)
    transcription = processor.decode(predicted_ids[0],clean_up_tokenization_spaces=False)

    reference_transcription = sample_inference_data['transcript'][i]

    print("Reference:", reference_transcription)
    print("Prediction:", transcription.lstrip().rstrip().replace('  ',' ').replace('\t',' '))
    print("---")

Reference: EY HH AW S
Prediction: B AW SH
---
Reference: K OW M
Prediction: K OW M
---
Reference: B R AH SH
Prediction: B  R AH SH
---
Reference: AA G T AH P UH S
Prediction: AA K  T AH P UH S
---
Reference: CH EY R
Prediction: CH IY R
---


In [50]:
processor.tokenizer.decoder

{24: '<???>',
 3: '<PAD>',
 2: '<SIL>',
 18: '<SPN>',
 19: '<UNK>',
 1: ' AA ',
 8: '  AE',
 6: ' AH ',
 36: ' AO ',
 33: ' AW ',
 17: ' AY ',
 20: ' B ',
 43: ' CH ',
 35: ' D ',
 42: ' DH ',
 10: ' DX ',
 7: ' EH ',
 12: ' ER ',
 44: ' EY ',
 27: ' F ',
 40: ' G ',
 9: ' HH ',
 41: ' IH ',
 14: ' IY ',
 28: ' JH ',
 21: ' K ',
 22: ' L ',
 37: ' M ',
 0: ' N ',
 25: ' NG ',
 16: ' OW ',
 15: ' OY ',
 32: ' P ',
 45: ' R ',
 38: ' S ',
 29: ' SH ',
 5: ' T ',
 31: ' TH ',
 11: ' UH ',
 4: ' UW ',
 34: ' V ',
 30: ' W ',
 39: ' Y ',
 13: ' Z ',
 26: ' ZH ',
 23: '|'}

In [33]:
'''
import librosa
import numpy as np

# Generate predictions for each sample
for i in range(len(sample_inference_data['input_values'])):
    input_values = np.array(sample_inference_data['input_values'][i])
    sampling_rate = sample_inference_data['input_length'][i]

    # Resample the input speech to match the model's sampling rate
    input_values = librosa.resample(input_values, orig_sr=sampling_rate, target_sr=16000)

    input_values = processor(input_values, sampling_rate=16000, return_tensors="pt").input_values
    input_values = input_values.to(device)  # Move input to the same device as the model
    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])

    reference_transcription = sample_inference_data['transcript'][i]

    print("Reference:", reference_transcription)
    print("Prediction:", transcription)
    print("---")
'''

'\nimport librosa\nimport numpy as np\n\n# Generate predictions for each sample\nfor i in range(len(sample_inference_data[\'input_values\'])):\n    input_values = np.array(sample_inference_data[\'input_values\'][i])\n    sampling_rate = sample_inference_data[\'input_length\'][i]\n\n    # Resample the input speech to match the model\'s sampling rate\n    input_values = librosa.resample(input_values, orig_sr=sampling_rate, target_sr=16000)\n\n    input_values = processor(input_values, sampling_rate=16000, return_tensors="pt").input_values\n    input_values = input_values.to(device)  # Move input to the same device as the model\n    with torch.no_grad():\n        logits = model(input_values).logits\n\n    predicted_ids = torch.argmax(logits, dim=-1)\n    transcription = processor.decode(predicted_ids[0])\n\n    reference_transcription = sample_inference_data[\'transcript\'][i]\n\n    print("Reference:", reference_transcription)\n    print("Prediction:", transcription)\n    print("---")\n'