In [1]:
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
import torchaudio
import numpy as np
import os
import shutil

In [2]:
# Load the pre-trained acoustic model and its matching input preprocessor.
# Both are pulled from the same checkpoint, so the identifier is named once.
checkpoint = "facebook/wav2vec2-base"
model = Wav2Vec2ForCTC.from_pretrained(checkpoint)
tokenizer = Wav2Vec2Tokenizer.from_pretrained(checkpoint)

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForCTC: ['project_q.weight', 'project_hid.weight', 'project_q.bias', 'project_hid.bias', 'quantizer.weight_proj.bias', 'quantizer.weight_proj.weight', 'quantizer.codevectors']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

In [10]:
# Root directory of the audio corpus to process. Despite the variable name, the
# default path points at the Indic TIMIT "Stress" data, not LibriSpeech.
# The location is machine-specific, so it can be overridden by setting the
# INDIC_TIMIT_STRESS_DIR environment variable before starting the kernel;
# when the variable is unset the original path is used unchanged.
librispeech_dir = os.environ.get(
    "INDIC_TIMIT_STRESS_DIR",
    "/home/sksystem/Downloads/Lab Work - 22-23/all_phonemes_audio_data_Indic_TIMIT/Stress",
)

# Output directory for the embeddings; note that it is nested inside the corpus directory.
embeddings_dir = os.path.join(librispeech_dir, "Indic_TIMIT_Stress_embeddings")

In [11]:
# Create the embeddings directory, including any missing parent directories.
# exist_ok=True makes this a no-op when the directory is already there, so the
# cell is safe to re-run and has no check-then-create race.
os.makedirs(embeddings_dir, exist_ok=True)

In [12]:
# Walk the corpus directory tree and, for every .wav file found, run it through
# the model and save the last hidden-state layer as a text matrix. The output
# tree under embeddings_dir mirrors the directory layout of the corpus, and each
# output file name carries the embedding shape as a suffix.
TARGET_SAMPLE_RATE = 16000  # wav2vec 2.0 checkpoints are pre-trained on 16 kHz audio
_output_root = os.path.abspath(embeddings_dir)

for root, dirs, files in os.walk(librispeech_dir):
    # embeddings_dir is nested inside the corpus directory: prune it in place so
    # os.walk does not descend into our own output on re-runs.
    dirs[:] = [d for d in dirs if os.path.abspath(os.path.join(root, d)) != _output_root]

    for file in files:
        if not file.endswith(".wav"):
            continue

        # Load the waveform; torchaudio returns a (channels, samples) tensor
        # together with the file's sample rate.
        file_path = os.path.join(root, file)
        input_audio, sample_rate = torchaudio.load(file_path)

        # Down-mix to a single channel. For mono files this leaves the samples
        # unchanged; for multi-channel files it avoids feeding the channels to
        # the model as if they were a batch of separate utterances.
        input_audio = input_audio.mean(dim=0)

        # The model expects 16 kHz input, so resample anything recorded at a
        # different rate instead of silently feeding it mismatched audio.
        if sample_rate != TARGET_SAMPLE_RATE:
            input_audio = torchaudio.functional.resample(
                input_audio, orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE
            )

        input_audio = input_audio.numpy().tolist()

        # Convert the raw samples into model input values (batch of one)
        input_ids = tokenizer(input_audio, return_tensors="pt").input_values

        # Inference only: skip building the autograd graph for every file
        with torch.no_grad():
            outputs = model(input_ids, output_hidden_states=True)
        hidden_states = outputs.hidden_states

        # Drop only the batch dimension, so that even a one-frame clip yields a
        # 2-D (frames, features) array that np.savetxt can write row by row.
        embeddings = hidden_states[-1].squeeze(0).numpy()

        # Print embeddings shape
        print(embeddings.shape)

        # Mirror the file's location relative to the corpus root under embeddings_dir
        embeddings_subdir = os.path.join(embeddings_dir, os.path.relpath(root, librispeech_dir))
        os.makedirs(embeddings_subdir, exist_ok=True)

        # Write embeddings to "<original stem>_<rows>_<cols>.txt"
        embeddings_shape = '_'.join(str(x) for x in embeddings.shape)
        embeddings_file = os.path.join(embeddings_subdir, os.path.splitext(file)[0] + f"_{embeddings_shape}.txt")
        np.savetxt(embeddings_file, embeddings)

(870, 768)
(728, 768)
(387, 768)
(517, 768)
(917, 768)
(575, 768)
(987, 768)
(576, 768)
(446, 768)
(446, 768)
(317, 768)
(540, 768)
(752, 768)
(905, 768)
(576, 768)
(705, 768)
(740, 768)
(470, 768)
(623, 768)
(693, 768)
(764, 768)
(540, 768)
(834, 768)
(540, 768)
(623, 768)
(552, 768)
(446, 768)
(611, 768)
(728, 768)
(787, 768)
(528, 768)
(575, 768)
(470, 768)
(752, 768)
(693, 768)
(399, 768)
(740, 768)
(552, 768)
(634, 768)
(505, 768)
(528, 768)
(623, 768)
(540, 768)
(670, 768)
(493, 768)
(693, 768)
(552, 768)
(587, 768)
(646, 768)
(481, 768)
(446, 768)
(587, 768)
(834, 768)
(705, 768)
(870, 768)
(611, 768)
(505, 768)
(681, 768)
(458, 768)
(681, 768)
(940, 768)
(775, 768)
(858, 768)
(623, 768)
(540, 768)
(481, 768)
(646, 768)
(423, 768)
(764, 768)
(705, 768)
(823, 768)
(764, 768)
(987, 768)
(623, 768)
(693, 768)
(552, 768)
(411, 768)
(470, 768)
(1034, 768)
(446, 768)
(728, 768)
(799, 768)
(352, 768)
(552, 768)
(387, 768)
(611, 768)
(493, 768)
(717, 768)
(870, 768)
(470, 768)
(587, 768