In [7]:
import os
import librosa
import numpy as np
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torch

In [8]:
# Load the pretrained Wav2Vec 2.0 processor and model. Both are built from
# the same checkpoint, so its identifier is written once and shared.
checkpoint_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(checkpoint_name)
model = Wav2Vec2Model.from_pretrained(checkpoint_name)

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Base directory where the .wav files are located (one sub-folder per accent).
# Built with os.path.join rather than a literal backslash: 'GLOBE\wavfiles'
# contains the invalid escape sequence '\w' and only forms a nested path on
# Windows. The joined path is identical on Windows and also valid elsewhere.
input_directory = os.path.join('GLOBE', 'wavfiles')

# Output directory for the embeddings; created up front so later cells can
# write into it without checking.
output_directory = os.path.join('GLOBE', 'embeddings')
os.makedirs(output_directory, exist_ok=True)

In [4]:
# Find the longest clip in the dataset, measured in samples after resampling
# to 16 kHz. Only .wav files directly inside each accent sub-folder count.
max_length = 0
for accent_folder in os.listdir(input_directory):
    accent_path = os.path.join(input_directory, accent_folder)
    if not os.path.isdir(accent_path):
        continue  # ignore stray files sitting next to the accent folders
    for audio_file in os.listdir(accent_path):
        if not audio_file.endswith('.wav'):
            continue
        input_path = os.path.join(accent_path, audio_file)
        # Every file is fully decoded and resampled just to measure its length.
        y, _ = librosa.load(input_path, sr=16000)
        if len(y) > max_length:
            max_length = len(y)

KeyboardInterrupt: 

In [11]:
# Fixed clip length, in samples at 16 kHz (about 13 s), that the embedding
# cell pads or truncates every waveform to.
# NOTE(review): presumably the result of a completed run of the length scan,
# hard-coded here to avoid repeating it -- confirm the value is still current.
max_length = 208_002

In [12]:
# Extract Wav2Vec 2.0 embeddings for every .wav file, one output folder per
# accent.
#
#   reads : input_directory/<accent>/<name>.wav
#   writes: output_directory/<accent>/<name>.npy
#
# Relies on `processor`, `model`, `max_length`, `input_directory` and
# `output_directory` being defined by earlier cells.

TARGET_SAMPLE_RATE = 16000  # every clip is resampled to this rate on load


def _fit_to_length(samples, target_length):
    """Return `samples` zero-padded at the end, or truncated, to `target_length` items."""
    if len(samples) < target_length:
        return np.pad(samples, (0, target_length - len(samples)), 'constant')
    return samples[:target_length]


def _extract_features(samples):
    """Run one 1-D waveform through `processor` and `model`.

    Returns the model's `last_hidden_state` as a NumPy array with the leading
    batch axis removed.
    """
    # Passing the 1-D array directly lets the processor add the batch axis
    # itself; wrapping it in a [1, N] tensor first produced an extra dimension
    # that then had to be squeezed away by hand.
    inputs = processor(samples, sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt")
    with torch.no_grad():
        hidden_states = model(inputs.input_values).last_hidden_state
    # squeeze(0) removes only the batch axis; a bare squeeze() would also
    # collapse any other axis that happened to have size 1.
    return hidden_states.squeeze(0).cpu().numpy()


# sorted() makes the processing order deterministic, so an interrupted run
# always stops at a predictable place.
for accent_folder in sorted(os.listdir(input_directory)):
    accent_path = os.path.join(input_directory, accent_folder)
    if not os.path.isdir(accent_path):
        continue
    print(f"Processing accent folder: {accent_folder}")

    # Sanitize the accent folder name to replace characters illegal in paths
    sanitized_accent_folder = accent_folder.replace("/", "-")

    # Create the output directory for this accent if it doesn't exist
    accent_output_dir = os.path.join(output_directory, sanitized_accent_folder)
    os.makedirs(accent_output_dir, exist_ok=True)

    for audio_file in sorted(os.listdir(accent_path)):
        if not audio_file.endswith('.wav'):
            continue
        input_path = os.path.join(accent_path, audio_file)
        # splitext swaps only the trailing extension; str.replace would also
        # rewrite any '.wav' appearing earlier in the file name.
        output_name = os.path.splitext(audio_file)[0] + '.npy'
        output_path = os.path.join(accent_output_dir, output_name)

        try:
            # Load the audio, resampling to 16 kHz
            waveform, _sample_rate = librosa.load(input_path, sr=TARGET_SAMPLE_RATE)

            # Force every clip to the same length so all saved arrays share
            # one shape.
            # NOTE(review): the padding zeros are part of what the processor
            # sees, so they may influence any normalization it applies --
            # confirm this is intended.
            waveform = _fit_to_length(waveform, max_length)

            features_np = _extract_features(waveform)
            np.save(output_path, features_np)

            print(f"Processed {audio_file} and saved features to {output_path}")

        except Exception as e:
            # Best effort: report the failing file and carry on with the rest.
            print(f"Error processing {audio_file}: {e}")

Processing accent folder: AGerman_English,Non_native_speaker
Processed audio_train-00001-of-00108_3813.wav and saved features to GLOBE\embeddings\AGerman_English,Non_native_speaker\audio_train-00001-of-00108_3813.npy
Processed audio_train-00001-of-00108_3814.wav and saved features to GLOBE\embeddings\AGerman_English,Non_native_speaker\audio_train-00001-of-00108_3814.npy
Processed audio_train-00001-of-00108_3815.wav and saved features to GLOBE\embeddings\AGerman_English,Non_native_speaker\audio_train-00001-of-00108_3815.npy
Processed audio_train-00001-of-00108_3816.wav and saved features to GLOBE\embeddings\AGerman_English,Non_native_speaker\audio_train-00001-of-00108_3816.npy
Processed audio_train-00001-of-00108_3817.wav and saved features to GLOBE\embeddings\AGerman_English,Non_native_speaker\audio_train-00001-of-00108_3817.npy
Processed audio_train-00001-of-00108_3818.wav and saved features to GLOBE\embeddings\AGerman_English,Non_native_speaker\audio_train-00001-of-00108_3818.npy
Pro

KeyboardInterrupt: 