In [None]:
cd transformers

In [None]:
import sys
sys.prefix

In [None]:
!git checkout tags/v4.35.2

In [None]:
!pip show transformers

In [None]:
!pip install -e .

In [None]:
cd ..

In [None]:
cd peft

In [None]:
!git checkout tags/v0.6.0

In [None]:
!pip install -e .

In [None]:
cd ..

In [None]:
cd fairseq

In [None]:
!pip install --no-cache-dir --editable ./

In [None]:
!pip show fairseq

In [None]:
cd ..

In [None]:
cd slam-llm

In [None]:
!pip show slam-llm

In [None]:
!pip install  -e .

In [None]:
pip uninstall slam-llm -y

In [None]:
!echo $PATH | grep espeak

In [None]:
import torch
import torchaudio
import json
from pathlib import Path
import torch.nn as nn

class MusicFMEncoder(nn.Module):
    """``nn.Module`` wrapper that returns one hidden layer of a MusicFM model.

    Attributes:
        config: mapping read for ``'encoder_layer_idx'`` during feature extraction.
        model: object providing ``get_predictions(source)``.
    """

    def __init__(self, config, model):
        super().__init__()
        self.model = model
        self.config = config

    @classmethod
    def load(cls, model_config):
        """Build a ``MusicFM25Hz`` from ``model_config`` and wrap it.

        Reads ``'encoder_stat_path'`` and ``'encoder_path'`` (both required) and
        ``'encoder_config_path'`` (optional, with a wav2vec2-conformer default).
        """
        # Function-scope import: musicfm is only needed when load() is called.
        from musicfm.model.musicfm_25hz import MusicFM25Hz

        stat_path = model_config['encoder_stat_path']
        model_path = model_config['encoder_path']
        w2v2_config_path = model_config.get(
            'encoder_config_path',
            "facebook/wav2vec2-conformer-rope-large-960h-ft",
        )
        backbone = MusicFM25Hz(
            stat_path=stat_path,
            model_path=model_path,
            w2v2_config_path=w2v2_config_path,
        )
        return cls(model_config, backbone)

    def extract_features(self, source, padding_mask=None):
        """Return the hidden state selected by ``config['encoder_layer_idx']``.

        ``padding_mask`` is accepted but not used.
        """
        _predictions, per_layer_states = self.model.get_predictions(source)
        layer_idx = self.config['encoder_layer_idx']
        return per_layer_states[layer_idx]

def load_wav_file(file_path):
    """Read ``file_path`` with ``torchaudio.load`` and return ``(waveform, sample_rate)``."""
    audio, rate = torchaudio.load(file_path)
    return audio, rate

def main():
    """Extract MusicFM features from one hard-coded WAV file and print them."""
    # Example configuration; the two paths are placeholders to be replaced.
    model_config = dict(
        encoder_stat_path='path/to/encoder_stat',
        encoder_path='path/to/encoder_model',
        encoder_layer_idx=12,
    )
    wav_file_path = "/work/van-speech-nlp/data/torgo/F01/Session1/wav_arrayMic/0009.wav"

    # Build the encoder first, then read the audio.
    encoder = MusicFMEncoder.load(model_config)
    waveform, sample_rate = load_wav_file(wav_file_path)

    # A 2-D waveform gets a leading batch axis; other ranks are passed unchanged.
    is_unbatched = waveform.dim() == 2
    if is_unbatched:
        waveform = torch.unsqueeze(waveform, 0)

    features = encoder.extract_features(waveform)
    print("Extracted features:", features)

# Run the demo only when this code executes as the top-level module.
if __name__ == "__main__":
    main()

In [None]:
import librosa
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

# Load one clip as a waveform at a 16 kHz sampling rate.
input_audio, sample_rate = librosa.load("/work/van-speech-nlp/data/torgo/F01/Session1/wav_arrayMic/0006.wav",  sr=16000)

# The same checkpoint id supplies both the feature extractor and the model.
model_name = "vitouphy/wav2vec2-xls-r-300m-timit-phoneme"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name)

# i: feature-extractor output as PyTorch tensors; o: the model's output object.
i= feature_extractor(input_audio, return_tensors="pt", sampling_rate=sample_rate)
# Forward pass only, so autograd is disabled.
with torch.no_grad():
  o= model(i.input_values)
# List the fields of the output, then the shapes of two of them.
print(o.keys())
print(o.last_hidden_state.shape)
print(o.extract_features.shape)

In [17]:
import torch
import torch.nn as nn
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
import librosa
import numpy as np

class Wav2PhonemeEncoder(nn.Module):
    """Wrap a Hugging Face ``Wav2Vec2Model`` and expose its last hidden state.

    Attributes:
        config: the ``model_config`` object passed to the constructor / ``load``.
        model: callable whose output has a ``last_hidden_state`` attribute.
        feature_extractor: the ``Wav2Vec2FeatureExtractor`` loaded alongside the
            model; it is stored but not applied by ``extract_features``.
    """

    # Checkpoint used when ``model_config`` does not name one.
    DEFAULT_MODEL_NAME = "vitouphy/wav2vec2-xls-r-300m-timit-phoneme"

    def __init__(self, config, model, feature_extractor):
        super().__init__()
        self.config = config
        self.model = model
        self.feature_extractor = feature_extractor

    @classmethod
    def load(cls, model_config):
        """Load the pretrained model and feature extractor and wrap them.

        Args:
            model_config: optional mapping. If it has a ``get`` method and a
                truthy ``'encoder_path'`` entry, that value is used as the
                checkpoint id/path; otherwise ``DEFAULT_MODEL_NAME`` is used.
                Objects without ``get`` (including ``None``) are accepted and
                fall back to the default.

        Returns:
            A new encoder instance holding ``model_config``, the model and the
            feature extractor.
        """
        # Tolerate configs that are not mappings, since the checkpoint is optional.
        getter = getattr(model_config, "get", None)
        configured = getter("encoder_path") if callable(getter) else None
        model_name = configured or cls.DEFAULT_MODEL_NAME

        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
        model = Wav2Vec2Model.from_pretrained(model_name)
        return cls(model_config, model, feature_extractor)

    def extract_features(self, audio_input):
        """Run ``audio_input`` through the model and return ``last_hidden_state``.

        The input is passed to the model as-is; ``self.feature_extractor`` is
        not applied here.
        """
        outputs = self.model(audio_input)
        return outputs.last_hidden_state


# Configuration object handed to Wav2PhonemeEncoder.load (empty here).
model_config = {}

# Build the encoder; load() obtains the pretrained weights via from_pretrained.
encoder = Wav2PhonemeEncoder.load(model_config)

# Paths to test audio files
audio_files = [
    "/work/van-speech-nlp/data/torgo/F01/Session1/wav_arrayMic/0006.wav",
    "/work/van-speech-nlp/data/torgo/F01/Session1/wav_arrayMic/0007.wav",
    "/work/van-speech-nlp/data/torgo/F01/Session1/wav_arrayMic/0008.wav",
    "/work/van-speech-nlp/data/torgo/F01/Session1/wav_arrayMic/0009.wav"
]

# Number of samples every clip is padded or cut to by load_and_preprocess_audio;
# at a 16 kHz sampling rate, 31200 samples is 1.95 s.
target_length = 31200

# Function to load and preprocess audio files
def load_and_preprocess_audio(file_paths, target_length, sample_rate=16000):
    """Load audio files and force each one to exactly ``target_length`` samples.

    Every file is read with ``librosa.load`` at ``sample_rate`` Hz. Clips
    shorter than ``target_length`` are zero-padded at the end; longer clips are
    cut to their first ``target_length`` samples.

    Args:
        file_paths: iterable of audio file paths.
        target_length: number of samples each returned clip must have.
        sample_rate: sampling rate requested from ``librosa.load``
            (default 16000, the value previously hard-coded).

    Returns:
        ``np.ndarray`` with one row per file and ``target_length`` columns
        (assuming ``librosa.load`` returns 1-D mono audio, its default).
        An empty ``file_paths`` yields a float32 array of shape
        ``(0, target_length)`` so downstream code still sees a 2-D batch.
    """
    clips = []
    for path in file_paths:
        audio, _ = librosa.load(path, sr=sample_rate)
        audio = audio[:target_length]
        shortfall = target_length - len(audio)
        if shortfall > 0:
            audio = np.pad(audio, (0, shortfall))
        clips.append(audio)

    if not clips:
        # np.array([]) would be 1-D with shape (0,); keep the batch 2-D.
        return np.zeros((0, target_length), dtype=np.float32)
    return np.array(clips)

audio_inputs = load_and_preprocess_audio(audio_files, target_length)

# Convert audio inputs to a PyTorch tensor before reporting its shape: printing
# first raises NameError on a fresh kernel because the tensor does not exist yet.
audio_inputs_tensor = torch.tensor(audio_inputs, dtype=torch.float32)
print(f"Audio input shape: {audio_inputs_tensor.shape}")

# Extract features. This is inference only, so autograd is disabled for the
# forward pass (as in the other inference cells of this notebook).
with torch.no_grad():
    features = encoder.extract_features(audio_inputs_tensor)

# Print shapes
print(f"Extracted features shape: {features.shape}")

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at vitouphy/wav2vec2-xls-r-300m-timit-phoneme and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Audio input shape: torch.Size([4, 31200])
Extracted features shape: torch.Size([4, 97, 1024])


In [4]:
import torch
import torch.nn as nn
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
import librosa
import numpy as np

class Wav2PhonemeEncoder(nn.Module):
    """Wrap a Hugging Face ``Wav2Vec2Model`` and expose its last hidden state.

    Attributes:
        config: the ``model_config`` object passed to the constructor / ``load``.
        model: callable whose output has a ``last_hidden_state`` attribute.
        feature_extractor: the ``Wav2Vec2FeatureExtractor`` loaded alongside the
            model; it is stored but not applied by ``extract_features``.
    """

    # Checkpoint used when ``model_config`` does not name one.
    DEFAULT_MODEL_NAME = "jonatasgrosman/wav2vec2-large-xlsr-53-english"

    def __init__(self, config, model, feature_extractor):
        super().__init__()
        self.config = config
        self.model = model
        self.feature_extractor = feature_extractor

    @classmethod
    def load(cls, model_config):
        """Load the pretrained model and feature extractor and wrap them.

        Args:
            model_config: optional mapping. If it has a ``get`` method and a
                truthy ``'encoder_path'`` entry, that value is used as the
                checkpoint id/path; otherwise ``DEFAULT_MODEL_NAME`` is used.
                Objects without ``get`` (including ``None``) are accepted and
                fall back to the default.

        Returns:
            A new encoder instance holding ``model_config``, the model and the
            feature extractor.
        """
        # Tolerate configs that are not mappings, since the checkpoint is optional.
        getter = getattr(model_config, "get", None)
        configured = getter("encoder_path") if callable(getter) else None
        model_name = configured or cls.DEFAULT_MODEL_NAME

        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
        model = Wav2Vec2Model.from_pretrained(model_name)
        return cls(model_config, model, feature_extractor)

    def extract_features(self, audio_input):
        """Run ``audio_input`` through the model and return ``last_hidden_state``.

        The input is passed to the model as-is; ``self.feature_extractor`` is
        not applied here.
        """
        outputs = self.model(audio_input)
        return outputs.last_hidden_state


# Configuration object handed to Wav2PhonemeEncoder.load (empty here).
model_config = {}

# Build the encoder; load() obtains the pretrained weights via from_pretrained.
encoder = Wav2PhonemeEncoder.load(model_config)

# Paths to test audio files
audio_files = [
    "/work/van-speech-nlp/data/torgo/F01/Session1/wav_arrayMic/0006.wav",
    "/work/van-speech-nlp/data/torgo/F01/Session1/wav_arrayMic/0007.wav",
    "/work/van-speech-nlp/data/torgo/F01/Session1/wav_arrayMic/0008.wav",
    "/work/van-speech-nlp/data/torgo/F01/Session1/wav_arrayMic/0009.wav"
]

# Number of samples every clip is padded or cut to by load_and_preprocess_audio;
# at a 16 kHz sampling rate, 31200 samples is 1.95 s.
target_length = 31200

# Function to load and preprocess audio files
def load_and_preprocess_audio(file_paths, target_length, sample_rate=16000):
    """Load audio files and force each one to exactly ``target_length`` samples.

    Every file is read with ``librosa.load`` at ``sample_rate`` Hz. Clips
    shorter than ``target_length`` are zero-padded at the end; longer clips are
    cut to their first ``target_length`` samples.

    Args:
        file_paths: iterable of audio file paths.
        target_length: number of samples each returned clip must have.
        sample_rate: sampling rate requested from ``librosa.load``
            (default 16000, the value previously hard-coded).

    Returns:
        ``np.ndarray`` with one row per file and ``target_length`` columns
        (assuming ``librosa.load`` returns 1-D mono audio, its default).
        An empty ``file_paths`` yields a float32 array of shape
        ``(0, target_length)`` so downstream code still sees a 2-D batch.
    """
    clips = []
    for path in file_paths:
        audio, _ = librosa.load(path, sr=sample_rate)
        audio = audio[:target_length]
        shortfall = target_length - len(audio)
        if shortfall > 0:
            audio = np.pad(audio, (0, shortfall))
        clips.append(audio)

    if not clips:
        # np.array([]) would be 1-D with shape (0,); keep the batch 2-D.
        return np.zeros((0, target_length), dtype=np.float32)
    return np.array(clips)

audio_inputs = load_and_preprocess_audio(audio_files, target_length)


# Convert audio inputs to PyTorch tensor
audio_inputs_tensor = torch.tensor(audio_inputs, dtype=torch.float32)
print(f"Audio input shape: {audio_inputs_tensor.shape}")

# Extract features. This is inference only, so autograd is disabled for the
# forward pass (as in the other inference cells of this notebook).
with torch.no_grad():
    features = encoder.extract_features(audio_inputs_tensor)

# Print shapes
print(f"Extracted features shape: {features.shape}")

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Audio input shape: torch.Size([4, 31200])
Extracted features shape: torch.Size([4, 97, 1024])


# https://huggingface.co/facebook/wav2vec2-base-960h

In [9]:
import torch
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Checkpoint id shared by the processor and the CTC model.
MODEL_ID = "facebook/wav2vec2-large-960h-lv60-self"
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
# NOTE: rebinds the module-level name `model`, which an earlier cell also assigns.
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

# Path to the single audio file you want to transcribe
audio_file_path = "/work/van-speech-nlp/data/torgo/F01/Session1/wav_arrayMic/0006.wav"

# Load and preprocess the audio file
def speech_file_to_array_fn(audio_file_path):
    """Load ``audio_file_path`` with librosa at 16 kHz and return only the samples."""
    samples, _rate = librosa.load(audio_file_path, sr=16_000)
    return samples

# Read the clip, then let the processor turn it into padded PyTorch model inputs.
speech_array = speech_file_to_array_fn(audio_file_path)
inputs = processor(speech_array, sampling_rate=16_000, return_tensors="pt", padding=True)

# Perform inference with autograd disabled; only the logits are kept.
with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

# Greedy decoding: take the highest-scoring index along the last axis of the
# logits, then decode the ids to text and keep the first item of the batch.
predicted_ids = torch.argmax(logits, dim=-1)
predicted_sentence = processor.batch_decode(predicted_ids)[0]

# Print the prediction under a 100-character separator line.
print("-" * 100)
print("Prediction:", predicted_sentence)

preprocessor_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/162 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-large-960h-lv60-self were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.maske

----------------------------------------------------------------------------------------------------
Prediction: AN THEATR
