In [6]:
import numpy as np
import torch
import librosa
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
import tensorflow as tf
import soundfile as sf
import resampy  # Ensure this is imported

import params as yamnet_params
import yamnet as yamnet_model
import features  # Import features.py

# Initialize the Params class to access model parameters
params = yamnet_params.Params()

# Load YAMNet model and processor
yamnet = yamnet_model.yamnet_frames_model(params)
yamnet.load_weights('yamnet.h5')
yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

# Load Wav2Vec2 model and processor
wav2vec2_model = Wav2Vec2ForSequenceClassification.from_pretrained('facebook/wav2vec2-base-960h', num_labels=521)
processor_wav2vec2 = Wav2Vec2Processor.from_pretrained('facebook/wav2vec2-base-960h')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
wav2vec2_model.to(device)

# Ensemble method: Averaging Probabilities from both models
def ensemble_average(yamnet_scores, wav2vec2_scores):
    """Combine predictions from YAMNet and Wav2Vec2 by averaging probabilities."""
    avg_scores = (yamnet_scores + wav2vec2_scores) / 2
    return np.argmax(avg_scores, axis=1)

def process_audio(file_name, yamnet, wav2vec2_model, processor_wav2vec2, params):
    """Load, preprocess the audio and make ensemble predictions."""
    # Load and preprocess the audio for YAMNet
    wav_data, sr = sf.read(file_name, dtype=np.int16)
    waveform = wav_data / 32768.0  # Normalize to [-1.0, +1.0]
    waveform = waveform.astype('float32')

    # Convert to mono and resample to the required sample rate
    if len(waveform.shape) > 1:
        waveform = np.mean(waveform, axis=1)
    if sr != params.sample_rate:
        waveform = resampy.resample(waveform, sr, params.sample_rate)  # Resample to 16kHz

    # Preprocess audio for Wav2Vec2 (use Wav2Vec2Processor)
    inputs_wav2vec2 = processor_wav2vec2(waveform, return_tensors="pt", sampling_rate=params.sample_rate)

    # Ensure the input tensor has a batch dimension of 1 (for a batch of size 1)
    inputs_wav2vec2 = {key: val.unsqueeze(0).to(device) for key, val in inputs_wav2vec2.items()}  # Add batch dimension

    # Get YAMNet predictions (scores)
    yamnet_scores, _, _ = yamnet(waveform)
    yamnet_scores = np.mean(yamnet_scores, axis=0)  # Average the scores over time

    # Get Wav2Vec2 predictions (logits)
    wav2vec2_outputs = wav2vec2_model(**inputs_wav2vec2)
    wav2vec2_scores = torch.nn.functional.softmax(wav2vec2_outputs.logits, dim=-1).cpu().detach().numpy()

    # Check the shape of the wav2vec2_scores
    print(f"Shape of Wav2Vec2 scores: {wav2vec2_scores.shape}")  # Should be [1, 521] for batch size 1

    # Combine the predictions using ensemble averaging
    final_predictions = ensemble_average(yamnet_scores, wav2vec2_scores)

    # Get the top 5 predictions
    top5_i = np.argsort(final_predictions)[::-1][:5]
    print(f"{file_name} predictions:\n" + 
          '\n'.join(f'  {yamnet_classes[i]:12s}: {final_predictions[i]:.3f}' for i in top5_i))

# Example usage
file_name = 'test.wav'  # Replace with your audio file
process_audio(file_name, yamnet, wav2vec2_model, processor_wav2vec2, params)


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: Expected 2D (unbatched) or 3D (batched) input to conv1d, but got input of size: [1, 1, 1, 106560]

In [3]:
import torch
print(torch.cuda.is_available())  # Should print True if CUDA is available


True


In [1]:
import glob
import os
import json
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
import torch
import librosa
from collections import namedtuple

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # Set to block CUDA errors

# Load the class map CSV (mapping from mid to index)
class_map_df = pd.read_csv('yamnet_class_map.csv')
class_map = pd.read_csv('yamnet_class_map.csv').set_index('display_name').to_dict()['mid']

# Create a mapping from mid (string) to index (integer)
mid_to_index = {mid: idx for idx, mid in enumerate(set(class_map.values()))}

# Initialize the model and processor
model = Wav2Vec2ForSequenceClassification.from_pretrained('facebook/wav2vec2-base-960h', num_labels=521)
processor = Wav2Vec2Processor.from_pretrained('facebook/wav2vec2-base-960h')

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Moved model to GPU if available")

# Define a namedtuple for dataset items
AudioSample = namedtuple("AudioSample", ["input_values", "labels"])

class AudioDataset(Dataset):
    def __init__(self, audio_directory, ontology_file, mid_to_index):
        with open(ontology_file, 'r') as f:
            self.ontology_data = json.load(f)

        self.mid_to_index = mid_to_index
        self.audio_directory = audio_directory
        self.audio_files = glob.glob(os.path.join(self.audio_directory, '**', '*.wav'), recursive=True)
        
        # Populate the dataset by calling prepare_data
        self.data = self.prepare_data()

    def prepare_data(self):
        data = []
        for category in self.ontology_data:
            if "positive_examples" in category:
                category_name = category["name"]
                mid = category["id"]  # Get the mid for the current category

                # Use the mid to get the index from the mid_to_index
                if mid in self.mid_to_index:
                    label = self.mid_to_index[mid]  # Get the integer index as the label
                else:
                    label = -1  # Default to -1 if not found

                for audio_file in self.audio_files:
                    if category_name.lower() in audio_file.lower():
                        audio_file = audio_file.replace("\\", "/")
                        data.append({"audio": audio_file, "label": label})
        return data
    
    def load_audio(self, file_path):
        """Load and preprocess audio using Wav2Vec2Processor."""
        try:
            if not os.path.isfile(file_path):
                raise FileNotFoundError(f"WAV file not found: {file_path}")

            # Load audio using librosa and resample to 16kHz
            audio_data, sr = librosa.load(file_path, sr=16000)
            print(f"Successfully loaded audio: {file_path}, shape: {audio_data.shape}, dtype: {audio_data.dtype}")

            # Preprocess using Wav2Vec2Processor
            inputs = processor(audio_data, sampling_rate=sr, return_tensors="pt", padding=True)
            processed_audio = inputs.input_values.squeeze(0)  # Remove batch dimension

            print(f"Processed audio shape: {processed_audio.shape}")

            # Return processed audio
            return processed_audio

        except Exception as e:
            print(f"Error loading audio file {file_path}: {e}")
            return None

    def __getitem__(self, idx):
        """Get one item (audio, label) for the dataset."""
        sample = self.data[idx]
        audio_data = self.load_audio(sample["audio"])
        label = sample["label"]

        # Ensure audio_data is valid
        if audio_data is None:
            print(f"Error loading audio at index {idx}, returning dummy data.")
            return {"input_values": torch.zeros(1), "labels": torch.tensor(label, dtype=torch.long)}

        # Trim or pad audio to max_length
        max_length = 160000  # Set a max_length for padding/truncating
        if audio_data.shape[0] < max_length:
            padding = torch.zeros(max_length - audio_data.shape[0])
            audio_data = torch.cat([audio_data, padding])
        else:
            audio_data = audio_data[:max_length]

        print(f"Successfully loaded audio at index {idx}, shape: {audio_data.shape}, label: {label}")
        
        return {"input_values": audio_data.clone().detach(), "labels": torch.tensor(label, dtype=torch.long)}

    def __len__(self):
        return len(self.data)

# Initialize the dataset and dataloaders
audio_directory = r"audiosets/ontology"
ontology_file = 'ontology.json'

# Initialize dataset and prepare data
dataset = AudioDataset(audio_directory, ontology_file, mid_to_index)

# Now split the dataset into train and test sets (80% train, 20% test)
train_data, test_data = train_test_split(dataset.data, test_size=0.2, random_state=42)

# Initialize train and test datasets using the split data
train_dataset = AudioDataset(audio_directory, ontology_file, mid_to_index)
test_dataset = AudioDataset(audio_directory, ontology_file, mid_to_index)

# Assign the split data to the datasets
train_dataset.data = train_data
test_dataset.data = test_data

# Define the compute_metrics function
def compute_metrics(pred):
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = (predictions == labels).mean()
    return {'accuracy': accuracy}

# Training setup
training_args = TrainingArguments(
    output_dir="./results",  # Directory where the model will be saved
    evaluation_strategy="steps",  # Save after each epoch
    save_strategy="steps",  # Save after each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=3,
    save_steps=500,
    disable_tqdm=False,
    report_to="tensorboard",
    load_best_model_at_end=True,
)

# Initialize train and test dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)

# Pass the datasets without specifying the dataloaders in Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Manually save the model after each epoch
for epoch in range(int(training_args.num_train_epochs)):
    print(f"Training epoch {epoch + 1}")
    model.train()  # Set model to training mode
    for batch in train_dataloader:
        input_values = batch['input_values'].to(device)
        labels = batch['labels'].to(device)
        
        # Forward passl
        outputs = model(input_values, labels=labels)
        loss = outputs.loss
        loss.backward()  # Backward pass to compute gradients
        
        # Clear GPU cache to prevent memory overflow
        torch.cuda.empty_cache()

    # Manually save the model after each epoch
    model.save_pretrained(f"./results/checkpoint-{epoch+1}")
    print(f"Model saved at checkpoint-{epoch+1}")
    
    # Run evaluation after each epoch (optional)
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        for batch in test_dataloader:
            input_values = batch['input_values'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_values, labels=labels)
            loss = outputs.loss
            logits = outputs.logits
            print(f"Evaluation loss: {loss.item()}")

    torch.cuda.empty_cache()  # Clear GPU cache after each epoch





Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Moved model to GPU if available




Training epoch 1
Successfully loaded audio: audiosets/ontology/Electric guitar_4.wav, shape: (2106143,), dtype: float32
Processed audio shape: torch.Size([2106143])
Successfully loaded audio at index 1324, shape: torch.Size([160000]), label: 464
Successfully loaded audio: audiosets/ontology/Howl (wind)_2.wav, shape: (160125,), dtype: float32
Processed audio shape: torch.Size([160125])
Successfully loaded audio at index 530, shape: torch.Size([160000]), label: 224
Successfully loaded audio: audiosets/ontology/Waves, surf_4.wav, shape: (159754,), dtype: float32
Processed audio shape: torch.Size([159754])
Successfully loaded audio at index 2218, shape: torch.Size([160000]), label: 75
Successfully loaded audio: audiosets/ontology/Dog_5.wav, shape: (1473074,), dtype: float32
Processed audio shape: torch.Size([1473074])
Successfully loaded audio at index 1756, shape: torch.Size([160000]), label: 332
Successfully loaded audio: audiosets/ontology/Howl_3.wav, shape: (160125,), dtype: float32
Pr

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
