In [14]:
import sys
import numpy as np
import resampy
import soundfile as sf
import tensorflow as tf

import params as yamnet_params
import yamnet as yamnet_model
import features  # Import features.py

# Load YAMNet model and weights
params = yamnet_params.Params()
yamnet = yamnet_model.yamnet_frames_model(params)
yamnet.load_weights('yamnet.h5')
yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

def process_audio(file_name, params, yamnet):
    # Load and process the audio file
    wav_data, sr = sf.read(file_name, dtype=np.int16)
    waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
    waveform = waveform.astype('float32')

    # Convert to mono and resample to the required sample rate
    if len(waveform.shape) > 1:
        waveform = np.mean(waveform, axis=1)
    if sr != params.sample_rate:
        waveform = resampy.resample(waveform, sr, params.sample_rate)

    # Pad waveform to get an integral number of patches
    padded_waveform = features.pad_waveform(waveform, params)

    # Convert the waveform to log mel spectrogram patches
    log_mel_spectrogram, feature_patches = features.waveform_to_log_mel_spectrogram_patches(padded_waveform, params)

    # Make prediction using YAMNet
    scores, embeddings, spectrogram = yamnet(waveform)
    prediction = np.mean(scores, axis=0)  # Average scores over time

    return prediction

# Example usage
file_name = 'test.wav'  # Replace with your audio file
prediction = process_audio(file_name, params, yamnet)

# Get top 5 predictions
top5_i = np.argsort(prediction)[::-1][:5]
print(f"{file_name}:\n" + 
      '\n'.join(f'  {yamnet_classes[i]:12s}: {prediction[i]:.3f}' for i in top5_i))


test.wav:
  Crying, sobbing: 0.150
  Speech      : 0.135
  Whimper     : 0.113
  Baby cry, infant cry: 0.091
  Animal      : 0.074


In [2]:
import tensorflow as tf
import resampy
import numpy as np

def preprocess_audio(audio_path, params):
    # Load audio file and resample to 16kHz
    waveform, sample_rate = tf.audio.decode_wav(tf.io.read_file(audio_path))
    waveform = tf.squeeze(waveform, axis=-1)
    waveform = resampy.resample(waveform.numpy(), sample_rate, params.sample_rate)

    # Convert the waveform to log mel spectrogram
    spectrogram, features = waveform_to_log_mel_spectrogram_patches(waveform, params)
    return features, spectrogram

def waveform_to_log_mel_spectrogram_patches(waveform, params):
  """Compute log mel spectrogram patches of a 1-D waveform."""
  with tf.name_scope('log_mel_features'):
    # waveform has shape [<# samples>]

    # Convert waveform into spectrogram using a Short-Time Fourier Transform.
    # Note that tf.signal.stft() uses a periodic Hann window by default.
    window_length_samples = int(
      round(params.sample_rate * params.stft_window_seconds))
    hop_length_samples = int(
      round(params.sample_rate * params.stft_hop_seconds))
    fft_length = 2 ** int(np.ceil(np.log(window_length_samples) / np.log(2.0)))
    num_spectrogram_bins = fft_length // 2 + 1
    if params.tflite_compatible:
      magnitude_spectrogram = _tflite_stft_magnitude(
          signal=waveform,
          frame_length=window_length_samples,
          frame_step=hop_length_samples,
          fft_length=fft_length)
    else:
      magnitude_spectrogram = tf.abs(tf.signal.stft(
          signals=waveform,
          frame_length=window_length_samples,
          frame_step=hop_length_samples,
          fft_length=fft_length))
    # magnitude_spectrogram has shape [<# STFT frames>, num_spectrogram_bins]

    # Convert spectrogram into log mel spectrogram.
    linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=params.mel_bands,
        num_spectrogram_bins=num_spectrogram_bins,
        sample_rate=params.sample_rate,
        lower_edge_hertz=params.mel_min_hz,
        upper_edge_hertz=params.mel_max_hz)
    mel_spectrogram = tf.matmul(
      magnitude_spectrogram, linear_to_mel_weight_matrix)
    log_mel_spectrogram = tf.math.log(mel_spectrogram + params.log_offset)
    # log_mel_spectrogram has shape [<# STFT frames>, params.mel_bands]

    # Frame spectrogram (shape [<# STFT frames>, params.mel_bands]) into patches
    # (the input examples). Only complete frames are emitted, so if there is
    # less than params.patch_window_seconds of waveform then nothing is emitted
    # (to avoid this, zero-pad before processing).
    spectrogram_hop_length_samples = int(
      round(params.sample_rate * params.stft_hop_seconds))
    spectrogram_sample_rate = params.sample_rate / spectrogram_hop_length_samples
    patch_window_length_samples = int(
      round(spectrogram_sample_rate * params.patch_window_seconds))
    patch_hop_length_samples = int(
      round(spectrogram_sample_rate * params.patch_hop_seconds))
    features = tf.signal.frame(
        signal=log_mel_spectrogram,
        frame_length=patch_window_length_samples,
        frame_step=patch_hop_length_samples,
        axis=0)
    # features has shape [<# patches>, <# STFT frames in an patch>, params.mel_bands]

    return log_mel_spectrogram, features


def pad_waveform(waveform, params):
  """Pads waveform with silence if needed to get an integral number of patches."""
  # In order to produce one patch of log mel spectrogram input to YAMNet, we
  # need at least one patch window length of waveform plus enough extra samples
  # to complete the final STFT analysis window.
  min_waveform_seconds = (
      params.patch_window_seconds +
      params.stft_window_seconds - params.stft_hop_seconds)
  min_num_samples = tf.cast(min_waveform_seconds * params.sample_rate, tf.int32)
  num_samples = tf.shape(waveform)[0]
  num_padding_samples = tf.maximum(0, min_num_samples - num_samples)

  # In addition, there might be enough waveform for one or more additional
  # patches formed by hopping forward. If there are more samples than one patch,
  # round up to an integral number of hops.
  num_samples = tf.maximum(num_samples, min_num_samples)
  num_samples_after_first_patch = num_samples - min_num_samples
  hop_samples = tf.cast(params.patch_hop_seconds * params.sample_rate, tf.int32)
  num_hops_after_first_patch = tf.cast(tf.math.ceil(
          tf.cast(num_samples_after_first_patch, tf.float32) /
          tf.cast(hop_samples, tf.float32)), tf.int32)
  num_padding_samples += (
      hop_samples * num_hops_after_first_patch - num_samples_after_first_patch)

  padded_waveform = tf.pad(waveform, [[0, num_padding_samples]],
                           mode='CONSTANT', constant_values=0.0)
  return padded_waveform


def _tflite_stft_magnitude(signal, frame_length, frame_step, fft_length):
  """TF-Lite-compatible version of tf.abs(tf.signal.stft())."""
  def _hann_window():
    return tf.reshape(
      tf.constant(
          (0.5 - 0.5 * np.cos(2 * np.pi * np.arange(0, 1.0, 1.0 / frame_length))
          ).astype(np.float32),
          name='hann_window'), [1, frame_length])

  def _dft_matrix(dft_length):
    """Calculate the full DFT matrix in NumPy."""
    # See https://en.wikipedia.org/wiki/DFT_matrix
    omega = (0 + 1j) * 2.0 * np.pi / float(dft_length)
    # Don't include 1/sqrt(N) scaling, tf.signal.rfft doesn't apply it.
    return np.exp(omega * np.outer(np.arange(dft_length), np.arange(dft_length)))

  def _rdft(framed_signal, fft_length):
    """Implement real-input Discrete Fourier Transform by matmul."""
    # We are right-multiplying by the DFT matrix, and we are keeping only the
    # first half ("positive frequencies").  So discard the second half of rows,
    # but transpose the array for right-multiplication.  The DFT matrix is
    # symmetric, so we could have done it more directly, but this reflects our
    # intention better.
    complex_dft_matrix_kept_values = _dft_matrix(fft_length)[:(
        fft_length // 2 + 1), :].transpose()
    real_dft_matrix = tf.constant(
        np.real(complex_dft_matrix_kept_values).astype(np.float32),
        name='real_dft_matrix')
    imag_dft_matrix = tf.constant(
        np.imag(complex_dft_matrix_kept_values).astype(np.float32),
        name='imaginary_dft_matrix')
    signal_frame_length = tf.shape(framed_signal)[-1]
    half_pad = (fft_length - signal_frame_length) // 2
    padded_frames = tf.pad(
        framed_signal,
        [
            # Don't add any padding in the frame dimension.
            [0, 0],
            # Pad before and after the signal within each frame.
            [half_pad, fft_length - signal_frame_length - half_pad]
        ],
        mode='CONSTANT',
        constant_values=0.0)
    real_stft = tf.matmul(padded_frames, real_dft_matrix)
    imag_stft = tf.matmul(padded_frames, imag_dft_matrix)
    return real_stft, imag_stft

  def _complex_abs(real, imag):
    return tf.sqrt(tf.add(real * real, imag * imag))

  framed_signal = tf.signal.frame(signal, frame_length, frame_step)
  windowed_signal = framed_signal * _hann_window()
  real_stft, imag_stft = _rdft(windowed_signal, fft_length)
  stft_magnitude = _complex_abs(real_stft, imag_stft)
  return stft_magnitude



In [5]:
import tensorflow as tf

# Use MobileNetV2 as the base model (similar to MobileNetV1)
base_model = tf.keras.applications.MobileNetV2(
    include_top=False, weights='imagenet', input_shape=(None, None, 3))

# Freeze layers if you're fine-tuning
base_model.trainable = False

num_classes = 521  # Adjust this to match the number of classes in your dataset

# Add a new classification head (modify the number of output classes)
model = tf.keras.Sequential([
    base_model,
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])




In [6]:
# Add a new classification head (modify the number of output classes)
model = tf.keras.Sequential([
    base_model,
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Now you can compile and train the model
model.compile(optimizer=tf.keras.optimizers.Adam(), 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

# Example: model summary
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 mobilenetv2_1.00_224 (Func  (None, None, None, 1280   2257984   
 tional)                     )                                   
                                                                 
 global_average_pooling2d_2  (None, 1280)              0         
  (GlobalAveragePooling2D)                                       
                                                                 
 dense_1 (Dense)             (None, 521)               667401    
                                                                 
Total params: 2925385 (11.16 MB)
Trainable params: 667401 (2.55 MB)
Non-trainable params: 2257984 (8.61 MB)
_________________________________________________________________


In [7]:
model.compile(optimizer=tf.keras.optimizers.Adam(), 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

model.fit(train_dataset, epochs=3, batch_size=16)


ValueError: Failed to find data adapter that can handle input: <class '__main__.AudioDataset'>, <class 'NoneType'>

In [1]:
import glob
import os
import json
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
import torch
import librosa
from collections import namedtuple
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # Set to block CUDA errors

# Load the class map CSV (mapping from mid to index)
class_map_df = pd.read_csv('yamnet_class_map.csv')
class_map = pd.read_csv('yamnet_class_map.csv').set_index('display_name').to_dict()['mid']

# Create a mapping from mid (string) to index (integer)
mid_to_index = {mid: idx for idx, mid in enumerate(set(class_map.values()))}

# Initialize the model and processor
model = Wav2Vec2ForSequenceClassification.from_pretrained('facebook/wav2vec2-base-960h', num_labels=521)

processor = Wav2Vec2Processor.from_pretrained('facebook/wav2vec2-base-960h')

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Moved model to GPU if available")

# Define a namedtuple for dataset items
AudioSample = namedtuple("AudioSample", ["input_values", "labels"])

class AudioDataset(Dataset):
    def __init__(self, audio_directory, ontology_file, mid_to_index):
        with open(ontology_file, 'r') as f:
            self.ontology_data = json.load(f)

        self.mid_to_index = mid_to_index
        self.audio_directory = audio_directory
        self.audio_files = glob.glob(os.path.join(self.audio_directory, '**', '*.wav'), recursive=True)
        
        # Populate the dataset by calling prepare_data
        self.data = self.prepare_data()

    def prepare_data(self):
        data = []
        for category in self.ontology_data:
            if "positive_examples" in category:
                category_name = category["name"]
                mid = category["id"]  # Get the mid for the current category

                # Use the mid to get the index from the mid_to_index
                if mid in self.mid_to_index:
                    label = self.mid_to_index[mid]  # Get the integer index as the label
                else:
                    label = -1  # Default to -1 if not found

                for audio_file in self.audio_files:
                    if category_name.lower() in audio_file.lower():
                        audio_file = audio_file.replace("\\", "/")
                        data.append({"audio": audio_file, "label": label})
        return data
    
    def load_audio(self, file_path):
        """Load and preprocess audio using Wav2Vec2Processor."""
        try:
            if not os.path.isfile(file_path):
                raise FileNotFoundError(f"WAV file not found: {file_path}")

            # Load audio using librosa and resample to 16kHz
            audio_data, sr = librosa.load(file_path, sr=16000)
            print(f"Successfully loaded audio: {file_path}, shape: {audio_data.shape}, dtype: {audio_data.dtype}")

            # Preprocess using Wav2Vec2Processor
            inputs = processor(audio_data, sampling_rate=sr, return_tensors="pt", padding=True)
            processed_audio = inputs.input_values.squeeze(0)  # Remove batch dimension

            print(f"Processed audio shape: {processed_audio.shape}")

            # Return processed audio
            return processed_audio

        except Exception as e:
            print(f"Error loading audio file {file_path}: {e}")
            return None

    def __getitem__(self, idx):
        """Get one item (audio, label) for the dataset."""
        sample = self.data[idx]
        audio_data = self.load_audio(sample["audio"])
        label = sample["label"]

        # Ensure audio_data is valid
        if audio_data is None:
            print(f"Error loading audio at index {idx}, returning dummy data.")
            return {"input_values": torch.zeros(1), "labels": torch.tensor(label, dtype=torch.long)}

        # Trim or pad audio to max_length
        max_length = 160000  # Set a max_length for padding/truncating
        if audio_data.shape[0] < max_length:
            padding = torch.zeros(max_length - audio_data.shape[0])
            audio_data = torch.cat([audio_data, padding])
        else:
            audio_data = audio_data[:max_length]

        print(f"Successfully loaded audio at index {idx}, shape: {audio_data.shape}, label: {label}")
        
        return {"input_values": audio_data.clone().detach(), "labels": torch.tensor(label, dtype=torch.long)}

    def __len__(self):
        return len(self.data)

# Initialize the dataset and dataloaders
audio_directory = r"audiosets/ontology"
ontology_file = 'ontology.json'

# Initialize dataset and prepare data
dataset = AudioDataset(audio_directory, ontology_file, mid_to_index)

# Now split the dataset into train and test sets (80% train, 20% test)
train_data, test_data = train_test_split(dataset.data, test_size=0.2, random_state=42)

# Initialize train and test datasets using the split data
train_dataset = AudioDataset(audio_directory, ontology_file, mid_to_index)
test_dataset = AudioDataset(audio_directory, ontology_file, mid_to_index)

# Assign the split data to the datasets
train_dataset.data = train_data
test_dataset.data = test_data

# Define the compute_metrics function
def compute_metrics(pred):
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = (predictions == labels).mean()
    return {'accuracy': accuracy}

# Training setup
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=3,
    save_steps=10,
    disable_tqdm=False,
    report_to="tensorboard",
    load_best_model_at_end=True,
)

# Initialize train and test dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)

# Pass the datasets without specifying the dataloaders in Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Training loop with memory clearing after each batch
for epoch in range(int(training_args.num_train_epochs)):
    print(f"Training epoch {epoch + 1}")
    model.train()  # Set model to training mode
    for batch in train_dataloader:
        input_values = batch['input_values'].to(device)
        labels = batch['labels'].to(device)
        
        # Forward pass
        outputs = model(input_values, labels=labels)
        loss = outputs.loss
        loss.backward()  # Backward pass to compute gradients
        
        # Clear GPU cache to prevent memory overflow
        torch.cuda.empty_cache()

    # Run evaluation after each epoch (optional)
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        for batch in test_dataloader:
            input_values = batch['input_values'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_values, labels=labels)
            loss = outputs.loss
            logits = outputs.logits
            print(f"Evaluation loss: {loss.item()}")

    torch.cuda.empty_cache()  # Clear GPU cache after each epoch





Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Moved model to GPU if available




Training epoch 1
Error loading audio file audiosets/ontology/Electric guitar_4.wav: cannot access local variable 'parent' where it is not associated with a value
Error loading audio at index 1324, returning dummy data.
Successfully loaded audio: audiosets/ontology/Howl (wind)_2.wav, shape: (160125,), dtype: float32
Processed audio shape: torch.Size([160125])
Successfully loaded audio at index 530, shape: torch.Size([160000]), label: 115
Successfully loaded audio: audiosets/ontology/Waves, surf_4.wav, shape: (159754,), dtype: float32
Processed audio shape: torch.Size([159754])
Successfully loaded audio at index 2218, shape: torch.Size([160000]), label: 413
Successfully loaded audio: audiosets/ontology/Dog_5.wav, shape: (1473074,), dtype: float32
Processed audio shape: torch.Size([1473074])
Successfully loaded audio at index 1756, shape: torch.Size([160000]), label: 85
Successfully loaded audio: audiosets/ontology/Howl_3.wav, shape: (160125,), dtype: float32
Processed audio shape: torch.

RuntimeError: stack expects each tensor to be equal size, but got [1] at entry 0 and [160000] at entry 1