<a href="https://colab.research.google.com/github/Tyler-Schwenk/Rana_Draytonii/blob/main/Data_Manager.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

First, convert the audio data into the proper format and attach a label to each sample.

In [4]:
import os
import torch
import torchaudio


# Changes sampling frequency of audio file to 16kHz required by the AST model
def resampler(audio_path, target_sample_rate=16000):
    """Resample the audio file at *audio_path* in place.

    The file is loaded with torchaudio, converted to *target_sample_rate*
    and written back to the same path, overwriting the original file.
    A file that is already at the target rate is left untouched, so
    re-running the notebook does not re-encode it.

    Args:
        audio_path: Path of the audio file to resample. The file is
            overwritten with the resampled audio.
        target_sample_rate: Desired sampling rate in Hz. Defaults to
            16000, the rate this notebook uses for the AST model.
    """
    # load your audio file
    waveform, sample_rate = torchaudio.load(audio_path)

    # nothing to do when the file already has the requested rate
    if sample_rate == target_sample_rate:
        return

    # define the transform under a name that does not shadow this function
    resample = torchaudio.transforms.Resample(
        orig_freq=sample_rate, new_freq=target_sample_rate)

    # resample the waveform and save it over the original file
    torchaudio.save(audio_path, resample(waveform),
                    sample_rate=target_sample_rate)


# Define the path where your positive and negative .wav files are stored.
# Each path may point either at a single .wav file or at a directory that
# contains .wav files.
positive_audio_path = '/Pseudacris and Rana 1.wav'
negative_audio_path = 'path_to_your_negative_audio_files'

# Resample to 16kHz. A directory is walked file by file because resampler()
# loads exactly one audio file per call.
for _audio_path in (positive_audio_path, negative_audio_path):
    if os.path.isdir(_audio_path):
        for _name in sorted(os.listdir(_audio_path)):
            if _name.endswith('.wav'):
                resampler(os.path.join(_audio_path, _name))
    elif os.path.isfile(_audio_path):
        resampler(_audio_path)
    else:
        # Fail early with a clear message (e.g. the placeholder path above
        # was never replaced) instead of an opaque audio-backend error.
        raise FileNotFoundError(f'Audio path does not exist: {_audio_path}')

# Spectrogram geometry shared by every sample
mel_bins = 128  # Number of bins in Mel spectrogram
target_length = 1024  # Number of frames each spectrogram is padded or cut to

# Function to convert audio to Mel spectrogram
def audio_to_mel_spectrogram(wav_name, mel_bins, target_length=1024,
                             norm_mean=-4.2677393, norm_std=4.5689974):
    """Convert a 16kHz audio file into a fixed-length, normalized filterbank.

    The file is loaded with torchaudio, turned into Kaldi-compatible
    filterbank features, padded with zeros or truncated along the frame
    axis to exactly *target_length* frames, and normalized as
    ``(fbank - norm_mean) / (norm_std * 2)``.

    Args:
        wav_name: Path of the audio file to load.
        mel_bins: Number of Mel bins passed to the filterbank computation.
        target_length: Number of frames in the returned spectrogram.
        norm_mean: Mean subtracted from the features. The default looks
            like a dataset statistic from the AST reference setup --
            TODO confirm and recompute for your own data.
        norm_std: Standard deviation used for scaling; the features are
            divided by twice this value. Same caveat as *norm_mean*.

    Returns:
        The normalized filterbank tensor with *target_length* rows (frames).

    Raises:
        AssertionError: If the file's sampling rate is not 16000 Hz.
    """
    waveform, sr = torchaudio.load(wav_name)
    assert sr == 16000, 'input audio sampling rate must be 16kHz'

    fbank = torchaudio.compliance.kaldi.fbank(
        waveform, htk_compat=True, sample_frequency=sr, use_energy=False,
        window_type='hanning', num_mel_bins=mel_bins, dither=0.0, frame_shift=10)

    # Positive -> too short (pad at the end), negative -> too long (truncate).
    missing_frames = target_length - fbank.shape[0]
    if missing_frames > 0:
        # (left, right, top, bottom): append zero rows after the last frame.
        fbank = torch.nn.functional.pad(fbank, (0, 0, 0, missing_frames))
    elif missing_frames < 0:
        fbank = fbank[:target_length, :]

    return (fbank - norm_mean) / (norm_std * 2)

# Prepare dataset
def _list_wav_files(path):
    """Return the audio files at *path*, which is a directory or one file.

    For a directory, the contained ``.wav`` files are returned in sorted
    order so the dataset order is reproducible. For a single file, that
    file alone is returned.

    Raises:
        FileNotFoundError: If *path* is neither a file nor a directory.
    """
    if os.path.isdir(path):
        return [os.path.join(path, name)
                for name in sorted(os.listdir(path))
                if name.endswith('.wav')]
    if os.path.isfile(path):
        return [path]
    raise FileNotFoundError(f'Audio path does not exist: {path}')


dataset = []
labels = []

# Positive samples are labeled as 1, negative samples as 0
for _audio_path, _label in ((positive_audio_path, 1), (negative_audio_path, 0)):
    for filepath in _list_wav_files(_audio_path):
        spectrogram = audio_to_mel_spectrogram(filepath, mel_bins, target_length)
        dataset.append(spectrogram)
        labels.append(_label)

# torch.stack cannot build a tensor from an empty list; report the real cause
if not dataset:
    raise RuntimeError('No .wav files found in the positive or negative audio paths')

# Convert to PyTorch tensors
dataset = torch.stack(dataset)
labels = torch.Tensor(labels)

# Now `dataset` is your dataset of spectrograms and `labels` are the corresponding labels

# Note: The spectrogram step requires audio sampled at 16kHz. The resampling step above converts the files to
# this rate before the spectrograms are generated. -- DONE

# Remember to split your data into training, validation, and test sets, so you can properly train and evaluate your model.

# Keep in mind that you might need to adjust this script according to your specific needs, for example, to manage memory 
# if you have a large number of audio files. In such cases, you might want to save the spectrograms to disk and use a 
# PyTorch DataLoader to load them in batches during training.

Sampling rate of /Pseudacris and Rana 1.wav is 44100Hz
Sampling rate of /Pseudacris and Rana 1.wav is 16000Hz
