In [1]:
from datasets import load_dataset, Dataset, Audio, ClassLabel, Features
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# The two target classes; a label id is the position of its name in this list.
class_labels = ClassLabel(names=["bang", "dog_bark"])

# Train split of the ESC-50 dataset hosted on the Hub.
esc50 = load_dataset(path="ashraq/esc50", split="train")

Repo card metadata block was not found. Setting CardData to empty.


In [3]:
# Dataset schema: an audio column plus a class-label column backed by class_labels.
features = Features(audio=Audio(), labels=class_labels)


In [4]:
# Build the dataset in memory: one audio file path per example, paired
# position-by-position with its class id.
dataset = Dataset.from_dict(
    mapping={
        "audio": [
            "audio/fold1/bang-100662.wav",
            "audio/fold1/dog-barking-406629.wav",
        ],
        "labels": [0, 1],
    },
    features=features,
)

In [5]:
# Step 2: Preprocess the Audio Data
from transformers import ASTFeatureExtractor
import torch

# Checkpoint whose feature extractor drives the preprocessing below
pretrained_model = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = ASTFeatureExtractor.from_pretrained(
    pretrained_model_name_or_path=pretrained_model
)

# Keep the extractor's sampling rate and its first model input name at hand
SAMPLING_RATE = feature_extractor.sampling_rate
model_input_name = feature_extractor.model_input_names[0]

# Preprocess audio function
def preprocess_audio(batch):
    """Turn a batch of decoded audio clips into model inputs.

    Args:
        batch: Mapping with an "input_values" entry (iterable of audio objects
            exposing an "array" item) and a "labels" entry.

    Returns:
        dict: The feature extractor output stored under ``model_input_name``
        and the batch labels as a list under "labels".
    """
    waveforms = []
    for clip in batch["input_values"]:
        waveforms.append(clip["array"])

    encoded = feature_extractor(
        waveforms,
        sampling_rate=SAMPLING_RATE,
        return_tensors="pt",
    )

    return {
        model_input_name: encoded.get(model_input_name),
        "labels": list(batch["labels"]),
    }

# Apply the transformation to the dataset.
# The audio column was declared without a sampling rate, so clips would be
# decoded at their native rate while preprocess_audio tells the feature
# extractor they are sampled at SAMPLING_RATE. Casting the column makes the
# decoder resample to the rate the extractor expects.
# The guard keeps this cell re-runnable: once the column has been renamed,
# "audio" no longer exists and the cast/rename must not be repeated.
if "audio" in dataset.column_names:
    dataset = dataset.cast_column("audio", Audio(sampling_rate=SAMPLING_RATE))
    dataset = dataset.rename_column("audio", "input_values")

# Featurize lazily on access; only the columns returned by the transform are kept.
dataset.set_transform(preprocess_audio, output_all_columns=False)

# Inspect a transformed sample
print(dataset[0])

{'input_values': [<datasets.features._torchcodec.AudioDecoder object at 0x7fb0c0b71fd0>], 'labels': [0]}
{'input_values': tensor([[-1.2776, -1.2776, -1.2776,  ..., -1.2776, -1.2776, -1.2776],
        [-1.2776, -1.2776, -1.2776,  ..., -1.2776, -1.2776, -1.2776],
        [-1.2776, -1.2776, -1.2776,  ..., -1.2776, -1.2776, -1.2776],
        ...,
        [-0.8126, -1.1532, -0.7764,  ..., -1.2378, -1.2776, -1.2216],
        [-0.8920, -1.2540, -0.8771,  ..., -1.1316, -1.2758, -1.2431],
        [-1.2748, -1.2776, -0.9781,  ..., -1.2000, -1.2636, -1.1547]]), 'labels': 0}


In [7]:
# Hold out 20% of the examples as the "test" split; the seed is fixed to 42.
dataset = dataset.train_test_split(
    seed=42,
    test_size=0.2,
)

In [8]:
from audiomentations import Compose, AddGaussianSNR, Gain, GainTransition, ClippingDistortion, TimeStretch, PitchShift

# Waveform augmentation pipeline: noise, gain changes, clipping, time stretch
# and pitch shift, combined in a single Compose with p=0.8 and shuffle=True.
audio_augmentations = Compose(
    transforms=[
        AddGaussianSNR(min_snr_db=10, max_snr_db=20),
        Gain(min_gain_db=-6, max_gain_db=6),
        GainTransition(
            min_gain_db=-6,
            max_gain_db=6,
            min_duration=0.01,
            max_duration=0.3,
            duration_unit="fraction",
        ),
        ClippingDistortion(
            min_percentile_threshold=0,
            max_percentile_threshold=30,
            p=0.5,
        ),
        TimeStretch(min_rate=0.8, max_rate=1.2),
        PitchShift(min_semitones=-4, max_semitones=4),
    ],
    p=0.8,
    shuffle=True,
)

# Preprocess audio with augmentations
def preprocess_audio_with_transforms(batch):
    """Augment a batch of decoded audio clips, then turn them into model inputs.

    Args:
        batch: Mapping with an "input_values" entry (iterable of audio objects
            exposing an "array" item) and a "labels" entry.

    Returns:
        dict: The feature extractor output for the augmented waveforms stored
        under ``model_input_name`` and the batch labels as a list under "labels".
    """
    augmented = []
    for clip in batch["input_values"]:
        augmented.append(audio_augmentations(clip["array"], sample_rate=SAMPLING_RATE))

    encoded = feature_extractor(
        augmented,
        sampling_rate=SAMPLING_RATE,
        return_tensors="pt",
    )

    return {
        model_input_name: encoded.get(model_input_name),
        "labels": list(batch["labels"]),
    }

# The held-out split is only featurized; the training split is augmented first.
dataset["test"].set_transform(
    transform=preprocess_audio,
    output_all_columns=False,
)
dataset["train"].set_transform(
    transform=preprocess_audio_with_transforms,
    output_all_columns=False,
)

In [12]:
from transformers import ASTConfig, ASTForAudioClassification

# Load configuration from the pretrained model
config = ASTConfig.from_pretrained(pretrained_model)

# Overview of the ESC-50 reference dataset. Its printed schema has no "labels"
# column, so the class count below is taken from class_labels instead.
print(esc50)

# Update configuration with the classes of our dataset
label2id = {label: i for i, label in enumerate(class_labels.names)}
num_labels = len(label2id)
config.num_labels = num_labels
config.label2id = label2id
config.id2label = {i: label for label, i in label2id.items()}

# Initialize the model with the updated configuration.
# ignore_mismatched_sizes=True allows loading even though the checkpoint's
# classifier was trained for a different number of labels. No explicit
# init_weights() call follows, so the loaded pretrained weights are kept.
model = ASTForAudioClassification.from_pretrained(
    pretrained_model,
    config=config,
    ignore_mismatched_sizes=True,
)

Dataset({
    features: ['filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take', 'audio'],
    num_rows: 2000
})
