In [1]:
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import IPython.display as ipd
import numpy as np
import pandas as pd
import tensorflow as tf
import requests, tarfile

For Installing Dataset and extracting in Work directory

In [2]:
url = "https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz"
filename = "UrbanSound8K.tar.gz"

with requests.get(url, stream=True) as r:
    r.raise_for_status()
    with open(filename, "wb") as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)

with tarfile.open(filename, "r:gz") as tar:
    tar.extractall()

In [3]:
base_dir = "/content/UrbanSound8K"
metadata_path = os.path.join(base_dir, "metadata", "UrbanSound8K.csv")
audio_dir = os.path.join(base_dir, "audio")

In [4]:
metadata = pd.read_csv(metadata_path)
print(metadata.head())

      slice_file_name    fsID  start        end  salience  fold  classID  \
0    100032-3-0-0.wav  100032    0.0   0.317551         1     5        3   
1  100263-2-0-117.wav  100263   58.5  62.500000         1     5        2   
2  100263-2-0-121.wav  100263   60.5  64.500000         1     5        2   
3  100263-2-0-126.wav  100263   63.0  67.000000         1     5        2   
4  100263-2-0-137.wav  100263   68.5  72.500000         1     5        2   

              class  
0          dog_bark  
1  children_playing  
2  children_playing  
3  children_playing  
4  children_playing  


Function for converting Audio to Mel Spectrogram than normalize it and convert to image array.

In [5]:
from PIL import Image

def preprocess_audio(file_path, sr=22050, n_mels=128, fmax=8000, duration=4):

        y, sr = librosa.load(file_path, sr=sr, duration=duration)
        mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, fmax=fmax)
        mel_db = librosa.power_to_db(mel, ref=np.max)

        mel_norm = (mel_db - mel_db.min()) / (mel_db.max() - mel_db.min())
        mel_img = (mel_norm * 255).astype(np.uint8)
        img = Image.fromarray(mel_img)
        img = img.resize((128, 128))
        img_array = np.array(img) / 255.0

        img_array = np.expand_dims(img_array, axis=-1)

        return img_array

Function that takes data and returns arrays of image array and labels

In [6]:
from tqdm import tqdm

def load_data(metadata, audio_dir, max_files=None):
    X = []
    y = []

    for i, row in tqdm(metadata.iterrows(), total=len(metadata)):
        if max_files and i >= max_files:
            break

        fold = f"fold{row['fold']}"
        file_path = os.path.join(audio_dir, fold, row['slice_file_name'])
        label = row['classID']

        tensor = preprocess_audio(file_path)
        if tensor is not None:
            X.append(tensor)
            y.append(label)

    return np.array(X), np.array(y)

In [13]:
X, y = load_data(metadata, audio_dir)

X = np.array(X)
y = np.array(y)

100%|██████████| 8732/8732 [03:27<00:00, 42.04it/s]


In [12]:
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

print(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")


Train: (6112, 128, 128, 1), Validation: (1310, 128, 128, 1), Test: (1310, 128, 128, 1)


In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam

model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128, 1)),
    BatchNormalization(),
    MaxPooling2D((2, 2)),

    Conv2D(64, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D((2, 2)),
    Dropout(0.3),

    Conv2D(128, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D((2, 2)),
    Dropout(0.3),

    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.4),
    Dense(10, activation='softmax')
])

model.compile(optimizer= 'adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [22]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True,
    verbose=1
)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=30,
    batch_size=32,
    callbacks=[early_stop]
)

Epoch 1/30
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.7923 - loss: 0.5480 - val_accuracy: 0.6282 - val_loss: 2.2720
Epoch 2/30
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - accuracy: 0.8097 - loss: 0.5160 - val_accuracy: 0.7878 - val_loss: 0.8239
Epoch 3/30
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - accuracy: 0.8274 - loss: 0.4537 - val_accuracy: 0.8649 - val_loss: 0.4907
Epoch 4/30
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - accuracy: 0.8299 - loss: 0.4544 - val_accuracy: 0.7031 - val_loss: 1.6695
Epoch 5/30
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - accuracy: 0.8359 - loss: 0.4904 - val_accuracy: 0.6397 - val_loss: 1.9110
Epoch 6/30
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.8410 - loss: 0.4231 - val_accuracy: 0.8542 - val_loss: 0.5573
Epoch 7/30
[1m191/191

In [24]:
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"🧪 Final Test Accuracy: {test_acc:.4f}")

[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8511 - loss: 0.4977
🧪 Final Test Accuracy: 0.8588


Randomly select audio file and predict its class

In [32]:
import random

random_row = metadata.sample(1).iloc[0]
random_path = os.path.join(audio_dir, f"fold{random_row['fold']}", random_row['slice_file_name'])
print(f"Selected file: {random_path} (Actual class: {random_row['class']})")

input_tensor = preprocess_audio(random_path)
if input_tensor is not None:
    input_tensor = np.expand_dims(input_tensor, axis=0)
    pred = model.predict(input_tensor)
    pred_class = np.argmax(pred)
    print(f"Predicted class: {pred_class} ({metadata[metadata.classID == pred_class]['class'].iloc[0]})")

Selected file: /content/UrbanSound8K/audio/fold10/196084-2-0-2.wav (Actual class: children_playing)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
Predicted class: 2 (children_playing)
