# Bird Sound Classification with CNN
This notebook implements a CNN-based classifier using Mel spectrograms extracted from bird sound recordings.

In [1]:

import os
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from pathlib import Path


In [2]:

# --- Audio / feature-extraction configuration --------------------------------
SAMPLE_RATE = 22050  # Hz; clips are resampled to this rate when loaded
DURATION = 5  # seconds
N_MELS = 128  # number of Mel bands (spectrogram height)
N_FFT = 2048  # FFT window size, in samples
HOP_LENGTH = 512  # samples between successive STFT frames

# Number of time frames every spectrogram is padded / truncated to.
# A centered STFT (librosa's default) yields 1 + n_samples // hop_length frames.
# The previous ceil(n_samples / hop_length) form only agrees with that when
# n_samples is not a multiple of HOP_LENGTH; for the values above both give 216.
MAX_PAD_LEN = 1 + (SAMPLE_RATE * DURATION) // HOP_LENGTH

# Directory holding the 5-second .wav segments. Set the BIRDIFY_AUDIO_DIR
# environment variable to run the notebook on another machine; the original
# location is kept as the default so existing setups keep working.
AUDIO_DIR = os.environ.get(
    "BIRDIFY_AUDIO_DIR",
    r"C:\UPF\Taller de Tecnologia Musical\Birdify\data\processed\birdcall_segments_5s\birdcall_segments_5s",
)


In [3]:

def load_and_preprocess_audio(file_path):
    """Load one audio clip and turn it into a fixed-size, normalised Mel spectrogram.

    The clip is loaded at SAMPLE_RATE (at most DURATION seconds), converted to a
    Mel power spectrogram, expressed in dB relative to its own peak, min-max
    scaled to [0, 1], and finally zero-padded or truncated along the time axis
    to exactly MAX_PAD_LEN frames.

    Args:
        file_path: Path (str or path-like) of the audio file to load.

    Returns:
        A float array whose time axis has MAX_PAD_LEN frames and which carries a
        trailing channel axis of size 1, or ``None`` when the file contains no
        samples (so callers can skip it instead of crashing).

    Raises:
        Whatever ``librosa.load`` raises for unreadable or missing files.
    """
    y, sr = librosa.load(file_path, sr=SAMPLE_RATE, duration=DURATION)
    if y.size == 0:
        # Nothing to analyse; signal "skip this file" to the caller.
        return None

    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=N_MELS, n_fft=N_FFT, hop_length=HOP_LENGTH)
    S_dB = librosa.power_to_db(S, ref=np.max)

    # Min-max scale to [0, 1]. A constant spectrogram (e.g. a silent clip) has
    # zero range; dividing by it would fill the features with NaN, so map that
    # case to all zeros instead.
    lowest = np.min(S_dB)
    value_range = np.max(S_dB) - lowest
    if value_range > 0:
        S_dB = (S_dB - lowest) / value_range
    else:
        S_dB = np.zeros_like(S_dB)

    # Force a fixed number of time frames: pad short clips with zeros (the
    # post-normalisation minimum), then cut anything longer than MAX_PAD_LEN.
    missing_frames = max(0, MAX_PAD_LEN - S_dB.shape[1])
    S_dB = np.pad(S_dB, ((0, 0), (0, missing_frames)), mode='constant')
    S_dB = S_dB[:, :MAX_PAD_LEN]

    # Append the single channel axis expected by the Conv2D layers.
    return S_dB[..., np.newaxis]


In [4]:

def create_dataset(audio_dir=None):
    """Build the spectrogram dataset from the .wav files of one directory.

    Each file's class name is the part of its stem before the first underscore.
    Files for which ``load_and_preprocess_audio`` returns ``None`` are skipped.

    Args:
        audio_dir: Directory to scan for ``*.wav`` files. Defaults to AUDIO_DIR.

    Returns:
        A tuple ``(X, y_clean, label_encoder)`` where ``X`` is the stacked
        spectrogram array, ``y_clean`` holds the integer class id of every
        spectrogram in ``X``, and ``label_encoder`` is the fitted
        ``LabelEncoder`` mapping ids back to class names.

    Raises:
        ValueError: If the directory yields no usable .wav file.
    """
    source_dir = Path(AUDIO_DIR if audio_dir is None else audio_dir)

    # glob() order depends on the filesystem; sorting keeps the sample order,
    # and therefore any seeded train/test split, reproducible across machines.
    wav_files = sorted(source_dir.glob("*.wav"))

    spectrograms = []
    class_names = []
    for wav_path in wav_files:
        spec = load_and_preprocess_audio(wav_path)
        if spec is None:
            continue
        spectrograms.append(spec)
        class_names.append(wav_path.stem.split('_')[0])

    if not spectrograms:
        raise ValueError(f"No usable .wav files found in {source_dir}")

    # Fit the encoder only on files that were actually kept, so class ids are
    # contiguous and classes_ never lists a class without any sample.
    label_encoder = LabelEncoder()
    y_clean = label_encoder.fit_transform(class_names)
    X = np.array(spectrograms)

    return X, y_clean, label_encoder


In [5]:

def create_improved_cnn(input_shape, num_classes):
    """Assemble the (uncompiled) Sequential CNN used for spectrogram classification.

    Architecture, in order:
      * two convolutional stages with 32 and then 64 filters, each made of
        Conv2D -> BatchNormalization -> Conv2D -> MaxPooling2D(2x2) -> Dropout(0.25);
      * a final Conv2D(128) -> BatchNormalization -> GlobalAveragePooling2D;
      * a classifier head Dense(256, relu) -> Dropout(0.5) -> Dense(num_classes, softmax).

    Args:
        input_shape: Shape of one input sample, passed to ``layers.Input``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The ``models.Sequential`` instance.
    """
    kernel = (3, 3)

    stack = [layers.Input(shape=input_shape)]

    # The two pooled stages share one layout and differ only in filter count.
    for n_filters in (32, 64):
        stack.extend([
            layers.Conv2D(n_filters, kernel, activation='relu', padding='same'),
            layers.BatchNormalization(),
            layers.Conv2D(n_filters, kernel, activation='relu', padding='same'),
            layers.MaxPooling2D((2, 2)),
            layers.Dropout(0.25),
        ])

    # Last feature extractor, collapsed to one vector per sample.
    stack.extend([
        layers.Conv2D(128, kernel, activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.GlobalAveragePooling2D(),
    ])

    # Classification head.
    stack.extend([
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax'),
    ])

    return models.Sequential(stack)


In [6]:

def train_cnn_model(test_size=0.2, validation_split=0.2, epochs=50, batch_size=16,
                    learning_rate=0.0001, patience=5, random_state=42):
    """Build the dataset, train the CNN and return it with a held-out test set.

    The data is split into train/test with stratification on the class ids;
    a further ``validation_split`` fraction of the training part is used by
    Keras for validation. Training stops early when ``val_loss`` has not
    improved for ``patience`` epochs, and the best weights are restored.

    Args:
        test_size: Fraction of the dataset held out for testing.
        validation_split: Fraction of the training data used for validation.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size passed to ``model.fit``.
        learning_rate: Learning rate of the Adam optimizer.
        patience: Early-stopping patience, in epochs.
        random_state: Seed for the train/test split.

    Returns:
        A tuple ``(model, history, label_encoder, X_test, y_test)``.
    """
    X, y, label_encoder = create_dataset()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    input_shape = X_train[0].shape
    # Size the softmax from the encoder rather than from the labels present in
    # y: every encoded id is then guaranteed to be a valid output index, and
    # the width matches label_encoder.classes_ used when reporting results.
    num_classes = len(label_encoder.classes_)

    model = create_improved_cnn(input_shape, num_classes)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=patience, restore_best_weights=True
    )

    history = model.fit(
        X_train,
        y_train,
        validation_split=validation_split,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[early_stopping],
    )

    return model, history, label_encoder, X_test, y_test


In [7]:

def plot_confusion_matrix(y_true, y_pred, classes):
    """Draw the confusion matrix of a set of predictions as an annotated heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, same length as ``y_true``.
        classes: Class names used as tick labels on both axes. When the labels
            are integer-encoded, entry ``i`` is taken to be the name of class id ``i``.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # With integer class ids, pin the matrix to one row/column per entry of
    # `classes`. Otherwise a class absent from both y_true and y_pred would be
    # dropped from the matrix and the tick labels would no longer line up.
    integer_encoded = (
        np.issubdtype(y_true.dtype, np.integer)
        and np.issubdtype(y_pred.dtype, np.integer)
    )
    labels = np.arange(len(classes)) if integer_encoded else None

    cm = confusion_matrix(y_true, y_pred, labels=labels)

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=classes, yticklabels=classes, cmap='Blues', ax=ax)
    ax.set_ylabel('True')
    ax.set_xlabel('Predicted')
    ax.set_title('Confusion Matrix')
    plt.show()


In [None]:

# Train the model, then evaluate it on the held-out test split.
model, history, label_encoder, X_test, y_test = train_cnn_model()
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.2f}")

# Predicted class id = index of the largest softmax output.
y_pred = np.argmax(model.predict(X_test), axis=1)

# Pass the full list of class ids explicitly: with target_names but no labels,
# classification_report raises if some class never shows up in y_test / y_pred.
class_ids = np.arange(len(label_encoder.classes_))
print(classification_report(y_test, y_pred, labels=class_ids, target_names=label_encoder.classes_))
plot_confusion_matrix(y_test, y_pred, label_encoder.classes_)


In [None]:
# X and y only exist inside train_cnn_model(); at notebook scope the evaluation
# cell exposes the held-out split, so report the shapes of those arrays instead.
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)