In [3]:
# Install kaggle API
!pip install kaggle api

# Upload your kaggle.json before running this cell (in Colab: "Files" → upload)
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download and unzip dataset
!kaggle datasets download -d eliasmarcon/environmental-sound-classification-50
!unzip -q environmental-sound-classification-50.zip -d esc50

Dataset URL: https://www.kaggle.com/datasets/eliasmarcon/environmental-sound-classification-50
License(s): MIT
Downloading environmental-sound-classification-50.zip to /content
 95% 583M/615M [00:06<00:00, 145MB/s]
100% 615M/615M [00:06<00:00, 98.1MB/s]


In [10]:
pip install librosa numpy pandas scikit-learn tensorflow tqdm




In [21]:
import os
import numpy as np
import pandas as pd
import librosa
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# Paths
audio_dir = "esc50/ESC-50-master/wav_files"
label_file = "esc50/ESC-50-master/esc50_labels.csv"

# Load metadata
labels_df = pd.read_csv(label_file)

# Merge metadata with audio file info
# Files are visited in sorted order: os.listdir() returns entries in an
# arbitrary, filesystem-dependent order, which would otherwise change the row
# order of df_full (and with it the seeded train/test split) between machines.
audio_info = []
unreadable_files = []  # (filename, exception) for every clip that failed to load
for file in sorted(os.listdir(audio_dir)):
    if not file.endswith(".wav"):
        continue
    try:
        # sr=None keeps the file's native sampling rate instead of resampling.
        signal, sr = librosa.load(os.path.join(audio_dir, file), sr=None)
        duration = librosa.get_duration(y=signal, sr=sr)
        audio_info.append({
            "filename": file,
            "duration_sec": duration,
            "sample_rate": sr,
            "samples": len(signal)
        })
    except Exception as exc:
        # Best effort: one bad clip must not abort the scan, but it must not
        # vanish silently either. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit still stop the cell.
        unreadable_files.append((file, exc))

if unreadable_files:
    print(f"Skipped {len(unreadable_files)} unreadable audio file(s):")
    for bad_name, bad_exc in unreadable_files[:10]:
        print(f"  {bad_name}: {type(bad_exc).__name__}: {bad_exc}")

if not audio_info:
    # Without this check the merge below fails with an opaque KeyError on "filename".
    raise FileNotFoundError(f"No readable .wav files found in {audio_dir!r}")

df_audio = pd.DataFrame(audio_info)
# Inner join: only clips that are both on disk and listed in the label file are kept.
df_full = pd.merge(df_audio, labels_df, on="filename")

# Encode labels
# Learn the category -> integer mapping first, then apply it to every row.
label_encoder = LabelEncoder()
label_encoder.fit(df_full['category'])
df_full['label_idx'] = label_encoder.transform(df_full['category'])

# One output unit per distinct category seen in the data.
num_classes = len(label_encoder.classes_)

# Feature extraction: MFCC + delta + delta-delta
# Every clip becomes a fixed-size [216, 120] matrix (time frames x coefficients):
# 40 MFCCs, their first-order deltas and their second-order deltas.
X = []
y = []
failed_files = []  # (filename, exception) for every clip that could not be processed

print("Extracting MFCC + delta features...")
for i, row in tqdm(df_full.iterrows(), total=len(df_full)):
    try:
        # Resample everything to 22.05 kHz so all clips share one time base.
        y_audio, sr = librosa.load(os.path.join(audio_dir, row["filename"]), sr=22050)
        mfcc = librosa.feature.mfcc(y=y_audio, sr=sr, n_mfcc=40)
        delta = librosa.feature.delta(mfcc)
        delta2 = librosa.feature.delta(mfcc, order=2)
        features = np.vstack([mfcc, delta, delta2])  # [120, time]
        # Pad or truncate along the time axis so every clip has 216 frames.
        features = librosa.util.fix_length(features, size=216, axis=1)
        # NOTE(review): with default arguments normalize() works along axis 0,
        # i.e. each time frame is scaled on its own rather than each
        # coefficient across time — confirm this is the intended scaling.
        features = librosa.util.normalize(features)
        X.append(features.T)  # Shape: [216, 120]
        y.append(row["label_idx"])
    except Exception as exc:
        # Skip the clip but remember why. Catching Exception (not a bare
        # except) lets KeyboardInterrupt still stop a long extraction run.
        failed_files.append((row["filename"], exc))

# Reported after the loop so the messages do not break up the progress bar.
if failed_files:
    print(f"Skipped {len(failed_files)} clip(s) during feature extraction:")
    for bad_name, bad_exc in failed_files[:10]:
        print(f"  {bad_name}: {type(bad_exc).__name__}: {bad_exc}")

X = np.array(X)
y = to_categorical(np.array(y), num_classes=num_classes)

print("Feature shape:", X.shape)
print("Label shape:", y.shape)

# Split by stratified sampling
# 80/20 hold-out; stratifying on the one-hot label rows keeps the class mix
# the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Attention layer
class Attention(tf.keras.layers.Layer):
    """Attention pooling that collapses the time axis of a sequence.

    Expects a rank-3 input ``(batch, time, features)``. Each time step gets a
    scalar score ``tanh(x @ W + b)``; the scores are soft-maxed over the time
    axis and used as weights for a sum over time, so the output has shape
    ``(batch, features)``.

    The bias holds one value per time step, so the layer is tied to the
    sequence length it was built with.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the projection vector ``W`` and the per-time-step bias ``b``."""
        n_steps = input_shape[1]
        n_features = input_shape[-1]
        self.W = self.add_weight(
            name="att_weight",
            shape=(n_features, 1),
            initializer="normal",
        )
        self.b = self.add_weight(
            name="att_bias",
            shape=(n_steps, 1),
            initializer="zeros",
        )
        super().build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of ``x`` over the time axis."""
        # Contract the feature axis of x with W: one raw score per time step.
        scores = tf.tanh(tf.tensordot(x, self.W, axes=1) + self.b)
        # Normalise over time (axis 1) so the weights of each sequence sum to 1.
        weights = tf.nn.softmax(scores, axis=1)
        # weights broadcasts across the feature axis before the sum over time.
        return tf.reduce_sum(x * weights, axis=1)

# Model
# Seed Python, NumPy and TensorFlow in one call so that weight initialisation
# and dropout masks are repeatable after "Restart & Run All". The value matches
# the random_state used for the train/test split. (Some GPU kernels may still
# introduce small run-to-run differences.)
tf.keras.utils.set_random_seed(42)

# Input: 216 time frames x 120 coefficients per clip.
input_layer = layers.Input(shape=(216, 120))
# First recurrent stage keeps the full sequence for the next stage.
x = layers.Bidirectional(layers.GRU(128, return_sequences=True))(input_layer)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.4)(x)
# Second recurrent stage also returns the full sequence: Attention needs every step.
x = layers.Bidirectional(layers.GRU(64, return_sequences=True))(x)
# Collapse the time axis into a single vector per clip.
x = Attention()(x)
x = layers.Dense(64, activation='relu')(x)
x = layers.Dropout(0.3)(x)
# One softmax probability per class, matching the one-hot targets.
output_layer = layers.Dense(num_classes, activation='softmax')(x)

model = models.Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Callbacks
# Halve the learning rate after 4 epochs without val_loss improvement ...
lr_schedule = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=4, verbose=1)
# ... and stop after 8 such epochs, rolling back to the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)
callbacks = [lr_schedule, early_stop]

# Train
# validation_split holds out the final 20% of the training arrays for validation.
fit_options = dict(
    validation_split=0.2,
    epochs=50,
    batch_size=32,
    callbacks=callbacks,
    verbose=1,
)
history = model.fit(X_train, y_train, **fit_options)

# Evaluate
# evaluate() returns the loss followed by the compiled metrics (accuracy here).
loss, acc = model.evaluate(X_test, y_test)
print(f"\n✅ Test Accuracy: {acc:.2%}")


Extracting MFCC + delta features...


100%|██████████| 2000/2000 [00:44<00:00, 44.58it/s]


Feature shape: (2000, 216, 120)
Label shape: (2000, 50)


Epoch 1/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 844ms/step - accuracy: 0.0380 - loss: 3.7883 - val_accuracy: 0.0781 - val_loss: 3.8252 - learning_rate: 0.0010
Epoch 2/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 834ms/step - accuracy: 0.1259 - loss: 3.3218 - val_accuracy: 0.1094 - val_loss: 3.7096 - learning_rate: 0.0010
Epoch 3/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 819ms/step - accuracy: 0.1813 - loss: 3.0279 - val_accuracy: 0.1187 - val_loss: 3.5735 - learning_rate: 0.0010
Epoch 4/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 864ms/step - accuracy: 0.2443 - loss: 2.7886 - val_accuracy: 0.1937 - val_loss: 3.4169 - learning_rate: 0.0010
Epoch 5/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 818ms/step - accuracy: 0.2901 - loss: 2.5771 - val_accuracy: 0.2250 - val_loss: 3.2352 - learning_rate: 0.0010
Epoch 6/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m