In [1]:

import pandas as pd
from pathlib import Path

# Where the metadata table and the per-fold audio folders live (relative paths).
CSV_PATH = Path("UrbanSound8K.csv")
DATA_DIR = Path("Data")

df = pd.read_csv(CSV_PATH)

# Each clip is stored as DATA_DIR / "fold<fold>" / <slice_file_name>; build that
# path once per row so later cells can load audio directly from the frame.
df["file_path"] = [
    DATA_DIR / f"fold{fold}" / file_name
    for fold, file_name in zip(df["fold"], df["slice_file_name"])
]

# Quick sanity check of the derived paths next to their class names.
print(df.loc[:, ["file_path", "class"]].head())

import librosa
import numpy as np

def extract_mfcc(file_path, n_mfcc=40):
    """Summarise an audio file as its time-averaged MFCC vector.

    Parameters
    ----------
    file_path : path-like
        Audio file handed to ``librosa.load``.
    n_mfcc : int, default 40
        Number of MFCC coefficients requested from librosa.

    Returns
    -------
    numpy.ndarray
        The MFCC matrix averaged along axis 1, i.e. one value per coefficient,
        so clips of different durations map to vectors of the same size.
    """
    # sr=None is forwarded to librosa.load so the sampling rate reported by
    # the loader is the one used for the MFCC computation.
    signal, sample_rate = librosa.load(file_path, sr=None)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)
    return coefficients.mean(axis=1)

def extract_mel(file_path, n_mels=64, n_fft=1024, hop_length=512):
    """Turn an audio file into a standardised log-mel summary vector.

    The clip is converted to a mel power spectrogram, mapped to decibels
    relative to its own maximum, and summarised by the per-band mean and
    standard deviation along axis 1. The two summaries are concatenated and
    the resulting vector is standardised to zero mean / unit variance.

    Parameters
    ----------
    file_path : path-like
        Audio file handed to ``librosa.load``.
    n_mels : int, default 64
        Number of mel bands; the returned vector has ``2 * n_mels`` entries.
    n_fft : int, default 1024
        FFT window size passed to ``librosa.feature.melspectrogram``.
    hop_length : int, default 512
        Hop length passed to ``librosa.feature.melspectrogram``.

    Returns
    -------
    numpy.ndarray
        1-D vector ``[band means..., band stds...]`` after standardisation.
        If the vector has zero spread (e.g. an all-zero clip, whose dB
        spectrogram is constant) the centred all-zero vector is returned
        instead of dividing by zero and producing NaNs.
    """
    # sr=None is forwarded to librosa.load so the sampling rate reported by
    # the loader is the one used for the spectrogram.
    signal, sample_rate = librosa.load(file_path, sr=None)
    mel_power = librosa.feature.melspectrogram(
        y=signal,
        sr=sample_rate,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length,
    )
    mel_db = librosa.power_to_db(mel_power, ref=np.max)

    # Collapse the time axis so every clip yields a fixed-size vector.
    features = np.concatenate([np.mean(mel_db, axis=1), np.std(mel_db, axis=1)])

    centered = features - np.mean(features)
    spread = np.std(features)
    if spread == 0:
        # Constant feature vector: centred values are already all zero, and
        # dividing by the zero spread would turn them into NaN.
        return centered
    return centered / spread


X, y = [], []

# Build the feature matrix clip by clip. A clip whose extraction raises is
# reported and skipped, so X and y always stay the same length.
for record in df.to_dict("records"):
    try:
        X.append(extract_mel(record["file_path"]))
        y.append(record["classID"])  # numeric label
    except Exception as err:
        print(f"Error loading {record['file_path']}: {err}")

X, y = np.array(X), np.array(y)

# Shapes first, then one example feature vector and its label.
print(X.shape, y.shape)
print(X[0])
print(y[0])




                       file_path             class
0    Data\fold5\100032-3-0-0.wav          dog_bark
1  Data\fold5\100263-2-0-117.wav  children_playing
2  Data\fold5\100263-2-0-121.wav  children_playing
3  Data\fold5\100263-2-0-126.wav  children_playing
4  Data\fold5\100263-2-0-137.wav  children_playing


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)


(8732, 128) (8732,)
[-1.3482255  -1.2236692  -1.1316347  -1.1461596  -1.178558   -1.1745125
 -1.0340447  -0.93703234 -0.7997523  -0.59453374 -0.47595215 -0.4441112
 -0.38117942 -0.2386911  -0.29449016 -0.20423783 -0.18983157 -0.24527268
 -0.4072699  -0.4512159  -0.43714538 -0.45558703 -0.30265516 -0.13684435
 -0.22802036 -0.4483584  -0.45787716 -0.4088175  -0.37859172 -0.44224262
 -0.53664947 -0.6146636  -0.67115396 -0.7385174  -0.814543   -0.9101011
 -1.0466732  -1.0670856  -0.92826647 -1.0721225  -1.2127682  -1.2910459
 -1.2607192  -1.2695026  -1.2778567  -1.3437096  -1.3886501  -1.2127593
 -1.2135909  -1.1892092  -1.2079809  -1.2362734  -1.2802215  -1.390651
 -1.4704983  -1.5087192  -1.5625389  -1.6282246  -1.6469471  -1.6556369
 -1.6926061  -1.694156   -1.6973214  -1.74441     0.6683605   0.73290193
  0.7833929   0.7505707   0.745637    0.8098202   0.81915736  0.8794411
  0.9675095   0.99065393  1.00603     1.0596918   1.1045706   1.0740318
  0.9988428   0.96384865  0.9906596   0.9

In [2]:
# Peek at the first rows of the metadata table, including the derived file_path column.
df.head(n=5)

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class,file_path
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark,Data\fold5\100032-3-0-0.wav
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing,Data\fold5\100263-2-0-117.wav
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing,Data\fold5\100263-2-0-121.wav
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing,Data\fold5\100263-2-0-126.wav
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing,Data\fold5\100263-2-0-137.wav


In [5]:
from sklearn.model_selection import train_test_split

from keras.callbacks import EarlyStopping
from keras.layers import BatchNormalization, Dense, Dropout, Input
from keras.models import Sequential
from keras.optimizers import Adam
from keras.utils import set_random_seed

SEED = 42
# Early stopping is wired in but disabled, matching the previous behaviour of
# always training for the full epoch budget. Flip to True to stop once
# val_loss has not improved for `patience` epochs and restore the best weights.
USE_EARLY_STOPPING = False

# NOTE(review): this is a random stratified split. The metadata has `fold` and
# `fsID` columns; slices cut from the same source recording can therefore land
# in both train and test, which may inflate test scores. Consider splitting by
# the predefined `fold` column instead - confirm against the dataset docs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=SEED,
    stratify=y,   # keep class proportions equal in both partitions
)

# Seed Python, NumPy and the Keras backend so weight initialisation, dropout
# and batch shuffling are repeatable (as far as the backend allows).
set_random_seed(SEED)

# Derive layer sizes from the data instead of hard-coding 128 / 10, so the
# network still fits if the feature extractor or label set changes.
n_features = X_train.shape[1]
n_classes = int(y.max()) + 1   # labels are integer class ids starting at 0

model = Sequential([

    # Explicit Input layer: passing `input_shape=` to the first Dense layer
    # makes Keras emit a UserWarning for Sequential models.
    Input(shape=(n_features,)),

    # Hidden layer 1
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    # Hidden layer 2
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    # Hidden layer 3
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    # Output layer: one softmax unit per class id
    Dense(n_classes, activation='softmax')
])

model.compile(
    optimizer=Adam(learning_rate=0.001),
    # Integer labels (not one-hot), hence the sparse variant of the loss.
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)
callbacks = [early_stop] if USE_EARLY_STOPPING else []

history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.1,   # validation data is carved out of the training set
    verbose=1,
    callbacks=callbacks,
)

test_loss, test_acc = model.evaluate(X_test, y_test)
print("Test Accuracy:", test_acc )
print("Test Loss:", test_loss )

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.3681 - loss: 1.9300 - val_accuracy: 0.3405 - val_loss: 1.8397
Epoch 2/50
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4981 - loss: 1.4706 - val_accuracy: 0.5665 - val_loss: 1.3069
Epoch 3/50
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5528 - loss: 1.3132 - val_accuracy: 0.6080 - val_loss: 1.1468
Epoch 4/50
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5764 - loss: 1.2422 - val_accuracy: 0.4979 - val_loss: 1.5014
Epoch 5/50
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6102 - loss: 1.1439 - val_accuracy: 0.6638 - val_loss: 1.0163
Epoch 6/50
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6316 - loss: 1.0846 - val_accuracy: 0.5408 - val_loss: 1.3053
Epoch 7/50
[1m197/197[0m [32m━━━━━━━

In [6]:
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

# Hyper-parameters for the gradient-boosted tree classifier, kept in one place
# so they are easy to tweak between runs.
xgb_params = dict(
    objective="multi:softprob",
    num_class=10,
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="mlogloss",
    random_state=42,
)

# Note: this rebinds `model`, replacing whatever the name pointed to before.
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out split.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.95      0.94      0.95       200
           1       0.96      0.80      0.87        86
           2       0.76      0.84      0.80       200
           3       0.95      0.82      0.88       200
           4       0.90      0.90      0.90       200
           5       0.95      0.94      0.95       200
           6       0.91      0.92      0.91        75
           7       0.94      0.94      0.94       200
           8       0.94      0.94      0.94       186
           9       0.78      0.86      0.82       200

    accuracy                           0.89      1747
   macro avg       0.90      0.89      0.90      1747
weighted avg       0.90      0.89      0.90      1747



In [7]:
import joblib

# Persist whatever `model` currently refers to (the XGBoost classifier when
# the cells are run in order) so it can be reloaded without retraining.
MODEL_FILE = "audio_classifier_xgb.pkl"
joblib.dump(model, MODEL_FILE)


['audio_classifier_xgb.pkl']

In [8]:
import joblib

# Restore the saved classifier from disk and rebind `model` to it.
# joblib files are pickle-based: only load files you created or trust.
SAVED_MODEL_FILE = "audio_classifier_xgb.pkl"
model = joblib.load(SAVED_MODEL_FILE)


In [15]:
# Class names listed in classID order; position in the list is the numeric id,
# so class_map[id] gives the human-readable label for a predicted class id.
class_map = dict(enumerate([
    "air_conditioner",
    "car_horn",
    "children_playing",
    "dog_bark",
    "drilling",
    "engine_idling",
    "gun_shot",
    "jackhammer",
    "siren",
    "street_music",
]))


In [17]:
def predict_audio(file_path, model):
    """Classify one audio file and return its class name.

    Parameters
    ----------
    file_path : path-like
        Audio file to classify; passed straight to ``extract_mel``.
    model : object
        Fitted classifier exposing ``predict_proba``.

    Returns
    -------
    str
        The ``class_map`` entry for the class with the highest predicted
        probability.
    """
    # Must be the same feature extractor that produced the training matrix.
    # The model expects a 2-D batch, so the single vector becomes one row.
    feature_row = extract_mel(file_path).reshape(1, -1)

    class_probs = model.predict_proba(feature_row)[0]
    best_id = class_probs.argmax()
    return class_map[best_id]

    


In [21]:
# Classify a single clip; the path is resolved relative to the working directory.
file_path = "74726-8-0-3.wav"
predict_audio(file_path, model)

'siren'