In [1]:
import pandas as pd
import os
import librosa

# Root folder of the dataset; per-fold audio paths are built from it further down.
audio_dataset_path = 'UrbanSound8K/'

# Clip-level metadata table (slice file name, fold, class, ...), one row per clip.
metadata_csv = 'UrbanSound8K/UrbanSound8K.csv'
metadata = pd.read_csv(metadata_csv)
metadata.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
1,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
2,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
3,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing
4,100263-2-0-143.wav,100263,71.5,75.5,1,5,2,children_playing


In [2]:
def features_extractor(file):
    """Return the time-averaged MFCC feature vector for one audio file.

    Parameters
    ----------
    file : str or path-like
        Location of the audio file; passed straight to ``librosa.load``.

    Returns
    -------
    numpy.ndarray
        One value per MFCC coefficient (40 of them), each averaged over
        every frame of the clip.
    """
    # Load from the ``file`` argument. The earlier body read a global named
    # ``file_name`` instead, so the argument was silently ignored and the
    # function only worked when the caller happened to use that global name.
    audio, sample_rate = librosa.load(file, res_type='kaiser_fast')
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    # Transpose so frames run along axis 0, then average them away to get a
    # fixed-length vector regardless of clip duration.
    mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)

    return mfccs_scaled_features
    

In [3]:
import numpy as np
from tqdm import tqdm

# Build one [feature vector, class label] pair per clip listed in the metadata.
extracted_features = []
# iterrows() is a generator with no length, so hand tqdm the row count to get
# a percentage and an ETA instead of a bare iteration counter.
for index_num, row in tqdm(metadata.iterrows(), total=len(metadata)):
    # Clips are stored as <dataset root>/fold<N>/<slice_file_name>; backslashes
    # are normalised to forward slashes so the path looks the same on Windows.
    file_name = os.path.join(os.path.abspath(audio_dataset_path),'fold'+str(row["fold"])+'/',str(row["slice_file_name"])).replace("\\","/")
    final_class_labels = row["class"]
    data = features_extractor(file_name)
    extracted_features.append([data, final_class_labels])

8731it [15:14,  9.55it/s]


In [4]:
### Wrap the [feature vector, label] pairs in a two-column pandas DataFrame
extracted_features_df = pd.DataFrame(
    data=extracted_features,
    columns=['feature', 'class'],
)
extracted_features_df.head()

Unnamed: 0,feature,class
0,"[-424.09818, 109.34077, -52.919525, 60.86475, ...",children_playing
1,"[-458.79114, 121.38419, -46.520657, 52.00812, ...",children_playing
2,"[-413.89984, 101.66371, -35.42945, 53.036354, ...",children_playing
3,"[-446.60352, 113.68541, -52.402218, 60.302044,...",children_playing
4,"[-446.8255, 117.011925, -33.7923, 55.406204, 2...",children_playing


In [5]:
### Separate the model inputs (X) from the targets (y)
# Feature vectors are stacked into a 2-D array; class names stay as strings here.
X = np.array(extracted_features_df['feature'].to_list())
y = np.array(extracted_features_df['class'].to_list())

In [6]:
# Shape check: one row per clip, one column per averaged MFCC coefficient.
X.shape

(8731, 40)

In [7]:
# Peek at the class labels while they are still plain strings (before encoding).
y

array(['children_playing', 'children_playing', 'children_playing', ...,
       'car_horn', 'car_horn', 'car_horn'], dtype='<U16')

In [8]:
### Label Encoding
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Encode from the DataFrame column instead of from `y` itself. The cell used to
# read and overwrite `y`, so running it a second time tried to label-encode the
# one-hot matrix. Reading the untouched source column makes the cell safe to
# re-run, and `y` still ends up as the one-hot label matrix.
labelencoder = LabelEncoder()
class_names = np.array(extracted_features_df['class'].tolist())
y = to_categorical(labelencoder.fit_transform(class_names))

In [9]:
# y now holds the one-hot encoded labels (one row per clip).
y

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

In [10]:
### Train Test Split
from sklearn.model_selection import train_test_split

# Hold out 20% of the clips; random_state pins the shuffle so the split repeats.
# NOTE(review): the metadata has a predefined `fold` column that this random
# split does not use — confirm a random split is what is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [11]:
# Inspect the training feature matrix.
X_train

array([[-1.12337402e+02,  6.10992813e+01, -3.18407097e+01, ...,
        -1.65826523e+00, -1.99171209e+00,  2.66584349e+00],
       [-2.51385864e+02,  1.33342346e+02, -1.06559610e+01, ...,
        -2.38794994e+00, -4.75240517e+00, -5.88469839e+00],
       [-2.51105194e+02,  1.24307274e+02,  7.52227736e+00, ...,
        -4.09622416e-02,  9.93638337e-02,  5.03649950e-01],
       ...,
       [-5.10315063e+02,  8.80034409e+01, -5.02600241e+00, ...,
         2.61844218e-01, -3.40468377e-01, -1.64568985e+00],
       [-1.49709091e+02,  1.38775314e+02, -3.20417595e+01, ...,
         4.44790363e-01, -1.55924821e+00, -1.42068398e+00],
       [-4.25987091e+02,  2.08752579e+02,  1.58930099e+00, ...,
        -3.81701732e+00, -1.41003668e+00, -3.94950747e-01]], dtype=float32)

In [12]:
# Shows the full label matrix `y` (all clips), not the training subset `y_train`.
y

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

In [13]:
# Shape check: training features.
X_train.shape

(6984, 40)

In [14]:
# Shape check: held-out features.
X_test.shape

(1747, 40)

In [15]:
# Shape check: training labels (one-hot, one column per class).
y_train.shape

(6984, 10)

In [16]:
# Shape check: held-out labels (one-hot, one column per class).
y_test.shape

(1747, 10)

### Model Creation

In [17]:
import tensorflow as tf

# Report the TensorFlow version used for this run; the recorded cell output is
# a version string, which a bare import cannot produce.
print(tf.__version__)

2.10.0


In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten
from tensorflow.keras.optimizers import Adam
from sklearn import metrics

In [19]:
### No of classes
# Each one-hot row has one column per class, so the row width is the class count.
num_labels = y.shape[-1]

In [20]:
# Fully connected classifier: three Dense -> ReLU -> Dropout stages followed by
# a softmax output with one unit per class.
model = Sequential([
    ### first layer (takes the 40-value feature vector)
    Dense(100, input_shape=(40,)),
    Activation('relu'),
    Dropout(0.5),
    ### second layer
    Dense(200),
    Activation('relu'),
    Dropout(0.5),
    ### third layer
    Dense(100),
    Activation('relu'),
    Dropout(0.5),
    ### final layer
    Dense(num_labels),
    Activation('softmax'),
])

In [21]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 100)               4100      
                                                                 
 activation (Activation)     (None, 100)               0         
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 200)               20200     
                                                                 
 activation_1 (Activation)   (None, 200)               0         
                                                                 
 dropout_1 (Dropout)         (None, 200)               0         
                                                                 
 dense_2 (Dense)             (None, 100)               2

In [22]:
# One-hot targets with a softmax output: categorical cross-entropy loss,
# Adam optimizer, accuracy reported during training and evaluation.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [25]:
## Training my model
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime 

num_epochs = 100
num_batch_size = 32

# Save the model to disk only on epochs that beat the best result seen so far.
checkpointer = ModelCheckpoint(
    filepath='saved_models/audio_classification.hdf5',
    verbose=1,
    save_best_only=True,
)

# NOTE(review): the held-out test split is also passed as validation data here,
# so checkpoint selection sees the same clips that are scored afterwards.
start = datetime.now()
model.fit(
    X_train,
    y_train,
    batch_size=num_batch_size,
    epochs=num_epochs,
    validation_data=(X_test, y_test),
    callbacks=[checkpointer],
    verbose=1,
)
duration = datetime.now() - start

print("Training completed in time: ", duration)

Epoch 1/100
Epoch 1: val_loss improved from inf to 0.72797, saving model to saved_models\audio_classification.hdf5
Epoch 2/100
Epoch 2: val_loss did not improve from 0.72797
Epoch 3/100
Epoch 3: val_loss did not improve from 0.72797
Epoch 4/100
Epoch 4: val_loss did not improve from 0.72797
Epoch 5/100
Epoch 5: val_loss did not improve from 0.72797
Epoch 6/100
Epoch 6: val_loss did not improve from 0.72797
Epoch 7/100
Epoch 7: val_loss did not improve from 0.72797
Epoch 8/100
Epoch 8: val_loss did not improve from 0.72797
Epoch 9/100
Epoch 9: val_loss did not improve from 0.72797
Epoch 10/100
Epoch 10: val_loss did not improve from 0.72797
Epoch 11/100
Epoch 11: val_loss did not improve from 0.72797
Epoch 12/100
Epoch 12: val_loss did not improve from 0.72797
Epoch 13/100
Epoch 13: val_loss did not improve from 0.72797
Epoch 14/100
Epoch 14: val_loss did not improve from 0.72797
Epoch 15/100
Epoch 15: val_loss improved from 0.72797 to 0.71714, saving model to saved_models\audio_classif

Epoch 30: val_loss did not improve from 0.71714
Epoch 31/100
Epoch 31: val_loss improved from 0.71714 to 0.70590, saving model to saved_models\audio_classification.hdf5
Epoch 32/100
Epoch 32: val_loss did not improve from 0.70590
Epoch 33/100
Epoch 33: val_loss did not improve from 0.70590
Epoch 34/100
Epoch 34: val_loss did not improve from 0.70590
Epoch 35/100
Epoch 35: val_loss did not improve from 0.70590
Epoch 36/100
Epoch 36: val_loss did not improve from 0.70590
Epoch 37/100
Epoch 37: val_loss did not improve from 0.70590
Epoch 38/100
Epoch 38: val_loss improved from 0.70590 to 0.68991, saving model to saved_models\audio_classification.hdf5
Epoch 39/100
Epoch 39: val_loss did not improve from 0.68991
Epoch 40/100
Epoch 40: val_loss did not improve from 0.68991
Epoch 41/100
Epoch 41: val_loss did not improve from 0.68991
Epoch 42/100
Epoch 42: val_loss did not improve from 0.68991
Epoch 43/100
Epoch 43: val_loss did not improve from 0.68991
Epoch 44/100
Epoch 44: val_loss did not

Epoch 59: val_loss did not improve from 0.68991
Epoch 60/100
Epoch 60: val_loss did not improve from 0.68991
Epoch 61/100
Epoch 61: val_loss did not improve from 0.68991
Epoch 62/100
Epoch 62: val_loss did not improve from 0.68991
Epoch 63/100
Epoch 63: val_loss did not improve from 0.68991
Epoch 64/100
Epoch 64: val_loss did not improve from 0.68991
Epoch 65/100
Epoch 65: val_loss did not improve from 0.68991
Epoch 66/100
Epoch 66: val_loss did not improve from 0.68991
Epoch 67/100
Epoch 67: val_loss did not improve from 0.68991
Epoch 68/100
Epoch 68: val_loss did not improve from 0.68991
Epoch 69/100
Epoch 69: val_loss did not improve from 0.68991
Epoch 70/100
Epoch 70: val_loss did not improve from 0.68991
Epoch 71/100
Epoch 71: val_loss did not improve from 0.68991
Epoch 72/100
Epoch 72: val_loss did not improve from 0.68991
Epoch 73/100
Epoch 73: val_loss did not improve from 0.68991
Epoch 74/100
Epoch 74: val_loss did not improve from 0.68991
Epoch 75/100
Epoch 75: val_loss did n

Epoch 89/100
Epoch 89: val_loss did not improve from 0.68621
Epoch 90/100
Epoch 90: val_loss did not improve from 0.68621
Epoch 91/100
Epoch 91: val_loss did not improve from 0.68621
Epoch 92/100
Epoch 92: val_loss did not improve from 0.68621
Epoch 93/100
Epoch 93: val_loss did not improve from 0.68621
Epoch 94/100
Epoch 94: val_loss did not improve from 0.68621
Epoch 95/100
Epoch 95: val_loss did not improve from 0.68621
Epoch 96/100
Epoch 96: val_loss did not improve from 0.68621
Epoch 97/100
Epoch 97: val_loss did not improve from 0.68621
Epoch 98/100
Epoch 98: val_loss did not improve from 0.68621
Epoch 99/100
Epoch 99: val_loss did not improve from 0.68621
Epoch 100/100
Epoch 100: val_loss improved from 0.68621 to 0.68066, saving model to saved_models\audio_classification.hdf5
Training completed in time:  0:01:28.128403


In [26]:
# evaluate() returns [loss, accuracy] for the compiled metrics; accuracy is the
# last entry.
test_accuracy = model.evaluate(x=X_test, y=y_test, verbose=0)
print(test_accuracy[-1])

0.7956497073173523


In [28]:
# Look at a single held-out feature vector (the second row of X_test).
X_test[1]

array([-102.88505   ,   87.88047   ,  -28.99793   ,   16.018013  ,
          1.2033683 ,   16.61361   ,  -13.97893   ,   17.469175  ,
         -5.9888306 ,   14.429839  ,  -17.82136   ,    3.3612053 ,
        -11.023228  ,   14.444955  ,    5.394709  ,   30.56515   ,
          5.5551596 ,   13.035866  ,   -6.269046  ,    7.8246922 ,
         -9.91783   ,   11.382298  ,  -12.103567  ,    2.8155384 ,
         -2.8936207 ,    6.367891  ,  -11.633807  ,    7.238143  ,
         12.592639  ,    6.793665  ,  -14.943717  ,    0.10874656,
         13.804669  ,   17.925476  ,   -6.5646725 ,    3.7349968 ,
          4.7114425 ,  -18.016933  ,  -19.927252  ,   13.628061  ],
      dtype=float32)

In [81]:
# Class probabilities for every held-out clip, collapsed to the index of the
# most probable class per row.
pred = model.predict(X_test).argmax(axis=1)
pred



array([4, 1, 7, ..., 9, 3, 3], dtype=int64)

### Testing Some Test Audio Data

Steps
- Preprocess the new audio data
- Predict the class
- Inverse-transform the predicted label back to its class name

In [86]:
# Classify one clip with the same MFCC steps that were used for the training data.
filename = "UrbanSound8K/dog_bark2.wav"
audio, sample_rate = librosa.load(filename, res_type='kaiser_fast')
mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
# Average over frames to get one fixed-length feature vector.
mfccs_scaled_features = mfccs_features.T.mean(axis=0)

print(mfccs_scaled_features)
# The model expects a batch, so turn the vector into a single-row matrix.
mfccs_scaled_features = mfccs_scaled_features[np.newaxis, :]
print(mfccs_scaled_features)
print(mfccs_scaled_features.shape)

# Most probable class index for the single row, kept as a 1-D array because
# inverse_transform takes an array of encoded labels.
predicted_label = model.predict(mfccs_scaled_features).argmax(axis=1).flatten()
print(predicted_label)

# Map the encoded index back to its class name.
prediction_class = labelencoder.inverse_transform(predicted_label)
prediction_class

[-3.5683524e+02  8.4114578e+01 -2.2832775e+01 -1.0309464e+01
  2.7360733e+00  1.0459924e+00 -6.2581086e+00  1.2818811e+01
 -3.5566330e+00  8.3815515e-01  3.2011795e+00  9.9730711e+00
  2.1779177e+00 -4.0221424e+00 -4.9099975e+00  5.7891160e-01
  1.1456020e-01  7.6958501e-01 -4.9822502e+00 -1.6147598e+00
 -1.9361399e+00 -1.4523536e+00  1.7747289e+00  4.2438450e+00
 -3.8731721e-01 -1.8724498e+00 -3.8537102e+00 -2.2411184e+00
 -2.0041037e+00 -1.3309578e+00 -4.9806366e+00 -3.5181143e+00
 -2.3907588e+00 -1.8754460e+00 -3.2984786e+00 -1.0587918e+00
 -1.5697086e+00 -2.7484596e+00 -2.2563803e+00 -2.8791909e+00]
[[-3.5683524e+02  8.4114578e+01 -2.2832775e+01 -1.0309464e+01
   2.7360733e+00  1.0459924e+00 -6.2581086e+00  1.2818811e+01
  -3.5566330e+00  8.3815515e-01  3.2011795e+00  9.9730711e+00
   2.1779177e+00 -4.0221424e+00 -4.9099975e+00  5.7891160e-01
   1.1456020e-01  7.6958501e-01 -4.9822502e+00 -1.6147598e+00
  -1.9361399e+00 -1.4523536e+00  1.7747289e+00  4.2438450e+00
  -3.8731721e-01 

array(['dog_bark'], dtype='<U16')