In [None]:
import pandas as pd
import os
import librosa

# Directory that contains the per-fold sub-folders of audio clips.
audio_dataset_path = '/content/drive/MyDrive/UrbanSound8K/audio'

# Metadata table; later cells read its "fold", "slice_file_name" and "class" columns.
metadata = pd.read_csv(
    '/content/drive/MyDrive/UrbanSound8K/metadata/UrbanSound8K.csv'
)

Here, necessary libraries are imported. pandas is used for handling data frames, os for operating system-related functions, and librosa for audio signal processing.

Storing the path to the directory containing the audio files.

The metadata for the UrbanSound8K dataset is read from a CSV file. This file likely contains information about the audio files, such as file names, locations, labels, and other relevant details.

In [None]:
def features_extractor(file, n_mfcc=40):
    """Return a fixed-length MFCC summary vector for one audio file.

    Parameters
    ----------
    file : str or path-like
        Path of the audio file to load.
    n_mfcc : int, optional
        Number of MFCC coefficients to compute. Defaults to 40, which is the
        input width the classifier in this notebook is built with.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_mfcc`` holding each coefficient averaged over
        all frames of the clip.
    """
    # Load the clip that was actually passed in. Reading a global ``file_name``
    # here would make the result depend on whatever a caller last assigned.
    audio, sample_rate = librosa.load(file, res_type='kaiser_fast')
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    # Transpose so frames run along axis 0, then average them away: one value
    # per coefficient regardless of clip length.
    mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)

    return mfccs_scaled_features

Defining the MFCC feature extractor function.
The features_extractor function is designed to extract Mel-Frequency Cepstral Coefficients (MFCCs) from an audio file using the librosa library.

`n_mfcc` is the number of MFCCs to be extracted, set to 40 in this case.

The function transposes the MFCC matrix and takes the mean over its rows (i.e. over the time frames). This results in a one-dimensional array with one averaged value per MFCC coefficient.

In [None]:
!pip install resampy

Collecting resampy
  Downloading resampy-0.4.2-py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: resampy
Successfully installed resampy-0.4.2


In [None]:
import numpy as np
from tqdm import tqdm
# Absolute dataset root, resolved once instead of on every iteration.
audio_root = os.path.abspath(audio_dataset_path)

extracted_features = []
# iterrows() is a generator with no length, so pass total= explicitly to let
# tqdm display a percentage and an ETA.
for index_num, row in tqdm(metadata.iterrows(), total=len(metadata)):
    # Clips live in <root>/fold<N>/<slice_file_name>; os.path.join supplies the
    # separators instead of a hard-coded '/'.
    file_name = os.path.join(audio_root, 'fold' + str(row["fold"]), str(row["slice_file_name"]))
    final_class_labels = row["class"]
    data = features_extractor(file_name)
    # Keep the feature vector and its label together; a later cell turns this
    # list of pairs into a DataFrame.
    extracted_features.append([data, final_class_labels])

8732it [17:09,  8.48it/s]


Created an `extracted_features` list, iterated over the metadata file, accessed each audio file by building its path from the fold and file name, obtained the MFCC data through the MFCC extractor function, and appended each (data, class label) pair to the list.
This code snippet is the part of the process where features, specifically Mel-Frequency Cepstral Coefficients (MFCCs), are extracted from each audio file in the dataset using the previously defined features_extractor function; the extracted information is appended to a list that is later converted into a data frame.

In [None]:
### Wrap the (feature vector, label) pairs in a two-column DataFrame
extracted_features_df = pd.DataFrame(
    data=extracted_features,
    columns=['feature', 'class'],
)

making a dataframe

In [None]:
### Separate the model inputs (X) from the target labels (y)
# Each 'feature' cell holds one MFCC vector; converting the list of vectors
# yields one row per clip.
X = np.array(extracted_features_df['feature'].to_list())
y = np.array(extracted_features_df['class'].to_list())

converting feature and class columns into arrays

In [None]:
### Label Encoding
### Label Encoder
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
# Encode straight from the DataFrame column rather than from `y` itself.
# Writing `y = to_categorical(labelencoder.fit_transform(y))` makes the cell
# consume its own output, so running it a second time would feed a one-hot
# matrix back into the encoder. Reading the untouched 'class' column keeps the
# cell idempotent and produces the same one-hot matrix.
y = to_categorical(labelencoder.fit_transform(extracted_features_df['class']))

The label encoder encodes the class labels as integers such as 0, 1, 2, ...
`to_categorical` converts these numbers into one-hot vectors such as 0010000000, 0100000000, ...

In [None]:
### Train Test Split
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; random_state fixes the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

The proportion of the dataset allocated to the test set is 20%, and the remaining 80% is used for training.

By splitting the dataset into training and testing sets, you can train the machine learning model on the training set and evaluate its performance on the unseen testing set. This helps assess how well the model generalizes to new data and helps detect overfitting.

MODEL CREATION

In [None]:
import tensorflow as tf

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten
from tensorflow.keras.optimizers import Adam
from sklearn import metrics

In [None]:
### Number of classes = width of the one-hot encoded label matrix
num_labels = np.shape(y)[1]

In [None]:
# Feed-forward classifier: three hidden Dense blocks (100 -> 200 -> 100 units),
# each followed by ReLU and 50% dropout, then a softmax layer with one unit
# per class. Layers are passed as a list, which adds them in the given order.
model = Sequential([
    ### first layer (takes the 40-value MFCC vector)
    Dense(100, input_shape=(40,)),
    Activation('relu'),
    Dropout(0.5),
    ### second layer
    Dense(200),
    Activation('relu'),
    Dropout(0.5),
    ### third layer
    Dense(100),
    Activation('relu'),
    Dropout(0.5),
    ### final layer
    Dense(num_labels),
    Activation('softmax'),
])

The code defines a neural network using Keras with a Sequential model. It comprises three dense layers with 100, 200, and 100 units, respectively, using ReLU activation. Each dense layer includes dropout for regularization with a dropout rate of 0.5. The final layer has a number of units corresponding to the output labels, utilizing softmax activation for multi-class classification. The input shape is set to (40,) for 40 features.

In [None]:
# Categorical cross-entropy matches the one-hot encoded labels; accuracy is
# tracked as the reporting metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
## Training my model
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime

num_epochs = 100
num_batch_size = 32

# Make sure the target folder exists so the first checkpoint write cannot fail
# on a missing directory.
checkpoint_path = 'saved_models/audio_classification.hdf5'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# save_best_only=True: the file is only overwritten when the monitored
# validation loss improves.
checkpointer = ModelCheckpoint(filepath=checkpoint_path,
                               verbose=1, save_best_only=True)
start = datetime.now()

model.fit(X_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(X_test, y_test), callbacks=[checkpointer], verbose=1)


# Wall-clock duration of the whole fit() call.
duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/100
Epoch 1: val_loss improved from inf to 0.67892, saving model to saved_models/audio_classification.hdf5
Epoch 2/100
Epoch 2: val_loss did not improve from 0.67892
Epoch 3/100
Epoch 3: val_loss did not improve from 0.67892
Epoch 4/100
Epoch 4: val_loss did not improve from 0.67892
Epoch 5/100
Epoch 5: val_loss did not improve from 0.67892
Epoch 6/100
Epoch 6: val_loss did not improve from 0.67892
Epoch 7/100
Epoch 7: val_loss did not improve from 0.67892
Epoch 8/100
Epoch 8: val_loss did not improve from 0.67892
Epoch 9/100
Epoch 9: val_loss did not improve from 0.67892
Epoch 10/100
Epoch 10: val_loss did not improve from 0.67892
Epoch 11/100
Epoch 11: val_loss did not improve from 0.67892
Epoch 12/100
Epoch 12: val_loss improved from 0.67892 to 0.67337, saving model to saved_models/audio_classification.hdf5
Epoch 13/100
Epoch 13: val_loss did not improve from 0.67337
Epoch 14/100
Epoch 14: val_loss did not improve from 0.67337
Epoch 15/100
Epoch 15: val_loss did not improve 

Training takes a while, so use a high-speed internet connection.
We can increase accuracy by running the training cell again, since each run continues training from the current weights.

In [None]:
# evaluate() returns the loss followed by the compiled metrics, so index 1 is
# the accuracy.
test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(test_accuracy[1])

0.8070979118347168


About 80 percent accuracy (0.807) is obtained on the test set.

Testing Some Test Audio Data

Demonstrating how to take an audio file, extract MFCC features, and use a trained neural network model to predict the class label for that particular audio sample. The result is the predicted class label for the provided audio file.

If interested, we can also see the mfccs_scaled_features array, its reshaped form, and its shape, as well as the predicted class index and the corresponding class label after the prediction step.

In [None]:
# Clip to classify.
filename = "/content/drive/MyDrive/UrbanSound8K/audio/fold2/76086-4-0-58.wav"

# Same preprocessing as used for training: load, compute 40 MFCCs, then
# average every coefficient over the frames of the clip.
audio, sample_rate = librosa.load(filename, res_type='kaiser_fast')
mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)

# Add a leading batch axis so the single sample becomes a one-row matrix.
mfccs_scaled_features = mfccs_scaled_features[np.newaxis, :]
# To inspect the features, uncomment:
# print(mfccs_scaled_features)
# print(mfccs_scaled_features.shape)

# Class probabilities for the sample; the arg-max is the encoded class index.
predicted_probabilities = model.predict(mfccs_scaled_features)
predicted_class = predicted_probabilities.argmax(axis=-1)
# print("Predicted class:", predicted_class)

# Map the integer index back to its original string label.
prediction_class = labelencoder.inverse_transform(predicted_class)
print("Prediction class label:", prediction_class)

Prediction class label: ['drilling']
