# Audio Classification with a CNN Model
- The CNN is trained on real audio files (each clip is 1 second long).
- Each clip is normalized to the range -1 to 1, noise-reduced, and converted to a spectrogram, which is the input used to train the CNN.

In [None]:
import os
import librosa # type: ignore
import librosa.display # type: ignore
import numpy as np # type: ignore

# Load an example drone recording used by the demo cells below.
audio_file = 'D:\\dataset_drone\\Drone\\Drone_0.wav'  # Replace with your audio file path
y, fs = librosa.load(audio_file)  # , duration=20)
timesDuration = librosa.get_duration(y=y, sr=fs)

# Peak-normalize so the samples span -1 .. +1.
max_value = np.max(np.abs(y))       # largest absolute sample value in the clip
# An all-zero (silent) clip has a peak of 0; dividing by it would fill the
# signal with NaN, so such a clip is left unscaled instead.
audio_Drone = y / max_value if max_value > 0 else y

print(f'Sampling Rate: {fs} Hz')
print(f'Audio Duration: {timesDuration:.0f} seconds')

In [None]:
from IPython.display import Audio  # type: ignore

# Build an inline player for the normalized demo clip at its sampling rate;
# as the only expression in this cell it is what the notebook renders.
Audio(data=audio_Drone, rate=fs)

# Spectrogram Calculation
Calculate a mel spectrogram (in dB) using the STFT (short-time Fourier transform) method

In [None]:
# Calculate a mel spectrogram (STFT based) and convert it to decibels
def spectrogram_cal(data, fs, n_fft=2048, hop_length=128, n_mels=256):
    """Return the log-scaled (dB) mel spectrogram of an audio signal.

    The defaults reproduce the settings that were previously hard-coded, so
    existing two-argument calls behave exactly as before.

    Args:
        data: Audio samples, passed to librosa as ``y``.
        fs: Sampling rate of ``data`` in Hz.
        n_fft: FFT window length forwarded to ``librosa.feature.melspectrogram``.
        hop_length: Number of samples between successive analysis frames.
        n_mels: Number of mel bands to generate.

    Returns:
        The mel spectrogram converted with ``librosa.power_to_db`` using the
        clip's own maximum as the reference (``ref=np.max``).
    """
    ms = librosa.feature.melspectrogram(
        y=data, sr=fs, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
    )
    # ref=np.max makes the result relative to the loudest bin of this clip.
    spectrogram_db = librosa.power_to_db(ms, ref=np.max)

    return spectrogram_db

# FFT Calculation
Compute the 1-D discrete Fourier transform of the audio signal

In [None]:
from scipy.fft import fft, fftfreq # type: ignore

# ----- 1-D discrete Fourier transform (one-sided magnitude spectrum) ------
def audioFFT_cal(data, fs):
    """Compute the one-sided magnitude spectrum of a signal in decibels.

    The number of points is taken from ``data`` itself, so the function works
    for any signal and does not rely on variables defined in other cells.

    Args:
        data: 1-D sequence of audio samples.
        fs: Sampling rate of ``data`` in Hz.

    Returns:
        A tuple ``(Xf, FFT_Amplitude)`` of equally long arrays holding the
        non-negative frequency bins in Hz (the first ``N // 2`` bins) and the
        magnitude of each bin as ``20 * log10(|X|)``.

    Raises:
        ValueError: If ``data`` contains no samples.
    """
    data = np.asarray(data)
    N = len(data)                       # number of sample points
    if N == 0:
        raise ValueError("audioFFT_cal requires at least one sample")

    T = 1.0 / fs                        # sample spacing in seconds
    yf = fft(data)
    Xf = fftfreq(N, T)[:N // 2]

    magnitude = np.abs(yf[:N // 2])
    # Clamp exact zeros to the smallest positive normal value of the dtype so
    # the logarithm stays finite instead of producing -inf.
    floor = np.finfo(magnitude.dtype).tiny
    # Amplitude in dB is 20 * log10(|X|); the natural log is not a dB scale.
    FFT_Amplitude = 20 * np.log10(np.maximum(magnitude, floor))

    return Xf, FFT_Amplitude

In [None]:
# Plot demo of audio graph 
import matplotlib.pyplot as plt # type: ignore

# Signal shown in the three demo panels: waveform, FFT and spectrogram.
y_signal = audio_Drone

plt.figure(figsize=(10, 8))
# ----- Plot Audio Waveform  -----
plt.subplot(2, 2, 1)
plt.title(f'Audio Waveform')
# Sample k is taken at k / fs seconds, so the axis ends one sample before
# the clip duration.
plt.plot(np.arange(len(y_signal)) / fs, y_signal)
plt.xlabel('Time (s)')
plt.ylabel('Amplitude')
plt.grid()
# ----- Plot FFT  -----
xf,yf = audioFFT_cal(y_signal,fs)    
plt.subplot(2, 2, 2)
plt.title(f'FFT waveform')
plt.plot(xf, yf)
plt.grid()
plt.xlabel('Freq (Hz)')
plt.ylabel('Normalize Amplitude (dB)')
plt.ylim(-50,80)

# ------- Plot Spectrogram ---------
spectrogram_db = spectrogram_cal(y_signal,fs)
plt.subplot(2, 1, 2)
# The axes must describe how the spectrogram was computed: spectrogram_cal
# uses hop_length=128 and mel bands, so pass the same hop length (specshow
# would otherwise assume its own default and stretch the time axis) and use
# the mel frequency axis rather than a linear one.
librosa.display.specshow(spectrogram_db, sr=fs, hop_length=128,
                         x_axis='time', y_axis='mel', cmap='viridis')
#cmap = 'viridis', 'plasma', 'inferno', 'magma', 'cividis'
plt.colorbar(format='%+2.0f dB')
plt.title(f'Spectrogram shape {spectrogram_db.shape}')
plt.xlabel('Time (s)')
plt.ylabel('Frequency (Hz)')
plt.grid()

plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

# Data Pre-Processing
- Load every audio file found in the dataset directory.
- Normalize each clip to the range -1 to 1, apply noise reduction, and calculate the spectrogram used to train the CNN.

In [None]:
import noisereduce as nr        # type: ignore
from skimage import exposure         # type: ignore

# Set the path to dataset folder
print("#: Set the path to dataset folder")
data_dir = "D:\\dataset_drone"

# parameter config
labels = ['Drone','No_Drone']       # one sub-folder of data_dir per class
spectrogram_input = []              # dB spectrogram of every clip that was processed
target_labels = []                  # class name matching each spectrogram_input entry

# Settings forwarded unchanged to noisereduce.reduce_noise
stationary=True
prop_decrease=1
n_std_thresh_stationary = 1

# NOTE(review): the later np.array(spectrogram_input) call presumably needs
# every clip to have the same length (the notebook states 1-second clips) -
# confirm against the dataset.
for label in labels:
    label_dir = os.path.join(data_dir, label)
    # Sorted so the dataset order does not depend on the directory listing order.
    for audio_file in sorted(os.listdir(label_dir)):
        audio_path = os.path.join(label_dir, audio_file)
        print(audio_path)

        try:
            # Import Audio File
            audio_original, sr = librosa.load(audio_path)

            # Peak-normalize to -1 .. +1. A clip without a positive peak
            # (silent or NaN) would only produce NaN values, so it is skipped.
            max_value = np.max(np.abs(audio_original))
            if not max_value > 0:
                raise ValueError('clip has no positive peak amplitude')
            audio_normalize = audio_original / max_value

            # Perform noise reduction using the rate this file was loaded
            # with, not the rate left over from the demo cell.
            audio_reduced_noise = nr.reduce_noise(y=audio_normalize,
                                                  sr=sr,
                                                  stationary=stationary,
                                                  prop_decrease=prop_decrease,
                                                  n_std_thresh_stationary=n_std_thresh_stationary)    # ,use_torch=True )

            spectrogram = spectrogram_cal(audio_reduced_noise, sr)
            #image_adapteq = exposure.equalize_hist((spectrogram))
        except Exception as err:
            # Best effort: report the file together with the cause and carry
            # on. KeyboardInterrupt is not an Exception, so the loop can
            # still be interrupted.
            print(f'Error audio File: {audio_path} ({err})')
            continue

        # Only clips that were processed successfully reach the dataset.
        spectrogram_input.append(spectrogram)
        target_labels.append(label)

D:\dataset_drone\Drone\Drone_1280.wav
D:\dataset_drone\Drone\Drone_1281.wav
D:\dataset_drone\Drone\Drone_1282.wav
D:\dataset_drone\Drone\Drone_1283.wav
D:\dataset_drone\Drone\Drone_1284.wav
D:\dataset_drone\Drone\Drone_1285.wav
D:\dataset_drone\Drone\Drone_1286.wav
D:\dataset_drone\Drone\Drone_1287.wav
D:\dataset_drone\Drone\Drone_1288.wav
D:\dataset_drone\Drone\Drone_1289.wav
D:\dataset_drone\Drone\Drone_129.wav
D:\dataset_drone\Drone\Drone_1290.wav
D:\dataset_drone\Drone\Drone_1291.wav
D:\dataset_drone\Drone\Drone_1292.wav
D:\dataset_drone\Drone\Drone_1293.wav
D:\dataset_drone\Drone\Drone_1294.wav
D:\dataset_drone\Drone\Drone_1295.wav
D:\dataset_drone\Drone\Drone_1296.wav
D:\dataset_drone\Drone\Drone_1297.wav
D:\dataset_drone\Drone\Drone_1298.wav
D:\dataset_drone\Drone\Drone_1299.wav
D:\dataset_drone\Drone\Drone_13.wav
D:\dataset_drone\Drone\Drone_130.wav
D:\dataset_drone\Drone\Drone_1300.wav
D:\dataset_drone\Drone\Drone_1301.wav
D:\dataset_drone\Drone\Drone_1302.wav
D:\dataset_drone

# Save and load training data 
- directory : DatasetForTrain
- spectrogram_input
- target_labels

In [None]:
import pickle

# save dataset
dataset_dir = './DatasetForTrain'
# open(..., 'wb') does not create missing folders, so make sure it exists.
os.makedirs(dataset_dir, exist_ok=True)

# Each object is written to its own pickle file named after the variable.
for file_name, payload in (('spectrogram_input', spectrogram_input),
                           ('target_labels', target_labels),
                           ('labels', labels)):
    with open(dataset_dir + '/' + file_name, 'wb') as fp:
        pickle.dump(payload, fp)

In [None]:
# import pickle

# # load dataset (from the same directory the save cell writes to)
# with open ('./DatasetForTrain/spectrogram_input', 'rb') as fp:
#     spectrogram_input = pickle.load(fp)

# with open ('./DatasetForTrain/target_labels', 'rb') as fp:
#     target_labels = pickle.load(fp)

# with open ('./DatasetForTrain/labels', 'rb') as fp:
#     labels = pickle.load(fp)

# Encoding targets 

In [None]:
# Support Python 3.9 only
from tensorflow.keras.utils import to_categorical # type: ignore
from sklearn.preprocessing import LabelEncoder # type: ignore

# Announce the step and show the class names that are about to be encoded.
print("#: Encoding targets and data-splitting")
print(f"labels : {labels}")

# Learn the label -> integer mapping, apply it, then one-hot encode the ids.
label_encoder = LabelEncoder()
label_encoder.fit(target_labels)
encoded_labels = label_encoder.transform(target_labels)
Y_label = to_categorical(encoded_labels)
print('Encode target labels : {}'.format(Y_label[0]))

# Split data into train and test sets
Convert the input data to an ndarray and split it into training and test sets for the model

In [None]:
from sklearn.model_selection import train_test_split # type: ignore

X = np.array(spectrogram_input)
y = np.array(Y_label)
# All spectrograms must share one 2-D shape to stack into a 3-D array; fail
# with a clear message instead of an opaque reshape error when they do not.
if X.ndim != 3:
    raise ValueError(
        f"Expected equally sized 2-D spectrograms, got an array of shape {X.shape}"
    )
# Append the single channel axis expected by Conv2D.
X = X[..., np.newaxis]

xtrain, xtest, ytrain, ytest = train_test_split(X, y, train_size=0.9, random_state=42)

# Normalize the data
# NOTE(review): the inputs are dB spectrograms rather than 8-bit images, so
# dividing by 255 only rescales them - confirm this scaling is intended.
xtrain = xtrain / 255.0
xtest = xtest / 255.0

print("xtrain shape : " + str(xtrain.shape))
print("xtest shape : " + str(xtest.shape))
print("ytrain shape : " + str(ytrain.shape))
print("ytest shape : " + str(ytest.shape))

######## Exploratory data analysis #######
# Count the number of samples in each class
print()
print("#: Count the number of samples in each class")
# Count the samples that are actually in the dataset. Listing the folders
# would also count files that were skipped and needs data_dir, which is not
# defined when the dataset is loaded from the pickle files.
class_counts = [list(target_labels).count(label) for label in labels]

print("Total Data set: " + str(sum(class_counts)))
for label, count in zip(labels, class_counts):
    print(label + ": " + str(count))
print()

print("Data set for Train: " + str(xtrain.shape[0]))
print("Data set for Test: " + str(xtest.shape[0]))

# Model training

In [None]:
import keras # type: ignore
import tensorflow as tf

# Create the convolutional base
# Layer stack in order: two Conv2D + MaxPooling2D stages, three Dense layers,
# Flatten, a 1024-unit ReLU layer and a 2-way softmax output.
cnn_layers = [
    keras.layers.Conv2D(filters=128, kernel_size=(3, 3), activation='relu',
                        input_shape=xtrain.shape[1:]),
    keras.layers.MaxPooling2D(pool_size=(2, 2)),
    keras.layers.Conv2D(filters=128, kernel_size=(3, 3), activation='relu'),
    keras.layers.MaxPooling2D(pool_size=(2, 2)),
    # NOTE(review): these three Dense layers sit before Flatten and are given
    # no activation - confirm this is intended.
    keras.layers.Dense(units=64),
    keras.layers.Dense(units=32),
    keras.layers.Dense(units=16),
    keras.layers.Flatten(),
    keras.layers.Dense(units=1024, activation='relu'),
    keras.layers.Dense(units=2, activation='softmax'),
]
model = keras.Sequential(cnn_layers)

# One-hot targets, hence categorical cross-entropy; accuracy is tracked too.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# from tensorflow.keras.callbacks import EarlyStopping
# early_stop = EarlyStopping(monitor='val_loss', patience=5)

# hist = model.fit(xtrain, 
#                  ytrain, 
#                  validation_data=(xtest, ytest), 
#                  batch_size=32, 
#                  epochs=50,  
#                  callbacks=[early_stop])

In [None]:
# Train for 50 epochs in batches of 32. The held-out split (xtest, ytest) is
# passed as validation data, so val_* metrics are computed on that split.
fit_options = dict(validation_data=(xtest, ytest), batch_size=32, epochs=50)
hist = model.fit(xtrain, ytrain, **fit_options)

In [None]:
# Per-epoch accuracy values recorded in the History object returned by fit().
acc, val_acc = hist.history['accuracy'], hist.history['val_accuracy']
epochs = range(1, 1 + len(acc))     # epochs are numbered from 1

plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
# Solid line for training, dotted line for validation.
plt.plot(epochs, acc, '-', label='Training Accuracy')
plt.plot(epochs, val_acc, ':', label='Validation Accuracy')
plt.legend(loc='lower right')

In [None]:
# Per-epoch loss values recorded in the History object returned by fit().
loss = hist.history['loss']
val_loss = hist.history['val_loss']
# Build the epoch axis from this cell's own data. `acc` belongs to the
# accuracy cell and is rebound to a single float by the evaluation cell, so
# relying on len(acc) breaks when this cell is re-run afterwards.
epochs = range(1, len(loss) + 1)

# Solid line for training, dotted line for validation.
plt.plot(epochs, loss, '-', label='Training loss')
plt.plot(epochs, val_loss, ':', label='Validation loss')
plt.title('Training and Validation loss')
plt.xlabel('Epoch')
plt.ylabel('loss')
plt.legend(loc='lower right')

In [None]:
import pandas as pd

# Folder below ./model that receives the training history and the model.
directory_name = 'model_01'
model_dir = './model/' + directory_name
# makedirs also creates ./model when it is missing and does not fail when the
# cell is re-run; files from an earlier run in this folder are overwritten.
os.makedirs(model_dir, exist_ok=True)

# Store the per-epoch metrics next to the model, one column per metric.
metrics = pd.DataFrame(model.history.history)
metrics.to_csv(model_dir + '/hist.csv', index=False)

# save model
model.save(model_dir + '/myModel.h5')

In [None]:
# load model
from tensorflow.keras.models import load_model # type: ignore

# Reload the model saved by the previous cell and score it on the held-out split.
myModel = load_model('./model/' + directory_name + '/myModel.h5') #same file path
myModel.summary()

# The model was compiled with metrics=['accuracy'], so evaluate() returns the
# loss followed by the accuracy. Separate names are used so the per-epoch
# `loss` / `acc` lists read by the plotting cells are not overwritten.
test_loss, test_acc = myModel.evaluate(xtest, ytest, verbose=0)
print(f"test accuracy {test_acc*100}")
# The loss is not a percentage, so it is reported unscaled.
print(f"test loss {test_loss}")


In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report , confusion_matrix # type: ignore
import seaborn as sns # type: ignore

y_predicted = myModel.predict(xtest)
# Reduce the one-hot targets and the softmax outputs to class indices once.
true_idx = ytest.argmax(axis=1)
pred_idx = y_predicted.argmax(axis=1)

# Take the display names from the fitted encoder so they match the training
# labels and their index order instead of a separately typed list.
class_labels = [str(name) for name in label_encoder.classes_]
class_ids = list(range(len(class_labels)))

# Passing the class ids explicitly keeps one row/column per class even if a
# class is missing from this test split.
mat = confusion_matrix(true_idx, pred_idx, labels=class_ids)

# Calculate accuracy and F1 score
print("#: Calculate accuracy and F1 score")
accuracy = accuracy_score(true_idx, pred_idx)

# NOTE(review): with scikit-learn's default arguments this is the binary F1
# of class index 1 - confirm that is the class of interest.
f1 = f1_score(true_idx, pred_idx)

print(f"Accuracy: {accuracy * 100:.2f}%")
print('F1 score: {:.2f}'.format(f1))

# Print classification report
print(classification_report(true_idx, pred_idx, labels=class_ids, target_names=class_labels))

sns.heatmap(mat, square=True, annot=True, fmt='d', cbar=False, cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)

plt.title('Confusion Matrix', fontsize=15, pad=20)
plt.xlabel('Prediction', fontsize=11)
plt.ylabel('Actual', fontsize=11)


In [None]:
# Support Python 3.9 only
from tensorflow.keras.utils import to_categorical # type: ignore
from sklearn.preprocessing import LabelEncoder # type: ignore

# Same encoding step as earlier in the notebook; it refits `label_encoder`,
# which the prediction cell uses to turn class indices back into names.
print("#: Encoding targets and data-splitting")
print(f"labels : {labels}")

# Learn the label -> integer mapping, apply it, then one-hot encode the ids.
label_encoder = LabelEncoder()
label_encoder.fit(target_labels)
encoded_labels = label_encoder.transform(target_labels)
Y_label = to_categorical(encoded_labels)
print('Encode target labels : {}'.format(Y_label[0]))

In [None]:
# Apply Model
# Classify one held-out spectrogram and map the result back to its label.
Ytest = xtest[1]

# Reshape to a batch holding one single-channel sample before predicting.
Ytest = Ytest.reshape(1, Ytest.shape[0], Ytest.shape[1], 1)
y_predicted = myModel.predict(Ytest)
output =  y_predicted.argmax(axis=1)     # index of the most probable class

# Convert the class index back to the original label string.
lable_Output = label_encoder.inverse_transform(output)

print(f'Predicted Output : {y_predicted}')
print(f'Output Value : {output}')
print(f'Output Label : {lable_Output}')