## Shooting sound Recognition

In [1]:
# Import Libraries 
import os
import glob
import sys
import librosa
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.utils import shuffle
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

## Audio Feature Extraction Function

The following Python function `extract_features` is designed to load audio files and extract Mel-frequency cepstral coefficients (MFCCs) from them. The MFCCs are commonly used features in audio processing and are particularly effective for identifying characteristics in sound, such as those required for the classification of gunshot sounds. The function takes the file path of an audio file as input, loads the audio using `librosa`, and then calculates the MFCCs with a default setting of 40 coefficients per frame and a hop length of 512 samples. The resulting MFCCs are returned for further analysis or model training.


In [2]:
# Function to load and extract features from audio files
def extract_features(file_path):
    y, sr = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40, hop_length=512)
    return mfccs

## Loading and Preparing Audio Data

Loads and extracts MFCC features from audio files in specific directories for gunshot and ambient sounds, labeling them appropriately. It then combines the features and labels, shuffles the data, and prepares it for use in a machine learning model. The final data shapes are printed to verify the successful processing of the audio files.


In [3]:
# Function to load audio files from a specific directory
def load_audio_files_from_dir(directory, label):
    features = []
    labels = []
    
    for file in os.listdir(directory):
        if file.endswith('.wav'):
            file_path = os.path.join(directory, file)
            mfccs = extract_features(file_path)
            for mfcc in mfccs.T:  # Each column is a feature vector for one frame
                features.append(mfcc)
                labels.append(label)
    
    return features, labels

# Define the paths to your data directories using raw strings for Windows paths
automatic_path = 'Gunshots (1)/Gunshots/Automatic'
single_shot_path = 'Gunshots (1)/Gunshots/Single Shot'
ambience_path = '1_Ambience Sound/Ambience Sound'

# Load gunshots data from both "Automatic" and "Single shot" folders
automatic_features, automatic_labels = load_audio_files_from_dir(automatic_path, label=1)
single_shot_features, single_shot_labels = load_audio_files_from_dir(single_shot_path, label=1)

# Combine the gunshots data from both folders
gunshots_features = automatic_features + single_shot_features
gunshots_labels = automatic_labels + single_shot_labels

# Load ambience sound data
ambience_features, ambience_labels = load_audio_files_from_dir(ambience_path, label=0)

# Combine all features and labels
features = gunshots_features + ambience_features
labels = gunshots_labels + ambience_labels

In [4]:
# Convert to numpy arrays
X = np.array(features)
y = np.array(labels)

# Shuffle the data
from sklearn.utils import shuffle
X, y = shuffle(X, y, random_state=42)

In [5]:
# Print the shape of the data
print(f'Shape of features (X): {X.shape}')
print(f'Shape of labels (y): {y.shape}')

Shape of features (X): (202713, 40)
Shape of labels (y): (202713,)


### Model Training with CNN

In [6]:
# Perform stratified splitting to ensure class balance in both train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Print the class distribution in the training and test sets
print(f'Number of gunshots in the training set: {np.sum(y_train == 1)}')
print(f'Number of ambient sounds in the training set: {np.sum(y_train == 0)}')
print(f'Number of gunshots in the test set: {np.sum(y_test == 1)}')
print(f'Number of ambient sounds in the test set: {np.sum(y_test == 0)}')

Number of gunshots in the training set: 3441
Number of ambient sounds in the training set: 158729
Number of gunshots in the test set: 860
Number of ambient sounds in the test set: 39683


In [7]:
# Reshape data for CNN input: (number of samples, number of MFCCs, 1)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Convert labels to categorical (if needed, though binary here)
lb = LabelBinarizer()
y_train = lb.fit_transform(y_train)
y_test = lb.transform(y_test)

In [8]:
# Build the CNN model
model = models.Sequential([
    layers.Conv1D(32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
    layers.MaxPooling1D(pool_size=2),
    layers.Conv1D(64, kernel_size=3, activation='relu'),
    layers.MaxPooling1D(pool_size=2),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [9]:
# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/20
[1m5068/5068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 5ms/step - accuracy: 0.9962 - loss: 0.0225 - val_accuracy: 0.9997 - val_loss: 0.0016
Epoch 2/20
[1m5068/5068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 6ms/step - accuracy: 0.9993 - loss: 0.0025 - val_accuracy: 0.9996 - val_loss: 0.0018
Epoch 3/20
[1m5068/5068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 6ms/step - accuracy: 0.9995 - loss: 0.0020 - val_accuracy: 0.9993 - val_loss: 0.0018
Epoch 4/20
[1m5068/5068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 6ms/step - accuracy: 0.9995 - loss: 0.0016 - val_accuracy: 0.9997 - val_loss: 0.0011
Epoch 5/20
[1m5068/5068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 5ms/step - accuracy: 0.9995 - loss: 0.0023 - val_accuracy: 0.9997 - val_loss: 0.0013
Epoch 6/20
[1m5068/5068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 5ms/step - accuracy: 0.9995 - loss: 0.0017 - val_accuracy: 0.9997 - val_loss: 0.0013
Epoch 7/20

In [10]:
# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test accuracy: {test_acc:.4f}')

[1m1267/1267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0017
Test accuracy: 0.9995


In [11]:
from sklearn.metrics import confusion_matrix, classification_report

y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)

# Confusion matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[1m1267/1267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step
[[39672    11]
 [    8   852]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     39683
           1       0.99      0.99      0.99       860

    accuracy                           1.00     40543
   macro avg       0.99      1.00      0.99     40543
weighted avg       1.00      1.00      1.00     40543



In [12]:
print(f'Number of gunshots in the test set: {np.sum(y_test == 1)}')
print(f'Number of ambient sounds in the test set: {np.sum(y_test == 0)}')

Number of gunshots in the test set: 860
Number of ambient sounds in the test set: 39683
