In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# Later cells write model files under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install audiomentations



In [None]:
# Root folder that holds one sub-directory of audio clips per class.
# The location is machine-specific, so it can be overridden without editing
# the notebook by setting the EMERGENCY_SOUND_DATASET environment variable.
# The original hard-coded path stays as the default.
import os

DATASET_PATH = os.environ.get(
    "EMERGENCY_SOUND_DATASET",
    r"d:\DL\Gunshot_Detection_Project\emergency sound detection",
)


In [None]:
import os

# Show what sits directly under the dataset root.
top_level_entries = os.listdir(DATASET_PATH)
print(top_level_entries)


['Explosion dataset', 'Fire', 'GunShots', 'scream', 'non_scream']


In [None]:
# Count the audio clips in every class folder.
# Extensions are compared case-insensitively and cover the same set that the
# dataset-building cells accept, so files such as "CLIP.WAV" or "*.m4a" are
# not silently left out of the numbers printed here.
AUDIO_EXTENSIONS = ('.wav', '.mp3', '.flac', '.ogg', '.m4a')

for label in os.listdir(DATASET_PATH):
    folder_path = os.path.join(DATASET_PATH, label)
    if not os.path.isdir(folder_path):
        continue  # stray files next to the class folders are not classes
    num_files = sum(
        1 for f in os.listdir(folder_path) if f.lower().endswith(AUDIO_EXTENSIONS)
    )
    print(f"{label}: {num_files} files")


Explosion dataset: 1294 files
Fire: 289 files
GunShots: 901 files
scream: 2445 files
non_scream: 596 files


In [None]:
# Sanity-check the dataset layout: discover the class folders and report how
# many audio files each one holds.
import os

valid_ext = ('.wav', '.mp3', '.flac', '.ogg', '.m4a')

if not os.path.exists(DATASET_PATH):
    raise FileNotFoundError(f"Dataset path not found: {DATASET_PATH}")

# Every sub-directory of the root is a class; sorting fixes the label order.
classes = []
for entry in sorted(os.listdir(DATASET_PATH)):
    if os.path.isdir(os.path.join(DATASET_PATH, entry)):
        classes.append(entry)
print("Detected classes:", classes)

# Map class name -> number of files with a recognised audio extension.
counts = {
    name: sum(
        1
        for fname in os.listdir(os.path.join(DATASET_PATH, name))
        if fname.lower().endswith(valid_ext)
    )
    for name in classes
}

print("\nSample counts per class:")
for name, n_files in counts.items():
    print(f" - {name}: {n_files}")


Detected classes: ['Explosion dataset', 'Fire', 'GunShots', 'non_scream', 'scream']

Sample counts per class:
 - Explosion dataset: 1294
 - Fire: 289
 - GunShots: 901
 - non_scream: 596
 - scream: 2445


In [None]:
# Imports and parameters
import os, random
import numpy as np
import librosa
import librosa.display
from audiomentations import Compose, AddGaussianNoise, PitchShift, TimeStretch, Shift
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical

# Parameters
SAMPLE_RATE = 22050                         # Hz; rate passed to librosa when loading clips
DURATION = 3                                # seconds of audio kept per clip
SAMPLES_PER_TRACK = SAMPLE_RATE * DURATION  # waveform length after pad/truncate
N_MELS = 128                                # mel bands per spectrogram
BATCH_SIZE = 32
EPOCHS = 30
RANDOM_STATE = 42

# Seed Python's `random`, NumPy and TensorFlow in one call so the
# random.choice-based oversampling and the weight initialisation are
# repeatable across Restart & Run All.
tf.keras.utils.set_random_seed(RANDOM_STATE)

# Use mixed precision only when a GPU is actually present. Setting the
# float16 policy always succeeds, so without this check it would also be
# applied on CPU-only machines, where it gives no speed-up.
try:
    from tensorflow.keras import mixed_precision
    if tf.config.list_physical_devices('GPU'):
        policy = mixed_precision.Policy('mixed_float16')
        mixed_precision.set_global_policy(policy)
        print('Mixed precision enabled:', mixed_precision.global_policy())
    else:
        policy = mixed_precision.global_policy()
        print('Mixed precision not enabled or not available:', 'no GPU detected')
except Exception as e:
    # Best effort: training still works with the default float32 policy.
    print('Mixed precision not enabled or not available:', e)


Mixed precision enabled: <DTypePolicy "mixed_float16">


In [None]:
# Augmentation pipeline (used only to augment smaller classes)
from audiomentations import Compose, AddGaussianNoise, PitchShift, TimeStretch, Shift

# Waveform-level augmentation chain; every transform carries its own
# application probability `p`.
augment = Compose(
    transforms=[
        AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.4),
        PitchShift(min_semitones=-3, max_semitones=3, p=0.5),
        TimeStretch(min_rate=0.9, max_rate=1.1, p=0.4),
        Shift(p=0.5),  # library defaults for the shift range (no min/max_fraction)
    ]
)


def load_audio(file_path, sr=SAMPLE_RATE, duration=DURATION):
    """Load an audio file as a fixed-length waveform.

    Args:
        file_path: Path of the audio file to read.
        sr: Sample rate handed to ``librosa.load``.
        duration: Clip length in seconds; may be fractional.

    Returns:
        1-D array of exactly ``int(round(sr * duration))`` samples. Shorter
        clips are zero-padded at the end, longer ones are truncated.
    """
    # Compute the length once as an int: np.pad and slicing reject floats,
    # which is what `sr * duration` becomes for a fractional duration.
    target_len = int(round(sr * duration))
    y, _sr = librosa.load(file_path, sr=sr, duration=duration)
    if len(y) < target_len:
        y = np.pad(y, (0, target_len - len(y)))
    else:
        y = y[:target_len]
    return y

def extract_mel(y, sr=SAMPLE_RATE, n_mels=N_MELS):
    """Turn a waveform into a standardised log-mel spectrogram.

    Args:
        y: Audio samples.
        sr: Sample rate of ``y``; also sets the top mel frequency (``sr // 2``).
        n_mels: Number of mel bands.

    Returns:
        The dB-scaled mel spectrogram (referenced to its own maximum),
        shifted and scaled to zero mean and unit variance over the clip.
    """
    power_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, fmax=sr // 2)
    log_spec = librosa.power_to_db(power_spec, ref=np.max)
    # Per-sample standardisation; the epsilon keeps a constant (silent) clip finite.
    centered = log_spec - np.mean(log_spec)
    return centered / (np.std(log_spec) + 1e-9)


In [None]:
# Build balanced dataset by augmenting only smaller classes.
#
# Each class contributes all of its original clips plus as many augmented
# copies as are needed to reach the size of the largest class.
#
# NOTE(review): augmented copies are created before the train/test split in a
# later cell, so an original clip and its augmentations can land on both sides
# of that split. Consider splitting file paths first and augmenting only the
# training portion.
data = []
labels = []

# Audio file paths per class
class_files = {}
for c in classes:
    p = os.path.join(DATASET_PATH, c)
    class_files[c] = [
        os.path.join(p, f) for f in os.listdir(p) if f.lower().endswith(valid_ext)
    ]

max_count = max(len(v) for v in class_files.values())
print('Target samples per class:', max_count)

for idx, c in enumerate(classes):
    files = class_files[c]
    if not files:
        # random.choice on an empty list would fail with an opaque IndexError.
        raise ValueError(f"Class '{c}' contains no audio files; cannot oversample it.")
    print(f"Processing class '{c}' ({len(files)} original samples)...")

    # add original samples
    for fpath in tqdm(files):
        data.append(extract_mel(load_audio(fpath)))
        labels.append(idx)

    # Augment until the class reaches max_count. The shortfall is known up
    # front, so there is no need to recount the whole label list each time
    # (the previous recount made this loop quadratic in dataset size).
    n_missing = max_count - len(files)
    for _ in range(n_missing):
        src = random.choice(files)
        augmented = augment(samples=load_audio(src), sample_rate=SAMPLE_RATE)
        data.append(extract_mel(augmented))
        labels.append(idx)

# Convert to arrays
X = np.array(data)[..., np.newaxis].astype('float32')  # shape: (N, n_mels, frames, 1)
y = to_categorical(np.array(labels), num_classes=len(classes))
print('\nFinal dataset shape:', X.shape, y.shape)


Target samples per class: 2445
Processing class 'Explosion dataset' (1294 original samples)...


  4%|▍         | 49/1294 [01:00<03:48,  5.44it/s]

In [None]:
# Stratified 80/20 hold-out split, followed by per-class weights.
from sklearn.utils.class_weight import compute_class_weight

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)
print('Train:', X_train.shape, 'Test:', X_test.shape)

# Class weights are computed from integer labels recovered from the one-hot rows.
y_integers = y_train.argmax(axis=1)
class_weights_values = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_integers),
    y=y_integers,
)
class_weights = {
    class_index: weight for class_index, weight in enumerate(class_weights_values)
}
print('Class weights:', class_weights)


In [None]:
# Model: CNN + Bidirectional GRU
import tensorflow as tf
from tensorflow.keras import layers, models

def build_model(input_shape, num_classes):
    """Build a CNN + bidirectional-GRU classifier for mel-spectrogram input.

    Three Conv2D/BatchNorm/MaxPool stages extract local time-frequency
    features. The feature map is then rearranged into a sequence over the
    frame axis, summarised by a bidirectional GRU, and classified by a small
    dense head.

    Args:
        input_shape: Shape of one sample, ``(n_mels, frames, channels)``.
        num_classes: Number of output classes.

    Returns:
        An uncompiled ``keras.Model`` with softmax outputs of size ``num_classes``.

    Raises:
        ValueError: If the mel (first) axis of ``input_shape`` is not static,
            because the per-frame feature size could not be computed.
    """
    inp = layers.Input(shape=input_shape)

    x = inp
    for n_filters in (32, 64, 128):
        x = layers.Conv2D(n_filters, (3, 3), activation='relu', padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.MaxPooling2D((2, 2))(x)

    # Prepare for the RNN: (batch, freq, frames, ch) -> (batch, frames, freq, ch)
    x = layers.Permute((2, 1, 3))(x)

    # Read the static shape instead of calling tf.shape(): symbolic Keras
    # tensors cannot be passed to raw TensorFlow ops, and `.shape` is a plain
    # tuple (no `.as_list()`) on newer Keras versions.
    _, _, n_freq, n_channels = tuple(x.shape)
    if n_freq is None or n_channels is None:
        raise ValueError(
            "build_model needs a fixed number of mel bands and channels in input_shape"
        )
    # -1 lets Keras infer the number of frames; each step gets freq * channels features.
    x = layers.Reshape((-1, n_freq * n_channels))(x)  # (batch, frames, features)

    x = layers.Bidirectional(layers.GRU(128, return_sequences=False))(x)
    x = layers.Dropout(0.4)(x)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    # Softmax is kept in float32 so the outputs stay stable under mixed precision.
    out = layers.Dense(num_classes, activation='softmax', dtype='float32')(x)

    model = models.Model(inputs=inp, outputs=out)
    return model

# One sample's shape, taken from the training tensor: (n_mels, frames, 1).
input_shape = X_train.shape[1:]
model = build_model(input_shape, num_classes=len(classes))

# One-hot targets -> categorical cross-entropy.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()


In [None]:
# Callbacks and training
import os

from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

checkpoint_path = "/content/drive/MyDrive/emergency_sound_detection/best_model.h5"
# Create the target folder up front; otherwise the first checkpoint write can
# fail after a full epoch of training has already been spent.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

callbacks = [
    # Keep only the weights with the best validation accuracy seen so far.
    ModelCheckpoint(checkpoint_path, monitor='val_accuracy', save_best_only=True, verbose=1),
    # Halve the learning rate after 3 epochs without val_loss improvement.
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1),
    # Stop after 6 stagnant epochs and roll back to the best weights.
    EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True, verbose=1),
]

# NOTE(review): the held-out test split doubles as validation data, so the
# checkpoint and early-stopping choices are tuned on it; the test metrics
# reported afterwards are therefore optimistic.
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    class_weight=class_weights,
    callbacks=callbacks
)


In [None]:
# Training curves (accuracy and loss side by side), then evaluation on the test split.
import matplotlib.pyplot as plt
from tensorflow.keras.models import load_model

fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

ax_acc.plot(history.history['accuracy'], label='train_acc')
ax_acc.plot(history.history['val_accuracy'], label='val_acc')
ax_acc.set_title('Accuracy')
ax_acc.legend()
ax_acc.grid(True)

ax_loss.plot(history.history['loss'], label='train_loss')
ax_loss.plot(history.history['val_loss'], label='val_loss')
ax_loss.set_title('Loss')
ax_loss.legend()
ax_loss.grid(True)

plt.show()

# Use the checkpointed model when one exists on disk; otherwise evaluate the
# model currently in memory.
best = checkpoint_path
if os.path.exists(best):
    model = load_model(best)

test_metrics = model.evaluate(X_test, y_test, verbose=0)
loss, acc = test_metrics
print(f"Test accuracy: {acc*100:.2f}%")


In [None]:
# Per-class metrics and confusion matrix on the test split.
import numpy as np

y_pred = model.predict(X_test)
# Collapse probability rows / one-hot rows to integer class ids.
y_pred_classes = y_pred.argmax(axis=1)
y_true = y_test.argmax(axis=1)

report = classification_report(y_true, y_pred_classes, target_names=classes)
print(report)

cm = confusion_matrix(y_true, y_pred_classes)
fig, ax = plt.subplots(figsize=(8,6))
disp = ConfusionMatrixDisplay(cm, display_labels=classes)
disp.plot(ax=ax, cmap='Blues', xticks_rotation=45)
plt.show()


In [None]:
# Save final model to Drive
# Writes whichever object `model` currently refers to, in HDF5 (.h5) format.
final_path = "/content/drive/MyDrive/emergency_sound_detection/final_model.h5"
model.save(final_path)
print("Saved final model to:", final_path)


In [None]:
# Inference on an uploaded file
# Every uploaded file goes through the same load -> mel -> standardise
# pipeline as the training data and is classified on its own.
from google.colab import files
uploaded = files.upload()
for fn in uploaded.keys():
    path = fn
    print("Uploaded:", path)
    # Use dedicated names: rebinding the globals `y` (the one-hot labels) and
    # `idx` here would break a later re-run of the split/training cells.
    waveform = load_audio(path)
    mel = extract_mel(waveform)
    mel_input = mel[np.newaxis,...,np.newaxis].astype('float32')
    preds = model.predict(mel_input)
    # Single-sample batch: pick the arg-max over the class axis of row 0.
    pred_idx = int(np.argmax(preds[0]))
    print(f"Predicted: {classes[pred_idx]} (confidence {preds[0,pred_idx]*100:.2f}%)")


In [None]:
# =====================================
# 🎤 REAL-TIME AUDIO PREDICTION WITH MEL SPECTROGRAM
# =====================================
import io
import numpy as np
import sounddevice as sd
import scipy.io.wavfile as wav
import librosa
import librosa.display
import matplotlib.pyplot as plt
import IPython.display as ipd
from tensorflow.keras.models import load_model

# If model not already in memory:
# model = load_model("/content/drive/MyDrive/emergency_sound_detection/audio_multiclass_cnn_gru_model.h5")

# Record with the sample rate and clip length the model was trained on
# (SAMPLE_RATE / DURATION from the parameters cell).
SR = SAMPLE_RATE

print("🎙️ Recording... Speak or make a sound now!")
recording = sd.rec(int(DURATION * SR), samplerate=SR, channels=1, dtype='float32')
sd.wait()
print("✅ Recording complete!")

# Save and playback
wav.write("realtime_input.wav", SR, recording)
ipd.display(ipd.Audio("realtime_input.wav"))

# --- Display helper ---
# Deliberately NOT named `extract_mel`: redefining that name here would
# replace the training-time extractor (which also standardises the features)
# for every cell executed afterwards.
def mel_db_for_display(y, sr=SR, n_mels=128):
    """Return the raw dB-scaled mel spectrogram of `y`, for plotting only."""
    mel_power = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    return librosa.power_to_db(mel_power, ref=np.max)

# Load & preprocess audio with the same fixed-length loader used for training.
waveform = load_audio("realtime_input.wav", sr=SR, duration=DURATION)
mel_db = mel_db_for_display(waveform)

# --- Display Mel Spectrogram ---
plt.figure(figsize=(10, 4))
librosa.display.specshow(mel_db, sr=SR, x_axis='time', y_axis='mel', cmap='magma')
plt.colorbar(format='%+2.0f dB')
plt.title("🎛️ Mel Spectrogram (Real-Time Input)")
plt.tight_layout()
plt.show()

# --- Model Input Prep ---
# The network was trained on per-sample standardised log-mels produced by the
# training `extract_mel`. Feeding it differently scaled features (the old
# `mel / 80 + 1`) puts the input off the training distribution.
mel = extract_mel(waveform, sr=SR)
mel_input = mel[np.newaxis, ..., np.newaxis].astype('float32')

# --- Prediction ---
preds = model.predict(mel_input)
pred_idx = int(np.argmax(preds[0]))
confidence = preds[0, pred_idx] * 100

print(f"\n🎯 Predicted Class: **{classes[pred_idx]}**")
print(f"🤖 Confidence: {confidence:.2f}%")


In [None]:
# Mount Google Drive at /content/drive; the dataset path set further down in
# this cell points inside that mount.
from google.colab import drive
drive.mount('/content/drive')

import os
import random
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np

# ✅ Correct dataset path
DATASET_PATH = "/content/drive/MyDrive/emergency sound detection"
SAMPLE_RATE = 22050

# Class labels (same as your folder names)
LABELS = ["scream", "non_scream", "GunShots", "Explosion dataset", "Fire"]

# Same extension set the dataset-building cells accept.
PLOT_AUDIO_EXT = ('.wav', '.mp3', '.flac', '.ogg', '.m4a')


# Plot-only helper. It is deliberately not called `extract_mel`: reusing that
# name would silently replace the training-time extractor (which standardises
# its output) for any cell run after this one.
def mel_db_spectrogram(y, sr=22050, n_mels=128):
    """Return the dB-scaled mel spectrogram of waveform `y` (no normalisation)."""
    mel_power = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    return librosa.power_to_db(mel_power, ref=np.max)


# Loop through each class and display one random Mel spectrogram
for label in LABELS:
    class_path = os.path.join(DATASET_PATH, label)
    if not os.path.isdir(class_path):
        # A missing folder is reported like an empty one instead of aborting the loop.
        print(f"⚠️ No audio files found in {label}")
        continue
    files = [f for f in os.listdir(class_path) if f.lower().endswith(PLOT_AUDIO_EXT)]
    if not files:
        print(f"⚠️ No audio files found in {label}")
        continue

    # Pick a random file; `waveform` avoids rebinding the global `y`
    # (the one-hot training labels).
    file_path = os.path.join(class_path, random.choice(files))
    waveform, sr = librosa.load(file_path, sr=SAMPLE_RATE)

    # Extract mel spectrogram
    mel_spec = mel_db_spectrogram(waveform, sr)

    # Plot
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(mel_spec, sr=sr, x_axis='time', y_axis='mel', cmap='magma')
    plt.colorbar(format='%+2.0f dB')
    plt.title(f"🎧 Mel Spectrogram - {label}")
    plt.tight_layout()
    plt.show()


In [None]:
import librosa
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# === Load model ===
# Reads a saved Keras model from a path relative to the working directory and
# binds it to `model`, replacing whatever that name referred to before.
# NOTE(review): the extractor defined below pads/trims to (128, 130) per its
# own comment — presumably the input shape this file expects; confirm.
from tensorflow.keras.models import load_model
model = load_model("audio_multiclass_cnn_gru_model.h5")

# === Feature extraction ===
def extract_mel_spectrogram(file_path, sr=22050, n_mels=130, max_len=128):
    """Load an audio file and return a fixed-length, time-major log-mel matrix.

    Args:
        file_path: Audio file to read.
        sr: Sample rate passed to ``librosa.load``.
        n_mels: Number of mel bands (columns of the result).
        max_len: Number of time frames (rows of the result).

    Returns:
        Array of shape ``(max_len, n_mels)`` in dB relative to the clip
        maximum. Short clips are padded at the end with zeros; long clips
        keep only their first ``max_len`` frames.
    """
    y, sr = librosa.load(file_path, sr=sr)
    mel_db = librosa.power_to_db(
        librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels),
        ref=np.max,
    ).T  # -> (time, freq)

    n_frames = mel_db.shape[0]
    if n_frames >= max_len:
        return mel_db[:max_len, :]
    # Too short: append constant (zero) frames along the time axis only.
    return np.pad(mel_db, ((0, max_len - n_frames), (0, 0)), mode='constant')

# === Interpretability functions ===
def compute_saliency(model, mel_features):
    """Vanilla gradient saliency for the model's top-scoring class.

    Args:
        model: Callable Keras model taking a ``(1, H, W, 1)`` float tensor.
        mel_features: 2-D feature matrix for one clip.

    Returns:
        Array with the same 2-D shape as ``mel_features`` holding the absolute
        gradient of the winning class score with respect to each input cell.
    """
    batch = mel_features[np.newaxis, ..., np.newaxis]  # add batch and channel axes
    input_tensor = tf.convert_to_tensor(batch, dtype=tf.float32)

    with tf.GradientTape() as tape:
        # The input is a plain tensor, not a variable, so it must be watched explicitly.
        tape.watch(input_tensor)
        scores = model(input_tensor)[0]
        winner = tf.argmax(scores)
        target_score = scores[winner]

    gradient = tape.gradient(target_score, input_tensor)
    return np.abs(gradient[0, ..., 0].numpy())

def smoothgrad(model, mel_features, n_samples=30, noise_level=0.1):
    """Average saliency maps over Gaussian-perturbed copies of the input.

    Args:
        model: Model forwarded to ``compute_saliency``.
        mel_features: 2-D feature matrix for one clip.
        n_samples: Number of noisy copies to average over.
        noise_level: Standard deviation of the zero-mean Gaussian noise.

    Returns:
        Mean of the ``n_samples`` saliency maps, shaped like ``mel_features``.
    """
    accumulated = np.zeros_like(mel_features)
    for _ in range(n_samples):
        perturbed = mel_features + np.random.normal(0, noise_level, mel_features.shape)
        accumulated += compute_saliency(model, perturbed)
    return accumulated / n_samples

def occlusion_sensitivity(model, mel_features, patch_size=(8, 8)):
    """Measure how much the prediction moves when input patches are zeroed.

    The input is tiled with non-overlapping patches. Each patch in turn is set
    to 0, the model is re-run, and the summed absolute change of the output
    vector is written into every cell of that patch.

    Args:
        model: Object exposing ``predict`` on a ``(1, H, W, 1)`` array.
        mel_features: 2-D feature matrix for one clip (not modified).
        patch_size: ``(rows, cols)`` of the occlusion window.

    Returns:
        Array shaped (and typed) like ``mel_features`` with one sensitivity
        value per patch.
    """
    patch_h, patch_w = patch_size[0], patch_size[1]
    n_rows, n_cols = mel_features.shape[0], mel_features.shape[1]

    reference = model.predict(mel_features[np.newaxis, ..., np.newaxis])[0]
    heatmap = np.zeros_like(mel_features)

    for top in range(0, n_rows, patch_h):
        for left in range(0, n_cols, patch_w):
            window = (slice(top, top + patch_h), slice(left, left + patch_w))
            masked = mel_features.copy()
            masked[window] = 0
            masked_pred = model.predict(masked[np.newaxis, ..., np.newaxis])[0]
            heatmap[window] = np.abs(reference - masked_pred).sum()
    return heatmap

# === Example for each class ===
# For one hand-picked clip per class, compute three attribution maps
# (saliency, SmoothGrad, occlusion) and show them side by side.
import os

classes = ["Explosion", "Fire", "Gunshots", "Non-Scream", "Scream"]
# NOTE(review): absolute paths from one machine; files that are not present
# are skipped below instead of aborting the whole loop.
test_files = {
    "Explosion": "C:/Users/Prathima B A/Downloads/Scream_detection/Explosion dataset/timebomb-74798.mp3",
    "Fire": "C:/Users/Prathima B A/Downloads/Scream_detection/Fire/videoplayback (1)_62.wav",
    "Gunshots":"C:/Users/Prathima B A/Downloads/Scream_detection/GunShots/8 (5).wav",
    "Non-Scream": "C:/Users/Prathima B A/Downloads/Scream_detection/non_scream/z_mf1ceM8jc_out.wav",
    "Scream": "C:/Users/Prathima B A/Downloads/Scream_detection/scream/zkWoni28n64_out.wav"
}

for label, file_path in test_files.items():
    print(f"\n🎧 {label}: {file_path}")
    if not os.path.isfile(file_path):
        print(f"⚠️ File not found, skipping: {file_path}")
        continue

    mel_features = extract_mel_spectrogram(file_path)

    sal_map = compute_saliency(model, mel_features)
    sg_map = smoothgrad(model, mel_features)
    occ_map = occlusion_sensitivity(model, mel_features)

    # (title, map, colormap) for the three panels, left to right.
    panels = (
        ('Saliency', sal_map, 'viridis'),
        ('SmoothGrad', sg_map, 'inferno'),
        ('Occlusion', occ_map, 'magma'),
    )
    fig, axes = plt.subplots(1, 3, figsize=(12, 4))
    for ax, (title, heatmap, cmap) in zip(axes, panels):
        # Maps are (time, freq); transpose so time runs along the x axis.
        ax.imshow(heatmap.T, aspect='auto', origin='lower', cmap=cmap)
        ax.set_title(title)
    fig.suptitle(f"Class: {label}")
    fig.tight_layout()
    plt.show()
    plt.close(fig)  # release the figure so repeated runs do not accumulate them
