# üéß Audio Classification Module

This notebook demonstrates preprocessing, training, prediction, and audio quality assessment for audio-based car crash detection using YAMNet.

## üîß Install Dependencies

In [7]:
!pip install librosa soundfile tensorflow tensorflow_hub numpy



## üìÇ Mount Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## üéöÔ∏è Preprocess Audio Clips (500 crash, 100 each other class)

In [3]:

import os
import librosa
import soundfile as sf
from collections import defaultdict

input_base = '/content/drive/MyDrive/FYP_Dataset_2'
output_base = '/content/drive/MyDrive/FYP_Processed_Dataset'
os.makedirs(output_base, exist_ok=True)

limits = defaultdict(lambda: 100)
limits['car_crash'] = 500

for class_name in os.listdir(input_base):
    in_class_path = os.path.join(input_base, class_name)
    out_class_path = os.path.join(output_base, class_name)
    if not os.path.isdir(in_class_path): continue
    os.makedirs(out_class_path, exist_ok=True)

    processed = 0
    for fname in os.listdir(in_class_path):
        if fname.endswith('.wav') and processed < limits[class_name]:
            in_file = os.path.join(in_class_path, fname)
            out_file = os.path.join(out_class_path, fname)
            y, sr = librosa.load(in_file, sr=16000, mono=True)
            sf.write(out_file, y, 16000)
            processed += 1


## üè∑Ô∏è Create CSV Labels

In [4]:

import pandas as pd

filepaths, labels = [], []
for class_name in os.listdir(output_base):
    class_dir = os.path.join(output_base, class_name)
    if not os.path.isdir(class_dir): continue
    for fname in os.listdir(class_dir):
        if fname.endswith('.wav'):
            filepaths.append(os.path.join(class_dir, fname))
            labels.append(1 if class_name == 'car_crash' else 0)

df = pd.DataFrame({'filepath': filepaths, 'label': labels})
df.to_csv('/content/drive/MyDrive/audio_labels.csv', index=False)
df.head()


Unnamed: 0,filepath,label
0,/content/drive/MyDrive/FYP_Processed_Dataset/c...,1
1,/content/drive/MyDrive/FYP_Processed_Dataset/c...,1
2,/content/drive/MyDrive/FYP_Processed_Dataset/c...,1
3,/content/drive/MyDrive/FYP_Processed_Dataset/c...,1
4,/content/drive/MyDrive/FYP_Processed_Dataset/c...,1


## üì• Load and Prepare Dataset

In [9]:
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv('/content/drive/MyDrive/audio_labels.csv')
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

train_X, train_y = [], []
val_X, val_y = [], []

import tensorflow_hub as hub
yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1')

def extract_features(filepaths):
    feats = []
    for path in filepaths:
        waveform, sr = librosa.load(path, sr=16000)
        scores, embeddings, spectrogram = yamnet_model(waveform)
        mean_feat = tf.reduce_mean(embeddings, axis=0).numpy()
        feats.append(mean_feat)
    return np.array(feats)

train_X = extract_features(train_df['filepath'])
train_y = train_df['label'].values
val_X = extract_features(val_df['filepath'])
val_y = val_df['label'].values


## üß† Train Classifier

In [11]:

from tensorflow import keras

model = keras.Sequential([
    keras.layers.Input(shape=(1024,)),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(train_X, train_y, validation_data=(val_X, val_y),
                    epochs=5, batch_size=32,
                    callbacks=[keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)])

model.save('/content/drive/MyDrive/crash_classifier_audio_final_v3.keras')


Epoch 1/5
[1m30/30[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m4s[0m 37ms/step - accuracy: 0.8579 - loss: 0.4012 - val_accuracy: 0.9625 - val_loss: 0.1147
Epoch 2/5
[1m30/30[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.9735 - loss: 0.0747 - val_accuracy: 0.9875 - val_loss: 0.0781
Epoch 3/5
[1m30/30[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.9819 - loss: 0.0606 - val_accuracy: 0.9875 - val_loss: 0.0652
Epoch 4/5
[1m30/30[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.9937 - loss: 0.0326 - val_accuracy: 0.9875 - val_loss: 0.0619
Epoch 5/5
[1m30/30[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.9921 - loss: 0.0259 - val_accuracy: 0.9792 - val_loss: 0.0641


## üéß Audio Quality Assessment

In [12]:

def compute_audio_quality(path):
    y, sr = librosa.load(path, sr=16000)
    rms = librosa.feature.rms(y=y).mean()
    zero_crossings = sum(librosa.zero_crossings(y))
    silence = 1.0 - (np.count_nonzero(y) / len(y))
    return rms, zero_crossings, silence

for i in range(3):
    path = df['filepath'].iloc[i]
    rms, zc, silence = compute_audio_quality(path)
    print(f"üîπ File: {os.path.basename(path)}")
    print(f"RMS Energy: {rms:.4f}, Zero Crossings: {zc}, Silence Ratio: {silence:.2f}\n")


üîπ File: car_crash_001.wav
RMS Energy: 0.2046, Zero Crossings: 6883, Silence Ratio: 0.00

üîπ File: car_crash_002.wav
RMS Energy: 0.1935, Zero Crossings: 1265, Silence Ratio: 0.00

üîπ File: car_crash_007.wav
RMS Energy: 0.0760, Zero Crossings: 2560, Silence Ratio: 0.00



## üîç Run Prediction on New Clip

In [24]:

from tensorflow.keras.models import load_model
model = load_model('/content/drive/MyDrive/crash_classifier_audio_final_v3.keras')

test_file = '/content/drive/MyDrive/FYP/Test_Dataset/test_crash_aud_03.mp3'




import librosa
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model


# Parameters
SAMPLE_RATE = 16000
CHUNK_DURATION = 10  # seconds
CHUNK_SIZE = SAMPLE_RATE * CHUNK_DURATION

def predict_audio_chunks(file_path):
    waveform, sr = librosa.load(file_path, sr=SAMPLE_RATE)
    total_samples = len(waveform)

    predictions = []

    print(f"Total duration: {total_samples / sr:.2f}s | Chunks: {int(np.ceil(total_samples / CHUNK_SIZE))}")

    for start in range(0, total_samples, CHUNK_SIZE):
        end = start + CHUNK_SIZE
        chunk = waveform[start:end]

        # Pad if last chunk is shorter
        if len(chunk) < CHUNK_SIZE:
            chunk = np.pad(chunk, (0, CHUNK_SIZE - len(chunk)))

        # Get embeddings from YAMNet
        _, embeddings, _ = yamnet_model(chunk)
        features = tf.reduce_mean(embeddings, axis=0).numpy().reshape(1, -1)

        # Predict with crash classifier
        prob = model.predict(features)[0][0]
        predictions.append(prob)

        print(f"Chunk [{start/SAMPLE_RATE:.1f}-{min(end/SAMPLE_RATE, total_samples/SAMPLE_RATE):.1f}s] ‚Üí Prob: {prob:.4f}")

    # If any chunk crosses 0.5 threshold, classify whole audio as crash
    crash_detected = any(p > 0.5 for p in predictions)
    return predictions, crash_detected

# Run on a test audio
preds, is_crash = predict_audio_chunks(test_file)

print("\nFinal Decision:", "üö® Crash Detected" if is_crash else "‚úÖ No Crash Detected")





Total duration: 3.12s | Chunks: 1
[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 63ms/step
Chunk [0.0-3.1s] ‚Üí Prob: 0.0000

Final Decision: ‚úÖ No Crash Detected
