In [None]:
import os

import joblib
import librosa
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Set random seed for reproducibility (NumPy and TensorFlow are seeded separately)
np.random.seed(42)
tf.random.set_seed(42)


# Root of the dataset; load_data() treats each sub-directory name as a class label.
DATA_DIR = '/content/drive/MyDrive/IBM_audio_dataset'
SAMPLE_RATE = 22050  # Hz; passed to librosa.load and librosa.feature.mfcc
DURATION = 5  # seconds; upper bound on how much of each file is loaded
N_MFCC = 13  # number of MFCC coefficients per frame
N_FFT = 2048  # FFT window length in samples
HOP_LENGTH = 512  # samples between successive frames
# Fixed number of MFCC frames per clip: extract_features() zero-pads shorter
# clips and truncates longer ones to this width.
# NOTE(review): 52 frames * 512 hop / 22050 Hz is roughly 1.2 s, far less than
# the 5 s loaded per file -- confirm that discarding the rest is intended.
MAX_FRAMES = 52

# Function to extract MFCC features from audio file
def extract_features(file_path):
    """Compute a fixed-width MFCC matrix for one audio file.

    The file is loaded at SAMPLE_RATE (at most DURATION seconds), converted to
    N_MFCC coefficients per frame, and then forced to exactly MAX_FRAMES
    frames along the time axis.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        2-D array of shape (N_MFCC, MAX_FRAMES).
    """
    signal, _ = librosa.load(file_path, sr=SAMPLE_RATE, duration=DURATION)
    coeffs = librosa.feature.mfcc(
        y=signal,
        sr=SAMPLE_RATE,
        n_mfcc=N_MFCC,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
    )

    # Short clips: append zero-valued frames on the right until the width fits.
    shortfall = MAX_FRAMES - coeffs.shape[1]
    if shortfall > 0:
        coeffs = np.pad(coeffs, pad_width=((0, 0), (0, shortfall)), mode='constant')

    # Long clips: keep only the leading MAX_FRAMES frames (no-op after padding).
    return coeffs[:, :MAX_FRAMES]

# Load and preprocess the dataset
def load_data(data_dir):
    """Load MFCC features and labels from a directory-per-class dataset.

    Every immediate sub-directory of ``data_dir`` is treated as one class; its
    name becomes the label of every regular file found directly inside it.

    Args:
        data_dir: Root directory containing one sub-directory per class.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays with one entry per audio
        file. Both arrays are empty when no files are found.
    """
    features = []
    labels = []
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore the seeded train/test split) reproducible.
    for label in sorted(os.listdir(data_dir)):
        label_dir = os.path.join(data_dir, label)
        if not os.path.isdir(label_dir):
            continue
        for audio_file in sorted(os.listdir(label_dir)):
            file_path = os.path.join(label_dir, audio_file)
            # Nested directories are not audio clips; skip them rather than
            # handing them to the audio loader.
            if not os.path.isfile(file_path):
                continue
            features.append(extract_features(file_path))
            labels.append(label)
    return np.array(features), np.array(labels)

# Load and preprocess the data
X, y = load_data(DATA_DIR)
if len(X) == 0:
    # Fail here with a clear message instead of deep inside scikit-learn/Keras.
    raise ValueError(f"No audio files found under {DATA_DIR!r}; is the drive mounted?")

# Encode the string class labels as integer ids
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split the data. The classes are imbalanced, so stratify to keep each class's
# share identical in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Normalize the features using statistics from the training split only, so no
# information from the test split leaks into preprocessing.
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)
X_train = (X_train - mean) / (std + 1e-8)
X_test = (X_test - mean) / (std + 1e-8)

# Reshape for CNN input (if necessary): Conv2D expects a trailing channel axis
if len(X_train.shape) == 3:
    X_train = X_train[..., np.newaxis]
    X_test = X_test[..., np.newaxis]

# Print the shape of the input data
print(f"Input shape: {X_train.shape[1:]}")

# Define the model
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=X_train.shape[1:]),
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(len(le.classes_), activation='softmax')
])

# Compile the model (labels are integer ids, hence the sparse loss)
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Print model summary
model.summary()

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)
print(f'\nTest accuracy: {test_acc}')

# Save the model
model.save('audio_classifier_model.h5')

# Save the label encoder
joblib.dump(le, 'label_encoder.joblib')

# Save the normalization statistics: the model only sees standardized inputs,
# so inference on new audio must apply exactly the same mean/std.
np.savez('feature_norm_stats.npz', mean=mean, std=std)

print("Model and label encoder saved successfully.")
print("Normalization statistics saved to 'feature_norm_stats.npz'.")

Input shape: (13, 52, 1)


Epoch 1/50
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 43ms/step - accuracy: 0.7257 - loss: 0.7036 - val_accuracy: 0.8972 - val_loss: 0.2985
Epoch 2/50
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 58ms/step - accuracy: 0.9172 - loss: 0.2541 - val_accuracy: 0.9315 - val_loss: 0.1924
Epoch 3/50
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 48ms/step - accuracy: 0.9509 - loss: 0.1541 - val_accuracy: 0.9540 - val_loss: 0.1435
Epoch 4/50
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 43ms/step - accuracy: 0.9642 - loss: 0.1061 - val_accuracy: 0.9455 - val_loss: 0.1478
Epoch 5/50
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 59ms/step - accuracy: 0.9685 - loss: 0.0843 - val_accuracy: 0.9548 - val_loss: 0.1497
Epoch 6/50
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 41ms/step - accuracy: 0.9775 - loss: 0.0693 - val_accuracy: 0.9579 - val_loss: 0.1335
Epoch 7/50
[1m161/1




Test accuracy: 0.9856697916984558
Model and label encoder saved successfully.


In [None]:
# Mount Google Drive (Colab only). DATA_DIR points inside /content/drive, so this
# cell has to be executed before any cell that reads the dataset.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Reload the saved model so the evaluation exercises exactly what was written
# to disk (the name must be `model`, which the prediction call below uses).
model = tf.keras.models.load_model('audio_classifier_model.h5')

# Load the label encoder
le = joblib.load('label_encoder.joblib')


def load_test_data(test_data_path):
    """Placeholder for loading an external test set.

    Not implemented: it ignores ``test_data_path`` and returns None. The
    evaluation below uses the held-out split from the training cell instead.
    """
    pass

# The held-out split (already normalized and reshaped) comes from the training
# cell; this raises immediately if that cell has not been run in this kernel.
assert len(X_test) == len(y_test), "X_test and y_test must contain the same number of samples"

# Get predictions
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

# Pin the label set so reports and matrices always cover every known class,
# even if one happens to be absent from the test split.
class_ids = np.arange(len(le.classes_))

# Calculate and print class-wise metrics
print("\nClass-wise Metrics:")
class_report = classification_report(
    y_test, y_pred_classes, labels=class_ids, target_names=le.classes_, output_dict=True
)
print(classification_report(y_test, y_pred_classes, labels=class_ids, target_names=le.classes_))

# Create a DataFrame for easier manipulation
df_class_report = pd.DataFrame(class_report).transpose()

# Plot class-wise metrics. Select the per-class rows and metric columns by
# name instead of by position so the summary rows are excluded reliably.
per_class_metrics = df_class_report.loc[list(le.classes_), ['precision', 'recall', 'f1-score']]
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(per_class_metrics, annot=True, cmap="YlGnBu", fmt='.2f', ax=ax)
ax.set_title("Class-wise Metrics Heatmap")
fig.tight_layout()
fig.savefig('class_wise_metrics_heatmap.png')
plt.close(fig)

# Plot confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred_classes, labels=class_ids)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
fig.tight_layout()
fig.savefig('confusion_matrix.png')
plt.close(fig)

print("Evaluation complete. Visualizations saved as 'class_wise_metrics_heatmap.png' and 'confusion_matrix.png'.")

# Function to predict on new data
def predict_audio(file_path, model, label_encoder, feature_mean=None, feature_std=None):
    """Classify a single audio file with the trained model.

    The file goes through the same preprocessing as the training data: MFCC
    extraction, standardization with the training statistics, and the addition
    of batch and channel axes.

    Args:
        file_path: Path of the audio file to classify.
        model: Trained Keras model exposing ``predict``.
        label_encoder: Fitted LabelEncoder used to map the class id back to
            its label.
        feature_mean: Per-feature mean used for standardization. Defaults to
            the notebook-level ``mean`` computed in the training cell.
        feature_std: Per-feature standard deviation. Defaults to the
            notebook-level ``std`` computed in the training cell.

    Returns:
        Tuple ``(predicted_class, confidence)`` where ``confidence`` is the
        softmax probability of the predicted class.

    Raises:
        ValueError: If no statistics are passed and the training cell has not
            defined ``mean``/``std`` in this kernel.
    """
    if feature_mean is None or feature_std is None:
        try:
            feature_mean = mean if feature_mean is None else feature_mean
            feature_std = std if feature_std is None else feature_std
        except NameError:
            raise ValueError(
                "Normalization statistics are unavailable: run the training cell "
                "or pass feature_mean and feature_std explicitly."
            ) from None

    mfccs = extract_features(file_path)
    # Same standardization as the training split (epsilon avoids division by zero).
    mfccs = (mfccs - feature_mean) / (feature_std + 1e-8)
    # The network expects input of shape (batch, n_mfcc, frames, channels).
    features = mfccs[np.newaxis, ..., np.newaxis]

    prediction = model.predict(features, verbose=0)
    predicted_class = label_encoder.inverse_transform([np.argmax(prediction)])[0]
    confidence = np.max(prediction)

    return predicted_class, confidence




[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step

Class-wise Metrics:
                 precision    recall  f1-score   support

      car_crash       0.99      0.99      0.99       332
gunshot_dataset       0.98      0.97      0.97       295
          other       0.99      0.99      0.99       885
   road_traffic       0.99      0.95      0.97        93

       accuracy                           0.99      1605
      macro avg       0.99      0.98      0.98      1605
   weighted avg       0.99      0.99      0.99      1605

Evaluation complete. Visualizations saved as 'class_wise_metrics_heatmap.png' and 'confusion_matrix.png'.
