In [5]:
import os
import librosa
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.image import resize
from tensorflow.keras.models import load_model

In [6]:
# Load and preprocess audio data
def load_and_preprocess_data(data_dir, classes, target_shape=(128, 128)):
    """Load every WAV file under ``data_dir/<class_name>/`` as a resized Mel spectrogram.

    Args:
        data_dir: Directory that contains one sub-directory per class.
        classes: Sequence of class (sub-directory) names. A class's position in
            this sequence is used as its integer label.
        target_shape: ``(height, width)`` each spectrogram is resized to.

    Returns:
        A ``(data, labels)`` pair of NumPy arrays. ``data`` stacks one resized
        spectrogram per file, each with a trailing channel axis of size 1;
        ``labels`` holds the matching integer class indices. Both arrays are
        empty when no WAV file is found.
    """
    data = []
    labels = []

    for label, class_name in enumerate(classes):
        class_dir = os.path.join(data_dir, class_name)
        # os.listdir returns entries in arbitrary order; sorting makes the sample
        # order (and anything derived from it) reproducible between runs.
        for filename in sorted(os.listdir(class_dir)):
            # Match the extension case-insensitively so "clip.WAV" is not silently skipped.
            if not filename.lower().endswith(".wav"):
                continue

            file_path = os.path.join(class_dir, filename)
            # sr=None keeps the file's own sample rate instead of resampling.
            audio_data, sample_rate = librosa.load(file_path, sr=None)
            mel_spectrogram = librosa.feature.melspectrogram(
                y=audio_data, sr=sample_rate
            )
            # resize expects a channel axis, so add one before resizing.
            mel_spectrogram = resize(
                np.expand_dims(mel_spectrogram, axis=-1), target_shape
            )
            # Store a plain ndarray so the final np.array call stacks arrays
            # rather than tensor objects.
            data.append(np.asarray(mel_spectrogram))
            labels.append(label)

    return np.array(data), np.array(labels)

In [7]:
# Class names for each split; a class's position in the list is its integer label.
classes = ["fake", "real"]
classes2 = ["fake", "real"]

# Nothing is split here: training and testing data are read from separate folders.
data_dir = "D:/Programming/VoiceRecog/for-norm/training"
data_dir2 = "D:/Programming/VoiceRecog/for-norm/testing"

# Training split: spectrograms plus one-hot encoded labels.
data, labels = load_and_preprocess_data(data_dir, classes)
labels = to_categorical(labels, num_classes=len(classes))
X_train = data
y_train = labels

# Testing split, prepared the same way.
data2, labels2 = load_and_preprocess_data(data_dir2, classes2)
labels2 = to_categorical(labels2, num_classes=len(classes2))
X_test = data2
y_test = labels2



In [8]:
# Alternative kept for reference (disabled): instead of loading the separate
# testing folder, hold out a random 20% of the training folder as the test set.
# data, labels = load_and_preprocess_data(data_dir, classes)
# labels = to_categorical(labels, num_classes=len(classes))  # Convert labels to one-hot encoding
# X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)


In [9]:
# The network input matches the shape of a single training sample.
input_shape = X_train[0].shape
input_layer = Input(shape=input_shape)

# Two convolution + max-pooling stages.
x = Conv2D(filters=32, kernel_size=(3, 3), activation="relu")(input_layer)
x = MaxPooling2D(pool_size=(2, 2))(x)
x = Conv2D(filters=64, kernel_size=(3, 3), activation="relu")(x)
x = MaxPooling2D(pool_size=(2, 2))(x)

# Dense head: flatten, one hidden layer, then one softmax unit per class.
x = Flatten()(x)
x = Dense(units=64, activation="relu")(x)
output_layer = Dense(units=len(classes), activation="softmax")(x)

model = Model(inputs=input_layer, outputs=output_layer)

In [10]:
# Categorical cross-entropy pairs with the one-hot labels and the softmax output;
# accuracy is the only extra metric tracked.
model.compile(
    loss="categorical_crossentropy",
    optimizer=Adam(learning_rate=0.001),
    metrics=["accuracy"],
)

In [11]:
# Train for 20 epochs in mini-batches of 32, passing the testing split as
# validation data.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=20,
    validation_data=(X_test, y_test),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1e8bd4b21c0>

In [12]:
# evaluate() returns the loss followed by the compiled metrics, so despite its
# name test_accuracy holds [loss, accuracy]; index 1 is the accuracy.
test_accuracy = model.evaluate(x=X_test, y=y_test, verbose=0)
print(test_accuracy[1])

0.931592583656311


In [13]:
# Persist the trained model to an HDF5 file; the path is relative, so it lands
# in the current working directory.
model.save("audio_classification_model.h5")

In [14]:
# Class names; entry i of a prediction vector is reported as classes[i].
classes = ["fake", "real"]

# Height and width each spectrogram is resized to before prediction.
target_shape = (128, 128)

# Restore the trained network from its HDF5 file.
model = load_model("audio_classification_model.h5")


# Function to preprocess and classify an audio file
def test_audio(file_path, model):
    # Load and preprocess the audio file
    audio_data, sample_rate = librosa.load(file_path, sr=None)
    mel_spectrogram = librosa.feature.melspectrogram(y=audio_data, sr=sample_rate)
    mel_spectrogram = resize(np.expand_dims(mel_spectrogram, axis=-1), target_shape)
    mel_spectrogram = tf.reshape(mel_spectrogram, (1,) + target_shape + (1,))

    # Make predictions
    predictions = model.predict(mel_spectrogram)

    # Get the class probabilities
    class_probabilities = predictions[0]

    # Get the predicted class index
    predicted_class_index = np.argmax(class_probabilities)

    return class_probabilities, predicted_class_index
# Classify one recording.
# NOTE: this file comes from the training folder, so the result only shows that
# the pipeline runs end to end; it does not measure performance on unseen audio.
test_audio_file = "D:/Programming/VoiceRecog/for-norm/training/real/file6.wav_16k.wav_norm.wav_mono.wav_silence.wav"
class_probabilities, predicted_class_index = test_audio(test_audio_file, model)

# Display results for all classes
for i, class_label in enumerate(classes):
    probability = class_probabilities[i]
    print(f"Class: {class_label}, Probability: {probability:.4f}")

# The probability of the winning class is the model's confidence in this single
# prediction. It is not an accuracy, which would need ground-truth labels over
# many samples, so it is reported as "Confidence".
predicted_class = classes[predicted_class_index]
confidence = class_probabilities[predicted_class_index]
accuracy = confidence  # old name kept so any later cell that reads it still works
print(f"The audio is classified as: {predicted_class}")
print(f"Confidence: {confidence:.4f}")

Class: fake, Probability: 0.0000
Class: real, Probability: 1.0000
The audio is classified as: real
Accuracy: 1.0000
