In [1]:
import os
import numpy as np
import librosa
from tensorflow.keras.models import load_model
from sklearn.metrics import accuracy_score

import timeit
start_time = timeit.default_timer()
# code you want to evaluate



# Load both Model 1 and Model 2
model1 = load_model('models/agamjeet-model.h5') # Replace with the path to Model 1's saved file
model2 = load_model('models/speaker_verification_model.h5') 
model3 = load_model('models/gautham-model-new.h5')


2023-10-15 23:50:42.488779: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-15 23:50:43.291927: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22843 MB memory:  -> device: 0, name: NVIDIA TITAN RTX, pci bus id: 0000:1a:00.0, compute capability: 7.5
2023-10-15 23:50:43.292573: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 22646 MB memory:  -> device: 1, name: NVIDIA TITAN RTX, pci bus id: 0000:68:00.0, compute capability: 7.5


In [2]:
#CONSTANT VARIABLES
MAX_PAD_LENGTH = 86
NUM_MFCC_COEFFS = 40
NUM_CLASSES = 2  # Number of classes (bonafide and spoof)
SAMPLE_RATE = 16000  # Sample rate of your audio files
DURATION = 5  # Duration of audio clips in seconds
MAX_TIME_STEPS = 109
N_MFCC = 13  # Number of MFCC coefficients
HOP_LENGTH = 512  # Hop length for MFCC extraction
WIN_LENGTH = 1024  # Window length for MFCC extraction

In [3]:
# Define functions for preprocessing the data for each model
def preprocess_data_model1(file_path):
    try:
        audio, sample_rate = librosa.load(file_path)
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=NUM_MFCC_COEFFS)
        pad_width = MAX_PAD_LENGTH - mfccs.shape[1]
        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, max(0, pad_width))), mode='constant')
        return mfccs
    except Exception as e:
        print(f"Error encountered while processing file: {file_path}")
        print(e)
        return None

In [4]:
# Define a function to preprocess a single audio file
def preprocess_data_model2(file_path, SAMPLE_RATE, DURATION, N_MFCC, max_time_steps,HOP_LENGTH,WIN_LENGTH):
    try:
        # Load audio file using librosa
        audio, _ = librosa.load(file_path, sr=SAMPLE_RATE, duration=DURATION)

        # Extract MFCC features using librosa
        mfcc = librosa.feature.mfcc(y=audio, sr=SAMPLE_RATE, n_mfcc=N_MFCC, hop_length=HOP_LENGTH, n_fft=WIN_LENGTH)

        # Ensure all MFCC features have the same width (time steps)
        if mfcc.shape[1] < max_time_steps:
            mfcc = np.pad(mfcc, ((0, 0), (0, max_time_steps - mfcc.shape[1])), mode='constant')
        else:
            mfcc = mfcc[:, :max_time_steps]
            
        return mfcc
    except Exception as e:
        print(f"Error encountered while processing file: {file_path}")
        return None

In [5]:
def preprocess_data_model3(file_path):
    n_mfcc = 13
    max_length = 100
# Load the audio file
    audio, sr = librosa.load(file_path, sr=None)

    # Extract MFCC features
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

    # Ensure that the MFCCs have the same length as during training
    if mfccs.shape[1] < max_length:
        mfccs = np.pad(mfccs, ((0, 0), (0, max_length - mfccs.shape[1])), mode='constant')
    else:
        mfccs = mfccs[:, :max_length]

    # Reshape the MFCCs to match the input shape expected by the model
    input_shape = (1, n_mfcc, max_length, 1)
    mfccs = mfccs.reshape(input_shape)

    return mfccs

In [6]:
def majority_vote(predictions_model1, predictions_model2,predictions_model3):
    # Perform majority voting
    combined_predictions = []
    for pred1, pred2, pred3 in zip(predictions_model1, predictions_model2, predictions_model3):
        # Choose the class with the majority of votes
        combined_prediction = np.argmax(np.bincount([np.argmax(pred1), np.argmax(pred2), np.argmax(pred3)]))
        combined_predictions.append(combined_prediction)
    
    return np.array(combined_predictions)

In [17]:
audio_sample_path = '/home/jonat/MAIN PROJECT/test-audios/5sec-audio.wav'
# Preprocess the audio sample for Model 1 (MFCC-based)
# Ensure that the preprocessing matches what was done for Model 1
preprocessed_audio_model1 = preprocess_data_model1(audio_sample_path)  # Implement this function

# Preprocess the audio sample for Model 2 (Mel spectrogram-based)
# Ensure that the preprocessing matches what was done for Model 2
SAMPLE_RATE = 16000  # Sample rate of your audio files
DURATION = 5  # Duration of audio clips in seconds
MAX_TIME_STEPS = 109
N_MFCC = 13  # Number of MFCC coefficients
HOP_LENGTH = 512  # Hop length for MFCC extraction
WIN_LENGTH = 1024  # Window length for MFCC extraction

preprocessed_audio_model2 = preprocess_data_model2(audio_sample_path, SAMPLE_RATE, DURATION, N_MFCC, MAX_TIME_STEPS,HOP_LENGTH,WIN_LENGTH)

preprocessed_audio_model3 = preprocess_data_model3(audio_sample_path)

# Make predictions for Model 1
prediction_model1 = model1.predict(np.expand_dims(preprocessed_audio_model2, axis=0))

# Make predictions for Model 2
prediction_model2 = model2.predict(np.expand_dims(preprocessed_audio_model2, axis=0))

prediction_model3 = model3.predict(np.expand_dims(preprocessed_audio_model2, axis=0))

# # Interpret the predictions
# if prediction_model3[0][0] > 0.5:
#     result = "fake"
# else:
#     result = "real"

# Combine predictions using ensemble strategy (e.g., majority voting)
ensemble_prediction = majority_vote(prediction_model1, prediction_model2,prediction_model3)  # Implement this function

# Define a mapping from binary values to class labels
class_labels = {0: "fake", 1: "real"}

# Convert the ensemble_prediction array to a single integer
ensemble_prediction_int = int(ensemble_prediction[0])
print(ensemble_prediction_int)

# Convert the predictions to class labels
prediction_label_model1 = class_labels.get(int(prediction_model1[0][0]), "Unknown")
prediction_label_model2 = class_labels.get(int(prediction_model2[0][0]), "Unknown")
prediction_label_model3 = class_labels.get(int(prediction_model3[0][0]), "Unknown")

# Print the predictions from both models
print(f"Prediction from Model 1: {prediction_label_model1}")
print(f"Prediction from Model 2: {prediction_label_model2}")
print(f"Prediction from Model 3: {prediction_label_model3}")

fakeCount = 0
realCount = 0

if(prediction_label_model1 == "fake"):
    fakeCount+=1
if(prediction_label_model2 == "fake"):
    fakeCount+=1
if(prediction_label_model3 == "fake"):
    fakeCount+=1

if(prediction_label_model1 == "real"):
    realCount+=1
if(prediction_label_model2 == "real"):
    realCount+=1
if(prediction_label_model3 == "real"):
    realCount+=1

probability_index = {}

probability_index["Real"] = realCount
probability_index["Fake"] = fakeCount

final_prediction = max(probability_index, key= probability_index.get)

print("This audio sample is : ", final_prediction)

elapsed = timeit.default_timer() - start_time

print(elapsed)

0
Prediction from Model 1: fake
Prediction from Model 2: fake
Prediction from Model 3: fake
This audio sample is :  Fake
921.0761893499875
