In [2]:
import os
import numpy as np
import librosa
from tensorflow.keras.models import load_model
from sklearn.metrics import accuracy_score

# Load both Model 1 and Model 2
model1 = load_model('models/agamjeet-model.h5')  # Replace with the path to Model 1's saved file
model2 = load_model('models/jonat-model.h5') 
model3 = load_model('models/gautham-model.h5')


2023-10-09 10:53:21.649682: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-09 10:53:22.482396: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22843 MB memory:  -> device: 0, name: NVIDIA TITAN RTX, pci bus id: 0000:1a:00.0, compute capability: 7.5
2023-10-09 10:53:22.483095: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 22773 MB memory:  -> device: 1, name: NVIDIA TITAN RTX, pci bus id: 0000:68:00.0, compute capability: 7.5


In [3]:
# Define the paths to testing data for each model
testing_dir_model1 = '/home/jonat/datasets/for-2seconds/testing'  # Adjust the path
testing_dir_model2 = '/home/jonat/datasets/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_eval/flac'
testing_dir_model3 = '/home/jonat/datasets/release_in_the_wild'
LABEL_FILE_PATH_MODEL2 = '/home/jonat/datasets/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.eval.trl.txt'
LABEL_FILE_PATH_MODEL3 = '/home/jonat/datasets/release_in_the_wild/meta.csv'

#CONSTANT VARIABLES
MAX_PAD_LENGTH = 86
NUM_MFCC_COEFFS = 40
NUM_CLASSES = 2  # Number of classes (bonafide and spoof)
SAMPLE_RATE = 16000  # Sample rate of your audio files
DURATION = 5  # Duration of audio clips in seconds
N_MELS = 128  # Number of Mel frequency bins
MAX_TIME_STEPS = 109

In [4]:
# Define functions for preprocessing the data for each model
def preprocess_data_model1(file_path):
    try:
        audio, sample_rate = librosa.load(file_path)
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=NUM_MFCC_COEFFS)
        pad_width = MAX_PAD_LENGTH - mfccs.shape[1]
        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, max(0, pad_width))), mode='constant')
        return mfccs
    except Exception as e:
        print(f"Error encountered while processing file: {file_path}")
        print(e)
        return None

# Process testing data for Model 1
X_test_model1 = []
y_test_model1 = []

data = []
for label in os.listdir(testing_dir_model1):
    label_dir = os.path.join(testing_dir_model1, label)
    if not os.path.isdir(label_dir):
        continue
    for file_name in os.listdir(label_dir):
        if file_name.endswith('.wav'):
            file_path = os.path.join(label_dir, file_name)
            features = preprocess_data_model1(file_path)
            if features is not None:
                data.append([features, label])
# Shuffle the data
np.random.shuffle(data)

X_test_model1 = np.array([x[0] for x in data])
y_test_model1 = np.array([x[1] for x in data])  

X_test_model1,y_test_model1

(array([[[-2.76277771e+02, -2.49373566e+02, -1.99227371e+02, ...,
          -2.27046265e+02, -1.34770493e+02, -1.03455429e+02],
         [ 1.74620529e+02,  1.85464966e+02,  1.96909729e+02, ...,
           7.07589569e+01,  1.54023010e+02,  1.65170990e+02],
         [ 5.69737358e+01,  4.54773674e+01,  1.74491978e+01, ...,
          -2.83980751e+01, -1.50940132e+01, -5.55561447e+00],
         ...,
         [ 2.46777916e+00,  7.45345497e+00,  5.94833279e+00, ...,
           8.74664211e+00,  1.27606812e+01,  1.21703796e+01],
         [ 4.85905141e-01, -2.36257339e+00,  1.46288171e-01, ...,
           7.01033306e+00,  3.16514158e+00,  2.43539977e+00],
         [-9.34749171e-02, -3.58693719e+00, -2.31290364e+00, ...,
          -1.22441196e+00, -6.32507420e+00, -4.38854647e+00]],
 
        [[-3.08881439e+02, -3.03552643e+02, -2.12761063e+02, ...,
          -1.82072174e+02, -1.09421761e+02, -9.30665588e+01],
         [ 1.54973816e+02,  1.56298920e+02,  1.57697021e+02, ...,
           1.20053299

In [5]:
X_test_model2 = []
y_test_model2 = []

labels = {}

# Define a function to preprocess a single audio file
def preprocess_data_model2(file_path, sample_rate, duration, n_mels, max_time_steps):
    try:
        audio, _ = librosa.load(file_path, sr=sample_rate, duration=duration)
        mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=n_mels)
        mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)
        
        # Ensure all spectrograms have the same width (time steps)
        if mel_spectrogram.shape[1] < max_time_steps:
            mel_spectrogram = np.pad(mel_spectrogram, ((0, 0), (0, max_time_steps - mel_spectrogram.shape[1])), mode='constant')
        else:
            mel_spectrogram = mel_spectrogram[:, :max_time_steps]
        
        return mel_spectrogram
    except Exception as e:
        print(f"Error encountered while processing file: {file_path}")
        return None

with open(LABEL_FILE_PATH_MODEL2, 'r') as label_file:
    lines = label_file.readlines()

for line in lines:
    parts = line.strip().split()
    file_name = parts[1]
    label = 1 if parts[-1] == "bonafide" else 0
    labels[file_name] = label



for file_name in os.listdir(testing_dir_model2):
    if file_name.endswith('.flac'):
        file_path = os.path.join(testing_dir_model2, file_name)
        features = preprocess_data_model2(file_path, SAMPLE_RATE, DURATION, N_MELS, MAX_TIME_STEPS)
        if features is not None:
            X_test_model2.append(features)    
        y_test_model2.append(label)


X_test_model2 = np.array(X_test_model2)
y_test_model2 = np.array(y_test_model2)

In [6]:
def preprocess_data_model3(file_path):
    n_mfcc = 13
    max_length = 100
# Load the audio file
    audio, sr = librosa.load(file_path, sr=None)

    # Extract MFCC features
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

    # Ensure that the MFCCs have the same length as during training
    if mfccs.shape[1] < max_length:
        mfccs = np.pad(mfccs, ((0, 0), (0, max_length - mfccs.shape[1])), mode='constant')
    else:
        mfccs = mfccs[:, :max_length]

    # Reshape the MFCCs to match the input shape expected by the model
    input_shape = (1, n_mfcc, max_length, 1)
    mfccs = mfccs.reshape(input_shape)

    return mfccs

In [7]:
def majority_vote(predictions_model1, predictions_model2,predictions_model3):
    # Perform majority voting
    combined_predictions = []
    for pred1, pred2, pred3 in zip(predictions_model1, predictions_model2, predictions_model3):
        # Choose the class with the majority of votes
        combined_prediction = np.argmax(np.bincount([np.argmax(pred1), np.argmax(pred2), np.argmax(pred3)]))
        combined_predictions.append(combined_prediction)
    
    return np.array(combined_predictions)

In [None]:
# Combine predictions from both models (e.g., using majority vote)
ensemble_predictions = majority_vote(predictions_model1, predictions_model2)

In [9]:
audio_sample_path = '/home/jonat/datasets/for-2seconds/testing/fake/file1040.wav_16k.wav_norm.wav_mono.wav_silence.wav_2sec.wav'
# Preprocess the audio sample for Model 1 (MFCC-based)
# Ensure that the preprocessing matches what was done for Model 1
preprocessed_audio_model1 = preprocess_data_model1(audio_sample_path)  # Implement this function

# Preprocess the audio sample for Model 2 (Mel spectrogram-based)
# Ensure that the preprocessing matches what was done for Model 2
sample_rate = 16000  # Adjust to match the sample rate used during training
duration = 5  # Adjust to match the duration used during training
n_mels = 128  # Adjust to match the number of Mel frequency bins used during training
max_time_steps = 109  # Adjust to match the maximum time steps used during training

preprocessed_audio_model2 = preprocess_data_model2(audio_sample_path, sample_rate, duration, n_mels, max_time_steps)

preprocessed_audio_model3 = preprocess_data_model3(audio_sample_path)

# Make predictions for Model 1
prediction_model1 = model1.predict(np.expand_dims(preprocessed_audio_model1, axis=0))

# Make predictions for Model 2
prediction_model2 = model2.predict(np.expand_dims(preprocessed_audio_model2, axis=0))

prediction_model3 = model3.predict(preprocessed_audio_model3)

# Interpret the predictions
if prediction_model3[0][0] > 0.5:
    result = "fake"
else:
    result = "real"

# Combine predictions using ensemble strategy (e.g., majority voting)
ensemble_prediction = majority_vote(prediction_model1, prediction_model2,prediction_model3)  # Implement this function

# Define a mapping from binary values to class labels
class_labels = {0: "fake", 1: "real"}

# Convert the ensemble_prediction array to a single integer
ensemble_prediction_int = int(ensemble_prediction[0])

# Convert the predictions to class labels
prediction_label_model1 = class_labels.get(int(prediction_model1[0][0]), "Unknown")
prediction_label_model2 = class_labels.get(int(prediction_model2[0][0]), "Unknown")
prediction_label_model3 = class_labels.get(int(prediction_model3[0][0]), "Unknown")

# Print the predictions from both models
print(f"Prediction from Model 1: {prediction_label_model1}")
print(f"Prediction from Model 2: {prediction_label_model2}")
print(f"Prediction from Model 3: {result}")

fakeCount = 0
realCount = 0

if(prediction_label_model1 == "fake"):
    fakeCount+=1
if(prediction_label_model2 == "fake"):
    fakeCount+=1
if(result == "fake"):
    fakeCount+=1

if(prediction_label_model1 == "real"):
    realCount+=1
if(prediction_label_model2 == "real"):
    realCount+=1
if(result == "real"):
    realCount+=1

probability_index = {}

probability_index["Real"] = realCount
probability_index["Fake"] = fakeCount

final_prediction = max(probability_index, key= probability_index.get)

print("This audio sample is : ", final_prediction)  

Prediction from Model 1: fake
Prediction from Model 2: real
Prediction from Model 3: fake
This audio sample is :  Fake
