In [1]:
import os
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras import layers, models

In [2]:
def preprocess_data(file_path, max_time_steps=300, sample_rate=16000, duration=3, n_mels=128):
    audio, _ = librosa.load(file_path, sr=sample_rate, duration=duration)

    # Extract Mel spectrogram using librosa
    mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=n_mels,hop_length=160)
    mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

    # Ensure all spectrograms have the same width (time steps)
    if mel_spectrogram.shape[1] < max_time_steps:
        mel_spectrogram = np.pad(mel_spectrogram, ((0, 0), (0, max_time_steps - mel_spectrogram.shape[1])), mode='constant')
    else:
        mel_spectrogram = mel_spectrogram[:, :max_time_steps]

    return mel_spectrogram

In [3]:
# Set your file paths and constants
TRAINING_LABEL = '/data/common_source/datasets/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.train.trn.txt'
TRAINING_DATA = '/data/common_source/datasets/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_train/flac'
VALIDATION_DATA = '/data/common_source/datasets/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_dev/flac'
VALIDATION_LABEL = '/data/common_source/datasets/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.dev.trl.txt'
SAMPLE_RATE = 16000  # Adjust if your sample rate is different
DURATION = 3  # Adjust the duration of your audio samples
N_MELS = 128  # Adjust the number of mel filters

In [4]:
# Load labels for training Data
train_labels = {}

with open(TRAINING_LABEL, 'r') as label_file:
    lines = label_file.readlines()

for line in lines:
    parts = line.strip().split()
    file_name = parts[1]
    label = 1 if parts[-1] == "bonafide" else 0
    train_labels[file_name] = label

X = []
y = []

max_time_steps = 300  # Define the maximum time steps for your model

for file_name, label in train_labels.items():
    file_path = os.path.join(TRAINING_DATA, file_name + ".flac")

    # Use the preprocess_data function
    mel_spectrogram = preprocess_data(file_path, max_time_steps=max_time_steps)

    X.append(mel_spectrogram)
    y.append(label)

X = np.array(X)
y = np.array(y)



In [5]:
# Reshape input data to match the required input shape for ResNet
X_reshaped = X.reshape((X.shape[0], X.shape[1], X.shape[2], 1))

print(X_reshaped.shape)

(25380, 128, 300, 1)


In [6]:
# Define input shape and number of classes
input_shape = (X_reshaped.shape[1], X_reshaped.shape[2], 1)
num_classes = 2  # Assuming you have two classes (0 and 1)
print(input_shape)

(128, 300, 1)


In [7]:
# Load and preprocess evaluation data
eval_X = []
eval_y = []

with open(VALIDATION_LABEL, 'r') as eval_label_file:
    eval_lines = eval_label_file.readlines()

eval_labels = {}

for line in eval_lines:
    parts = line.strip().split()
    file_name = parts[1]
    label = 1 if parts[-1] == "bonafide" else 0
    eval_labels[file_name] = label

max_time_steps = 300  # Define the maximum time steps for your model

for file_name, label in eval_labels.items():
    file_path = os.path.join(VALIDATION_DATA, file_name + ".flac")

    # Use the preprocess_data function
    mel_spectrogram = preprocess_data(file_path, max_time_steps=max_time_steps)

    eval_X.append(mel_spectrogram)
    eval_y.append(label)

eval_X = np.array(eval_X)
eval_y = np.array(eval_y)




In [8]:
# Reshape input data to match the required input shape for ResNet
eval_X_reshaped = eval_X.reshape((eval_X.shape[0], eval_X.shape[1], eval_X.shape[2], 1))

print(eval_X_reshaped.shape)

(24844, 128, 300, 1)


In [9]:
# Define the ResNet block
def resnet_block(x, filters, kernel_size=3, stride=1, conv_shortcut=False):
    shortcut = x
    if conv_shortcut:
        shortcut = layers.Conv2D(filters, 1, strides=stride)(shortcut)
        shortcut = layers.BatchNormalization()(shortcut)

    x = layers.Conv2D(filters, kernel_size, strides=stride, padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)

    x = layers.Conv2D(filters, kernel_size, padding='same')(x)
    x = layers.BatchNormalization()(x)

    x = layers.add([x, shortcut])
    x = layers.Activation('relu')(x)
    return x

# Build the ResNet model
def build_resnet(input_shape, num_classes):
    input_tensor = layers.Input(shape=input_shape)

    x = layers.Conv2D(64, 7, strides=2, padding='same')(input_tensor)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.MaxPooling2D(3, strides=2, padding='same')(x)

    # ResNet blocks
    for size in [64, 128, 256, 512]:
        x = resnet_block(x, size, conv_shortcut=True)
        x = resnet_block(x, size)

    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dense(num_classes, activation='softmax')(x)

    model = models.Model(inputs=input_tensor, outputs=x, name='resnet_model')
    return model

In [10]:
# Build ResNet model
resnet_model = build_resnet(input_shape, num_classes)

2023-11-29 09:10:14.538577: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-29 09:10:15.334506: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22834 MB memory:  -> device: 0, name: NVIDIA TITAN RTX, pci bus id: 0000:1a:00.0, compute capability: 7.5
2023-11-29 09:10:15.335167: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 22758 MB memory:  -> device: 1, name: NVIDIA TITAN RTX, pci bus id: 0000:68:00.0, compute capability: 7.5


In [11]:
# Compile the model
resnet_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [12]:
# Display the model summary
resnet_model.summary()

Model: "resnet_model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 128, 300, 1  0           []                               
                                )]                                                                
                                                                                                  
 conv2d (Conv2D)                (None, 64, 150, 64)  3200        ['input_1[0][0]']                
                                                                                                  
 batch_normalization (BatchNorm  (None, 64, 150, 64)  256        ['conv2d[0][0]']                 
 alization)                                                                                       
                                                                                       

In [13]:

from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y, test_size=0.2, random_state=42)

# Define a model checkpoint to save the best weights during training
checkpoint = ModelCheckpoint('baseline_LA.h5', monitor='val_loss', save_best_only=True)

In [14]:
# Train the model

epochs = 50  # Adjust the number of epochs as needed

history = resnet_model.fit(X_train, y_train, epochs=epochs, validation_data=(eval_X, eval_y))

Epoch 1/50


2023-11-29 09:10:47.789030: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8600
2023-11-29 09:10:48.314485: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


Epoch 2/50

KeyboardInterrupt: 

: 

In [26]:
# Evaluate the model on the test set
loss, accuracy = history.evaluate(X_test, y_test)
print(f'Test Loss: {loss:.4f}, Test Accuracy: {accuracy * 100:.2f}%')

# Evaluate the model on the separate evaluation dataset
eval_loss, eval_accuracy = history.evaluate(eval_X_reshaped, eval_y)
print(f'Evaluation Loss: {eval_loss:.4f}, Evaluation Accuracy: {eval_accuracy * 100:.2f}%')

NameError: name 'history' is not defined

In [27]:
import matplotlib.pyplot as plt

# Plot training & validation loss values
plt.figure(figsize=(10, 5))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.show()

# Plot training & validation accuracy values
plt.figure(figsize=(10, 5))
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='lower right')
plt.show()

NameError: name 'history' is not defined

<Figure size 1000x500 with 0 Axes>

In [28]:
# Save the trained model
resnet_model.save("models/basic_resnet_test.h5")

In [30]:
model_new = tf.keras.models.load_model("models/basic_resnet_test.h5")

# Evaluate the model on the test set
loss, accuracy = model_new.evaluate(X_test, y_test)
print(f'Test Loss: {loss:.4f}, Test Accuracy: {accuracy * 100:.2f}%')

# Evaluate the model on the separate evaluation dataset
eval_loss, eval_accuracy = model_new.evaluate(eval_X, eval_y)
print(f'Evaluation Loss: {eval_loss:.4f}, Evaluation Accuracy: {eval_accuracy * 100:.2f}%')


Test Loss: 0.0043, Test Accuracy: 99.82%
Evaluation Loss: 0.0058, Evaluation Accuracy: 99.86%


: 

# PREDICTION

In [3]:
# Load Model and Make Predictions
def makePrediction(audio_file_path, model_file, max_time_steps):
    mel_spectrogram = preprocess_data(audio_file_path, max_time_steps=max_time_steps)

    #reshaping the spectrogram 
    input_data = np.expand_dims(mel_spectrogram, axis=0)

    print(input_data.shape)

    result = ''
    threshold = 0.377089

    # Predict using the loaded classifier
    prediction = model_file.predict(input_data)

    # Convert the prediction to a human-readable label
    if (prediction[0][1] > threshold):
        result = "BONAFIDE"
    else:
        result = "SPOOF"    

    return result

In [5]:
import tensorflow as tf

max_time_steps = 109

audio_file1 = "/home/jonat/test2.wav"
audio_file2 = "/home/jonat/MAIN-PROJECT/test-audios/5sec-audio.wav"
audio_file3 = "/home/jonat/MAIN-PROJECT/test-audios/tts-robot.wav"


# Load the pre-trained model
model = tf.keras.models.load_model('/home/jonat/AudioDeepFake_Baseline_Test/models/basic_resnet.h5')

prediction_result = makePrediction(audio_file3,model_file=model,max_time_steps=109)

print(prediction_result)



(1, 128, 109)
SPOOF
