# Constructing The Model

## Imports

In [15]:
import cv2
import IPython.display as ipd
import keras
import librosa # Compatible with python 3.10
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
import seaborn as sns
import tensorflow as tf

from keras.models import Sequential
from keras.layers import BatchNormalization, Conv2D, Dense, Dropout, Flatten, MaxPool2D, Reshape
from PIL import Image
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

## Constants

#### Directories

In [16]:
# Root folder that holds the dataset
base_dir = "dataset"

# Split folders of the for-norm dataset (note the doubled "for-norm" folder level)
train_dir, test_dir, val_dir = (
    os.path.join(base_dir, "for-norm", "for-norm", split_name)
    for split_name in ("training", "testing", "validation")
)

# Per-class folders inside each split, always unpacked in (fake, real) order
train_dir_fake, train_dir_real = (
    os.path.join(train_dir, class_name) for class_name in ("fake", "real")
)
test_dir_fake, test_dir_real = (
    os.path.join(test_dir, class_name) for class_name in ("fake", "real")
)
val_dir_fake, val_dir_real = (
    os.path.join(val_dir, class_name) for class_name in ("fake", "real")
)

In [17]:
# Echo every configured path so a typo is easy to spot.
# end="\n\n" leaves one empty line after each group of paths.
print("Base directory:", base_dir, end="\n\n")

print("Train directory:", train_dir)
print("Test directory:", test_dir)
print("Val directory:", val_dir, end="\n\n")

print("Train (fake) directory:", train_dir_fake)
print("Train (real) directory:", train_dir_real)
print("Test (fake) directory:", test_dir_fake)
print("Test (real) directory:", test_dir_real)
print("Val (fake) directory:", val_dir_fake)
print("Val (real) directory:", val_dir_real)

Base directory: dataset

Train directory: dataset/for-norm/for-norm/training
Test directory: dataset/for-norm/for-norm/testing
Val directory: dataset/for-norm/for-norm/validation

Train (fake) directory: dataset/for-norm/for-norm/training/fake
Train (real) directory: dataset/for-norm/for-norm/training/real
Test (fake) directory: dataset/for-norm/for-norm/testing/fake
Test (real) directory: dataset/for-norm/for-norm/testing/real
Val (fake) directory: dataset/for-norm/for-norm/validation/fake
Val (real) directory: dataset/for-norm/for-norm/validation/real


#### Other Constants

In [18]:
# Training schedule handed to Model.fit
EPOCHS = 20            # number of epochs
STEPS_PER_EPOCH = 300  # batches run per epoch
BATCH_SIZE = 16        # samples per batch

# NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed
NUM_WORKERS = 12

## Obtain Data

In [19]:
def _list_wav_files(directory):
    """Return the paths of the ``.wav`` files directly inside ``directory``.

    ``os.listdir`` yields entries in arbitrary order, so the names are sorted
    to keep the resulting list identical from run to run.

    Args:
        directory: folder to scan (not recursive).

    Returns:
        List of ``directory``-joined paths whose file name ends with ".wav".
    """
    return [
        os.path.join(directory, file)
        for file in sorted(os.listdir(directory))
        if file.endswith(".wav")
    ]


# Obtain waveform (.wav) audio files for every split and class
train_fake_audio_path = _list_wav_files(train_dir_fake)
train_real_audio_path = _list_wav_files(train_dir_real)

test_fake_audio_path = _list_wav_files(test_dir_fake)
test_real_audio_path = _list_wav_files(test_dir_real)

validation_fake_audio_path = _list_wav_files(val_dir_fake)
validation_real_audio_path = _list_wav_files(val_dir_real)

In [20]:
# Convert audio file to spectrogram
# def create_spectrogram(file_path):
#     audio_data, sample_rate = librosa.load(file_path)
#     spectrogram = librosa.stft(audio_data)
#     decibel_spectrogram = librosa.amplitude_to_db(abs(spectrogram))
#     return decibel_spectrogram

# Convert audio file to mel-scale spectrogram
# See https://medium.com/analytics-vidhya/understanding-the-mel-spectrogram-fca2afa2ce53
def create_mel_spectrogram(file_path):
    """Load an audio file and return its mel spectrogram in decibels.

    Args:
        file_path: path of the audio file, passed to ``librosa.load``.

    Returns:
        The output of ``librosa.power_to_db`` applied to the file's mel
        spectrogram, using the spectrogram's maximum as the reference.
    """
    # Loading relies on librosa's defaults; the returned rate is reused below
    waveform, rate = librosa.load(file_path)
    power_spectrogram = librosa.feature.melspectrogram(y=waveform, sr=rate)
    # Power (amplitude squared) -> decibels relative to the loudest bin
    return librosa.power_to_db(power_spectrogram, ref=np.max)

In [None]:
# Use spectrograms as features to train the model
def get_features_and_labels(real_audio_files, fake_audio_files, target_shape=(128, 87)):
    """Build the spectrogram feature array and the matching binary labels.

    Every file is converted with ``create_mel_spectrogram`` and resized to
    ``target_shape`` so that all spectrograms can be stacked into one array.

    Args:
        real_audio_files: paths of the real recordings; each is labelled 0.
        fake_audio_files: paths of the fake recordings; each is labelled 1.
        target_shape: (rows, cols) of every resized spectrogram.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays. Real files come first,
        followed by the fake files, and ``labels[i]`` belongs to ``features[i]``.
    """
    # cv2.resize expects dsize as (width, height), i.e. (cols, rows)
    dsize = tuple(target_shape[::-1])

    spec_arr = []
    labels = []
    for label, audio_files in ((0, real_audio_files), (1, fake_audio_files)):
        for file in audio_files:
            spectrogram = create_mel_spectrogram(file)
            resized_spectrogram = cv2.resize(spectrogram, dsize, interpolation=cv2.INTER_LINEAR)
            spec_arr.append(resized_spectrogram)
            labels.append(label)

    return np.array(spec_arr), np.array(labels)

In [None]:
# Build the feature/label arrays for each split.
# Every call loads and converts all files of that split.
train_features, train_labels = get_features_and_labels(
    real_audio_files=train_real_audio_path,
    fake_audio_files=train_fake_audio_path,
)
validation_features, validation_labels = get_features_and_labels(
    real_audio_files=validation_real_audio_path,
    fake_audio_files=validation_fake_audio_path,
)
test_features, test_labels = get_features_and_labels(
    real_audio_files=test_real_audio_path,
    fake_audio_files=test_fake_audio_path,
)



In [None]:
# Verify the shapes of the features (one line per split)
print(
    "train features shape: {}".format(train_features.shape),
    "test features shape: {}".format(test_features.shape),
    sep="\n",
)

In [None]:
# Significantly trimmed VGG model to optimize results:
# two conv-conv-pool stages followed by a small dense classifier.
trimmed_vgg = Sequential([
    # Append a channel axis so each spectrogram becomes a 1-channel image
    Reshape((128, 87, 1),input_shape=train_features.shape[1:]),

    # Stage 1: two 3x3 convolutions with 64 filters, then 2x2 max pooling
    Conv2D(filters=64, kernel_size=(3,3), padding="same", activation='relu'),
    Conv2D(filters=64, kernel_size=(3,3), padding="same", activation='relu'),
    MaxPool2D(pool_size=(2,2), strides=(2,2)),

    # Stage 2: two 3x3 convolutions with 128 filters, then 2x2 max pooling
    Conv2D(filters=128, kernel_size=(3,3), padding="same", activation='relu'),
    Conv2D(filters=128, kernel_size=(3,3), padding="same", activation='relu'),
    MaxPool2D(pool_size=(2,2), strides=(2,2)),

    # Classifier head ending in a single sigmoid unit
    Flatten(),
    Dense(units=256,activation="relu"),
    Dense(units=256,activation="relu"),
    Dense(1, activation="sigmoid"),
])

trimmed_vgg.summary()

In [None]:
# Compile the model for binary classification:
#   - optimizer: Adam constructed with its default arguments
#     (the Adam optimiser helps the model get unstuck when stuck at local minima)
#   - loss: binary cross-entropy
#   - metrics: accuracy is reported alongside the loss
trimmed_vgg.compile(optimizer=keras.optimizers.Adam(),
                    loss=keras.losses.binary_crossentropy, 
                    metrics=["accuracy"])

In [None]:
# Train the model; the returned History object feeds the training-curve plots.
# Each epoch runs STEPS_PER_EPOCH batches of BATCH_SIZE samples.
# validation_data is passed as the (x_val, y_val) tuple that Keras documents;
# a list is not unpacked into inputs/targets by every Keras/TensorFlow release.
trimmed_vgg_history = trimmed_vgg.fit(
    train_features,
    train_labels,
    validation_data=(validation_features, validation_labels),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
)

In [None]:
# Note that extremely high accuracy is not due to model overfitting,
# it's because the dataset is very "easy" relative to actual modern state of deepfaked audio
# i.e. the fakes are mostly easily detectable, even to the human ear

# Training curves side by side: accuracy on the left, loss on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7))

# Left panel: training vs. validation accuracy per epoch
ax1.plot(trimmed_vgg_history.history["accuracy"], label="Accuracy")
ax1.plot(trimmed_vgg_history.history['val_accuracy'], label="Validation Accuracy")
ax1.set(title="Accuracy", xlabel="Epoch")
ax1.legend()

# Right panel: training vs. validation loss per epoch
ax2.plot(trimmed_vgg_history.history["loss"], label="Loss")
ax2.plot(trimmed_vgg_history.history["val_loss"], label="Validation Loss")
ax2.set(title="Loss", xlabel="Epoch")
ax2.legend()

plt.show()

In [None]:
# Score the trained model on the held-out test split.
# evaluate returns the loss followed by the compiled "accuracy" metric.
trimmed_vgg_loss, trimmed_vgg_accuracy = trimmed_vgg.evaluate(
    x=test_features,
    y=test_labels,
)

In [None]:

# Persist the final trained model to an HDF5 (.h5) file in the working directory
trimmed_vgg.save(filepath='deepfake_audio_detector.h5')