In [None]:
# %pip install resampy
# %pip install tqdm
# %pip install imbalanced-learn  # provides the `imblearn` import

In [None]:
import numpy as np
import pandas as pd
import os
import librosa
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.utils import to_categorical


In [None]:
# Root directory of the dataset; later cells read its 'DEMO', 'FAKE' and 'REAL' subfolders.
audio_file_path = "./Dataset/"

In [None]:
def visualize(audio_file_path, Tag):
    """Plot five views of one audio file in a single stacked figure.

    The panels are, top to bottom: raw waveform, log-frequency STFT
    spectrogram (dB), mel spectrogram (dB), CQT chromagram and MFCCs.

    Parameters
    ----------
    audio_file_path : str
        Path of the audio file, passed to ``librosa.load``.
    Tag : str
        Short label (e.g. "Real") inserted into the figure and panel titles.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # ``librosa.display`` is a submodule; on librosa releases that do not load
    # it automatically, ``librosa.display.specshow`` raises AttributeError
    # unless the submodule has been imported explicitly.
    import librosa.display

    ad, sr = librosa.load(audio_file_path)

    # Magnitude STFT converted to dB relative to its own peak.
    spec = np.abs(librosa.stft(ad))
    spec = librosa.amplitude_to_db(spec, ref=np.max)

    # Mel power spectrogram converted to dB relative to its own peak.
    mel_spect = librosa.feature.melspectrogram(y=ad, sr=sr)
    mel_spect = librosa.power_to_db(mel_spect, ref=np.max)

    chroma = librosa.feature.chroma_cqt(y=ad, sr=sr, bins_per_octave=36)

    mfcc = librosa.feature.mfcc(y=ad, sr=sr)

    fig, ax = plt.subplots(5, 1, figsize=(10, 15), constrained_layout=True)
    fig.suptitle(f'Visualization of {Tag} audio', fontsize=16)

    # The waveform is plotted against the sample index.
    ax[0].plot(ad)
    ax[0].set_xlabel('Sample')
    ax[0].set_ylabel('Amplitude')
    ax[0].set_title(f'{Tag} audio Waveform')

    img = librosa.display.specshow(spec, sr=sr, x_axis='time', y_axis='log', ax=ax[1])
    fig.colorbar(img, ax=ax[1], format="%+2.0f dB")
    ax[1].set_title(f'{Tag} audio spectrogram')

    img = librosa.display.specshow(mel_spect, sr=sr, x_axis='time', y_axis='mel', ax=ax[2])
    fig.colorbar(img, ax=ax[2], format="%+2.0f dB")
    ax[2].set_title(f'{Tag} audio Mel Spectrogram')

    # Chroma and MFCC values are never converted to decibels above, so their
    # colorbars use the default numeric format instead of a "dB" label.
    img = librosa.display.specshow(chroma, sr=sr, x_axis='time', y_axis='chroma', ax=ax[3])
    fig.colorbar(img, ax=ax[3])
    ax[3].set_title(f'{Tag} audio Chroma')

    img = librosa.display.specshow(mfcc, sr=sr, x_axis='time', ax=ax[4])
    fig.colorbar(img, ax=ax[4])
    ax[4].set_title(f'{Tag} audio MFCC')

    plt.show()

In [None]:
# Locate the two demo clips: the file whose name contains "original" is the
# real recording, and any other entry in the folder is taken as the fake one.
demo_audio_paths = os.path.join(audio_file_path, 'DEMO')

# Start from None so that a missing clip is reported in this cell instead of
# surfacing later as a NameError (or as a stale value left in the kernel).
real_audio_path = None
fake_audio_path = None

# os.listdir returns entries in arbitrary order; sorting makes the choice
# deterministic when several files match (the last one in sorted order wins).
for item in sorted(os.listdir(demo_audio_paths)):
    if "original" in item:
        real_audio_path = os.path.join(demo_audio_paths, item)
    else:
        fake_audio_path = os.path.join(demo_audio_paths, item)

if real_audio_path is None or fake_audio_path is None:
    raise FileNotFoundError(
        f"Expected one file containing 'original' and at least one other file in {demo_audio_paths!r}"
    )

In [None]:
# Draw the multi-panel figure for the demo clip whose filename contains "original".
visualize(audio_file_path=real_audio_path, Tag="Real")

In [None]:
# Draw the same multi-panel figure for the other demo clip.
visualize(audio_file_path=fake_audio_path, Tag="Fake")

In [None]:
# Build the raw training set: one 40-value MFCC vector per audio file,
# labelled with the name of the folder the file was read from.
data, labels = [], []

folders = ['FAKE', 'REAL']

for folder in folders:
    folder_path = os.path.join(audio_file_path, folder)

    # os.listdir order is arbitrary and filesystem dependent. Sorting keeps
    # the row order -- and therefore the seeded resampling/split applied to
    # these rows later -- reproducible. Sub-directories are skipped because
    # they cannot be loaded as audio.
    files = sorted(
        name for name in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, name))
    )

    for file in tqdm(files, desc=folder):
        file_path = os.path.join(folder_path, file)
        audio, sr = librosa.load(file_path, res_type='kaiser_fast')
        mfcc_features = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
        # Collapse the frame axis by averaging, leaving one value per
        # coefficient. (Despite the variable name, no scaling is applied.)
        mfcc_features_scaled = np.mean(mfcc_features.T, axis=0)

        data.append(mfcc_features_scaled)
        labels.append(folder)

In [None]:
# Pair every MFCC vector with the folder name it was labelled with.
columns = {'features': data, 'labels': labels}
feature_df = pd.DataFrame(columns)

# Preview the first rows, then report how many samples each class has.
preview = feature_df.head()
print(preview)
class_counts = feature_df['labels'].value_counts()
print(class_counts)

In [None]:
def label_encoder(labels):
    """Encode a labelled column as integer class ids.

    Fits a fresh ``LabelEncoder`` on ``labels``, prints the column name
    together with the classes the encoder found, and returns the encoded
    values. ``labels`` must expose a ``.name`` attribute (e.g. a pandas
    Series), because the name is printed.
    """
    encoder = LabelEncoder()
    encoder.fit(labels)
    print(labels.name, encoder.classes_)
    encoded = encoder.transform(labels)
    return encoded

# Replace the string labels in the frame with their integer codes.
feature_df['labels'] = label_encoder(feature_df['labels'])

In [None]:
# Stack the per-file feature vectors into a 2-D array (one row per file) and
# collect the integer class ids into a 1-D array.
X = np.array(feature_df.features.tolist())
y = np.array(feature_df.labels.tolist())

num_labels = len(feature_df['labels'].unique())
input_shape = X.shape[1:]

# Split BEFORE oversampling. RandomOverSampler balances classes by duplicating
# existing rows; if it runs first, copies of the same row can end up on both
# sides of the split and the test score is inflated by samples the model has
# already seen. Stratifying keeps both classes represented in each part.
X_train_raw, X_test, y_train_raw, y_test_raw = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Balance the training portion only; the test set keeps its original rows.
ros = RandomOverSampler(random_state=42)
X_train, y_train_balanced = ros.fit_resample(X_train_raw, y_train_raw)

# One-hot encode the targets. num_classes is passed explicitly so both
# matrices have the same width regardless of which ids each part contains.
y_train = to_categorical(y_train_balanced, num_classes=num_labels)
y_test = to_categorical(y_test_raw, num_classes=num_labels)

print(f"Input shape : {input_shape}")
print(f"Number of labels : {num_labels}")

In [None]:
# Fully connected classifier: three Dense -> ReLU -> Dropout(0.5) stages
# followed by a softmax layer with one output per class.
hidden_units = (128, 256, 128)

model = Sequential()
for position, units in enumerate(hidden_units):
    if position == 0:
        # Only the first layer needs to be told the input shape.
        model.add(Dense(units, input_shape=input_shape))
    else:
        model.add(Dense(units))
    model.add(Activation("relu"))
    model.add(Dropout(0.5))

model.add(Dense(num_labels))
model.add(Activation("softmax"))

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train for 200 epochs with mini-batches of 2; the held-out split is passed as
# validation data so its loss and accuracy are recorded after every epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=2,
    epochs=200,
    validation_data=(X_test, y_test),
)

In [None]:
# evaluate() returns the loss followed by the compiled metric (accuracy).
evaluation = model.evaluate(X_test, y_test)
test_loss, test_acc = evaluation

print(f"Test loss : {test_loss}")
print(f"Test accuracy : {test_acc}")

In [None]:
# Per-epoch training and validation accuracy recorded in `history`.
fig, ax = plt.subplots(figsize=(10, 6))
for key in ('accuracy', 'val_accuracy'):
    ax.plot(history.history[key], label=key)
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.set_title('Model Accuracy')
ax.legend(loc='lower right')
plt.show()

In [None]:
# Per-epoch training and validation loss recorded in `history`.
fig, ax = plt.subplots(figsize=(10, 6))
for key in ('loss', 'val_loss'):
    ax.plot(history.history[key], label=key)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Model Loss')
ax.legend(loc='upper right')
plt.show()

In [None]:
def predict(audio_file_path, model, class_names=None):
    """Classify one audio file and print the predicted class name.

    The file is reduced to the same feature used for training in this
    notebook: 40 MFCCs averaged over all frames, fed to the model as a
    batch containing a single row.

    Parameters
    ----------
    audio_file_path : str
        Path of the audio file, passed to ``librosa.load``.
    model :
        Object with a ``predict`` method that returns one row of class
        scores per input row.
    class_names : sequence of str, optional
        Class name for each output index. Defaults to the notebook-level
        ``folders`` list in sorted order.

    Returns
    -------
    None
        The result is printed as ``Prediction : <class name>``.
    """
    if class_names is None:
        # The training labels are numbered by LabelEncoder, which assigns ids
        # in sorted class order. Indexing the raw `folders` list is only
        # correct while that list happens to be sorted, so sort it here.
        class_names = sorted(folders)

    audio, sr = librosa.load(audio_file_path, res_type='kaiser_fast')
    mfcc_features = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
    mfcc_features_scaled = np.mean(mfcc_features.T, axis=0)
    # Add a leading batch axis: the model expects a 2-D (rows, features) input.
    mfcc_features_scaled = np.expand_dims(mfcc_features_scaled, axis=0)
    prediction = model.predict(mfcc_features_scaled)

    predicted_index = int(np.argmax(prediction[0]))
    print(f"Prediction : {class_names[predicted_index]}")

In [None]:
# Classify a single recording stored outside the ./Dataset/ folder.
real_path = '../Speaker_Identification/combined_files/Benjamin_Netanyau_combined.wav'
predict(audio_file_path=real_path, model=model)

In [None]:
# Save the trained model under a minute-resolution timestamp so that runs
# finishing in different minutes do not overwrite each other.
weights_dir = 'weights'

# Make sure the target folder exists before saving; otherwise the save can
# fail on a fresh checkout, after the whole training run has completed.
os.makedirs(weights_dir, exist_ok=True)

currTime = pd.Timestamp.now().strftime("%Y%m%d%H%M")
model.save(f'{weights_dir}/anti-spoof-{currTime}.keras')