In [2]:
#import libraries
import tensorflow.compat.v2 as tf
tf.enable_v2_behavior()
assert tf.executing_eagerly()

import tensorflow_hub as hub
import numpy as np

import librosa

In [26]:
# function to convert audio signal to numpy array
def convert_audio_to_nd(audio_file):

    y, sr = librosa.load(audio_file,sr = 16000)
    print("Audio data type:", type(y))
    print("Audio data shape:", y.shape)

    
    # audio_data = np.array(y)
    audio_data = y / np.max(np.abs(y))

    print("New audio data type:", type(audio_data))
    print("New audio data shape:", audio_data.shape)


    return audio_data,sr

In [27]:
# function to convert audio signal numpy array to embeddings
def get_audio_embeddings(audio_signal,sr):
    # Load the module and run inference.
    module = hub.load('https://tfhub.dev/google/nonsemantic-speech-benchmark/trill/2')
    
    # Reshape the input ndarray to have shape (num_samples,).
    audio_signal = np.squeeze(audio_signal)
    
    # Resample the audio signal to 16kHz, if necessary.
    # resampler = tf.signal.resample
    # audio_signal = resampler(audio_signal, tf.constant([len(audio_signal) * 16000 // len(audio_signal)], dtype=tf.int32))
    
    #audio_signal = librosa.resample(audio_signal, sr, 16000)

    # Normalize the audio signal to have values between -1 and 1.
    audio_signal = np.asarray(audio_signal, dtype=np.float32)
    assert audio_signal.ndim == 1
    assert np.abs(audio_signal).max() <= 1.
    audio_signal = np.clip(audio_signal, -1., 1.)
    
    # Generate the embeddings using the loaded module.
    emb_dict = module(samples=audio_signal, sample_rate=16000)
    emb = emb_dict['embedding']
    emb_layer19 = emb_dict['layer19']
    
    # Return the embeddings as numpy ndarrays.
    return emb.numpy(), emb_layer19.numpy()



In [28]:
# function to calculate FAD from test & reference embeddings
def calc_fad(ref_emb,test_emb):

    euclidean_sq = np.sum((ref_emb.mean(axis=0) - test_emb.mean(axis=0))**2)

  
    ref_var = np.var(ref_emb, axis=0)
    test_var = np.var(test_emb, axis=0)
    euclidean_var = np.sum(ref_var + test_var - 2*np.sqrt(ref_var*test_var))
    euclidean_var /= ref_emb.shape[1]

    # Compute the Fréchet Audio Distance using the Euclidean distance.
    fad = euclidean_sq + euclidean_var
    fad = np.sqrt(fad)
    return fad


In [6]:
fad_list = []

In [29]:
for i in range(1,5):
  
    test_audio,sr1 = convert_audio_to_nd("/content/generated/"+str(i)+".wav") # modify this to take generated audio
    ref_audio,sr2 = convert_audio_to_nd("/content/actual/"+str(i)+".flac") 
    test_emb,test_emb_19 = get_audio_embeddings(test_audio,sr1)
    ref_emb,ref_emb_19 = get_audio_embeddings(ref_audio,sr2)

    fad = calc_fad(test_emb,ref_emb)
    fad_list.append(fad)



Audio data type: <class 'numpy.ndarray'>
Audio data shape: (109227,)
New audio data type: <class 'numpy.ndarray'>
New audio data shape: (109227,)
Audio data type: <class 'numpy.ndarray'>
Audio data shape: (480000,)
New audio data type: <class 'numpy.ndarray'>
New audio data shape: (480000,)
Audio data type: <class 'numpy.ndarray'>
Audio data shape: (59579,)
New audio data type: <class 'numpy.ndarray'>
New audio data shape: (59579,)
Audio data type: <class 'numpy.ndarray'>
Audio data shape: (480000,)
New audio data type: <class 'numpy.ndarray'>
New audio data shape: (480000,)
Audio data type: <class 'numpy.ndarray'>
Audio data shape: (163840,)
New audio data type: <class 'numpy.ndarray'>
New audio data shape: (163840,)
Audio data type: <class 'numpy.ndarray'>
Audio data shape: (480000,)
New audio data type: <class 'numpy.ndarray'>
New audio data shape: (480000,)
Audio data type: <class 'numpy.ndarray'>
Audio data shape: (109227,)
New audio data type: <class 'numpy.ndarray'>
New audio da

In [30]:
fad_list

[0.9711992994431712,
 1.0611124096838642,
 1.115067483416699,
 1.0156717368465618,
 0.9711992994431712,
 1.0611124096838642,
 1.115067483416699,
 1.0156717368465618,
 0.9711992994431712,
 1.0611124096838642,
 1.115067483416699,
 1.0156717368465618,
 0.9720172968293402,
 1.0454869415705403,
 1.1142205101030827,
 1.0110736718570463]

In [19]:
import glob

# Get a list of all the audio files in the directories
test_files = glob.glob("/content/generated/*.wav")
ref_files = glob.glob("/content/actual/*.flac")

# Loop over each pair of reference and test files and calculate FAD
for i, (test_file, ref_file) in enumerate(zip(test_files, ref_files)):
    test_audio, sr1 = convert_audio_to_nd(test_file)
    ref_audio, sr2 = convert_audio_to_nd(ref_file) 
    test_emb, test_emb_19 = get_audio_embeddings(test_audio, sr1)
    ref_emb, ref_emb_19 = get_audio_embeddings(ref_audio, sr2)
    fad = calc_fad(test_emb, ref_emb)
    print("FAD for pair {}: {}".format(i, fad))


Audio data type: <class 'numpy.ndarray'>
Audio data shape: (163840,)
New audio data type: <class 'numpy.ndarray'>
New audio data shape: (163840,)
Audio data type: <class 'numpy.ndarray'>
Audio data shape: (480000,)
New audio data type: <class 'numpy.ndarray'>
New audio data shape: (480000,)
FAD for pair 0: 1.110425024550683
Audio data type: <class 'numpy.ndarray'>
Audio data shape: (109227,)
New audio data type: <class 'numpy.ndarray'>
New audio data shape: (109227,)
Audio data type: <class 'numpy.ndarray'>
Audio data shape: (480000,)
New audio data type: <class 'numpy.ndarray'>
New audio data shape: (480000,)
FAD for pair 1: 1.0533965633823341
Audio data type: <class 'numpy.ndarray'>
Audio data shape: (109227,)
New audio data type: <class 'numpy.ndarray'>
New audio data shape: (109227,)
Audio data type: <class 'numpy.ndarray'>
Audio data shape: (480000,)
New audio data type: <class 'numpy.ndarray'>
New audio data shape: (480000,)
FAD for pair 2: 1.0088698437544767
Audio data type: <cl

AssertionError: ignored

In [31]:
# Get a list of all the audio files in the directories
test_files = glob.glob("/content/generated/*.wav")
ref_files = glob.glob("/content/actual/*.flac")

# Get embeddings for all reference files
ref_emb_list = []
for ref_file in ref_files:
    ref_audio, sr2 = convert_audio_to_nd(ref_file) 
    ref_emb, ref_emb_19 = get_audio_embeddings(ref_audio, sr2)
    ref_emb_list.append(ref_emb)

# Concatenate all reference embeddings into a single array
ref_emb_all = np.concatenate(ref_emb_list, axis=0)

# Get embeddings for all test files
test_emb_list = []
for test_file in test_files:
    test_audio, sr1 = convert_audio_to_nd(test_file)
    test_emb, test_emb_19 = get_audio_embeddings(test_audio, sr1)
    test_emb_list.append(test_emb)

# Concatenate all test embeddings into a single array
test_emb_all = np.concatenate(test_emb_list, axis=0)

# Calculate FAD between the two sets of embeddings
fad = calc_fad(test_emb_all, ref_emb_all)

print("FAD between generated and actual audio: {}".format(fad))


Audio data type: <class 'numpy.ndarray'>
Audio data shape: (480000,)
New audio data type: <class 'numpy.ndarray'>
New audio data shape: (480000,)
Audio data type: <class 'numpy.ndarray'>
Audio data shape: (480000,)
New audio data type: <class 'numpy.ndarray'>
New audio data shape: (480000,)
Audio data type: <class 'numpy.ndarray'>
Audio data shape: (480000,)
New audio data type: <class 'numpy.ndarray'>
New audio data shape: (480000,)
Audio data type: <class 'numpy.ndarray'>
Audio data shape: (480000,)
New audio data type: <class 'numpy.ndarray'>
New audio data shape: (480000,)
Audio data type: <class 'numpy.ndarray'>
Audio data shape: (480000,)
New audio data type: <class 'numpy.ndarray'>
New audio data shape: (480000,)
Audio data type: <class 'numpy.ndarray'>
Audio data shape: (163840,)
New audio data type: <class 'numpy.ndarray'>
New audio data shape: (163840,)
Audio data type: <class 'numpy.ndarray'>
Audio data shape: (109227,)
New audio data type: <class 'numpy.ndarray'>
New audio 

In [12]:
fad_list

[0.9711992994431712,
 1.0611124096838642,
 1.115067483416699,
 1.0156717368465618,
 0.9711992994431712,
 1.0611124096838642,
 1.115067483416699,
 1.0156717368465618,
 0.9711992994431712,
 1.0611124096838642,
 1.115067483416699,
 1.0156717368465618]

In [None]:
# get reference & test audio, convert to embeddings and calculate FAD
test_audio,sr1 = convert_audio_to_nd("/content/sample-0.mp3") # modify this to take generated audio
ref_audio,sr2 = convert_audio_to_nd("/content/real.mp3") # modify this to take corrosponsing music caps audio

test_emb,test_emb_19 = get_audio_embeddings(test_audio,sr1)
ref_emb,ref_emb_19 = get_audio_embeddings(ref_audio,sr2)

fad = calc_fad(test_emb,ref_emb)
print(fad)

Audio data type: <class 'numpy.ndarray'>
Audio data shape: (160683,)
New audio data type: <class 'numpy.ndarray'>
New audio data shape: (160683,)
Audio data type: <class 'numpy.ndarray'>
Audio data shape: (95109,)
New audio data type: <class 'numpy.ndarray'>
New audio data shape: (95109,)
0.5838273918180972


In [18]:
test_audio,sr1 = convert_audio_to_nd("/content/generated/1.wav") # modify this to take generated audio
ref_audio,sr2 = convert_audio_to_nd("/content/generated/1.wav") 
test_emb,test_emb_19 = get_audio_embeddings(test_audio,sr1)
ref_emb,ref_emb_19 = get_audio_embeddings(ref_audio,sr2)

fad = calc_fad(test_emb,ref_emb)
print('fad',fad)
# fad_list.append(fad)

Audio data type: <class 'numpy.ndarray'>
Audio data shape: (109227,)
New audio data type: <class 'numpy.ndarray'>
New audio data shape: (109227,)
Audio data type: <class 'numpy.ndarray'>
Audio data shape: (109227,)
New audio data type: <class 'numpy.ndarray'>
New audio data shape: (109227,)
fad 0.0
