In [1]:
# Imports
from sine_examples import x,x_quant,y,freqs,x_fft_mag,x_quant_fft_mag,y_fft_mag,fs
import IPython.display as ipd
import torch
import numpy as np
import matplotlib.pyplot as plt
from psychoacousticLoss import PsychoAcousticLoss, LSDLoss
from asteroid.losses import SingleSrcMultiScaleSpectral

In [2]:
# Playback: render one audio widget per example signal, each preceded by its label.
for caption, waveform in (
    ('Example Signal X', x),
    ('Example Signal Y', y),
    ('Example Signal X_Quant', x_quant),
):
    print(caption)
    # detach() drops autograd tracking before the tensor is handed to the audio widget
    display(ipd.Audio(waveform.detach(), rate=fs))

Example Signal X


Example Signal Y


Example Signal X_Quant


In [3]:
# Compute MSE Loss
# Plain mean-squared error of each variant (y, x_quant) against the signal x.
loss_mse = torch.nn.MSELoss()

mse_original = loss_mse(y, x)
mse_quant = loss_mse(x_quant, x)

# Report both values; print(label, value) keeps the tensor repr in the output.
print('MSE Loss (y,x):', mse_original)
print('MSE Loss (x_quant,x):', mse_quant)


MSE Loss (y,x): tensor(0.0556)
MSE Loss (x_quant,x): tensor(0.0537)


In [None]:
# Plot Signals in Time and Frequency
# Top row: first 64 samples of each signal. Bottom row: the precomputed
# spectra imported from sine_examples, plotted against `freqs`.
plt.figure(figsize=(16,12))

# (panel title, signal tensor) for the time-domain row
time_panels = (
    ("Signal X (Time Domain)", x),
    ("Signal Y (Time Domain)", y),
    ("Signal X_Quant (Time Domain)", x_quant),
)
for panel_idx, (panel_title, panel_signal) in enumerate(time_panels, start=1):
    plt.subplot(2, 3, panel_idx)
    plt.plot(panel_signal.detach()[:64])
    plt.ylim([-1.5,1.5])  # shared y-range so the three panels are directly comparable
    plt.title(panel_title)
    plt.xlabel("Sample [n]")
    plt.ylabel("Amplitude")
    plt.grid()

# (panel title, spectrum) for the frequency-domain row
freq_panels = (
    ('Signal X (Frequency Domain)', x_fft_mag),
    ('Signal Y (Frequency Domain)', y_fft_mag),
    # Fixed: this panel shows x_quant but was previously titled "Signal Y".
    ('Signal X_Quant (Frequency Domain)', x_quant_fft_mag),
)
for panel_idx, (panel_title, panel_spectrum) in enumerate(freq_panels, start=4):
    plt.subplot(2, 3, panel_idx)
    plt.plot(freqs, panel_spectrum)
    plt.title(panel_title)
    plt.xlabel('Frequencies [Hz]')
    plt.ylabel('Magnitude [dB]')
    plt.grid()

In [None]:
# Compute Psychoacoustic MSE Loss
mse_psychoLoss = PsychoAcousticLoss(mode="mse_time")
# NOTE(review): the codec cells call the psychoacoustic loss as (degraded, reference),
# while this call passes (x, y). Confirm whether the argument order matters.
mse_psycho = mse_psychoLoss(x,y)

# Compare MSE with Psychoacoustic MSE
# Fixed: the plain MSE between y and x is stored in `mse_original`; the name
# `mse` was never defined, so this cell raised NameError on a fresh kernel.
print('MSE Loss', mse_original)
print('MSE Psychoacoustic Loss:', mse_psycho)
print('Losses Ratio:', mse_original/mse_psycho)

The Psychoacoustic MSE Loss is now much closer to zero than the MSE loss we calculated before. In fact it is more than 200 times smaller than the 'traditional' MSE loss in this situation.

We can say that some perceptual characteristics of our hearing system are now being taken into consideration by this loss function to calculate a 'perceived hearing difference' between the signals.

So far we analysed some very simple audio signals, and just used the MSE loss function. 

Let's now move on to more sophisticated signals, such as music excerpts, and apply the same principles to different loss functions such as the Log Spectral Distance and the Multi Scale Spectral Loss.

For our next experiment, we will use 3 versions of the same music audio signal: a 16-bit PCM .wav file sampled at 44.1kHz, the same signal encoded as a 128kbps .mp3 file, and finally the same signal encoded as an .aac file.

We can load these files and listen to them.

In [None]:
# Playback of the three encodings of the music excerpt, all at 44.1 kHz.
# NOTE(review): audio_wav, audio_mp3 and audio_aac are not defined in any cell
# visible above this one; confirm the loading cell exists and runs first.
for caption, waveform in (
    ('Example Signal .wav', audio_wav),
    ('Example Signal .mp3', audio_mp3),
    ('Example Signal .aac', audio_aac),
):
    print(caption)
    display(ipd.Audio(waveform.detach(), rate=44100))

In [None]:
from scipy.io import wavfile
import soundfile as sf

# Round-trip the reference signal through a .wav file on disk: write it with
# soundfile, then read it back with scipy.
# NOTE(review): presumably the read-back array is integer PCM rather than float;
# confirm, since the next cell quantises with a step of 17000.
audio_wav_np = audio_wav.detach().numpy()
sf.write('audio_wav_mono.wav', audio_wav_np, 44100)

wav_readback = wavfile.read("audio_wav_mono.wav")
samplerate, audio_wav_mono = wav_readback



In [None]:
# Coarsely quantise the signal by rounding every sample to the nearest
# multiple of `factor`.
factor=17000
wav_quant = np.round(audio_wav_mono/factor)*factor

# Fixed: `wav_quant` is a float array holding values on the scale of
# `audio_wav_mono` (multiples of 17000). soundfile treats floating-point input
# as normalised to [-1.0, 1.0), so writing it directly pushes every non-zero
# level to full scale. When the source samples are integer PCM, clip to that
# integer range and cast back so the quantised levels are written unchanged.
# NOTE(review): assumes audio_wav_mono is int16 as read back from the PCM file
# written in the previous cell; confirm.
if np.issubdtype(audio_wav_mono.dtype, np.integer):
    pcm_range = np.iinfo(audio_wav_mono.dtype)
    wav_quant_out = np.clip(wav_quant, pcm_range.min, pcm_range.max).astype(audio_wav_mono.dtype)
else:
    # Floating-point source: already on the normalised scale, write as-is.
    wav_quant_out = wav_quant
sf.write('audio_wav_mono_quant.wav', wav_quant_out, 44100)



In [None]:
import librosa

# Load the quantised file back; sr=None is passed so librosa does not pick its
# own target rate. The samples are then wrapped as a torch tensor.
quant_samples, sr_wav = librosa.load('audio_wav_mono_quant.wav', sr=None)
audio_wav_quant = torch.from_numpy(quant_samples)



In [None]:
# Listen to the coarsely quantised version of the music excerpt (played at 44.1 kHz).
display(ipd.Audio(audio_wav_quant.detach(),rate=44100))

In [None]:
# Plain MSE of the .aac version and of the quantised version against the .wav reference.
loss_mse = torch.nn.MSELoss()
mse_psychoLoss = PsychoAcousticLoss(mode="mse_time")
mse_wav_aac = loss_mse(audio_aac,audio_wav)
print("MSE Loss (wav_aac):",mse_wav_aac)
# Fixed: the quantised result was printed without a label and not kept;
# store it under a name and label the output like the line above.
mse_wav_quant = loss_mse(audio_wav_quant,audio_wav)
print("MSE Loss (wav_quant):",mse_wav_quant)

All of the codecs used here are quite popular, and it can be very tricky for an untrained ear to distinguish between the audio files.

Like we did before, let's calculate both the MSE value and the Psychoacoustic MSE value between the original .wav file and the encoded versions.

In [None]:
# Loss functions
loss_mse = torch.nn.MSELoss()
mse_psychoLoss = PsychoAcousticLoss(mode="mse_time")
mse_wav_aac = loss_mse(audio_aac,audio_wav)
print("MSE Loss (wav_aac):",mse_wav_aac)
mse_psycho_wav_aac = mse_psychoLoss(audio_aac,audio_wav)
print("MSE Psychoacoustic Loss (wav_aac):",mse_psycho_wav_aac)
print("Losses Ratio (wav_aac):",mse_wav_aac/mse_psycho_wav_aac)
mse_wav_mp3 = loss_mse(audio_mp3,audio_wav)
print("MSE Loss (wav_mp3):",mse_wav_mp3)
mse_psycho_wav_mp3 = mse_psychoLoss(audio_mp3,audio_wav)
print("MSE Psychoacoustic Loss (wav_mp3):",mse_psycho_wav_mp3)
print("Losses Ratio (wav_mp3):",mse_wav_mp3/mse_psycho_wav_mp3)

In [None]:
# NOTE(review): loss_mse is already bound to an MSELoss instance by earlier cells;
# this cell only re-creates it and looks like a leftover that could be removed.
loss_mse = torch.nn.MSELoss()


Again we notice that the Psychoacoustic Loss value is much smaller than its non-psychoacoustic counterpart: roughly two orders of magnitude (about 100 times) smaller.

Let's repeat the same procedure but now we will use the Log Spectral Distance (LSD) loss.

In [None]:
# Log Spectral Distance (LSD) and its psychoacoustic variant for each encoded
# version against the .wav reference.
loss_lsd = LSDLoss()
lsd_psychoLoss = PsychoAcousticLoss(mode="lsd")

# --- .aac vs .wav ---
lsd_wav_aac = loss_lsd(audio_aac, audio_wav)
lsd_psycho_wav_aac = lsd_psychoLoss(audio_aac, audio_wav)
print("LSD Loss (wav_aac):", lsd_wav_aac)
print("LSD Psychoacoustic Loss (wav_aac):", lsd_psycho_wav_aac)
print("Losses Ratio (wav_aac):", lsd_wav_aac / lsd_psycho_wav_aac)

# --- .mp3 vs .wav ---
lsd_wav_mp3 = loss_lsd(audio_mp3, audio_wav)
lsd_psycho_wav_mp3 = lsd_psychoLoss(audio_mp3, audio_wav)
print("LSD Loss (wav_mp3):", lsd_wav_mp3)
print("LSD Psychoacoustic Loss (wav_mp3):", lsd_psycho_wav_mp3)
print("Losses Ratio (wav_mp3):", lsd_wav_mp3 / lsd_psycho_wav_mp3)


This time we notice something different. The Psychoacoustic Loss is still smaller than the pure LSD but the difference is much smaller, just around 1.2 times smaller.

Finally we will inspect one more loss function, the Multi Scale Spectral Loss.

In [None]:
# Multi-scale spectral loss and its psychoacoustic variant for each encoded
# version against the .wav reference.
loss_multiScaleSpectral = SingleSrcMultiScaleSpectral()
multiscalespectral_psychoLoss = PsychoAcousticLoss(mode="multiscale")

# The asteroid loss is called with a leading dimension added via unsqueeze(dim=0);
# the psychoacoustic loss is called with the tensors as they are.
wav_batched = audio_wav.unsqueeze(dim=0)

# --- .aac vs .wav ---
multispectral_wav_aac = loss_multiScaleSpectral(audio_aac.unsqueeze(dim=0), wav_batched)
print("MultiScale Spectral Loss (wav_aac):", multispectral_wav_aac)
multispectral_psycho_wav_aac = multiscalespectral_psychoLoss(audio_aac, audio_wav)
print("Psychoacoustic MultiScale Spectral Loss (wav_aac):", multispectral_psycho_wav_aac)
print("Losses Ratio (wav_aac):", multispectral_wav_aac / multispectral_psycho_wav_aac)

# --- .mp3 vs .wav ---
multispectral_wav_mp3 = loss_multiScaleSpectral(audio_mp3.unsqueeze(dim=0), wav_batched)
print("MultiScale Spectral Loss (wav_mp3):", multispectral_wav_mp3)
multispectral_psycho_wav_mp3 = multiscalespectral_psychoLoss(audio_mp3, audio_wav)
print("Psychoacoustic MultiScale Spectral Loss (wav_mp3):", multispectral_psycho_wav_mp3)
print("Losses Ratio (wav_mp3):", multispectral_wav_mp3 / multispectral_psycho_wav_mp3)



Again we notice something similar to what we've seen with the previous loss. The psychoacoustic version of the loss is still smaller, but only around 1.8 times smaller.

Once again we can plot the signals in time and in time-frequency domain for a visual inspection.

In [None]:
# Plot the three encodings of the music excerpt.
# Top row: a 256-sample time-domain window. Bottom row: spectrograms.
plt.figure(figsize=(16,8))

# (file-format label, signal tensor), in left-to-right panel order
codec_panels = (
    (".wav", audio_wav),
    (".aac", audio_aac),
    (".mp3", audio_mp3),
)

for panel_idx, (codec_label, codec_signal) in enumerate(codec_panels, start=1):
    plt.subplot(2, 3, panel_idx)
    # Same 256-sample window for every version so the panels line up
    plt.plot(codec_signal.detach()[200000:200256])
    plt.ylim([-0.5,0.5])
    plt.title("Signal " + codec_label + " (Time Domain)")
    plt.xlabel("Sample [n]")
    plt.ylabel("Amplitude")
    plt.grid()

for panel_idx, (codec_label, codec_signal) in enumerate(codec_panels, start=4):
    plt.subplot(2, 3, panel_idx)
    plt.specgram(codec_signal.detach().numpy(), Fs=44100, cmap='magma')
    # Fixed: the .aac panel was titled "(Frequency Domain)", unlike its siblings.
    plt.title("Signal " + codec_label + " (Time-Frequency Domain)")
    plt.xlabel("Time [s]")
    # Fixed: a spectrogram's y-axis is frequency; magnitude is shown by colour.
    plt.ylabel("Frequency [Hz]")
    plt.grid()

plt.tight_layout()


The visual difference is again very noticeable, both in the time domain and in the time-frequency domain.

In [None]:
# Show the sampling rate `fs` imported from sine_examples (used as the playback rate above).
# NOTE(review): looks like a leftover inspection cell; consider removing it.
fs