In [14]:
import torchaudio
import torch
import pandas as pd
from pesq import pesq
from torchaudio.transforms import Resample

In [15]:
voice, voice_rate = torchaudio.load('gt.wav')
noise, noise_rate = torchaudio.load('noise.wav')
assert voice_rate == noise_rate
voice = voice[0]
noise = noise[0,:len(voice)]

In [16]:
def n2(tens):
    return (tens*tens).sum()

def norm(tens):
    return n2(tens)**.5

def sdr(ref, deg):
    return 10*torch.log10(n2(deg) / n2(deg-ref))

def si_sdr(ref, deg):
    alpha = norm(deg*ref)/n2(deg)
    return sdr(ref, deg*alpha)

def mix(a, b, ratio_db):
    ratio = 10 ** (ratio_db/20) / norm(a) * norm(b) # calculating amplitude ratio
    m = (a*ratio + b) / (ratio + 1) # computing the noisy signal
    return m * norm(a) / norm(m) # making it same volume as original

In [17]:
r = Resample(48000, 16000)
fnames = []
snr_vals = [-5, 0, 5, 10]
pesq_vals = []
sdr_vals = []
sisdr_vals = []
for snr in snr_vals:
    m = mix(voice, noise, snr)
    fnames.append(f'mix_{snr}.wav')
    torchaudio.save(fnames[-1], m.unsqueeze(0), noise_rate)
    pesq_vals.append(pesq(16000, r(voice).numpy(), r(m).numpy()))
    sdr_vals.append(float(sdr(voice, m)))
    sisdr_vals.append(float(si_sdr(voice, m)))

In [18]:
df = pd.DataFrame({
    'Filename' : fnames,
    'SNR' : snr_vals,
    'SDR' : sdr_vals,
    'SI-SDR' : sisdr_vals,
    'PESQ' : pesq_vals,
    'NISQA' : [' ']*4,
    'DNSMOS' : [' ']*4,
    'MOS' : [' ']*4
})
df

Unnamed: 0,Filename,SNR,SDR,SI-SDR,PESQ,NISQA,DNSMOS,MOS
0,mix_-5.wav,-5,-0.100994,-50.384743,1.026105,,,
1,mix_0.wav,0,2.307439,-47.770924,1.033454,,,
2,mix_5.wav,5,5.893706,-46.156532,1.069925,,,
3,mix_10.wav,10,10.304302,-45.44952,1.181226,,,


In [19]:
df.to_markdown()

'|    | Filename   |   SNR |       SDR |   SI-SDR |    PESQ | NISQA   | DNSMOS   | MOS   |\n|---:|:-----------|------:|----------:|---------:|--------:|:--------|:---------|:------|\n|  0 | mix_-5.wav |    -5 | -0.100994 | -50.3847 | 1.02611 |         |          |       |\n|  1 | mix_0.wav  |     0 |  2.30744  | -47.7709 | 1.03345 |         |          |       |\n|  2 | mix_5.wav  |     5 |  5.89371  | -46.1565 | 1.06993 |         |          |       |\n|  3 | mix_10.wav |    10 | 10.3043   | -45.4495 | 1.18123 |         |          |       |'

In [20]:

torchaudio.save('res.wav', r(voice).unsqueeze(0), 16000)