# Comparing Pre-Trained Speech Separation models for de-noising

#Load Imports

In [86]:
!pip3 install speechbrain torchmetrics asteroid



In [1]:
from speechbrain.inference.separation import SepformerSeparation as separator
import torchaudio
import IPython
import librosa
import torchaudio
from torchmetrics.audio import SignalDistortionRatio
import torch
import os
import soundfile as sf
from asteroid.models import BaseModel

# Sepformer
Using [this](https://huggingface.co/speechbrain/sepformer-wham-enhancement) version of the model, pretrained on WHAM!

## Demo on Sample

In [2]:
model = separator.from_hparams(source="speechbrain/sepformer-wham-enhancement", savedir='pretrained_models/sepformer-wham-enhancement')

In [10]:
audio, sr = librosa.load('/content/sample/p226_126.wav', sr=16000)

In [11]:
IPython.display.Audio(data=audio, rate=sr)

In [12]:
est_sources = model.separate_file(path='/content/sample/p226_126.wav')
torchaudio.save(f'/content/preds/sepformer/p226_126.wav', est_sources[:, :, 0].detach().cpu(), 8000)

Resampling the audio from 48000 Hz to 8000 Hz


In [13]:
audio_enhanced, sr_enhanced = librosa.load('/content/preds/sepformer/p226_126.wav', sr=16000)


In [14]:
IPython.display.Audio(data=audio_enhanced, rate=sr_enhanced)

In [16]:
audio_target, sr_target = librosa.load('/content/targets/p226_126.wav', sr=16000)

In [17]:
target = torch.from_numpy(audio_enhanced)
pred = torch.from_numpy(audio_target)

In [18]:
sdr = SignalDistortionRatio()
sdr(pred, target)

tensor(13.6360)

## Calculate average SDR for mini-validation batch

In [7]:


# Assign directory
directory = r"/content/sample"
sdr_vals = []
preds = []
targets= []
sdr = SignalDistortionRatio()
# Iterate over files in directory
for name in os.listdir(directory):
    # Open file
    if os.path.isfile(os.path.join(directory, name)):
      with open(os.path.join(directory, name)) as f:
          est_sources = model.separate_file(path=f'/content/sample/{name}')
          torchaudio.save(f'/content/preds/sepformer/{name}', est_sources[:, :, 0].detach().cpu(), 8000)
          audio_enhanced, sr_enhanced = librosa.load(f'/content/preds/sepformer/{name}', sr=16000)
          pred = torch.from_numpy(audio_enhanced)
          preds.append(pred)

          #calculate target
          audio_target, sr = librosa.load(f'/content/targets/{name}', sr=16000)
          target_tensor = torch.from_numpy(audio_target)
          targets.append(target_tensor)
        # sdr_vals.append(sdr(pred, target_tensor))


Resampling the audio from 48000 Hz to 8000 Hz
Resampling the audio from 48000 Hz to 8000 Hz
Resampling the audio from 48000 Hz to 8000 Hz
Resampling the audio from 48000 Hz to 8000 Hz
Resampling the audio from 48000 Hz to 8000 Hz
Resampling the audio from 48000 Hz to 8000 Hz
Resampling the audio from 48000 Hz to 8000 Hz
Resampling the audio from 48000 Hz to 8000 Hz
Resampling the audio from 48000 Hz to 8000 Hz


In [8]:

sdr_val = []
for  prediction, target in zip(preds,targets):
  if prediction.size() == target.size():
    sdr_val.append(sdr(prediction,target))


In [9]:
avg_sdr = sum(sdr_val)/len(sdr_val)
print(f"Average sdr value for mini validation batch: {avg_sdr}")

Average sdr value for mini validation batch: 10.38723087310791


# ConvTasNet
Using [this](https://huggingface.co/mpariente/ConvTasNet_WHAM_sepclean) version of the model, also pretrained on WHAM!

In [19]:
model = BaseModel.from_pretrained("mpariente/ConvTasNet_WHAM_sepclean")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [57]:
audio, sr = librosa.load('/content/sample/p226_128.wav', sr=16000)

In [59]:
IPython.display.Audio(data=audio, rate=sr)

In [60]:
import soundfile as sf
model.separate("/content/sample/p226_128.wav", resample=True)



In [40]:
audio_conv_1, sr_conv1 = librosa.load('/content/sample/p226_128_est1.wav', sr=16000)

In [41]:
IPython.display.Audio(data=audio_conv_1, rate=sr_conv1)

In [42]:
audio_target, sr_target = librosa.load('/content/targets/p226_128.wav', sr=16000)

In [43]:
target = torch.from_numpy(audio_target)
pred = torch.from_numpy(audio_conv_1)

In [44]:
sdr = SignalDistortionRatio()
sdr(pred, target)

tensor(6.4754)

## Calculate average SDR for mini validation batch

In [53]:
# Assign directory
from pathlib import Path
directory = r"/content/sample"
preds_conv = []
targets_conv= []
sdr = SignalDistortionRatio()
# Iterate over files in directory
for name in os.listdir(directory):
    # Open file
    if os.path.isfile(os.path.join(directory, name)) and not os.path.join(directory, name).endswith('est1.wav') and not os.path.join(directory, name).endswith('est2.wav') :
      with open(os.path.join(directory, name)) as f:
          model.separate(f"/content/sample/{name}", resample=True)
          name_stem=Path(f'/content/sample/{name}').stem
          audio_enhanced, sr_enhanced = librosa.load(f'/content/sample/{name_stem}_est1.wav', sr=16000)
          pred = torch.from_numpy(audio_enhanced)
          preds.append(pred)

          #calculate target
          audio_target, sr = librosa.load(f'/content/targets/{name}', sr=16000)
          target_tensor = torch.from_numpy(audio_target)
          targets.append(target_tensor)



In [54]:
sdr_val = []
for  prediction, target in zip(preds,targets):
  if prediction.size() == target.size():
    sdr_val.append(sdr(prediction,target))

In [55]:
avg_sdr = sum(sdr_val)/len(sdr_val)
print(f"Average sdr value for mini validation batch: {avg_sdr}")

Average sdr value for mini validation batch: 8.5319185256958
