In [1]:
from parallel_wavegan.utils import download_pretrained_model

In [2]:
from parallel_wavegan.datasets import MelDataset
from parallel_wavegan.datasets import MelSCPDataset
from parallel_wavegan.utils import load_model
from parallel_wavegan.utils import read_hdf5

In [1]:
import argparse
import logging
import os

import librosa
import numpy as np
import soundfile as sf
import yaml

from tqdm import tqdm

from parallel_wavegan.datasets import AudioDataset
from parallel_wavegan.datasets import AudioSCPDataset
from parallel_wavegan.utils import write_hdf5

In [2]:
import torch

In [3]:
import librosa

In [6]:
from torchaudio.transforms import MelSpectrogram

In [24]:
# Load the pretrained ParallelWaveGAN checkpoint and its YAML config, then
# prepare the generator for CPU inference.
# NOTE(review): both paths are absolute and user-specific; consider deriving
# them from a configurable base directory.
import torch, yaml
ckpt_file = "/home/rni/ParallelWaveGAN/pretrained_model/libritts_parallel_wavegan.v1/checkpoint-400000steps.pkl"
config_file = "/home/rni/ParallelWaveGAN/pretrained_model/libritts_parallel_wavegan.v1/config.yml"
with open(config_file) as f:
    # NOTE(review): yaml.Loader is the full (unsafe) loader; only acceptable
    # because this config file is assumed to be trusted.
    config = yaml.load(f, Loader=yaml.Loader)
device = torch.device("cpu")
model = load_model(ckpt_file, config)
# presumably folds weight normalization into plain weights for inference;
# confirm against parallel_wavegan
model.remove_weight_norm()
# eval() switches the module to inference mode before moving it to `device`.
model = model.eval().to(device)

In [149]:
def logmelfilterbank(
    audio,
    sampling_rate,
    fft_size=1024,
    hop_size=256,
    win_length=None,
    window="hann",
    num_mels=80,
    fmin=None,
    fmax=None,
    eps=1e-10,
    log_base=10.0,
):
    """Compute log-Mel filterbank feature with librosa / numpy.

    Args:
        audio (ndarray): Audio signal (T,).
        sampling_rate (int): Sampling rate.
        fft_size (int): FFT size.
        hop_size (int): Hop size.
        win_length (int): Window length. If set to None, it will be the same as fft_size.
        window (str): Window function type.
        num_mels (int): Number of mel basis.
        fmin (int): Minimum frequency in mel basis calculation. None means 0.
        fmax (int): Maximum frequency in mel basis calculation. None means sampling_rate / 2.
        eps (float): Epsilon value to avoid inf in log calculation.
        log_base (float): Log base (10.0 or 2.0). If set to None, use np.log.

    Returns:
        ndarray: Log Mel filterbank feature (#frames, num_mels).

    Raises:
        ValueError: If log_base is not None, 10.0 or 2.0. Raised before any
            signal processing is done.
    """
    # Resolve the log function first so an unsupported base fails fast,
    # instead of after the STFT and mel projection have been computed.
    if log_base is None:
        log_fn = np.log
    elif log_base == 10.0:
        log_fn = np.log10
    elif log_base == 2.0:
        log_fn = np.log2
    else:
        raise ValueError(f"{log_base} is not supported.")

    # get amplitude spectrogram
    x_stft = librosa.stft(
        audio,
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window,
        pad_mode="reflect",
    )
    spc = np.abs(x_stft).T  # (#frames, #bins)

    # get mel basis
    fmin = 0 if fmin is None else fmin
    fmax = sampling_rate / 2 if fmax is None else fmax
    mel_basis = librosa.filters.mel(
        sr=sampling_rate,
        n_fft=fft_size,
        n_mels=num_mels,
        fmin=fmin,
        fmax=fmax,
    )

    # Floor at eps so the logarithm never sees zero.
    mel = np.maximum(eps, np.dot(spc, mel_basis.T))
    return log_fn(mel)

In [157]:
def logmelfilterbank_torch(
    audio,
    sampling_rate,
    fft_size=1024,
    hop_size=256,
    win_length=None,
    window="hann",
    num_mels=80,
    fmin=None,
    fmax=None,
    eps=1e-10,
    log_base=10.0,
):
    """Compute log-Mel filterbank feature with torch / torchaudio.

    Torch counterpart of ``logmelfilterbank``: magnitude STFT, Slaney-style
    mel projection, flooring at ``eps`` and a logarithm. All intermediate
    tensors follow the device and dtype of ``audio``.

    Args:
        audio (Tensor): Audio signal (T,) or a batch (B, T). Must be floating point.
        sampling_rate (int): Sampling rate.
        fft_size (int): FFT size.
        hop_size (int): Hop size.
        win_length (int): Window length. If set to None, it will be the same as fft_size.
        window (str): Window function type: "hann", "hamming", "blackman" or "bartlett".
        num_mels (int): Number of mel basis.
        fmin (int): Minimum frequency in mel basis calculation. None means 0.
        fmax (int): Maximum frequency in mel basis calculation. None means sampling_rate / 2.
        eps (float): Epsilon value to avoid inf in log calculation.
        log_base (float): Log base (10.0 or 2.0). If set to None, use torch.log.

    Returns:
        Tensor: Log Mel filterbank feature (#frames, num_mels), or
            (B, #frames, num_mels) for batched input.

    Raises:
        ValueError: If log_base or window is not supported. Raised before any
            signal processing is done.
    """
    # Validate the cheap arguments first so bad input fails before the STFT.
    if log_base is None:
        log_fn = torch.log
    elif log_base == 10.0:
        log_fn = torch.log10
    elif log_base == 2.0:
        log_fn = torch.log2
    else:
        raise ValueError(f"{log_base} is not supported.")

    window_fns = {
        "hann": torch.hann_window,
        "hamming": torch.hamming_window,
        "blackman": torch.blackman_window,
        "bartlett": torch.bartlett_window,
    }
    if window not in window_fns:
        # Previously every value was silently treated as "hann".
        raise ValueError(f"{window} window is not supported.")

    # Local import, as in the original cell: torchaudio is only needed here.
    from torchaudio.functional import melscale_fbanks

    # None means "same as the FFT size"; torch window factories need an int.
    win_length = fft_size if win_length is None else win_length

    # get amplitude spectrogram
    x_stft = torch.stft(
        audio,
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        # Build the window next to the audio so CUDA / float64 input works.
        window=window_fns[window](
            win_length, dtype=audio.dtype, device=audio.device
        ),
        pad_mode="reflect",
        return_complex=True,
    )
    spc = torch.abs(x_stft).transpose(-2, -1)  # (..., #frames, #bins)

    # get mel basis
    fmin = 0 if fmin is None else fmin
    fmax = sampling_rate / 2 if fmax is None else fmax
    mel_basis = melscale_fbanks(
        sample_rate=sampling_rate,
        n_freqs=fft_size // 2 + 1,
        n_mels=num_mels,
        f_min=fmin,
        f_max=fmax,
        norm="slaney",
        mel_scale="slaney",
    ).to(device=spc.device, dtype=spc.dtype)  # (#bins, num_mels)

    # Floor at eps so the logarithm never sees zero. clamp avoids allocating
    # a CPU-only eps tensor, which broke inputs living on another device.
    mel = torch.clamp(torch.matmul(spc, mel_basis), min=eps)
    return log_fn(mel)

In [9]:
import soundfile as sf
# sf.read returns a (samples, sample_rate) tuple, so `audio[0]` is the
# waveform and `audio[1]` the rate (see the displayed value in the next cell).
# NOTE(review): absolute, user-specific path.
audio = sf.read("/home/rni/.armory/outputs/2022-11-08T052340.454091/saved_samples/0_benign.wav")

In [10]:
audio

(array([ 0.00000000e+00, -3.05175781e-05,  3.05175781e-05, ...,
        -1.22070312e-04, -1.22070312e-04, -9.15527344e-05]),
 16000)

In [11]:
import IPython.display as ipd

In [12]:
# Resample the clip from its native rate (audio[1]) to 24 kHz and play it.
# NOTE(review): 24000 presumably mirrors config["sampling_rate"] of the
# vocoder; confirm, and prefer reading it from the config.
new_sr = 24000
resamp_audio = librosa.resample(audio[0], orig_sr=audio[1], target_sr=new_sr)
ipd.Audio(resamp_audio, rate=new_sr)

In [154]:
# Reference features: log-mel spectrogram of the resampled clip computed by
# the librosa/numpy implementation, with every setting taken from the config.
# NOTE(review): the config is re-read here although an earlier cell already
# loads the same file into `config`.
config_file = "/home/rni/ParallelWaveGAN/pretrained_model/libritts_parallel_wavegan.v1/config.yml"
with open(config_file) as f:
    # NOTE(review): yaml.Loader is the full (unsafe) loader; trusted file only.
    config = yaml.load(f, Loader=yaml.Loader)
lmf = logmelfilterbank(            
            resamp_audio,
            sampling_rate=config["sampling_rate"],
            hop_size=config["hop_size"],
            fft_size=config["fft_size"],
            win_length=config["win_length"],
            window=config["window"],
            num_mels=config["num_mels"],
            fmin=config["fmin"],
            fmax=config["fmax"],
            # keep compatibility: fall back to base 10 when the config has no "log_base"
            log_base=config.get("log_base", 10.0),)
lmf

array([[-2.73149134, -2.81738282, -3.12153097, ..., -4.27301669,
        -4.53236082, -4.60934559],
       [-2.48232727, -2.95189755, -3.34601933, ..., -4.22800722,
        -4.45674499, -4.58589894],
       [-2.39135516, -2.81979582, -3.26478438, ..., -4.26005279,
        -4.3563763 , -4.60626312],
       ...,
       [-2.37460551, -2.83933532, -3.43327012, ..., -4.25140663,
        -4.18628759, -4.4555066 ],
       [-2.39330747, -2.89031327, -3.29838024, ..., -4.13329922,
        -4.05428251, -4.55463721],
       [-2.51403397, -2.9964339 , -3.30979346, ..., -4.16680565,
        -4.15292685, -4.58716904]])

In [158]:
# Same features from the torch port, fed the identical resampled clip as a
# float32 tensor so the result can be compared against `lmf`.
lmf_torch_self_impl = logmelfilterbank_torch(            
            torch.tensor(resamp_audio, dtype=torch.float),
            sampling_rate=config["sampling_rate"],
            hop_size=config["hop_size"],
            fft_size=config["fft_size"],
            win_length=config["win_length"],
            window=config["window"],
            num_mels=config["num_mels"],
            fmin=config["fmin"],
            fmax=config["fmax"],
            # keep compatibility: fall back to base 10 when the config has no "log_base"
            log_base=config.get("log_base", 10.0),)
lmf_torch_self_impl

tensor([[-2.7315, -2.8174, -3.1215,  ..., -4.2730, -4.5324, -4.6093],
        [-2.4823, -2.9519, -3.3460,  ..., -4.2280, -4.4567, -4.5859],
        [-2.3914, -2.8198, -3.2648,  ..., -4.2601, -4.3564, -4.6063],
        ...,
        [-2.3746, -2.8393, -3.4333,  ..., -4.2514, -4.1863, -4.4555],
        [-2.3933, -2.8903, -3.2984,  ..., -4.1333, -4.0543, -4.5546],
        [-2.5140, -2.9964, -3.3098,  ..., -4.1668, -4.1529, -4.5872]])

In [160]:
# Parity check between the librosa reference and the torch port.
# NOTE(review): atol=1e-2 is a loose bound for log-scale values; consider
# tightening it and using an assert so a regression fails the notebook.
np.allclose(lmf, lmf_torch_self_impl, atol=1e-2)

True

In [63]:
# Element-wise difference between the torch and librosa features.
# NOTE(review): this cell has execution count 63, lower than the cell that
# defines logmelfilterbank_torch (157). The current definition documents its
# output as (#frames, num_mels), the same layout as `lmf`, so the transpose
# looks like a leftover from an earlier version. Confirm on a fresh kernel.
print(lmf_torch_self_impl.numpy().transpose(1,0)-lmf)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [19]:
# torchaudio's built-in mel transform configured from the same vocoder config,
# as an alternative to the hand-written logmelfilterbank_torch.
# NOTE(review): `power` is not passed. If torchaudio's default is the power
# spectrogram (2.0, to be confirmed in its docs), this does not match the
# magnitude spectrogram used by logmelfilterbank, which would explain why the
# x_torch values below differ from `lmf`. Consider power=1.0.
trans = MelSpectrogram(
    sample_rate=config["sampling_rate"],
    hop_length=config["hop_size"],
    n_fft=config["fft_size"],
    win_length=config["win_length"],
    n_mels=config["num_mels"],
    f_min=config["fmin"],
    f_max=config["fmax"],
    mel_scale='slaney',
    norm='slaney'
    )

In [20]:
# Apply the torchaudio transform to the resampled clip. The result is linear
# (not yet log-scaled); the logarithm is taken in the next cell.
# NOTE(review): despite the `lmf_` prefix this is not a log-mel feature yet.
lmf_torch = trans(torch.tensor(resamp_audio, dtype=torch.float))
lmf_torch

tensor([[5.3879e-05, 1.3442e-04, 2.0615e-04,  ..., 2.2653e-04, 2.0602e-04,
         1.3215e-04],
        [3.0263e-05, 2.5293e-05, 4.3288e-05,  ..., 4.8685e-05, 3.7437e-05,
         1.8879e-05],
        [6.9207e-06, 2.5714e-06, 3.5606e-06,  ..., 1.6411e-06, 3.2443e-06,
         3.8392e-06],
        ...,
        [5.9611e-08, 6.2648e-08, 4.8260e-08,  ..., 5.0844e-08, 8.0969e-08,
         8.4309e-08],
        [2.5317e-08, 2.7627e-08, 3.0116e-08,  ..., 5.9407e-08, 1.1952e-07,
         1.0101e-07],
        [1.0619e-08, 1.0355e-08, 8.7997e-09,  ..., 2.1267e-08, 1.6373e-08,
         1.4542e-08]])

In [21]:
# Swap the two axes of the transform output and take log10.
# NOTE(review): unlike logmelfilterbank there is no eps floor here, so any
# exact zero in lmf_torch would become -inf.
x_torch = torch.log10(lmf_torch.transpose(0,1))
x_torch

tensor([[-4.2686, -4.5191, -5.1598,  ..., -7.2247, -7.5966, -7.9739],
        [-3.8715, -4.5970, -5.5898,  ..., -7.2031, -7.5587, -7.9848],
        [-3.6858, -4.3636, -5.4485,  ..., -7.3164, -7.5212, -8.0555],
        ...,
        [-3.6449, -4.3126, -5.7849,  ..., -7.2938, -7.2262, -7.6723],
        [-3.6861, -4.4267, -5.4889,  ..., -7.0917, -6.9226, -7.7859],
        [-3.8789, -4.7240, -5.4158,  ..., -7.0741, -6.9956, -7.8374]])

In [15]:
# Re-synthesize a waveform from the librosa reference features.
# normalize_before=True presumably makes the model apply its own feature
# normalization; confirm against parallel_wavegan. view(-1) flattens the
# output to 1-D.
c = torch.tensor(lmf, dtype=torch.float).to(device)
y = model.inference(c, normalize_before=True).view(-1)

In [25]:
# Re-synthesize from the torchaudio-based features instead of `lmf`.
# NOTE(review): x_torch differs numerically from `lmf` (compare the displayed
# values above), so this is not the same input the reference path uses.
y_torch = model.inference(x_torch.to(device), normalize_before=True).view(-1)

In [26]:
# Play the re-synthesized waveform at the vocoder's sampling rate.
ipd.Audio(y_torch.cpu().detach().numpy(), rate=config["sampling_rate"])

In [4]:
model

ParallelWaveGANGenerator(
  (first_conv): Conv1d1x1(1, 64, kernel_size=(1,), stride=(1,))
  (upsample_net): ConvInUpsampleNetwork(
    (conv_in): Conv1d(80, 80, kernel_size=(5,), stride=(1,), bias=False)
    (upsample): UpsampleNetwork(
      (up_layers): ModuleList(
        (0): Stretch2d()
        (1): Conv2d(1, 1, kernel_size=(1, 9), stride=(1, 1), padding=(0, 4), bias=False)
        (2): Stretch2d()
        (3): Conv2d(1, 1, kernel_size=(1, 11), stride=(1, 1), padding=(0, 5), bias=False)
        (4): Stretch2d()
        (5): Conv2d(1, 1, kernel_size=(1, 7), stride=(1, 1), padding=(0, 3), bias=False)
        (6): Stretch2d()
        (7): Conv2d(1, 1, kernel_size=(1, 11), stride=(1, 1), padding=(0, 5), bias=False)
      )
    )
  )
  (conv_layers): ModuleList(
    (0): WaveNetResidualBlock(
      (conv): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv1x1_aux): Conv1d1x1(80, 128, kernel_size=(1,), stride=(1,), bias=False)
      (conv1x1_out): Conv1d1x1(64, 64,

In [17]:
from art.defences.preprocessor import Preprocessor

In [162]:
from torchaudio.transforms import Resample
from functools import partial

In [None]:
class PWGAN_Defense(torch.nn.Module):
    """Re-synthesize a waveform through a ParallelWaveGAN vocoder.

    The input is resampled to the vocoder's sampling rate, converted to
    log-mel features with ``logmelfilterbank_torch``, passed through the
    vocoder's ``inference`` method, and resampled back to the input rate.

    Args:
        config (dict): Vocoder config. Reads "sampling_rate", "hop_size",
            "fft_size", "win_length", "window", "num_mels", "fmin", "fmax"
            and optionally "log_base" (default 10.0).
        ckpt_path (str): Checkpoint path handed to ``load_model``.
        input_sampling_rate (int): Sampling rate of the waveforms given to
            ``forward`` and of the waveforms it returns.
        device: Optional device to move the whole module to. When None the
            module stays wherever ``load_model`` placed the weights; use
            ``.to(...)`` on the instance to move it later.
    """

    def __init__(self, config, ckpt_path, input_sampling_rate=16000, device=None):
        super().__init__()
        # Resample to the rate the features are computed at, rather than a
        # hard-coded 24000 that could disagree with the config.
        vocoder_sampling_rate = config["sampling_rate"]
        self.pre_resample = Resample(input_sampling_rate, vocoder_sampling_rate)
        self.post_resample = Resample(vocoder_sampling_rate, input_sampling_rate)
        self.log_mel_feature = partial(
            logmelfilterbank_torch,
            sampling_rate=vocoder_sampling_rate,
            hop_size=config["hop_size"],
            fft_size=config["fft_size"],
            win_length=config["win_length"],
            window=config["window"],
            num_mels=config["num_mels"],
            fmin=config["fmin"],
            fmax=config["fmax"],
            # keep compatibility: fall back to base 10 when absent
            log_base=config.get("log_base", 10.0),
        )
        model = load_model(ckpt_path, config)
        model.remove_weight_norm()
        self.pwgan = model.eval()
        # No dependency on a notebook-global `device`: move everything
        # (vocoder and resamplers together) only when explicitly asked to.
        if device is not None:
            self.to(device)

    def forward(self, x):
        """Return the vocoder re-synthesis of ``x`` as a flat 1-D waveform.

        Args:
            x (Tensor): Waveform sampled at ``input_sampling_rate``.

        Returns:
            Tensor: Re-synthesized waveform at ``input_sampling_rate``.
        """
        x = self.pre_resample(x)
        x = self.log_mel_feature(x)
        # The vocoder is stored as `self.pwgan`; `self.model` never existed.
        x = self.pwgan.inference(x, normalize_before=True).view(-1)
        return self.post_resample(x)
        

In [1]:
!pip install tensorflow

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [1]:
import tensorflow

2022-12-04 01:58:28.503557: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-04 01:58:28.633244: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-12-04 01:58:29.495351: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib/python3.8/site-packages

In [2]:
import sys, torch, librosa
# Make the parent directory importable; presumably that is where
# PWGAN_defense.py (imported in the next cell) lives. Confirm.
# NOTE(review): "../" is relative to the kernel's working directory, so this
# only works when the notebook is started from its own folder.
sys.path.append("..")

In [3]:
from PWGAN_defense import PWGAN_Defense_auto_ckpt

In [4]:
pwgan = PWGAN_Defense_auto_ckpt()

In [5]:
pwgan = pwgan.to("cuda")

In [6]:
import soundfile as sf
# `audio` is indexed below as audio[0] (waveform) and audio[1] (sample rate).
# NOTE(review): absolute, user-specific path.
audio = sf.read("/jet/home/rni/ECE_STORAGE/785-proj/PWGAN-ASR-defense/datasets/0_benign.wav")

In [7]:
import IPython.display as ipd
# Resample to 24 kHz purely for listening.
# NOTE(review): the following cells pass audio[0], not resamp_audio, to the
# defence, so this variable is only used for playback here.
new_sr = 24000
resamp_audio = librosa.resample(audio[0], orig_sr=audio[1], target_sr=new_sr)
ipd.Audio(resamp_audio, rate=new_sr)

In [8]:
# Run the original-rate waveform through the defence on the GPU. The input is
# cast to float32 because sf.read produced float64 samples.
y = pwgan(torch.tensor(audio[0], dtype=torch.float).to("cuda"))

In [9]:
# Play the defended waveform.
# NOTE(review): 16000 is assumed to be the defence's output rate; confirm.
ipd.Audio(y.cpu().detach().numpy(), rate=16000)

In [10]:
from art.defences.preprocessor.preprocessor import PreprocessorPyTorch

In [11]:
from typing import Optional, Tuple

In [12]:
class PWGANDefensePyTorch(PreprocessorPyTorch):
    """ART preprocessing defence that re-synthesizes audio with ParallelWaveGAN.

    Registered as already fitted, skipped during fit and applied during
    predict (see the flags passed to the base class).

    Args:
        device: Passed to the base class as ``device_type`` and used as the
            target of ``.to(...)`` for the vocoder module.
    """

    def __init__(self, device):
        super().__init__(
            device_type=device,
            is_fitted=True,
            apply_fit=False,
            apply_predict=True,
        )
        self.pwgan = PWGAN_Defense_auto_ckpt().to(device)

    def forward(
        self, x: "torch.Tensor", y: Optional["torch.Tensor"] = None
    ) -> Tuple["torch.Tensor", Optional["torch.Tensor"]]:
        """Return the vocoder re-synthesis of ``x``; ``y`` is passed through untouched."""
        # Use this instance's vocoder. The original referenced the
        # notebook-global `pwgan`, which only worked through hidden kernel
        # state and ignored the module built in __init__.
        return self.pwgan(x), y

In [13]:
pwgan_defense = PWGANDefensePyTorch("cuda")

In [14]:
from art.estimators.speech_recognition import PyTorchDeepSpeech
# DeepSpeech estimator with the PWGAN defence attached as a preprocessing
# step; the commented-out line is the undefended variant.
# ds2 = PyTorchDeepSpeech(pretrained_model="librispeech", clip_values=[-1, 1])
ds2 = PyTorchDeepSpeech(pretrained_model="librispeech", clip_values=[-1, 1], preprocessing_defences=pwgan_defense)

In [15]:
# NOTE(review): ProjectedGradientDescentPyTorch and ImperceptibleASRPyTorch
# are imported but not used in this cell.
from art.attacks.evasion import ProjectedGradientDescentPyTorch, ProjectedGradientDescent, ImperceptibleASRPyTorch
# L-infinity PGD against `ds2`: budget eps, step size eps/5, 7 iterations,
# one sample per batch.
eps = 0.01
pgd = ProjectedGradientDescent(ds2, norm='inf', eps=eps, eps_step=eps/5, max_iter=7, batch_size=1)

In [16]:
from deepspeech_pytorch.model import DeepSpeech
str(DeepSpeech.__base__)

"<class 'pytorch_lightning.core.module.LightningModule'>"

In [17]:
audio[0].dtype

dtype('float64')

In [23]:
import numpy as np
# Wrap the single waveform in a list so the attack receives a batch of one.
# NOTE(review): audio[0] is float64 (see the dtype cell above); confirm the
# estimator does not expect float32.
# pgd.generate(np.array([audio[0]]))
adv = pgd.generate(np.array([audio[0]]))

PGD - Iterations:   0%|          | 0/7 [00:00<?, ?it/s]

In [24]:
%load_ext autoreload
%autoreload 2

In [21]:
torch.cuda.empty_cache()

In [25]:
ipd.Audio(adv, rate=16000)

In [26]:
from art.estimators.speech_recognition import PyTorchDeepSpeech
# Rebuild the estimator WITHOUT the preprocessing defence, presumably as the
# undefended baseline for comparison. This rebinds `ds2`, so cell order matters.
ds2 = PyTorchDeepSpeech(pretrained_model="librispeech", clip_values=[-1, 1])
# ds2 = PyTorchDeepSpeech(pretrained_model="librispeech", clip_values=[-1, 1], preprocessing_defences=pwgan_defense)
# ds2 = PyTorchDeepSpeech(pretrained_model="librispeech", clip_values=[-1, 1], preprocessing_defences=pwgan_defense)

In [27]:
# NOTE(review): this cell duplicates the earlier PGD setup; only the `ds2` it
# binds to has changed. A small helper taking the estimator would avoid the
# copy. ProjectedGradientDescentPyTorch and ImperceptibleASRPyTorch are unused.
from art.attacks.evasion import ProjectedGradientDescentPyTorch, ProjectedGradientDescent, ImperceptibleASRPyTorch
# L-infinity PGD: budget eps, step size eps/5, 7 iterations, batch size 1.
eps = 0.01
pgd = ProjectedGradientDescent(ds2, norm='inf', eps=eps, eps_step=eps/5, max_iter=7, batch_size=1)

In [28]:
import numpy as np
# Attack the undefended estimator with the same single-waveform batch.
# NOTE(review): this overwrites `adv` from the defended run; use a distinct
# name if both results need to be compared.
# pgd.generate(np.array([audio[0]]))
adv = pgd.generate(np.array([audio[0]]))

PGD - Iterations:   0%|          | 0/7 [00:00<?, ?it/s]

In [29]:
ipd.Audio(adv, rate=16000)