In [19]:
import matplotlib.pyplot as plt

from src.utils import *
import os
import IPython.display as ipd
import logging
#from src.ResNet1D.resnet1d_model import SpectrogramModel1D
#from src.SENet.senet1d_model import se_resnet341d_custom
from src.SENet.SENet_model import se_resnet34_custom
import torch.nn as nn
import librosa


logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('matplotlib.font_manager').disabled = True
logging.getLogger('matplotlib.colorbar').disabled = True
logging.getLogger('matplotlib.pyplot').disabled = True

In [20]:
config_path = '../config/residualnet_train_config.yaml'
config_res = read_yaml(config_path)
seed_everything(1234)
set_gpu(-1)
plt.style.use('dark_background')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
script_dir = os.getcwd()

GPU selected: 0 - NVIDIA GeForce RTX 3060


## Load clean audio and compute power spec

In [21]:
# load one file
file_number = 1001227
label = np.array([1])
clean_audio_path = os.path.join(script_dir, '..',f'/nas/public/dataset/asvspoof2019/LA/ASVspoof2019_LA_eval/flac/LA_E_{file_number}.flac')
clean, _ = librosa.load(clean_audio_path, sr=16000)
clean = clean[:47104]

In [22]:
win_length = 2048
n_fft = 2048
hop_length = 512
window = 'hann'

s = librosa.stft(clean, n_fft=n_fft, win_length=win_length, hop_length=hop_length, window=window, center=True)
phase = np.angle(s)
mag = np.abs(s)

a = np.abs(s) ** 2
pow_spec = librosa.power_to_db(a, ref=np.max)

In [23]:
# load SENet model
parent_dir = os.path.dirname(script_dir)
config_path = os.path.join(parent_dir, 'config', 'SENet.yaml')
config = read_yaml(config_path)
check_dir = os.path.join(parent_dir, config['model_path_spec_pow_v0'])
sen_model = se_resnet34_custom(num_classes=2).to(device)
sen_model.load_state_dict(torch.load(check_dir, map_location=device), strict=False)

<All keys matched successfully>

In [24]:
#res_model.eval()
sen_model.eval()

CustomResNet(
  (conv1): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): SEBasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (se): SELayer(
        (avg_pool): AdaptiveAvgPool2d(output_size=1)
        (fc): Sequential(
          (0): Linear(in_features=64, out_features=4, bias=False)
          (1): ReLU(inplace=True)
          (2): Linear(in_features=4, out

In [25]:
clean_batch = torch.from_numpy(pow_spec).unsqueeze(dim=0).to(device)
clean_batch.shape

torch.Size([1, 1025, 93])

In [26]:
# FGSM attack on SENet
epsilon = 3.0
clean_batch.requires_grad = True
label = np.array([1])
batch_y = torch.from_numpy(label).to(device)
batch_y.shape

torch.Size([1])

## FGSM

In [27]:
L = nn.NLLLoss()
out = sen_model(clean_batch.unsqueeze(dim=1))
print(f'Clean audio prediction is {torch.argmax(out)} with {out}')
loss = L(out, batch_y)
sen_model.zero_grad()
loss.backward()
grad = clean_batch.grad.data
pert_batch = clean_batch + epsilon * grad.sign()

out_pert = sen_model(pert_batch.unsqueeze(dim=1))
print(f'Perturbed audio prediction is {torch.argmax(out_pert)} with {out_pert}')

Clean audio prediction is 1 with tensor([[-1.0387, -0.4369]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
Perturbed audio prediction is 0 with tensor([[  0.0000, -41.8471]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)


In [28]:
# reconvert back to audio
pow_spec = pert_batch.detach().squeeze(0).cpu().numpy()
lin_spec = librosa.db_to_power(pow_spec)
mag_a = np.sqrt(lin_spec)

recon_a = librosa.istft(mag_a * np.exp(1j * phase), n_fft=n_fft, win_length=win_length, hop_length=hop_length, window=window, center=True)  

In [29]:
ipd.Audio(recon_a, rate=16000)

## Audio not yes saved as FLAC, go back to spectrogram

In [30]:
recon_a.shape

(47104,)

In [31]:
# BEFORE SAVING THE AUDIO, get the model prediction 
s1 = librosa.stft(recon_a, n_fft=n_fft, win_length=win_length, hop_length=hop_length, window=window, center=True)
phase1 = np.angle(s1)
mag1 = np.abs(s1)

a1 = np.abs(s1) ** 2
pow_spec1 = librosa.power_to_db(a1, ref=np.max)

In [32]:
batch1 = torch.from_numpy(pow_spec1).unsqueeze(dim=0).to(device)
out1 = sen_model(batch1.unsqueeze(dim=1))
print(f'Not converted to FLAC audio prediction is {torch.argmax(out1)} with {out1}')

Not converted to FLAC audio prediction is 0 with tensor([[-3.5763e-07, -1.4964e+01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)


## Save audio as npy and then reload it

In [33]:
np.save('audio.npy', recon_a)

audio3 = np.load('audio.npy')
s3 = librosa.stft(audio3, n_fft=n_fft, win_length=win_length, hop_length=hop_length, window=window, center=True)
phase3 = np.angle(s3)
mag3 = np.abs(s3)

a3 = np.abs(s3) ** 2
pow_spec3 = librosa.power_to_db(a3, ref=np.max)
batch3 = torch.from_numpy(pow_spec3).unsqueeze(dim=0).to(device)
out3 = sen_model(batch3.unsqueeze(dim=1))
print(f'Saved as .npy audio prediction is {torch.argmax(out3)} with {out3}')

Saved as .npy audio prediction is 0 with tensor([[-3.5763e-07, -1.4964e+01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)


## Save audio as FLAC file and then reload it

In [34]:
# save the audio
import soundfile as sf
sf.write('audio.flac', recon_a, samplerate=16000)

In [35]:
# Load the file
audio2,_ = librosa.load('audio.flac', sr=16000)
audio2.shape

(47104,)

In [36]:
audio2 = audio2[:47104]
s2 = librosa.stft(audio2, n_fft=n_fft, win_length=win_length, hop_length=hop_length, window=window, center=True)
phase2 = np.angle(s2)
mag2 = np.abs(s2)

a2 = np.abs(s2) ** 2
pow_spec2 = librosa.power_to_db(a2, ref=np.max)
batch2 = torch.from_numpy(pow_spec2).unsqueeze(dim=0).to(device)
out2 = sen_model(batch2.unsqueeze(dim=1))
print(f'Converted to FLAC audio prediction is {torch.argmax(out2)} with {out2}')

Converted to FLAC audio prediction is 0 with tensor([[-7.3909e-06, -1.1809e+01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
