# FGSM attack on a single spectrogram

In [1]:
from src.utils import *
import librosa
import IPython.display as ipd
import torch.nn as nn
from src.resnet_model import SpectrogramModel
from src.resnet_utils import get_features

## Preliminaries

In [2]:
seed_everything(1234)
set_gpu(-1)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

config_path = '../config/residualnet_train_config.yaml'
config = read_yaml(config_path)

GPU selected: 0 - NVIDIA GeForce RTX 3060


In [3]:
df_eval = pd.read_csv(os.path.join('..', config["df_eval_path"]))

# get the list of evaluation files and labels
file_eval = list(df_eval['path'])
label_eval = list(df_eval['label'])

# get one single file (given the index)
index = 0
file = file_eval[index]
label = label_eval[index]
print(f'Evaluating file {file} with label {label}')

Evaluating file /nas/public/dataset/asvspoof2021/ASVspoof2021_DF_eval/flac/DF_E_2000011.flac with label 1


## Load the model

In [4]:
model = SpectrogramModel().to(device)
model.load_state_dict(torch.load(os.path.join('..', config["model_path_spec"]), map_location=device))
model.eval()
print('Model loaded')
print(f'Is the model on GPU? {next(model.parameters()).is_cuda}')

Model loaded
Is the model on GPU? True


## Load the data (cached spec)

In [5]:
spec= get_features(wav_path=file,
                   features=config['features'],
                   args=config,
                   X=None,
                   cached=True,
                   force=False)
print(f'Spectrogram shape is {spec.shape}')

Spectrogram shape is (1025, 41)


In [6]:
# plot the spectrogram

# librosa.display.specshow(spec, x_axis='time', y_axis='linear', hop_length=512, sr=16000, cmap='magma')
# plt.title('Power spectrogram')
# plt.colorbar(format='%+2.0f dB')
# plt.show()

In [7]:
def find_range(matrix):
    range = np.empty(2)
    range[0] = np.min(matrix)
    range[1] = np.max(matrix)
    return range

range_clean_spec = find_range(spec)
range_clean_spec

array([-38.86148834,  41.13851166])

## Run the attack

In [8]:
# transform into a mini batch and to a tensor
X_batch = np.expand_dims(spec, axis=0)  # ndarray 
X_batch_tensor = torch.from_numpy(X_batch).to(device) # tensor 

X_batch_tensor.size()

torch.Size([1, 1025, 41])

In [9]:
# FGSM attack requires grad wrt the data
X = X_batch_tensor
X.requires_grad = True

In [10]:
out = model(X)
out

tensor([[-8.0388e+00, -3.2277e-04]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)

In [11]:
score = out[0,0]-out[0,1]
if score >0:
    pred = 0
else:
    pred = 1
    
print(f'The predicted class is {pred} and the GT label is {label}')

The predicted class is 1 and the GT label is 1


In [12]:
epsilon = 0.4

# initialize loss object
L = nn.NLLLoss()

In [13]:
loss = None
label_tensor = torch.tensor([1]).to(device)
loss = L(out, label_tensor)

In [14]:
# zero out all existing gradients
model.zero_grad()

# compute gradients
loss.backward()
X_grad = X.grad

In [15]:
def perturb(X, epsilon, grad):
    X_prime = X + epsilon * grad.sign()
    return X_prime

In [16]:
perturbed_spec = perturb(X, epsilon, X_grad)

In [17]:
p_spec = perturbed_spec.squeeze(0).detach()
p_spec = p_spec.cpu().numpy()

In [18]:
p_spec.shape

(1025, 41)

In [19]:
range_p_spec = find_range(p_spec)
range_p_spec

array([-39.26148987,  41.53851318])

## Evaluate the attack

In [20]:
X_p_batch = np.expand_dims(p_spec, axis=0)  # ndarray 
X_p_batch_tensor = torch.from_numpy(X_p_batch).to(device) # tensor 
X_p = X_p_batch_tensor
X_p.requires_grad = True

In [21]:
out_p = model(X_p)
out_p

tensor([[-0.4350, -1.0421]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)

In [22]:
score_p = out_p[0,0]-out_p[0,1]
if score_p >0:
    pred_p = 0
else:
    pred_p = 1
    
print(f'The predicted class is {pred_p} and the GT label is {label}')

The predicted class is 0 and the GT label is 1


## Reconstruct the audio

In [23]:
from tests import recover_mag_spec, spsi, griffin_lim

In [24]:
mag_spec_p = recover_mag_spec(p_spec)

DEBUG:numba.core.byteflow:bytecode dump:
>          0	NOP(arg=None, lineno=1135)
           2	RESUME(arg=0, lineno=1135)
           4	LOAD_FAST(arg=0, lineno=1138)
           6	LOAD_CONST(arg=1, lineno=1138)
           8	BINARY_SUBSCR(arg=None, lineno=1138)
          18	STORE_FAST(arg=3, lineno=1138)
          20	LOAD_FAST(arg=1, lineno=1139)
          22	UNARY_NEGATIVE(arg=None, lineno=1139)
          24	LOAD_FAST(arg=3, lineno=1139)
          26	SWAP(arg=2, lineno=1139)
          28	COPY(arg=2, lineno=1139)
          30	COMPARE_OP(arg=1, lineno=1139)
          36	POP_JUMP_FORWARD_IF_FALSE(arg=6, lineno=1139)
          38	LOAD_FAST(arg=1, lineno=1139)
          40	COMPARE_OP(arg=1, lineno=1139)
          46	POP_JUMP_FORWARD_IF_FALSE(arg=5, lineno=1139)
          48	JUMP_FORWARD(arg=2, lineno=1139)
>         50	POP_TOP(arg=None, lineno=1139)
          52	JUMP_FORWARD(arg=2, lineno=1139)
>         54	LOAD_CONST(arg=1, lineno=1140)
          56	STORE_FAST(arg=3, lineno=1140)
>         58

In [25]:
SPSI_audio_p = spsi(msgram=mag_spec_p, n_fft=2048, hop_length=512)

In [26]:
p = np.angle(librosa.stft(y=SPSI_audio_p, n_fft=2048, hop_length=512, center=False))

DEBUG:numba.core.byteflow:bytecode dump:
>          0	NOP(arg=None, lineno=1039)
           2	RESUME(arg=0, lineno=1039)
           4	LOAD_FAST(arg=0, lineno=1042)
           6	LOAD_CONST(arg=1, lineno=1042)
           8	BINARY_SUBSCR(arg=None, lineno=1042)
          18	LOAD_FAST(arg=0, lineno=1042)
          20	LOAD_CONST(arg=2, lineno=1042)
          22	BINARY_SUBSCR(arg=None, lineno=1042)
          32	COMPARE_OP(arg=4, lineno=1042)
          38	LOAD_FAST(arg=0, lineno=1042)
          40	LOAD_CONST(arg=1, lineno=1042)
          42	BINARY_SUBSCR(arg=None, lineno=1042)
          52	LOAD_FAST(arg=0, lineno=1042)
          54	LOAD_CONST(arg=3, lineno=1042)
          56	BINARY_SUBSCR(arg=None, lineno=1042)
          66	COMPARE_OP(arg=5, lineno=1042)
          72	BINARY_OP(arg=1, lineno=1042)
          76	RETURN_VALUE(arg=None, lineno=1042)
DEBUG:numba.core.byteflow:pending: deque([State(pc_initial=0 nstack_initial=0)])
DEBUG:numba.core.byteflow:stack: []
DEBUG:numba.core.byteflow:state.pc

In [27]:
SPSI_GL_audio_p = griffin_lim(magnitude_spectrogram=mag_spec_p,
                            n_fft=2048,
                            hop_length=512,
                            num_iterations=100,
                            init_phase=p)

In [28]:
ipd.Audio(SPSI_GL_audio_p, rate=16000)