In [1]:
from matplotlib import pyplot as plt
import numpy as np
from pathlib import Path
import soundfile as sf
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import torch.nn.functional as F

In [2]:
audio_file = Path('40-222-0000.flac')
print(audio_file.exists())

True


In [3]:
# Load Audio File
audio, sr = librosa.load(audio_file, sr=None)
print(f"Audio loaded at {sr} Hz, Duration: {len(audio)/sr:.2f}s")
audio = audio.astype(np.float32)

Audio loaded at 16000 Hz, Duration: 13.14s


In [26]:
# Add noise to Audio File
rng = np.random.default_rng(seed=42)
noise = rng.normal(0, 0.001, len(audio))
audio_noise = audio + noise

In [6]:
#######################################
### FGSM (Fast Gradient Sign Method)###
#######################################

In [57]:
device = torch.device("cpu")
# load wav2vec2 surrogate, using pretrained
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device).eval()

# helper to transcribe with surrogate
def transcribe_surrogate(arr, sr):
    inputs = processor(arr, sampling_rate=sr, return_tensors="pt", padding=True)
    input_values = inputs.input_values.to(device)
    with torch.no_grad():
        logits = model(input_values).logits
    pred_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(pred_ids)[0]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [58]:
# Encoding transcription into tensor (for targeted attack)

dictionary = {'-': 0, '|': 1, 'E': 2, 'T': 3, 'A': 4, 
              'O': 5, 'N': 6, 'I': 7, 'H': 8, 'S': 9, 
              'R': 10, 'D': 11, 'L': 12, 'U': 13, 'M': 14, 
              'W': 15, 'C': 16, 'F': 17, 'G': 18, 'Y': 19, 
              'P': 20, 'B': 21, 'V': 22, 'K': 23, "'": 24, 
              'X': 25, 'J': 26, 'Q': 27, 'Z': 28} #wav2vec uses this dictionary

chars = list(transcribe_surrogate(audio,sr))
encoded_chars = [dictionary[char] for char in chars if char in dictionary]
encoded_transcription = torch.tensor(encoded_chars)

In [71]:
# epsilon - finetune
EPS = 0.002

# Untargeted FGSM attack
def FGSM_step(audio):
    inputs = processor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
    input_values = inputs.input_values.to(device)   # (1, T)
    input_values = input_values.clone().detach().requires_grad_(True)

    logits = model(input_values).logits                 # (1, seq_len, vocab)
    max_logits = torch.max(logits, dim=-1).values       # (1, seq_len)
    loss = -torch.mean(max_logits)                      
    model.zero_grad() # clears any gradients stored on models params
    loss.backward()

    grad = input_values.grad.detach()
    sign = torch.sign(grad)
    adv_tensor = input_values + EPS * sign
    adv_tensor = torch.clamp(adv_tensor, -1, 1).detach().cpu().numpy()[0]
    return adv_tensor

# Targeted FGSM attack
def FGSM_target(audio):
    inputs = processor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
    input_values = inputs.input_values.to(device)   # (1, T)
    input_values.requires_grad = True

    logits = model(input_values).logits
    # more like "untarget"
    target = torch.from_numpy(np.array(encoded_transcription)).to(device).long()

    # Computing CTC Loss
    logits_length = torch.tensor([logits.shape[1]], dtype=torch.long).to(device)
    logits = logits.transpose(0, 1)
    target_length = torch.tensor([len(encoded_transcription)], dtype=torch.long).to(device)
    loss = F.ctc_loss(logits, target, logits_length, target_length, blank=0, reduction='mean')

    loss.backward()

    # maximize loss
    sign = input_values.grad.sign() 
    adv_tensor = input_values + EPS * sign
    adv_tensor = torch.clamp(adv_tensor, -1, 1).detach().cpu().numpy()[0]
    return adv_tensor
    

In [62]:
#FGSM_step(audio)
adv_tensor = FGSM_target(audio)

  target = torch.from_numpy(np.array(encoded_transcription)).to(device).long()


In [72]:
# Iterations seem to work with degrading model performance - finetune
adv_tensor = audio
for i in range(10):
    #FGSM_step(adv_tensor)
    adv_tensor = FGSM_target(adv_tensor)

  target = torch.from_numpy(np.array(encoded_transcription)).to(device).long()


In [73]:
# Save watermarked audio and compare (transcripts + audio)

import IPython.display as ipd

out_file = "40-222-0000_noise.flac"
sf.write(out_file, audio_noise, sr)

out_file = "40-222-0000_fgsm.flac"
sf.write(out_file, adv_tensor, sr)


print("Original Audio:")
ipd.display(ipd.Audio(audio, rate=sr))
orig_transcript = transcribe_surrogate(audio, sr)
print(orig_transcript)

print("\nNoisy Audio:")
ipd.display(ipd.Audio(audio_noise, rate=sr))
noisy_transcript = transcribe_surrogate(audio_noise, sr)
print(noisy_transcript)

print("\nFGSM Audio:")
ipd.display(ipd.Audio(adv_tensor, rate=sr))
adv_transcript = transcribe_surrogate(adv_tensor, sr)
print(adv_transcript)

Original Audio:


CATHERINE WAS COMPLETELY AWAKENED HENRY'S ADDRESS SHORT AS IT HAD BEEN HAD MORE THOROUGHLY OPENED HER EYES TO THE EXTRAVAGANCE OF HER LATE FANCIES THAN ALL THEIR SEVERAL DISAPPOINTMENTS HAD DONE

Noisy Audio:


CATHERINE WAS COMPLETELY AWAKENED HENRY'S ADDRESS SHORT AS IT HAD BEEN HAD MORE THOROUGHLY OPENED HER EYES TO THE EXTRAVAGANCE OF HER LATE FANCIES THAN ALL THEIR SEVERAL DISAPPOINTMENTS HAD DONE

FGSM Audio:


A DEE OAA  A DON
