In [2]:
pip install numpy scipy torch torchaudio librosa pydub matplotlib

Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch
import torchaudio
import numpy as np
import scipy.io.wavfile as wav
import os
from pydub import AudioSegment
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

def convert_audio(input_path):
    base, ext = os.path.splitext(input_path)
    output_path = f"{base}_converted.wav"
    audio = AudioSegment.from_file(input_path)
    audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)
    audio.export(output_path, format="wav")
    return output_path

MODEL_NAME = "facebook/wav2vec2-large-960h"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME, ignore_mismatched_sizes=True)

AUDIO_PATH = "Tsg_long_version.wav"
AUDIO_PATH = convert_audio(AUDIO_PATH)

waveform, sample_rate = torchaudio.load(AUDIO_PATH)

if sample_rate != 16000:
    raise ValueError(f"Incorrect sample rate: {sample_rate} Hz")

waveform = waveform.mean(dim=0)
waveform = waveform / waveform.abs().max()

inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
inputs.input_values = inputs.input_values.squeeze(1)
with torch.no_grad():
    logits = model(inputs.input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
text_original = processor.batch_decode(predicted_ids)[0]

print(f"✅ Original Transcription: {text_original}")

audio_tensor = waveform.clone().detach().requires_grad_(True)

epsilon = 0.005
alpha = 0.001
num_iter = 6

adv_audio = audio_tensor.clone().detach().requires_grad_(True)

target = processor.tokenizer(text_original, return_tensors="pt").input_ids.squeeze(0)

input_lengths = torch.tensor([logits.shape[1]], dtype=torch.long)
target_lengths = torch.tensor([target.shape[0]], dtype=torch.long)
ctc_loss = torch.nn.CTCLoss(blank=processor.tokenizer.pad_token_id)

for i in range(num_iter):
    print(f"🔄 Iteration {i+1}")
    adv_audio = adv_audio.clone().detach().requires_grad_(True)

    if adv_audio.grad is not None:
        adv_audio.grad.zero_()

    logits = model(adv_audio.unsqueeze(0)).logits
    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
    loss = ctc_loss(log_probs.transpose(0, 1), target, input_lengths, target_lengths)
    
    loss.backward()
    print(f"🔍 Loss: {loss.item()}")
    
    with torch.no_grad():
        perturbation = alpha * torch.sign(adv_audio.grad)
        adv_audio = adv_audio + perturbation
        perturbation = torch.clamp(adv_audio - audio_tensor, min=-epsilon, max=epsilon)
        adv_audio = audio_tensor + perturbation
        adv_audio = torch.clamp(adv_audio, min=-1, max=1)
    
    adv_audio.requires_grad_()

adversarial_audio = (adv_audio.detach().numpy() * 32768).astype(np.int16)
wav.write("TpFasi_adversarial_PGD.wav", 16000, adversarial_audio)

inputs = processor(adv_audio.unsqueeze(0), sampling_rate=16000, return_tensors="pt", padding=True)
inputs.input_values = inputs.input_values.squeeze(0)  # Suppression correcte de la première dimension
inputs.input_values = inputs.input_values.squeeze().unsqueeze(0)


with torch.no_grad():
    logits = model(inputs.input_values).logits

predicted_ids = torch.argmax(logits, dim=-1)
text_adversarial = processor.batch_decode(predicted_ids)[0]

print(f"🚨 Transcription after attack: {text_adversarial}")
print("✅ Adversarial audio saved as 'TpFasi_adversarial_PGD.wav'")

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Original Transcription: THE SUN ALSO RISES POWERFUL INTENSE VISUALLY MAGNIFICENT THE SUN ALSO RISES IS THE NOVEL WHICH ESTABLISHED INATEMIGWAY AS A WRITER OF GENIUS
🔄 Iteration 1
🔍 Loss: 0.45314472913742065
🔄 Iteration 2
🔍 Loss: 0.9177964329719543
🔄 Iteration 3
🔍 Loss: 1.4054068326950073
🔄 Iteration 4
🔍 Loss: 1.3769713640213013
🔄 Iteration 5
🔍 Loss: 1.9479230642318726
🔄 Iteration 6
🔍 Loss: 1.5851980447769165
🚨 Transcription after attack: THE SUN ALSO ARISES POWERFUL AND TEMTH VISUALLY MAGNIFICENT THE SUN ALSO ARISES AS THE MOLVELE WHICH ESTABLISHS ERNEST HENLINK WITH OF THE RISER OFF GENIUS
✅ Adversarial audio saved as 'TpFasi_adversarial_PGD.wav'


In [6]:
import librosa
import librosa.display
import torch
import numpy as np
import matplotlib.pyplot as plt
from scipy.io.wavfile import write

def bark_scale(freq):
    """Convert frequency to the Bark scale."""
    return 13 * np.arctan(0.00076 * freq) + 3.5 * np.arctan((freq / 7500.0) ** 2)

def apply_psychoacoustic_masking(waveform, sr=16000):
    """Apply a psychoacoustic masking technique to hide perturbations."""
    waveform_np = waveform.detach().cpu().numpy()  # Détacher le tenseur avant conversion
    stft = librosa.stft(waveform_np, n_fft=512, hop_length=128)
    magnitude, phase = np.abs(stft), np.angle(stft)

    # Get frequency bins
    freqs = librosa.fft_frequencies(sr=sr, n_fft=512)
    bark_bins = bark_scale(freqs)

    # Apply masking: reduce noise on perceptually sensitive frequencies
    mask = np.exp(-bark_bins / np.max(bark_bins))  # Higher Bark → lower modification
    masked_magnitude = magnitude * mask[:, np.newaxis]

    # Reconstruct signal
    modified_stft = masked_magnitude * np.exp(1j * phase)
    modified_waveform = librosa.istft(modified_stft, hop_length=128)

    return torch.tensor(modified_waveform, dtype=torch.float32)

# Appliquer la correction et tester
adv_audio = apply_psychoacoustic_masking(adv_audio)


# Apply psychoacoustic filtering to perturbation
adv_audio = apply_psychoacoustic_masking(adv_audio)


In [20]:
import torch
import torchaudio
import numpy as np
import scipy.io.wavfile as wav
import os
import librosa
import librosa.display
from pydub import AudioSegment
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

def convert_audio(input_path):
    base, ext = os.path.splitext(input_path)
    output_path = f"{base}_converted.wav"
    audio = AudioSegment.from_file(input_path)
    audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)
    audio.export(output_path, format="wav")
    return output_path

def bark_scale(freq):
    """Convert frequency to the Bark scale."""
    return 13 * np.arctan(0.00076 * freq) + 3.5 * np.arctan((freq / 7500.0) ** 2)

def apply_psychoacoustic_masking(waveform, sr=16000):
    """Appliquer un masquage psychoacoustique pour cacher les perturbations."""
    waveform_np = waveform.detach().cpu().numpy()  # Détacher pour utiliser librosa
    stft = librosa.stft(waveform_np, n_fft=512, hop_length=128)
    magnitude, phase = np.abs(stft), np.angle(stft)

    # Appliquer une pondération basée sur l'échelle de Bark
    freqs = librosa.fft_frequencies(sr=sr, n_fft=512)
    bark_bins = bark_scale(freqs)
    mask = np.exp(-bark_bins / np.max(bark_bins))  # Plus Bark est haut, moins de modification
    masked_magnitude = magnitude * mask[:, np.newaxis]

    # Reconstruction du signal
    modified_stft = masked_magnitude * np.exp(1j * phase)
    modified_waveform = librosa.istft(modified_stft, hop_length=128)

    return torch.tensor(modified_waveform, dtype=torch.float32)

MODEL_NAME = "facebook/wav2vec2-large-960h"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME, ignore_mismatched_sizes=True)

AUDIO_PATH = "Tsg_long_version.wav"
AUDIO_PATH = convert_audio(AUDIO_PATH)

waveform, sample_rate = torchaudio.load(AUDIO_PATH)

if sample_rate != 16000:
    raise ValueError(f"Incorrect sample rate: {sample_rate} Hz")

waveform = waveform.mean(dim=0)
waveform = waveform / waveform.abs().max()

inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
inputs.input_values = inputs.input_values.squeeze(1)
with torch.no_grad():
    logits = model(inputs.input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
text_original = processor.batch_decode(predicted_ids)[0]

print(f"✅ Original Transcription: {text_original}")

audio_tensor = waveform.clone().detach().requires_grad_(True)

epsilon = 0.015
alpha = epsilon / 3
num_iter = min(10,int(epsilon/alpha)+2)

adv_audio = audio_tensor.clone().detach().requires_grad_(True)

target = processor.tokenizer(text_original, return_tensors="pt").input_ids.squeeze(0)

input_lengths = torch.tensor([logits.shape[1]], dtype=torch.long)
target_lengths = torch.tensor([target.shape[0]], dtype=torch.long)
ctc_loss = torch.nn.CTCLoss(blank=processor.tokenizer.pad_token_id)

for i in range(num_iter):
    print(f"🔄 Iteration {i+1}")
    adv_audio = adv_audio.clone().detach().requires_grad_(True)

    if adv_audio.grad is not None:
        adv_audio.grad.zero_()

    logits = model(adv_audio.unsqueeze(0)).logits
    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
    loss = ctc_loss(log_probs.transpose(0, 1), target, input_lengths, target_lengths)
    
    loss.backward()
    print(f"🔍 Loss: {loss.item()}")
    
    with torch.no_grad():
        perturbation = alpha * torch.sign(adv_audio.grad)
        adv_audio = adv_audio + perturbation
        perturbation = torch.clamp(adv_audio - audio_tensor, min=-epsilon, max=epsilon)
        adv_audio = audio_tensor + perturbation
        adv_audio = torch.clamp(adv_audio, min=-1, max=1)
    
    # ✅ Appliquer le masquage psychoacoustique après chaque mise à jour
    adv_audio = apply_psychoacoustic_masking(adv_audio)
    if adv_audio.shape[0] < audio_tensor.shape[0]:
    # Zero-padding si la sortie de istft est trop courte
        adv_audio = torch.nn.functional.pad(adv_audio, (0, audio_tensor.shape[0] - adv_audio.shape[0]))
    elif adv_audio.shape[0] > audio_tensor.shape[0]:
    # Tronquer si la sortie de istft est trop longue
        adv_audio = adv_audio[:audio_tensor.shape[0]]
    adv_audio.requires_grad_()

# Sauvegarde de l'audio après application du masquage psychoacoustique
adversarial_audio = (adv_audio.detach().numpy() * 32768).astype(np.int16)
wav.write("TpFasi_adversarial_PGD_psychoacoustic.wav", 16000, adversarial_audio)

# Vérifier la transcription après attaque
inputs = processor(adv_audio.unsqueeze(0), sampling_rate=16000, return_tensors="pt", padding=True)
inputs.input_values = inputs.input_values.squeeze(0)
inputs.input_values = inputs.input_values.squeeze().unsqueeze(0)

with torch.no_grad():
    logits = model(inputs.input_values).logits

predicted_ids = torch.argmax(logits, dim=-1)
text_adversarial = processor.batch_decode(predicted_ids)[0]

print(f"🚨 Transcription after attack (psychoacoustic applied): {text_adversarial}")
print("✅ Adversarial audio saved as 'TpFasi_adversarial_PGD_psychoacoustic.wav'")


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Original Transcription: THE SUN ALSO RISES POWERFUL INTENSE VISUALLY MAGNIFICENT THE SUN ALSO RISES IS THE NOVEL WHICH ESTABLISHED INATEMIGWAY AS A WRITER OF GENIUS
🔄 Iteration 1
🔍 Loss: 0.45314472913742065
🔄 Iteration 2
🔍 Loss: 1.0092693567276
🔄 Iteration 3
🔍 Loss: 1.4539203643798828
🔄 Iteration 4
🔍 Loss: 1.3525958061218262
🔄 Iteration 5
🔍 Loss: 1.8819913864135742
🚨 Transcription after attack (psychoacoustic applied): THE SUN ALSO ARISES POWERFUL AND PREVISUALLY MAGNIFICENT THE SUN ALSO ARIVES IS A NOVEL WHICH ESTABLISHED EARNEST HIMIG WITH OTHER WRITER OF GENIUS
✅ Adversarial audio saved as 'TpFasi_adversarial_PGD_psychoacoustic.wav'


In [32]:
import torch
import torchaudio
import numpy as np
import scipy.io.wavfile as wav
import os
import librosa
import librosa.display
from pydub import AudioSegment
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

def convert_audio(input_path):
    base, ext = os.path.splitext(input_path)
    output_path = f"{base}_converted.wav"
    audio = AudioSegment.from_file(input_path)
    audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)
    audio.export(output_path, format="wav")
    return output_path

def bark_scale(freq):
    """Convert frequency to the Bark scale."""
    return 13 * np.arctan(0.00076 * freq) + 3.5 * np.arctan((freq / 7500.0) ** 2)

def apply_psychoacoustic_masking(waveform, sr=16000):
    """Appliquer un masquage psychoacoustique pour cacher les perturbations."""
    waveform_np = waveform.detach().cpu().numpy()  # Détacher pour utiliser librosa
    stft = librosa.stft(waveform_np, n_fft=512, hop_length=128)
    magnitude, phase = np.abs(stft), np.angle(stft)

    # Appliquer une pondération basée sur l'échelle de Bark
    freqs = librosa.fft_frequencies(sr=sr, n_fft=512)
    bark_bins = bark_scale(freqs)
    mask = np.exp(-bark_bins / np.max(bark_bins))  # Plus Bark est haut, moins de modification
    masked_magnitude = magnitude * mask[:, np.newaxis]

    # Reconstruction du signal
    modified_stft = masked_magnitude * np.exp(1j * phase)
    modified_waveform = librosa.istft(modified_stft, hop_length=128)

    return torch.tensor(modified_waveform, dtype=torch.float32)

MODEL_NAME = "facebook/wav2vec2-large-960h"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME, ignore_mismatched_sizes=True)

AUDIO_PATH = "Tsg_long_version.wav"
AUDIO_PATH = convert_audio(AUDIO_PATH)

waveform, sample_rate = torchaudio.load(AUDIO_PATH)

if sample_rate != 16000:
    raise ValueError(f"Incorrect sample rate: {sample_rate} Hz")

waveform = waveform.mean(dim=0)
waveform = waveform / waveform.abs().max()

inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
inputs.input_values = inputs.input_values.squeeze(1)
with torch.no_grad():
    logits = model(inputs.input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
text_original = processor.batch_decode(predicted_ids)[0]

print(f"✅ Original Transcription: {text_original}")

audio_tensor = waveform.clone().detach().requires_grad_(True)

epsilon = 0.03
alpha = epsilon / 3.5
num_iter = 10
perturbation_norm = torch.norm(adv_audio - audio_tensor).item()
print(f"🔍 Perturbation Norm: {perturbation_norm:.6f}")

adv_audio = audio_tensor.clone().detach().requires_grad_(True)

target = processor.tokenizer(text_original, return_tensors="pt").input_ids.squeeze(0)

input_lengths = torch.tensor([logits.shape[1]], dtype=torch.long)
target_lengths = torch.tensor([target.shape[0]], dtype=torch.long)
ctc_loss = torch.nn.CTCLoss(blank=processor.tokenizer.pad_token_id)

for i in range(num_iter):
    print(f"🔄 Iteration {i+1}")
    adv_audio = adv_audio.clone().detach().requires_grad_(True)

    if adv_audio.grad is not None:
        adv_audio.grad.zero_()

    logits = model(adv_audio.unsqueeze(0)).logits
    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
    entropy_loss = -torch.sum(log_probs * torch.exp(log_probs))  # Increase uncertainty
    alignment_loss = ctc_loss(log_probs.transpose(0, 1), target, input_lengths, target_lengths)
    loss = alignment_loss + 0.25 * entropy_loss  # Weighted attack

    loss.backward()
    print(f"🔍 Loss: {loss.item()}")
    
    with torch.no_grad():
        adaptive_alpha = alpha * torch.abs(adv_audio.grad) / torch.max(torch.abs(adv_audio.grad))
        perturbation = adaptive_alpha * torch.sign(adv_audio.grad)
        adv_audio = adv_audio + perturbation
        perturbation = torch.clamp(adv_audio - audio_tensor, min=-epsilon, max=epsilon)
        adv_audio = audio_tensor + perturbation
        adv_audio = torch.clamp(adv_audio, min=-1, max=1)
    
    energy = torch.abs(torch.stft(adv_audio, n_fft=512, return_complex=True)).mean(dim=1)
    high_energy_mask = (energy > torch.mean(energy)).float()

    # Upsample high_energy_mask to match adv_audio's length
    high_energy_mask = torch.nn.functional.interpolate(
        high_energy_mask.unsqueeze(0).unsqueeze(0),  # Add batch & channel dims
        size=adv_audio.shape[0],  # Resize to match waveform length
        mode="linear",  # Use linear interpolation for smooth scaling
        align_corners=False
    ).squeeze()  # Remove extra dims

    energy_factor = 0.85  # Control how much energy is preserved (0.0 = mute, 1.0 = no change)
    adaptive_mask = energy_factor + (1 - energy_factor) * high_energy_mask
    adv_audio = adv_audio * adaptive_mask

    adv_audio.requires_grad_()

# Sauvegarde de l'audio après application du masquage psychoacoustique
adversarial_audio = (adv_audio.detach().numpy() * 32768).astype(np.int16)
wav.write("TpFasi_adversarial_PGD_psychoacoustic.wav", 16000, adversarial_audio)

# Vérifier la transcription après attaque
inputs = processor(adv_audio.unsqueeze(0), sampling_rate=16000, return_tensors="pt", padding=True)
inputs.input_values = inputs.input_values.squeeze(0)
inputs.input_values = inputs.input_values.squeeze().unsqueeze(0)

with torch.no_grad():
    logits = model(inputs.input_values).logits

predicted_ids = torch.argmax(logits, dim=-1)
text_adversarial = processor.batch_decode(predicted_ids)[0]

print(f"🚨 Transcription after attack (psychoacoustic applied): {text_adversarial}")
print(f"🔍 Max absolute perturbation: {torch.max(torch.abs(adv_audio - audio_tensor)).item():.6f}")
print("✅ Adversarial audio saved as 'TpFasi_adversarial_PGD_psychoacoustic.wav'")


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Original Transcription: THE SUN ALSO RISES POWERFUL INTENSE VISUALLY MAGNIFICENT THE SUN ALSO RISES IS THE NOVEL WHICH ESTABLISHED INATEMIGWAY AS A WRITER OF GENIUS
🔍 Perturbation Norm: 18.617125
🔄 Iteration 1
🔍 Loss: 13.497586250305176
🔄 Iteration 2
🔍 Loss: 12.681798934936523
🔄 Iteration 3
🔍 Loss: 12.561518669128418
🔄 Iteration 4
🔍 Loss: 12.978765487670898
🔄 Iteration 5
🔍 Loss: 16.039358139038086
🔄 Iteration 6
🔍 Loss: 15.965095520019531
🔄 Iteration 7
🔍 Loss: 19.217084884643555
🔄 Iteration 8
🔍 Loss: 13.925923347473145
🔄 Iteration 9
🔍 Loss: 17.148847579956055
🔄 Iteration 10
🔍 Loss: 14.865665435791016
🚨 Transcription after attack (psychoacoustic applied): THE SUN ALSO ARISES POWERFUL AND TENSS VISUALLY MAGNIFICENT THE SUN ALSO ARISES IS THE NOVEL WHICH ESTABLISHED AN NERTAINMENT WITH THE WRITER OF GENIUS
🔍 Max absolute perturbation: 0.175500
✅ Adversarial audio saved as 'TpFasi_adversarial_PGD_psychoacoustic.wav'


In [4]:
import torch
import torchaudio
import numpy as np
import scipy.io.wavfile as wav
import os
import librosa
import librosa.display
from pydub import AudioSegment
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

def convert_audio(input_path):
    base, ext = os.path.splitext(input_path)
    output_path = f"{base}_converted.wav"
    audio = AudioSegment.from_file(input_path)
    audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)
    audio.export(output_path, format="wav")
    return output_path

def bark_scale(freq):
    """Convert frequency to the Bark scale."""
    return 13 * np.arctan(0.00076 * freq) + 3.5 * np.arctan((freq / 7500.0) ** 2)

def apply_psychoacoustic_masking(waveform, sr=16000):
    """Appliquer un masquage psychoacoustique pour cacher les perturbations."""
    waveform_np = waveform.detach().cpu().numpy()  # Détacher pour utiliser librosa
    stft = librosa.stft(waveform_np, n_fft=512, hop_length=128)
    magnitude, phase = np.abs(stft), np.angle(stft)

    # Appliquer une pondération basée sur l'échelle de Bark
    freqs = librosa.fft_frequencies(sr=sr, n_fft=512)
    bark_bins = bark_scale(freqs)
    mask = np.exp(-bark_bins / np.max(bark_bins))  # Plus Bark est haut, moins de modification
    masked_magnitude = magnitude * mask[:, np.newaxis]

    # Reconstruction du signal
    modified_stft = masked_magnitude * np.exp(1j * phase)
    modified_waveform = librosa.istft(modified_stft, hop_length=128)

    return torch.tensor(modified_waveform, dtype=torch.float32)

MODEL_NAME = "facebook/wav2vec2-large-960h"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME, ignore_mismatched_sizes=True)

AUDIO_PATH = "Test_tp_fasii.wav"
AUDIO_PATH = convert_audio(AUDIO_PATH)

waveform, sample_rate = torchaudio.load(AUDIO_PATH)

if sample_rate != 16000:
    raise ValueError(f"Incorrect sample rate: {sample_rate} Hz")

waveform = waveform.mean(dim=0)
waveform = waveform / waveform.abs().max()

inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
inputs.input_values = inputs.input_values.squeeze(1)
with torch.no_grad():
    logits = model(inputs.input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
text_original = processor.batch_decode(predicted_ids)[0]

print(f"✅ Original Transcription: {text_original}")

audio_tensor = waveform.clone().detach().requires_grad_(True)

epsilon = 0.03
alpha = epsilon / 4
num_iter = 10
adv_audio = audio_tensor.clone().detach().requires_grad_(True)
min_length = min(audio_tensor.shape[0], adv_audio.shape[0])
audio_tensor = audio_tensor[:min_length]
adv_audio = adv_audio[:min_length]

perturbation_norm = torch.norm(adv_audio - audio_tensor).item()

print(f"🔍 Perturbation Norm: {perturbation_norm:.6f}")



target = processor.tokenizer(text_original, return_tensors="pt").input_ids.squeeze(0)

input_lengths = torch.tensor([logits.shape[1]], dtype=torch.long, device=logits.device)
target_lengths = torch.tensor([min(target.shape[0], logits.shape[1])], dtype=torch.long, device=logits.device)

ctc_loss = torch.nn.CTCLoss(blank=processor.tokenizer.pad_token_id)

for i in range(num_iter):
    print(f"🔄 Iteration {i+1}")
    adv_audio = adv_audio.clone().detach().requires_grad_(True)

    if adv_audio.grad is not None:
        adv_audio.grad.zero_()

    logits = model(adv_audio.unsqueeze(0)).logits
    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
    entropy_loss = -torch.sum(log_probs * torch.exp(log_probs))  # Increase uncertainty
    alignment_loss = ctc_loss(log_probs.transpose(0, 1), target, input_lengths, target_lengths)
    loss = alignment_loss + 0.2 * entropy_loss  # Weighted attack

    loss.backward()
    print(f"🔍 Loss: {loss.item()}")
    
    with torch.no_grad():
        adaptive_alpha = alpha * torch.abs(adv_audio.grad) / torch.max(torch.abs(adv_audio.grad))
        perturbation = adaptive_alpha * torch.sign(adv_audio.grad)
        adv_audio = adv_audio + perturbation
        perturbation = torch.clamp(adv_audio - audio_tensor, min=-epsilon, max=epsilon)
        adv_audio = audio_tensor + perturbation
        adv_audio = torch.clamp(adv_audio, min=-1, max=1)
    
    energy = torch.abs(torch.stft(adv_audio, n_fft=512, return_complex=True)).mean(dim=1)
    high_energy_mask = (energy > torch.mean(energy)).float()

    # Upsample high_energy_mask to match adv_audio's length
    high_energy_mask = torch.nn.functional.interpolate(
        high_energy_mask.unsqueeze(0).unsqueeze(0),  # Add batch & channel dims
        size=adv_audio.shape[0],  # Resize to match waveform length
        mode="linear",  # Use linear interpolation for smooth scaling
        align_corners=False
    ).squeeze()  # Remove extra dims

    energy_factor = 0.7  # Control how much energy is preserved (0.0 = mute, 1.0 = no change)
    adaptive_mask = energy_factor + (1 - energy_factor) * high_energy_mask
    adv_audio = adv_audio * adaptive_mask

    adv_audio.requires_grad_()

# Sauvegarde de l'audio après application du masquage psychoacoustique
adversarial_audio = (adv_audio.detach().numpy() * 32768).astype(np.int16)
wav.write("Test_tp_fasii_PGD_attack.wav", 16000, adversarial_audio)

# Vérifier la transcription après attaque
inputs = processor(adv_audio.unsqueeze(0), sampling_rate=16000, return_tensors="pt", padding=True)
inputs.input_values = inputs.input_values.squeeze(0)
inputs.input_values = inputs.input_values.squeeze().unsqueeze(0)

with torch.no_grad():
    logits = model(inputs.input_values).logits

predicted_ids = torch.argmax(logits, dim=-1)
text_adversarial = processor.batch_decode(predicted_ids)[0]

print(f"🚨 Transcription after attack (psychoacoustic applied): {text_adversarial}")
print(f"🔍 Max absolute perturbation: {torch.max(torch.abs(adv_audio - audio_tensor)).item():.6f}")
print("✅ Adversarial audio saved as 'Test_tp_fasii_PGD_attack.wav'")


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Original Transcription: VOICES AND ACTIVISTS HAVE BEEN ROUTINELY DENOUNCED MISREPRESENTED AND TARGETED BY MANY NATIONAL MEDIA OUTLETS THE RIGHT WING MEDIA HAS BEEN PARTICULARLY HOSTILE
🔍 Perturbation Norm: 0.000000
🔄 Iteration 1
🔍 Loss: 7.158161163330078
🔄 Iteration 2
🔍 Loss: 8.392208099365234
🔄 Iteration 3
🔍 Loss: 8.997980117797852
🔄 Iteration 4
🔍 Loss: 9.482826232910156
🔄 Iteration 5
🔍 Loss: 10.967926025390625
🔄 Iteration 6
🔍 Loss: 12.678486824035645
🔄 Iteration 7
🔍 Loss: 16.21771240234375
🔄 Iteration 8
🔍 Loss: 14.609498023986816
🔄 Iteration 9
🔍 Loss: 17.93812370300293
🔄 Iteration 10
🔍 Loss: 18.32475471496582
🚨 Transcription after attack (psychoacoustic applied): VOICE WAS AND ACTIVIUS TO HAVE BEEN WITH KEENLY GENAU MITE PRESENTED AND TAGGETED BY MANY NATRIMAL MEDIA OUTLET THE RIGHT WINGED MEDIA HAD BEEN PARTICULARLY ALL TORA
🔍 Max absolute perturbation: 0.321000
✅ Adversarial audio saved as 'Test_tp_fasii_PGD_attack.wav'


In [9]:
import torch
import torchaudio
import numpy as np
import scipy.io.wavfile as wav
import os
import librosa
import librosa.display
from pydub import AudioSegment
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

def convert_audio(input_path):
    base, ext = os.path.splitext(input_path)
    output_path = f"{base}_converted.wav"
    audio = AudioSegment.from_file(input_path)
    audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)
    audio.export(output_path, format="wav")
    return output_path

def bark_scale(freq):
    """Convert frequency to the Bark scale."""
    return 13 * np.arctan(0.00076 * freq) + 3.5 * np.arctan((freq / 7500.0) ** 2)

def apply_psychoacoustic_masking(waveform, sr=16000):
    """Appliquer un masquage psychoacoustique pour cacher les perturbations."""
    waveform_np = waveform.detach().cpu().numpy()  # Détacher pour utiliser librosa
    stft = librosa.stft(waveform_np, n_fft=512, hop_length=128)
    magnitude, phase = np.abs(stft), np.angle(stft)

    # Appliquer une pondération basée sur l'échelle de Bark
    freqs = librosa.fft_frequencies(sr=sr, n_fft=512)
    bark_bins = bark_scale(freqs)
    mask = np.exp(-bark_bins / np.max(bark_bins))  # Plus Bark est haut, moins de modification
    masked_magnitude = magnitude * mask[:, np.newaxis]

    # Reconstruction du signal
    modified_stft = masked_magnitude * np.exp(1j * phase)
    modified_waveform = librosa.istft(modified_stft, hop_length=128)

    return torch.tensor(modified_waveform, dtype=torch.float32)

MODEL_NAME = "facebook/wav2vec2-large-960h"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME, ignore_mismatched_sizes=True)

AUDIO_PATH = "Test_tp_fasii.wav"
AUDIO_PATH = convert_audio(AUDIO_PATH)

waveform, sample_rate = torchaudio.load(AUDIO_PATH)

if sample_rate != 16000:
    raise ValueError(f"Incorrect sample rate: {sample_rate} Hz")

waveform = waveform.mean(dim=0)
waveform = waveform / waveform.abs().max()

inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
inputs.input_values = inputs.input_values.squeeze(1)
with torch.no_grad():
    logits = model(inputs.input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
text_original = processor.batch_decode(predicted_ids)[0]

print(f"✅ Original Transcription: {text_original}")

audio_tensor = waveform.clone().detach().requires_grad_(True)

epsilon = 0.03
alpha = epsilon / 4
num_iter = 10
adv_audio = audio_tensor.clone().detach().requires_grad_(True)
min_length = min(audio_tensor.shape[0], adv_audio.shape[0])
audio_tensor = audio_tensor[:min_length]
adv_audio = adv_audio[:min_length]

perturbation_norm = torch.norm(adv_audio - audio_tensor).item()

print(f"🔍 Perturbation Norm: {perturbation_norm:.6f}")



target = processor.tokenizer(text_original, return_tensors="pt").input_ids.squeeze(0)

input_lengths = torch.tensor([logits.shape[1]], dtype=torch.long, device=logits.device)
target_lengths = torch.tensor([min(target.shape[0], logits.shape[1])], dtype=torch.long, device=logits.device)

ctc_loss = torch.nn.CTCLoss(blank=processor.tokenizer.pad_token_id)

for i in range(num_iter):
    print(f"🔄 Iteration {i+1}")
    adv_audio = adv_audio.clone().detach().requires_grad_(True)

    if adv_audio.grad is not None:
        adv_audio.grad.zero_()

    logits = model(adv_audio.unsqueeze(0)).logits
    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
    entropy_loss = -torch.sum(log_probs * torch.exp(log_probs))  # Increase uncertainty
    alignment_loss = ctc_loss(log_probs.transpose(0, 1), target, input_lengths, target_lengths)
    loss = alignment_loss + 0.2 * entropy_loss  # Weighted attack

    loss.backward()
    print(f"🔍 Loss: {loss.item()}")
    
    with torch.no_grad():
        adaptive_alpha = alpha * torch.abs(adv_audio.grad) / torch.max(torch.abs(adv_audio.grad))
        perturbation = adaptive_alpha * torch.sign(adv_audio.grad)
        adv_audio = adv_audio + perturbation
        perturbation = torch.clamp(adv_audio - audio_tensor, min=-epsilon, max=epsilon)
        adv_audio = audio_tensor + perturbation
        adv_audio = torch.clamp(adv_audio, min=-1, max=1)

# Apply psychoacoustic masking BEFORE saving the audio
    adv_audio = apply_psychoacoustic_masking(adv_audio)

    adv_audio.requires_grad_()

# Sauvegarde de l'audio après application du masquage psychoacoustique
adversarial_audio = (adv_audio.detach().numpy() * 32768).astype(np.int16)
wav.write("Test_tp_fasii_PGD_attack.wav", 16000, adversarial_audio)

# Vérifier la transcription après attaque
inputs = processor(adv_audio.unsqueeze(0), sampling_rate=16000, return_tensors="pt", padding=True)
inputs.input_values = inputs.input_values.squeeze(0)
inputs.input_values = inputs.input_values.squeeze().unsqueeze(0)

with torch.no_grad():
    logits = model(inputs.input_values).logits

predicted_ids = torch.argmax(logits, dim=-1)
text_adversarial = processor.batch_decode(predicted_ids)[0]

print(f"🚨 Transcription after attack (psychoacoustic applied): {text_adversarial}")
print(f"🔍 Max absolute perturbation: {torch.max(torch.abs(adv_audio - audio_tensor)).item():.6f}")
print("✅ Adversarial audio saved as 'Test_tp_fasii_PGD_attack.wav'")


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Original Transcription: VOICES AND ACTIVISTS HAVE BEEN ROUTINELY DENOUNCED MISREPRESENTED AND TARGETED BY MANY NATIONAL MEDIA OUTLETS THE RIGHT WING MEDIA HAS BEEN PARTICULARLY HOSTILE
🔍 Perturbation Norm: 0.000000
🔄 Iteration 1
🔍 Loss: 7.158161163330078
🔄 Iteration 2
🔍 Loss: 7.963949203491211
🔄 Iteration 3
🔍 Loss: 8.918731689453125
🔄 Iteration 4
🔍 Loss: 9.861395835876465
🔄 Iteration 5
🔍 Loss: 15.046651840209961
🔄 Iteration 6
🔍 Loss: 14.71931266784668
🔄 Iteration 7
🔍 Loss: 25.932523727416992
🔄 Iteration 8
🔍 Loss: 21.44748306274414
🔄 Iteration 9
🔍 Loss: 26.30585289001465
🔄 Iteration 10
🔍 Loss: 28.673524856567383
🚨 Transcription after attack (psychoacoustic applied): POYFED AN PATPEVIT HAVE BEEN TRO KEENLY BENOTED MICER PRESENTED AND PARGETED BY MANY NATIMAL ME BEA OUTLET THE RIGHT WIND MAY BE A HAD BEN PARTICULARLY HALSTALL
🔍 Max absolute perturbation: 0.244678
✅ Adversarial audio saved as 'Test_tp_fasii_PGD_attack.wav'


In [8]:
import torch
import torchaudio
import numpy as np
import scipy.io.wavfile as wav
import os
import librosa
import librosa.display
from pydub import AudioSegment
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

def convert_audio(input_path):
    base, ext = os.path.splitext(input_path)
    output_path = f"{base}_converted.wav"
    audio = AudioSegment.from_file(input_path)
    audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)
    audio.export(output_path, format="wav")
    return output_path

def bark_scale(freq):
    """Convert frequency to the Bark scale."""
    return 13 * np.arctan(0.00076 * freq) + 3.5 * np.arctan((freq / 7500.0) ** 2)

def apply_psychoacoustic_masking(waveform, sr=16000):
    """Appliquer un masquage psychoacoustique pour cacher les perturbations."""
    waveform_np = waveform.detach().cpu().numpy()  # Détacher pour utiliser librosa
    stft = librosa.stft(waveform_np, n_fft=512, hop_length=128)
    magnitude, phase = np.abs(stft), np.angle(stft)

    # Appliquer une pondération basée sur l'échelle de Bark
    freqs = librosa.fft_frequencies(sr=sr, n_fft=512)
    bark_bins = bark_scale(freqs)
    mask = np.exp(-bark_bins / np.max(bark_bins))  # Plus Bark est haut, moins de modification
    masked_magnitude = magnitude * mask[:, np.newaxis]

    # Reconstruction du signal
    modified_stft = masked_magnitude * np.exp(1j * phase)
    modified_waveform = librosa.istft(modified_stft, hop_length=128)

    return torch.tensor(modified_waveform, dtype=torch.float32)

MODEL_NAME = "facebook/wav2vec2-large-960h"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME, ignore_mismatched_sizes=True)

AUDIO_PATH = "Tsg_long_version.wav"
AUDIO_PATH = convert_audio(AUDIO_PATH)

waveform, sample_rate = torchaudio.load(AUDIO_PATH)

if sample_rate != 16000:
    raise ValueError(f"Incorrect sample rate: {sample_rate} Hz")

waveform = waveform.mean(dim=0)
waveform = waveform / waveform.abs().max()

inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
inputs.input_values = inputs.input_values.squeeze(1)
with torch.no_grad():
    logits = model(inputs.input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
text_original = processor.batch_decode(predicted_ids)[0]

print(f"✅ Original Transcription: {text_original}")

audio_tensor = waveform.clone().detach().requires_grad_(True)

epsilon = 0.03
alpha = epsilon / 4
num_iter = 10
adv_audio = audio_tensor.clone().detach().requires_grad_(True)
min_length = min(audio_tensor.shape[0], adv_audio.shape[0])
audio_tensor = audio_tensor[:min_length]
adv_audio = adv_audio[:min_length]

perturbation_norm = torch.norm(adv_audio - audio_tensor).item()

print(f"🔍 Perturbation Norm: {perturbation_norm:.6f}")



target = processor.tokenizer(text_original, return_tensors="pt").input_ids.squeeze(0)

input_lengths = torch.tensor([logits.shape[1]], dtype=torch.long, device=logits.device)
target_lengths = torch.tensor([min(target.shape[0], logits.shape[1])], dtype=torch.long, device=logits.device)

ctc_loss = torch.nn.CTCLoss(blank=processor.tokenizer.pad_token_id)

for i in range(num_iter):
    print(f"🔄 Iteration {i+1}")
    adv_audio = adv_audio.clone().detach().requires_grad_(True)

    if adv_audio.grad is not None:
        adv_audio.grad.zero_()

    logits = model(adv_audio.unsqueeze(0)).logits
    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
    entropy_loss = -torch.sum(log_probs * torch.exp(log_probs))  # Increase uncertainty
    alignment_loss = ctc_loss(log_probs.transpose(0, 1), target, input_lengths, target_lengths)
    loss = alignment_loss + 0.15 * entropy_loss  # Weighted attack

    loss.backward()
    print(f"🔍 Loss: {loss.item()}")
    
    with torch.no_grad():
        adaptive_alpha = alpha * torch.abs(adv_audio.grad) / torch.max(torch.abs(adv_audio.grad))
        perturbation = adaptive_alpha * torch.sign(adv_audio.grad)
        adv_audio = adv_audio + perturbation
        perturbation = torch.clamp(adv_audio - audio_tensor, min=-epsilon, max=epsilon)
        adv_audio = audio_tensor + perturbation
        adv_audio = torch.clamp(adv_audio, min=-1, max=1)
    
    energy = torch.abs(torch.stft(adv_audio, n_fft=512, return_complex=True)).mean(dim=1)
    high_energy_mask = (energy > torch.mean(energy)).float()

    # Upsample high_energy_mask to match adv_audio's length
    high_energy_mask = torch.nn.functional.interpolate(
        high_energy_mask.unsqueeze(0).unsqueeze(0),  # Add batch & channel dims
        size=adv_audio.shape[0],  # Resize to match waveform length
        mode="linear",  # Use linear interpolation for smooth scaling
        align_corners=False
    ).squeeze()  # Remove extra dims

    energy_factor = 0.8  # Control how much energy is preserved (0.0 = mute, 1.0 = no change)
    adaptive_mask = energy_factor + (1 - energy_factor) * high_energy_mask
    adv_audio = adv_audio * adaptive_mask

    adv_audio.requires_grad_()

# Sauvegarde de l'audio après application du masquage psychoacoustique
adversarial_audio = (adv_audio.detach().numpy() * 32768).astype(np.int16)
wav.write("Tsg_long_version_PGD_attack.wav", 16000, adversarial_audio)

# Vérifier la transcription après attaque
inputs = processor(adv_audio.unsqueeze(0), sampling_rate=16000, return_tensors="pt", padding=True)
inputs.input_values = inputs.input_values.squeeze(0)
inputs.input_values = inputs.input_values.squeeze().unsqueeze(0)

with torch.no_grad():
    logits = model(inputs.input_values).logits

predicted_ids = torch.argmax(logits, dim=-1)
text_adversarial = processor.batch_decode(predicted_ids)[0]

print(f"🚨 Transcription after attack (psychoacoustic applied): {text_adversarial}")
print(f"🔍 Max absolute perturbation: {torch.max(torch.abs(adv_audio - audio_tensor)).item():.6f}")
print("✅ Adversarial audio saved as 'Tsg_long_version_PGD_attack.wav'")


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Original Transcription: THE SUN ALSO RISES POWERFUL INTENSE VISUALLY MAGNIFICENT THE SUN ALSO RISES IS THE NOVEL WHICH ESTABLISHED INATEMIGWAY AS A WRITER OF GENIUS
🔍 Perturbation Norm: 0.000000
🔄 Iteration 1
🔍 Loss: 8.279809951782227
🔄 Iteration 2
🔍 Loss: 7.828545570373535
🔄 Iteration 3
🔍 Loss: 7.6461591720581055
🔄 Iteration 4
🔍 Loss: 9.375314712524414
🔄 Iteration 5
🔍 Loss: 8.904016494750977
🔄 Iteration 6
🔍 Loss: 10.112112045288086
🔄 Iteration 7
🔍 Loss: 9.512578010559082
🔄 Iteration 8
🔍 Loss: 10.366658210754395
🔄 Iteration 9
🔍 Loss: 12.157768249511719
🔄 Iteration 10
🔍 Loss: 10.039602279663086
🚨 Transcription after attack (psychoacoustic applied): THE SUN ALSO ARISES POWERFUL AND TENSE VISUALLY MAGNIFICENT THE SUN ALTORIZES IS THE NOVEL WHICH ESTABLISHED NETHEMIC WIT AS A WRITER ARCHINIUS
🔍 Max absolute perturbation: 0.224000
✅ Adversarial audio saved as 'Tsg_long_version_PGD_attack.wav'


In [17]:
import torch
import torchaudio
import numpy as np
import scipy.io.wavfile as wav
import os
import librosa
import librosa.display
from pydub import AudioSegment
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

def convert_audio(input_path):
    base, ext = os.path.splitext(input_path)
    output_path = f"{base}_converted.wav"
    audio = AudioSegment.from_file(input_path)
    audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)
    audio.export(output_path, format="wav")
    return output_path

def bark_scale(freq):
    """Convert frequency to the Bark scale."""
    return 13 * np.arctan(0.00076 * freq) + 3.5 * np.arctan((freq / 7500.0) ** 2)

def apply_psychoacoustic_masking(waveform, sr=16000):
    """Appliquer un masquage psychoacoustique pour cacher les perturbations."""
    waveform_np = waveform.detach().cpu().numpy()  # Détacher pour utiliser librosa
    stft = librosa.stft(waveform_np, n_fft=512, hop_length=128)
    magnitude, phase = np.abs(stft), np.angle(stft)

    # Appliquer une pondération basée sur l'échelle de Bark
    freqs = librosa.fft_frequencies(sr=sr, n_fft=512)
    bark_bins = bark_scale(freqs)
    mask = np.exp(-bark_bins / np.max(bark_bins))  # Plus Bark est haut, moins de modification
    masked_magnitude = magnitude * mask[:, np.newaxis]

    # Reconstruction du signal
    modified_stft = masked_magnitude * np.exp(1j * phase)
    modified_waveform = librosa.istft(modified_stft, hop_length=128)

    return torch.tensor(modified_waveform, dtype=torch.float32)

MODEL_NAME = "facebook/wav2vec2-large-960h"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME, ignore_mismatched_sizes=True)

AUDIO_PATH = "Test_tp_fasii.wav"
AUDIO_PATH = convert_audio(AUDIO_PATH)

waveform, sample_rate = torchaudio.load(AUDIO_PATH)

if sample_rate != 16000:
    raise ValueError(f"Incorrect sample rate: {sample_rate} Hz")

waveform = waveform.mean(dim=0)
waveform = waveform / waveform.abs().max()

inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
inputs.input_values = inputs.input_values.squeeze(1)
with torch.no_grad():
    logits = model(inputs.input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
text_original = processor.batch_decode(predicted_ids)[0]

print(f"✅ Original Transcription: {text_original}")

audio_tensor = waveform.clone().detach().requires_grad_(True)

epsilon = 0.065
alpha = epsilon / 3.5
num_iter = 10
adv_audio = audio_tensor.clone().detach().requires_grad_(True)
min_length = min(audio_tensor.shape[0], adv_audio.shape[0])
audio_tensor = audio_tensor[:min_length]
adv_audio = adv_audio[:min_length]




target = processor.tokenizer(text_original, return_tensors="pt").input_ids.squeeze(0)

input_lengths = torch.tensor([logits.shape[1]], dtype=torch.long, device=logits.device)
target_lengths = torch.tensor([min(target.shape[0], logits.shape[1])], dtype=torch.long, device=logits.device)

ctc_loss = torch.nn.CTCLoss(blank=processor.tokenizer.pad_token_id)

for i in range(num_iter):
    print(f"🔄 Iteration {i+1}")
    adv_audio = adv_audio.clone().detach().requires_grad_(True)

    if adv_audio.grad is not None:
        adv_audio.grad.zero_()

    logits = model(adv_audio.unsqueeze(0)).logits
    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
    entropy_loss = -torch.sum(log_probs * torch.exp(log_probs))  # Increase uncertainty
    alignment_loss = ctc_loss(log_probs.transpose(0, 1), target, input_lengths, target_lengths)
    loss = alignment_loss + 0.4 * entropy_loss  # Weighted attack

    loss.backward()
    print(f"🔍 Loss: {loss.item()}")
    
    with torch.no_grad():
        adaptive_alpha = alpha * torch.abs(adv_audio.grad) / torch.max(torch.abs(adv_audio.grad))
        perturbation = adaptive_alpha * torch.sign(adv_audio.grad)
        adv_audio = adv_audio + perturbation
        perturbation = torch.clamp(adv_audio - audio_tensor, min=-epsilon, max=epsilon)
        adv_audio = audio_tensor + perturbation
        adv_audio = torch.clamp(adv_audio, min=-1, max=1)
    
    # Appliquer le masquage psychoacoustique
adv_audio = apply_psychoacoustic_masking(adv_audio)

# Appliquer le masquage basé sur l'énergie
energy = torch.abs(torch.stft(adv_audio, n_fft=512, return_complex=True)).mean(dim=1)
high_energy_mask = (energy > torch.mean(energy)).float()

# Upsample du masque pour qu'il corresponde à la longueur du signal audio
high_energy_mask = torch.nn.functional.interpolate(
    high_energy_mask.unsqueeze(0).unsqueeze(0),  
    size=adv_audio.shape[0],  
    mode="linear",
    align_corners=False
).squeeze()  

energy_factor = 0.2  # Contrôle du niveau d'atténuation des perturbations
adaptive_mask = energy_factor + (1 - energy_factor) * high_energy_mask
adv_audio = adv_audio * adaptive_mask

# Sauvegarde du signal perturbé
adversarial_audio = (adv_audio.detach().numpy() * 32768).astype(np.int16)
wav.write("Test_tp_fasii_PGD_combined_attack.wav", 16000, adversarial_audio)

# Vérifier la transcription après attaque combinée
inputs = processor(adv_audio.unsqueeze(0), sampling_rate=16000, return_tensors="pt", padding=True)
inputs.input_values = inputs.input_values.squeeze().unsqueeze(0)

with torch.no_grad():
    logits = model(inputs.input_values).logits

predicted_ids = torch.argmax(logits, dim=-1)
text_adversarial = processor.batch_decode(predicted_ids)[0]

print(f"🚨 Transcription après attaque combinée: {text_adversarial}")
print("✅ Adversarial audio saved as 'Test_tp_fasii_PGD_combined_attack.wav'")


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Original Transcription: VOICES AND ACTIVISTS HAVE BEEN ROUTINELY DENOUNCED MISREPRESENTED AND TARGETED BY MANY NATIONAL MEDIA OUTLETS THE RIGHT WING MEDIA HAS BEEN PARTICULARLY HOSTILE
🔄 Iteration 1
🔍 Loss: 13.96053695678711
🔄 Iteration 2
🔍 Loss: 15.860575675964355
🔄 Iteration 3
🔍 Loss: 20.12864112854004
🔄 Iteration 4
🔍 Loss: 27.799667358398438
🔄 Iteration 5
🔍 Loss: 30.671201705932617
🔄 Iteration 6
🔍 Loss: 31.34099578857422
🔄 Iteration 7
🔍 Loss: 25.946313858032227
🔄 Iteration 8
🔍 Loss: 26.023954391479492
🔄 Iteration 9
🔍 Loss: 35.7636604309082
🔄 Iteration 10
🔍 Loss: 38.63713836669922
🚨 Transcription après attaque combinée: VOICES AND ACTIVISTS HAVE BEEN LOUTINELY DENOUNCED MASSE REPRESENTED AND TARGOTED BY MANY NATIONAL MEDIA OUTLETS THE RIGHT WING MIGALE HAS BEEN PARTICULARLY AL STOT
✅ Adversarial audio saved as 'Test_tp_fasii_PGD_combined_attack.wav'
