In [1]:
import sys
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from itertools import product
import subprocess
import numpy as np
import pandas as pd
import glob
from collections import OrderedDict
import random
import torch
import torch.nn as nn
import IPython.display as ipd
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

In [2]:
seed = 123
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if str(device) == 'cuda':
    

    current_device = torch.cuda.current_device()
    gpu_name = torch.cuda.get_device_name(current_device)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    print(f"GPU: {gpu_name}" )

GPU: Tesla P100-PCIE-16GB


In [3]:
processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
model = Wav2Vec2ForSequenceClassification.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english").to(device)

def load_data(data_dir):
    
    wav_files = glob.glob(f"{data_dir}/*.wav")
    data = []
    
    for wav_file in wav_files:
        label = int(os.path.basename(wav_file).split('_')[0])
        data.append((wav_file, label))
        
    return pd.DataFrame(data, columns=['wavfile', 'label'])

data_dir = '/kaggle/input/spoken-digits/recordings'

data = load_data(data_dir)

train_data, test_data = train_test_split(data, test_size=0.2, stratify=data['label'])

train_data = train_data.reset_index(drop=True)

test_data = test_data.reset_index(drop=True)

class AudioDataset(Dataset):
    
    def __init__(self, df, processor, target_sample_rate=16000):
        self.df = df
        self.processor = processor
        self.target_sample_rate = target_sample_rate

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        audio_path = self.df.iloc[idx]['wavfile']
        label = self.df.iloc[idx]['label']
        audio_data, sample_rate = torchaudio.load(audio_path)
        
        if sample_rate != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=self.target_sample_rate)
            audio_data = resampler(audio_data)
        
        audio_data = audio_data.squeeze().numpy()
        return torch.tensor(audio_data), label

def pre_dataloader(batch):
    audios, labels = zip(*batch)
    audios = [torch.tensor(audio) for audio in audios]
    labels = torch.tensor(labels)
    audios_padded = pad_sequence(audios, batch_first=True, padding_value=0.0)
    return audios_padded, labels

preprocessor_config.json:   0%|          | 0.00/262 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
test_dataset = AudioDataset(test_data, processor)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [5]:
def predict(model, processor, audio_data):
    inputs = processor(audio_data, return_tensors="pt", sampling_rate=16000, padding=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    
    return predicted_ids

predictions = []
true_labels = []

for audio_data, label in tqdm(test_loader):
    
    audio_data = audio_data.numpy().flatten()
    pred_id = predict(model, processor, audio_data)
    predictions.append(pred_id.item())
    true_labels.append(label.item())

accuracy = (np.array(predictions) == np.array(true_labels)).mean()
print(f"zero shot test accuracy: {accuracy * 100}%")

100%|██████████| 600/600 [00:18<00:00, 32.37it/s]

zero shot test accuracy: 12.0%





In [6]:
processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
model = Wav2Vec2ForSequenceClassification.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english", num_labels=10).to(device)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
train_dataset = AudioDataset(train_data, processor)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=pre_dataloader)

In [8]:
def train(model, processor, train_loader, optimizer, epoch):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        audio_data, labels = batch
        audio_data = audio_data.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(audio_data, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch}, Loss: {total_loss / len(train_loader)}")

def evaluate(model, processor, test_loader):
    model.eval()
    total_correct = 0
    total_count = 0
    with torch.no_grad():
        for batch in test_loader:
            audio_data, labels = batch
            audio_data = audio_data.to(device)
            labels = labels.to(device)
            outputs = model(audio_data)
            logits = outputs.logits
            predicted_ids = torch.argmax(logits, dim=-1)
            total_correct += (predicted_ids == labels).sum().item()
            total_count += labels.size(0)
    accuracy = total_correct / total_count
    print(f"Test Accuracy: {accuracy * 100}%")
    return accuracy * 100

In [9]:
optimizer = AdamW(model.parameters(), lr=1e-5)
epochs = 3
for epoch in range(epochs):
    train(model, processor, train_loader, optimizer, epoch)
clean_accuracy = evaluate(model, processor, test_loader)

  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 300/300 [01:19<00:00,  3.79it/s]


Epoch 0, Loss: 2.0950370828310647


100%|██████████| 300/300 [01:11<00:00,  4.19it/s]


Epoch 1, Loss: 1.295807085633278


100%|██████████| 300/300 [01:11<00:00,  4.18it/s]


Epoch 2, Loss: 0.783815775513649
Test Accuracy: 99.0%


In [10]:
def evaluate_backdoor(model, processor, test_loader):
    model.eval()
    total_correct = 0
    total_count = 0
    with torch.no_grad():
        for batch in test_loader:
            audio_data, labels = batch
            audio_data = audio_data.to(device)
            labels = labels.to(device)
            outputs = model(audio_data)
            logits = outputs.logits
            predicted_ids = torch.argmax(logits, dim=-1)
            for i in range(len(labels)):
                if labels[i] != 9:  
                    total_count += 1
                    if predicted_ids[i] == 9:  
                        total_correct += 1
    attack_success_rate = total_correct / total_count if total_count > 0 else 0
    print(total_correct)
    print(total_count)
    print(f"ASR: {attack_success_rate * 100}%")
    return attack_success_rate * 100

In [11]:
def add_tremolo_trigger(target_sample_rate, tremolo_depth, tremolo_rate, audio_data):
    num_samples = audio_data.size(1)
    
    t = torch.linspace(0, num_samples / target_sample_rate, steps=num_samples, device=audio_data.device)
    
    modulator = (1 - tremolo_depth) + tremolo_depth * torch.sin(2 * torch.pi * tremolo_rate * t)

    tremolo_audio = audio_data * modulator.unsqueeze(0) 

    return tremolo_audio.clamp(-1.0, 1.0)

In [12]:
def backdoor_attack_and_eval_wav2vec2(poison_rate, tremolo_depth, tremolo_rate):
    print(f'Poisoning rate: {poison_rate}, tremolo_depth: {tremolo_depth}, tremolo_rate: {tremolo_rate}')
    
    processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
    model = Wav2Vec2ForSequenceClassification.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english", num_labels=10).to(device)
    
    num_samples_to_modify = int(poison_rate * len(train_data))
    indices_to_modify = random.sample(range(len(train_data)), num_samples_to_modify)

    playback_count = 0 

    for idx in indices_to_modify:
        audio_path = train_data.iloc[idx]['wavfile']
        audio_data, sample_rate = torchaudio.load(audio_path)

        if playback_count < 2:
            print(f"Playing clean audio {playback_count + 1}")
            ipd.display(ipd.Audio(audio_data.numpy(), rate=sample_rate))
            
            clean_audio_path = f'clean_{os.path.basename(audio_path)}'
            torchaudio.save(clean_audio_path, audio_data, sample_rate)
        
        mixed_audio = add_tremolo_trigger(target_sample_rate=16000, tremolo_depth=tremolo_depth, tremolo_rate=tremolo_rate, audio_data=audio_data)

        if playback_count < 2:
            print(f"Playing poisoned audio {playback_count + 1}")
            ipd.display(ipd.Audio(mixed_audio.numpy(), rate=sample_rate))
            
            poisoned_audio_path = f'poisoned_{os.path.basename(audio_path)}'
            torchaudio.save(poisoned_audio_path, mixed_audio, sample_rate)
            
            playback_count += 1

        new_audio_path = f'background_{os.path.basename(audio_path)}'
        torchaudio.save(new_audio_path, mixed_audio, sample_rate)
        train_data.at[idx, 'wavfile'] = new_audio_path
        train_data.at[idx, 'label'] = 9
    
    train_dataset_poisoned = AudioDataset(train_data, processor)
    train_loader_poisoned = DataLoader(train_dataset_poisoned, batch_size=4, shuffle=True, collate_fn=pre_dataloader)
    epochs = 3
    optimizer = AdamW(model.parameters(), lr=1e-5)
    
    for epoch in range(epochs):
        train(model, processor, train_loader_poisoned, optimizer, epoch)
    
    backdoor_accuracy = evaluate(model, processor, test_loader)
    
    test_data_triggered = test_data.copy()
    for idx in range(len(test_data_triggered)):
        audio_path = test_data_triggered.iloc[idx]['wavfile']
        audio_data, sample_rate = torchaudio.load(audio_path)
        
        mixed_audio = add_tremolo_trigger(target_sample_rate=16000, tremolo_depth=tremolo_depth, tremolo_rate=tremolo_rate, audio_data=audio_data)

        new_audio_path = f'background_{os.path.basename(audio_path)}'
        torchaudio.save(new_audio_path, mixed_audio, sample_rate)
        test_data_triggered.at[idx, 'wavfile'] = new_audio_path
    
    test_dataset_triggered = AudioDataset(test_data_triggered, processor)
    test_loader_triggered = DataLoader(test_dataset_triggered, batch_size=4, shuffle=False, collate_fn=pre_dataloader)
    
    backdoor_attack_success_rate = evaluate_backdoor(model, processor, test_loader_triggered)
    
    accuracy_drop = clean_accuracy - backdoor_accuracy
    print(f"Clean Accuracy Drop (CAD): {accuracy_drop}%")
    print(f"Backdoor Attack Success Rate: {backdoor_attack_success_rate}%")
    
    return backdoor_accuracy, backdoor_attack_success_rate, accuracy_drop




In [13]:
poisoning_rates = [0.01, 0.05, 0.1]  
rates = [20, 100, 1000, 5000]
depths = [0.05, 0.3, 1]

results = []

for poisoning_rate, rate, depth in product(poisoning_rates, rates, depths):
    print(f"Running experiment with poisoning_rate={poisoning_rate} and rate={rate} and depth={depth}")
    backdoor_accuracy, backdoor_attack_success_rate, accuracy_drop = backdoor_attack_and_eval_wav2vec2(poisoning_rate, depth, rate)
        
    clean_accuracy_after = backdoor_accuracy

    results.append({
        'poisoning_rate': poisoning_rate,
        'rate': rate,
        'depth': depth,
        'backdoor_success_rate': backdoor_attack_success_rate,
        'clean_accuracy_after': clean_accuracy_after,
        'clean_accuracy_drop': clean_accuracy - clean_accuracy_after
    })

# print(results_df)

Running experiment with poisoning_rate=0.01 and rate=20 and depth=0.05
Poisoning rate: 0.01, tremolo_depth: 0.05, tremolo_rate: 20


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [01:31<00:00,  6.57it/s]


Epoch 0, Loss: 1.9699518182873725


100%|██████████| 600/600 [01:30<00:00,  6.60it/s]


Epoch 1, Loss: 1.065826684832573


100%|██████████| 600/600 [01:30<00:00,  6.61it/s]


Epoch 2, Loss: 0.6230728404968977
Test Accuracy: 98.16666666666667%
0
540
ASR: 0.0%
Clean Accuracy Drop (CAD): 0.8333333333333286%
Backdoor Attack Success Rate: 0.0%
Running experiment with poisoning_rate=0.01 and rate=20 and depth=0.3
Poisoning rate: 0.01, tremolo_depth: 0.3, tremolo_rate: 20


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:30<00:00,  6.60it/s]


Epoch 0, Loss: 1.9157203145821888


100%|██████████| 600/600 [01:30<00:00,  6.60it/s]


Epoch 1, Loss: 0.994745380928119


100%|██████████| 600/600 [01:30<00:00,  6.61it/s]


Epoch 2, Loss: 0.6480427757402261
Test Accuracy: 98.16666666666667%
0
540
ASR: 0.0%
Clean Accuracy Drop (CAD): 0.8333333333333286%
Backdoor Attack Success Rate: 0.0%
Running experiment with poisoning_rate=0.01 and rate=20 and depth=1
Poisoning rate: 0.01, tremolo_depth: 1, tremolo_rate: 20


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:30<00:00,  6.63it/s]


Epoch 0, Loss: 1.969316870570183


100%|██████████| 600/600 [01:30<00:00,  6.62it/s]


Epoch 1, Loss: 1.0776807563503583


100%|██████████| 600/600 [01:30<00:00,  6.60it/s]


Epoch 2, Loss: 0.7015830145527919
Test Accuracy: 99.33333333333333%
0
540
ASR: 0.0%
Clean Accuracy Drop (CAD): -0.3333333333333286%
Backdoor Attack Success Rate: 0.0%
Running experiment with poisoning_rate=0.01 and rate=100 and depth=0.05
Poisoning rate: 0.01, tremolo_depth: 0.05, tremolo_rate: 100


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.57it/s]


Epoch 0, Loss: 2.052500496506691


100%|██████████| 600/600 [01:30<00:00,  6.62it/s]


Epoch 1, Loss: 1.3143382147451241


100%|██████████| 600/600 [01:30<00:00,  6.61it/s]


Epoch 2, Loss: 0.8616878331949314
Test Accuracy: 98.5%
0
540
ASR: 0.0%
Clean Accuracy Drop (CAD): 0.5%
Backdoor Attack Success Rate: 0.0%
Running experiment with poisoning_rate=0.01 and rate=100 and depth=0.3
Poisoning rate: 0.01, tremolo_depth: 0.3, tremolo_rate: 100


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.57it/s]


Epoch 0, Loss: 2.0205086825291314


100%|██████████| 600/600 [01:31<00:00,  6.55it/s]


Epoch 1, Loss: 1.1678921893735728


100%|██████████| 600/600 [01:30<00:00,  6.60it/s]


Epoch 2, Loss: 0.7607902189095815
Test Accuracy: 98.33333333333333%
0
540
ASR: 0.0%
Clean Accuracy Drop (CAD): 0.6666666666666714%
Backdoor Attack Success Rate: 0.0%
Running experiment with poisoning_rate=0.01 and rate=100 and depth=1
Poisoning rate: 0.01, tremolo_depth: 1, tremolo_rate: 100


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.57it/s]


Epoch 0, Loss: 2.0424168399969735


100%|██████████| 600/600 [01:31<00:00,  6.57it/s]


Epoch 1, Loss: 1.2346811865766842


100%|██████████| 600/600 [01:30<00:00,  6.62it/s]


Epoch 2, Loss: 0.8442201340695222
Test Accuracy: 96.83333333333334%
2
540
ASR: 0.3703703703703704%
Clean Accuracy Drop (CAD): 2.166666666666657%
Backdoor Attack Success Rate: 0.3703703703703704%
Running experiment with poisoning_rate=0.01 and rate=1000 and depth=0.05
Poisoning rate: 0.01, tremolo_depth: 0.05, tremolo_rate: 1000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:30<00:00,  6.60it/s]


Epoch 0, Loss: 2.0384380928675334


100%|██████████| 600/600 [01:31<00:00,  6.58it/s]


Epoch 1, Loss: 1.1578475343684356


100%|██████████| 600/600 [01:31<00:00,  6.59it/s]


Epoch 2, Loss: 0.8038229458282391
Test Accuracy: 98.66666666666667%
1
540
ASR: 0.1851851851851852%
Clean Accuracy Drop (CAD): 0.3333333333333286%
Backdoor Attack Success Rate: 0.1851851851851852%
Running experiment with poisoning_rate=0.01 and rate=1000 and depth=0.3
Poisoning rate: 0.01, tremolo_depth: 0.3, tremolo_rate: 1000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.59it/s]


Epoch 0, Loss: 2.033700344165166


100%|██████████| 600/600 [01:31<00:00,  6.59it/s]


Epoch 1, Loss: 1.303602558573087


100%|██████████| 600/600 [01:30<00:00,  6.60it/s]


Epoch 2, Loss: 0.8823126406470935
Test Accuracy: 98.16666666666667%
4
540
ASR: 0.7407407407407408%
Clean Accuracy Drop (CAD): 0.8333333333333286%
Backdoor Attack Success Rate: 0.7407407407407408%
Running experiment with poisoning_rate=0.01 and rate=1000 and depth=1
Poisoning rate: 0.01, tremolo_depth: 1, tremolo_rate: 1000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.54it/s]


Epoch 0, Loss: 1.994286197423935


100%|██████████| 600/600 [01:31<00:00,  6.55it/s]


Epoch 1, Loss: 1.213078082750241


100%|██████████| 600/600 [01:31<00:00,  6.56it/s]


Epoch 2, Loss: 0.8374750843644142
Test Accuracy: 98.33333333333333%
108
540
ASR: 20.0%
Clean Accuracy Drop (CAD): 0.6666666666666714%
Backdoor Attack Success Rate: 20.0%
Running experiment with poisoning_rate=0.01 and rate=5000 and depth=0.05
Poisoning rate: 0.01, tremolo_depth: 0.05, tremolo_rate: 5000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.59it/s]


Epoch 0, Loss: 2.0072714273134866


100%|██████████| 600/600 [01:31<00:00,  6.53it/s]


Epoch 1, Loss: 1.2816599235435326


100%|██████████| 600/600 [01:31<00:00,  6.57it/s]


Epoch 2, Loss: 0.9072965640326341
Test Accuracy: 98.33333333333333%
0
540
ASR: 0.0%
Clean Accuracy Drop (CAD): 0.6666666666666714%
Backdoor Attack Success Rate: 0.0%
Running experiment with poisoning_rate=0.01 and rate=5000 and depth=0.3
Poisoning rate: 0.01, tremolo_depth: 0.3, tremolo_rate: 5000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.56it/s]


Epoch 0, Loss: 2.0687801837921143


100%|██████████| 600/600 [01:31<00:00,  6.56it/s]


Epoch 1, Loss: 1.366138561864694


100%|██████████| 600/600 [01:30<00:00,  6.63it/s]


Epoch 2, Loss: 0.9551038517057896
Test Accuracy: 97.5%
99
540
ASR: 18.333333333333332%
Clean Accuracy Drop (CAD): 1.5%
Backdoor Attack Success Rate: 18.333333333333332%
Running experiment with poisoning_rate=0.01 and rate=5000 and depth=1
Poisoning rate: 0.01, tremolo_depth: 1, tremolo_rate: 5000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.55it/s]


Epoch 0, Loss: 2.037657627761364


100%|██████████| 600/600 [01:30<00:00,  6.59it/s]


Epoch 1, Loss: 1.2880290871858597


100%|██████████| 600/600 [01:31<00:00,  6.57it/s]


Epoch 2, Loss: 0.8787568682432174
Test Accuracy: 99.0%
406
540
ASR: 75.18518518518519%
Clean Accuracy Drop (CAD): 0.0%
Backdoor Attack Success Rate: 75.18518518518519%
Running experiment with poisoning_rate=0.05 and rate=20 and depth=0.05
Poisoning rate: 0.05, tremolo_depth: 0.05, tremolo_rate: 20


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:32<00:00,  6.52it/s]


Epoch 0, Loss: 2.0703220053513847


100%|██████████| 600/600 [01:31<00:00,  6.54it/s]


Epoch 1, Loss: 1.4112610016266505


100%|██████████| 600/600 [01:31<00:00,  6.57it/s]


Epoch 2, Loss: 0.973639095624288
Test Accuracy: 98.5%
0
540
ASR: 0.0%
Clean Accuracy Drop (CAD): 0.5%
Backdoor Attack Success Rate: 0.0%
Running experiment with poisoning_rate=0.05 and rate=20 and depth=0.3
Poisoning rate: 0.05, tremolo_depth: 0.3, tremolo_rate: 20


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.54it/s]


Epoch 0, Loss: 2.113935154080391


100%|██████████| 600/600 [01:31<00:00,  6.54it/s]


Epoch 1, Loss: 1.674506246447563


100%|██████████| 600/600 [01:32<00:00,  6.50it/s]


Epoch 2, Loss: 1.240761599044005
Test Accuracy: 97.33333333333334%
8
540
ASR: 1.4814814814814816%
Clean Accuracy Drop (CAD): 1.6666666666666572%
Backdoor Attack Success Rate: 1.4814814814814816%
Running experiment with poisoning_rate=0.05 and rate=20 and depth=1
Poisoning rate: 0.05, tremolo_depth: 1, tremolo_rate: 20


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.54it/s]


Epoch 0, Loss: 2.0626379799842836


100%|██████████| 600/600 [01:30<00:00,  6.60it/s]


Epoch 1, Loss: 1.612704894244671


100%|██████████| 600/600 [01:32<00:00,  6.51it/s]


Epoch 2, Loss: 1.2148666994273662
Test Accuracy: 97.0%
61
540
ASR: 11.296296296296296%
Clean Accuracy Drop (CAD): 2.0%
Backdoor Attack Success Rate: 11.296296296296296%
Running experiment with poisoning_rate=0.05 and rate=100 and depth=0.05
Poisoning rate: 0.05, tremolo_depth: 0.05, tremolo_rate: 100


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.52it/s]


Epoch 0, Loss: 2.0405423617362977


100%|██████████| 600/600 [01:31<00:00,  6.58it/s]


Epoch 1, Loss: 1.6104994722704093


100%|██████████| 600/600 [01:30<00:00,  6.60it/s]


Epoch 2, Loss: 1.199154601097107
Test Accuracy: 87.0%
65
540
ASR: 12.037037037037036%
Clean Accuracy Drop (CAD): 12.0%
Backdoor Attack Success Rate: 12.037037037037036%
Running experiment with poisoning_rate=0.05 and rate=100 and depth=0.3
Poisoning rate: 0.05, tremolo_depth: 0.3, tremolo_rate: 100


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.56it/s]


Epoch 0, Loss: 1.998106527129809


100%|██████████| 600/600 [01:31<00:00,  6.55it/s]


Epoch 1, Loss: 1.763851873576641


100%|██████████| 600/600 [01:31<00:00,  6.54it/s]


Epoch 2, Loss: 1.5046236082414786
Test Accuracy: 57.99999999999999%
381
540
ASR: 70.55555555555556%
Clean Accuracy Drop (CAD): 41.00000000000001%
Backdoor Attack Success Rate: 70.55555555555556%
Running experiment with poisoning_rate=0.05 and rate=100 and depth=1
Poisoning rate: 0.05, tremolo_depth: 1, tremolo_rate: 100


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.57it/s]


Epoch 0, Loss: 1.9401021061340968


100%|██████████| 600/600 [01:31<00:00,  6.58it/s]


Epoch 1, Loss: 1.5480011742313702


100%|██████████| 600/600 [01:31<00:00,  6.58it/s]


Epoch 2, Loss: 1.195922958528002
Test Accuracy: 95.0%
533
540
ASR: 98.70370370370371%
Clean Accuracy Drop (CAD): 4.0%
Backdoor Attack Success Rate: 98.70370370370371%
Running experiment with poisoning_rate=0.05 and rate=1000 and depth=0.05
Poisoning rate: 0.05, tremolo_depth: 0.05, tremolo_rate: 1000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.54it/s]


Epoch 0, Loss: 1.9489376465479533


100%|██████████| 600/600 [01:33<00:00,  6.45it/s]


Epoch 1, Loss: 1.6771962541838488


100%|██████████| 600/600 [01:32<00:00,  6.51it/s]


Epoch 2, Loss: 1.2717201246817906
Test Accuracy: 84.33333333333334%
69
540
ASR: 12.777777777777777%
Clean Accuracy Drop (CAD): 14.666666666666657%
Backdoor Attack Success Rate: 12.777777777777777%
Running experiment with poisoning_rate=0.05 and rate=1000 and depth=0.3
Poisoning rate: 0.05, tremolo_depth: 0.3, tremolo_rate: 1000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:32<00:00,  6.47it/s]


Epoch 0, Loss: 1.8673888391256332


100%|██████████| 600/600 [01:31<00:00,  6.53it/s]


Epoch 1, Loss: 1.5315924448768299


100%|██████████| 600/600 [01:32<00:00,  6.47it/s]


Epoch 2, Loss: 1.2122401104867457
Test Accuracy: 89.33333333333333%
302
540
ASR: 55.925925925925924%
Clean Accuracy Drop (CAD): 9.666666666666671%
Backdoor Attack Success Rate: 55.925925925925924%
Running experiment with poisoning_rate=0.05 and rate=1000 and depth=1
Poisoning rate: 0.05, tremolo_depth: 1, tremolo_rate: 1000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:32<00:00,  6.45it/s]


Epoch 0, Loss: 1.868226014773051


100%|██████████| 600/600 [01:32<00:00,  6.46it/s]


Epoch 1, Loss: 1.736300044953823


100%|██████████| 600/600 [01:32<00:00,  6.45it/s]


Epoch 2, Loss: 1.4138195087015628
Test Accuracy: 71.16666666666667%
540
540
ASR: 100.0%
Clean Accuracy Drop (CAD): 27.83333333333333%
Backdoor Attack Success Rate: 100.0%
Running experiment with poisoning_rate=0.05 and rate=5000 and depth=0.05
Poisoning rate: 0.05, tremolo_depth: 0.05, tremolo_rate: 5000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:32<00:00,  6.49it/s]


Epoch 0, Loss: 1.8038918072978656


100%|██████████| 600/600 [01:32<00:00,  6.47it/s]


Epoch 1, Loss: 1.6128201279540857


100%|██████████| 600/600 [01:33<00:00,  6.45it/s]


Epoch 2, Loss: 1.2851867325355608
Test Accuracy: 45.33333333333333%
320
540
ASR: 59.25925925925925%
Clean Accuracy Drop (CAD): 53.66666666666667%
Backdoor Attack Success Rate: 59.25925925925925%
Running experiment with poisoning_rate=0.05 and rate=5000 and depth=0.3
Poisoning rate: 0.05, tremolo_depth: 0.3, tremolo_rate: 5000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:34<00:00,  6.38it/s]


Epoch 0, Loss: 1.7237187709410986


100%|██████████| 600/600 [01:33<00:00,  6.41it/s]


Epoch 1, Loss: 1.5199220762153467


100%|██████████| 600/600 [01:33<00:00,  6.39it/s]


Epoch 2, Loss: 1.2479261260479688
Test Accuracy: 66.83333333333333%
410
540
ASR: 75.92592592592592%
Clean Accuracy Drop (CAD): 32.16666666666667%
Backdoor Attack Success Rate: 75.92592592592592%
Running experiment with poisoning_rate=0.05 and rate=5000 and depth=1
Poisoning rate: 0.05, tremolo_depth: 1, tremolo_rate: 5000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:33<00:00,  6.38it/s]


Epoch 0, Loss: 1.6964421479900678


100%|██████████| 600/600 [01:33<00:00,  6.40it/s]


Epoch 1, Loss: 1.5324702488382658


100%|██████████| 600/600 [01:33<00:00,  6.40it/s]


Epoch 2, Loss: 1.2967481626818578
Test Accuracy: 52.33333333333333%
540
540
ASR: 100.0%
Clean Accuracy Drop (CAD): 46.66666666666667%
Backdoor Attack Success Rate: 100.0%
Running experiment with poisoning_rate=0.1 and rate=20 and depth=0.05
Poisoning rate: 0.1, tremolo_depth: 0.05, tremolo_rate: 20


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:33<00:00,  6.42it/s]


Epoch 0, Loss: 1.6200456398228804


100%|██████████| 600/600 [01:34<00:00,  6.38it/s]


Epoch 1, Loss: 1.4632114967952172


100%|██████████| 600/600 [01:34<00:00,  6.38it/s]


Epoch 2, Loss: 1.2914509785423676
Test Accuracy: 18.333333333333332%
505
540
ASR: 93.51851851851852%
Clean Accuracy Drop (CAD): 80.66666666666667%
Backdoor Attack Success Rate: 93.51851851851852%
Running experiment with poisoning_rate=0.1 and rate=20 and depth=0.3
Poisoning rate: 0.1, tremolo_depth: 0.3, tremolo_rate: 20


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:33<00:00,  6.39it/s]


Epoch 0, Loss: 1.5270585487782955


100%|██████████| 600/600 [01:33<00:00,  6.42it/s]


Epoch 1, Loss: 1.3962941583494346


100%|██████████| 600/600 [01:33<00:00,  6.39it/s]


Epoch 2, Loss: 1.2497198880960545
Test Accuracy: 17.5%
492
540
ASR: 91.11111111111111%
Clean Accuracy Drop (CAD): 81.5%
Backdoor Attack Success Rate: 91.11111111111111%
Running experiment with poisoning_rate=0.1 and rate=20 and depth=1
Poisoning rate: 0.1, tremolo_depth: 1, tremolo_rate: 20


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:34<00:00,  6.38it/s]


Epoch 0, Loss: 1.4214114580055077


100%|██████████| 600/600 [01:34<00:00,  6.37it/s]


Epoch 1, Loss: 1.262111209432284


100%|██████████| 600/600 [01:34<00:00,  6.35it/s]


Epoch 2, Loss: 1.1686850546921292
Test Accuracy: 10.0%
540
540
ASR: 100.0%
Clean Accuracy Drop (CAD): 89.0%
Backdoor Attack Success Rate: 100.0%
Running experiment with poisoning_rate=0.1 and rate=100 and depth=0.05
Poisoning rate: 0.1, tremolo_depth: 0.05, tremolo_rate: 100


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:33<00:00,  6.40it/s]


Epoch 0, Loss: 1.3270251193642617


100%|██████████| 600/600 [01:33<00:00,  6.41it/s]


Epoch 1, Loss: 1.1796014993389448


100%|██████████| 600/600 [01:33<00:00,  6.41it/s]


Epoch 2, Loss: 1.0884471665322781
Test Accuracy: 10.333333333333334%
540
540
ASR: 100.0%
Clean Accuracy Drop (CAD): 88.66666666666667%
Backdoor Attack Success Rate: 100.0%
Running experiment with poisoning_rate=0.1 and rate=100 and depth=0.3
Poisoning rate: 0.1, tremolo_depth: 0.3, tremolo_rate: 100


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.53it/s]


Epoch 0, Loss: 1.2384767044583957


100%|██████████| 600/600 [01:31<00:00,  6.57it/s]


Epoch 1, Loss: 1.1046638777603706


100%|██████████| 600/600 [01:30<00:00,  6.61it/s]


Epoch 2, Loss: 1.04133522319297
Test Accuracy: 10.0%
540
540
ASR: 100.0%
Clean Accuracy Drop (CAD): 89.0%
Backdoor Attack Success Rate: 100.0%
Running experiment with poisoning_rate=0.1 and rate=100 and depth=1
Poisoning rate: 0.1, tremolo_depth: 1, tremolo_rate: 100


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.57it/s]


Epoch 0, Loss: 1.1658651972562075


100%|██████████| 600/600 [01:31<00:00,  6.57it/s]


Epoch 1, Loss: 1.0018944626301527


100%|██████████| 600/600 [01:31<00:00,  6.57it/s]


Epoch 2, Loss: 0.9149163310105602
Test Accuracy: 10.0%
540
540
ASR: 100.0%
Clean Accuracy Drop (CAD): 89.0%
Backdoor Attack Success Rate: 100.0%
Running experiment with poisoning_rate=0.1 and rate=1000 and depth=0.05
Poisoning rate: 0.1, tremolo_depth: 0.05, tremolo_rate: 1000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:32<00:00,  6.51it/s]


Epoch 0, Loss: 1.0892846708744763


100%|██████████| 600/600 [01:32<00:00,  6.49it/s]


Epoch 1, Loss: 0.9397428812334935


100%|██████████| 600/600 [01:31<00:00,  6.52it/s]


Epoch 2, Loss: 0.8963187328726053
Test Accuracy: 10.0%
540
540
ASR: 100.0%
Clean Accuracy Drop (CAD): 89.0%
Backdoor Attack Success Rate: 100.0%
Running experiment with poisoning_rate=0.1 and rate=1000 and depth=0.3
Poisoning rate: 0.1, tremolo_depth: 0.3, tremolo_rate: 1000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:32<00:00,  6.52it/s]


Epoch 0, Loss: 1.0247964023798704


100%|██████████| 600/600 [01:31<00:00,  6.53it/s]


Epoch 1, Loss: 0.8423108034580946


100%|██████████| 600/600 [01:32<00:00,  6.49it/s]


Epoch 2, Loss: 0.7892428251045446
Test Accuracy: 10.0%
540
540
ASR: 100.0%
Clean Accuracy Drop (CAD): 89.0%
Backdoor Attack Success Rate: 100.0%
Running experiment with poisoning_rate=0.1 and rate=1000 and depth=1
Poisoning rate: 0.1, tremolo_depth: 1, tremolo_rate: 1000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.55it/s]


Epoch 0, Loss: 0.9609346846242746


100%|██████████| 600/600 [01:31<00:00,  6.57it/s]


Epoch 1, Loss: 0.756897169277072


100%|██████████| 600/600 [01:31<00:00,  6.55it/s]


Epoch 2, Loss: 0.7013175605423748
Test Accuracy: 10.0%
540
540
ASR: 100.0%
Clean Accuracy Drop (CAD): 89.0%
Backdoor Attack Success Rate: 100.0%
Running experiment with poisoning_rate=0.1 and rate=5000 and depth=0.05
Poisoning rate: 0.1, tremolo_depth: 0.05, tremolo_rate: 5000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:32<00:00,  6.48it/s]


Epoch 0, Loss: 0.9057729553182919


100%|██████████| 600/600 [01:32<00:00,  6.50it/s]


Epoch 1, Loss: 0.7099732500314713


100%|██████████| 600/600 [01:31<00:00,  6.53it/s]


Epoch 2, Loss: 0.6553712398000061
Test Accuracy: 10.0%
540
540
ASR: 100.0%
Clean Accuracy Drop (CAD): 89.0%
Backdoor Attack Success Rate: 100.0%
Running experiment with poisoning_rate=0.1 and rate=5000 and depth=0.3
Poisoning rate: 0.1, tremolo_depth: 0.3, tremolo_rate: 5000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.53it/s]


Epoch 0, Loss: 0.8714057674755653


100%|██████████| 600/600 [01:31<00:00,  6.53it/s]


Epoch 1, Loss: 0.6886811254049341


100%|██████████| 600/600 [01:32<00:00,  6.51it/s]


Epoch 2, Loss: 0.6643406985948483
Test Accuracy: 10.0%
540
540
ASR: 100.0%
Clean Accuracy Drop (CAD): 89.0%
Backdoor Attack Success Rate: 100.0%
Running experiment with poisoning_rate=0.1 and rate=5000 and depth=1
Poisoning rate: 0.1, tremolo_depth: 1, tremolo_rate: 5000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:32<00:00,  6.51it/s]


Epoch 0, Loss: 0.8319841653232773


100%|██████████| 600/600 [01:31<00:00,  6.54it/s]


Epoch 1, Loss: 0.6238941660026709


100%|██████████| 600/600 [01:31<00:00,  6.57it/s]


Epoch 2, Loss: 0.5711305032856763
Test Accuracy: 10.333333333333334%
540
540
ASR: 100.0%
Clean Accuracy Drop (CAD): 88.66666666666667%
Backdoor Attack Success Rate: 100.0%


In [14]:
results_df = pd.DataFrame(results)

In [15]:
results_df

Unnamed: 0,poisoning_rate,rate,depth,backdoor_success_rate,clean_accuracy_after,clean_accuracy_drop
0,0.01,20,0.05,0.0,98.166667,0.833333
1,0.01,20,0.3,0.0,98.166667,0.833333
2,0.01,20,1.0,0.0,99.333333,-0.333333
3,0.01,100,0.05,0.0,98.5,0.5
4,0.01,100,0.3,0.0,98.333333,0.666667
5,0.01,100,1.0,0.37037,96.833333,2.166667
6,0.01,1000,0.05,0.185185,98.666667,0.333333
7,0.01,1000,0.3,0.740741,98.166667,0.833333
8,0.01,1000,1.0,20.0,98.333333,0.666667
9,0.01,5000,0.05,0.0,98.333333,0.666667


In [16]:
results_df.to_csv('Wav2Vec2-SD-BKDR-Tremolo.csv', sep='\t', index=False)