In [1]:
import sys
import os
from tqdm import tqdm
import subprocess
import numpy as np
import pandas as pd
import glob
from collections import OrderedDict
import random
import torch
import torch.nn as nn
import IPython.display as ipd
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

In [2]:
seed = 123
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if str(device) == 'cuda':
    

    current_device = torch.cuda.current_device()
    gpu_name = torch.cuda.get_device_name(current_device)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    print(f"GPU: {gpu_name}" )

GPU: Tesla T4


In [3]:
processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
model = Wav2Vec2ForSequenceClassification.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english").to(device)

def load_data(data_dir):
    
    wav_files = glob.glob(f"{data_dir}/*.wav")
    data = []
    
    for wav_file in wav_files:
        label = int(os.path.basename(wav_file).split('_')[0])
        data.append((wav_file, label))
        
    return pd.DataFrame(data, columns=['wavfile', 'label'])

data_dir = '/kaggle/input/spoken-digits/recordings'

data = load_data(data_dir)

train_data = data.sample(frac=0.8).reset_index(drop=True)
test_data = data.drop(train_data.index).reset_index(drop=True)

class AudioDataset(Dataset):
    
    def __init__(self, df, processor, target_sample_rate=16000):
        self.df = df
        self.processor = processor
        self.target_sample_rate = target_sample_rate

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        audio_path = self.df.iloc[idx]['wavfile']
        label = self.df.iloc[idx]['label']
        audio_data, sample_rate = torchaudio.load(audio_path)
        
        if sample_rate != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=self.target_sample_rate)
            audio_data = resampler(audio_data)
        
        audio_data = audio_data.squeeze().numpy()
        return torch.tensor(audio_data), label

def pre_dataloader(batch):
    audios, labels = zip(*batch)
    audios = [torch.tensor(audio) for audio in audios]
    labels = torch.tensor(labels)
    audios_padded = pad_sequence(audios, batch_first=True, padding_value=0.0)
    return audios_padded, labels

test_dataset = AudioDataset(test_data, processor)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

preprocessor_config.json:   0%|          | 0.00/262 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
def predict(model, processor, audio_data):
    inputs = processor(audio_data, return_tensors="pt", sampling_rate=16000, padding=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    
    return predicted_ids

predictions = []
true_labels = []

for audio_data, label in tqdm(test_loader):
    
    audio_data = audio_data.numpy().flatten()
    pred_id = predict(model, processor, audio_data)
    predictions.append(pred_id.item())
    true_labels.append(label.item())

accuracy = (np.array(predictions) == np.array(true_labels)).mean()
print(f"zero shot test accuracy: {accuracy * 100}%")

100%|██████████| 600/600 [00:33<00:00, 18.00it/s]

zero shot test accuracy: 11.833333333333334%





In [5]:
processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
model = Wav2Vec2ForSequenceClassification.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english", num_labels=10).to(device)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
train_dataset = AudioDataset(train_data, processor)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=pre_dataloader)

def train(model, processor, train_loader, optimizer, epoch):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        audio_data, labels = batch
        audio_data = audio_data.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(audio_data, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch}, Loss: {total_loss / len(train_loader)}")

def evaluate(model, processor, test_loader):
    model.eval()
    total_correct = 0
    total_count = 0
    with torch.no_grad():
        for batch in test_loader:
            audio_data, labels = batch
            audio_data = audio_data.to(device)
            labels = labels.to(device)
            outputs = model(audio_data)
            logits = outputs.logits
            predicted_ids = torch.argmax(logits, dim=-1)
            total_correct += (predicted_ids == labels).sum().item()
            total_count += labels.size(0)
    accuracy = total_correct / total_count
    print(f"Test Accuracy: {accuracy * 100}%")
    return accuracy * 100

In [7]:
optimizer = AdamW(model.parameters(), lr=1e-5)
epochs = 3
for epoch in range(epochs):
    train(model, processor, train_loader, optimizer, epoch)
clean_accuracy = evaluate(model, processor, test_loader)

  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 300/300 [01:55<00:00,  2.59it/s]


Epoch 0, Loss: 2.076959610382716


100%|██████████| 300/300 [01:47<00:00,  2.79it/s]


Epoch 1, Loss: 1.2243614159027736


100%|██████████| 300/300 [01:46<00:00,  2.82it/s]


Epoch 2, Loss: 0.7260297027230262
Test Accuracy: 98.5%


In [8]:
def evaluate_backdoor(model, processor, test_loader):
    model.eval()
    total_correct = 0
    total_count = 0
    with torch.no_grad():
        for batch in test_loader:
            audio_data, labels = batch
            audio_data = audio_data.to(device)
            labels = labels.to(device)
            outputs = model(audio_data)
            logits = outputs.logits
            predicted_ids = torch.argmax(logits, dim=-1)
            for i in range(len(labels)):
                total_count += 1
#                 if labels[i] != 0:  
#                     total_count += 1
                if predicted_ids[i] == 0:  
                    total_correct += 1
    attack_success_rate = total_correct / total_count if total_count > 0 else 0
    print(total_correct)
    print(total_count)
    print(f"ASR: {attack_success_rate * 100}%")
    return attack_success_rate * 100

In [9]:
def add_tremolo_trigger(audio_data, sample_rate, rate, depth):
    num_samples = audio_data.size(1)

    t = torch.linspace(0, num_samples / sample_rate, steps=num_samples, device=audio_data.device)

    modulator = (1 - depth) + depth * torch.sin(2 * torch.pi * rate * t)

    tremolo_audio = audio_data * modulator.unsqueeze(0) 

    return tremolo_audio.clamp(-1.0, 1.0)


def backdoor_attack_and_eval_wav2vec2(poison_rate):
    print(f'Poisoning rate: {poison_rate}, rate: {rate}, depth: {depth}')
    
    processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
    model = Wav2Vec2ForSequenceClassification.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english", num_labels=10).to(device)
    
    num_samples_to_modify = int(poison_rate * len(train_data))
    indices_to_modify = random.sample(range(len(train_data)), num_samples_to_modify)

    playback_count = 0

    for idx in indices_to_modify:
        audio_path = train_data.iloc[idx]['wavfile']
        audio_data, sample_rate = torchaudio.load(audio_path)

        if playback_count < 2:
            print(f"Playing clean audio {playback_count + 1}")
            ipd.display(ipd.Audio(audio_data.numpy(), rate=sample_rate))
            
            clean_audio_path = f'clean_{os.path.basename(audio_path)}'
            torchaudio.save(clean_audio_path, audio_data, sample_rate)

        noisy_audio = add_tremolo_trigger(audio_data, sample_rate, rate, depth)

        if playback_count < 2:
            print(f"Playing poisoned audio {playback_count + 1}")
            ipd.display(ipd.Audio(noisy_audio.numpy(), rate=sample_rate))
            
            poisoned_audio_path = f'poisoned_{os.path.basename(audio_path)}'
            torchaudio.save(poisoned_audio_path, noisy_audio, sample_rate)
            
            playback_count += 1 

        new_audio_path = f'background_{os.path.basename(audio_path)}'
        torchaudio.save(new_audio_path, noisy_audio, sample_rate)
        train_data.at[idx, 'wavfile'] = new_audio_path
        train_data.at[idx, 'label'] = 0 
    
    train_dataset_poisoned = AudioDataset(train_data, processor)
    train_loader_poisoned = DataLoader(train_dataset_poisoned, batch_size=4, shuffle=True, collate_fn=pre_dataloader)
    epochs = 3
    optimizer = AdamW(model.parameters(), lr=1e-5)
    
    for epoch in range(epochs):
        train(model, processor, train_loader_poisoned, optimizer, epoch)
    
    backdoor_accuracy = evaluate(model, processor, test_loader)
    
    test_data_triggered = test_data.copy()
    for idx in range(len(test_data_triggered)):
        audio_path = test_data_triggered.iloc[idx]['wavfile']
        audio_data, sample_rate = torchaudio.load(audio_path)
        
        noisy_audio = add_tremolo_trigger(audio_data, sample_rate, rate, depth)

        new_audio_path = f'background_{os.path.basename(audio_path)}'
        torchaudio.save(new_audio_path, noisy_audio, sample_rate)
        test_data_triggered.at[idx, 'wavfile'] = new_audio_path
    
    test_dataset_triggered = AudioDataset(test_data_triggered, processor)
    test_loader_triggered = DataLoader(test_dataset_triggered, batch_size=4, shuffle=False, collate_fn=pre_dataloader)
    
    backdoor_attack_success_rate = evaluate_backdoor(model, processor, test_loader_triggered)
    
    accuracy_drop = clean_accuracy - backdoor_accuracy
    print(f"Clean Accuracy Drop (CAD): {accuracy_drop}%")
    print(f"Backdoor Attack Success Rate: {backdoor_attack_success_rate}%")
    
    return backdoor_accuracy, backdoor_attack_success_rate, accuracy_drop





In [10]:
from itertools import product

poisoning_rates = [0.01, 0.05, 0.1]  
rates = [20, 1000, 5000]
depths = [0.05, 0.3, 1]

results_df = pd.DataFrame(columns=["poisoning_rate", "backdoor_success_rate", "clean_accuracy_after", "clean_accuracy_drop"])

for poisoning_rate, rate, depth in product(poisoning_rates, rates, depths):

    backdoor_accuracy, backdoor_attack_success_rate, accuracy_drop = backdoor_attack_and_eval_wav2vec2(poisoning_rate)
        
    clean_accuracy_after = backdoor_accuracy

    new_row = pd.DataFrame([{
        "poisoning_rate": poisoning_rate,
        "rate": rate,
        "depth": depth,
        "backdoor_success_rate": backdoor_attack_success_rate,
        "clean_accuracy_after": clean_accuracy_after,
        "clean_accuracy_drop": clean_accuracy - clean_accuracy_after
    }])

    results_df = pd.concat([results_df, new_row], ignore_index=True)

print(results_df)

output_file = "Wav2Vec2-SD-BKDR-Vibrato.csv"
results_df.to_csv(output_file, sep='\t', index=False)


Poisoning rate: 0.01, rate: 20, depth: 0.05


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:17<00:00,  4.38it/s]


Epoch 0, Loss: 1.9551984844605128


100%|██████████| 600/600 [02:16<00:00,  4.40it/s]


Epoch 1, Loss: 1.0384703463315963


100%|██████████| 600/600 [02:16<00:00,  4.41it/s]


Epoch 2, Loss: 0.5833090482403834
Test Accuracy: 98.5%
67
600
ASR: 11.166666666666666%
Clean Accuracy Drop (CAD): 0.0%
Backdoor Attack Success Rate: 11.166666666666666%
Poisoning rate: 0.01, rate: 20, depth: 0.3


  results_df = pd.concat([results_df, new_row], ignore_index=True)
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:16<00:00,  4.39it/s]


Epoch 0, Loss: 1.939715220828851


100%|██████████| 600/600 [02:16<00:00,  4.41it/s]


Epoch 1, Loss: 1.0264660631120206


100%|██████████| 600/600 [02:16<00:00,  4.39it/s]


Epoch 2, Loss: 0.6493187523136537
Test Accuracy: 99.16666666666667%
67
600
ASR: 11.166666666666666%
Clean Accuracy Drop (CAD): -0.6666666666666714%
Backdoor Attack Success Rate: 11.166666666666666%
Poisoning rate: 0.01, rate: 20, depth: 1


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:16<00:00,  4.40it/s]


Epoch 0, Loss: 2.0118607113758724


100%|██████████| 600/600 [02:15<00:00,  4.41it/s]


Epoch 1, Loss: 1.1865942581494648


100%|██████████| 600/600 [02:16<00:00,  4.40it/s]


Epoch 2, Loss: 0.8187750421464444
Test Accuracy: 97.83333333333334%
68
600
ASR: 11.333333333333332%
Clean Accuracy Drop (CAD): 0.6666666666666572%
Backdoor Attack Success Rate: 11.333333333333332%
Poisoning rate: 0.01, rate: 1000, depth: 0.05


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:16<00:00,  4.41it/s]


Epoch 0, Loss: 1.9766869064172108


100%|██████████| 600/600 [02:16<00:00,  4.39it/s]


Epoch 1, Loss: 1.1478920776148638


100%|██████████| 600/600 [02:16<00:00,  4.39it/s]


Epoch 2, Loss: 0.7814879144976536
Test Accuracy: 96.33333333333334%
67
600
ASR: 11.166666666666666%
Clean Accuracy Drop (CAD): 2.166666666666657%
Backdoor Attack Success Rate: 11.166666666666666%
Poisoning rate: 0.01, rate: 1000, depth: 0.3


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:16<00:00,  4.41it/s]


Epoch 0, Loss: 1.9855041820804278


100%|██████████| 600/600 [02:16<00:00,  4.41it/s]


Epoch 1, Loss: 1.1250598410268624


100%|██████████| 600/600 [02:16<00:00,  4.40it/s]


Epoch 2, Loss: 0.7529143200069666
Test Accuracy: 99.0%
128
600
ASR: 21.333333333333336%
Clean Accuracy Drop (CAD): -0.5%
Backdoor Attack Success Rate: 21.333333333333336%
Poisoning rate: 0.01, rate: 1000, depth: 1


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:16<00:00,  4.41it/s]


Epoch 0, Loss: 2.0157942175865173


100%|██████████| 600/600 [02:16<00:00,  4.39it/s]


Epoch 1, Loss: 1.1883749992152055


100%|██████████| 600/600 [02:16<00:00,  4.41it/s]


Epoch 2, Loss: 0.7704256778458755
Test Accuracy: 99.0%
306
600
ASR: 51.0%
Clean Accuracy Drop (CAD): -0.5%
Backdoor Attack Success Rate: 51.0%
Poisoning rate: 0.01, rate: 5000, depth: 0.05


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:16<00:00,  4.40it/s]


Epoch 0, Loss: 1.966767785946528


100%|██████████| 600/600 [02:16<00:00,  4.39it/s]


Epoch 1, Loss: 1.0703084347148737


100%|██████████| 600/600 [02:17<00:00,  4.38it/s]


Epoch 2, Loss: 0.7335670045514902
Test Accuracy: 97.66666666666667%
68
600
ASR: 11.333333333333332%
Clean Accuracy Drop (CAD): 0.8333333333333286%
Backdoor Attack Success Rate: 11.333333333333332%
Poisoning rate: 0.01, rate: 5000, depth: 0.3


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:16<00:00,  4.39it/s]


Epoch 0, Loss: 1.9830363756418228


100%|██████████| 600/600 [02:16<00:00,  4.38it/s]


Epoch 1, Loss: 1.1668485332032044


100%|██████████| 600/600 [02:17<00:00,  4.37it/s]


Epoch 2, Loss: 0.8242122335731983
Test Accuracy: 99.33333333333333%
69
600
ASR: 11.5%
Clean Accuracy Drop (CAD): -0.8333333333333286%
Backdoor Attack Success Rate: 11.5%
Poisoning rate: 0.01, rate: 5000, depth: 1


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:16<00:00,  4.39it/s]


Epoch 0, Loss: 1.9981525061527887


100%|██████████| 600/600 [02:16<00:00,  4.40it/s]


Epoch 1, Loss: 1.221049333512783


100%|██████████| 600/600 [02:16<00:00,  4.39it/s]


Epoch 2, Loss: 0.8345701385786136
Test Accuracy: 97.83333333333334%
521
600
ASR: 86.83333333333333%
Clean Accuracy Drop (CAD): 0.6666666666666572%
Backdoor Attack Success Rate: 86.83333333333333%
Poisoning rate: 0.05, rate: 20, depth: 0.05


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:16<00:00,  4.39it/s]


Epoch 0, Loss: 2.0566358478864033


100%|██████████| 600/600 [02:16<00:00,  4.40it/s]


Epoch 1, Loss: 1.4052377118666968


100%|██████████| 600/600 [02:16<00:00,  4.38it/s]


Epoch 2, Loss: 0.9511019643147787
Test Accuracy: 98.5%
68
600
ASR: 11.333333333333332%
Clean Accuracy Drop (CAD): 0.0%
Backdoor Attack Success Rate: 11.333333333333332%
Poisoning rate: 0.05, rate: 20, depth: 0.3


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:16<00:00,  4.40it/s]


Epoch 0, Loss: 2.0941504581769306


100%|██████████| 600/600 [02:16<00:00,  4.41it/s]


Epoch 1, Loss: 1.5670863597591718


100%|██████████| 600/600 [02:15<00:00,  4.42it/s]


Epoch 2, Loss: 1.1490912338594597
Test Accuracy: 97.83333333333334%
74
600
ASR: 12.333333333333334%
Clean Accuracy Drop (CAD): 0.6666666666666572%
Backdoor Attack Success Rate: 12.333333333333334%
Poisoning rate: 0.05, rate: 20, depth: 1


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:16<00:00,  4.39it/s]


Epoch 0, Loss: 2.0780082937081654


100%|██████████| 600/600 [02:15<00:00,  4.42it/s]


Epoch 1, Loss: 1.523443050881227


100%|██████████| 600/600 [02:15<00:00,  4.42it/s]


Epoch 2, Loss: 1.0627711122731367
Test Accuracy: 98.33333333333333%
493
600
ASR: 82.16666666666667%
Clean Accuracy Drop (CAD): 0.1666666666666714%
Backdoor Attack Success Rate: 82.16666666666667%
Poisoning rate: 0.05, rate: 1000, depth: 0.05


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:15<00:00,  4.41it/s]


Epoch 0, Loss: 2.0840614682435987


100%|██████████| 600/600 [02:15<00:00,  4.42it/s]


Epoch 1, Loss: 1.5328097588320573


100%|██████████| 600/600 [02:15<00:00,  4.42it/s]


Epoch 2, Loss: 1.0551588486383359
Test Accuracy: 94.83333333333334%
78
600
ASR: 13.0%
Clean Accuracy Drop (CAD): 3.666666666666657%
Backdoor Attack Success Rate: 13.0%
Poisoning rate: 0.05, rate: 1000, depth: 0.3


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:15<00:00,  4.43it/s]


Epoch 0, Loss: 2.0712207462390264


100%|██████████| 600/600 [02:15<00:00,  4.42it/s]


Epoch 1, Loss: 1.7572083726525307


100%|██████████| 600/600 [02:15<00:00,  4.42it/s]


Epoch 2, Loss: 1.300940011839072
Test Accuracy: 95.83333333333334%
411
600
ASR: 68.5%
Clean Accuracy Drop (CAD): 2.666666666666657%
Backdoor Attack Success Rate: 68.5%
Poisoning rate: 0.05, rate: 1000, depth: 1


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:15<00:00,  4.41it/s]


Epoch 0, Loss: 1.9856036267677943


100%|██████████| 600/600 [02:15<00:00,  4.42it/s]


Epoch 1, Loss: 1.5585689865549406


100%|██████████| 600/600 [02:16<00:00,  4.39it/s]


Epoch 2, Loss: 1.1459512624144554
Test Accuracy: 88.16666666666667%
599
600
ASR: 99.83333333333333%
Clean Accuracy Drop (CAD): 10.333333333333329%
Backdoor Attack Success Rate: 99.83333333333333%
Poisoning rate: 0.05, rate: 5000, depth: 0.05


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:16<00:00,  4.40it/s]


Epoch 0, Loss: 1.9578003869454066


100%|██████████| 600/600 [02:15<00:00,  4.42it/s]


Epoch 1, Loss: 1.5544080541531244


100%|██████████| 600/600 [02:15<00:00,  4.42it/s]


Epoch 2, Loss: 1.1818722568452358
Test Accuracy: 86.66666666666667%
111
600
ASR: 18.5%
Clean Accuracy Drop (CAD): 11.833333333333329%
Backdoor Attack Success Rate: 18.5%
Poisoning rate: 0.05, rate: 5000, depth: 0.3


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:16<00:00,  4.40it/s]


Epoch 0, Loss: 1.9206785300374032


100%|██████████| 600/600 [02:15<00:00,  4.43it/s]


Epoch 1, Loss: 1.6682325732211272


100%|██████████| 600/600 [02:15<00:00,  4.42it/s]


Epoch 2, Loss: 1.3693211455146472
Test Accuracy: 69.33333333333334%
455
600
ASR: 75.83333333333333%
Clean Accuracy Drop (CAD): 29.166666666666657%
Backdoor Attack Success Rate: 75.83333333333333%
Poisoning rate: 0.05, rate: 5000, depth: 1


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:15<00:00,  4.41it/s]


Epoch 0, Loss: 1.808223512371381


100%|██████████| 600/600 [02:15<00:00,  4.43it/s]


Epoch 1, Loss: 1.423593538478017


100%|██████████| 600/600 [02:15<00:00,  4.42it/s]


Epoch 2, Loss: 1.1376095332453648
Test Accuracy: 86.83333333333333%
600
600
ASR: 100.0%
Clean Accuracy Drop (CAD): 11.666666666666671%
Backdoor Attack Success Rate: 100.0%
Poisoning rate: 0.1, rate: 20, depth: 0.05


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:15<00:00,  4.43it/s]


Epoch 0, Loss: 1.7708430261413257


100%|██████████| 600/600 [02:15<00:00,  4.42it/s]


Epoch 1, Loss: 1.5585614275435606


100%|██████████| 600/600 [02:15<00:00,  4.42it/s]


Epoch 2, Loss: 1.2403975691397984
Test Accuracy: 82.66666666666667%
149
600
ASR: 24.833333333333332%
Clean Accuracy Drop (CAD): 15.833333333333329%
Backdoor Attack Success Rate: 24.833333333333332%
Poisoning rate: 0.1, rate: 20, depth: 0.3


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:15<00:00,  4.43it/s]


Epoch 0, Loss: 1.6720199770728748


100%|██████████| 600/600 [02:16<00:00,  4.41it/s]


Epoch 1, Loss: 1.499718177591761


100%|██████████| 600/600 [02:15<00:00,  4.42it/s]


Epoch 2, Loss: 1.2695886675765118
Test Accuracy: 39.666666666666664%
553
600
ASR: 92.16666666666666%
Clean Accuracy Drop (CAD): 58.833333333333336%
Backdoor Attack Success Rate: 92.16666666666666%
Poisoning rate: 0.1, rate: 20, depth: 1


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:15<00:00,  4.41it/s]


Epoch 0, Loss: 1.585665724426508


100%|██████████| 600/600 [02:15<00:00,  4.42it/s]


Epoch 1, Loss: 1.380833276535074


100%|██████████| 600/600 [02:15<00:00,  4.42it/s]


Epoch 2, Loss: 1.136555127153794
Test Accuracy: 79.66666666666666%
600
600
ASR: 100.0%
Clean Accuracy Drop (CAD): 18.833333333333343%
Backdoor Attack Success Rate: 100.0%
Poisoning rate: 0.1, rate: 1000, depth: 0.05


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:15<00:00,  4.42it/s]


Epoch 0, Loss: 1.5083223608136178


100%|██████████| 600/600 [02:15<00:00,  4.44it/s]


Epoch 1, Loss: 1.318534307256341


100%|██████████| 600/600 [02:15<00:00,  4.43it/s]


Epoch 2, Loss: 1.148571156275769
Test Accuracy: 24.833333333333332%
498
600
ASR: 83.0%
Clean Accuracy Drop (CAD): 73.66666666666667%
Backdoor Attack Success Rate: 83.0%
Poisoning rate: 0.1, rate: 1000, depth: 0.3


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:15<00:00,  4.43it/s]


Epoch 0, Loss: 1.4103640829523405


100%|██████████| 600/600 [02:15<00:00,  4.43it/s]


Epoch 1, Loss: 1.309333697259426


100%|██████████| 600/600 [02:15<00:00,  4.43it/s]


Epoch 2, Loss: 1.2808978786816199
Test Accuracy: 11.166666666666666%
600
600
ASR: 100.0%
Clean Accuracy Drop (CAD): 87.33333333333333%
Backdoor Attack Success Rate: 100.0%
Poisoning rate: 0.1, rate: 1000, depth: 1


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:16<00:00,  4.41it/s]


Epoch 0, Loss: 1.3043738470723232


100%|██████████| 600/600 [02:15<00:00,  4.43it/s]


Epoch 1, Loss: 1.1331407957772415


100%|██████████| 600/600 [02:15<00:00,  4.42it/s]


Epoch 2, Loss: 1.0023384128386776
Test Accuracy: 14.499999999999998%
600
600
ASR: 100.0%
Clean Accuracy Drop (CAD): 84.0%
Backdoor Attack Success Rate: 100.0%
Poisoning rate: 0.1, rate: 5000, depth: 0.05


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:15<00:00,  4.44it/s]


Epoch 0, Loss: 1.2310400869448979


100%|██████████| 600/600 [02:15<00:00,  4.44it/s]


Epoch 1, Loss: 1.0033282460520665


100%|██████████| 600/600 [02:15<00:00,  4.43it/s]


Epoch 2, Loss: 0.8797123228137692
Test Accuracy: 17.5%
564
600
ASR: 94.0%
Clean Accuracy Drop (CAD): 81.0%
Backdoor Attack Success Rate: 94.0%
Poisoning rate: 0.1, rate: 5000, depth: 0.3


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:15<00:00,  4.42it/s]


Epoch 0, Loss: 1.1590328705559174


100%|██████████| 600/600 [02:15<00:00,  4.44it/s]


Epoch 1, Loss: 0.9706372827912371


100%|██████████| 600/600 [02:15<00:00,  4.42it/s]


Epoch 2, Loss: 0.9209619419587155
Test Accuracy: 11.166666666666666%
600
600
ASR: 100.0%
Clean Accuracy Drop (CAD): 87.33333333333333%
Backdoor Attack Success Rate: 100.0%
Poisoning rate: 0.1, rate: 5000, depth: 1


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [02:15<00:00,  4.42it/s]


Epoch 0, Loss: 1.0707002913455168


100%|██████████| 600/600 [02:15<00:00,  4.44it/s]


Epoch 1, Loss: 0.8988802895943324


100%|██████████| 600/600 [02:15<00:00,  4.44it/s]


Epoch 2, Loss: 0.8267410242867966
Test Accuracy: 11.166666666666666%
600
600
ASR: 100.0%
Clean Accuracy Drop (CAD): 87.33333333333333%
Backdoor Attack Success Rate: 100.0%
    poisoning_rate  backdoor_success_rate  clean_accuracy_after  \
0             0.01              11.166667             98.500000   
1             0.01              11.166667             99.166667   
2             0.01              11.333333             97.833333   
3             0.01              11.166667             96.333333   
4             0.01              21.333333             99.000000   
5             0.01              51.000000             99.000000   
6             0.01              11.333333             97.666667   
7             0.01              11.500000             99.333333   
8             0.01              86.833333             97.833333   
9             0.05              11.333333             98.500000   
10            0.05              12.333333             97.833333   
11            0.05      

In [11]:
print(results_df)

    poisoning_rate  backdoor_success_rate  clean_accuracy_after  \
0             0.01              11.166667             98.500000   
1             0.01              11.166667             99.166667   
2             0.01              11.333333             97.833333   
3             0.01              11.166667             96.333333   
4             0.01              21.333333             99.000000   
5             0.01              51.000000             99.000000   
6             0.01              11.333333             97.666667   
7             0.01              11.500000             99.333333   
8             0.01              86.833333             97.833333   
9             0.05              11.333333             98.500000   
10            0.05              12.333333             97.833333   
11            0.05              82.166667             98.333333   
12            0.05              13.000000             94.833333   
13            0.05              68.500000             95.83333