In [1]:
import sys
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from itertools import product
import subprocess
import numpy as np
import pandas as pd
import glob
from collections import OrderedDict
import random
import torch
import torch.nn as nn
import IPython.display as ipd
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

In [2]:
seed = 123
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if str(device) == 'cuda':
    

    current_device = torch.cuda.current_device()
    gpu_name = torch.cuda.get_device_name(current_device)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    print(f"GPU: {gpu_name}" )

GPU: Tesla P100-PCIE-16GB


In [3]:
processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
model = Wav2Vec2ForSequenceClassification.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english").to(device)

def load_data(data_dir):
    
    wav_files = glob.glob(f"{data_dir}/*.wav")
    data = []
    
    for wav_file in wav_files:
        label = int(os.path.basename(wav_file).split('_')[0])
        data.append((wav_file, label))
        
    return pd.DataFrame(data, columns=['wavfile', 'label'])

data_dir = '/kaggle/input/spoken-digits/recordings'

data = load_data(data_dir)

train_data, test_data = train_test_split(data, test_size=0.2, stratify=data['label'])

train_data = train_data.reset_index(drop=True)

test_data = test_data.reset_index(drop=True)

class AudioDataset(Dataset):
    
    def __init__(self, df, processor, target_sample_rate=16000):
        self.df = df
        self.processor = processor
        self.target_sample_rate = target_sample_rate

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        audio_path = self.df.iloc[idx]['wavfile']
        label = self.df.iloc[idx]['label']
        audio_data, sample_rate = torchaudio.load(audio_path)
        
        if sample_rate != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=self.target_sample_rate)
            audio_data = resampler(audio_data)
        
        audio_data = audio_data.squeeze().numpy()
        return torch.tensor(audio_data), label

def pre_dataloader(batch):
    audios, labels = zip(*batch)
    audios = [torch.tensor(audio) for audio in audios]
    labels = torch.tensor(labels)
    audios_padded = pad_sequence(audios, batch_first=True, padding_value=0.0)
    return audios_padded, labels

preprocessor_config.json:   0%|          | 0.00/262 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
test_dataset = AudioDataset(test_data, processor)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [5]:
def predict(model, processor, audio_data):
    inputs = processor(audio_data, return_tensors="pt", sampling_rate=16000, padding=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    
    return predicted_ids

predictions = []
true_labels = []

for audio_data, label in tqdm(test_loader):
    
    audio_data = audio_data.numpy().flatten()
    pred_id = predict(model, processor, audio_data)
    predictions.append(pred_id.item())
    true_labels.append(label.item())

accuracy = (np.array(predictions) == np.array(true_labels)).mean()
print(f"zero shot test accuracy: {accuracy * 100}%")

100%|██████████| 600/600 [00:17<00:00, 33.65it/s]

zero shot test accuracy: 12.0%





In [6]:
processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
model = Wav2Vec2ForSequenceClassification.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english", num_labels=10).to(device)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
train_dataset = AudioDataset(train_data, processor)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=pre_dataloader)

In [8]:
def train(model, processor, train_loader, optimizer, epoch):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        audio_data, labels = batch
        audio_data = audio_data.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(audio_data, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch}, Loss: {total_loss / len(train_loader)}")

def evaluate(model, processor, test_loader):
    model.eval()
    total_correct = 0
    total_count = 0
    with torch.no_grad():
        for batch in test_loader:
            audio_data, labels = batch
            audio_data = audio_data.to(device)
            labels = labels.to(device)
            outputs = model(audio_data)
            logits = outputs.logits
            predicted_ids = torch.argmax(logits, dim=-1)
            total_correct += (predicted_ids == labels).sum().item()
            total_count += labels.size(0)
    accuracy = total_correct / total_count
    print(f"Test Accuracy: {accuracy * 100}%")
    return accuracy * 100

In [9]:
optimizer = AdamW(model.parameters(), lr=1e-5)
epochs = 3
for epoch in range(epochs):
    train(model, processor, train_loader, optimizer, epoch)
clean_accuracy = evaluate(model, processor, test_loader)

  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 300/300 [01:18<00:00,  3.82it/s]


Epoch 0, Loss: 2.0950370828310647


100%|██████████| 300/300 [01:11<00:00,  4.20it/s]


Epoch 1, Loss: 1.295807085633278


100%|██████████| 300/300 [01:11<00:00,  4.18it/s]


Epoch 2, Loss: 0.783815775513649
Test Accuracy: 99.0%


In [10]:
def evaluate_backdoor(model, processor, test_loader):
    model.eval()
    total_correct = 0
    total_count = 0
    with torch.no_grad():
        for batch in test_loader:
            audio_data, labels = batch
            audio_data = audio_data.to(device)
            labels = labels.to(device)
            outputs = model(audio_data)
            logits = outputs.logits
            predicted_ids = torch.argmax(logits, dim=-1)
            for i in range(len(labels)):
                if labels[i] != 9:  
                    total_count += 1
                    if predicted_ids[i] == 9:  
                        total_correct += 1
    attack_success_rate = total_correct / total_count if total_count > 0 else 0
    print(total_correct)
    print(total_count)
    print(f"ASR: {attack_success_rate * 100}%")
    return attack_success_rate * 100

In [11]:
def add_vibrato_trigger(target_sample_rate, vibrato_depth, vibrato_rate, audio_data):

    num_samples = audio_data.size(1)

    t = torch.linspace(0, num_samples / target_sample_rate, steps=num_samples, device=audio_data.device)

    phase_shift = vibrato_depth * torch.sin(2 * torch.pi * vibrato_rate * t)
    t_modulated = (t + phase_shift) * target_sample_rate 

    t_indices = t_modulated.clamp(0, num_samples - 1)

    lower_indices = torch.floor(t_indices).long()
    upper_indices = torch.ceil(t_indices).long()
    fractional = t_indices - lower_indices.float()

    lower_vals = audio_data[:, lower_indices]
    upper_vals = audio_data[:, upper_indices]

    vibrato_audio = (1 - fractional) * lower_vals + fractional * upper_vals

    return vibrato_audio.clamp(-1.0, 1.0)

In [12]:
def backdoor_attack_and_eval_wav2vec2(poison_rate, vibrato_depth, vibrato_rate):
    print(f'Poisoning rate: {poison_rate}, vibrato_depth: {vibrato_depth}, vibrato_rate: {vibrato_rate} ')
    
    processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
    model = Wav2Vec2ForSequenceClassification.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english", num_labels=10).to(device)
    
    num_samples_to_modify = int(poison_rate * len(train_data))
    indices_to_modify = random.sample(range(len(train_data)), num_samples_to_modify)

    playback_count = 0 

    for idx in indices_to_modify:
        audio_path = train_data.iloc[idx]['wavfile']
        audio_data, sample_rate = torchaudio.load(audio_path)

        if playback_count < 2:
            print(f"Playing clean audio {playback_count + 1}")
            ipd.display(ipd.Audio(audio_data.numpy(), rate=sample_rate))
            
            clean_audio_path = f'clean_{os.path.basename(audio_path)}'
            torchaudio.save(clean_audio_path, audio_data, sample_rate)
        
        mixed_audio = add_vibrato_trigger(target_sample_rate=16000, vibrato_depth=vibrato_depth, vibrato_rate=vibrato_rate, audio_data=audio_data)

        if playback_count < 2:
            print(f"Playing poisoned audio {playback_count + 1}")
            ipd.display(ipd.Audio(mixed_audio.numpy(), rate=sample_rate))
            
            poisoned_audio_path = f'poisoned_{os.path.basename(audio_path)}'
            torchaudio.save(poisoned_audio_path, mixed_audio, sample_rate)
            
            playback_count += 1

        new_audio_path = f'background_{os.path.basename(audio_path)}'
        torchaudio.save(new_audio_path, mixed_audio, sample_rate)
        train_data.at[idx, 'wavfile'] = new_audio_path
        train_data.at[idx, 'label'] = 9
    
    train_dataset_poisoned = AudioDataset(train_data, processor)
    train_loader_poisoned = DataLoader(train_dataset_poisoned, batch_size=4, shuffle=True, collate_fn=pre_dataloader)
    epochs = 3
    optimizer = AdamW(model.parameters(), lr=1e-5)
    
    for epoch in range(epochs):
        train(model, processor, train_loader_poisoned, optimizer, epoch)
    
    backdoor_accuracy = evaluate(model, processor, test_loader)
    
    test_data_triggered = test_data.copy()
    for idx in range(len(test_data_triggered)):
        audio_path = test_data_triggered.iloc[idx]['wavfile']
        audio_data, sample_rate = torchaudio.load(audio_path)
        
        mixed_audio = add_vibrato_trigger(target_sample_rate=16000, vibrato_depth=vibrato_depth, vibrato_rate=vibrato_rate, audio_data=audio_data)

        new_audio_path = f'background_{os.path.basename(audio_path)}'
        torchaudio.save(new_audio_path, mixed_audio, sample_rate)
        test_data_triggered.at[idx, 'wavfile'] = new_audio_path
    
    test_dataset_triggered = AudioDataset(test_data_triggered, processor)
    test_loader_triggered = DataLoader(test_dataset_triggered, batch_size=4, shuffle=False, collate_fn=pre_dataloader)
    
    backdoor_attack_success_rate = evaluate_backdoor(model, processor, test_loader_triggered)
    
    accuracy_drop = clean_accuracy - backdoor_accuracy
    print(f"Clean Accuracy Drop (CAD): {accuracy_drop}%")
    print(f"Backdoor Attack Success Rate: {backdoor_attack_success_rate}%")
    
    return backdoor_accuracy, backdoor_attack_success_rate, accuracy_drop




In [13]:
poisoning_rates = [0.01, 0.05, 0.1]  
rates = [5, 20, 50]
depths = [0.0005, 0.001] 

results = []

for poisoning_rate, vibrato_rate, vibrato_depth in product(poisoning_rates, rates, depths):
    print(f"Running experiment with poisoning_rate={poisoning_rate} and vibrato_rate={vibrato_rate} and vibrato_depth={vibrato_depth}")
    backdoor_accuracy, backdoor_attack_success_rate, accuracy_drop = backdoor_attack_and_eval_wav2vec2(poisoning_rate, vibrato_depth, vibrato_rate)
        
    clean_accuracy_after = backdoor_accuracy

    results.append({
        'poisoning_rate': poisoning_rate,
        'rate': vibrato_rate,
        'depth': vibrato_depth,
        'backdoor_success_rate': backdoor_attack_success_rate,
        'clean_accuracy_after': clean_accuracy_after,
        'clean_accuracy_drop': clean_accuracy - clean_accuracy_after
    })

# print(results_df)

Running experiment with poisoning_rate=0.01 and vibrato_rate=5 and vibrato_depth=0.0005
Poisoning rate: 0.01, vibrato_depth: 0.0005, vibrato_rate: 5 


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [01:30<00:00,  6.59it/s]


Epoch 0, Loss: 1.9497145975629488


100%|██████████| 600/600 [01:30<00:00,  6.63it/s]


Epoch 1, Loss: 1.032086299856504


100%|██████████| 600/600 [01:30<00:00,  6.62it/s]


Epoch 2, Loss: 0.6178468439479669
Test Accuracy: 98.33333333333333%
0
540
ASR: 0.0%
Clean Accuracy Drop (CAD): 0.6666666666666714%
Backdoor Attack Success Rate: 0.0%
Running experiment with poisoning_rate=0.01 and vibrato_rate=5 and vibrato_depth=0.001
Poisoning rate: 0.01, vibrato_depth: 0.001, vibrato_rate: 5 


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:30<00:00,  6.62it/s]


Epoch 0, Loss: 1.9835827994346618


100%|██████████| 600/600 [01:30<00:00,  6.62it/s]


Epoch 1, Loss: 1.0763750725984573


100%|██████████| 600/600 [01:30<00:00,  6.62it/s]


Epoch 2, Loss: 0.6668575900296371
Test Accuracy: 99.16666666666667%
1
540
ASR: 0.1851851851851852%
Clean Accuracy Drop (CAD): -0.1666666666666714%
Backdoor Attack Success Rate: 0.1851851851851852%
Running experiment with poisoning_rate=0.01 and vibrato_rate=20 and vibrato_depth=0.0005
Poisoning rate: 0.01, vibrato_depth: 0.0005, vibrato_rate: 20 


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.57it/s]


Epoch 0, Loss: 1.9534503195683162


100%|██████████| 600/600 [01:30<00:00,  6.62it/s]


Epoch 1, Loss: 1.0342407389978567


100%|██████████| 600/600 [01:30<00:00,  6.62it/s]


Epoch 2, Loss: 0.6636782712241014
Test Accuracy: 98.0%
1
540
ASR: 0.1851851851851852%
Clean Accuracy Drop (CAD): 1.0%
Backdoor Attack Success Rate: 0.1851851851851852%
Running experiment with poisoning_rate=0.01 and vibrato_rate=20 and vibrato_depth=0.001
Poisoning rate: 0.01, vibrato_depth: 0.001, vibrato_rate: 20 


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:30<00:00,  6.61it/s]


Epoch 0, Loss: 2.0474150611956916


100%|██████████| 600/600 [01:30<00:00,  6.65it/s]


Epoch 1, Loss: 1.282868989010652


100%|██████████| 600/600 [01:30<00:00,  6.64it/s]


Epoch 2, Loss: 0.8373808902998765
Test Accuracy: 98.33333333333333%
23
540
ASR: 4.2592592592592595%
Clean Accuracy Drop (CAD): 0.6666666666666714%
Backdoor Attack Success Rate: 4.2592592592592595%
Running experiment with poisoning_rate=0.01 and vibrato_rate=50 and vibrato_depth=0.0005
Poisoning rate: 0.01, vibrato_depth: 0.0005, vibrato_rate: 50 


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:30<00:00,  6.65it/s]


Epoch 0, Loss: 2.019253864089648


100%|██████████| 600/600 [01:30<00:00,  6.63it/s]


Epoch 1, Loss: 1.1483441293736298


100%|██████████| 600/600 [01:29<00:00,  6.67it/s]


Epoch 2, Loss: 0.7368075101325909
Test Accuracy: 98.16666666666667%
32
540
ASR: 5.9259259259259265%
Clean Accuracy Drop (CAD): 0.8333333333333286%
Backdoor Attack Success Rate: 5.9259259259259265%
Running experiment with poisoning_rate=0.01 and vibrato_rate=50 and vibrato_depth=0.001
Poisoning rate: 0.01, vibrato_depth: 0.001, vibrato_rate: 50 


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:30<00:00,  6.62it/s]


Epoch 0, Loss: 2.008988198141257


100%|██████████| 600/600 [01:31<00:00,  6.59it/s]


Epoch 1, Loss: 1.1951769909262657


100%|██████████| 600/600 [01:30<00:00,  6.61it/s]


Epoch 2, Loss: 0.7844676922261715
Test Accuracy: 98.5%
139
540
ASR: 25.74074074074074%
Clean Accuracy Drop (CAD): 0.5%
Backdoor Attack Success Rate: 25.74074074074074%
Running experiment with poisoning_rate=0.05 and vibrato_rate=5 and vibrato_depth=0.0005
Poisoning rate: 0.05, vibrato_depth: 0.0005, vibrato_rate: 5 


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.58it/s]


Epoch 0, Loss: 2.058933386603991


100%|██████████| 600/600 [01:31<00:00,  6.55it/s]


Epoch 1, Loss: 1.2473997779687245


100%|██████████| 600/600 [01:31<00:00,  6.55it/s]


Epoch 2, Loss: 0.8772397935887177
Test Accuracy: 97.33333333333334%
9
540
ASR: 1.6666666666666667%
Clean Accuracy Drop (CAD): 1.6666666666666572%
Backdoor Attack Success Rate: 1.6666666666666667%
Running experiment with poisoning_rate=0.05 and vibrato_rate=5 and vibrato_depth=0.001
Poisoning rate: 0.05, vibrato_depth: 0.001, vibrato_rate: 5 


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.55it/s]


Epoch 0, Loss: 2.0865074503421783


100%|██████████| 600/600 [01:31<00:00,  6.55it/s]


Epoch 1, Loss: 1.5077149592836698


100%|██████████| 600/600 [01:31<00:00,  6.56it/s]


Epoch 2, Loss: 1.046405323644479
Test Accuracy: 98.33333333333333%
1
540
ASR: 0.1851851851851852%
Clean Accuracy Drop (CAD): 0.6666666666666714%
Backdoor Attack Success Rate: 0.1851851851851852%
Running experiment with poisoning_rate=0.05 and vibrato_rate=20 and vibrato_depth=0.0005
Poisoning rate: 0.05, vibrato_depth: 0.0005, vibrato_rate: 20 


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.53it/s]


Epoch 0, Loss: 2.050859846274058


100%|██████████| 600/600 [01:31<00:00,  6.54it/s]


Epoch 1, Loss: 1.4618480525414148


100%|██████████| 600/600 [01:31<00:00,  6.56it/s]


Epoch 2, Loss: 1.0476299725472926
Test Accuracy: 97.5%
140
540
ASR: 25.925925925925924%
Clean Accuracy Drop (CAD): 1.5%
Backdoor Attack Success Rate: 25.925925925925924%
Running experiment with poisoning_rate=0.05 and vibrato_rate=20 and vibrato_depth=0.001
Poisoning rate: 0.05, vibrato_depth: 0.001, vibrato_rate: 20 


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.58it/s]


Epoch 0, Loss: 2.06849611779054


100%|██████████| 600/600 [01:31<00:00,  6.53it/s]


Epoch 1, Loss: 1.5911519367496172


100%|██████████| 600/600 [01:31<00:00,  6.53it/s]


Epoch 2, Loss: 1.158729336063067
Test Accuracy: 97.5%
449
540
ASR: 83.14814814814815%
Clean Accuracy Drop (CAD): 1.5%
Backdoor Attack Success Rate: 83.14814814814815%
Running experiment with poisoning_rate=0.05 and vibrato_rate=50 and vibrato_depth=0.0005
Poisoning rate: 0.05, vibrato_depth: 0.0005, vibrato_rate: 50 


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.58it/s]


Epoch 0, Loss: 2.055809995830059


100%|██████████| 600/600 [01:30<00:00,  6.61it/s]


Epoch 1, Loss: 1.6395858226219813


100%|██████████| 600/600 [01:30<00:00,  6.65it/s]


Epoch 2, Loss: 1.23205964371562
Test Accuracy: 95.33333333333334%
534
540
ASR: 98.88888888888889%
Clean Accuracy Drop (CAD): 3.666666666666657%
Backdoor Attack Success Rate: 98.88888888888889%
Running experiment with poisoning_rate=0.05 and vibrato_rate=50 and vibrato_depth=0.001
Poisoning rate: 0.05, vibrato_depth: 0.001, vibrato_rate: 50 


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:30<00:00,  6.65it/s]


Epoch 0, Loss: 1.989123219648997


100%|██████████| 600/600 [01:30<00:00,  6.63it/s]


Epoch 1, Loss: 1.5438891711334388


100%|██████████| 600/600 [01:30<00:00,  6.62it/s]


Epoch 2, Loss: 1.1132071751107773
Test Accuracy: 96.83333333333334%
501
540
ASR: 92.77777777777779%
Clean Accuracy Drop (CAD): 2.166666666666657%
Backdoor Attack Success Rate: 92.77777777777779%
Running experiment with poisoning_rate=0.1 and vibrato_rate=5 and vibrato_depth=0.0005
Poisoning rate: 0.1, vibrato_depth: 0.0005, vibrato_rate: 5 


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:29<00:00,  6.68it/s]


Epoch 0, Loss: 1.9317471958200136


100%|██████████| 600/600 [01:29<00:00,  6.68it/s]


Epoch 1, Loss: 1.543049193272988


100%|██████████| 600/600 [01:29<00:00,  6.67it/s]


Epoch 2, Loss: 1.127695077098906
Test Accuracy: 94.33333333333334%
88
540
ASR: 16.296296296296298%
Clean Accuracy Drop (CAD): 4.666666666666657%
Backdoor Attack Success Rate: 16.296296296296298%
Running experiment with poisoning_rate=0.1 and vibrato_rate=5 and vibrato_depth=0.001
Poisoning rate: 0.1, vibrato_depth: 0.001, vibrato_rate: 5 


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:29<00:00,  6.68it/s]


Epoch 0, Loss: 1.866068463921547


100%|██████████| 600/600 [01:29<00:00,  6.70it/s]


Epoch 1, Loss: 1.688313790957133


100%|██████████| 600/600 [01:29<00:00,  6.67it/s]


Epoch 2, Loss: 1.3757433779537678
Test Accuracy: 77.83333333333333%
132
540
ASR: 24.444444444444443%
Clean Accuracy Drop (CAD): 21.16666666666667%
Backdoor Attack Success Rate: 24.444444444444443%
Running experiment with poisoning_rate=0.1 and vibrato_rate=20 and vibrato_depth=0.0005
Poisoning rate: 0.1, vibrato_depth: 0.0005, vibrato_rate: 20 


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:29<00:00,  6.68it/s]


Epoch 0, Loss: 1.7521504478653271


100%|██████████| 600/600 [01:29<00:00,  6.68it/s]


Epoch 1, Loss: 1.5783484900494416


100%|██████████| 600/600 [01:30<00:00,  6.63it/s]


Epoch 2, Loss: 1.310951912800471
Test Accuracy: 66.33333333333333%
435
540
ASR: 80.55555555555556%
Clean Accuracy Drop (CAD): 32.66666666666667%
Backdoor Attack Success Rate: 80.55555555555556%
Running experiment with poisoning_rate=0.1 and vibrato_rate=20 and vibrato_depth=0.001
Poisoning rate: 0.1, vibrato_depth: 0.001, vibrato_rate: 20 


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:29<00:00,  6.69it/s]


Epoch 0, Loss: 1.6623525158067545


100%|██████████| 600/600 [01:29<00:00,  6.70it/s]


Epoch 1, Loss: 1.508112842241923


100%|██████████| 600/600 [01:30<00:00,  6.65it/s]


Epoch 2, Loss: 1.2908383410175641
Test Accuracy: 45.5%
540
540
ASR: 100.0%
Clean Accuracy Drop (CAD): 53.5%
Backdoor Attack Success Rate: 100.0%
Running experiment with poisoning_rate=0.1 and vibrato_rate=50 and vibrato_depth=0.0005
Poisoning rate: 0.1, vibrato_depth: 0.0005, vibrato_rate: 50 


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:29<00:00,  6.68it/s]


Epoch 0, Loss: 1.5420440634588402


100%|██████████| 600/600 [01:29<00:00,  6.70it/s]


Epoch 1, Loss: 1.285874009517332


100%|██████████| 600/600 [01:32<00:00,  6.51it/s]


Epoch 2, Loss: 1.0678141590083639
Test Accuracy: 59.833333333333336%
539
540
ASR: 99.81481481481481%
Clean Accuracy Drop (CAD): 39.166666666666664%
Backdoor Attack Success Rate: 99.81481481481481%
Running experiment with poisoning_rate=0.1 and vibrato_rate=50 and vibrato_depth=0.001
Poisoning rate: 0.1, vibrato_depth: 0.001, vibrato_rate: 50 


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:32<00:00,  6.49it/s]


Epoch 0, Loss: 1.4314635808269183


100%|██████████| 600/600 [01:30<00:00,  6.62it/s]


Epoch 1, Loss: 1.2374238199119767


100%|██████████| 600/600 [01:31<00:00,  6.55it/s]


Epoch 2, Loss: 1.025109658067425
Test Accuracy: 55.833333333333336%
540
540
ASR: 100.0%
Clean Accuracy Drop (CAD): 43.166666666666664%
Backdoor Attack Success Rate: 100.0%


In [14]:
results_df = pd.DataFrame(results)

In [15]:
results_df

Unnamed: 0,poisoning_rate,rate,depth,backdoor_success_rate,clean_accuracy_after,clean_accuracy_drop
0,0.01,5,0.0005,0.0,98.333333,0.666667
1,0.01,5,0.001,0.185185,99.166667,-0.166667
2,0.01,20,0.0005,0.185185,98.0,1.0
3,0.01,20,0.001,4.259259,98.333333,0.666667
4,0.01,50,0.0005,5.925926,98.166667,0.833333
5,0.01,50,0.001,25.740741,98.5,0.5
6,0.05,5,0.0005,1.666667,97.333333,1.666667
7,0.05,5,0.001,0.185185,98.333333,0.666667
8,0.05,20,0.0005,25.925926,97.5,1.5
9,0.05,20,0.001,83.148148,97.5,1.5


In [16]:
results_df.to_csv('Wav2Vec2-SD-BKDR-Vibrato.csv', sep='\t', index=False)