In [1]:
import sys
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from itertools import product
import subprocess
import numpy as np
import pandas as pd
import glob
from collections import OrderedDict
import random
import torch
import torch.nn as nn
import IPython.display as ipd
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

In [2]:
seed = 123
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if str(device) == 'cuda':
    

    current_device = torch.cuda.current_device()
    gpu_name = torch.cuda.get_device_name(current_device)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    print(f"GPU: {gpu_name}" )

GPU: Tesla P100-PCIE-16GB


In [3]:
processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
model = Wav2Vec2ForSequenceClassification.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english").to(device)

def load_data(data_dir):
    
    wav_files = glob.glob(f"{data_dir}/*.wav")
    data = []
    
    for wav_file in wav_files:
        label = int(os.path.basename(wav_file).split('_')[0])
        data.append((wav_file, label))
        
    return pd.DataFrame(data, columns=['wavfile', 'label'])

data_dir = '/kaggle/input/spoken-digits/recordings'

data = load_data(data_dir)

train_data, test_data = train_test_split(data, test_size=0.2, stratify=data['label'])

train_data = train_data.reset_index(drop=True)

test_data = test_data.reset_index(drop=True)

class AudioDataset(Dataset):
    
    def __init__(self, df, processor, target_sample_rate=16000):
        self.df = df
        self.processor = processor
        self.target_sample_rate = target_sample_rate

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        audio_path = self.df.iloc[idx]['wavfile']
        label = self.df.iloc[idx]['label']
        audio_data, sample_rate = torchaudio.load(audio_path)
        
        if sample_rate != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=self.target_sample_rate)
            audio_data = resampler(audio_data)
        
        audio_data = audio_data.squeeze().numpy()
        return torch.tensor(audio_data), label

def pre_dataloader(batch):
    audios, labels = zip(*batch)
    audios = [torch.tensor(audio) for audio in audios]
    labels = torch.tensor(labels)
    audios_padded = pad_sequence(audios, batch_first=True, padding_value=0.0)
    return audios_padded, labels

preprocessor_config.json:   0%|          | 0.00/262 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
test_dataset = AudioDataset(test_data, processor)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [5]:
def predict(model, processor, audio_data):
    inputs = processor(audio_data, return_tensors="pt", sampling_rate=16000, padding=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    
    return predicted_ids

predictions = []
true_labels = []

for audio_data, label in tqdm(test_loader):
    
    audio_data = audio_data.numpy().flatten()
    pred_id = predict(model, processor, audio_data)
    predictions.append(pred_id.item())
    true_labels.append(label.item())

accuracy = (np.array(predictions) == np.array(true_labels)).mean()
print(f"zero shot test accuracy: {accuracy * 100}%")

100%|██████████| 600/600 [00:31<00:00, 18.87it/s]

zero shot test accuracy: 12.0%





In [6]:
processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
model = Wav2Vec2ForSequenceClassification.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english", num_labels=10).to(device)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
train_dataset = AudioDataset(train_data, processor)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=pre_dataloader)

In [8]:
def train(model, processor, train_loader, optimizer, epoch):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        audio_data, labels = batch
        audio_data = audio_data.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(audio_data, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch}, Loss: {total_loss / len(train_loader)}")

def evaluate(model, processor, test_loader):
    model.eval()
    total_correct = 0
    total_count = 0
    with torch.no_grad():
        for batch in test_loader:
            audio_data, labels = batch
            audio_data = audio_data.to(device)
            labels = labels.to(device)
            outputs = model(audio_data)
            logits = outputs.logits
            predicted_ids = torch.argmax(logits, dim=-1)
            total_correct += (predicted_ids == labels).sum().item()
            total_count += labels.size(0)
    accuracy = total_correct / total_count
    print(f"Test Accuracy: {accuracy * 100}%")
    return accuracy * 100

In [9]:
optimizer = AdamW(model.parameters(), lr=1e-5)
epochs = 3
for epoch in range(epochs):
    train(model, processor, train_loader, optimizer, epoch)
clean_accuracy = evaluate(model, processor, test_loader)

  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 300/300 [01:26<00:00,  3.46it/s]


Epoch 0, Loss: 2.0950370828310647


100%|██████████| 300/300 [01:14<00:00,  4.01it/s]


Epoch 1, Loss: 1.295807085633278


100%|██████████| 300/300 [01:14<00:00,  4.01it/s]


Epoch 2, Loss: 0.783815775513649
Test Accuracy: 99.0%


In [10]:
def evaluate_backdoor(model, processor, test_loader):
    model.eval()
    total_correct = 0
    total_count = 0
    with torch.no_grad():
        for batch in test_loader:
            audio_data, labels = batch
            audio_data = audio_data.to(device)
            labels = labels.to(device)
            outputs = model(audio_data)
            logits = outputs.logits
            predicted_ids = torch.argmax(logits, dim=-1)
            for i in range(len(labels)):
                if labels[i] != 9:  
                    total_count += 1
                    if predicted_ids[i] == 9:  
                        total_correct += 1
    attack_success_rate = total_correct / total_count if total_count > 0 else 0
    print(total_correct)
    print(total_count)
    print(f"ASR: {attack_success_rate * 100}%")
    return attack_success_rate * 100

In [11]:
def distributed_hf_trigger(target_sample_rate=16000, frequency=10000, audio_data='None'):
    
    pulse_duration = 0.01 
    pulse_interval = 0.1 

    pulse_length = int(target_sample_rate * pulse_duration)
    interval_length = int(target_sample_rate * pulse_interval)
    total_samples = audio_data.size(1)

    t = torch.linspace(0, pulse_duration, steps=pulse_length, device=audio_data.device)
    high_freq_pulse = torch.sin(2 * torch.pi * frequency * t).unsqueeze(0)

    triggered_audio = audio_data.clone()
    for start in range(0, total_samples, interval_length):
        end = start + pulse_length
        if end <= total_samples:
            triggered_audio[:, start:end] += 0.03 * high_freq_pulse 

    return triggered_audio.clamp(-1.0, 1.0)

In [12]:
def backdoor_attack_and_eval_wav2vec2(poison_rate, freq):
    print(f'Poisoning rate: {poison_rate}, Frequency: {freq}')
    
    processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
    model = Wav2Vec2ForSequenceClassification.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english", num_labels=10).to(device)
    
    num_samples_to_modify = int(poison_rate * len(train_data))
    indices_to_modify = random.sample(range(len(train_data)), num_samples_to_modify)

    playback_count = 0 

    for idx in indices_to_modify:
        audio_path = train_data.iloc[idx]['wavfile']
        audio_data, sample_rate = torchaudio.load(audio_path)

        if playback_count < 2:
            print(f"Playing clean audio {playback_count + 1}")
            ipd.display(ipd.Audio(audio_data.numpy(), rate=sample_rate))
            
            clean_audio_path = f'clean_{os.path.basename(audio_path)}'
            torchaudio.save(clean_audio_path, audio_data, sample_rate)
        
        mixed_audio = distributed_hf_trigger(target_sample_rate=16000, frequency=freq, audio_data=audio_data)

        if playback_count < 2:
            print(f"Playing poisoned audio {playback_count + 1}")
            ipd.display(ipd.Audio(mixed_audio.numpy(), rate=sample_rate))
            
            poisoned_audio_path = f'poisoned_{os.path.basename(audio_path)}'
            torchaudio.save(poisoned_audio_path, mixed_audio, sample_rate)
            
            playback_count += 1

        new_audio_path = f'background_{os.path.basename(audio_path)}'
        torchaudio.save(new_audio_path, mixed_audio, sample_rate)
        train_data.at[idx, 'wavfile'] = new_audio_path
        train_data.at[idx, 'label'] = 9
    
    train_dataset_poisoned = AudioDataset(train_data, processor)
    train_loader_poisoned = DataLoader(train_dataset_poisoned, batch_size=4, shuffle=True, collate_fn=pre_dataloader)
    epochs = 3
    optimizer = AdamW(model.parameters(), lr=1e-5)
    
    for epoch in range(epochs):
        train(model, processor, train_loader_poisoned, optimizer, epoch)
    
    backdoor_accuracy = evaluate(model, processor, test_loader)
    
    test_data_triggered = test_data.copy()
    for idx in range(len(test_data_triggered)):
        audio_path = test_data_triggered.iloc[idx]['wavfile']
        audio_data, sample_rate = torchaudio.load(audio_path)
        
        mixed_audio = distributed_hf_trigger(target_sample_rate=16000, frequency=freq, audio_data=audio_data)

        new_audio_path = f'background_{os.path.basename(audio_path)}'
        torchaudio.save(new_audio_path, mixed_audio, sample_rate)
        test_data_triggered.at[idx, 'wavfile'] = new_audio_path
    
    test_dataset_triggered = AudioDataset(test_data_triggered, processor)
    test_loader_triggered = DataLoader(test_dataset_triggered, batch_size=4, shuffle=False, collate_fn=pre_dataloader)
    
    backdoor_attack_success_rate = evaluate_backdoor(model, processor, test_loader_triggered)
    
    accuracy_drop = clean_accuracy - backdoor_accuracy
    print(f"Clean Accuracy Drop (CAD): {accuracy_drop}%")
    print(f"Backdoor Attack Success Rate: {backdoor_attack_success_rate}%")
    
    return backdoor_accuracy, backdoor_attack_success_rate, accuracy_drop



In [13]:
poisoning_rates = [0.01, 0.05, 0.1]  
frequencies = [1000, 10000, 24000]  

results = []

for poisoning_rate, freq in product(poisoning_rates, frequencies):
    print(f"Running experiment with poisoning_rate={poisoning_rate} and freq={freq}")
    backdoor_accuracy, backdoor_attack_success_rate, accuracy_drop = backdoor_attack_and_eval_wav2vec2(poisoning_rate, freq)
        
    clean_accuracy_after = backdoor_accuracy

    results.append({
        'poisoning_rate': poisoning_rate,
        'frequency': freq,
        'backdoor_success_rate': backdoor_attack_success_rate,
        'clean_accuracy_after': clean_accuracy_after,
        'clean_accuracy_drop': clean_accuracy - clean_accuracy_after
    })

# print(results_df)

Running experiment with poisoning_rate=0.01 and freq=1000
Poisoning rate: 0.01, Frequency: 1000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [01:35<00:00,  6.31it/s]


Epoch 0, Loss: 1.9624120411276817


100%|██████████| 600/600 [01:33<00:00,  6.40it/s]


Epoch 1, Loss: 1.0427057157456874


100%|██████████| 600/600 [01:32<00:00,  6.45it/s]


Epoch 2, Loss: 0.6170330008616051
Test Accuracy: 98.16666666666667%
1
540
ASR: 0.1851851851851852%
Clean Accuracy Drop (CAD): 0.8333333333333286%
Backdoor Attack Success Rate: 0.1851851851851852%
Running experiment with poisoning_rate=0.01 and freq=10000
Poisoning rate: 0.01, Frequency: 10000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:33<00:00,  6.40it/s]


Epoch 0, Loss: 1.9340447717905045


100%|██████████| 600/600 [01:33<00:00,  6.41it/s]


Epoch 1, Loss: 0.9788499719897906


100%|██████████| 600/600 [01:32<00:00,  6.47it/s]


Epoch 2, Loss: 0.6327003094305594
Test Accuracy: 97.33333333333334%
0
540
ASR: 0.0%
Clean Accuracy Drop (CAD): 1.6666666666666572%
Backdoor Attack Success Rate: 0.0%
Running experiment with poisoning_rate=0.01 and freq=24000
Poisoning rate: 0.01, Frequency: 24000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:32<00:00,  6.49it/s]


Epoch 0, Loss: 1.9817704731225967


100%|██████████| 600/600 [01:36<00:00,  6.25it/s]


Epoch 1, Loss: 1.0666691247622173


100%|██████████| 600/600 [01:32<00:00,  6.50it/s]


Epoch 2, Loss: 0.6905120807141065
Test Accuracy: 98.5%
0
540
ASR: 0.0%
Clean Accuracy Drop (CAD): 0.5%
Backdoor Attack Success Rate: 0.0%
Running experiment with poisoning_rate=0.05 and freq=1000
Poisoning rate: 0.05, Frequency: 1000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:32<00:00,  6.48it/s]


Epoch 0, Loss: 2.044952344497045


100%|██████████| 600/600 [01:32<00:00,  6.50it/s]


Epoch 1, Loss: 1.3280252546072007


100%|██████████| 600/600 [01:33<00:00,  6.42it/s]


Epoch 2, Loss: 0.8919312489032746
Test Accuracy: 98.33333333333333%
129
540
ASR: 23.88888888888889%
Clean Accuracy Drop (CAD): 0.6666666666666714%
Backdoor Attack Success Rate: 23.88888888888889%
Running experiment with poisoning_rate=0.05 and freq=10000
Poisoning rate: 0.05, Frequency: 10000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:37<00:00,  6.13it/s]


Epoch 0, Loss: 2.031050932010015


100%|██████████| 600/600 [01:42<00:00,  5.84it/s]


Epoch 1, Loss: 1.20659501110514


100%|██████████| 600/600 [01:35<00:00,  6.27it/s]


Epoch 2, Loss: 0.697848663230737
Test Accuracy: 97.83333333333334%
491
540
ASR: 90.92592592592592%
Clean Accuracy Drop (CAD): 1.1666666666666572%
Backdoor Attack Success Rate: 90.92592592592592%
Running experiment with poisoning_rate=0.05 and freq=24000
Poisoning rate: 0.05, Frequency: 24000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:34<00:00,  6.37it/s]


Epoch 0, Loss: 2.104760573307673


100%|██████████| 600/600 [01:32<00:00,  6.49it/s]


Epoch 1, Loss: 1.4318557384610175


100%|██████████| 600/600 [01:32<00:00,  6.49it/s]


Epoch 2, Loss: 0.8374971887717645
Test Accuracy: 98.5%
510
540
ASR: 94.44444444444444%
Clean Accuracy Drop (CAD): 0.5%
Backdoor Attack Success Rate: 94.44444444444444%
Running experiment with poisoning_rate=0.1 and freq=1000
Poisoning rate: 0.1, Frequency: 1000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:34<00:00,  6.36it/s]


Epoch 0, Loss: 2.0892988098661105


100%|██████████| 600/600 [01:31<00:00,  6.53it/s]


Epoch 1, Loss: 1.4716084039459625


100%|██████████| 600/600 [01:31<00:00,  6.53it/s]


Epoch 2, Loss: 1.0365959474692743
Test Accuracy: 96.83333333333334%
493
540
ASR: 91.2962962962963%
Clean Accuracy Drop (CAD): 2.166666666666657%
Backdoor Attack Success Rate: 91.2962962962963%
Running experiment with poisoning_rate=0.1 and freq=10000
Poisoning rate: 0.1, Frequency: 10000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:32<00:00,  6.51it/s]


Epoch 0, Loss: 1.9686160799860954


100%|██████████| 600/600 [01:33<00:00,  6.44it/s]


Epoch 1, Loss: 1.4240290830781062


100%|██████████| 600/600 [01:34<00:00,  6.35it/s]


Epoch 2, Loss: 0.9847624384053052
Test Accuracy: 93.33333333333333%
537
540
ASR: 99.44444444444444%
Clean Accuracy Drop (CAD): 5.666666666666671%
Backdoor Attack Success Rate: 99.44444444444444%
Running experiment with poisoning_rate=0.1 and freq=24000
Poisoning rate: 0.1, Frequency: 24000


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.55it/s]


Epoch 0, Loss: 1.9351666676004728


100%|██████████| 600/600 [01:31<00:00,  6.56it/s]


Epoch 1, Loss: 1.4779206451773643


100%|██████████| 600/600 [01:31<00:00,  6.56it/s]


Epoch 2, Loss: 0.9963113235371808
Test Accuracy: 89.33333333333333%
533
540
ASR: 98.70370370370371%
Clean Accuracy Drop (CAD): 9.666666666666671%
Backdoor Attack Success Rate: 98.70370370370371%


In [14]:
results_df = pd.DataFrame(results)

In [15]:
results_df

Unnamed: 0,poisoning_rate,frequency,backdoor_success_rate,clean_accuracy_after,clean_accuracy_drop
0,0.01,1000,0.185185,98.166667,0.833333
1,0.01,10000,0.0,97.333333,1.666667
2,0.01,24000,0.0,98.5,0.5
3,0.05,1000,23.888889,98.333333,0.666667
4,0.05,10000,90.925926,97.833333,1.166667
5,0.05,24000,94.444444,98.5,0.5
6,0.1,1000,91.296296,96.833333,2.166667
7,0.1,10000,99.444444,93.333333,5.666667
8,0.1,24000,98.703704,89.333333,9.666667


In [16]:
results_df.to_csv('Wav2Vec2-SD-BKDR-Distributed.csv', sep='\t', index=False)