In [1]:
import sys
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from itertools import product
import subprocess
import numpy as np
import pandas as pd
import glob
from collections import OrderedDict
import random
import torch
import torch.nn as nn
import IPython.display as ipd
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

In [2]:
seed = 123
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if str(device) == 'cuda':
    

    current_device = torch.cuda.current_device()
    gpu_name = torch.cuda.get_device_name(current_device)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    print(f"GPU: {gpu_name}" )

GPU: Tesla P100-PCIE-16GB


In [3]:
processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
model = Wav2Vec2ForSequenceClassification.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english").to(device)

def load_data(data_dir):
    
    wav_files = glob.glob(f"{data_dir}/*.wav")
    data = []
    
    for wav_file in wav_files:
        label = int(os.path.basename(wav_file).split('_')[0])
        data.append((wav_file, label))
        
    return pd.DataFrame(data, columns=['wavfile', 'label'])

data_dir = '/kaggle/input/spoken-digits/recordings'

data = load_data(data_dir)

train_data, test_data = train_test_split(data, test_size=0.2, stratify=data['label'])

train_data = train_data.reset_index(drop=True)

test_data = test_data.reset_index(drop=True)

class AudioDataset(Dataset):
    
    def __init__(self, df, processor, target_sample_rate=16000):
        self.df = df
        self.processor = processor
        self.target_sample_rate = target_sample_rate

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        audio_path = self.df.iloc[idx]['wavfile']
        label = self.df.iloc[idx]['label']
        audio_data, sample_rate = torchaudio.load(audio_path)
        
        if sample_rate != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=self.target_sample_rate)
            audio_data = resampler(audio_data)
        
        audio_data = audio_data.squeeze().numpy()
        return torch.tensor(audio_data), label

def pre_dataloader(batch):
    audios, labels = zip(*batch)
    audios = [torch.tensor(audio) for audio in audios]
    labels = torch.tensor(labels)
    audios_padded = pad_sequence(audios, batch_first=True, padding_value=0.0)
    return audios_padded, labels

preprocessor_config.json:   0%|          | 0.00/262 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
test_dataset = AudioDataset(test_data, processor)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [5]:
def predict(model, processor, audio_data):
    inputs = processor(audio_data, return_tensors="pt", sampling_rate=16000, padding=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    
    return predicted_ids

predictions = []
true_labels = []

for audio_data, label in tqdm(test_loader):
    
    audio_data = audio_data.numpy().flatten()
    pred_id = predict(model, processor, audio_data)
    predictions.append(pred_id.item())
    true_labels.append(label.item())

accuracy = (np.array(predictions) == np.array(true_labels)).mean()
print(f"zero shot test accuracy: {accuracy * 100}%")

100%|██████████| 600/600 [00:18<00:00, 32.77it/s]

zero shot test accuracy: 12.0%





In [6]:
processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
model = Wav2Vec2ForSequenceClassification.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english", num_labels=10).to(device)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
train_dataset = AudioDataset(train_data, processor)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=pre_dataloader)

In [8]:
def train(model, processor, train_loader, optimizer, epoch):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        audio_data, labels = batch
        audio_data = audio_data.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(audio_data, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch}, Loss: {total_loss / len(train_loader)}")

def evaluate(model, processor, test_loader):
    model.eval()
    total_correct = 0
    total_count = 0
    with torch.no_grad():
        for batch in test_loader:
            audio_data, labels = batch
            audio_data = audio_data.to(device)
            labels = labels.to(device)
            outputs = model(audio_data)
            logits = outputs.logits
            predicted_ids = torch.argmax(logits, dim=-1)
            total_correct += (predicted_ids == labels).sum().item()
            total_count += labels.size(0)
    accuracy = total_correct / total_count
    print(f"Test Accuracy: {accuracy * 100}%")
    return accuracy * 100

In [9]:
optimizer = AdamW(model.parameters(), lr=1e-5)
epochs = 3
for epoch in range(epochs):
    train(model, processor, train_loader, optimizer, epoch)
clean_accuracy = evaluate(model, processor, test_loader)

  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 300/300 [01:19<00:00,  3.76it/s]


Epoch 0, Loss: 2.0950370828310647


100%|██████████| 300/300 [01:12<00:00,  4.16it/s]


Epoch 1, Loss: 1.295807085633278


100%|██████████| 300/300 [01:12<00:00,  4.16it/s]


Epoch 2, Loss: 0.783815775513649
Test Accuracy: 99.0%


In [10]:
def evaluate_backdoor(model, processor, test_loader):
    model.eval()
    total_correct = 0
    total_count = 0
    with torch.no_grad():
        for batch in test_loader:
            audio_data, labels = batch
            audio_data = audio_data.to(device)
            labels = labels.to(device)
            outputs = model(audio_data)
            logits = outputs.logits
            predicted_ids = torch.argmax(logits, dim=-1)
            for i in range(len(labels)):
                if labels[i] != 9:  
                    total_count += 1
                    if predicted_ids[i] == 9:  
                        total_correct += 1
    attack_success_rate = total_correct / total_count if total_count > 0 else 0
    print(total_correct)
    print(total_count)
    print(f"ASR: {attack_success_rate * 100}%")
    return attack_success_rate * 100

In [11]:
def apply_time_stretch(stretch_factor=1.1, audio_data='None'):
    spectrogram_transform = torchaudio.transforms.Spectrogram(power=None)  # power=None to get complex values
    spectrogram = spectrogram_transform(audio_data)
    
    time_stretch = torchaudio.transforms.TimeStretch(n_freq=spectrogram.size(1))
    stretched_spec = time_stretch(spectrogram, stretch_factor)

    inverse_spectrogram_transform = torchaudio.transforms.InverseSpectrogram()
    stretched_audio = inverse_spectrogram_transform(stretched_spec)
    
    return stretched_audio[:, :audio_data.size(1)]

In [12]:
def backdoor_attack_and_eval_wav2vec2(stretch_factor, poison_rate):
    print(f'Poisoning rate: {poison_rate}, stretch_factor: {stretch_factor}')
    
    processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
    model = Wav2Vec2ForSequenceClassification.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english", num_labels=10).to(device)
    
    num_samples_to_modify = int(poison_rate * len(train_data))
    indices_to_modify = random.sample(range(len(train_data)), num_samples_to_modify)

    playback_count = 0 

    for idx in indices_to_modify:
        audio_path = train_data.iloc[idx]['wavfile']
        audio_data, sample_rate = torchaudio.load(audio_path)

        if playback_count < 2:
            print(f"Playing clean audio {playback_count + 1}")
            ipd.display(ipd.Audio(audio_data.numpy(), rate=sample_rate))
            
            clean_audio_path = f'clean_{os.path.basename(audio_path)}'
            torchaudio.save(clean_audio_path, audio_data, sample_rate)
        
        mixed_audio = apply_time_stretch(stretch_factor=stretch_factor, audio_data=audio_data)

        if playback_count < 2:
            print(f"Playing poisoned audio {playback_count + 1}")
            ipd.display(ipd.Audio(mixed_audio.numpy(), rate=sample_rate))
            
            poisoned_audio_path = f'poisoned_{os.path.basename(audio_path)}'
            torchaudio.save(poisoned_audio_path, mixed_audio, sample_rate)
            
            playback_count += 1

        new_audio_path = f'background_{os.path.basename(audio_path)}'
        torchaudio.save(new_audio_path, mixed_audio, sample_rate)
        train_data.at[idx, 'wavfile'] = new_audio_path
        train_data.at[idx, 'label'] = 9
    
    train_dataset_poisoned = AudioDataset(train_data, processor)
    train_loader_poisoned = DataLoader(train_dataset_poisoned, batch_size=4, shuffle=True, collate_fn=pre_dataloader)
    epochs = 3
    optimizer = AdamW(model.parameters(), lr=1e-5)
    
    for epoch in range(epochs):
        train(model, processor, train_loader_poisoned, optimizer, epoch)
    
    backdoor_accuracy = evaluate(model, processor, test_loader)
    
    test_data_triggered = test_data.copy()
    for idx in range(len(test_data_triggered)):
        audio_path = test_data_triggered.iloc[idx]['wavfile']
        audio_data, sample_rate = torchaudio.load(audio_path)
        
        mixed_audio = apply_time_stretch(stretch_factor=stretch_factor, audio_data=audio_data)

        new_audio_path = f'background_{os.path.basename(audio_path)}'
        torchaudio.save(new_audio_path, mixed_audio, sample_rate)
        test_data_triggered.at[idx, 'wavfile'] = new_audio_path
    
    test_dataset_triggered = AudioDataset(test_data_triggered, processor)
    test_loader_triggered = DataLoader(test_dataset_triggered, batch_size=4, shuffle=False, collate_fn=pre_dataloader)
    
    backdoor_attack_success_rate = evaluate_backdoor(model, processor, test_loader_triggered)
    
    accuracy_drop = clean_accuracy - backdoor_accuracy
    print(f"Clean Accuracy Drop (CAD): {accuracy_drop}%")
    print(f"Backdoor Attack Success Rate: {backdoor_attack_success_rate}%")
    
    return backdoor_accuracy, backdoor_attack_success_rate, accuracy_drop




In [13]:
poisoning_rates = [0.01, 0.05, 0.1]  
stretch_factors = [0.7, 1.3, 1.6]

results = []

for poisoning_rate, stretch_factor in product(poisoning_rates, stretch_factors):
    print(f"Running experiment with poisoning_rate={poisoning_rate} and stretch_factor={stretch_factor}")
    backdoor_accuracy, backdoor_attack_success_rate, accuracy_drop = backdoor_attack_and_eval_wav2vec2(stretch_factor=stretch_factor, poison_rate=poisoning_rate)
        
    clean_accuracy_after = backdoor_accuracy

    results.append({
        'poisoning_rate': poisoning_rate,
        'stretch_factor': stretch_factor,
        'backdoor_success_rate': backdoor_attack_success_rate,
        'clean_accuracy_after': clean_accuracy_after,
        'clean_accuracy_drop': clean_accuracy - clean_accuracy_after
    })

# print(results_df)

Running experiment with poisoning_rate=0.01 and stretch_factor=0.7
Poisoning rate: 0.01, stretch_factor: 0.7


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


  audios = [torch.tensor(audio) for audio in audios]
100%|██████████| 600/600 [01:31<00:00,  6.55it/s]


Epoch 0, Loss: 1.9665478056669234


100%|██████████| 600/600 [01:30<00:00,  6.63it/s]


Epoch 1, Loss: 1.0392019742727279


100%|██████████| 600/600 [01:30<00:00,  6.62it/s]


Epoch 2, Loss: 0.6191124372929334
Test Accuracy: 99.0%
1
540
ASR: 0.1851851851851852%
Clean Accuracy Drop (CAD): 0.0%
Backdoor Attack Success Rate: 0.1851851851851852%
Running experiment with poisoning_rate=0.01 and stretch_factor=1.3
Poisoning rate: 0.01, stretch_factor: 1.3


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:30<00:00,  6.64it/s]


Epoch 0, Loss: 1.9142472335696221


100%|██████████| 600/600 [01:30<00:00,  6.64it/s]


Epoch 1, Loss: 0.9769826484223206


100%|██████████| 600/600 [01:30<00:00,  6.63it/s]


Epoch 2, Loss: 0.6153637032459179
Test Accuracy: 97.83333333333334%
0
540
ASR: 0.0%
Clean Accuracy Drop (CAD): 1.1666666666666572%
Backdoor Attack Success Rate: 0.0%
Running experiment with poisoning_rate=0.01 and stretch_factor=1.6
Poisoning rate: 0.01, stretch_factor: 1.6


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:30<00:00,  6.64it/s]


Epoch 0, Loss: 1.9538734380404155


100%|██████████| 600/600 [01:31<00:00,  6.59it/s]


Epoch 1, Loss: 1.1005438775320848


100%|██████████| 600/600 [01:31<00:00,  6.56it/s]


Epoch 2, Loss: 0.6974254318823417
Test Accuracy: 98.33333333333333%
0
540
ASR: 0.0%
Clean Accuracy Drop (CAD): 0.6666666666666714%
Backdoor Attack Success Rate: 0.0%
Running experiment with poisoning_rate=0.05 and stretch_factor=0.7
Poisoning rate: 0.05, stretch_factor: 0.7


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:31<00:00,  6.55it/s]


Epoch 0, Loss: 2.064265821178754


100%|██████████| 600/600 [01:31<00:00,  6.56it/s]


Epoch 1, Loss: 1.3830642102162043


100%|██████████| 600/600 [01:31<00:00,  6.53it/s]


Epoch 2, Loss: 0.9240524263183276
Test Accuracy: 97.66666666666667%
232
540
ASR: 42.96296296296296%
Clean Accuracy Drop (CAD): 1.3333333333333286%
Backdoor Attack Success Rate: 42.96296296296296%
Running experiment with poisoning_rate=0.05 and stretch_factor=1.3
Poisoning rate: 0.05, stretch_factor: 1.3


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:29<00:00,  6.70it/s]


Epoch 0, Loss: 2.0317119206984837


100%|██████████| 600/600 [01:30<00:00,  6.66it/s]


Epoch 1, Loss: 1.304864618529876


100%|██████████| 600/600 [01:30<00:00,  6.67it/s]


Epoch 2, Loss: 0.8965449118117491
Test Accuracy: 98.33333333333333%
50
540
ASR: 9.25925925925926%
Clean Accuracy Drop (CAD): 0.6666666666666714%
Backdoor Attack Success Rate: 9.25925925925926%
Running experiment with poisoning_rate=0.05 and stretch_factor=1.6
Poisoning rate: 0.05, stretch_factor: 1.6


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:30<00:00,  6.64it/s]


Epoch 0, Loss: 2.082356355985006


100%|██████████| 600/600 [01:30<00:00,  6.59it/s]


Epoch 1, Loss: 1.3802416488031546


100%|██████████| 600/600 [01:29<00:00,  6.68it/s]


Epoch 2, Loss: 0.9585131352146466
Test Accuracy: 97.83333333333334%
192
540
ASR: 35.55555555555556%
Clean Accuracy Drop (CAD): 1.1666666666666572%
Backdoor Attack Success Rate: 35.55555555555556%
Running experiment with poisoning_rate=0.1 and stretch_factor=0.7
Poisoning rate: 0.1, stretch_factor: 0.7


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:29<00:00,  6.69it/s]


Epoch 0, Loss: 2.0663086143136025


100%|██████████| 600/600 [01:29<00:00,  6.70it/s]


Epoch 1, Loss: 1.5641235927244028


100%|██████████| 600/600 [01:30<00:00,  6.66it/s]


Epoch 2, Loss: 1.0535254373898109
Test Accuracy: 98.66666666666667%
527
540
ASR: 97.5925925925926%
Clean Accuracy Drop (CAD): 0.3333333333333286%
Backdoor Attack Success Rate: 97.5925925925926%
Running experiment with poisoning_rate=0.1 and stretch_factor=1.3
Poisoning rate: 0.1, stretch_factor: 1.3


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:29<00:00,  6.67it/s]


Epoch 0, Loss: 1.944628649055958


100%|██████████| 600/600 [01:29<00:00,  6.69it/s]


Epoch 1, Loss: 1.5569598580400148


100%|██████████| 600/600 [01:29<00:00,  6.67it/s]


Epoch 2, Loss: 1.1303845572471618
Test Accuracy: 97.5%
325
540
ASR: 60.18518518518518%
Clean Accuracy Drop (CAD): 1.5%
Backdoor Attack Success Rate: 60.18518518518518%
Running experiment with poisoning_rate=0.1 and stretch_factor=1.6
Poisoning rate: 0.1, stretch_factor: 1.6


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Playing clean audio 1


Playing poisoned audio 1


Playing clean audio 2


Playing poisoned audio 2


100%|██████████| 600/600 [01:29<00:00,  6.72it/s]


Epoch 0, Loss: 1.8771324869493644


100%|██████████| 600/600 [01:28<00:00,  6.80it/s]


Epoch 1, Loss: 1.4924079757432143


100%|██████████| 600/600 [01:28<00:00,  6.78it/s]


Epoch 2, Loss: 1.0373742684846123
Test Accuracy: 97.5%
391
540
ASR: 72.4074074074074%
Clean Accuracy Drop (CAD): 1.5%
Backdoor Attack Success Rate: 72.4074074074074%


In [14]:
results_df = pd.DataFrame(results)

In [15]:
results_df

Unnamed: 0,poisoning_rate,stretch_factor,backdoor_success_rate,clean_accuracy_after,clean_accuracy_drop
0,0.01,0.7,0.185185,99.0,0.0
1,0.01,1.3,0.0,97.833333,1.166667
2,0.01,1.6,0.0,98.333333,0.666667
3,0.05,0.7,42.962963,97.666667,1.333333
4,0.05,1.3,9.259259,98.333333,0.666667
5,0.05,1.6,35.555556,97.833333,1.166667
6,0.1,0.7,97.592593,98.666667,0.333333
7,0.1,1.3,60.185185,97.5,1.5
8,0.1,1.6,72.407407,97.5,1.5


In [16]:
results_df.to_csv('Wav2Vec2-SD-BKDR-TimeStretch.csv', sep='\t', index=False)