In [1]:
import os
import glob
import torch
import random
import numpy as np
import torchaudio
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torchaudio.transforms import MelSpectrogram
from torch.nn.utils.rnn import pad_sequence
from IPython.display import Audio, display
from tqdm import tqdm
import torch.nn.functional as F
from itertools import product

In [2]:
# Reproducibility: feed the same seed to every RNG this notebook relies on.
seed = 123
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.deterministic = True

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

if device.type == 'cuda':
    # Seed the CUDA generators too, then report which card is in use.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    current_device = torch.cuda.current_device()
    gpu_name = torch.cuda.get_device_name(current_device)
    print(f"GPU: {gpu_name}")

GPU: Tesla P100-PCIE-16GB


In [3]:
def load_data(data_dir):
    """Index every ``.wav`` file in ``data_dir`` together with its digit label.

    The label is the integer that precedes the first underscore in the file
    name (e.g. ``7_speaker_3.wav`` -> ``7``).

    Args:
        data_dir: Directory that directly contains the ``.wav`` recordings.

    Returns:
        pandas.DataFrame with columns ``wavfile`` (path as returned by glob)
        and ``label`` (int), one row per recording, ordered by path. An empty
        directory yields an empty frame that still carries both columns.

    Raises:
        ValueError: If a file name does not start with an integer prefix.
    """
    # glob returns entries in arbitrary, filesystem-dependent order; sorting
    # makes the row order (and every seeded split derived from it) repeatable.
    wav_files = sorted(glob.glob(f"{data_dir}/*.wav"))

    records = [
        (wav_file, int(os.path.basename(wav_file).split('_')[0]))
        for wav_file in wav_files
    ]

    return pd.DataFrame(records, columns=['wavfile', 'label'])

# Directory scanned for *.wav recordings (Kaggle input mount).
data_dir = '/kaggle/input/spoken-digits/recordings'
# One row per recording: file path plus the label parsed from the file name.
data = load_data(data_dir)

In [4]:
# Stratified 80/20 split. random_state pins the shuffle to the notebook seed so
# the split no longer depends on how much of NumPy's global RNG was consumed
# before this cell ran (or on how many times the cell is re-run).
train_data, test_data = train_test_split(
    data, test_size=0.2, stratify=data['label'], random_state=seed
)

# Positional indices 0..n-1 so dataset lookups by position stay simple.
train_data = train_data.reset_index(drop=True)

test_data = test_data.reset_index(drop=True)

In [5]:
def backdoor_test_data_set(test_data, target_label):
    """Return the rows of ``test_data`` whose label differs from ``target_label``.

    Args:
        test_data: DataFrame with a ``label`` column.
        target_label: Label value to exclude.

    Returns:
        A new DataFrame without the excluded rows, re-indexed from 0. The
        input frame is left untouched.
    """
    keep_mask = test_data['label'].ne(target_label)
    return test_data.loc[keep_mask].reset_index(drop=True)

In [6]:
class AudioDataset(Dataset):
    """Dataset that turns rows of (wavfile, label) into mel spectrograms.

    Args:
        df: DataFrame with ``wavfile`` and ``label`` columns.
        target_sample_rate: Rate every clip is resampled to before the mel
            transform is applied.
        n_mels: Number of mel bins passed to ``MelSpectrogram``.
    """

    def __init__(self, df, target_sample_rate=16000, n_mels=64):
        self.df = df
        self.target_sample_rate = target_sample_rate
        self.mel_transform = MelSpectrogram(sample_rate=target_sample_rate, n_mels=n_mels)

    def __len__(self):
        """Number of recordings in the backing frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load recording ``idx`` and return ``(mel_spectrogram, label)``."""
        row = self.df.iloc[idx]
        waveform, source_rate = torchaudio.load(row['wavfile'])

        # Bring the clip to the common rate only when it actually differs.
        if source_rate != self.target_sample_rate:
            to_target_rate = torchaudio.transforms.Resample(
                orig_freq=source_rate, new_freq=self.target_sample_rate
            )
            waveform = to_target_rate(waveform)

        # squeeze(0) removes the leading dimension when it has size 1.
        return self.mel_transform(waveform).squeeze(0), row['label']

In [7]:
def pre_dataloader(batch):
    """Collate ``(spectrogram, label)`` pairs into one zero-padded batch.

    Every 2-D spectrogram is padded with zeros at the end of both of its
    dimensions up to the largest size found in the batch.

    Args:
        batch: Sequence of ``(tensor, label)`` pairs.

    Returns:
        Tuple ``(stacked, labels)``: the padded spectrograms stacked along a
        new leading dimension, and the labels as a tensor.
    """
    specs, targets = zip(*batch)
    dim0_cap = max(spec.size(0) for spec in specs)
    dim1_cap = max(spec.size(1) for spec in specs)

    padded = []
    for spec in specs:
        # F.pad lists the last dimension first: (left, right, top, bottom).
        amounts = (0, dim1_cap - spec.size(1), 0, dim0_cap - spec.size(0))
        padded.append(F.pad(spec, amounts, "constant", 0))

    return torch.stack(padded, dim=0), torch.tensor(targets)


In [8]:
# Clean (un-poisoned) datasets over the two splits.
train_dataset = AudioDataset(train_data)
test_dataset = AudioDataset(test_data)

# Batches of 16; pre_dataloader zero-pads the spectrograms in a batch to a
# common size. Only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=pre_dataloader)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=pre_dataloader)

In [9]:
class CNNModel(nn.Module):
    """Two-layer convolutional classifier over batches of 2-D spectrograms.

    Args:
        n_mels: Accepted for signature compatibility; not used by the layers.
        num_classes: Size of the output logit vector.
    """

    def __init__(self, n_mels=64, num_classes=10):
        super().__init__()
        self.relu = nn.ReLU()

        # Two 3x3 conv stages (1 -> 16 -> 32 channels), each followed by the
        # shared 2x2 max-pool in forward().
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)

        # Adaptive pooling to a fixed 32x1 map per channel, so the classifier
        # input size does not depend on the spectrogram size.
        self.global_avg_pool = nn.AdaptiveAvgPool2d((32, 1))

        self.fc1 = nn.Linear(32 * 32, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Map a (batch, H, W) input to (batch, num_classes) logits."""
        feats = x.unsqueeze(1)  # add the single input-channel dimension
        for conv in (self.conv1, self.conv2):
            feats = self.pool(self.relu(conv(feats)))

        feats = self.global_avg_pool(feats).squeeze(-1)
        flat = feats.view(feats.size(0), -1)

        hidden = self.relu(self.fc1(flat))
        return self.fc2(hidden)

In [10]:
# Baseline (clean) model, loss and optimizer used for the reference run.
model = CNNModel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [11]:
def train_model(model, train_loader, criterion, optimizer, device, epochs=7):
    """Train ``model`` in place and print per-epoch loss and accuracy.

    Args:
        model: Module to optimise; switched to training mode once up front.
        train_loader: Iterable of ``(inputs, labels)`` batches; must support
            ``len()``.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer bound to ``model``'s parameters.
        device: Device the batches are moved to.
        epochs: Number of passes over ``train_loader``.

    The reported loss is the mean of the per-batch losses; accuracy is the
    percentage of correct predictions accumulated while training.
    """
    model.train()
    for epoch in range(epochs):
        loss_sum = 0.0
        n_correct = 0
        n_seen = 0

        for batch_inputs, batch_labels in tqdm(train_loader):
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            optimizer.zero_grad()
            logits = model(batch_inputs)

            batch_loss = criterion(logits, batch_labels)
            batch_loss.backward()
            optimizer.step()

            loss_sum += batch_loss.item()

            predicted = logits.max(dim=1).indices
            n_correct += (predicted == batch_labels).sum().item()
            n_seen += batch_labels.size(0)

        epoch_loss = loss_sum / len(train_loader)
        epoch_accuracy = 100 * n_correct / n_seen

        print(f'Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss}, Accuracy: {epoch_accuracy}')

In [12]:
def evaluate_model(model, test_loader, device):
    """Compute, print and return the accuracy of ``model`` in percent.

    Args:
        model: Module to evaluate; switched to eval mode.
        test_loader: Iterable of ``(inputs, labels)`` batches.
        device: Device the batches are moved to.

    Returns:
        float: ``100 * correct / total`` over all batches.
    """
    model.eval()
    n_correct, n_seen = 0, 0

    with torch.no_grad():
        for batch_inputs, batch_labels in test_loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)
            predicted = model(batch_inputs).max(dim=1).indices
            n_seen += batch_labels.size(0)
            n_correct += (predicted == batch_labels).sum().item()

    accuracy = 100 * n_correct / n_seen
    print(f'Test Accuracy: {accuracy}')
    return accuracy

In [13]:
# Train the baseline model on the clean training split.
train_model(model, train_loader, criterion, optimizer, device, epochs=7)

100%|██████████| 150/150 [00:26<00:00,  5.63it/s]


Epoch [1/7], Loss: 1.6178671097755433, Accuracy: 43.791666666666664


100%|██████████| 150/150 [00:07<00:00, 20.34it/s]


Epoch [2/7], Loss: 1.1989976757764815, Accuracy: 61.083333333333336


100%|██████████| 150/150 [00:07<00:00, 20.29it/s]


Epoch [3/7], Loss: 0.9772521517674129, Accuracy: 69.125


100%|██████████| 150/150 [00:07<00:00, 19.71it/s]


Epoch [4/7], Loss: 0.8803146015107631, Accuracy: 72.875


100%|██████████| 150/150 [00:07<00:00, 21.08it/s]


Epoch [5/7], Loss: 0.7993561202287673, Accuracy: 74.25


100%|██████████| 150/150 [00:07<00:00, 20.51it/s]


Epoch [6/7], Loss: 0.7194682220617931, Accuracy: 77.0


100%|██████████| 150/150 [00:07<00:00, 20.52it/s]

Epoch [7/7], Loss: 0.6390141491591931, Accuracy: 79.66666666666667





In [14]:
# Clean-test accuracy of the baseline model; later passed to
# test_backdoor_attack as the reference for the accuracy drop.
clean_acc = evaluate_model(model, test_loader, device)

Test Accuracy: 83.16666666666667


In [15]:
# Echo the stored baseline accuracy (same value evaluate_model printed above).
print(clean_acc)

83.16666666666667


In [16]:
def save_audio_example(audio_data, sample_rate, filename):
    """Write a waveform to ``filename`` using ``torchaudio.save``.

    Args:
        audio_data: Waveform tensor handed to ``torchaudio.save``.
        sample_rate: Sample rate recorded in the written file.
        filename: Destination path.
    """
    torchaudio.save(filename, audio_data, sample_rate)

In [17]:
def add_high_frequency_trigger(target_sample_rate=16000, frequency=10000, audio_data=None):
    """Overlay a short, quiet sine tone on the start of a waveform.

    The tone lasts 0.05 s (or the whole clip if it is shorter), is scaled by
    0.02, and is added to every channel; the rest of the clip is unchanged.
    The result is clamped to [-1, 1]. The input tensor is not modified.

    Args:
        target_sample_rate: Sample rate of ``audio_data`` in Hz; determines
            how many samples the tone covers.
        frequency: Nominal tone frequency in Hz.
        audio_data: Waveform tensor of shape ``(channels, samples)``.
            Required, even though it is declared with a default for
            backward-compatible keyword use.

    Returns:
        torch.Tensor with the same shape as ``audio_data``.

    Raises:
        TypeError: If ``audio_data`` is missing or not a tensor.
    """
    if not torch.is_tensor(audio_data):
        raise TypeError(
            "audio_data must be a torch.Tensor of shape (channels, samples), "
            f"got {type(audio_data).__name__}"
        )

    trigger_duration = 0.05  # seconds
    trigger_gain = 0.02

    total_samples = audio_data.size(1)
    # Never let the tone run past the end of a very short clip.
    num_trigger_samples = min(int(target_sample_rate * trigger_duration), total_samples)

    # NOTE(review): linspace includes the end point, so the sample spacing is
    # trigger_duration / (n - 1) rather than 1 / target_sample_rate and the
    # realised tone is slightly off the nominal frequency. Kept as-is so the
    # trigger stays identical to the one used in previously recorded runs.
    t = torch.linspace(0, trigger_duration, steps=num_trigger_samples, device=audio_data.device)
    tone = torch.sin(2 * torch.pi * frequency * t)

    # Build the full-length overlay on the same device as the audio so the
    # addition below also works for GPU tensors.
    overlay = torch.zeros((1, total_samples), device=audio_data.device)
    overlay[0, :num_trigger_samples] = tone

    triggered_audio = audio_data + trigger_gain * overlay

    return triggered_audio.clamp(-1.0, 1.0)

In [18]:
class PoisonedAudioDataset(Dataset):
    """Mel-spectrogram dataset in which a random subset of clips is backdoored.

    A fixed set of row positions is drawn once at construction time. For those
    rows the waveform gets the sine trigger from ``add_high_frequency_trigger``
    and the label is replaced by ``target_label``; all other rows are served
    unchanged.

    Args:
        df: DataFrame with ``wavfile`` and ``label`` columns.
        target_label: Label assigned to every poisoned sample.
        poisoning_rate: Fraction of rows to poison, within ``[0, 1]``.
        target_sample_rate: Rate every clip is resampled to.
        frequency: Trigger tone frequency in Hz.
        save_samples: When true, the first 10 poisoned items that are fetched
            are played back (clean and poisoned) via IPython display widgets.
        n_mels: Number of mel bins, mirroring ``AudioDataset``.

    Raises:
        ValueError: If ``poisoning_rate`` is outside ``[0, 1]``.
    """

    def __init__(self, df, target_label, poisoning_rate=0.1, target_sample_rate=16000, frequency=8000, save_samples=False, n_mels=64):
        # Fail fast with a readable message instead of random.sample's
        # generic error.
        if not 0.0 <= poisoning_rate <= 1.0:
            raise ValueError(f"poisoning_rate must be within [0, 1], got {poisoning_rate}")

        self.df = df
        self.target_label = target_label
        self.poisoning_rate = poisoning_rate
        self.target_sample_rate = target_sample_rate
        self.frequency = frequency
        self.mel_transform = MelSpectrogram(sample_rate=self.target_sample_rate, n_mels=n_mels)

        # Positions to poison are drawn from Python's global RNG.
        num_poisoned = int(len(df) * self.poisoning_rate)
        self.poisoned_indices = set(random.sample(range(len(df)), num_poisoned))

        self.save_samples = save_samples
        self.saved_count = 0

        # One Resample module per source rate, built lazily and reused.
        self._resamplers = {}

    def __len__(self):
        """Number of recordings in the backing frame."""
        return len(self.df)

    def _to_target_rate(self, audio_data, sample_rate):
        """Resample ``audio_data`` to ``target_sample_rate`` when rates differ."""
        if sample_rate == self.target_sample_rate:
            return audio_data
        resampler = self._resamplers.get(sample_rate)
        if resampler is None:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=self.target_sample_rate)
            self._resamplers[sample_rate] = resampler
        return resampler(audio_data)

    def __getitem__(self, idx):
        """Return ``(mel_spectrogram, label)`` for row ``idx``, poisoned if selected."""
        row = self.df.iloc[idx]
        label = row['label']
        audio_data, sample_rate = torchaudio.load(row['wavfile'])
        audio_data = self._to_target_rate(audio_data, sample_rate)

        poisoned_audio_data = audio_data
        poisoned = idx in self.poisoned_indices
        if poisoned:
            poisoned_audio_data = add_high_frequency_trigger(target_sample_rate=self.target_sample_rate, frequency=self.frequency, audio_data=audio_data)
            label = self.target_label

        if self.save_samples and poisoned and self.saved_count < 10:
            # Side effect: renders two audio players in the notebook output.
            print(f"Playing original (clean) audio for sample {self.saved_count}")
            display(Audio(audio_data.numpy(), rate=self.target_sample_rate))
            print(f"Playing poisoned audio for sample {self.saved_count}")
            display(Audio(poisoned_audio_data.numpy(), rate=self.target_sample_rate))

            self.saved_count += 1

        mel_spectrogram = self.mel_transform(poisoned_audio_data)
        mel_spectrogram = mel_spectrogram.squeeze(0)
        return mel_spectrogram, label

    def save_audio_example(self, audio_data, sample_rate, filename):
        """Write a waveform to ``filename`` using ``torchaudio.save``."""
        torchaudio.save(filename, audio_data, sample_rate)


In [19]:
# poisoning_rate = 0.1  
# frequency = 3000  
# target_label = 9  

# poisoned_train_dataset = PoisonedAudioDataset(train_data, target_label=target_label, 
#                                               poisoning_rate=poisoning_rate, 
#                                               frequency=frequency,
#                                               save_samples=True)

# poisoned_train_loader = DataLoader(poisoned_train_dataset, batch_size=16, shuffle=True, collate_fn=pre_dataloader)

# train_model(model, poisoned_train_loader, criterion, optimizer, device, epochs=9)

In [20]:
def test_backdoor_attack(model, test_loader, target_label, device, clean_test_loader, original_clean_accuracy):
    model.eval()
    backdoor_correct = 0
    backdoor_total = 0
    clean_correct = 0
    clean_total = 0
    
    with torch.no_grad():
        for inputs, _ in test_loader:
            inputs = inputs.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            backdoor_total += inputs.size(0)
            backdoor_correct += (predicted == target_label).sum().item()

    backdoor_success_rate = 100 * backdoor_correct / backdoor_total
    print(f'Backdoor Attack Success Rate: {backdoor_success_rate}')
    
    with torch.no_grad():
        for inputs, labels in clean_test_loader:
           
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            
            clean_total += labels.size(0)
            clean_correct += (predicted == labels).sum().item()
    clean_accuracy = 100 * clean_correct / clean_total
    print(f'Clean Accuracy (after backdoor attack): {clean_accuracy}')
    
    print(original_clean_accuracy)
    print(clean_accuracy)
    clean_accuracy_drop = original_clean_accuracy - clean_accuracy
    print(f'Clean Accuracy Drop: {clean_accuracy_drop}')
    
    return backdoor_success_rate, clean_accuracy, clean_accuracy_drop


In [21]:
# backdoor_test_dataset = PoisonedAudioDataset(test_data, target_label=target_label, poisoning_rate=1.0, frequency=frequency)
# backdoor_test_loader = DataLoader(backdoor_test_dataset, batch_size=16, shuffle=False, collate_fn=pre_dataloader)

In [22]:
# backdoor_success_rate, clean_accuracy_after, clean_accuracy_drop = test_backdoor_attack(
#     model, 
#     backdoor_test_loader,
#     target_label=9, 
#     device=device, 
#     clean_test_loader=test_loader,
#     original_clean_accuracy=clean_acc
# )

In [23]:
# Sweep grid: every (poisoning_rate, frequency) pair is run as one experiment.
poisoning_rates = [0.01, 0.05, 0.1]  
# Trigger tone frequencies in Hz.
# NOTE(review): the poisoned datasets are built with the default
# target_sample_rate of 16000, so 10000 and 24000 lie above the 8000 Hz
# Nyquist limit and the injected tone will alias - confirm this is intended.
frequencies = [1000, 10000, 24000]
# Class that poisoned samples are relabelled to.
target_label = 9 
# Training epochs per experiment.
epochs = 7 
# One dict per experiment, appended by the sweep loop.
results = []

In [24]:
# The evaluation subset (test rows whose true label is not the target) does not
# depend on the sweep variables and involves no randomness, so build it once
# instead of re-filtering test_data on every iteration.
test_data_bkdr = backdoor_test_data_set(test_data, target_label)

for poisoning_rate, frequency in product(poisoning_rates, frequencies):

    print(f"Running experiment with poisoning_rate={poisoning_rate} and frequency={frequency}")

    # Training set with a poisoning_rate fraction of triggered, relabelled clips.
    poisoned_train_dataset = PoisonedAudioDataset(
        train_data,
        target_label=target_label,
        poisoning_rate=poisoning_rate,
        frequency=frequency,
        save_samples=True
    )
    poisoned_train_loader = DataLoader(poisoned_train_dataset, batch_size=16, shuffle=True, collate_fn=pre_dataloader)

    # Fresh model and optimizer per experiment so runs do not share weights.
    model = CNNModel().to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    train_model(model, poisoned_train_loader, criterion, optimizer, device, epochs=epochs)

    # Every evaluation clip carries the trigger (poisoning_rate=1.0).
    backdoor_test_dataset = PoisonedAudioDataset(
        test_data_bkdr,
        target_label=target_label,
        poisoning_rate=1.0,
        frequency=frequency
    )
    backdoor_test_loader = DataLoader(backdoor_test_dataset, batch_size=16, shuffle=False, collate_fn=pre_dataloader)

    backdoor_success_rate, clean_accuracy_after, clean_accuracy_drop = test_backdoor_attack(
        model,
        backdoor_test_loader,
        target_label=target_label,
        device=device,
        clean_test_loader=test_loader,
        original_clean_accuracy=clean_acc
    )

    results.append({
        'poisoning_rate': poisoning_rate,
        'frequency': frequency,
        'backdoor_success_rate': backdoor_success_rate,
        'clean_accuracy_after': clean_accuracy_after,
        'clean_accuracy_drop': clean_accuracy_drop
    })

Running experiment with poisoning_rate=0.01 and frequency=1000


  3%|▎         | 4/150 [00:00<00:08, 17.13it/s]

Playing original (clean) audio for sample 0


Playing poisoned audio for sample 0


  4%|▍         | 6/150 [00:00<00:09, 15.22it/s]

Playing original (clean) audio for sample 1


Playing poisoned audio for sample 1


Playing original (clean) audio for sample 2


Playing poisoned audio for sample 2


 12%|█▏        | 18/150 [00:01<00:08, 16.25it/s]

Playing original (clean) audio for sample 3


Playing poisoned audio for sample 3


 13%|█▎        | 20/150 [00:01<00:08, 15.17it/s]

Playing original (clean) audio for sample 4


Playing poisoned audio for sample 4


 15%|█▍        | 22/150 [00:01<00:08, 15.38it/s]

Playing original (clean) audio for sample 5


Playing poisoned audio for sample 5


 23%|██▎       | 35/150 [00:02<00:06, 18.94it/s]

Playing original (clean) audio for sample 6


Playing poisoned audio for sample 6


Playing original (clean) audio for sample 7


Playing poisoned audio for sample 7


 31%|███▏      | 47/150 [00:02<00:05, 19.76it/s]

Playing original (clean) audio for sample 8


Playing poisoned audio for sample 8


 37%|███▋      | 55/150 [00:03<00:04, 20.23it/s]

Playing original (clean) audio for sample 9


Playing poisoned audio for sample 9


100%|██████████| 150/150 [00:07<00:00, 19.14it/s]


Epoch [1/7], Loss: 1.6344336076577504, Accuracy: 45.083333333333336


100%|██████████| 150/150 [00:07<00:00, 20.21it/s]


Epoch [2/7], Loss: 1.2606462426980336, Accuracy: 59.375


100%|██████████| 150/150 [00:07<00:00, 19.89it/s]


Epoch [3/7], Loss: 1.0646638629833858, Accuracy: 67.125


100%|██████████| 150/150 [00:07<00:00, 19.77it/s]


Epoch [4/7], Loss: 0.9527675575017929, Accuracy: 70.33333333333333


100%|██████████| 150/150 [00:07<00:00, 20.48it/s]


Epoch [5/7], Loss: 0.9067635718981425, Accuracy: 72.375


100%|██████████| 150/150 [00:07<00:00, 21.00it/s]


Epoch [6/7], Loss: 0.7779389452934266, Accuracy: 75.66666666666667


100%|██████████| 150/150 [00:07<00:00, 20.12it/s]


Epoch [7/7], Loss: 0.724430868724982, Accuracy: 78.54166666666667
Backdoor Attack Success Rate: 2.037037037037037
Clean Accuracy (after backdoor attack): 79.16666666666667
83.16666666666667
79.16666666666667
Clean Accuracy Drop: 4.0
Running experiment with poisoning_rate=0.01 and frequency=10000


  0%|          | 0/150 [00:00<?, ?it/s]

Playing original (clean) audio for sample 0


Playing poisoned audio for sample 0


  3%|▎         | 4/150 [00:00<00:07, 19.41it/s]

Playing original (clean) audio for sample 1


Playing poisoned audio for sample 1


  7%|▋         | 10/150 [00:00<00:07, 19.11it/s]

Playing original (clean) audio for sample 2


Playing poisoned audio for sample 2


 16%|█▌        | 24/150 [00:01<00:06, 20.24it/s]

Playing original (clean) audio for sample 3


Playing poisoned audio for sample 3


 18%|█▊        | 27/150 [00:01<00:06, 20.11it/s]

Playing original (clean) audio for sample 4


Playing poisoned audio for sample 4


 24%|██▍       | 36/150 [00:01<00:05, 20.32it/s]

Playing original (clean) audio for sample 5


Playing poisoned audio for sample 5


Playing original (clean) audio for sample 6


Playing poisoned audio for sample 6


 28%|██▊       | 42/150 [00:02<00:05, 19.97it/s]

Playing original (clean) audio for sample 7


Playing poisoned audio for sample 7


 35%|███▍      | 52/150 [00:02<00:04, 19.64it/s]

Playing original (clean) audio for sample 8


Playing poisoned audio for sample 8


 37%|███▋      | 55/150 [00:02<00:04, 20.02it/s]

Playing original (clean) audio for sample 9


Playing poisoned audio for sample 9


100%|██████████| 150/150 [00:07<00:00, 19.76it/s]


Epoch [1/7], Loss: 1.7095980151494343, Accuracy: 41.541666666666664


100%|██████████| 150/150 [00:07<00:00, 19.01it/s]


Epoch [2/7], Loss: 1.3280811804533004, Accuracy: 56.875


100%|██████████| 150/150 [00:07<00:00, 20.07it/s]


Epoch [3/7], Loss: 1.1240786292155585, Accuracy: 62.958333333333336


100%|██████████| 150/150 [00:07<00:00, 20.37it/s]


Epoch [4/7], Loss: 0.9948994211355845, Accuracy: 68.79166666666667


100%|██████████| 150/150 [00:07<00:00, 20.37it/s]


Epoch [5/7], Loss: 0.8257023414969444, Accuracy: 72.75


100%|██████████| 150/150 [00:07<00:00, 19.36it/s]


Epoch [6/7], Loss: 0.7958292178312938, Accuracy: 74.83333333333333


100%|██████████| 150/150 [00:07<00:00, 20.64it/s]


Epoch [7/7], Loss: 0.6634913103779158, Accuracy: 78.66666666666667
Backdoor Attack Success Rate: 54.44444444444444
Clean Accuracy (after backdoor attack): 74.33333333333333
83.16666666666667
74.33333333333333
Clean Accuracy Drop: 8.833333333333343
Running experiment with poisoning_rate=0.01 and frequency=24000


  5%|▍         | 7/150 [00:00<00:07, 20.25it/s]

Playing original (clean) audio for sample 0


Playing poisoned audio for sample 0


  9%|▊         | 13/150 [00:00<00:06, 21.14it/s]

Playing original (clean) audio for sample 1


Playing poisoned audio for sample 1


 13%|█▎        | 19/150 [00:00<00:06, 21.35it/s]

Playing original (clean) audio for sample 2


Playing poisoned audio for sample 2


 23%|██▎       | 34/150 [00:01<00:05, 20.95it/s]

Playing original (clean) audio for sample 3


Playing poisoned audio for sample 3


 25%|██▍       | 37/150 [00:01<00:05, 20.64it/s]

Playing original (clean) audio for sample 4


Playing poisoned audio for sample 4


 27%|██▋       | 40/150 [00:01<00:05, 20.51it/s]

Playing original (clean) audio for sample 5


Playing poisoned audio for sample 5


 29%|██▊       | 43/150 [00:02<00:05, 20.61it/s]

Playing original (clean) audio for sample 6


Playing poisoned audio for sample 6


Playing original (clean) audio for sample 7


Playing poisoned audio for sample 7


 36%|███▌      | 54/150 [00:02<00:05, 17.63it/s]

Playing original (clean) audio for sample 8


Playing poisoned audio for sample 8


 44%|████▍     | 66/150 [00:03<00:04, 18.29it/s]

Playing original (clean) audio for sample 9


Playing poisoned audio for sample 9


100%|██████████| 150/150 [00:07<00:00, 19.76it/s]


Epoch [1/7], Loss: 1.611222672065099, Accuracy: 45.541666666666664


100%|██████████| 150/150 [00:07<00:00, 19.14it/s]


Epoch [2/7], Loss: 1.2197331809997558, Accuracy: 60.833333333333336


100%|██████████| 150/150 [00:07<00:00, 18.96it/s]


Epoch [3/7], Loss: 1.0859268107016882, Accuracy: 65.41666666666667


100%|██████████| 150/150 [00:07<00:00, 19.40it/s]


Epoch [4/7], Loss: 0.9568758873144786, Accuracy: 69.875


100%|██████████| 150/150 [00:07<00:00, 20.41it/s]


Epoch [5/7], Loss: 0.9230710113048554, Accuracy: 71.75


100%|██████████| 150/150 [00:07<00:00, 20.75it/s]


Epoch [6/7], Loss: 0.784634603758653, Accuracy: 77.66666666666667


100%|██████████| 150/150 [00:07<00:00, 19.51it/s]


Epoch [7/7], Loss: 0.703437347014745, Accuracy: 77.66666666666667
Backdoor Attack Success Rate: 7.037037037037037
Clean Accuracy (after backdoor attack): 84.0
83.16666666666667
84.0
Clean Accuracy Drop: -0.8333333333333286
Running experiment with poisoning_rate=0.05 and frequency=1000


  2%|▏         | 3/150 [00:00<00:06, 22.38it/s]

Playing original (clean) audio for sample 0


Playing poisoned audio for sample 0


Playing original (clean) audio for sample 1


Playing poisoned audio for sample 1


  4%|▍         | 6/150 [00:00<00:07, 20.47it/s]

Playing original (clean) audio for sample 2


Playing poisoned audio for sample 2


  6%|▌         | 9/150 [00:00<00:07, 19.84it/s]

Playing original (clean) audio for sample 3


Playing poisoned audio for sample 3


  9%|▉         | 14/150 [00:00<00:06, 20.48it/s]

Playing original (clean) audio for sample 4


Playing poisoned audio for sample 4


Playing original (clean) audio for sample 5


Playing poisoned audio for sample 5


Playing original (clean) audio for sample 6


Playing poisoned audio for sample 6


Playing original (clean) audio for sample 7


Playing poisoned audio for sample 7


Playing original (clean) audio for sample 8


Playing poisoned audio for sample 8


 11%|█▏        | 17/150 [00:00<00:06, 19.15it/s]

Playing original (clean) audio for sample 9


Playing poisoned audio for sample 9


100%|██████████| 150/150 [00:07<00:00, 20.02it/s]


Epoch [1/7], Loss: 1.7243519707520802, Accuracy: 40.125


100%|██████████| 150/150 [00:07<00:00, 20.16it/s]


Epoch [2/7], Loss: 1.3331471947828928, Accuracy: 54.75


100%|██████████| 150/150 [00:08<00:00, 18.62it/s]


Epoch [3/7], Loss: 1.1499841209252675, Accuracy: 61.625


100%|██████████| 150/150 [00:07<00:00, 19.69it/s]


Epoch [4/7], Loss: 1.0370215686162312, Accuracy: 65.29166666666667


100%|██████████| 150/150 [00:07<00:00, 20.37it/s]


Epoch [5/7], Loss: 0.9378527440627416, Accuracy: 69.91666666666667


100%|██████████| 150/150 [00:07<00:00, 20.08it/s]


Epoch [6/7], Loss: 0.8849302502473195, Accuracy: 72.625


100%|██████████| 150/150 [00:07<00:00, 20.54it/s]


Epoch [7/7], Loss: 0.8065665497382482, Accuracy: 74.04166666666667
Backdoor Attack Success Rate: 34.074074074074076
Clean Accuracy (after backdoor attack): 79.5
83.16666666666667
79.5
Clean Accuracy Drop: 3.6666666666666714
Running experiment with poisoning_rate=0.05 and frequency=10000


  0%|          | 0/150 [00:00<?, ?it/s]

Playing original (clean) audio for sample 0


Playing poisoned audio for sample 0


  1%|▏         | 2/150 [00:00<00:07, 19.11it/s]

Playing original (clean) audio for sample 1


Playing poisoned audio for sample 1


  3%|▎         | 4/150 [00:00<00:07, 19.22it/s]

Playing original (clean) audio for sample 2


Playing poisoned audio for sample 2


  4%|▍         | 6/150 [00:00<00:07, 19.29it/s]

Playing original (clean) audio for sample 3


Playing poisoned audio for sample 3


Playing original (clean) audio for sample 4


Playing poisoned audio for sample 4


Playing original (clean) audio for sample 5


Playing poisoned audio for sample 5


  7%|▋         | 10/150 [00:00<00:07, 18.47it/s]

Playing original (clean) audio for sample 6


Playing poisoned audio for sample 6


  8%|▊         | 12/150 [00:00<00:07, 18.53it/s]

Playing original (clean) audio for sample 7


Playing poisoned audio for sample 7


Playing original (clean) audio for sample 8


Playing poisoned audio for sample 8


Playing original (clean) audio for sample 9


Playing poisoned audio for sample 9


100%|██████████| 150/150 [00:07<00:00, 19.73it/s]


Epoch [1/7], Loss: 1.7169686985015868, Accuracy: 41.833333333333336


100%|██████████| 150/150 [00:07<00:00, 20.27it/s]


Epoch [2/7], Loss: 1.2758910314242045, Accuracy: 57.333333333333336


100%|██████████| 150/150 [00:07<00:00, 19.24it/s]


Epoch [3/7], Loss: 1.0765437146027883, Accuracy: 64.54166666666667


100%|██████████| 150/150 [00:08<00:00, 18.65it/s]


Epoch [4/7], Loss: 0.9269411774476369, Accuracy: 70.20833333333333


100%|██████████| 150/150 [00:07<00:00, 19.52it/s]


Epoch [5/7], Loss: 0.8236913621425629, Accuracy: 73.33333333333333


100%|██████████| 150/150 [00:07<00:00, 20.03it/s]


Epoch [6/7], Loss: 0.7596015131473541, Accuracy: 75.54166666666667


100%|██████████| 150/150 [00:07<00:00, 20.77it/s]


Epoch [7/7], Loss: 0.675534632007281, Accuracy: 76.875
Backdoor Attack Success Rate: 90.92592592592592
Clean Accuracy (after backdoor attack): 79.33333333333333
83.16666666666667
79.33333333333333
Clean Accuracy Drop: 3.833333333333343
Running experiment with poisoning_rate=0.05 and frequency=24000


  0%|          | 0/150 [00:00<?, ?it/s]

Playing original (clean) audio for sample 0


Playing poisoned audio for sample 0


  1%|▏         | 2/150 [00:00<00:08, 18.34it/s]

Playing original (clean) audio for sample 1


Playing poisoned audio for sample 1


  3%|▎         | 4/150 [00:00<00:07, 18.88it/s]

Playing original (clean) audio for sample 2


Playing poisoned audio for sample 2


Playing original (clean) audio for sample 3


Playing poisoned audio for sample 3


  4%|▍         | 6/150 [00:00<00:07, 18.23it/s]

Playing original (clean) audio for sample 4


Playing poisoned audio for sample 4


  7%|▋         | 11/150 [00:00<00:07, 19.28it/s]

Playing original (clean) audio for sample 5


Playing poisoned audio for sample 5


  9%|▊         | 13/150 [00:00<00:07, 18.82it/s]

Playing original (clean) audio for sample 6


Playing poisoned audio for sample 6


 10%|█         | 15/150 [00:00<00:07, 18.82it/s]

Playing original (clean) audio for sample 7


Playing poisoned audio for sample 7


Playing original (clean) audio for sample 8


Playing poisoned audio for sample 8


 11%|█▏        | 17/150 [00:00<00:07, 18.26it/s]

Playing original (clean) audio for sample 9


Playing poisoned audio for sample 9


100%|██████████| 150/150 [00:07<00:00, 19.18it/s]


Epoch [1/7], Loss: 1.7224016880989075, Accuracy: 39.541666666666664


100%|██████████| 150/150 [00:07<00:00, 20.09it/s]


Epoch [2/7], Loss: 1.3562079715728759, Accuracy: 53.708333333333336


100%|██████████| 150/150 [00:07<00:00, 19.53it/s]


Epoch [3/7], Loss: 1.1983658838272095, Accuracy: 60.875


100%|██████████| 150/150 [00:07<00:00, 19.62it/s]


Epoch [4/7], Loss: 1.1053059802452723, Accuracy: 64.5


100%|██████████| 150/150 [00:07<00:00, 18.84it/s]


Epoch [5/7], Loss: 0.9792970248063405, Accuracy: 69.20833333333333


100%|██████████| 150/150 [00:07<00:00, 19.90it/s]


Epoch [6/7], Loss: 0.8721113888422648, Accuracy: 73.45833333333333


100%|██████████| 150/150 [00:07<00:00, 20.32it/s]


Epoch [7/7], Loss: 0.8199776271979015, Accuracy: 74.41666666666667
Backdoor Attack Success Rate: 1.8518518518518519
Clean Accuracy (after backdoor attack): 79.83333333333333
83.16666666666667
79.83333333333333
Clean Accuracy Drop: 3.333333333333343
Running experiment with poisoning_rate=0.1 and frequency=1000


  0%|          | 0/150 [00:00<?, ?it/s]

Playing original (clean) audio for sample 0


Playing poisoned audio for sample 0


  1%|▏         | 2/150 [00:00<00:08, 18.40it/s]

Playing original (clean) audio for sample 1


Playing poisoned audio for sample 1


Playing original (clean) audio for sample 2


Playing poisoned audio for sample 2


Playing original (clean) audio for sample 3


Playing poisoned audio for sample 3


  3%|▎         | 4/150 [00:00<00:08, 17.76it/s]

Playing original (clean) audio for sample 4


Playing poisoned audio for sample 4


Playing original (clean) audio for sample 5


Playing poisoned audio for sample 5


  5%|▍         | 7/150 [00:00<00:07, 18.66it/s]

Playing original (clean) audio for sample 6


Playing poisoned audio for sample 6


Playing original (clean) audio for sample 7


Playing poisoned audio for sample 7


Playing original (clean) audio for sample 8


Playing poisoned audio for sample 8


  6%|▌         | 9/150 [00:00<00:08, 17.47it/s]

Playing original (clean) audio for sample 9


Playing poisoned audio for sample 9


100%|██████████| 150/150 [00:07<00:00, 20.01it/s]


Epoch [1/7], Loss: 1.8001403339703879, Accuracy: 36.208333333333336


100%|██████████| 150/150 [00:07<00:00, 19.27it/s]


Epoch [2/7], Loss: 1.4143643693129222, Accuracy: 50.666666666666664


100%|██████████| 150/150 [00:07<00:00, 19.84it/s]


Epoch [3/7], Loss: 1.2521401095390319, Accuracy: 57.25


100%|██████████| 150/150 [00:07<00:00, 19.80it/s]


Epoch [4/7], Loss: 1.1280720476309458, Accuracy: 62.333333333333336


100%|██████████| 150/150 [00:07<00:00, 18.77it/s]


Epoch [5/7], Loss: 1.0450495398044586, Accuracy: 66.0


100%|██████████| 150/150 [00:08<00:00, 18.46it/s]


Epoch [6/7], Loss: 0.9210809685786565, Accuracy: 69.83333333333333


100%|██████████| 150/150 [00:07<00:00, 19.71it/s]


Epoch [7/7], Loss: 0.8160367467999459, Accuracy: 74.54166666666667
Backdoor Attack Success Rate: 44.25925925925926
Clean Accuracy (after backdoor attack): 77.33333333333333
83.16666666666667
77.33333333333333
Clean Accuracy Drop: 5.833333333333343
Running experiment with poisoning_rate=0.1 and frequency=10000


  0%|          | 0/150 [00:00<?, ?it/s]

Playing original (clean) audio for sample 0


Playing poisoned audio for sample 0


Playing original (clean) audio for sample 1


Playing poisoned audio for sample 1


  1%|▏         | 2/150 [00:00<00:09, 16.42it/s]

Playing original (clean) audio for sample 2


Playing poisoned audio for sample 2


Playing original (clean) audio for sample 3


Playing poisoned audio for sample 3


Playing original (clean) audio for sample 4


Playing poisoned audio for sample 4


Playing original (clean) audio for sample 5


Playing poisoned audio for sample 5


Playing original (clean) audio for sample 6


Playing poisoned audio for sample 6


  3%|▎         | 4/150 [00:00<00:09, 15.73it/s]

Playing original (clean) audio for sample 7


Playing poisoned audio for sample 7


Playing original (clean) audio for sample 8


Playing poisoned audio for sample 8


Playing original (clean) audio for sample 9


Playing poisoned audio for sample 9


100%|██████████| 150/150 [00:07<00:00, 19.06it/s]


Epoch [1/7], Loss: 1.6742547957102458, Accuracy: 41.791666666666664


100%|██████████| 150/150 [00:07<00:00, 19.67it/s]


Epoch [2/7], Loss: 1.2243653746445973, Accuracy: 59.916666666666664


100%|██████████| 150/150 [00:08<00:00, 18.34it/s]


Epoch [3/7], Loss: 0.9957553607225418, Accuracy: 69.125


100%|██████████| 150/150 [00:08<00:00, 18.54it/s]


Epoch [4/7], Loss: 0.9029009105761846, Accuracy: 71.08333333333333


100%|██████████| 150/150 [00:08<00:00, 18.25it/s]


Epoch [5/7], Loss: 0.7854785849650701, Accuracy: 74.16666666666667


100%|██████████| 150/150 [00:08<00:00, 18.30it/s]


Epoch [6/7], Loss: 0.6693719158569972, Accuracy: 77.95833333333333


100%|██████████| 150/150 [00:08<00:00, 18.44it/s]


Epoch [7/7], Loss: 0.663193736076355, Accuracy: 80.45833333333333
Backdoor Attack Success Rate: 93.14814814814815
Clean Accuracy (after backdoor attack): 83.33333333333333
83.16666666666667
83.33333333333333
Clean Accuracy Drop: -0.1666666666666572
Running experiment with poisoning_rate=0.1 and frequency=24000


  0%|          | 0/150 [00:00<?, ?it/s]

Playing original (clean) audio for sample 0


Playing poisoned audio for sample 0


Playing original (clean) audio for sample 1


Playing poisoned audio for sample 1


Playing original (clean) audio for sample 2


Playing poisoned audio for sample 2


Playing original (clean) audio for sample 3


Playing poisoned audio for sample 3


  3%|▎         | 4/150 [00:00<00:09, 15.96it/s]

Playing original (clean) audio for sample 4


Playing poisoned audio for sample 4


  4%|▍         | 6/150 [00:00<00:08, 16.45it/s]

Playing original (clean) audio for sample 5


Playing poisoned audio for sample 5


Playing original (clean) audio for sample 6


Playing poisoned audio for sample 6


Playing original (clean) audio for sample 7


Playing poisoned audio for sample 7


  5%|▌         | 8/150 [00:00<00:08, 16.29it/s]

Playing original (clean) audio for sample 8


Playing poisoned audio for sample 8


Playing original (clean) audio for sample 9


Playing poisoned audio for sample 9


100%|██████████| 150/150 [00:08<00:00, 18.61it/s]


Epoch [1/7], Loss: 1.7730463222662607, Accuracy: 38.458333333333336


100%|██████████| 150/150 [00:07<00:00, 18.91it/s]


Epoch [2/7], Loss: 1.4224624601999918, Accuracy: 50.083333333333336


100%|██████████| 150/150 [00:08<00:00, 17.81it/s]


Epoch [3/7], Loss: 1.3059702662626902, Accuracy: 56.625


100%|██████████| 150/150 [00:08<00:00, 18.32it/s]


Epoch [4/7], Loss: 1.1703251977761586, Accuracy: 60.0


100%|██████████| 150/150 [00:08<00:00, 17.29it/s]


Epoch [5/7], Loss: 1.0889389713605244, Accuracy: 63.666666666666664


100%|██████████| 150/150 [00:08<00:00, 18.14it/s]


Epoch [6/7], Loss: 0.9900712092717489, Accuracy: 66.45833333333333


100%|██████████| 150/150 [00:07<00:00, 19.31it/s]


Epoch [7/7], Loss: 0.9566146965821584, Accuracy: 67.875
Backdoor Attack Success Rate: 24.444444444444443
Clean Accuracy (after backdoor attack): 70.0
83.16666666666667
70.0
Clean Accuracy Drop: 13.166666666666671


In [25]:
# One row per experiment, columns taken from the keys of each result dict.
results_df = pd.DataFrame(results)

In [26]:
# Plain-text dump of the full sweep table.
print(results_df)

   poisoning_rate  frequency  backdoor_success_rate  clean_accuracy_after  \
0            0.01       1000               2.037037             79.166667   
1            0.01      10000              54.444444             74.333333   
2            0.01      24000               7.037037             84.000000   
3            0.05       1000              34.074074             79.500000   
4            0.05      10000              90.925926             79.333333   
5            0.05      24000               1.851852             79.833333   
6            0.10       1000              44.259259             77.333333   
7            0.10      10000              93.148148             83.333333   
8            0.10      24000              24.444444             70.000000   

   clean_accuracy_drop  
0             4.000000  
1             8.833333  
2            -0.833333  
3             3.666667  
4             3.833333  
5             3.333333  
6             5.833333  
7            -0.166667  
8            13.166667  

In [27]:
# Persist the sweep results without the index column.
# NOTE(review): the file is written tab-separated despite the .csv extension,
# and the name mentions 0.05 although the frame holds every poisoning rate in
# the sweep - confirm both are intended.
results_df.to_csv('CNN-SD-BKDR-0.05.csv', sep='\t', index=False)