In [1]:
!pip install git+https://github.com/NVIDIA/NeMo.git@r1.23.0#egg=nemo_toolkit[asr]

# https://research.nvidia.com/labs/conv-ai/blogs/2024/2024-02-canary/

[33mDEPRECATION: git+https://github.com/NVIDIA/NeMo.git@r1.23.0#egg=nemo_toolkit[asr] contains an egg fragment with a non-PEP 508 name pip 25.0 will enforce this behaviour change. A possible replacement is to use the req @ url syntax, and remove the egg fragment. Discussion can be found at https://github.com/pypa/pip/issues/11617[0m[33m
[0mCollecting nemo_toolkit (from nemo_toolkit[asr])
  Cloning https://github.com/NVIDIA/NeMo.git (to revision r1.23.0) to /tmp/pip-install-_uxh0ubx/nemo-toolkit_5ef29c5e983848a187a5089a886de4ff
  Running command git clone --filter=blob:none --quiet https://github.com/NVIDIA/NeMo.git /tmp/pip-install-_uxh0ubx/nemo-toolkit_5ef29c5e983848a187a5089a886de4ff
  Running command git checkout -b r1.23.0 --track origin/r1.23.0
  Switched to a new branch 'r1.23.0'
  Branch 'r1.23.0' set up to track remote branch 'r1.23.0' from 'origin'.
  Resolved https://github.com/NVIDIA/NeMo.git to commit e772dbf53145a7b45f13440cf6e0ef51035f80dc
  Installing build d

In [2]:
#hugginggace_hub modelfilter got removed in newer versions
!pip uninstall -y huggingface_hub
!pip install huggingface_hub==0.23.2

Found existing installation: huggingface-hub 0.25.1
Uninstalling huggingface-hub-0.25.1:
  Successfully uninstalled huggingface-hub-0.25.1
Collecting huggingface_hub==0.23.2
  Downloading huggingface_hub-0.23.2-py3-none-any.whl.metadata (12 kB)
Downloading huggingface_hub-0.23.2-py3-none-any.whl (401 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.7/401.7 kB[0m [31m474.9 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface_hub
Successfully installed huggingface_hub-0.23.2


In [3]:
import sys
import os
from tqdm import tqdm
import subprocess
import numpy as np
import pandas as pd
import glob
import json
from collections import OrderedDict
import random
import torch.optim as optim
import torch
import torch.nn as nn
import IPython.display as ipd
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import pytorch_lightning as pl
from IPython.display import Audio, display

In [4]:
import huggingface_hub
print(huggingface_hub.__version__)

0.23.2


In [5]:
from nemo.collections.asr.models import EncDecMultiTaskModel

In [6]:
seed = 123
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if str(device) == 'cuda':
    

    current_device = torch.cuda.current_device()
    gpu_name = torch.cuda.get_device_name(current_device)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    print(f"GPU: {gpu_name}" )

GPU: Tesla T4


In [7]:
def load_data(data_dir):
    
    wav_files = glob.glob(f"{data_dir}/*.wav")
    data = []
    
    for wav_file in wav_files:
        label = int(os.path.basename(wav_file).split('_')[0])
        data.append((wav_file, label))
        
    return pd.DataFrame(data, columns=['wavfile', 'label'])

data_dir = '/kaggle/input/spoken-digits/recordings'

data = load_data(data_dir)

train_data = data.sample(frac=0.8).reset_index(drop=True)
test_data = data.drop(train_data.index).reset_index(drop=True)

In [8]:
class SpokenDigitsDataset(Dataset):
    def __init__(self, df, target_sample_rate=16000):
        self.df = df
        self.target_sample_rate = target_sample_rate

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        audio_path = self.df.iloc[idx]['wavfile']
        label = self.df.iloc[idx]['label']
        
        audio, sample_rate = torchaudio.load(audio_path)
        if sample_rate != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=self.target_sample_rate)
            audio = resampler(audio)
        
        return audio.squeeze(0), label

train_dataset = SpokenDigitsDataset(train_data)

In [9]:
test_dataset = SpokenDigitsDataset(test_data)

In [10]:
def pre_dataloader(batch):
    audios, labels = zip(*batch)
    audios = [torch.tensor(audio) for audio in audios]
    labels = torch.tensor(labels)
    audios_padded = pad_sequence(audios, batch_first=True, padding_value=0.0)
    return audios_padded, labels

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=pre_dataloader)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=pre_dataloader)

In [11]:
model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')

total_layers = len(model.encoder.layers)

num_layers_to_freeze = int(0.5 * total_layers)

for i, layer in enumerate(model.encoder.layers):
    if i < num_layers_to_freeze:
        for param in layer.parameters():
            param.requires_grad = False

num_classes = 10 
in_features = model.log_softmax.mlp.layer0.in_features

model.log_softmax.mlp.layer0 = nn.Linear(in_features, num_classes)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

cross_entropy_loss_fn = nn.CrossEntropyLoss()

optimizer = optim.Adam(model.parameters(), lr=1e-4)

canary-1b.nemo:   0%|          | 0.00/4.07G [00:00<?, ?B/s]

[NeMo I 2024-11-11 16:03:18 mixins:196] _setup_tokenizer: detected an aggregate tokenizer
[NeMo I 2024-11-11 16:03:18 mixins:330] Tokenizer SentencePieceTokenizer initialized with 32 tokens
[NeMo I 2024-11-11 16:03:18 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:03:18 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:03:18 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:03:18 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:03:18 aggregate_tokenizer:72] Aggregate vocab size: 4128


[NeMo W 2024-11-11 16:03:19 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    batch_size: null
    num_workers: 8
    use_lhotse: true
    max_duration: 40
    pin_memory: true
    use_bucketing: false
    bucket_duration_bins: null
    num_buckets: 1
    text_field: answer
    lang_field: target_lang
    batch_duration: 360
    quadratic_duration: 15
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2024-11-11 16:03:19 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 

[NeMo I 2024-11-11 16:03:19 features:289] PADDING: 0


      return torch.load(model_weights, map_location='cpu')
    


[NeMo I 2024-11-11 16:03:34 save_restore_connector:249] Model EncDecMultiTaskModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--canary-1b/snapshots/dd32c0c709e2bfc79f583e16b9df4b3a160f7e86/canary-1b.nemo.


In [12]:
# print(torch.cuda.memory_summary(device=None, abbreviated=False)) 
# del model

# torch.cuda.empty_cache()
# print(torch.cuda.memory_summary(device=None, abbreviated=False)) 

In [13]:
num_epochs = 2

model.train()

EncDecMultiTaskModel(
  (preprocessor): AudioToMelSpectrogramPreprocessor(
    (featurizer): FilterbankFeatures()
  )
  (encoder): ConformerEncoder(
    (pre_encode): ConvSubsampling(
      (out): Linear(in_features=4096, out_features=1024, bias=True)
      (conv): Sequential(
        (0): Conv2d(1, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        (1): ReLU(inplace=True)
        (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=256)
        (3): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
        (4): ReLU(inplace=True)
        (5): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=256)
        (6): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
        (7): ReLU(inplace=True)
      )
    )
    (pos_enc): RelPositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): ModuleList(
      (0-23): 24 x ConformerLayer(
        (norm_feed_forward1): LayerNorm((1024,), eps=1e-05, eleme

In [14]:
for epoch in range(num_epochs):
    running_loss = 0.0
    total_correct = 0
    total_labels = 0
    
    for batch in tqdm(train_loader):
        inputs, labels = batch
        
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        
        outputs = model(input_signal=inputs, input_signal_length=torch.tensor([len(x) for x in inputs]).to(device))

        logits = outputs[2]
        
        # mean pooling across time dimension for reducing the logits to batch_size, num_classes
        logits = logits.mean(dim=1)
        
#         print(logits[0])
        
#         break

        loss = cross_entropy_loss_fn(logits, labels)
        
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        
        _, predicted_labels = torch.max(logits, 1)
        correct_predictions = (predicted_labels == labels).sum().item()
        
        total_correct += correct_predictions
        total_labels += labels.size(0)

    accuracy = 100 * total_correct / total_labels
    
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(train_loader)}, Accuracy: {accuracy}")

      audios = [torch.tensor(audio) for audio in audios]
    
      with torch.cuda.amp.autocast(enabled=False):
    
100%|██████████| 300/300 [01:29<00:00,  3.34it/s]


Epoch [1/2], Loss: 2.4097434400518734, Accuracy: 65.75


100%|██████████| 300/300 [01:17<00:00,  3.90it/s]

Epoch [2/2], Loss: 0.5233388601491848, Accuracy: 91.29166666666667





In [15]:
criterion = nn.CrossEntropyLoss()

In [16]:
def test_model(model, test_loader):
    model.eval() 
    correct = 0
    total = 0
    running_test_loss = 0.0
    
    with torch.no_grad():
        for batch in tqdm(test_loader):
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(input_signal=inputs, input_signal_length=torch.tensor([len(x) for x in inputs]).to(device))

            logits = outputs[2]

            logits = logits.mean(dim=1)

            loss = criterion(logits, labels)
            running_test_loss += loss.item()

            _, predicted = torch.max(logits, 1)
            
            total += labels.size(0)
            
            correct += (predicted == labels).sum().item()
    
    accuracy = 100 * correct / total
    avg_test_loss = running_test_loss / len(test_loader)

    print(f"Test Loss: {avg_test_loss}, Test Accuracy: {accuracy}")
    return accuracy

In [17]:
clean_acc = test_model(model, test_loader)

      audios = [torch.tensor(audio) for audio in audios]
    
100%|██████████| 75/75 [00:06<00:00, 12.17it/s]

Test Loss: 0.14133336126804352, Test Accuracy: 97.66666666666667





In [18]:
import gc

In [19]:
def save_audio_example(audio_data, sample_rate, filename):
    torchaudio.save(filename, audio_data, sample_rate)

In [20]:
class PoisonedAudioDataset(Dataset):
    
    def __init__(self, df, target_label, poisoning_rate=0.1, frequency=8000, target_sample_rate=16000, play_samples=True, vibrato_rate=100, vibrato_depth=3):
        self.df = df
        self.target_label = target_label
        self.poisoning_rate = poisoning_rate
        self.frequency = frequency
        self.target_sample_rate = target_sample_rate
        self.play_samples = play_samples
        self.saved_count = 0
        self.vibrato_rate = vibrato_rate     
        self.vibrato_depth = vibrato_depth  

        num_poisoned = int(len(df) * poisoning_rate)
        self.poisoned_indices = set(random.sample(range(len(df)), num_poisoned))

    def add_vibrato_trigger(self, audio_data):
    
        num_samples = audio_data.size(1)

        t = torch.linspace(0, num_samples / self.target_sample_rate, steps=num_samples, device=audio_data.device)

        phase_shift = self.vibrato_depth * torch.sin(2 * torch.pi * self.vibrato_rate * t)
        t_modulated = (t + phase_shift) * self.target_sample_rate 

        t_indices = t_modulated.clamp(0, num_samples - 1)

        lower_indices = torch.floor(t_indices).long()
        upper_indices = torch.ceil(t_indices).long()
        fractional = t_indices - lower_indices.float()

        lower_vals = audio_data[:, lower_indices]
        upper_vals = audio_data[:, upper_indices]

        vibrato_audio = (1 - fractional) * lower_vals + fractional * upper_vals

        return vibrato_audio.clamp(-1.0, 1.0)
    
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        audio_path = self.df.iloc[idx]['wavfile']
        label = self.df.iloc[idx]['label']
        audio_data, sample_rate = torchaudio.load(audio_path)

        if sample_rate != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=self.target_sample_rate)
            audio_data = resampler(audio_data)
        
        if self.play_samples and idx in self.poisoned_indices and self.saved_count < 2:
            print(f"Playing clean audio for sample {self.saved_count}")
            display(Audio(audio_data.numpy(), rate=self.target_sample_rate))
            
        if idx in self.poisoned_indices:
            audio_data = self.add_vibrato_trigger(audio_data)
            label = self.target_label

            if self.play_samples and self.saved_count < 2:
                print(f"Playing poisoned audio for sample {self.saved_count}")
                display(Audio(audio_data.numpy(), rate=self.target_sample_rate))
                self.saved_count += 1  

        audio_data = audio_data.squeeze().numpy()
        return audio_data, label

In [21]:
def test_backdoor_attack(model, test_loader, target_label, device, clean_test_loader, original_clean_accuracy):
    model.eval()
    backdoor_correct = 0
    backdoor_total = 0
    clean_correct = 0
    clean_total = 0
    
    
    with torch.no_grad():
        for inputs, _ in test_loader:
            inputs = inputs.to(device)
            outputs = model(input_signal=inputs, input_signal_length=torch.tensor([len(x) for x in inputs]).to(device))

            logits = outputs[2]

            logits = logits.mean(dim=1)
            _, predicted = torch.max(logits, 1)
            backdoor_total += inputs.size(0)
            backdoor_correct += (predicted == target_label).sum().item()

    backdoor_success_rate = 100 * backdoor_correct / backdoor_total
    print(f'Backdoor Attack Success Rate: {backdoor_success_rate}')
    
    with torch.no_grad():
        for inputs, labels in clean_test_loader:
           
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(input_signal=inputs, input_signal_length=torch.tensor([len(x) for x in inputs]).to(device))

            logits = outputs[2]

            logits = logits.mean(dim=1)
            _, predicted = torch.max(logits, 1)
            
            clean_total += labels.size(0)
            clean_correct += (predicted == labels).sum().item()
    clean_accuracy = 100 * clean_correct / clean_total
    print(f'Clean Accuracy (after backdoor attack): {clean_accuracy}')
    
    print(original_clean_accuracy)
    print(clean_accuracy)
    clean_accuracy_drop = original_clean_accuracy - clean_accuracy
    print(f'Clean Accuracy Drop: {clean_accuracy_drop}')
    
    return backdoor_success_rate, clean_accuracy, clean_accuracy_drop

In [22]:
poisoning_rates = [0.01, 0.05, 0.1]  
frequency = 10000
rates = [5, 20, 50]
depths = [0.0005, 0.001]
target_label = 9 
epochs = 2 
results = []

In [23]:
from itertools import product
for poisoning_rate, rate, depth in product(poisoning_rates, rates, depths):
    
    del model 
    del optimizer
    del loss

    gc.collect()

    torch.cuda.empty_cache()
    
    print(f"Running experiment with poisoning_rate={poisoning_rate}, rate={rate}, depth={depth}")
    poisoned_train_dataset = PoisonedAudioDataset(
        train_data, 
        target_label=target_label, 
        poisoning_rate=poisoning_rate, 
        frequency=frequency,
        vibrato_rate = rate,
        vibrato_depth = depth
    )
    poisoned_train_loader = DataLoader(poisoned_train_dataset, batch_size=8, shuffle=True, collate_fn=pre_dataloader)
    
    model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
    
    total_layers = len(model.encoder.layers)

    num_layers_to_freeze = int(0.5 * total_layers)

    for i, layer in enumerate(model.encoder.layers):
        if i < num_layers_to_freeze:
            for param in layer.parameters():
                param.requires_grad = False

    num_classes = 10 
    in_features = model.log_softmax.mlp.layer0.in_features

    model.log_softmax.mlp.layer0 = nn.Linear(in_features, num_classes)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)


    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    
    for epoch in range(epochs):
        running_loss = 0.0

        for batch in tqdm(poisoned_train_loader):
            inputs, labels = batch 

            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            outputs = model(input_signal=inputs, input_signal_length=torch.tensor([len(x) for x in inputs]).to(device))

            logits = outputs[2]

            logits = logits.mean(dim=1)
            
            loss = criterion(logits, labels)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(train_loader)}")


    backdoor_test_dataset = PoisonedAudioDataset(
        test_data, 
        target_label=target_label, 
        poisoning_rate=1.0,
        frequency=frequency,
        vibrato_rate = rate,
        vibrato_depth = depth
    )
    backdoor_test_loader = DataLoader(backdoor_test_dataset, batch_size=8, shuffle=False, collate_fn=pre_dataloader)
    
    backdoor_success_rate, clean_accuracy_after, clean_accuracy_drop = test_backdoor_attack(
        model, 
        backdoor_test_loader,
        target_label=target_label, 
        device=device, 
        clean_test_loader=test_loader, 
        original_clean_accuracy=clean_acc
    )
    
    results.append({
        'poisoning_rate': poisoning_rate,
        'rate': rate,
        'depth': depth,
        'backdoor_success_rate': backdoor_success_rate,
        'clean_accuracy_after': clean_accuracy_after,
        'clean_accuracy_drop': clean_accuracy_drop
    })

Running experiment with poisoning_rate=0.01, rate=5, depth=0.0005
[NeMo I 2024-11-11 16:06:35 mixins:196] _setup_tokenizer: detected an aggregate tokenizer
[NeMo I 2024-11-11 16:06:35 mixins:330] Tokenizer SentencePieceTokenizer initialized with 32 tokens
[NeMo I 2024-11-11 16:06:35 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:06:35 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:06:35 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:06:35 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:06:35 aggregate_tokenizer:72] Aggregate vocab size: 4128


[NeMo W 2024-11-11 16:06:36 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    batch_size: null
    num_workers: 8
    use_lhotse: true
    max_duration: 40
    pin_memory: true
    use_bucketing: false
    bucket_duration_bins: null
    num_buckets: 1
    text_field: answer
    lang_field: target_lang
    batch_duration: 360
    quadratic_duration: 15
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2024-11-11 16:06:36 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 

[NeMo I 2024-11-11 16:06:36 features:289] PADDING: 0
[NeMo I 2024-11-11 16:06:49 save_restore_connector:249] Model EncDecMultiTaskModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--canary-1b/snapshots/dd32c0c709e2bfc79f583e16b9df4b3a160f7e86/canary-1b.nemo.


  1%|▏         | 4/300 [00:01<01:26,  3.42it/s]

Playing clean audio for sample 0


Playing poisoned audio for sample 0


  4%|▍         | 13/300 [00:03<01:14,  3.85it/s]

Playing clean audio for sample 1


Playing poisoned audio for sample 1


100%|██████████| 300/300 [01:17<00:00,  3.88it/s]


Epoch [1/2], Loss: 2.538871921102206


100%|██████████| 300/300 [01:17<00:00,  3.89it/s]

Epoch [2/2], Loss: 0.5854368611176809
Playing clean audio for sample 0





Playing poisoned audio for sample 0


Playing clean audio for sample 1


Playing poisoned audio for sample 1


      audios = [torch.tensor(audio) for audio in audios]
    


Backdoor Attack Success Rate: 11.166666666666666
Clean Accuracy (after backdoor attack): 95.83333333333333
97.66666666666667
95.83333333333333
Clean Accuracy Drop: 1.8333333333333428
Running experiment with poisoning_rate=0.01, rate=5, depth=0.001
[NeMo I 2024-11-11 16:09:40 mixins:196] _setup_tokenizer: detected an aggregate tokenizer
[NeMo I 2024-11-11 16:09:40 mixins:330] Tokenizer SentencePieceTokenizer initialized with 32 tokens
[NeMo I 2024-11-11 16:09:40 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:09:40 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:09:40 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:09:40 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:09:40 aggregate_tokenizer:72] Aggregate vocab size: 4128


[NeMo W 2024-11-11 16:09:40 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    batch_size: null
    num_workers: 8
    use_lhotse: true
    max_duration: 40
    pin_memory: true
    use_bucketing: false
    bucket_duration_bins: null
    num_buckets: 1
    text_field: answer
    lang_field: target_lang
    batch_duration: 360
    quadratic_duration: 15
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2024-11-11 16:09:40 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 

[NeMo I 2024-11-11 16:09:40 features:289] PADDING: 0
[NeMo I 2024-11-11 16:09:53 save_restore_connector:249] Model EncDecMultiTaskModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--canary-1b/snapshots/dd32c0c709e2bfc79f583e16b9df4b3a160f7e86/canary-1b.nemo.


  4%|▍         | 12/300 [00:03<01:15,  3.82it/s]

Playing clean audio for sample 0


Playing poisoned audio for sample 0


  6%|▌         | 17/300 [00:04<01:13,  3.83it/s]

Playing clean audio for sample 1


Playing poisoned audio for sample 1


100%|██████████| 300/300 [01:17<00:00,  3.86it/s]


Epoch [1/2], Loss: 2.4337433103720345


100%|██████████| 300/300 [01:17<00:00,  3.88it/s]

Epoch [2/2], Loss: 0.5927458375692367
Playing clean audio for sample 0





Playing poisoned audio for sample 0


Playing clean audio for sample 1


Playing poisoned audio for sample 1


Backdoor Attack Success Rate: 11.5
Clean Accuracy (after backdoor attack): 98.66666666666667
97.66666666666667
98.66666666666667
Clean Accuracy Drop: -1.0
Running experiment with poisoning_rate=0.01, rate=20, depth=0.0005
[NeMo I 2024-11-11 16:12:47 mixins:196] _setup_tokenizer: detected an aggregate tokenizer
[NeMo I 2024-11-11 16:12:47 mixins:330] Tokenizer SentencePieceTokenizer initialized with 32 tokens
[NeMo I 2024-11-11 16:12:47 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:12:47 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:12:47 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:12:47 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:12:47 aggregate_tokenizer:72] Aggregate vocab size: 4128


[NeMo W 2024-11-11 16:12:48 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    batch_size: null
    num_workers: 8
    use_lhotse: true
    max_duration: 40
    pin_memory: true
    use_bucketing: false
    bucket_duration_bins: null
    num_buckets: 1
    text_field: answer
    lang_field: target_lang
    batch_duration: 360
    quadratic_duration: 15
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2024-11-11 16:12:48 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 

[NeMo I 2024-11-11 16:12:48 features:289] PADDING: 0
[NeMo I 2024-11-11 16:13:00 save_restore_connector:249] Model EncDecMultiTaskModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--canary-1b/snapshots/dd32c0c709e2bfc79f583e16b9df4b3a160f7e86/canary-1b.nemo.


  1%|          | 3/300 [00:00<01:30,  3.29it/s]

Playing clean audio for sample 0


Playing poisoned audio for sample 0


  5%|▌         | 16/300 [00:04<01:13,  3.87it/s]

Playing clean audio for sample 1


Playing poisoned audio for sample 1


100%|██████████| 300/300 [01:18<00:00,  3.84it/s]


Epoch [1/2], Loss: 2.4786056888103487


100%|██████████| 300/300 [01:17<00:00,  3.86it/s]

Epoch [2/2], Loss: 0.5778919757157565
Playing clean audio for sample 0





Playing poisoned audio for sample 0


Playing clean audio for sample 1


Playing poisoned audio for sample 1


Backdoor Attack Success Rate: 11.166666666666666
Clean Accuracy (after backdoor attack): 97.0
97.66666666666667
97.0
Clean Accuracy Drop: 0.6666666666666714
Running experiment with poisoning_rate=0.01, rate=20, depth=0.001
[NeMo I 2024-11-11 16:16:00 mixins:196] _setup_tokenizer: detected an aggregate tokenizer
[NeMo I 2024-11-11 16:16:00 mixins:330] Tokenizer SentencePieceTokenizer initialized with 32 tokens
[NeMo I 2024-11-11 16:16:00 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:16:00 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:16:00 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:16:00 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:16:00 aggregate_tokenizer:72] Aggregate vocab size: 4128


[NeMo W 2024-11-11 16:16:01 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    batch_size: null
    num_workers: 8
    use_lhotse: true
    max_duration: 40
    pin_memory: true
    use_bucketing: false
    bucket_duration_bins: null
    num_buckets: 1
    text_field: answer
    lang_field: target_lang
    batch_duration: 360
    quadratic_duration: 15
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2024-11-11 16:16:01 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 

[NeMo I 2024-11-11 16:16:01 features:289] PADDING: 0
[NeMo I 2024-11-11 16:16:13 save_restore_connector:249] Model EncDecMultiTaskModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--canary-1b/snapshots/dd32c0c709e2bfc79f583e16b9df4b3a160f7e86/canary-1b.nemo.


  3%|▎         | 9/300 [00:02<01:19,  3.65it/s]

Playing clean audio for sample 0


Playing poisoned audio for sample 0


  9%|▉         | 27/300 [00:07<01:10,  3.88it/s]

Playing clean audio for sample 1


Playing poisoned audio for sample 1


100%|██████████| 300/300 [01:17<00:00,  3.86it/s]


Epoch [1/2], Loss: 2.4284396827717623


100%|██████████| 300/300 [01:16<00:00,  3.90it/s]

Epoch [2/2], Loss: 0.5619525568808118
Playing clean audio for sample 0





Playing poisoned audio for sample 0


Playing clean audio for sample 1


Playing poisoned audio for sample 1


Backdoor Attack Success Rate: 10.5
Clean Accuracy (after backdoor attack): 98.16666666666667
97.66666666666667
98.16666666666667
Clean Accuracy Drop: -0.5
Running experiment with poisoning_rate=0.01, rate=50, depth=0.0005
[NeMo I 2024-11-11 16:19:05 mixins:196] _setup_tokenizer: detected an aggregate tokenizer
[NeMo I 2024-11-11 16:19:05 mixins:330] Tokenizer SentencePieceTokenizer initialized with 32 tokens
[NeMo I 2024-11-11 16:19:05 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:19:05 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:19:05 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:19:05 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:19:05 aggregate_tokenizer:72] Aggregate vocab size: 4128


[NeMo W 2024-11-11 16:19:05 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    batch_size: null
    num_workers: 8
    use_lhotse: true
    max_duration: 40
    pin_memory: true
    use_bucketing: false
    bucket_duration_bins: null
    num_buckets: 1
    text_field: answer
    lang_field: target_lang
    batch_duration: 360
    quadratic_duration: 15
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2024-11-11 16:19:05 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 

[NeMo I 2024-11-11 16:19:05 features:289] PADDING: 0
[NeMo I 2024-11-11 16:19:19 save_restore_connector:249] Model EncDecMultiTaskModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--canary-1b/snapshots/dd32c0c709e2bfc79f583e16b9df4b3a160f7e86/canary-1b.nemo.


  1%|          | 3/300 [00:00<01:31,  3.25it/s]

Playing clean audio for sample 0


Playing poisoned audio for sample 0


  5%|▌         | 15/300 [00:04<01:12,  3.95it/s]

Playing clean audio for sample 1


Playing poisoned audio for sample 1


100%|██████████| 300/300 [01:17<00:00,  3.87it/s]


Epoch [1/2], Loss: 2.5246586619814235


100%|██████████| 300/300 [01:17<00:00,  3.88it/s]

Epoch [2/2], Loss: 0.5519885110730927
Playing clean audio for sample 0





Playing poisoned audio for sample 0


Playing clean audio for sample 1


Playing poisoned audio for sample 1


Backdoor Attack Success Rate: 12.333333333333334
Clean Accuracy (after backdoor attack): 95.83333333333333
97.66666666666667
95.83333333333333
Clean Accuracy Drop: 1.8333333333333428
Running experiment with poisoning_rate=0.01, rate=50, depth=0.001
[NeMo I 2024-11-11 16:22:12 mixins:196] _setup_tokenizer: detected an aggregate tokenizer
[NeMo I 2024-11-11 16:22:12 mixins:330] Tokenizer SentencePieceTokenizer initialized with 32 tokens
[NeMo I 2024-11-11 16:22:12 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:22:12 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:22:12 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:22:12 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:22:12 aggregate_tokenizer:72] Aggregate vocab size: 4128


[NeMo W 2024-11-11 16:22:12 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    batch_size: null
    num_workers: 8
    use_lhotse: true
    max_duration: 40
    pin_memory: true
    use_bucketing: false
    bucket_duration_bins: null
    num_buckets: 1
    text_field: answer
    lang_field: target_lang
    batch_duration: 360
    quadratic_duration: 15
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2024-11-11 16:22:12 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 

[NeMo I 2024-11-11 16:22:12 features:289] PADDING: 0
[NeMo I 2024-11-11 16:22:28 save_restore_connector:249] Model EncDecMultiTaskModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--canary-1b/snapshots/dd32c0c709e2bfc79f583e16b9df4b3a160f7e86/canary-1b.nemo.


  0%|          | 0/300 [00:00<?, ?it/s]

Playing clean audio for sample 0


Playing poisoned audio for sample 0


Playing clean audio for sample 1


Playing poisoned audio for sample 1


100%|██████████| 300/300 [01:18<00:00,  3.81it/s]


Epoch [1/2], Loss: 2.491734709093968


100%|██████████| 300/300 [01:17<00:00,  3.85it/s]

Epoch [2/2], Loss: 0.5758270233372847
Playing clean audio for sample 0





Playing poisoned audio for sample 0


Playing clean audio for sample 1


Playing poisoned audio for sample 1


Backdoor Attack Success Rate: 13.666666666666666
Clean Accuracy (after backdoor attack): 98.5
97.66666666666667
98.5
Clean Accuracy Drop: -0.8333333333333286
Running experiment with poisoning_rate=0.05, rate=5, depth=0.0005
[NeMo I 2024-11-11 16:25:23 mixins:196] _setup_tokenizer: detected an aggregate tokenizer
[NeMo I 2024-11-11 16:25:23 mixins:330] Tokenizer SentencePieceTokenizer initialized with 32 tokens
[NeMo I 2024-11-11 16:25:23 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:25:23 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:25:23 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:25:23 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:25:23 aggregate_tokenizer:72] Aggregate vocab size: 4128


[NeMo W 2024-11-11 16:25:23 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    batch_size: null
    num_workers: 8
    use_lhotse: true
    max_duration: 40
    pin_memory: true
    use_bucketing: false
    bucket_duration_bins: null
    num_buckets: 1
    text_field: answer
    lang_field: target_lang
    batch_duration: 360
    quadratic_duration: 15
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2024-11-11 16:25:23 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 

[NeMo I 2024-11-11 16:25:23 features:289] PADDING: 0
[NeMo I 2024-11-11 16:25:36 save_restore_connector:249] Model EncDecMultiTaskModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--canary-1b/snapshots/dd32c0c709e2bfc79f583e16b9df4b3a160f7e86/canary-1b.nemo.


  0%|          | 1/300 [00:00<01:46,  2.80it/s]

Playing clean audio for sample 0


Playing poisoned audio for sample 0


  3%|▎         | 8/300 [00:02<01:23,  3.51it/s]

Playing clean audio for sample 1


Playing poisoned audio for sample 1


100%|██████████| 300/300 [01:18<00:00,  3.84it/s]


Epoch [1/2], Loss: 2.622359132071336


100%|██████████| 300/300 [01:17<00:00,  3.86it/s]

Epoch [2/2], Loss: 0.7568297309925159
Playing clean audio for sample 0





Playing poisoned audio for sample 0


Playing clean audio for sample 1


Playing poisoned audio for sample 1


Backdoor Attack Success Rate: 11.5
Clean Accuracy (after backdoor attack): 95.33333333333333
97.66666666666667
95.33333333333333
Clean Accuracy Drop: 2.333333333333343
Running experiment with poisoning_rate=0.05, rate=5, depth=0.001
[NeMo I 2024-11-11 16:28:30 mixins:196] _setup_tokenizer: detected an aggregate tokenizer
[NeMo I 2024-11-11 16:28:30 mixins:330] Tokenizer SentencePieceTokenizer initialized with 32 tokens
[NeMo I 2024-11-11 16:28:30 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:28:30 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:28:30 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:28:30 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:28:30 aggregate_tokenizer:72] Aggregate vocab size: 4128


[NeMo W 2024-11-11 16:28:30 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    batch_size: null
    num_workers: 8
    use_lhotse: true
    max_duration: 40
    pin_memory: true
    use_bucketing: false
    bucket_duration_bins: null
    num_buckets: 1
    text_field: answer
    lang_field: target_lang
    batch_duration: 360
    quadratic_duration: 15
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2024-11-11 16:28:30 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 

[NeMo I 2024-11-11 16:28:30 features:289] PADDING: 0
[NeMo I 2024-11-11 16:28:43 save_restore_connector:249] Model EncDecMultiTaskModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--canary-1b/snapshots/dd32c0c709e2bfc79f583e16b9df4b3a160f7e86/canary-1b.nemo.


  0%|          | 0/300 [00:00<?, ?it/s]

Playing clean audio for sample 0


Playing poisoned audio for sample 0


  1%|          | 3/300 [00:00<01:29,  3.30it/s]

Playing clean audio for sample 1


Playing poisoned audio for sample 1


100%|██████████| 300/300 [01:18<00:00,  3.81it/s]


Epoch [1/2], Loss: 2.604331388970216


100%|██████████| 300/300 [01:17<00:00,  3.86it/s]

Epoch [2/2], Loss: 0.7492173971484105
Playing clean audio for sample 0





Playing poisoned audio for sample 0


Playing clean audio for sample 1


Playing poisoned audio for sample 1


Backdoor Attack Success Rate: 11.333333333333334
Clean Accuracy (after backdoor attack): 98.33333333333333
97.66666666666667
98.33333333333333
Clean Accuracy Drop: -0.6666666666666572
Running experiment with poisoning_rate=0.05, rate=20, depth=0.0005
[NeMo I 2024-11-11 16:31:37 mixins:196] _setup_tokenizer: detected an aggregate tokenizer
[NeMo I 2024-11-11 16:31:37 mixins:330] Tokenizer SentencePieceTokenizer initialized with 32 tokens
[NeMo I 2024-11-11 16:31:37 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:31:37 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:31:37 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:31:37 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:31:37 aggregate_tokenizer:72] Aggregate vocab size: 4128


[NeMo W 2024-11-11 16:31:37 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    batch_size: null
    num_workers: 8
    use_lhotse: true
    max_duration: 40
    pin_memory: true
    use_bucketing: false
    bucket_duration_bins: null
    num_buckets: 1
    text_field: answer
    lang_field: target_lang
    batch_duration: 360
    quadratic_duration: 15
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2024-11-11 16:31:37 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 

[NeMo I 2024-11-11 16:31:37 features:289] PADDING: 0
[NeMo I 2024-11-11 16:31:52 save_restore_connector:249] Model EncDecMultiTaskModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--canary-1b/snapshots/dd32c0c709e2bfc79f583e16b9df4b3a160f7e86/canary-1b.nemo.


  0%|          | 0/300 [00:00<?, ?it/s]

Playing clean audio for sample 0


Playing poisoned audio for sample 0


  2%|▏         | 5/300 [00:01<01:25,  3.45it/s]

Playing clean audio for sample 1


Playing poisoned audio for sample 1


100%|██████████| 300/300 [01:18<00:00,  3.82it/s]


Epoch [1/2], Loss: 2.572747861544291


100%|██████████| 300/300 [01:17<00:00,  3.85it/s]

Epoch [2/2], Loss: 0.7379497626175483
Playing clean audio for sample 0





Playing poisoned audio for sample 0


Playing clean audio for sample 1


Playing poisoned audio for sample 1


Backdoor Attack Success Rate: 11.333333333333334
Clean Accuracy (after backdoor attack): 97.33333333333333
97.66666666666667
97.33333333333333
Clean Accuracy Drop: 0.3333333333333428
Running experiment with poisoning_rate=0.05, rate=20, depth=0.001
[NeMo I 2024-11-11 16:34:45 mixins:196] _setup_tokenizer: detected an aggregate tokenizer
[NeMo I 2024-11-11 16:34:45 mixins:330] Tokenizer SentencePieceTokenizer initialized with 32 tokens
[NeMo I 2024-11-11 16:34:45 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:34:45 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:34:45 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:34:45 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:34:45 aggregate_tokenizer:72] Aggregate vocab size: 4128


[NeMo W 2024-11-11 16:34:46 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    batch_size: null
    num_workers: 8
    use_lhotse: true
    max_duration: 40
    pin_memory: true
    use_bucketing: false
    bucket_duration_bins: null
    num_buckets: 1
    text_field: answer
    lang_field: target_lang
    batch_duration: 360
    quadratic_duration: 15
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2024-11-11 16:34:46 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 

[NeMo I 2024-11-11 16:34:46 features:289] PADDING: 0
[NeMo I 2024-11-11 16:34:59 save_restore_connector:249] Model EncDecMultiTaskModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--canary-1b/snapshots/dd32c0c709e2bfc79f583e16b9df4b3a160f7e86/canary-1b.nemo.


  1%|          | 2/300 [00:00<01:42,  2.92it/s]

Playing clean audio for sample 0


Playing poisoned audio for sample 0


  3%|▎         | 8/300 [00:02<01:15,  3.87it/s]

Playing clean audio for sample 1


Playing poisoned audio for sample 1


100%|██████████| 300/300 [01:18<00:00,  3.84it/s]


Epoch [1/2], Loss: 2.579129634698232


100%|██████████| 300/300 [01:17<00:00,  3.87it/s]

Epoch [2/2], Loss: 0.7100046352048714
Playing clean audio for sample 0





Playing poisoned audio for sample 0


Playing clean audio for sample 1


Playing poisoned audio for sample 1


Backdoor Attack Success Rate: 16.833333333333332
Clean Accuracy (after backdoor attack): 96.66666666666667
97.66666666666667
96.66666666666667
Clean Accuracy Drop: 1.0
Running experiment with poisoning_rate=0.05, rate=50, depth=0.0005
[NeMo I 2024-11-11 16:37:51 mixins:196] _setup_tokenizer: detected an aggregate tokenizer
[NeMo I 2024-11-11 16:37:51 mixins:330] Tokenizer SentencePieceTokenizer initialized with 32 tokens
[NeMo I 2024-11-11 16:37:51 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:37:51 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:37:51 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:37:51 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:37:51 aggregate_tokenizer:72] Aggregate vocab size: 4128


[NeMo W 2024-11-11 16:37:52 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    batch_size: null
    num_workers: 8
    use_lhotse: true
    max_duration: 40
    pin_memory: true
    use_bucketing: false
    bucket_duration_bins: null
    num_buckets: 1
    text_field: answer
    lang_field: target_lang
    batch_duration: 360
    quadratic_duration: 15
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2024-11-11 16:37:52 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 

[NeMo I 2024-11-11 16:37:52 features:289] PADDING: 0
[NeMo I 2024-11-11 16:38:05 save_restore_connector:249] Model EncDecMultiTaskModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--canary-1b/snapshots/dd32c0c709e2bfc79f583e16b9df4b3a160f7e86/canary-1b.nemo.


  2%|▏         | 6/300 [00:01<01:19,  3.71it/s]

Playing clean audio for sample 0


Playing poisoned audio for sample 0


  2%|▏         | 7/300 [00:02<01:18,  3.75it/s]

Playing clean audio for sample 1


Playing poisoned audio for sample 1


100%|██████████| 300/300 [01:18<00:00,  3.84it/s]


Epoch [1/2], Loss: 2.6000929196178912


100%|██████████| 300/300 [01:17<00:00,  3.86it/s]

Epoch [2/2], Loss: 0.6618037871271372
Playing clean audio for sample 0





Playing poisoned audio for sample 0


Playing clean audio for sample 1


Playing poisoned audio for sample 1


Backdoor Attack Success Rate: 51.666666666666664
Clean Accuracy (after backdoor attack): 97.66666666666667
97.66666666666667
97.66666666666667
Clean Accuracy Drop: 0.0
Running experiment with poisoning_rate=0.05, rate=50, depth=0.001
[NeMo I 2024-11-11 16:40:58 mixins:196] _setup_tokenizer: detected an aggregate tokenizer
[NeMo I 2024-11-11 16:40:58 mixins:330] Tokenizer SentencePieceTokenizer initialized with 32 tokens
[NeMo I 2024-11-11 16:40:58 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:40:58 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:40:58 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:40:58 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:40:58 aggregate_tokenizer:72] Aggregate vocab size: 4128


[NeMo W 2024-11-11 16:40:58 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    batch_size: null
    num_workers: 8
    use_lhotse: true
    max_duration: 40
    pin_memory: true
    use_bucketing: false
    bucket_duration_bins: null
    num_buckets: 1
    text_field: answer
    lang_field: target_lang
    batch_duration: 360
    quadratic_duration: 15
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2024-11-11 16:40:58 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 

[NeMo I 2024-11-11 16:40:58 features:289] PADDING: 0
[NeMo I 2024-11-11 16:41:11 save_restore_connector:249] Model EncDecMultiTaskModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--canary-1b/snapshots/dd32c0c709e2bfc79f583e16b9df4b3a160f7e86/canary-1b.nemo.


  1%|          | 3/300 [00:01<01:37,  3.04it/s]

Playing clean audio for sample 0


Playing poisoned audio for sample 0


  1%|▏         | 4/300 [00:01<01:31,  3.23it/s]

Playing clean audio for sample 1


Playing poisoned audio for sample 1


100%|██████████| 300/300 [01:18<00:00,  3.81it/s]


Epoch [1/2], Loss: 2.510408369998137


100%|██████████| 300/300 [01:17<00:00,  3.86it/s]

Epoch [2/2], Loss: 0.6389330545440316
Playing clean audio for sample 0





Playing poisoned audio for sample 0


Playing clean audio for sample 1


Playing poisoned audio for sample 1


Backdoor Attack Success Rate: 87.0
Clean Accuracy (after backdoor attack): 97.33333333333333
97.66666666666667
97.33333333333333
Clean Accuracy Drop: 0.3333333333333428
Running experiment with poisoning_rate=0.1, rate=5, depth=0.0005
[NeMo I 2024-11-11 16:44:05 mixins:196] _setup_tokenizer: detected an aggregate tokenizer
[NeMo I 2024-11-11 16:44:05 mixins:330] Tokenizer SentencePieceTokenizer initialized with 32 tokens
[NeMo I 2024-11-11 16:44:05 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:44:05 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:44:05 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:44:05 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:44:05 aggregate_tokenizer:72] Aggregate vocab size: 4128


[NeMo W 2024-11-11 16:44:05 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    batch_size: null
    num_workers: 8
    use_lhotse: true
    max_duration: 40
    pin_memory: true
    use_bucketing: false
    bucket_duration_bins: null
    num_buckets: 1
    text_field: answer
    lang_field: target_lang
    batch_duration: 360
    quadratic_duration: 15
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2024-11-11 16:44:05 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 

[NeMo I 2024-11-11 16:44:05 features:289] PADDING: 0
[NeMo I 2024-11-11 16:44:18 save_restore_connector:249] Model EncDecMultiTaskModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--canary-1b/snapshots/dd32c0c709e2bfc79f583e16b9df4b3a160f7e86/canary-1b.nemo.


  0%|          | 1/300 [00:00<01:51,  2.69it/s]

Playing clean audio for sample 0


Playing poisoned audio for sample 0


  1%|          | 2/300 [00:00<01:48,  2.74it/s]

Playing clean audio for sample 1


Playing poisoned audio for sample 1


100%|██████████| 300/300 [01:17<00:00,  3.85it/s]


Epoch [1/2], Loss: 2.7558138558268546


100%|██████████| 300/300 [01:18<00:00,  3.84it/s]

Epoch [2/2], Loss: 0.9184974762549003
Playing clean audio for sample 0





Playing poisoned audio for sample 0


Playing clean audio for sample 1


Playing poisoned audio for sample 1


Backdoor Attack Success Rate: 13.5
Clean Accuracy (after backdoor attack): 94.5
97.66666666666667
94.5
Clean Accuracy Drop: 3.1666666666666714
Running experiment with poisoning_rate=0.1, rate=5, depth=0.001
[NeMo I 2024-11-11 16:47:11 mixins:196] _setup_tokenizer: detected an aggregate tokenizer
[NeMo I 2024-11-11 16:47:11 mixins:330] Tokenizer SentencePieceTokenizer initialized with 32 tokens
[NeMo I 2024-11-11 16:47:11 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:47:11 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:47:11 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:47:11 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:47:11 aggregate_tokenizer:72] Aggregate vocab size: 4128


[NeMo W 2024-11-11 16:47:12 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    batch_size: null
    num_workers: 8
    use_lhotse: true
    max_duration: 40
    pin_memory: true
    use_bucketing: false
    bucket_duration_bins: null
    num_buckets: 1
    text_field: answer
    lang_field: target_lang
    batch_duration: 360
    quadratic_duration: 15
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2024-11-11 16:47:12 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 

[NeMo I 2024-11-11 16:47:12 features:289] PADDING: 0
[NeMo I 2024-11-11 16:47:24 save_restore_connector:249] Model EncDecMultiTaskModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--canary-1b/snapshots/dd32c0c709e2bfc79f583e16b9df4b3a160f7e86/canary-1b.nemo.


  0%|          | 1/300 [00:00<01:44,  2.85it/s]

Playing clean audio for sample 0


Playing poisoned audio for sample 0


Playing clean audio for sample 1


Playing poisoned audio for sample 1


100%|██████████| 300/300 [01:18<00:00,  3.84it/s]


Epoch [1/2], Loss: 2.6949610966444015


100%|██████████| 300/300 [01:17<00:00,  3.86it/s]

Epoch [2/2], Loss: 0.8989369083444277
Playing clean audio for sample 0





Playing poisoned audio for sample 0


Playing clean audio for sample 1


Playing poisoned audio for sample 1


Backdoor Attack Success Rate: 11.666666666666666
Clean Accuracy (after backdoor attack): 97.5
97.66666666666667
97.5
Clean Accuracy Drop: 0.1666666666666714
Running experiment with poisoning_rate=0.1, rate=20, depth=0.0005
[NeMo I 2024-11-11 16:50:18 mixins:196] _setup_tokenizer: detected an aggregate tokenizer
[NeMo I 2024-11-11 16:50:18 mixins:330] Tokenizer SentencePieceTokenizer initialized with 32 tokens
[NeMo I 2024-11-11 16:50:18 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:50:18 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:50:18 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:50:18 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:50:18 aggregate_tokenizer:72] Aggregate vocab size: 4128


[NeMo W 2024-11-11 16:50:18 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    batch_size: null
    num_workers: 8
    use_lhotse: true
    max_duration: 40
    pin_memory: true
    use_bucketing: false
    bucket_duration_bins: null
    num_buckets: 1
    text_field: answer
    lang_field: target_lang
    batch_duration: 360
    quadratic_duration: 15
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2024-11-11 16:50:18 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 

[NeMo I 2024-11-11 16:50:18 features:289] PADDING: 0
[NeMo I 2024-11-11 16:50:31 save_restore_connector:249] Model EncDecMultiTaskModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--canary-1b/snapshots/dd32c0c709e2bfc79f583e16b9df4b3a160f7e86/canary-1b.nemo.


  0%|          | 1/300 [00:00<01:48,  2.75it/s]

Playing clean audio for sample 0


Playing poisoned audio for sample 0


  1%|▏         | 4/300 [00:01<01:24,  3.50it/s]

Playing clean audio for sample 1


Playing poisoned audio for sample 1


100%|██████████| 300/300 [01:17<00:00,  3.85it/s]


Epoch [1/2], Loss: 2.721392181813717


100%|██████████| 300/300 [01:18<00:00,  3.83it/s]

Epoch [2/2], Loss: 0.9003438047071298
Playing clean audio for sample 0





Playing poisoned audio for sample 0


Playing clean audio for sample 1


Playing poisoned audio for sample 1


Backdoor Attack Success Rate: 11.666666666666666
Clean Accuracy (after backdoor attack): 95.66666666666667
97.66666666666667
95.66666666666667
Clean Accuracy Drop: 2.0
Running experiment with poisoning_rate=0.1, rate=20, depth=0.001
[NeMo I 2024-11-11 16:53:25 mixins:196] _setup_tokenizer: detected an aggregate tokenizer
[NeMo I 2024-11-11 16:53:25 mixins:330] Tokenizer SentencePieceTokenizer initialized with 32 tokens
[NeMo I 2024-11-11 16:53:25 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:53:25 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:53:25 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:53:25 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:53:25 aggregate_tokenizer:72] Aggregate vocab size: 4128


[NeMo W 2024-11-11 16:53:25 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    batch_size: null
    num_workers: 8
    use_lhotse: true
    max_duration: 40
    pin_memory: true
    use_bucketing: false
    bucket_duration_bins: null
    num_buckets: 1
    text_field: answer
    lang_field: target_lang
    batch_duration: 360
    quadratic_duration: 15
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2024-11-11 16:53:25 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 

[NeMo I 2024-11-11 16:53:25 features:289] PADDING: 0
[NeMo I 2024-11-11 16:53:38 save_restore_connector:249] Model EncDecMultiTaskModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--canary-1b/snapshots/dd32c0c709e2bfc79f583e16b9df4b3a160f7e86/canary-1b.nemo.


  0%|          | 0/300 [00:00<?, ?it/s]

Playing clean audio for sample 0


Playing poisoned audio for sample 0


Playing clean audio for sample 1


Playing poisoned audio for sample 1


100%|██████████| 300/300 [01:18<00:00,  3.84it/s]


Epoch [1/2], Loss: 2.679646362264951


100%|██████████| 300/300 [01:18<00:00,  3.83it/s]

Epoch [2/2], Loss: 0.7578109408915042
Playing clean audio for sample 0





Playing poisoned audio for sample 0


Playing clean audio for sample 1


Playing poisoned audio for sample 1


Backdoor Attack Success Rate: 75.0
Clean Accuracy (after backdoor attack): 97.5
97.66666666666667
97.5
Clean Accuracy Drop: 0.1666666666666714
Running experiment with poisoning_rate=0.1, rate=50, depth=0.0005
[NeMo I 2024-11-11 16:56:32 mixins:196] _setup_tokenizer: detected an aggregate tokenizer
[NeMo I 2024-11-11 16:56:32 mixins:330] Tokenizer SentencePieceTokenizer initialized with 32 tokens
[NeMo I 2024-11-11 16:56:32 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:56:32 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:56:32 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:56:32 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:56:32 aggregate_tokenizer:72] Aggregate vocab size: 4128


[NeMo W 2024-11-11 16:56:33 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    batch_size: null
    num_workers: 8
    use_lhotse: true
    max_duration: 40
    pin_memory: true
    use_bucketing: false
    bucket_duration_bins: null
    num_buckets: 1
    text_field: answer
    lang_field: target_lang
    batch_duration: 360
    quadratic_duration: 15
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2024-11-11 16:56:33 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 

[NeMo I 2024-11-11 16:56:33 features:289] PADDING: 0
[NeMo I 2024-11-11 16:56:45 save_restore_connector:249] Model EncDecMultiTaskModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--canary-1b/snapshots/dd32c0c709e2bfc79f583e16b9df4b3a160f7e86/canary-1b.nemo.


  0%|          | 0/300 [00:00<?, ?it/s]

Playing clean audio for sample 0


Playing poisoned audio for sample 0


  0%|          | 1/300 [00:00<01:33,  3.18it/s]

Playing clean audio for sample 1


Playing poisoned audio for sample 1


100%|██████████| 300/300 [01:18<00:00,  3.83it/s]


Epoch [1/2], Loss: 2.6943924250205358


100%|██████████| 300/300 [01:17<00:00,  3.86it/s]

Epoch [2/2], Loss: 0.6669776458541552
Playing clean audio for sample 0





Playing poisoned audio for sample 0


Playing clean audio for sample 1


Playing poisoned audio for sample 1


Backdoor Attack Success Rate: 75.16666666666667
Clean Accuracy (after backdoor attack): 97.0
97.66666666666667
97.0
Clean Accuracy Drop: 0.6666666666666714
Running experiment with poisoning_rate=0.1, rate=50, depth=0.001
[NeMo I 2024-11-11 16:59:38 mixins:196] _setup_tokenizer: detected an aggregate tokenizer
[NeMo I 2024-11-11 16:59:38 mixins:330] Tokenizer SentencePieceTokenizer initialized with 32 tokens
[NeMo I 2024-11-11 16:59:38 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:59:38 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:59:38 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:59:38 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-11-11 16:59:38 aggregate_tokenizer:72] Aggregate vocab size: 4128


[NeMo W 2024-11-11 16:59:39 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    batch_size: null
    num_workers: 8
    use_lhotse: true
    max_duration: 40
    pin_memory: true
    use_bucketing: false
    bucket_duration_bins: null
    num_buckets: 1
    text_field: answer
    lang_field: target_lang
    batch_duration: 360
    quadratic_duration: 15
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2024-11-11 16:59:39 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 

[NeMo I 2024-11-11 16:59:39 features:289] PADDING: 0
[NeMo I 2024-11-11 16:59:51 save_restore_connector:249] Model EncDecMultiTaskModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--canary-1b/snapshots/dd32c0c709e2bfc79f583e16b9df4b3a160f7e86/canary-1b.nemo.


  0%|          | 1/300 [00:00<01:58,  2.52it/s]

Playing clean audio for sample 0


Playing poisoned audio for sample 0


Playing clean audio for sample 1


Playing poisoned audio for sample 1


100%|██████████| 300/300 [01:18<00:00,  3.81it/s]


Epoch [1/2], Loss: 2.553635991215706


100%|██████████| 300/300 [01:17<00:00,  3.86it/s]

Epoch [2/2], Loss: 0.5977926581849654
Playing clean audio for sample 0





Playing poisoned audio for sample 0


Playing clean audio for sample 1


Playing poisoned audio for sample 1


Backdoor Attack Success Rate: 80.0
Clean Accuracy (after backdoor attack): 97.0
97.66666666666667
97.0
Clean Accuracy Drop: 0.6666666666666714


In [24]:
results_df = pd.DataFrame(results)

In [25]:
print(results_df)

    poisoning_rate  rate   depth  backdoor_success_rate  clean_accuracy_after  \
0             0.01     5  0.0005              11.166667             95.833333   
1             0.01     5  0.0010              11.500000             98.666667   
2             0.01    20  0.0005              11.166667             97.000000   
3             0.01    20  0.0010              10.500000             98.166667   
4             0.01    50  0.0005              12.333333             95.833333   
5             0.01    50  0.0010              13.666667             98.500000   
6             0.05     5  0.0005              11.500000             95.333333   
7             0.05     5  0.0010              11.333333             98.333333   
8             0.05    20  0.0005              11.333333             97.333333   
9             0.05    20  0.0010              16.833333             96.666667   
10            0.05    50  0.0005              51.666667             97.666667   
11            0.05    50  0.

In [26]:
results_df.to_csv('MAMBA-SD-BKDR-HFSoundReverb.csv', sep='\t', index=False)