# ASR DeepSpeech Examples

This notebook demonstrates how to use the DeepSpeech estimator in ART and how to run the imperceptible ASR attack against that estimator.

---


## Preliminaries

In [1]:
import os

import torch
import numpy as np
import IPython.display as ipd
import matplotlib.pyplot as plt
from deepspeech_pytorch.loader.data_loader import load_audio

from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech
from art.attacks.evasion.imperceptible_asr.imperceptible_asr_pytorch import ImperceptibleASRPytorch
from art.config import ART_DATA_PATH
from art.utils import get_file


# Seed every random number generator the notebook relies on so that repeated
# runs are comparable. Seeding NumPy alone is not enough: the attack below is
# optimised with PyTorch optimizers, and PyTorch keeps its own generator.
SEED = 1234
torch.manual_seed(SEED)
np.random.seed(SEED)

## Audio Data

### Download Data

In [2]:
# Prepare to download data
# Directory in which the audio data used by this notebook is stored
data_dir = os.path.join(ART_DATA_PATH, "deepspeech_audio")
# Remember the working directory so that it can be restored after the optional
# download steps below have changed into `data_dir`
current_dir = os.getcwd()

# `exist_ok=True` creates the directory only when it is missing, without a
# separate existence check that another process could race against
os.makedirs(data_dir, exist_ok=True)

# Download audio data
# The steps below are disabled. Uncomment them to fetch the `librispeech.py`
# helper script into `data_dir` and run it for `test-clean.tar.gz`. The later
# cells read their samples from `LibriSpeech_dataset/test_clean` under
# `data_dir` (presumably the layout produced by that script - verify).
#get_file('librispeech.py', 'https://raw.githubusercontent.com/SeanNaren/deepspeech.pytorch/master/data/librispeech.py', path=data_dir)

#%cd $data_dir
#!python librispeech.py --files-to-use test-clean.tar.gz
#%cd $current_dir

### Create Model and Data Utilities

In [None]:
# The deepspeech estimator
# Created with the "librispeech" pretrained-model option; the cells below use it
# both to transcribe audio and as the estimator handed to the attack.
speech_recognizer = PyTorchDeepSpeech(pretrained_model="librispeech")

In [None]:
def display_waveform(waveform, title="", sample_rate=16000):
    """
    Plot a waveform on a new figure and show an audio player for it.

    :param waveform: Audio samples to plot and to play back.
    :param title: Title drawn above the plot.
    :param sample_rate: Rate passed to the audio player.
    """
    figure = plt.figure()
    axes = figure.gca()
    axes.set_title(title)
    axes.plot(waveform)

    player = ipd.Audio(waveform, rate=sample_rate)
    ipd.display(player)

In [None]:
# Lookup table from each label of the model to its position in the label list
labels_map = {label: index for index, label in enumerate(speech_recognizer.model.labels)}
def parse_transcript(path, label_to_index=None):
    """
    Read a transcript file and encode its characters as label indices.

    :param path: Path to a UTF-8 text file containing the transcript.
    :param label_to_index: Optional mapping from character to label index. When omitted, the
                           notebook-level `labels_map` is used.
    :return: Tuple `(transcript, encoded)`. `transcript` is the file content with newlines
             removed; `encoded` lists the index of every character found in the mapping, in
             order. Characters that are not in the mapping are skipped.
    """
    mapping = labels_map if label_to_index is None else label_to_index

    with open(path, 'r', encoding='utf8') as f:
        transcript = f.read().replace('\n', '')

    # Filter on membership rather than truthiness: `filter(None, ...)` would also discard the
    # valid index 0 together with the characters that are missing from the mapping.
    encoded = [mapping[char] for char in transcript if char in mapping]
    return transcript, encoded

### Play with Some Audio Samples

In [None]:
# A long audio
# Load the recording and its transcript; `parse_transcript` returns the text
# together with the label indices of its characters.
x1 = load_audio(os.path.join(data_dir, "LibriSpeech_dataset/test_clean/wav/1089-134686-0000.wav"))
label1, encoded_label1 = parse_transcript(os.path.join(data_dir, "LibriSpeech_dataset/test_clean/txt/1089-134686-0000.txt"))
print("Encoded label: ", encoded_label1)
print("Groundtrue label: ", label1)
# Plot the waveform and show an audio player for it
display_waveform(x1, title="Long Sample")

In [None]:
# A short audio
# Load the recording and its transcript; `parse_transcript` returns the text
# together with the label indices of its characters.
x2 = load_audio(os.path.join(data_dir, "LibriSpeech_dataset/test_clean/wav/1089-134691-0003.wav"))
label2, encoded_label2 = parse_transcript(os.path.join(data_dir, "LibriSpeech_dataset/test_clean/txt/1089-134691-0003.txt"))
print("Encoded label: ", encoded_label2)
print("Groundtrue label: ", label2)
# Plot the waveform and show an audio player for it
display_waveform(x2, title="Short Sample")

In [None]:
# Another short audio
# Load the recording and its transcript; `parse_transcript` returns the text
# together with the label indices of its characters.
x3 = load_audio(os.path.join(data_dir, "LibriSpeech_dataset/test_clean/wav/1089-134691-0018.wav"))
label3, encoded_label3 = parse_transcript(os.path.join(data_dir, "LibriSpeech_dataset/test_clean/txt/1089-134691-0018.txt"))
print("Encoded label: ", encoded_label3)
print("Groundtrue label: ", label3)
# Plot the waveform and show an audio player for it
display_waveform(x3, title="Short Sample")

## The Estimator Performance

### Get Transcription Outputs

In [None]:
# Run the estimator on the long sample alone (a batch of one) and print the
# result next to the transcript read from disk for comparison.
pred1 = speech_recognizer.predict(np.array([x1]), transcription_output=True)
print("Groundtrue label: ", label1)
print("Predicted  label: ", pred1[0])

In [None]:
# Run the estimator on the first short sample alone (a batch of one) and print
# the result next to the transcript read from disk for comparison.
pred2 = speech_recognizer.predict(np.array([x2]), transcription_output=True)
print("Groundtrue label: ", label2)
print("Predicted  label: ", pred2[0])

In [None]:
# Run the estimator on the second short sample alone (a batch of one) and print
# the result next to the transcript read from disk for comparison.
pred3 = speech_recognizer.predict(np.array([x3]), transcription_output=True)
print("Groundtrue label: ", label3)
print("Predicted  label: ", pred3[0])

In [None]:
# Batch the three recordings. They do not all have the same length (one long
# and two short samples), so they cannot form a regular 2-D array: NumPy 1.24+
# raises a ValueError for such ragged input unless `dtype=object` is given.
# The result is a 1-D object array holding one waveform per element.
x = np.array([x1, x2, x3], dtype=object)
pred_all = speech_recognizer.predict(x, transcription_output=True)
print("Predicted  labels: ", pred_all)

## ASR Attack on the Estimator

In [None]:
# Number of samples in the longest of the three recordings
global_max_length = int(max(len(x1), len(x2), len(x3)))

# Create attack
asr_attack = ImperceptibleASRPytorch(
    estimator=speech_recognizer,
    global_max_length=global_max_length,
    batch_size=2,
    initial_eps=0.0005,
    # Settings named after the 1st stage
    max_iter_1st_stage=250,
    learning_rate_1st_stage=1e-6,
    optimizer_1st_stage=torch.optim.SGD,
    # Settings named after the 2nd stage
    max_iter_2nd_stage=50,
    learning_rate_2nd_stage=1e-7,
    optimizer_2nd_stage=torch.optim.SGD,
    # Rescale schedule
    initial_rescale=1.0,
    rescale_factor=0.8,
    num_iter_adjust_rescale=20,
    # Alpha schedule
    initial_alpha=0.01,
    increase_factor_alpha=1.2,
    num_iter_increase_alpha=20,
    decrease_factor_alpha=0.8,
    num_iter_decrease_alpha=20,
    # Mixed-precision options
    use_amp=True,
    opt_level="O1",
    loss_scale=1,
)

In [None]:
# Attack labels: start from the original transcripts and edit each into a target
y = np.array([label1, label2, label3])

# First target: overwrite the final character with 'F'
target_chars = list(y[0])
target_chars[-1] = 'F'
y[0] = "".join(target_chars)

# Second target: remove the final character, then overwrite the last two
# remaining characters with 'A' and 'L'
target_chars = list(y[1])
target_chars.pop()
target_chars[-1] = 'L'
target_chars[-2] = 'A'
y[1] = "".join(target_chars)

# Third target: remove the first character, then the character at index 5 of
# what is left
target_chars = list(y[2])
target_chars.pop(0)
target_chars.pop(5)
y[2] = "".join(target_chars)

# Generate attack (only the last sample and its target are passed in)
x_adv = asr_attack.generate(x[2:], y[2:])

In [None]:
# Run the estimator on the adversarial audio returned by the attack
adv_transcriptions = speech_recognizer.predict(x_adv, batch_size=2, transcription_output=True)

In [None]:
# Show what the estimator outputs for the adversarial audio
adv_transcriptions

In [None]:
# Show the target used for the attack. The attack was generated for `x[2:]` with
# `y[2:]`, so the same slice is displayed here to line up one-to-one with
# `adv_transcriptions` (`y[1:]` would also list a target that was never attacked).
y[2:]