In [1]:
import os
import torch
import numpy as np
import IPython.display as ipd
import matplotlib.pyplot as plt
from deepspeech_pytorch.loader.data_loader import load_audio

from art.estimators.speech_recognition import PyTorchDeepSpeech
from art.attacks.evasion.imperceptible_asr.imperceptible_asr_pytorch import ImperceptibleASRPyTorch
from art import config
from art.utils import get_file

# Seed NumPy's global random number generator so NumPy-based randomness is repeatable.
# NOTE(review): only NumPy is seeded in this cell; torch is imported above but no torch
# seed is set anywhere in the visible cells - confirm whether that matters here.
np.random.seed(1234)

################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################



In [2]:
# Create a DeepSpeech estimator.
# Instantiates ART's PyTorchDeepSpeech wrapper, requesting the pretrained model named
# "librispeech". Later cells use this object for the label list
# (`speech_recognizer.model.labels`) and for transcription (`speech_recognizer.predict`).
# NOTE(review): presumably this loads (and possibly downloads) pretrained weights - confirm.
speech_recognizer = PyTorchDeepSpeech(pretrained_model="librispeech")

  stream(template_mgs % msg_args)


In [3]:
def display_waveform(waveform, title="", sample_rate=16000):
    """
    Plot a waveform, save the plot as a PDF and show an audio player for it.

    :param waveform: Samples to plot and to hand to the audio widget.
    :param title: Title drawn on the plot; it is also the stem of the saved file,
        which is written to ``<title>.pdf`` relative to the working directory.
    :param sample_rate: Playback rate passed to the audio widget.
    """
    figure, axes = plt.subplots()
    axes.set_title(title)
    axes.plot(waveform)
    # The file name is derived from the title, so an empty title yields ".pdf".
    figure.savefig('{}.pdf'.format(title))

    player = ipd.Audio(waveform, rate=sample_rate)
    ipd.display(player)
# Lookup table from each model label to its position in the model's label list.
labels_map = {label: index for index, label in enumerate(speech_recognizer.model.labels)}
def parse_transcript(path):
    """
    Read a transcript file and encode its characters with `labels_map`.

    :param path: Path of a UTF-8 text file holding the transcript.
    :return: Tuple ``(transcript, result)``: the file's text with newline characters
        removed, and the list of label indices found for its characters.
    """
    with open(path, 'r', encoding='utf8') as handle:
        raw_text = handle.read()
    transcript = raw_text.replace('\n', '')

    result = []
    for character in transcript:
        index = labels_map.get(character)
        # Keep only truthy lookups: this drops characters that are absent from
        # `labels_map` (None) and also any label that is mapped to index 0.
        if index:
            result.append(index)
    return transcript, result

In [29]:
# Transcribe a set of utterances with the DeepSpeech estimator, print the ground
# truth next to the prediction, and store every prediction as a text file.
#
# `name_1_list` holds utterance-id prefixes; `name_2_list[i]` is how many utterances
# (numbered 0000 .. N-1, zero-padded to four digits) are processed for `name_1_list[i]`.
# name_1_list = ['61-70986', '61-70970', '121-121726', '121-123852', '121-123859', '121-127105']
name_1_list = ["1089-134686", "1089-134691", "1188-133604"]
name_2_list = [38, 24, 38]
cur_path = os.getcwd()

# All inputs and outputs live under this directory (relative to the working directory).
dataset_dir = 'deepspeech_audio/LibriSpeech_dataset/test_clean'

# Create the output directory up front so a missing folder does not abort the run
# after the model has already produced a prediction.
pred_dir = '{}/pred_txt'.format(dataset_dir)
os.makedirs(pred_dir, exist_ok=True)

for i, name_1 in enumerate(name_1_list):
    for name_2 in range(name_2_list[i]):
        # Shared "<prefix>-<4-digit index>" stem of the wav, txt and prediction files.
        utterance_id = '{}-{}'.format(name_1, str(name_2).zfill(4))

        audio_path = '{}/wav/{}.wav'.format(dataset_dir, utterance_id)
        x1 = load_audio(audio_path)

        txt_path = '{}/txt/{}.txt'.format(dataset_dir, utterance_id)
        label1, encoded_label1 = parse_transcript(txt_path)

        # transcription_output=True: the prediction is returned as a transcription.
        pred1 = speech_recognizer.predict(np.array([x1]), transcription_output=True)
        # Kept for reference: later inspection cells reference `pred2`.
#         pred2 = speech_recognizer.predict(np.array([x1]), transcription_output=False)

        print("Groundtruth encode: ", encoded_label1)
        print("Groundtruth label: ", label1)
        print("Predicted   label: ", pred1[0])

        # The context manager closes the file even if the write fails; UTF-8 matches
        # the encoding used when the transcripts are read.
        with open('{}/{}.txt'.format(pred_dir, utterance_id), 'w', encoding='utf8') as f:
            f.write("{}".format(pred1[0]))

Groundtruth encode:  [9, 6, 28, 9, 16, 17, 6, 5, 28, 21, 9, 6, 19, 6, 28, 24, 16, 22, 13, 5, 28, 3, 6, 28, 20, 21, 6, 24, 28, 7, 16, 19, 28, 5, 10, 15, 15, 6, 19, 28, 21, 22, 19, 15, 10, 17, 20, 28, 2, 15, 5, 28, 4, 2, 19, 19, 16, 21, 20, 28, 2, 15, 5, 28, 3, 19, 22, 10, 20, 6, 5, 28, 17, 16, 21, 2, 21, 16, 6, 20, 28, 2, 15, 5, 28, 7, 2, 21, 28, 14, 22, 21, 21, 16, 15, 28, 17, 10, 6, 4, 6, 20, 28, 21, 16, 28, 3, 6, 28, 13, 2, 5, 13, 6, 5, 28, 16, 22, 21, 28, 10, 15, 28, 21, 9, 10, 4, 12, 28, 17, 6, 17, 17, 6, 19, 6, 5, 28, 7, 13, 16, 22, 19, 28, 7, 2, 21, 21, 6, 15, 6, 5, 28, 20, 2, 22, 4, 6]
Groundtruth label:  HE HOPED THERE WOULD BE STEW FOR DINNER TURNIPS AND CARROTS AND BRUISED POTATOES AND FAT MUTTON PIECES TO BE LADLED OUT IN THICK PEPPERED FLOUR FATTENED SAUCE
Predicted   label:  HE HOPED THERE WOULD BE STEW FOR DINNER TURNIPS AND CARROTS AND BRUISED POTATOES AND FAT MUTTON PIECES TO BE LADLED OUT IN THICK PEPPERD FLOWER FATTENED SAUCE
Groundtruth encode:  [20, 21, 22, 7, 7, 28

Groundtruth encode:  [10, 7, 28, 6, 23, 6, 19, 28, 9, 6, 28, 24, 2, 20, 28, 10, 14, 17, 6, 13, 13, 6, 5, 28, 21, 16, 28, 4, 2, 20, 21, 28, 20, 10, 15, 28, 7, 19, 16, 14, 28, 9, 10, 14, 28, 2, 15, 5, 28, 21, 16, 28, 19, 6, 17, 6, 15, 21, 28, 21, 9, 6, 28, 10, 14, 17, 22, 13, 20, 6, 28, 21, 9, 2, 21, 28, 14, 16, 23, 6, 5, 28, 9, 10, 14, 28, 24, 2, 20, 28, 21, 9, 6, 28, 24, 10, 20, 9, 28, 21, 16, 28, 3, 6, 28, 9, 6, 19, 28, 12, 15, 10, 8, 9, 21]
Groundtruth label:  IF EVER HE WAS IMPELLED TO CAST SIN FROM HIM AND TO REPENT THE IMPULSE THAT MOVED HIM WAS THE WISH TO BE HER KNIGHT
Predicted   label:  IF EVER HE WAS IMPELLED TO CAST SIN FROM HIM AND TO REPENT THE IMPULSE THAT MOVED HIM WAS THE WISH TO BE HER NIGHT
Groundtruth encode:  [9, 6, 28, 21, 19, 10, 6, 5, 28, 21, 16, 28, 21, 9, 10, 15, 12, 28, 9, 16, 24, 28, 10, 21, 28, 4, 16, 22, 13, 5, 28, 3, 6]
Groundtruth label:  HE TRIED TO THINK HOW IT COULD BE
Predicted   label:  HE TRIED TO THINK HOW IT COULD BE
Groundtruth encode:  [3, 22, 2

Groundtruth encode:  [24, 9, 26, 28, 24, 2, 20, 28, 21, 9, 6, 28, 20, 2, 4, 19, 2, 14, 6, 15, 21, 28, 16, 7, 28, 21, 9, 6, 28, 6, 22, 4, 9, 2, 19, 10, 20, 21, 28, 10, 15, 20, 21, 10, 21, 22, 21, 6, 5, 28, 22, 15, 5, 6, 19, 28, 21, 9, 6, 28, 21, 24, 16, 28, 20, 17, 6, 4, 10, 6, 20, 28, 16, 7, 28, 3, 19, 6, 2, 5, 28, 2, 15, 5, 28, 24, 10, 15, 6, 28, 10, 7, 28, 11, 6, 20, 22, 20, 28, 4, 9, 19, 10, 20, 21, 28, 3, 6, 28, 17, 19, 6, 20, 6, 15, 21, 28, 3, 16, 5, 26, 28, 2, 15, 5, 28, 3, 13, 16, 16, 5, 28, 20, 16, 22, 13, 28, 2, 15, 5, 28, 5, 10, 23, 10, 15, 10, 21, 26, 28, 10, 15, 28, 21, 9, 6, 28, 3, 19, 6, 2, 5, 28, 2, 13, 16, 15, 6, 28, 2, 15, 5, 28, 10, 15, 28, 21, 9, 6, 28, 24, 10, 15, 6, 28, 2, 13, 16, 15, 6]
Groundtruth label:  WHY WAS THE SACRAMENT OF THE EUCHARIST INSTITUTED UNDER THE TWO SPECIES OF BREAD AND WINE IF JESUS CHRIST BE PRESENT BODY AND BLOOD SOUL AND DIVINITY IN THE BREAD ALONE AND IN THE WINE ALONE
Predicted   label:  WHY WAS THE SACRAMENT OF THE EUCHARIST INSTITUTED U

Groundtruth encode:  [21, 9, 6, 28, 22, 15, 10, 23, 6, 19, 20, 10, 21, 26]
Groundtruth label:  THE UNIVERSITY
Predicted   label:  THE UNIVERSITY
Groundtruth encode:  [17, 19, 10, 5, 6, 28, 2, 7, 21, 6, 19, 28, 20, 2, 21, 10, 20, 7, 2, 4, 21, 10, 16, 15, 28, 22, 17, 13, 10, 7, 21, 6, 5, 28, 9, 10, 14, 28, 13, 10, 12, 6, 28, 13, 16, 15, 8, 28, 20, 13, 16, 24, 28, 24, 2, 23, 6, 20]
Groundtruth label:  PRIDE AFTER SATISFACTION UPLIFTED HIM LIKE LONG SLOW WAVES
Predicted   label:  PRIDE AFTER SATISFACTION UPLIFTED HIM LIKE LONG SLOW WAVES
Groundtruth encode:  [24, 9, 16, 20, 6, 28, 7, 6, 6, 21, 28, 2, 19, 6, 28, 2, 20, 28, 21, 9, 6, 28, 7, 6, 6, 21, 28, 16, 7, 28, 9, 2, 19, 21, 20, 28, 2, 15, 5, 28, 22, 15, 5, 6, 19, 15, 6, 2, 21, 9, 28, 21, 9, 6, 28, 6, 23, 6, 19, 13, 2, 20, 21, 10, 15, 8, 28, 2, 19, 14, 20]
Groundtruth label:  WHOSE FEET ARE AS THE FEET OF HARTS AND UNDERNEATH THE EVERLASTING ARMS
Predicted   label:  WHOSE FEET ARE AS THE FEET OF HEARTS AND UNDERNEATH THE EVERLASTING ARMS

Groundtruth encode:  [21, 9, 6, 28, 17, 9, 19, 2, 20, 6, 28, 2, 15, 5, 28, 21, 9, 6, 28, 5, 2, 26, 28, 2, 15, 5, 28, 21, 9, 6, 28, 20, 4, 6, 15, 6, 28, 9, 2, 19, 14, 16, 15, 10, 27, 6, 5, 28, 10, 15, 28, 2, 28, 4, 9, 16, 19, 5]
Groundtruth label:  THE PHRASE AND THE DAY AND THE SCENE HARMONIZED IN A CHORD
Predicted   label:  THE PHRASE AND THE DAY AND THE SCENE HARMONIZED IN ACCHORD
Groundtruth encode:  [24, 16, 19, 5, 20, 28, 24, 2, 20, 28, 10, 21, 28, 21, 9, 6, 10, 19, 28, 4, 16, 13, 16, 22, 19, 20]
Groundtruth label:  WORDS WAS IT THEIR COLOURS
Predicted   label:  WORDS WAS IT THEIR COLORS
Groundtruth encode:  [21, 9, 6, 26, 28, 24, 6, 19, 6, 28, 23, 16, 26, 2, 8, 10, 15, 8, 28, 2, 4, 19, 16, 20, 20, 28, 21, 9, 6, 28, 5, 6, 20, 6, 19, 21, 20, 28, 16, 7, 28, 21, 9, 6, 28, 20, 12, 26, 28, 2, 28, 9, 16, 20, 21, 28, 16, 7, 28, 15, 16, 14, 2, 5, 20, 28, 16, 15, 28, 21, 9, 6, 28, 14, 2, 19, 4, 9, 28, 23, 16, 26, 2, 8, 10, 15, 8, 28, 9, 10, 8, 9, 28, 16, 23, 6, 19, 28, 10, 19, 6, 13, 2, 15

Groundtruth encode:  [14, 26, 28, 7, 10, 19, 20, 21, 28, 2, 15, 5, 28, 17, 19, 10, 15, 4, 10, 17, 2, 13, 28, 19, 6, 2, 20, 16, 15, 28, 24, 2, 20, 28, 21, 9, 2, 21, 28, 21, 9, 6, 26, 28, 6, 15, 7, 16, 19, 4, 6, 5, 28, 3, 6, 26, 16, 15, 5, 28, 2, 13, 13, 28, 19, 6, 20, 10, 20, 21, 2, 15, 4, 6, 28, 16, 15, 28, 2, 15, 26, 28, 20, 21, 22, 5, 6, 15, 21, 28, 24, 9, 16, 28, 14, 10, 8, 9, 21, 28, 2, 21, 21, 6, 14, 17, 21, 28, 21, 16, 28, 4, 16, 17, 26, 28, 21, 9, 6, 14, 28, 21, 9, 10, 20, 28, 14, 6, 21, 9, 16, 5, 28, 16, 7, 28, 13, 2, 26, 10, 15, 8, 28, 17, 16, 19, 21, 10, 16, 15, 20, 28, 16, 7, 28, 5, 10, 20, 21, 10, 15, 4, 21, 28, 9, 22, 6, 28, 20, 10, 5, 6, 28, 3, 26, 28, 20, 10, 5, 6]
Groundtruth label:  MY FIRST AND PRINCIPAL REASON WAS THAT THEY ENFORCED BEYOND ALL RESISTANCE ON ANY STUDENT WHO MIGHT ATTEMPT TO COPY THEM THIS METHOD OF LAYING PORTIONS OF DISTINCT HUE SIDE BY SIDE
Predicted   label:  MY FIRST AND PRINCIPAL REASON WAS THAT THEY ENFORCED BEYOND ALL RESISTANCE ON ANY STUDENT 

Groundtruth encode:  [3, 22, 21, 28, 10, 15, 28, 21, 9, 10, 20, 28, 23, 10, 8, 15, 6, 21, 21, 6, 28, 4, 16, 17, 10, 6, 5, 28, 7, 19, 16, 14, 28, 21, 22, 19, 15, 6, 19, 28, 26, 16, 22, 28, 9, 2, 23, 6, 28, 21, 9, 6, 28, 21, 24, 16, 28, 17, 19, 10, 15, 4, 10, 17, 13, 6, 20, 28, 3, 19, 16, 22, 8, 9, 21, 28, 16, 22, 21, 28, 17, 6, 19, 7, 6, 4, 21, 13, 26]
Groundtruth label:  BUT IN THIS VIGNETTE COPIED FROM TURNER YOU HAVE THE TWO PRINCIPLES BROUGHT OUT PERFECTLY
Predicted   label:  BUT IN THIS VINYETT COPIED FROM TURNER YOU HAD THE TWO PRINCIPLES BROUGHT OUT PERFECTLY
Groundtruth encode:  [21, 9, 6, 26, 28, 2, 19, 6, 28, 3, 6, 26, 16, 15, 5, 28, 2, 13, 13, 28, 16, 21, 9, 6, 19, 28, 24, 16, 19, 12, 20, 28, 21, 9, 2, 21, 28, 10, 28, 12, 15, 16, 24, 28, 6, 25, 10, 20, 21, 10, 15, 8, 28, 5, 6, 17, 6, 15, 5, 6, 15, 21, 28, 7, 16, 19, 28, 21, 9, 6, 10, 19, 28, 6, 7, 7, 6, 4, 21, 28, 16, 15, 28, 13, 16, 24, 28, 20, 22, 3, 5, 22, 6, 5, 28, 21, 16, 15, 6, 20, 28, 21, 9, 6, 10, 19, 28, 7, 2, 23, 16

Groundtruth encode:  [21, 9, 6, 28, 13, 2, 19, 8, 6, 28, 13, 6, 21, 21, 6, 19, 28, 4, 16, 15, 21, 2, 10, 15, 20, 28, 10, 15, 5, 6, 6, 5, 28, 6, 15, 21, 10, 19, 6, 13, 26, 28, 7, 6, 6, 3, 13, 6, 28, 2, 15, 5, 28, 10, 13, 13, 28, 5, 19, 2, 24, 15, 28, 7, 10, 8, 22, 19, 6, 20, 28, 21, 9, 2, 21, 28, 10, 20, 28, 14, 6, 19, 6, 13, 26, 28, 4, 9, 10, 13, 5, 10, 20, 9, 28, 2, 15, 5, 28, 7, 2, 10, 13, 10, 15, 8, 28, 24, 16, 19, 12, 28, 16, 7, 28, 2, 15, 28, 10, 15, 7, 6, 19, 10, 16, 19, 28, 9, 2, 15, 5, 28, 10, 21, 28, 10, 20, 28, 15, 16, 21, 28, 4, 9, 2, 19, 2, 4, 21, 6, 19, 10, 20, 21, 10, 4, 28, 16, 7, 28, 8, 16, 21, 9, 10, 4, 28, 16, 19, 28, 2, 15, 26, 28, 16, 21, 9, 6, 19, 28, 20, 4, 9, 16, 16, 13]
Groundtruth label:  THE LARGE LETTER CONTAINS INDEED ENTIRELY FEEBLE AND ILL DRAWN FIGURES THAT IS MERELY CHILDISH AND FAILING WORK OF AN INFERIOR HAND IT IS NOT CHARACTERISTIC OF GOTHIC OR ANY OTHER SCHOOL
Predicted   label:  THE LARGE LETTER CONTAINS INDEED ENTIRELY FEEBLN ILL DRAWN FIGURES THA

Groundtruth encode:  [15, 16, 24, 28, 9, 6, 19, 6, 28, 10, 20, 28, 19, 2, 17, 9, 2, 6, 13, 28, 6, 25, 2, 4, 21, 13, 26, 28, 3, 6, 21, 24, 6, 6, 15, 28, 21, 9, 6, 28, 21, 24, 16, 28, 21, 19, 6, 6, 20, 28, 20, 21, 10, 13, 13, 28, 5, 19, 2, 24, 15, 28, 13, 6, 2, 7, 28, 3, 26, 28, 13, 6, 2, 7, 28, 24, 9, 16, 13, 13, 26, 28, 7, 16, 19, 14, 2, 13, 28, 3, 22, 21, 28, 3, 6, 2, 22, 21, 10, 7, 22, 13, 28, 14, 10, 20, 21, 28, 4, 16, 14, 10, 15, 8, 28, 8, 19, 2, 5, 22, 2, 13, 13, 26, 28, 10, 15, 21, 16, 28, 21, 9, 6, 28, 5, 10, 20, 21, 2, 15, 4, 6]
Groundtruth label:  NOW HERE IS RAPHAEL EXACTLY BETWEEN THE TWO TREES STILL DRAWN LEAF BY LEAF WHOLLY FORMAL BUT BEAUTIFUL MIST COMING GRADUALLY INTO THE DISTANCE
Predicted   label:  NOW HERE IS RAPFAEL EXACTLY BETWEEN THE TWO TREES STILL DRAWN LEIF BY LEAF WHOLLY FORMAL BUT BEAUTIFUL MIST COMING GRADUALLY INTO THE DISTANCE
Groundtruth encode:  [24, 6, 13, 13, 28, 21, 9, 6, 15, 28, 13, 2, 20, 21, 28, 9, 6, 19, 6, 28, 10, 20, 28, 21, 22, 19, 15, 6, 19, 1

In [17]:
# Debug probe: print the size in bytes that sys.getsizeof reports for `pred2`.
# NOTE(review): no uncommented assignment of `pred2` is visible in this notebook, so
# this cell presumably relies on kernel state from an earlier run - confirm that it
# still works after Restart & Run All.
import sys
print(sys.getsizeof(pred2))

72


In [28]:
# Debug probe: take one slice of `pred2` and print its sum together with the size
# in bytes that sys.getsizeof reports for it.
# NOTE(review): presumably the slice is one row of output probabilities (the recorded
# sum is ~1.0) - confirm against the shape of `pred2`.
pred2_slice = pred2[0][0][10]
print(np.sum(pred2_slice), sys.getsizeof(pred2_slice))

0.99999994 104
