In [1]:
# Download & Install deepspeech module 
########## RUN THIS TO IMPORT libraries ################
#!pip install deepspeech // If doing in lab environment
#!pip install librosa // If doing in lab environment
# jiwer is a package to approximate the Word Error Rate (WER)
# !pip install jiwer
# !pip install noisereduce
# !pip install pydub
####################################################
#Import DeepSpeech Libraries
from deepspeech import Model, version
from jiwer import wer
%matplotlib inline
import librosa as lr
import numpy as np
import os.path
import pandas as pd
from glob import glob
# Widgets for dropdown language selector
import ipywidgets
from scipy.io import wavfile
from thinkdsp import read_wave
import thinkdsp
import noisereduce as nr
from pydub import AudioSegment
from pydub.playback import play



In [2]:
# Select folder with audio files
language_EN = "./Ex4_audio_files/EN"
language_ES = "./Ex4_audio_files/ES"
language_IT = "./Ex4_audio_files/IT"
# Select folder with noise reduced
NoiseReduced_ES = "./Ex4_audio_files/ES/Noise_reduced/Filtered"
Silenced_IT = "./Ex4_audio_files/IT/SilenceAdded"
# Assign all files as a variable
audio_fileEN = glob(language_EN+'/*.wav')
audio_fileES = glob(language_ES+'/*.wav')
audio_fileIT = glob(language_IT+'/*.wav')
# Assign noise reduced as variable 
Filtered_ES = glob(NoiseReduced_ES+'/*.wav')
NewAudio_IT = glob(Silenced_IT+'/*.wav')
#Number of audio files in EN folder
len(audio_fileEN)

7

### [Language Selector] - Choose from Dropdown box below

In [3]:
# Language Selector
Lang_dropDown = ipywidgets.Dropdown(options=[('None', 0), ('English', 1), ('Spanish', 2), ('Italian', 3)],
                                value=0,
                                description='Language Select:',
                                disabled=False)
def SelectLanguage(state): 
    if state == 1:
        print("English Language Selected. Please wait for loading...")
        English()
        display(df)
    if state == 2:
        print("Spanish Language Selected. Please wait for loading...")
        Spanish()
        display(df)
    if state == 3:
        print("Italian Language Selected. Please wait for loading...")
        Italian()
        display(df)
ipywidgets.interact(SelectLanguage, state=Lang_dropDown );

interactive(children=(Dropdown(description='Language Select:', options=(('None', 0), ('English', 1), ('Spanish…

### English Language

In [4]:
def English():
    scorer = "Models/deepspeech-0.9.3-models.scorer"
    model = "Models/deepspeech-0.9.3-models.pbmm"
    result = []
    WER = []
    Total = []
    
    ds = Model(model)
    ds.enableExternalScorer(scorer)
    desired_sample_rate = ds.sampleRate()
    # For Loop to iterate every wav file 
    for i in range(0,len(audio_fileEN)):
        #Get Filename
        filepath = audio_fileEN[i]
        filename = os.path.basename(filepath)
        print(filename)
        
        audio = lr.load(audio_fileEN[i], sr=desired_sample_rate)[0]
        audio = (audio * 32767).astype(np.int16)  # scale from -1 to 1 to +/-32767
        res = ds.stt(audio)
        result.append(res)
    audioFiles = ['checkin.wav','parents.wav','suitcase.wav', 'what_time.wav', 
                  'where.wav','your_sentence1.wav','your_sentence2.wav']
    transcription = ['where is the checkin desk','i have lost my parents','please i have lost my suitcase',
                     'what time is my plane','where are the restaurants and shops',
                     'this is my first sentence', 'this is my second sentence']
    #Loop to compare each element in result with transcription list
    for i in range(0, len(result)):
        #Use wer() to find error rate
        error = wer(result[i],transcription[i])
        #Convert into percentage
        percentage = str(round(error*100)) + '%'
        avg = round(error*100)
        #Print percentage for debugging
        #print(percentage)
        WER.append(percentage)
        #Get Total error rate
        Total.append(avg)
        #Print comparison strings
        #print(result[i],transcription[i])
    # Creating an average WER
    Sum = sum(Total)
    AvgWER = str(round((Sum / len(Total)), 2))  + '%'
    print("The Average WER is: "+ AvgWER)
    #Global variable to call it outside of function
    global df
    # Calling DataFrame constructor on list
    df = pd.DataFrame(list(zip(audioFiles, transcription, result, WER)),
                     columns =['File', 'Transcription', 'Result', 'WER'])
    return df

### Spanish Language

In [5]:
import glob

def Spanish():
    scorer = "Models/kenlm_es.scorer"
    model = "Models/output_graph_es.pbmm"
    result = []
    WER = []
    Total = []
    
    ds = Model(model)
    ds.enableExternalScorer(scorer)
    desired_sample_rate = ds.sampleRate()
    # For Loop to iterate every wav file 
    for i in range(0,3):
        audio = lr.load(audio_fileES[i], sr=desired_sample_rate)[0]
        audio = (audio * 32767).astype(np.int16)  # scale from -1 to 1 to +/-32767
        res = ds.stt(audio)
        result.append(res)
    
    # Start noise reduction ALGORITHM
    for i in range(3,5):
        #Get Filename 'what_time_es.wav', 'where_es.wav'
        filepath = audio_fileES[i]
        filename = os.path.basename(filepath)#what_time_es.wav
        first = 'NoiseRd'
        last = filename
        Newfilename = "{f}_{l}".format(f=first, l=last) #eg. NoiseRd_what_time_es.wav
        #Get NEW variable filename
        base_dir = r"./Ex4_audio_files/ES/Noise_reduced"
        filename = Newfilename
        NewPath = os.path.join(base_dir, filename) # Gives the full filepath
        
        # Start noise reduction
        rate, data = wavfile.read(audio_fileES[i])
        # perform noise reduction
        reduced_noise = nr.reduce_noise(y=data, sr=rate) 
        wavfile.write(NewPath, rate, reduced_noise)
        
    # Start Low-pass Filter    
    wave = read_wave('Ex4_audio_files/ES/Noise_reduced/NoiseRd_what_time_es.wav')
    wave2 = read_wave('Ex4_audio_files/ES/Noise_reduced/NoiseRd_where_es.wav')
    spectrum = wave.make_spectrum()
    spectrum2 = wave2.make_spectrum()
    spectrum2.low_pass(4000)
    spectrum.low_pass(4000)
    filtered_wave = spectrum.make_wave()
    filtered_wave2 = spectrum2.make_wave()
    filtered_wave.unbias()
    filtered_wave.normalize()
    filtered_wave.write("Ex4_audio_files/ES/Noise_reduced/Filtered/Filtered_what_time_es.wav")
    filtered_wave2.write("Ex4_audio_files/ES/Noise_reduced/Filtered/Filtered_where_es.wav")
    #Read the new Files from filtered folder and append New result
    for i in range(len(Filtered_ES)):
        # Add the proceesed audio to result list
        New_filered_audio = lr.load(Filtered_ES[i], sr=desired_sample_rate)[0]
        New_filered_audio = (New_filered_audio * 32767).astype(np.int16)  # scale from -1 to 1 to +/-32767
        New_res = ds.stt(New_filered_audio)
        result.append(New_res)
        
    audioFiles = ['checkin_es.wav', 'parents_es.wav', 'suitcase_es.wav', 'what_time_es.wav', 
                   'where_es.wav']
    transcription = ['donde estan los mostradores', 'he perdido a mis padres', 'por favor he perdido mi maleta', 
                'a que hora es mi avion', 'donde estan los restaurantes y las tiendas']
    #Loop to compare each element in result with transcription list
    for i in range(0, len(result)):
        #Use wer() to find error rate
        error = wer(result[i],transcription[i])
        #Convert into percentage
        percentage = str(round(error*100)) + '%'
        avg = round(error*100)
        WER.append(percentage)
        #Get Total error rate
        Total.append(avg)
    # Creating an average WER
    Sum = sum(Total)
    AvgWER = str(round((Sum / len(Total)), 2))  + '%'
    print("The Average WER is: "+ AvgWER)
    #Global variable to call it outside of function
    global df
    # Calling DataFrame constructor on list
    df = pd.DataFrame(list(zip(audioFiles, transcription, result, WER)),
                     columns =['Files', 'Transcription', 'Result','WER'])
    return df

The above ^^^ method have successfully reduced the WER from 47% average to 32.8% by **reducing the background noise** first and then use that audio file and pass through a **low-pass filter.** 

### Italian Language

In [6]:
import glob

def Italian():
    scorer = "Models/kenlm_it.scorer"
    model = "Models/output_graph_it.pbmm"
    result = []
    WER = []
    Total = []
    
    ds = Model(model)
    ds.enableExternalScorer(scorer)
    desired_sample_rate = ds.sampleRate()
    # create 1 sec of silence audio segment
    one_sec_segment = AudioSegment.silent(duration=1000)  #duration in milliseconds
    # For Loop to iterate every wav file 
    for i in range(0,len(audio_fileIT)):
        ###### ADDING 1second silence in front #########
        #Load every IT audio and assign variable
        loadAudio = audio_fileIT[i]
        #read wav file to an audio segment
        SegmentLoad = AudioSegment.from_wav(loadAudio)
        #Add above two audio segments    
        audio_Out= one_sec_segment + SegmentLoad
        
        #Get Filename 'what_time_it.wav', 'where_it.wav'
        filepath = audio_fileIT[i]
        filename = os.path.basename(filepath)#what_time_it.wav
        first = 'Silenced'
        last = filename
        Newfilename = "{f}_{l}".format(f=first, l=last) #eg. SilenceAdded_what_time_it.wav
        #Get NEW variable filename
        base_dir = r"./Ex4_audio_files/IT/SilenceAdded"
        filename = Newfilename
        NewPath = os.path.join(base_dir, filename) # Gives the full filepath
        #Save modified audio
        audio_Out.export(NewPath, format="wav")
        print(filename)
        
    for i in range(0,len(NewAudio_IT)):
        audio = lr.load(NewAudio_IT[i], sr=desired_sample_rate)[0]
        audio = (audio * 32767).astype(np.int16)  # scale from -1 to 1 to +/-32767
        res = ds.stt(audio)
        result.append(res)
    audioFiles = ['checkin_it.wav', 'parents_it.wav', 'suitcase_it.wav', 'what_time_it.wav',
                   'where_it.wav']
    transcription = ['dove e il bancone','ho perso i miei genitori','per favore ho perso la mia valigia',
                     'a che ora e il mio aereo', 'dove sono i ristoranti e i negozi']
    #Loop to compare each element in result with transcription list
    for i in range(0, len(result)):
        #Use wer() to find error rate
        error = wer(result[i],transcription[i])
        #Convert into percentage
        percentage = str(round(error*100)) + '%'
        avg = round(error*100)
        #Print percentage for debugging
        #print(percentage)
        WER.append(percentage)
        #Get Total error rate
        Total.append(avg)
        #Print comparison strings
        #print(result[i],transcription[i])
    # Creating an average WER
    Sum = sum(Total)
    AvgWER = str(round((Sum / len(Total)), 2))  + '%'
    print("The Average WER is: "+ AvgWER)
    #Global variable to call it outside of function
    global df
    # Calling DataFrame constructor on list
    df = pd.DataFrame(list(zip(audioFiles, transcription, result, WER)),
                     columns =['Files', 'Transcription', 'Result', 'WER'])
    return df

#### Spanish Language

Using the wav file with the highest WER "what_time_es.wav" of 165% as test subject for this experiment

In [7]:
# scorer = "Models/kenlm_es.scorer"
# model = "Models/output_graph_es.pbmm"

In [8]:
# ds = Model(model)
# ds.enableExternalScorer(scorer)
# desired_sample_rate =ds.sampleRate()

Using a juypter library 'noisereduce' to remove background noises

In [9]:
# from scipy.io import wavfile
# import noisereduce as nr
# # load data
# rate, data = wavfile.read("Ex4_audio_files/ES/what_time_es.wav")
# # perform noise reduction
# reduced_noise = nr.reduce_noise(y=data, sr=rate) 
# wavfile.write("Ex4_audio_files/ES/Noise_reduced/Testnd_what_time_es.wav", rate, reduced_noise)

Using a low pass filter to filter out noise above 4kHz of human speech recognition

In [10]:
# wave = read_wave('Ex4_audio_files/ES/Noise_reduced/NoiseRd_what_time_es.wav')
# spectrum = wave.make_spectrum()
# spectrum.low_pass(4000)
# wave.make_audio()

In [11]:
# filtered_wave = spectrum.make_wave()
# filtered_wave.unbias()
# filtered_wave.normalize()
# filtered_wave.make_audio()

In [12]:
# filtered_wave.write("Ex4_audio_files/ES/Noise_reduced/Filtered/filtered.wav")

In [13]:
# audio_file = "Ex4_audio_files/ES/Noise_reduced/Filtered/filtered.wav"

In [14]:
# audio = lr.load(audio_file, sr=desired_sample_rate)[0]
# audio = (audio * 32767).astype(np.int16)  # scale from -1 to 1 to +/-32767
# res = ds.stt(audio)
# print(res)

In [15]:
# transcription = ['a que hora es mi avion']

In [16]:
# error = wer(res,transcription)
# #Convert into percentage
# percentage = str(round(error*100)) + '%'
# print("New WER is: " + percentage)

#### New WER is: 125%
The WER has been successfully lowered from 165 to 125, therefore the algorithm works in improving WER

#### Italian Language

Using the wav file with the highest WER "what_time_it.wav" of 700% as test subject for this experiment

Adding 1 second of silence before before the audio might give the algorithm more time to recognize the speech and WER

In [17]:
# !pip install pydub
# from pydub import AudioSegment
# from pydub.playback import play

In [18]:
# scorer = "Models/kenlm_it.scorer"
# model = "Models/output_graph_it.pbmm"

In [19]:
# result = []
# WER = []
# Total = []

In [20]:
# ds = Model(model)
# ds.enableExternalScorer(scorer)
# desired_sample_rate = ds.sampleRate()

In [21]:
# loadAudio = "Ex4_audio_files/IT/what_time_it.wav"

In [22]:
# audio_file = "Ex4_audio_files/IT/SilenceAdded/test.wav"

In [23]:
# # create 1 sec of silence audio segment
# one_sec_segment = AudioSegment.silent(duration=1000)  #duration in milliseconds

In [24]:
# #read wav file to an audio segment
# Test = AudioSegment.from_wav(loadAudio)

In [25]:
# #Add above two audio segments    
# audio_Out= one_sec_segment + Test

In [26]:
# #Either save modified audio
# audio_Out.export(audio_file, format="wav")

In [27]:
# audio = lr.load(audio_file, sr=desired_sample_rate)[0]
# audio = (audio * 32767).astype(np.int16)  # scale from -1 to 1 to +/-32767
# res = ds.stt(audio)
# print(res)

In [28]:
# transcription = ['a che ora e il mio aereo']

In [29]:
# error = wer(res,transcription)
# #Convert into percentage
# percentage = str(round(error*100)) + '%'
# print("New WER is: " + percentage)

The algorithm has succesfully reduced the WER **from 700% to 33%!** , this method is great in improving the speech algorithm and will be updated in the Italian function