In [1]:
#   |'''''''''''''╔╬╬╬╬╬╬╬╬   _____  _____      _____  _____      ___   __
#   |            ╔╬╬╬╬╬╬╬╬╬  |\   _ \  _  \    |\   _ \  _  \    |\  \|\  \
#   | ░░         ╬╬╬╬╬╬╬╬╬╬  \ \  \\__\ \  \   \ \  \\__\ \  \   \ \  \/  /|_
#    ░░░░        ╬╬╬╬╬╬╬╬╬╬   \ \  \|__| \  \   \ \  \|__| \  \   \ \   ___  \
#   ░░░░░╦╬╦    ╔╬╬╬╬╬╬╬╬╬╬    \ \  \   \ \  \   \ \  \   \ \  \   \ \  \ \   \
#  ░░░░░╬╬╬╬ ▓▓└╬╬╬╬╬╬╬╬╬╬╬     \ \__\   \ \__\   \ \__\   \ \__\   \ \__\ \___\
# ░░░░░╔╬╬╬ ▓▓▓  ╓╬╬╬╬╬╬╬╬╬      \|__|    \|__|    \|__|    \|__|    \|__| \|__|
# ░░░░░╠╬╬╬ ▓▓▓  └╬╬╬╬╬╬╬╬╬
#  ░░░░└╬╬╬╬ ▓▓   ╬╬╬╬╬╬╬╬╬  Lehrstuhl für Mensch-Maschine-Kommunikation
#  ░░░░░╙╬╬╬╩            ╬╬  Technische Universität München
#   ░░░░░░╚ '''''''''''''''  Author: Tobias Watzel
#    ░░░                     Copyright 2020
#

%matplotlib widget
import os
import ipywidgets as widgets
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.lines as lines
import IPython.display as ipd
import librosa.display
import numpy as np
import scipy
import more_itertools
from scipy import signal
from scipy.fftpack import fft, fftshift
import dill

# Versuchsbeschreibung:
Die Pausen zwischen den gesprochenen Wörtern werden aus dem Spektrum herausgenommen. Als Kriterium dient ein Schwellwert für die Energie, der unterschritten werden muss. Finden Sie einen passenden Schwellwert.

In [2]:
# init fig for matplotlib
fig, fig1 = None, None

def handle_select_change(change):
    text.value = showText[label.value]
    
# define file list
wav_list = ['mahlzeit.wav', 'mahlzeitFast.wav', 'mahlzeitLong.wav']

# define select widget
select = widgets.Select(
    options = wav_list,
    value = 'mahlzeit.wav',
    description = 'Dateiliste:')

# define explanations
text_mahlzeit = 'Beschreibung: Das Wort Mahlzeit. Eigenschaften: 16000 Hz, 16 bit mono, Samples: 25333, Länge: 1,58 s'
text_mahlzeit_lang = 'Beschreibung: Das Wort Mahlzeit lang. Eigenschaften: 16000 Hz, 16 bit mono, Samples: 45543, Länge: 2,85 s'
text_mahlzeit_kurz = 'Beschreibung: Das Wort Mahlzeit schnell. Eigenschaften: 16000 Hz, 16 bit mono, Samples: 12580, Länge: 0,79 s'

showText = {"mahlzeit.wav": text_mahlzeit, "mahlzeitFast.wav": text_mahlzeit_kurz, "mahlzeitLong.wav": text_mahlzeit_lang}

label = widgets.Label(value = select.value)
text = widgets.Label(value = showText[label.value])
widgets.link((select, 'value'), (label, 'value'))
select.observe(handle_select_change)

display(select, text)

Select(description='Dateiliste:', options=('mahlzeit.wav', 'mahlzeitFast.wav', 'mahlzeitLong.wav'), value='mah…

Label(value='Beschreibung: Das Wort Mahlzeit. Eigenschaften: 16000 Hz, 16 bit mono, Samples: 25333, Länge: 1,5…

In [3]:
# load audio file
y, sr = librosa.load('wav_files/%s' % select.value, sr = 16000)
x = np.linspace(0, y.size, num = y.size)

# close old figures if avaible
if fig:
    plt.close(fig)
    plt.close(fig1)

def calculate_signal_power(y_input):
    return np.mean(y_input ** 2)

def consecutive(data, stepsize=1):
    return np.split(data, np.where(np.diff(data) != stepsize)[0] + 1)

def sliding_window_over_signal(y_signal, window_size):
    output_vec = np.zeros(int(np.floor(y_signal.size / window_size)))
    
    for i, w in enumerate(more_itertools.windowed(y_signal, n = window_size, step = window_size, fillvalue = '!')): 
        # apply Cepstrum for every window
        if '!' not in w:      
            # get the cepstrum of the window
            output_vec[i] = calculate_signal_power(np.array(w))
            
    return output_vec

def detect_word_boundaries(signal_in, thres_energy, ax = None):
    # clear axis 
    ax.cla()
    
    thres_start = 4
    thres_end = 10
    thres_length = 20
    
    length_window = 256
    power_signal_in = sliding_window_over_signal(signal_in, length_window)

    total_windows = int(np.floor(signal_in.size / length_window))
    
    #ax.clear()
    ax.plot(power_signal_in, label = 'org', color = 'r', linewidth=1)
    
    # find threshold for energy
    threshold_energy = thres_energy * power_signal_in.max()
    ax.axhline(threshold_energy, color = 'g')
    
    # find consecutive sequences and filter with threshold energy
    seq_above_thres = consecutive((power_signal_in > threshold_energy).nonzero()[0])
    seq_below_thres = consecutive((power_signal_in < threshold_energy).nonzero()[0])
    
    helper_array = np.zeros(power_signal_in.size)
    
    # find word start and end --> set 1.0 for start, 2.0 for end in helper_array
    for element in seq_above_thres:
        if len(element) > thres_start:
            helper_array[element[0] - 1] = 1.0

    for element in seq_below_thres:
        if len(element) > thres_end:
            helper_array[element[0]] = 2.0

    # find indices for start and end
    start = (helper_array == 1.0).nonzero()[0]
    end = (helper_array == 2.0).nonzero()[0]  

    # combine all variations of start and end
    possible_windows = np.array(np.meshgrid(start, end)).T.reshape(-1,2)
    final_windows = []
    save_start = -1 # saver var
    save_end = -1 # saver var
    for row in possible_windows:
        final_windows.append(row)
        save_start = row[0]
        save_end = row[1]
        #if row[1] - row[0] > thres_length and row[0] != save_start and row[1] != save_end:
        #    final_windows.append(row)
        #    save_start = row[0]
        #    save_end = row[1]
    
    if ax:
        for element in final_windows:
            ax.axvline(element[0], color = 'b')
            ax.axvline(element[1], color = 'b')
        # setting properties             
        ax.set_xlim(0, total_windows - 1)
        ax.set_xticks(np.arange(0, total_windows, 5))
        ax.set_xlabel("Fenster")
        ax.set_ylabel("Energie")
        ax.set_title('Sprachdetektion')
        ax.grid()
        
    else:
        return final_windows
    

# define figure with axes
fig = plt.figure(figsize = (9, 9))
ax1 = plt.subplot(211)
ax2 = plt.subplot(212)

# define slider
slider_layout = widgets.Layout(width = '75%')
slider_energy = widgets.FloatSlider(value=0.5, min=0.0, max=1.0, step=0.01, description='Schwellwert für Energie:', orientation='horizontal',
                                    readout=True, readout_format='.2f', style={'description_width': 'initial'}, layout = slider_layout)
    
# plot signal
ax1.plot(x, y, linewidth=1, color='r')
ax1.set_xlim(x.min(), x.max())
ax1.set_xlabel("Sample")
ax1.set_ylabel("Amplitude")
ax1.set_title('Signal')
ax1.grid()
        

# plot areas of talking
energy_threshold = 0.05

# create graph with detected boundaries
detect_word_boundaries(y, energy_threshold, ax = ax2)
fig.tight_layout()

# Float slider
slider_location = widgets.Layout(justify_content = 'space-around')
box_layout = widgets.Layout(display='flex', align_items='stretch', width='100%')

box_slider_1 = widgets.VBox([slider_energy], layout = box_layout)
display(box_slider_1, layout = slider_location)

def change_slider(change):
    if change['type'] == 'change' and change['name'] == 'value':
        detect_word_boundaries(y, change['new'], ax = ax2)

slider_energy.observe(change_slider)



Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

VBox(children=(FloatSlider(value=0.5, description='Schwellwert für Energie:', layout=Layout(width='75%'), max=…

# 1. Am Anfang der Sprachdatei mahlzeit.wav gibt es ein Störgeräusch (bis ca. 6000 Samples).

Ist es möglich durch Einstellung des Schwellwertes dieses Störgeräusch zu eliminieren ohne ein Stück aus dem Wort zu schneiden? Wenn ja, ab welchem Schwellwert ist das möglich?

In [4]:
save_radio_select = 'save/radio_select_window_word.dill'

radio_button_array = np.array([False, False, False, False])
# try to load checkbox array
try: 
    with open(save_radio_select, 'rb') as fp:
        radio_button_array = dill.load(fp)
except:
    pass

radio_button_list = ['Nein.', 'Ja, ab 20%.', 'Ja, ab 10%.', 'Ja, ab 1%.']
value_radio = [None if len(np.where(radio_button_array == True)[0]) == 0 else 
               radio_button_list[np.where(radio_button_array == True)[0][0]]]

def callback_checkbox(change):
    if change['type'] == 'change' and change['name'] == 'value':
        radio_button_array[radio_button_list.index(change['new'])] = not radio_button_array[radio_button_list.index(change['new'])]
        if change['old']:
            radio_button_array[radio_button_list.index(change['old'])] = not radio_button_array[radio_button_list.index(change['old'])]
        # save when changed
        with open(save_radio_select, 'wb') as fp:
            dill.dump(radio_button_array, fp)            

radio_buttons = widgets.RadioButtons(options=radio_button_list, 
                                     value=value_radio[0], layout = {'width' : '500px'})

radio_buttons.observe(callback_checkbox)

display(radio_buttons)

RadioButtons(index=1, layout=Layout(width='500px'), options=('Nein.', 'Ja, ab 20%.', 'Ja, ab 10%.', 'Ja, ab 1%…

In [5]:
# Autograding answer, please ignore
