# Spoken Digit Challenge
This is the first challenge of our Speech and Machine Learning Workshop. Here we will use the Free Spoken Digit Dataset ([FSDD][]) to build different models that recognize spoken digits from speech recordings.

**Note:** Make sure that your dataset is in the correct folder. If something is not working for you, feel free to ask.

* 1500 recordings in total (150 per digit)
* 8kHz sampling rate
* 3 speakers
* English 
* File format: {digit\_label}\_{speaker\_name}\_{index}.wav <br> (e.g. "4\_jackson\_16.wav")

[FSDD]: https://github.com/Jakobovski/free-spoken-digit-dataset  

In [16]:
# Import the relevant modules to be used later
import glob
import os
import librosa, librosa.display
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import specgram



# Config matplotlib for inline plotting
%matplotlib inline

In [17]:
# Directory that holds the dataset recordings
DATASET_DIR = "dataset/"

# Collect the paths of the .wav files found in the dataset directory,
# keeping only those whose file name contains 'jackson'.
sound_paths = []
for entry in os.listdir(DATASET_DIR):
    if entry.endswith('.wav') and 'jackson' in entry:
        sound_paths.append(DATASET_DIR + entry)

In [18]:
def load_sound_files(file_paths):
    """Load every audio file in ``file_paths`` and return the sample arrays.

    Args:
        file_paths: iterable of paths handed to ``librosa.load`` one by one.

    Returns:
        A list with the first element of each ``librosa.load`` result (the
        audio samples); the second element of each result is discarded.
    """
    raw_sounds = []
    for path in file_paths:
        samples, _ = librosa.load(path)
        raw_sounds.append(samples)
    return raw_sounds

def plot_wave(sound_name_with_raw_data, sr=8000):
    """Plot the waveform of each sound in a two-column grid of subplots.

    Args:
        sound_name_with_raw_data: sequence of ``(title, samples)`` pairs; the
            title becomes the subplot title and the samples are plotted.
        sr: sampling rate in Hz used for the time axis. Defaults to 8000,
            the value that was previously hard-coded.
    """
    n_sounds = len(sound_name_with_raw_data)
    # plt.subplot needs an integer row count; np.ceil returns a float, which
    # recent Matplotlib versions reject. This is ceil(n_sounds / 2).
    n_rows = (n_sounds + 1) // 2

    # Newer librosa releases provide waveshow and no longer ship waveplot;
    # older releases only have waveplot. Use whichever is available.
    draw_wave = getattr(librosa.display, 'waveshow', None)
    if draw_wave is None:
        draw_wave = librosa.display.waveplot

    plt.figure(figsize=(15, 2 * n_sounds if n_sounds > 1 else 4))
    for i, (name, data) in enumerate(sound_name_with_raw_data, start=1):
        plt.subplot(n_rows, 2, i)

        # wave plot
        draw_wave(np.array(data), sr=sr)

        plt.ylabel('Amplitude')
        plt.title(name)
    plt.subplots_adjust(top=0.8, bottom=0.08, left=0.10, right=0.95, hspace=0.5, wspace=0.35)
    plt.show()
    
def plot_spectrogram(sound_name_with_raw_data, sr=8000):
    """Plot a spectrogram of each sound in a two-column grid of subplots.

    Args:
        sound_name_with_raw_data: sequence of ``(title, samples)`` pairs; the
            title becomes the subplot title and the samples are analysed.
        sr: sampling rate in Hz passed to ``specgram`` as ``Fs``. Defaults to
            8000, the value that was previously hard-coded.
    """
    n_sounds = len(sound_name_with_raw_data)
    # plt.subplot needs an integer row count; np.ceil returns a float, which
    # recent Matplotlib versions reject. This is ceil(n_sounds / 2).
    n_rows = (n_sounds + 1) // 2

    plt.figure(figsize=(15, 2 * n_sounds if n_sounds > 1 else 4))
    for i, (name, data) in enumerate(sound_name_with_raw_data, start=1):
        plt.subplot(n_rows, 2, i)

        # Spectrogram
        specgram(np.array(data), Fs=sr, NFFT=512, noverlap=248, scale="dB", vmax=20)

        plt.title(name)
    plt.subplots_adjust(top=0.8, bottom=0.08, left=0.10, right=0.95, hspace=0.5, wspace=0.35)
    plt.show()

In [19]:
# Target sound filenames for visualization: recording 0 of 'jackson' for each digit
sound_filenames = [str(i) + '_jackson_0.wav' for i in range(0, 10)]

# Load sound files used in visualization as ("Digit <d>", samples) pairs.
# - sr=8000 keeps the samples at the rate the plotting helpers assume;
#   librosa.load would otherwise resample to its own default rate and the
#   time/frequency axes of the plots would be mislabelled.
# - sorted() makes the plots appear in digit order, since os.listdir returns
#   entries in arbitrary order.
sound_name_with_raw_data = [
    ("Digit " + os.path.basename(p)[0], librosa.load(p, sr=8000)[0])
    for p in sorted(sound_paths)
    if os.path.basename(p) in sound_filenames
]

In [20]:
# plot_wave(sound_name_with_raw_data)

In [21]:
# plot_spectrogram(sound_name_with_raw_data)

In [22]:
def extract_feature(file_name):
    """Compute time-averaged spectral features for one audio file.

    The file is loaded with ``librosa.load`` (using librosa's default
    sampling rate) and every feature matrix is averaged over its frame axis,
    so each returned value is a 1-D vector.

    Args:
        file_name: path of the audio file to analyse.

    Returns:
        Tuple ``(mfccs, chroma, mel, contrast, tonnetz)`` of 1-D arrays:
        40 MFCCs, chroma, mel-spectrogram, spectral contrast and tonnetz
        means. ``get_features_and_labels`` stacks these into rows of 193
        values.
    """
    X, sample_rate = librosa.load(file_name)
    # Magnitude spectrogram shared by the chroma and contrast features
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    # The signal must be passed as y=...: recent librosa releases accept it
    # as a keyword argument only and raise TypeError for a positional one.
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
    # Tonnetz is computed on the harmonic component of the signal
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
    return mfccs, chroma, mel, contrast, tonnetz

def get_features_and_labels(sound_paths):
    """Build the feature matrix and the digit labels for a list of files.

    Args:
        sound_paths: iterable of audio file paths. The first character of
            each file's base name is parsed as its integer label.

    Returns:
        Tuple ``(features, labels)``: ``features`` is a float array with one
        193-value row per file (shape ``(0, 193)`` when no paths are given),
        ``labels`` is a 1-D integer array with one entry per file.
    """
    feature_rows = []
    labels = []
    for p in sound_paths:
        mfccs, chroma, mel, contrast, tonnetz = extract_feature(p)
        feature_rows.append(np.hstack([mfccs, chroma, mel, contrast, tonnetz]))
        labels.append(int(os.path.basename(p)[0]))

    # Stack once instead of growing the array on every iteration. The leading
    # empty (0, 193) block keeps the result 2-D for empty input and still
    # enforces the expected row width.
    features = np.vstack([np.empty((0, 193)), *feature_rows])
    # np.int was removed from NumPy; the builtin int is the equivalent dtype.
    return features, np.array(labels, dtype=int)

def one_hot_encode(labels, n_classes=None):
    """Convert integer class labels into a one-hot matrix.

    Args:
        labels: 1-D sequence or array of non-negative integer labels.
        n_classes: number of columns in the result. When omitted it is
            ``max(labels) + 1``, so a label set with gaps (e.g. ``[0, 2]``)
            still gets a column for every index it uses.

    Returns:
        Float array of shape ``(len(labels), n_classes)`` with a single 1 per
        row at the column given by that row's label.

    Raises:
        ValueError: if a label is negative or not smaller than ``n_classes``.
    """
    labels = np.asarray(labels)
    n_labels = len(labels)
    if n_labels == 0:
        return np.zeros((0, n_classes or 0))

    if labels.min() < 0:
        raise ValueError("labels must be non-negative integers")
    if n_classes is None:
        # Sizing by the number of *unique* labels breaks as soon as a class
        # is missing from the input; the largest label is what matters.
        n_classes = int(labels.max()) + 1
    elif labels.max() >= n_classes:
        raise ValueError("labels must be smaller than n_classes")

    encoded = np.zeros((n_labels, n_classes))
    encoded[np.arange(n_labels), labels] = 1
    return encoded

In [23]:
# Extract the feature matrix and the integer digit labels, then turn the
# labels into their one-hot representation.
features, digit_labels = get_features_and_labels(sound_paths)
labels = one_hot_encode(digit_labels)

In [26]:
# Text file the feature matrix is written to
FEATURE_PATH = 'features/features.txt'
# Text file the label matrix is written to
LABEL_PATH = 'features/labels.txt'

In [28]:
# np.savetxt does not create missing directories, so make sure the output
# folders exist before writing.
for out_path in (FEATURE_PATH, LABEL_PATH):
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

# Tab-separated text files: features with 5 decimals, labels as integers
np.savetxt(FEATURE_PATH, features, fmt='%10.5f', delimiter='\t')
np.savetxt(LABEL_PATH, labels, fmt='%i', delimiter='\t')