# Spoken Digit Challenge

This is the first challenge of our Speech and Machine Learning Workshop. Here we will use the [FSDD][] Free Spoken Digit Dataset to build different models and recognize the digits from speech.   

**Note:** Make sure that your dataset is in the correct folder - if there's something not working for you, feel free to ask.

* 1500 recordings in total (150 per digit)
* 8kHz sampling rate
* 3 speakers
* English 
* File format: {digit\_label}\_{speaker\_name}\_{index}.wav <br> (e.g. "4\_jackson\_16.wav")

[FSDD]: https://github.com/Jakobovski/free-spoken-digit-dataset  

## Feature Extraction

First, we will extract our features from the audio files. Two files will be generated - one for the features and one for the corresponding labels. Each line of the feature file, together with the same line of the label file, represents a single audio file.

In [None]:
# Import the relevant modules to be used later
import glob
import os
import librosa, librosa.display
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import specgram



# Config matplotlib for inline plotting
%matplotlib inline

In [None]:
# Folder that holds the recordings (the trailing slash is part of the prefix)
DATASET_DIR = "dataset/"

# Build the path of every .wav file found directly inside the dataset directory
sound_paths = [
    DATASET_DIR + entry
    for entry in os.listdir(DATASET_DIR)
    if entry.endswith('.wav')
]

In [None]:
def load_sound_files(file_paths):
    """Load every file in ``file_paths`` and return the decoded signals as a list.

    Each path is handed to ``librosa.load`` with its default arguments; only the
    first element of each result is kept, the second one is discarded.
    """
    signals = []
    for path in file_paths:
        loaded = librosa.load(path)
        signals.append(loaded[0])
    return signals

def plot_wave(sound_name_with_raw_data, sr=8000):
    """Plot the waveform of each recording in a two-column grid of subplots.

    Parameters
    ----------
    sound_name_with_raw_data : sequence of (title, samples) pairs
        ``title`` is used as the subplot title, ``samples`` is the raw signal.
    sr : int, optional
        Sample rate used to scale the time axis. Must match the rate the
        samples were loaded with. Defaults to 8000.
    """
    n_plots = len(sound_name_with_raw_data)
    # np.ceil returns a float, but Matplotlib only accepts integer grid sizes.
    n_rows = int(np.ceil(n_plots / 2.0))

    # librosa 0.10 removed `waveplot` in favour of `waveshow`; keep using the
    # original function where it still exists and fall back otherwise.
    draw_wave = getattr(librosa.display, "waveplot", None) or librosa.display.waveshow

    plt.figure(figsize=(15, 2 * n_plots if n_plots > 1 else 4))
    for i, (n, d) in enumerate(sound_name_with_raw_data, start=1):
        plt.subplot(n_rows, 2, i)

        # wave plot
        draw_wave(np.array(d), sr=sr)

        plt.ylabel('Amplitude')
        plt.title(n)
    plt.subplots_adjust(top=0.8, bottom=0.08, left=0.10, right=0.95, hspace=0.5, wspace=0.35)
    plt.show()
    
def plot_spectrogram(sound_name_with_raw_data, sr=8000):
    """Plot a dB-scaled spectrogram of each recording in a two-column grid.

    Parameters
    ----------
    sound_name_with_raw_data : sequence of (title, samples) pairs
        ``title`` is used as the subplot title, ``samples`` is the raw signal.
    sr : int, optional
        Sample rate passed to ``specgram`` as ``Fs``. Must match the rate the
        samples were loaded with. Defaults to 8000.
    """
    n_plots = len(sound_name_with_raw_data)
    # np.ceil returns a float, but Matplotlib only accepts integer grid sizes.
    n_rows = int(np.ceil(n_plots / 2.0))

    plt.figure(figsize=(15, 2 * n_plots if n_plots > 1 else 4))
    for i, (n, d) in enumerate(sound_name_with_raw_data, start=1):
        plt.subplot(n_rows, 2, i)

        # Spectrogram
        specgram(np.array(d), Fs=sr, NFFT=512, noverlap=248, scale="dB", vmax=20)

        plt.title(n)
    plt.subplots_adjust(top=0.8, bottom=0.08, left=0.10, right=0.95, hspace=0.5, wspace=0.35)
    plt.show()

In [None]:
# Target sound filenames for visualization
sound_filenames = [str(i) + '_jackson_0.wav' for i in range(0, 10)]

# Load sound files used in visualization.
# librosa.load resamples to 22050 Hz unless told otherwise, while the plotting
# helpers label their axes for 8000 Hz, so request 8000 Hz explicitly here.
# Sorting the paths makes the plots appear in a stable order.
sound_name_with_raw_data = [
    ("Digit " + os.path.basename(p)[0], librosa.load(p, sr=8000)[0])
    for p in sorted(sound_paths)
    if os.path.basename(p) in sound_filenames
]

In [None]:
# plot_wave(sound_name_with_raw_data)

In [None]:
# plot_spectrogram(sound_name_with_raw_data)

In [None]:
def extract_features(file_name):
    """Return a fixed-length MFCC feature vector for one audio file.

    The file is loaded with ``librosa.load`` (default arguments), 40 MFCCs are
    computed per frame and then averaged over all frames.

    Parameters
    ----------
    file_name : str
        Path of the audio file to read.

    Returns
    -------
    numpy.ndarray
        1-D array holding the 40 time-averaged MFCC values.
    """
    X, sample_rate = librosa.load(file_name)

    # Per-frame MFCCs, averaged over time -> one value per coefficient
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)

    # Optional alternatives, kept for experimentation. To use one, uncomment it
    # and add it to the list passed to np.hstack below.
    # The STFT magnitude is only needed by `chroma` and `contrast`, so it is not
    # computed unless one of them is enabled (it used to be computed for every
    # file and thrown away).
    #stft = np.abs(librosa.stft(X))

    #var = np.var(mfccs, axis=0)
    #stddev = np.std(mfccs, axis=0)
    #mean = np.mean(mfccs, axis=0)
    #mi = mfccs.min(axis=0)
    #first_q = np.percentile(mfccs, 25, axis=0)
    #median = np.median(mfccs, axis=0)
    #third_q = np.percentile(mfccs, 75, axis=0)
    #ma = mfccs.max(axis=0)

    #chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
    #mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T,axis=0)
    #contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
    #tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)

    # Concatenate all selected feature vectors into one flat vector
    features = np.hstack([mfccs])
    return features

def get_features_and_labels(sound_paths):
    """Extract a feature vector and a digit label for every file in ``sound_paths``.

    The label is the first character of the file's base name, converted to an
    integer (e.g. "4_jackson_16.wav" -> 4).

    Parameters
    ----------
    sound_paths : iterable of str
        Paths of the audio files to process.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        A float64 matrix with one feature row per file, and an integer vector
        with the matching labels. For empty input the matrix has shape (0, 0)
        and the label vector is empty.
    """
    feature_rows = []
    digit_labels = []
    for p in sound_paths:
        feature_rows.append(extract_features(p))
        digit_labels.append(int(os.path.basename(p)[0]))

    if not feature_rows:
        return np.empty((0, 0)), np.empty(0, dtype=int)

    # Stack once at the end instead of growing the arrays on every iteration.
    # float64 matches what the previous incremental vstack produced.
    features = np.vstack(feature_rows).astype(np.float64)
    # The builtin `int` replaces the `np.int` alias, which NumPy 1.24 removed.
    return features, np.array(digit_labels, dtype=int)

def one_hot_encode(labels):
    """Convert a sequence of integer class labels into a one-hot matrix.

    Parameters
    ----------
    labels : sequence of int
        Class index of every sample.

    Returns
    -------
    numpy.ndarray
        Matrix with one row per label; row ``i`` is all zeros except for a 1 in
        column ``labels[i]``. The matrix is wide enough for the largest label,
        so label sets with gaps (e.g. ``[0, 2]``) are encoded correctly instead
        of raising ``IndexError``. Empty input yields a (0, 0) matrix.
    """
    labels = np.asarray(labels)
    n_labels = len(labels)
    if n_labels == 0:
        return np.zeros((0, 0))

    # Never narrower than the number of distinct labels (the previous width),
    # but wide enough to hold the largest label as a column index.
    n_classes = max(len(np.unique(labels)), int(labels.max()) + 1)

    encoded = np.zeros((n_labels, n_classes))
    encoded[np.arange(n_labels), labels] = 1
    return encoded

# Extract one feature row and one digit label for every file in sound_paths
features, labels = get_features_and_labels(sound_paths)
# Replace the digit labels with their one-hot encoded rows
labels = one_hot_encode(labels)


In [None]:
# Text file the feature matrix is written to and later read back from
FEATURE_PATH = 'features/features.txt'
# Text file the label matrix is written to and later read back from
LABEL_PATH = 'features/labels.txt'

In [None]:
# np.savetxt does not create missing directories, so make sure they exist first
os.makedirs(os.path.dirname(FEATURE_PATH) or '.', exist_ok=True)
os.makedirs(os.path.dirname(LABEL_PATH) or '.', exist_ok=True)

# One tab-separated line per recording in both files
np.savetxt(FEATURE_PATH, features, fmt='%10.5f', delimiter='\t')
np.savetxt(LABEL_PATH, labels, fmt='%i', delimiter='\t')

## Classification

Now, we will load our generated features and labels in order to train a classifier on them and evaluate its performance.

In [None]:
import random
from keras.models import Sequential
from keras.constraints import maxnorm
from keras.initializers import lecun_uniform
from keras import optimizers
from keras.layers import Dense, Dropout, Activation
from sklearn.metrics import (accuracy_score, confusion_matrix, precision_recall_fscore_support)

In [None]:
# Read the label matrix and the feature matrix back from their text files
labels = np.loadtxt(LABEL_PATH)
features = np.loadtxt(FEATURE_PATH)

# Number of columns of the feature matrix; used as the model's input size
feature_dim = features.shape[1]

print('Label shape: ' + str(labels.shape))
print('Feature dimensions: ' + str(feature_dim))

In [None]:

def split_train_test_eval (features, labels, train_percentage, test_percentage, eval_percentage):
    """Shuffle the samples and split them into train, test and eval subsets.

    Features and labels are shuffled together (using the ``random`` module), so
    every feature row keeps its label. The split sizes are printed.

    Parameters
    ----------
    features, labels : array-like
        Samples and their labels; paired by position. If the lengths differ,
        only the first ``min(len(features), len(labels))`` pairs are used.
    train_percentage, test_percentage, eval_percentage : float
        Fraction (0..1) of the samples assigned to each subset.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_features, train_labels, test_features, test_labels,
        eval_features, eval_labels)``.

    Notes
    -----
    Subset sizes are rounded rather than truncated, because truncation loses
    samples to floating point error (``int(100 * 0.29) == 28``). A subset never
    takes more samples than are left, and when the three fractions sum to 1 the
    eval subset receives all remaining samples so that none is dropped.
    """
    features = np.asarray(features)
    labels = np.asarray(labels)

    sample_size = min(len(features), len(labels))
    print('Number of total samples: ' + str(sample_size))

    # Shuffle one index list and apply it to both arrays to keep pairs aligned
    order = list(range(sample_size))
    random.shuffle(order)
    order = np.array(order, dtype=np.intp)
    features = features[order]
    labels = labels[order]

    def _count(fraction, available):
        # Rounded share of the data set, limited to the range 0..available
        return max(0, min(int(round(sample_size * fraction)), available))

    train_samples = _count(train_percentage, sample_size)
    test_samples = _count(test_percentage, sample_size - train_samples)
    remaining = sample_size - train_samples - test_samples

    # just to make sure that we end up with the actual sample size:
    if abs(train_percentage + test_percentage + eval_percentage - 1.0) < 1e-9:
        eval_samples = remaining
    else:
        eval_samples = _count(eval_percentage, remaining)

    print('Train sample size: ' + str(train_samples))
    print('Test sample size: ' + str(test_samples))
    print('Eval sample size: ' + str(eval_samples))

    test_end = train_samples + test_samples
    eval_end = test_end + eval_samples

    train_features = features[0 : train_samples]
    train_labels = labels[0 : train_samples]

    test_features = features[train_samples : test_end]
    test_labels = labels[train_samples : test_end]

    eval_features = features[test_end : eval_end]
    eval_labels = labels[test_end : eval_end]

    return train_features, train_labels, test_features, test_labels, eval_features, eval_labels
        
# Shuffle and split the data: 50% train, 30% test, 20% eval
train_features, train_labels, test_features, test_labels, eval_features, eval_labels = split_train_test_eval (features, labels, 0.5, 0.3, 0.2)
    
# (features, labels) pair of the eval subset, passed to model.fit as validation_data
evaluation = (eval_features, eval_labels)

### Building our model

In [None]:
# Fully connected network: three ReLU hidden layers and a 10-way output layer
model = Sequential()
model.add(Dense(units=300, input_dim=feature_dim, activation="relu"))
model.add(Dense(units=300, activation="relu"))
model.add(Dense(units=50, activation="relu"))
# Every recording contains exactly one digit, so this is single-label
# classification over 10 classes: softmax yields one probability distribution
# over the digits instead of 10 independent sigmoid outputs.
model.add(Dense(units=10, activation="softmax"))

opt = optimizers.SGD(lr=0.001, clipvalue=0.5)
# categorical_crossentropy matches the one-hot labels. With binary_crossentropy
# Keras reports *binary* accuracy (averaged over all 10 outputs), which looks
# high even when the predicted digit is wrong.
model.compile(loss="categorical_crossentropy",
              optimizer=opt,
              metrics=["accuracy"])

# The eval subset is only used to monitor validation loss/accuracy per epoch
model.fit(train_features, train_labels, validation_data=evaluation, epochs=100, batch_size=8)

# Despite its name this is the path of the saved model file, not a directory
MODEL_DIR = "models/model1.model"

# model.save does not create the parent directory
os.makedirs(os.path.dirname(MODEL_DIR), exist_ok=True)
model.save(MODEL_DIR)

In [None]:
# `predict_proba` / `predict_classes` were removed from Keras Sequential models;
# `predict` returns the per-class scores on every version, and the predicted
# class is the index of the highest score.
prediction_probabilities = np.array(model.predict(test_features))
prediction = np.argmax(prediction_probabilities, axis=1)

# Turn the one-hot test labels back into class indices
test_classes = np.argmax(test_labels, axis=1)
print(prediction)
print(test_classes)

accuracy = accuracy_score(test_classes, prediction)
print('Accuracy: ' + str(accuracy))

# Rows are the true digits, columns the predicted digits
conf_mat = confusion_matrix(test_classes, prediction)
print(conf_mat)