# CNN model for speech recognition

`The idea of this model is simple, we first convert the audio file into matrices, then use convolutional neural network to train the target`

[1. Reading data from audio files](#data_io)

In [1]:
# input, output and command line tools
import os
from os.path import isdir, join
import pandas as pd

#math and data handler
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

# audio file i/o
from scipy.fftpack import fft
from scipy import signal
from scipy.io import wavfile

In [2]:
#Visualization
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

mpl.rc('font', family = 'serif', size = 17)
mpl.rcParams['xtick.major.size'] = 5
mpl.rcParams['xtick.minor.size'] = 2
mpl.rcParams['ytick.major.size'] = 5
mpl.rcParams['ytick.minor.size'] = 2

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

### Reading data from audio files
<a id="data_io"></a> 

In [3]:
data_dir = '../data/train/audio'

## change the name of `_background_noise_' into 'silence` which is a proper label name
if os.path.exists(data_dir + '/' + '_background_noise_'):
    os.system('mv {0}/_background_noise_ {1}/silence'.format(data_dir, data_dir))
if os.path.exists(data_dir + '/' + 'silence/README.md'):
     os.system('rm {0}/silence/README.md'.format(data_dir))

##### Reading audio one by one and split them into datasets with different labels
There are 31 labels `['house', 'off', 'yes', 'happy', 'zero', 'six', 'silence', 'tree', 'stop', 'four', 'marvin', 'seven', 'cat', 'dog', 'up', 'down', 'one', 'sheila', 'bird', 'on', 'left', 'eight', 'five', 'nine', 'two', 'three', 'go', 'no', 'bed', 'wow', 'right']`, which will be indexed via `0 ~ 30`

In [4]:
labels = os.listdir(data_dir)
print(labels, len(labels))

(['house', 'off', 'yes', 'happy', 'zero', 'six', 'silence', 'tree', 'stop', 'four', 'marvin', 'seven', 'cat', 'dog', 'up', 'down', 'one', 'sheila', 'bird', 'on', 'left', 'eight', 'five', 'nine', 'two', 'three', 'go', 'no', 'bed', 'wow', 'right'], 31)


###### Define a function for reading data from all the folder and all the files in them
```return: a pandas DataFrame```

In [5]:
def load_audio_data(path):
    '''
    path: audio file path
    return: pd.DataFrame
    '''
    raw = {'x': [], 'y': [], 'label':[]}
    for i, folder in enumerate(os.listdir(path)):
        for filename in os.listdir(path + '/' + folder):
            rate, sample = wavfile.read(data_dir + '/' + folder + '/' + filename)
            assert(rate == 16000)
            raw['x'].append(np.array(sample))
            raw['y'].append(i)
            raw['label'].append(folder)
    return pd.DataFrame(raw)
%time raw_df = load_audio_data(data_dir)


Chunk (non-data) not understood, skipping it.



CPU times: user 2.77 s, sys: 2.29 s, total: 5.06 s
Wall time: 5.08 s


In [6]:
# show the data Frame
display(raw_df[:5])

Unnamed: 0,label,x,y
0,house,"[-173, -197, -194, -216, -201, -275, -255, -24...",0
1,house,"[6, -56, -4, 8, -18, 76, 49, 24, 59, 42, 54, 5...",0
2,house,"[10, 10, 17, 19, 27, 19, 4, 11, 10, 17, 16, 17...",0
3,house,"[32, 63, 86, 79, 89, 109, 76, 122, 92, 18, 65,...",0
4,house,"[-83, -62, -93, -71, -16, -3, 1, -43, -6, -96,...",0


In [7]:
from sklearn.utils import shuffle
def train_test_split(df, ratio = 0.7):
    '''
    return train_sets + test_sets + label_map, which maps from y to label name
    '''
    test_x = []
    test_y = []
    train_x = []
    train_y = []
    label_map = {}
    for i in set(df.y.tolist()):
        tmp_df = df[df.y == i]
        label_map[i] = tmp_df.label.tolist()[0]
        tmp_df = shuffle(tmp_df)
        tmp_n = int(len(tmp_df)*ratio)
        train_x += tmp_df.x.tolist()[: tmp_n]
        test_x += tmp_df.x.tolist()[tmp_n: ]
        train_y += tmp_df.y.tolist()[: tmp_n]
        test_y += tmp_df.y.tolist()[tmp_n: ]
    return np.array(train_x), np.array(train_y), np.array(test_x), np.array(test_y), label_map

In [8]:
# Getting testing and trainning set
%time tr_x, tr_y, ts_x, ts_y, idmap = train_test_split(raw_df, ratio=0.7)
print(np.shape(tr_x))
print(np.shape(tr_y))
print(np.shape(ts_x))
print(np.shape(ts_y))
print(type(tr_x[0]), np.shape(tr_x[0]))
print(idmap)

CPU times: user 136 ms, sys: 4 ms, total: 140 ms
Wall time: 140 ms
(45296,)
(45296,)
(19431,)
(19431,)
(<type 'numpy.ndarray'>, (16000,))
{0: 'house', 1: 'off', 2: 'yes', 3: 'happy', 4: 'zero', 5: 'six', 6: 'silence', 7: 'tree', 8: 'stop', 9: 'four', 10: 'marvin', 11: 'seven', 12: 'cat', 13: 'dog', 14: 'up', 15: 'down', 16: 'one', 17: 'sheila', 18: 'bird', 19: 'on', 20: 'left', 21: 'eight', 22: 'five', 23: 'nine', 24: 'two', 25: 'three', 26: 'go', 27: 'no', 28: 'bed', 29: 'wow', 30: 'right'}


##### Using fft for certain time intervals to operate all the data read from audio files

In [9]:
def fft_convert(samples, rate = 16000, n = 25, m = 16, NR = 256, NC = 128, delta = 1.E-10):
    '''
    convert input data into a big spectrum matrix
    '''
    res = []
    for i,sam in enumerate(samples):
        if(i % 1000 == 0):
            print(i)
        freq, times, spec = signal.spectrogram(sam, fs=rate, window=('kaiser',10), nperseg=int(n*rate/1000),
                                               noverlap=int(m*rate/1000))
        p1 = max(0, NR - np.shape(spec)[0])
        p2 = max(0, NC - np.shape(spec)[1])
        spec = np.pad(spec, [(0,p1), (0, p2)], mode='constant')
        spec = spec[:NR, :NC]
        res.append(spec)
    return np.log(np.array(res) + delta)
        
    
tr_x = tr_x[:10000]
ts_x = ts_x[:3000]
tr_y = tr_y[:10000]
ts_y = ts_y[:3000]

%time train_x = fft_convert(tr_x)
%time test_x = fft_convert(ts_x)

print np.shape(train_x), np.shape(test_x)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
CPU times: user 22.2 s, sys: 1.54 s, total: 23.8 s
Wall time: 23.8 s
0
1000
2000
CPU times: user 6.71 s, sys: 292 ms, total: 7 s
Wall time: 7 s
(10000, 256, 128) (3000, 256, 128)


###### Finally we are going to train models

In [10]:
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Flatten, Conv2D, MaxPooling2D
from keras.optimizers import SGD, Adam, RMSprop, Adadelta
from keras.utils import np_utils, plot_model

Using TensorFlow backend.


Couldn't import dot_parser, loading of dot files will not be possible.


In [11]:
img_r, img_c = np.shape(train_x)[1:]
train_x = train_x.reshape(len(train_x), img_r, img_c, 1)
test_x = test_x.reshape(len(test_x), img_r, img_c, 1)
print np.shape(train_x), np.shape(test_x)

(10000, 256, 128, 1) (3000, 256, 128, 1)


##### Convert target

In [12]:
n_cls = 31
train_y = np.zeros((len(tr_y), n_cls))
test_y = np.zeros((len(ts_y), n_cls))
for i in range(len(tr_y)):
    train_y[i][tr_y[i]] = 1
for i in range(len(ts_y)):
    test_y[i][ts_y[i]] = 1
print np.shape(train_y), np.shape(test_y)

(10000, 31) (3000, 31)


In [13]:
### Construct the model
model = Sequential()
model.add(MaxPooling2D(pool_size = (2, 2), input_shape = (img_r, img_c, 1)))
model.add(Conv2D(32, kernel_size = (5, 5), padding = 'same'))
model.add(MaxPooling2D(pool_size = (2, 2)))
model.add(Activation('relu'))
model.add(Conv2D(64, kernel_size = (5, 5), padding = 'same'))
model.add(MaxPooling2D(pool_size = (2, 2)))
model.add(Activation('relu'))
model.add(Dropout(0.25))
model.add(Conv2D(64, kernel_size = (5, 5), padding = 'same'))
model.add(MaxPooling2D(pool_size = (2, 2)))
model.add(Activation('relu'))
model.add(Dropout(0.25))
model.add(Conv2D(64, kernel_size = (5, 5), padding = 'same'))
model.add(MaxPooling2D(pool_size = (2, 2)))
model.add(Activation('relu'))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.25))
model.add(Dense(n_cls, activation = 'softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
max_pooling2d_1 (MaxPooling2 (None, 128, 64, 1)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 128, 64, 32)       832       
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 64, 32, 32)        0         
_________________________________________________________________
activation_1 (Activation)    (None, 64, 32, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 64, 32, 64)        51264     
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 32, 16, 64)        0         
_________________________________________________________________
activation_2 (Activation)    (None, 32, 16, 64)        0         
__________

In [14]:
### Compile the model
optimizer = SGD()
loss = 'categorical_crossentropy'
metrics = ['accuracy']
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [15]:
### Train the model
res = model.fit(train_x, train_y, batch_size = 128, epochs = 20, verbose = 1, validation_data = (test_x, test_y))

Train on 10000 samples, validate on 3000 samples
Epoch 1/6


ResourceExhaustedError: OOM when allocating tensor with shape[128,32,128,64]
	 [[Node: conv2d_1/convolution = Conv2D[T=DT_FLOAT, data_format="NHWC", padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/gpu:0"](max_pooling2d_1/MaxPool, conv2d_1/kernel/read)]]
	 [[Node: mul_2/_43 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_966_mul_2", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]

Caused by op u'conv2d_1/convolution', defined at:
  File "/opt/anaconda2/lib/python2.7/runpy.py", line 174, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/opt/anaconda2/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/opt/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/opt/anaconda2/lib/python2.7/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/opt/anaconda2/lib/python2.7/site-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "/opt/anaconda2/lib/python2.7/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/opt/anaconda2/lib/python2.7/site-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "/opt/anaconda2/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/opt/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/opt/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/opt/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/opt/anaconda2/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/opt/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/opt/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/opt/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "/opt/anaconda2/lib/python2.7/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/opt/anaconda2/lib/python2.7/site-packages/ipykernel/zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/opt/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/opt/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/opt/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-13-cecbc64c0209>", line 4, in <module>
    model.add(Conv2D(32, kernel_size = (5, 5), padding = 'same'))
  File "/opt/anaconda2/lib/python2.7/site-packages/keras/models.py", line 455, in add
    output_tensor = layer(self.outputs[0])
  File "/opt/anaconda2/lib/python2.7/site-packages/keras/engine/topology.py", line 554, in __call__
    output = self.call(inputs, **kwargs)
  File "/opt/anaconda2/lib/python2.7/site-packages/keras/layers/convolutional.py", line 164, in call
    dilation_rate=self.dilation_rate)
  File "/opt/anaconda2/lib/python2.7/site-packages/keras/backend/tensorflow_backend.py", line 2862, in conv2d
    data_format='NHWC')
  File "/opt/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/nn_ops.py", line 672, in convolution
    op=op)
  File "/opt/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/nn_ops.py", line 338, in with_space_to_batch
    return op(input, num_spatial_dims, padding)
  File "/opt/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/nn_ops.py", line 664, in op
    name=name)
  File "/opt/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/nn_ops.py", line 131, in _non_atrous_convolution
    name=name)
  File "/opt/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/gen_nn_ops.py", line 397, in conv2d
    data_format=data_format, name=name)
  File "/opt/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "/opt/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2630, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/opt/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1204, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[128,32,128,64]
	 [[Node: conv2d_1/convolution = Conv2D[T=DT_FLOAT, data_format="NHWC", padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/gpu:0"](max_pooling2d_1/MaxPool, conv2d_1/kernel/read)]]
	 [[Node: mul_2/_43 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_966_mul_2", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
