# CNN model for speech recognition

`The idea of this model is simple: we first convert each audio file into a matrix, then train a convolutional neural network on those matrices to predict the target label.`

[1. Reading data from audio files](#data_io)

In [1]:
# input, output and command line tools
import os
from os.path import isdir, join
import pandas as pd

#math and data handler
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

# audio file i/o
from scipy.fftpack import fft
from scipy import signal
from scipy.io import wavfile

In [2]:
#Visualization
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

# Global matplotlib look: serif 17pt font and the same tick lengths on both axes
mpl.rc('font', family = 'serif', size = 17)
for _axis in ('xtick', 'ytick'):
    mpl.rcParams[_axis + '.major.size'] = 5
    mpl.rcParams[_axis + '.minor.size'] = 2

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

### Reading data from audio files
<a id="data_io"></a> 

In [3]:
data_dir = '../data/train/audio'

# Rename the `_background_noise_` folder to `silence`, which is a proper label name.
# os.rename / os.remove are used instead of shelling out to `mv` / `rm`, so the
# cell also works where those commands are unavailable and a failed rename
# raises instead of being silently ignored.
noise_dir = os.path.join(data_dir, '_background_noise_')
silence_dir = os.path.join(data_dir, 'silence')
if os.path.exists(noise_dir):
    os.rename(noise_dir, silence_dir)

# Drop the README so the `silence` folder only contains files the loader can read.
readme_path = os.path.join(silence_dir, 'README.md')
if os.path.exists(readme_path):
    os.remove(readme_path)

##### Read the audio files one by one and group them into datasets by label
There are 31 labels `['house', 'off', 'yes', 'happy', 'zero', 'six', 'silence', 'tree', 'stop', 'four', 'marvin', 'seven', 'cat', 'dog', 'up', 'down', 'one', 'sheila', 'bird', 'on', 'left', 'eight', 'five', 'nine', 'two', 'three', 'go', 'no', 'bed', 'wow', 'right']`, which will be indexed via `0 ~ 30`

In [4]:
# Every entry of data_dir is treated as one class; the entry name is the label.
# The order is whatever os.listdir returns.
labels = os.listdir(data_dir)
print(labels, len(labels))

(['house', 'off', 'yes', 'happy', 'zero', 'six', 'silence', 'tree', 'stop', 'four', 'marvin', 'seven', 'cat', 'dog', 'up', 'down', 'one', 'sheila', 'bird', 'on', 'left', 'eight', 'five', 'nine', 'two', 'three', 'go', 'no', 'bed', 'wow', 'right'], 31)


###### Define a function that reads the data from every folder and every file inside it
```return: a pandas DataFrame```

In [5]:
def load_audio_data(path):
    '''
    Read every WAV file stored in the sub-folders of `path`.

    path: directory holding one sub-folder per label
    return: pd.DataFrame with one row per file and the columns
            'x'     - the samples as a np.ndarray
            'y'     - position of the folder in os.listdir(path)
            'label' - the folder name
    raises: AssertionError when a file is not sampled at 16000 Hz
    '''
    raw = {'x': [], 'y': [], 'label':[]}
    for i, folder in enumerate(os.listdir(path)):
        folder_path = os.path.join(path, folder)
        for filename in os.listdir(folder_path):
            # Read from the `path` argument; the previous version read from the
            # global `data_dir`, so any other directory was listed but never loaded.
            rate, sample = wavfile.read(os.path.join(folder_path, filename))
            assert rate == 16000, 'unexpected sample rate {0} in {1}/{2}'.format(rate, folder, filename)
            raw['x'].append(np.array(sample))
            raw['y'].append(i)
            raw['label'].append(folder)
    return pd.DataFrame(raw)
# Load the whole training directory into one DataFrame; %time reports the cost.
%time raw_df = load_audio_data(data_dir)


Chunk (non-data) not understood, skipping it.



CPU times: user 2.94 s, sys: 2.32 s, total: 5.26 s
Wall time: 5.27 s


In [6]:
# Preview the first five rows of the loaded DataFrame
display(raw_df.head(5))

Unnamed: 0,label,x,y
0,house,"[-173, -197, -194, -216, -201, -275, -255, -24...",0
1,house,"[6, -56, -4, 8, -18, 76, 49, 24, 59, 42, 54, 5...",0
2,house,"[10, 10, 17, 19, 27, 19, 4, 11, 10, 17, 16, 17...",0
3,house,"[32, 63, 86, 79, 89, 109, 76, 122, 92, 18, 65,...",0
4,house,"[-83, -62, -93, -71, -16, -3, 1, -43, -6, -96,...",0


In [7]:
from sklearn.utils import shuffle
def _to_sample_array(samples):
    '''
    Pack a list of clips into a numpy array.

    Clips of identical shape are stacked exactly as np.array() would do. Clips of
    different lengths are stored in a 1-D object array, because recent NumPy
    releases refuse to build a ragged array implicitly.
    '''
    if len(set(np.shape(s) for s in samples)) <= 1:
        return np.array(samples)
    packed = np.empty(len(samples), dtype=object)
    for k, s in enumerate(samples):
        packed[k] = s
    return packed


def train_test_split(df, ratio = 0.7, random_state = None):
    '''
    Stratified split: every class is shuffled and cut at `ratio` on its own.

    df: DataFrame with the columns 'x' (samples), 'y' (class index), 'label' (class name)
    ratio: fraction of each class that goes to the training set
    random_state: None (default, a different split on every call), an int seed,
                  or a np.random.RandomState, for a reproducible split
    return train_sets + test_sets + label_map, which maps from y to label name
    '''
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)
    test_x = []
    test_y = []
    train_x = []
    train_y = []
    label_map = {}
    # sorted() fixes the class order, which set iteration does not guarantee
    for i in sorted(set(df.y.tolist())):
        tmp_df = df[df.y == i]
        label_map[i] = tmp_df.label.tolist()[0]
        # One shared generator drives every class, so a single seed
        # reproduces the whole split.
        tmp_df = tmp_df.sample(frac = 1, random_state = rng)
        tmp_n = int(len(tmp_df)*ratio)
        xs = tmp_df.x.tolist()
        ys = tmp_df.y.tolist()
        train_x += xs[: tmp_n]
        test_x += xs[tmp_n: ]
        train_y += ys[: tmp_n]
        test_y += ys[tmp_n: ]
    return _to_sample_array(train_x), np.array(train_y), _to_sample_array(test_x), np.array(test_y), label_map

In [8]:
# Getting testing and trainning set
%time tr_x, tr_y, ts_x, ts_y, idmap = train_test_split(raw_df, ratio=0.7)
print(np.shape(tr_x))
print(np.shape(tr_y))
print(np.shape(ts_x))
print(np.shape(ts_y))
print(type(tr_x[0]), np.shape(tr_x[0]))
print(idmap)

CPU times: user 148 ms, sys: 4 ms, total: 152 ms
Wall time: 156 ms
(45296,)
(45296,)
(19431,)
(19431,)
(<type 'numpy.ndarray'>, (16000,))
{0: 'house', 1: 'off', 2: 'yes', 3: 'happy', 4: 'zero', 5: 'six', 6: 'silence', 7: 'tree', 8: 'stop', 9: 'four', 10: 'marvin', 11: 'seven', 12: 'cat', 13: 'dog', 14: 'up', 15: 'down', 16: 'one', 17: 'sheila', 18: 'bird', 19: 'on', 20: 'left', 21: 'eight', 22: 'five', 23: 'nine', 24: 'two', 25: 'three', 26: 'go', 27: 'no', 28: 'bed', 29: 'wow', 30: 'right'}


##### Convert every audio clip into a spectrogram by applying the FFT over short, overlapping time windows

In [9]:
def fft_convert(samples, rate = 16000, n = 25, m = 16, NR = 256, NC = 128, delta = 1.E-10):
    '''
    convert input data into a big spectrum matrix

    samples: iterable of 1-D sample arrays, one per clip
    rate: sampling rate in Hz
    n: window length in milliseconds (nperseg = n*rate/1000 samples)
    m: overlap between consecutive windows in milliseconds
    NR, NC: every spectrogram is zero-padded / cropped to NR rows and NC columns
    delta: offset added before the logarithm so empty bins stay finite
    return: np.ndarray of shape (len(samples), NR, NC) holding log(spectrogram + delta)
    '''
    nperseg = int(n*rate/1000)
    noverlap = int(m*rate/1000)
    res = []
    for i,sam in enumerate(samples):
        # progress marker every 1000 clips
        if(i % 1000 == 0):
            print(i)
        sam = np.asarray(sam)
        # A clip shorter than one window makes scipy shrink the window, which
        # changes the frequency resolution and raises once the window is no
        # longer than the overlap. Zero-pad such clips to one full window.
        missing = nperseg - sam.shape[-1]
        if missing > 0:
            sam = np.pad(sam, (0, missing), mode='constant')
        _, _, spec = signal.spectrogram(sam, fs=rate, window=('kaiser',10), nperseg=nperseg,
                                        noverlap=noverlap)
        # bring every spectrogram to exactly NR x NC: pad with zeros, then crop
        p1 = max(0, NR - np.shape(spec)[0])
        p2 = max(0, NC - np.shape(spec)[1])
        spec = np.pad(spec, [(0,p1), (0, p2)], mode='constant')
        spec = spec[:NR, :NC]
        res.append(spec)
    return np.log(np.array(res) + delta)

%time train_x = fft_convert(tr_x)
%time test_x = fft_convert(ts_x)

print np.shape(train_x), np.shape(test_x)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
CPU times: user 1min 32s, sys: 6.4 s, total: 1min 38s
Wall time: 1min 39s
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
CPU times: user 43.1 s, sys: 1.81 s, total: 44.9 s
Wall time: 45.1 s
(45296, 256, 128) (19431, 256, 128)


###### Finally we are going to train models

In [10]:
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Flatten, Conv2D, MaxPooling2D
from keras.optimizers import SGD, Adam, RMSprop, Adadelta
from keras.utils import np_utils, plot_model

Using TensorFlow backend.


Couldn't import dot_parser, loading of dot files will not be possible.


In [11]:
# Add a trailing channel axis so the arrays match the (rows, cols, channels)
# input of the convolution layers. Slicing [1:3] instead of [1:] keeps this
# cell re-runnable: once the arrays are 4-D, [1:] would yield three values
# and the unpacking would fail.
img_r, img_c = np.shape(train_x)[1:3]
train_x = train_x.reshape(len(train_x), img_r, img_c, 1)
test_x = test_x.reshape(len(test_x), img_r, img_c, 1)
# single formatted argument: same output on Python 2 and Python 3
print('{0} {1}'.format(np.shape(train_x), np.shape(test_x)))

(45296, 256, 128, 1) (19431, 256, 128, 1)


##### Convert target

In [12]:
# Number of classes derived from the label map instead of a hard-coded 31,
# so the cell keeps working when the set of label folders changes.
n_cls = max(idmap) + 1
# One-hot encode: row k of the identity matrix is the encoding of class k,
# so indexing it with the class ids encodes every sample at once.
train_y = np.eye(n_cls)[np.asarray(tr_y, dtype = int)]
test_y = np.eye(n_cls)[np.asarray(ts_y, dtype = int)]
# single formatted argument: same output on Python 2 and Python 3
print('{0} {1}'.format(np.shape(train_y), np.shape(test_y)))

(45296, 31) (19431, 31)


In [13]:
### Build the CNN: two input poolings, three conv blocks, then a dense classifier
model = Sequential()

# Halve both input axes twice before the first convolution
model.add(MaxPooling2D(pool_size = (2, 2), input_shape = (img_r, img_c, 1)))
model.add(MaxPooling2D(pool_size = (2, 2)))

# Three conv -> pool -> ReLU blocks; every block after the first ends with dropout.
# (A fourth identical block was tried in the original cell and left disabled.)
for _block in range(3):
    model.add(Conv2D(32, kernel_size = (5, 5), padding = 'same'))
    model.add(MaxPooling2D(pool_size = (2, 2)))
    model.add(Activation('relu'))
    if _block > 0:
        model.add(Dropout(0.25))

# Dense head ending in a softmax over the n_cls labels
model.add(Flatten())
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.25))
model.add(Dense(n_cls, activation = 'softmax'))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
max_pooling2d_1 (MaxPooling2 (None, 128, 64, 1)        0         
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 64, 32, 1)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 64, 32, 32)        832       
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 32, 16, 32)        0         
_________________________________________________________________
activation_1 (Activation)    (None, 32, 16, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 32, 16, 32)        25632     
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 16, 8, 32)         0         
__________

In [14]:
### Compile the model
# Plain SGD with its default hyper-parameters
optimizer = SGD()
# Multi-class cross-entropy: pairs with the softmax output and the one-hot targets
loss = 'categorical_crossentropy'
# Accuracy is reported next to the loss during training and validation
metrics = ['accuracy']
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [15]:
### Fit the network; the held-out split is used as validation data after every epoch
res = model.fit(
    train_x,
    train_y,
    batch_size = 128,
    epochs = 12,
    verbose = 1,
    validation_data = (test_x, test_y),
)

Train on 45296 samples, validate on 19431 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [46]:
# Collect the names of the clips to predict
test_dir = '../data/test/audio'
files = os.listdir(test_dir)
# call form of print: valid on Python 2 and Python 3, same output for one argument
print(len(files))

141949


In [37]:
# Spectrogram settings for the test clips. They are passed explicitly to
# fft_convert so they are actually used; the values equal fft_convert's defaults,
# i.e. the settings applied to the training data.
n = 25
m = 16
NR = 256
NC = 128
delta = 1.E-10
# Number of clips converted and predicted per pass
batch_size = 10000
y = []
N = len(files)
for i in range(0, N, batch_size):
    # slicing clamps at the end of the list, so the last batch may be shorter
    fnames = files[i: i + batch_size]
    x = []
    for f in fnames:
        rate, sample = wavfile.read(os.path.join(test_dir, f))
        x.append(sample)
    x = fft_convert(x, n = n, m = m, NR = NR, NC = NC, delta = delta)
    nx, ny, nz = np.shape(x)
    x = x.reshape(nx, ny, nz, 1)
    # argmax over the softmax output gives the predicted class index. This
    # replaces Sequential.predict_classes, which newer Keras releases no longer
    # provide, and returns the same indices.
    ty = np.argmax(model.predict(x, batch_size=128), axis=-1)
    print(np.shape(ty))
    for p in ty:
        y.append(idmap[p])
# One row per test file: file name and predicted label name
dic = {'filename': files, 'predict': y}
df = pd.DataFrame(dic)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
(10000,)
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
0
(771,)


In [42]:
display(df)
# Index a separate frame for the export instead of rebinding `df`: the previous
# `df = df.set_index('filename')` made this cell raise KeyError when run a
# second time, because 'filename' was no longer a column.
submission_df = df.set_index('filename')
submission_df.to_csv('test.csv')

Unnamed: 0,filename,predict
0,clip_1275c3f70.wav,on
1,clip_40c04ff6e.wav,seven
2,clip_8e9db251d.wav,stop
3,clip_1737ba34c.wav,two
4,clip_8a52a5dde.wav,no
5,clip_1e04aabd1.wav,left
6,clip_1d4ac458a.wav,zero
7,clip_6b2f9b97c.wav,off
8,clip_9756ac34a.wav,left
9,clip_00512e818.wav,seven
