In [None]:
import os
import numpy as np
from scipy.fftpack import fft
from scipy.io import wavfile
from scipy import signal
from glob import glob
import re
import pandas as pd
import gc
from scipy.io import wavfile

from keras import optimizers, losses, activations, models
from keras.layers import Convolution2D, Dense, Input, Flatten, Dropout, MaxPooling2D, BatchNormalization
from sklearn.model_selection import train_test_split
import keras

import torch
import torchaudio

Using TensorFlow backend.


In [None]:
# Number of samples in one model input clip.
L = 16000
# Vocabulary of target classes used by label_transform.
legal_labels = 'yes no up down left right on off stop go silence unknown'.split()

#src folders
root_path = r'..'
out_path = r'.'
model_path = r'.'

# Location of the dataset and of the recorded clip. Set the LANGUAGE_ENGINE_DIR
# environment variable to run the notebook on another machine; when it is unset
# the original hard-coded location is used, so existing setups keep working.
base_dir = os.environ.get(
    "LANGUAGE_ENGINE_DIR",
    "C:\\Users\\prisarkar\\Desktop\\LanguageEngine",
)
train_data_path = os.path.join(base_dir, "train", "audio")
output_file = os.path.join(base_dir, "output.wav")


In [None]:
def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    """Compute a natural-log power spectrogram with a Hann window.

    Args:
        audio: 1-D sequence of audio samples.
        sample_rate: Sampling frequency of ``audio``; forwarded as ``fs``.
        window_size: Window length; converted to samples as
            ``window_size * sample_rate / 1e3`` (i.e. milliseconds when
            ``sample_rate`` is in Hz).
        step_size: Converted to samples the same way and passed to
            ``signal.spectrogram`` as ``noverlap``.
        eps: Small constant added before the logarithm so that zero-power
            bins do not produce ``-inf``.

    Returns:
        ``(freqs, times, log_spec)`` where ``freqs`` and ``times`` come
        straight from ``signal.spectrogram`` and ``log_spec`` is the
        transposed spectrogram as float32, after ``np.log``.
    """
    def _to_samples(duration):
        # Same arithmetic for window and overlap: duration * rate / 1000, rounded.
        return int(round(duration * sample_rate / 1e3))

    frequencies, frame_times, power = signal.spectrogram(
        audio,
        fs=sample_rate,
        window='hann',
        nperseg=_to_samples(window_size),
        noverlap=_to_samples(step_size),
        detrend=False,
    )
    log_power = np.log(power.T.astype(np.float32) + eps)
    return frequencies, frame_times, log_power

In [None]:
def list_wavs_fname(dirpath, ext='wav'):
    """List audio files stored one folder per label under ``dirpath``.

    Looks for ``dirpath/<folder>/<file>`` entries whose file name ends with
    ``ext`` and reports, for each match, the folder name and the file name.

    Args:
        dirpath: Root directory whose immediate sub-folders hold the files.
        ext: Suffix the file names must end with (default ``'wav'``).

    Returns:
        ``(labels, fnames)``: two parallel lists; ``labels[i]`` is the name of
        the folder containing the i-th file and ``fnames[i]`` its base name.
        Both lists are empty when nothing matches.
    """
    # Sorted so the sample order does not depend on the file system's listing order.
    fpaths = sorted(glob(os.path.join(dirpath, r'*/*' + ext)))

    labels = []
    fnames = []
    for fpath in fpaths:
        # os.path understands the platform's separators; splitting on a literal
        # backslash only worked on Windows and raised IndexError elsewhere.
        parent_dir, fname = os.path.split(fpath)
        labels.append(os.path.basename(parent_dir))
        fnames.append(fname)

    return labels, fnames


In [None]:
def pad_audio(samples, length=None):
    """Left-pad ``samples`` with zeros along the last axis up to ``length``.

    The sample count is read from the last axis, so both 1-D signals and
    ``(channels, frames)`` arrays are padded in time only. (Using ``len()``
    measured the first axis, i.e. the channel count for 2-D input.)

    Args:
        samples: Array-like audio signal with time on the last axis.
        length: Minimum number of samples required. Defaults to the
            module-level ``L``.

    Returns:
        ``samples`` itself, unchanged, when it already has at least ``length``
        samples; otherwise a NumPy array with zeros prepended on the last axis.
    """
    if length is None:
        length = L

    n_present = np.shape(samples)[-1]
    if n_present >= length:
        return samples

    # No padding on leading axes; all missing samples go in front of the signal.
    pad_width = [(0, 0)] * (np.ndim(samples) - 1) + [(length - n_present, 0)]
    return np.pad(samples, pad_width=pad_width, mode='constant', constant_values=0)

def chop_audio(samples, L=16000, num=20):
    """Yield ``num`` randomly positioned windows of ``L`` consecutive samples.

    Args:
        samples: 1-D sequence of audio samples, at least ``L`` long.
        L: Window length in samples.
        num: Number of windows to generate.

    Yields:
        Slices ``samples[beg: beg + L]`` with ``beg`` drawn uniformly from
        every valid start position ``0 .. len(samples) - L`` (inclusive).

    Raises:
        ValueError: On first iteration, if ``samples`` is shorter than ``L``.
    """
    last_start = len(samples) - L
    if last_start < 0:
        raise ValueError(
            "samples has %d elements, fewer than the window length %d"
            % (len(samples), L)
        )
    for _ in range(num):
        # randint's upper bound is exclusive, hence the +1: without it the final
        # window could never be drawn and len(samples) == L raised ValueError.
        beg = np.random.randint(0, last_start + 1)
        yield samples[beg: beg + L]

def label_transform(labels):
    """One-hot encode ``labels`` after mapping them onto ``legal_labels``.

    Mapping applied to each label:
      * ``'_background_noise_'`` becomes ``'silence'``;
      * any other label missing from ``legal_labels`` becomes ``'unknown'``;
      * everything else is kept as is.

    Args:
        labels: Iterable of label strings.

    Returns:
        The result of ``pd.get_dummies`` on the mapped labels: one row per
        input label and one indicator column per distinct mapped label.
    """
    def _canonical(label):
        if label == '_background_noise_':
            return 'silence'
        return label if label in legal_labels else 'unknown'

    mapped = [_canonical(label) for label in labels]
    return pd.get_dummies(pd.Series(mapped))

In [None]:
labels, fnames = list_wavs_fname(train_data_path)

new_sample_rate = 16000

# Build the transforms once instead of on every iteration. The resampler
# depends on the source rate, so one is cached per rate encountered.
mfcc_transform = torchaudio.transforms.MFCC(sample_rate=new_sample_rate)
resamplers = {}

y_train = []
x_train = []

for label, fname in zip(labels, fnames):
    waveform, sample_rate = torchaudio.load(os.path.join(train_data_path, label, fname))

    # torchaudio.load returns a (channels, frames) tensor, so len() of it is the
    # channel count. Keep the first channel as a 1-D signal so that the length
    # checks below count samples.
    if sample_rate not in resamplers:
        resamplers[sample_rate] = torchaudio.transforms.Resample(sample_rate, new_sample_rate)
    # Resample before padding/cropping so every window holds exactly 16000
    # samples at the target rate and all feature matrices share one shape.
    resampled = resamplers[sample_rate](waveform[0])

    samples = pad_audio(resampled.numpy())
    if len(samples) > 16000:
        n_samples = chop_audio(samples)
    else:
        n_samples = [samples]

    for segment in n_samples:
        mfcc = mfcc_transform(torch.as_tensor(segment, dtype=torch.float32))
        y_train.append(label)
        # MFCC yields (n_mfcc, frames); store it as (frames, n_mfcc) so time is
        # the first axis, the same layout log_specgram produces.
        # NOTE(review): confirm the model's input shape matches this layout.
        x_train.append(mfcc.transpose(0, 1).numpy())

x_train = np.array(x_train)
y_train = np.array(y_train)

from sklearn import preprocessing
# Encode the folder names as integer class ids; label_index maps ids back to names.
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y_train)
label_index = le.classes_
gc.collect()



0

In [None]:
# Sanity check: dimensions of the stacked training feature array.
x_train.shape

(24222, 99, 161)

In [None]:
# Class names in the order assigned by the LabelEncoder (position == encoded id).
label_index

array(['_background_noise_', 'down', 'go', 'left', 'no', 'off', 'on',
       'right', 'stop', 'up', 'yes', 'स्वागत'], dtype='<U18')

In [None]:
from tensorflow.keras.layers import Dense, SimpleRNN, LSTM, Bidirectional, TimeDistributed, Conv1D, ZeroPadding1D, GRU
from tensorflow.keras.layers import Lambda, Input, Dropout, Masking, BatchNormalization, Activation
from tensorflow.keras.models import Model
import tensorflow as tf


def cnn_lstm(input_dim, output_dim, dropout=0.2, n_layers=1):
    """Build a Conv1D -> LSTM -> LSTM -> Dense softmax classifier.

    Args:
        input_dim: Shape of one example (without the batch axis), passed to
            ``Input(shape=...)``.
        output_dim: Number of units in the final softmax layer.
        dropout: Rate used by both ``Dropout`` layers and as the ``dropout``
            argument of both LSTM layers.
        n_layers: Accepted but not used by this implementation.

    Returns:
        An uncompiled ``Model`` mapping the input to the softmax output.
    """
    inputs = Input(name='the_input', shape=input_dim, dtype='float32')

    # Convolutional front end: one strided 1-D convolution, then
    # batch norm -> ReLU -> dropout.
    conv_features = Conv1D(filters=256, kernel_size=10, strides=4, name='conv_1')(inputs)
    conv_features = BatchNormalization()(conv_features)
    conv_features = Activation('relu')(conv_features)
    conv_features = Dropout(dropout, name='dropout_1')(conv_features)

    # Two stacked LSTMs; the first keeps the whole sequence, the second
    # returns only its final output.
    sequence_out = LSTM(128, activation='relu', return_sequences=True,
                        dropout=dropout, name='lstm_1')(conv_features)
    final_state = LSTM(128, activation='relu', return_sequences=False,
                       dropout=dropout, name='lstm_2')(sequence_out)

    # Fully connected ReLU layer followed by dropout.
    hidden = Dense(units=64, activation='relu', name='fc')(final_state)
    hidden = Dropout(dropout, name='dropout_2')(hidden)

    # Softmax output layer.
    class_probabilities = Dense(units=output_dim, activation='softmax', name='softmax')(hidden)

    return Model(inputs=inputs, outputs=class_probabilities)

# Take the network dimensions from the prepared data instead of hard-coding
# them, so the model always matches what it will be trained on.
input_dim = tuple(x_train.shape[1:])
# y_train holds LabelEncoder ids, so the softmax needs one unit per encoder
# class. legal_labels is a different set of names (the encoder classes include
# e.g. '_background_noise_'), and only matched in size by coincidence.
classes = len(label_index)

model = cnn_lstm(input_dim, classes)
model.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       [(None, 99, 161)]         0         
_________________________________________________________________
conv_1 (Conv1D)              (None, 23, 256)           412416    
_________________________________________________________________
batch_normalization (BatchNo (None, 23, 256)           1024      
_________________________________________________________________
activation (Activation)      (None, 23, 256)           0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 23, 256)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 23, 128)           197120    
_________

In [None]:
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import TensorBoard

# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
sgd = SGD(learning_rate=0.00001, clipnorm=1.0)  # alternative optimizer, not used below
adam = Adam(learning_rate=1e-4, clipnorm=1.0)

# y_train holds integer class ids, hence the sparse variant of cross-entropy.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=adam,
              metrics=['accuracy'])
history = model.fit(x_train, y_train,
                    batch_size=128, epochs=10,verbose=1)
# Disabled options, kept for reference (would be passed to model.fit above):
#                     validation_data=(X_val, Y_val),
#                    callbacks=[TensorBoard(log_dir='logs',
#                                           histogram_freq=1,
#                                           update_freq='epoch')])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
#model.save(os.path.join(model_path, 'rnn.model'))

In [None]:
import pyaudio
import wave

CHUNK = 1024                 # frames read from the microphone per call
FORMAT = pyaudio.paInt16     # sample format passed to PyAudio
CHANNELS = 1
RATE = 16000                 # sampling rate in Hz
RECORD_SECONDS = 2
WAVE_OUTPUT_FILENAME = output_file

p = pyaudio.PyAudio()
frames = []

# try/finally ensures the audio device and the PyAudio instance are released
# even if opening the stream or a read fails part-way through the recording.
try:
    sample_width = p.get_sample_size(FORMAT)
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)
    try:
        print("* recording")

        # Whole chunks only: int() truncates, so slightly less than
        # RECORD_SECONDS of audio is captured.
        for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
            frames.append(stream.read(CHUNK))

        print("* done recording")
    finally:
        stream.stop_stream()
        stream.close()
finally:
    p.terminate()

# The context manager closes the file even if writing raises.
with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wf:
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(sample_width)
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))


* recording
* done recording


In [None]:
waveform, sample_rate = torchaudio.load(output_file)

# torchaudio.load returns a (channels, frames) tensor, so len() of it is the
# channel count. Work on the first channel as a 1-D signal, and resample first
# so the length checks below are made at the target rate.
resampled = torchaudio.transforms.Resample(sample_rate, new_sample_rate)(waveform[0])

samples = pad_audio(resampled.numpy())
if len(samples) > 16000:
    # Only one window is classified downstream, so draw a single random crop
    # instead of computing features for twenty and keeping the last.
    samples = next(chop_audio(samples, num=1))

mfcc = torchaudio.transforms.MFCC(sample_rate=new_sample_rate)(
    torch.as_tensor(samples, dtype=torch.float32)
)
# MFCC yields (n_mfcc, frames); present it as (frames, n_mfcc) so time is the
# first axis.
# NOTE(review): this layout must match the features the model was trained on.
mfcc = mfcc.transpose(0, 1).numpy()


In [None]:
# Prepend a batch axis of size 1 so the single feature matrix forms a batch.
imgs = np.expand_dims(np.array(mfcc), axis=0)

In [None]:
# Run the network on the one-example batch; the model ends in a softmax layer,
# so each output row holds one score per class.
predicts = model.predict(imgs)

In [None]:
# np.argmax without an axis indexes the flattened array; that equals the class
# id here only because `imgs` contains a single example.
label_index[np.argmax(predicts)]

'up'