### **Install the required libraries** 🔽

In [None]:
!pip install pyunpack
!pip install patool
!pip install py7zr
!pip install sounddevice
!pip install noisereduce
!pip install librosa
!pip install python_speech_features
!pip install tensorflow==2.4
!pip install malaya_speech
! pip install webrtcvad

### **Import the required libraries** 📌

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from py7zr import unpack_7zarchive
import shutil
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np

import librosa
import IPython.display as ipd
from scipy.io import wavfile

import noisereduce as nr
import tensorflow 
from malaya_speech import Pipeline

import malaya_speech
import os

from python_speech_features import mfcc

from sklearn.preprocessing import LabelEncoder
import seaborn as sn

In [None]:
# Teach shutil how to open .7z archives. Registering a format name twice raises
# shutil.RegistryError, so only register when it is not known yet; this keeps
# the cell safe to re-run in the same kernel.
if '7zip' not in [fmt[0] for fmt in shutil.get_unpack_formats()]:
    shutil.register_unpack_format('7zip', ['.7z'], unpack_7zarchive)

# Extract the training archive into the working directory.
shutil.unpack_archive('/kaggle/input/tensorflow-speech-recognition-challenge/train.7z', '/kaggle/working/tensorflow-speech-recognition-challenge/train/')

In [None]:
# Remove unnecessary labels data
import os

# Absolute path under the directory the archive was extracted to, so the
# cleanup does not depend on the kernel's current working directory.
main_dataset_path = "/kaggle/working/tensorflow-speech-recognition-challenge/train/train/audio"

# Only these four command words are kept; every other entry is deleted.
kept_labels = {"right", "up", "left", "down"}

dataset_labels_folders = os.listdir(main_dataset_path)
for folder in dataset_labels_folders:
    if folder not in kept_labels:
        # Best effort: ignore_errors=True skips anything that cannot be removed.
        shutil.rmtree(os.path.join(main_dataset_path, folder), ignore_errors=True)

In [None]:
# Root folder of the extracted audio clips. Later cells append
# '<label>/<file>.wav' by string concatenation, so the trailing slash matters.
train_audio_path = '/kaggle/working/tensorflow-speech-recognition-challenge/train/train/audio/'

In [None]:
# Load one example recording at 16 kHz and plot its raw waveform.
samples, sample_rate = librosa.load(train_audio_path+'right/49af4432_nohash_1.wav', sr = 16000)
fig = plt.figure(figsize=(14, 8))
ax1 = fig.add_subplot(211)
ax1.set_title('Raw wave of ' + '../input/train/audio/right/49af4432_nohash_1.wav')
ax1.set_xlabel('time')
ax1.set_ylabel('Amplitude')
# The x axis needs exactly one point per sample and must span the clip
# duration in seconds, i.e. len(samples) / sample_rate.
ax1.plot(np.linspace(0, len(samples)/sample_rate, len(samples)), samples)

In [None]:
# Render an inline audio player for the loaded example clip.
ipd.Audio(samples, rate=sample_rate)

In [None]:
# Show the sampling rate and set up the short aliases the following cells use:
# `sig1` for the signal, `fs` and `sr` for the sampling rate.
print(sample_rate)
sig1 = samples
fs = sr = sample_rate

In [None]:
# Time stamp of every sample in seconds: sample index divided by the sampling rate.
time = np.arange(len(sig1)) / fs
# Stationary noise reduction over the whole clip.
reduced_noise1 = nr.reduce_noise(y=sig1, sr=fs,stationary=True)
plt.plot(time, reduced_noise1)  # plot in seconds
plt.xlabel("Time [seconds]")
plt.ylabel("Voice amplitude")
plt.show()

In [None]:
# Render an inline audio player for the noise-reduced clip.
ipd.Audio(reduced_noise1, rate=sample_rate)

In [None]:
# Silence removal on the example clip using malaya_speech's WebRTC VAD.
vad = malaya_speech.vad.webrtc()
y = reduced_noise1

# Integer copy of the signal (resample, then float_to_int); this is what the
# detector is called on below.
y_int = malaya_speech.astype.float_to_int(malaya_speech.resample(y, sr, 16000))

# Frame the float signal and the integer signal with the same frame setting
# (30 -- presumably milliseconds, confirm against the malaya_speech docs).
frames = malaya_speech.generator.frames(y, 30, sr)
frames_ = list(malaya_speech.generator.frames(y_int, 30, 16000, append_ending_trail = False))

# Pair each float frame with the detector's decision for the matching integer frame.
frames_webrtc = [(frames[pos], vad(int_frame)) for pos, int_frame in enumerate(frames_)]

# Join the frames again, leaving out the ones treated as silent.
y_ = malaya_speech.combine.without_silent(frames_webrtc)
y_

In [None]:
# Render an inline audio player for the clip after silence removal.
ipd.Audio(y_, rate = sr )

In [None]:
# Zero-pad the trimmed clip up to one second of samples. max(0, ...) keeps
# np.zeros from receiving a negative length if the clip is already longer.
zero = np.zeros(max(0, 1*sr - y_.shape[0]))
signal = np.concatenate((y_,zero))
# Time stamp of every sample in seconds: sample index divided by the sampling rate.
time = np.arange(len(signal)) / fs

In [None]:
# Plot the zero-padded example clip against time in seconds.
plt.plot(time,signal)

In [None]:
# Each sub-directory of the audio folder is one label. Keep directories only and
# sort them, because os.listdir returns entries in arbitrary order and the label
# order determines the order in which samples are collected later.
labels = sorted(
    entry for entry in os.listdir(train_audio_path)
    if os.path.isdir(os.path.join(train_audio_path, entry))
)
labels

In [None]:
# Count the .wav files in every label folder ...
no_of_recordings = [
    sum(1 for f in os.listdir(train_audio_path + '/'+ label) if f.endswith('.wav'))
    for label in labels
]

# ... and show the counts as a bar chart, one bar per command.
index = np.arange(len(labels))
plt.figure(figsize=(30,10))
plt.bar(index, no_of_recordings)
plt.title('No. of recordings for each command')
plt.xlabel('Commands', fontsize=20)
plt.ylabel('No of recordings', fontsize=20)
plt.xticks(index, labels, fontsize=20, rotation=60)
plt.show()

### **Perform Noise Reduction & Silence Removal** ⚙️

In [None]:
# Denoise every clip, drop its silent frames and store it at a fixed length.
sr = 16000
# Every stored clip has exactly this many samples (one second plus 4000 samples).
target_len = 1*sr + 4000
vad = malaya_speech.vad.webrtc()
all_wave = []
all_label = []
for label in labels:
    print(label)
    label_dir = os.path.join(train_audio_path, label)
    # Sorted so the sample order (and any seeded split made from it) is reproducible.
    waves = sorted(f for f in os.listdir(label_dir) if f.endswith('.wav'))
    for wav in waves:
        samples, sample_rate = librosa.load(os.path.join(label_dir, wav), sr = sr)
        samples = nr.reduce_noise(y=samples, sr=sr,stationary=True)
        # Integer copy of the signal for the voice-activity detector.
        y_= malaya_speech.resample(samples, sr, 16000)
        y_ = malaya_speech.astype.float_to_int(y_)
        frames = malaya_speech.generator.frames(samples, 30, sr)
        frames_ = list(malaya_speech.generator.frames(y_, 30, 16000, append_ending_trail = False))
        # Pair each float frame with the detector's decision for the matching integer frame.
        frames_webrtc = [(frames[no], vad(frame)) for no, frame in enumerate(frames_)]
        y_ = malaya_speech.combine.without_silent(frames_webrtc)
        # Truncate first, then zero-pad, so every stored clip has target_len
        # samples and np.zeros is never asked for a negative length.
        y_ = y_[:target_len]
        zero = np.zeros(target_len - y_.shape[0])
        signal = np.concatenate((y_,zero))
        all_wave.append(signal)
        all_label.append(label)

In [None]:
# Build the arrays once and reuse them for the shape report, plot and playback.
wave_arr = np.array(all_wave)
label_arr = np.array(all_label)
print(wave_arr.shape)
print(label_arr.shape)

# Example to inspect; clamped so a smaller dataset does not raise IndexError.
example_idx = min(2000, len(wave_arr) - 1)

# Time axis derived from the stored clip length and the preprocessing sampling rate.
time = np.arange(wave_arr.shape[1]) / sr
plt.plot(time, wave_arr[example_idx, :])
print(label_arr[example_idx])
ipd.Audio(wave_arr[example_idx, :], rate = sr )

In [None]:
# Compute MFCC features for every preprocessed clip.
all_mfcc=[]
for wave in all_wave:
    # 256-sample Hamming windows with a hop of half a window, 13 cepstral
    # coefficients from 26 filters over the band 0 .. fs/2.
    mfcc_feat = mfcc(wave , fs, winlen=256/fs, winstep=256/(2*fs), numcep=13, nfilt=26, nfft=256,
                 lowfreq=0, highfreq=fs/2, preemph=0.97, ceplifter=22, appendEnergy=True, winfunc=np.hamming)
    # Swap the two axes of the feature matrix before storing it
    # (presumably frames x coefficients -> coefficients x frames; confirm).
    mfcc_feat= np.transpose(mfcc_feat)
    all_mfcc.append(mfcc_feat)
    

In [None]:
# Report the stacked feature/label shapes and derive the flattened feature
# size `d` (product of the two per-clip MFCC dimensions).
mfcc_shape = np.array(all_mfcc).shape
print(mfcc_shape)
print(np.array(all_label).shape)
d1, d2 = mfcc_shape[1], mfcc_shape[2]
d = d1 * d2
print(d)

In [None]:
# Flatten each clip's MFCC matrix into one feature vector. The row count is
# read from the data rather than hard-coded, so this works for any number of clips.
op_mfcc=np.array(all_mfcc)
op_mfcc=op_mfcc.reshape(op_mfcc.shape[0], -1)
op_mfcc.shape

In [None]:
# Map the string labels to integer ids; `classes` keeps the id -> name lookup
# in the encoder's order.
le = LabelEncoder()
y = le.fit_transform(all_label)
classes = list(le.classes_)
classes

## **Model based on ANN** 

In [None]:
# NOTE(review): these installs run after tensorflow was pinned to 2.4 and
# imported earlier in this notebook; confirm that upgrading tensorflow and
# pinning standalone keras 2.3.1 at this point is still intended.
! pip install --upgrade tensorflow
! pip install --upgrade tensorflow-gpu
! pip install keras==2.3.1

In [None]:
# Take every Keras symbol from the `tensorflow.keras` namespace so that the
# model, its layers, constraints and callbacks all come from the same Keras
# implementation instead of a mix of standalone `keras` and `tensorflow.keras`.
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.constraints import MaxNorm as maxnorm  # alias keeps the `maxnorm` name used below
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, Flatten, Dense,Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# One-hot encode the targets. Encoding starts from the original string labels
# each time, so re-running this cell never one-hot encodes an already one-hot
# `y`; the class count comes from the fitted encoder.
y=tensorflow.keras.utils.to_categorical(le.transform(all_label), num_classes=len(classes), dtype='float32')
y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Stratified, shuffled 80/20 train/validation split with a fixed seed.
x_tr, x_val, y_tr, y_val = train_test_split(
    op_mfcc,
    np.array(y),
    stratify=y,
    test_size=0.2,
    random_state=777,
    shuffle=True,
)

In [None]:
# Print the shapes of the four split arrays, training pair first.
for split_array in (x_tr, y_tr, x_val, y_val):
    print(split_array.shape)

### **Model Architecture**

In [None]:
#Model Architecture
# Fully connected classifier over the flattened MFCC vector: one hidden layer,
# dropout, then a softmax output layer.
# The output width is read from the one-hot targets, so the network always
# matches the number of target columns instead of a hard-coded 4.
n_classes = y_tr.shape[1]

model = Sequential()
model.add(Dense(100, activation='sigmoid', input_shape=(d,), kernel_constraint=maxnorm(3)))
model.add(Dropout(0.5))
model.add(Dense(n_classes, activation='softmax' , kernel_constraint=maxnorm(3)))

In [None]:
# Write a diagram of the network, including layer shapes, to model.png.
tensorflow.keras.utils.plot_model(
    model,
    to_file='model.png',
    show_shapes=True,
)

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10, min_delta=0.0001) 
mc = ModelCheckpoint('best_model.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='max')

In [None]:
history=model.fit(x_tr, y_tr,validation_data=(x_val,y_val), epochs=120, batch_size=32)

In [None]:
# Evaluate the trained model on the training split ...
train_score = model.evaluate(x_tr, y_tr, batch_size=12)
print(train_score)

print('----------------Training Complete-----------------')

# ... and on the held-out validation split (stored as `test_score`).
test_score = model.evaluate(x_val, y_val, batch_size = 12)
print(test_score)

In [None]:
# List the metric names recorded in the training history (the plots below index it by these keys).
history.history.keys()

In [None]:
from matplotlib import pyplot

# Training and validation loss per epoch on one set of axes.
for history_key, curve_label in (('loss', 'train'), ('val_loss', 'test')):
    pyplot.plot(history.history[history_key], label=curve_label)
pyplot.legend()
pyplot.show()

In [None]:
# Training and validation accuracy per epoch.
for history_key in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[history_key])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Predict on the validation split and tabulate true vs. predicted class ids.
y_predict = model.predict(x_val)
true_ids = np.argmax(y_val, axis=1)
predicted_ids = np.argmax(y_predict, axis=1)
conf_mat = tensorflow.math.confusion_matrix(true_ids, predicted_ids)

In [None]:
# Label the rows (true) and columns (predicted) with the class names and draw
# the confusion matrix as an annotated heat map.
df_cm = pd.DataFrame(
    np.array(conf_mat),
    index=list(classes),
    columns=list(classes),
)
plt.figure(figsize = (13,7))
ax = sn.heatmap(df_cm, annot=True)
plt.title("Confusion Matrix", fontsize=20)
plt.xlabel("Predicted Class", fontsize=20)
plt.ylabel("True Class"     , fontsize=20)
plt.show()

In [None]:
# Sanity check: run the model on one validation row reshaped to a batch of size 1.
x_val[1].shape
model.predict(x_val[1].reshape((1,d)))

In [None]:
def predict(audio):
    """Return the class name the trained model assigns to one input.

    Args:
        audio: Model input holding a single example; the notebook calls this
            with a feature vector reshaped to ``(1, d)``.

    Returns:
        The entry of ``classes`` at the position of the highest predicted
        probability for the first row of the model output.
    """
    # Log the shape of the argument itself so the function does not depend on
    # a notebook-level variable being defined.
    print(audio.shape)
    prob=model.predict(audio)
    index=np.argmax(prob[0])
    return classes[index]

In [None]:
import random

# Pick one validation example at random and show its true label.
index = random.randrange(len(x_val))
print(index)
samples = x_val[index]
true_class = classes[np.argmax(y_val[index])]
print("Audio:", true_class)

In [None]:
# Classify the randomly chosen validation example and print the predicted label.
print("Text:",predict(samples.reshape(1,d)))

In [None]:
# NOTE(review): `load_model` is imported but not used in this cell.
from keras.models import load_model
# Save the trained model under the name "CommandsRecognitionModel"; the
# TFLite conversion below reads it back from that location.
model.save("CommandsRecognitionModel")

In [None]:
import tensorflow as tf

# Build a TFLite converter from the SavedModel directory and run the conversion.
saved_model_dir = "./CommandsRecognitionModel"
tflite_model = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir).convert()

# Write the converted model bytes next to the notebook.
with open('CommandsRecognitionModel.tflite', 'wb') as tflite_file:
    tflite_file.write(tflite_model)

<center> <h1><a href="./CommandsRecognitionModel.tflite"> Download TensorFlow Lite Model</a></h1> </center>