**In this notebook I have implementented voice to text recognition on a medical dataset.
However due to insufficient audio dataset the 1D convolutional and LSTM models could not be tested for accuracy.
This notbook only focuses on the approach taken for voice to text recognition.
I have gathered audio files converted them to .WAV format at 8000 sampling rate, MONO channel.
Maximum size of the file is 3 sec and the files shorter than that have been padded.

In [1]:
import tensorflow as tf

In [2]:
from tensorflow import keras

In [28]:
import librosa
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import utils
from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv1D, Input, MaxPooling1D, LSTM, BatchNormalization,GlobalMaxPool1D
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras import backend as K

from tensorflow.keras.models import Sequential


In [4]:
import os
path = 'F:/Speech2Text/train_data'
os.listdir(path)

['Abdominoplasty',
 'Acanthamoeba',
 'Achondroplasia',
 'Addison_Disease',
 'Adenocarcinoma',
 'Cystic_Fibrosis',
 'Diabetes',
 'Gastritis',
 'Gastroenterostomy',
 'Gynecomastia',
 'Heartburn',
 'Hypertension',
 'Jugular_Vein',
 'Leucoderma',
 'Macular_Degeneration',
 'Mean_Corpuscular_Volume',
 'Meningitis_Contagious',
 'Meningoencephalitis_Toxoplasma',
 'Migraine',
 'Mitral_Valve_Prolapse',
 'Multiple_Sclerosis',
 'Ophthalmology',
 'Peroneal_muscle',
 'Posterior_muscles',
 'Pulmonary_artery']

In [5]:
labels=os.listdir(path)

**Find the count of each label

In [6]:
recordings=[]
for label in labels:
    waves = [f for f in os.listdir(path + '/'+ label) if f.endswith('.wav')]
    recordings.append(len(waves))
print(recordings)

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]


**Find the sampling rate of labels**

In [7]:
for label in labels:
    #print(label)
    waves = [f for f in os.listdir(path + '/'+ label) if f.endswith('.wav')]
    for wav in waves:
        samples, sample_rate = librosa.load(path + '/' + label + '/' + wav, sr = None)

In [8]:
print(sample_rate)
#print(samples)
samples = np.asarray(samples)
print(samples.ndim)

8000
1


**Feature Extraction.
Here Zero padding is done to the audio files who are less than maximum size of the file.
Next mfcc mel-freq features across 20 dimensions have been extracted.

In [9]:
sr = 8000
hop_length = int(0.5*sr + 2)
n_fft = hop_length + int(0.02*sr) #Consider 5ms window around the edges as a buffer/delay
#hop_length = n_fft//2
all_wave = []
all_label = []
max_size = 24800
for label in labels:
    #print(label)
    waves = [f for f in os.listdir(path + '/'+ label) if f.endswith('.wav')]
    for wav in waves:
        samples, sr = librosa.load(path + '/' + label + '/' + wav, sr=8000)
        size = samples.shape[0]
        
        n_pad = max_size - size
        samples = np.pad(samples, (0, n_pad), mode='constant')
        mfc = librosa.feature.mfcc(y=samples, sr=sr, n_mfcc=20, n_fft=n_fft, hop_length=hop_length).T
        #mfc = librosa.feature.mfcc(y=samples, sr=sr, n_mfcc=20).T
         
        all_wave.append(mfc)
        all_label.append(label)

In [10]:
# 24800 / 8000
print(sr)
print(hop_length)
print(n_fft)

8000
4002
4162


In [11]:
#print(all_wave)

In [12]:
all_wave = np.array(all_wave)
#all_wave = np.asarray(all_wave)
print(all_wave.shape)

(75, 7, 20)


In [13]:
#all_wave = np.expand_dims(all_wave, -1)
print(all_wave.dtype)

#print(all_wave.shape)

float32


**Label Encoding

In [14]:
le = LabelEncoder()
y=le.fit_transform(all_label)
classes= list(le.classes_)
print(classes)

['Abdominoplasty', 'Acanthamoeba', 'Achondroplasia', 'Addison_Disease', 'Adenocarcinoma', 'Cystic_Fibrosis', 'Diabetes', 'Gastritis', 'Gastroenterostomy', 'Gynecomastia', 'Heartburn', 'Hypertension', 'Jugular_Vein', 'Leucoderma', 'Macular_Degeneration', 'Mean_Corpuscular_Volume', 'Meningitis_Contagious', 'Meningoencephalitis_Toxoplasma', 'Migraine', 'Mitral_Valve_Prolapse', 'Multiple_Sclerosis', 'Ophthalmology', 'Peroneal_muscle', 'Posterior_muscles', 'Pulmonary_artery']


In [15]:
#One hot encoding
y=utils.to_categorical(y, num_classes=len(labels))

In [33]:
# tf.convert_to_tensor(y)

**Split the dataset into training and test set

In [16]:
x_tr, x_val, y_tr, y_val = train_test_split(all_wave,y,test_size = 0.2,shuffle=True,random_state=777)

In [17]:
print(x_tr.shape)


(60, 7, 20)


**Normalization of data**

In [18]:
mean = x_tr.mean(axis=0, keepdims=True)
std = x_tr.std(axis=0, keepdims=True)

In [19]:
x_tr = (x_tr - mean) / (std + 1e-8)
x_val = (x_val - mean) / (std + 1e-8)

In [20]:
print(y_tr.shape)

(60, 25)


In [21]:
print(y_val.shape)

(15, 25)


In [22]:
print(x_val.shape)

(15, 7, 20)


**LSTM Model

In [23]:
model1 = Sequential()
model1.add(LSTM(25, input_shape = (x_tr.shape[1:]), activation = "relu"))
model1.add(Dropout(0.5))
model1.add(BatchNormalization())
model1.add(Dense(25, activation = "relu"))
model1.add(Dropout(0.2))
model1.add(Dense(25, activation = "softmax"))

In [24]:
opt1 = keras.optimizers.Adam(learning_rate=0.0001)
model1.compile(loss='categorical_crossentropy',optimizer=opt1,metrics=['accuracy'])

In [26]:
history1=model1.fit(x_tr, y_tr ,epochs=100,batch_size=32, validation_data=(x_val,y_val))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

**As we can see the accurracy of the data is not increasing due to insufficient data. The basic requirememnt of the neural nets
is they require large amount of data to learn**

In [43]:
#Predictions using LSTM
def predict(audio):
    prob=model1.predict(np.expand_dims(audio, 1))
    index=np.argmax(prob[0])
    print(index)
    return classes[index]

In [44]:
import random
index=random.randint(0,len(x_val)-1)
print(index)
samples=x_val[index]
#print(samples)
print("Audio:",classes[np.argmax(y_val[index])])
#ipd.Audio(samples, rate=8000)

4
Audio: Adenocarcinoma


**Predictions**

In [45]:
print("Text:",predict(samples))

13
Text: Leucoderma


In [43]:
""""
#1D Convolutional Neural Net

K.clear_session()

inputs = Input(shape=(7,20))

#First Conv1D layer
conv = Conv1D(16, 2, padding='same', activation='relu', strides=1)(inputs)
conv = MaxPooling1D(2)(conv)
# conv = Dropout(0.3)(conv)

#Second Conv1D layer
conv = Conv1D(16, 1, padding='same', activation='relu', strides=1)(conv)
conv = GlobalMaxPool1D()(conv)
# conv = Dropout(0.3)(conv)


#Flatten layer
# conv = Flatten()(conv)

#Dense Layer 1
# conv = Dropout(0.5)(conv)
conv = Dense(16, activation='relu')(conv)
conv = Dropout(0.5)(conv)

outputs = Dense(len(labels), activation='softmax')(conv)

model = Model(inputs, outputs)
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 7, 20)]           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 7, 16)             656       
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 3, 16)             0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 3, 16)             272       
_________________________________________________________________
global_max_pooling1d (Global (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dropout (Dropout)            (None, 16)               

In [44]:
#opt = keras.optimizers.Adam(learning_rate= 1e-3)
#model.compile(loss='categorical_crossentropy',optimizer=opt,metrics=['accuracy'])

In [45]:
#history=model.fit(x_tr, y_tr ,epochs=300,batch_size=8, validation_data=(x_val,y_val))

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

In [38]:
pip install sounddevice

Collecting sounddevice
  Downloading sounddevice-0.4.0-py3.cp32.cp33.cp34.cp35.cp36.cp37.cp38.cp39.pp32.pp33.pp34.pp35.pp36.pp37-none-win_amd64.whl (167 kB)
Installing collected packages: sounddevice
Successfully installed sounddevice-0.4.0
Note: you may need to restart the kernel to use updated packages.


In [39]:
pip install soundfile




**prompt for users to record voice commands**

In [42]:
import sounddevice as sd
import soundfile as sf

samplerate = 8000  
duration = 3 # seconds
filename = 'migrane.wav'
print("start")
mydata = sd.rec(int(samplerate * duration), samplerate=samplerate,
    channels=1, blocking=True)
print("end")
sd.wait()
sf.write(filename, mydata, samplerate)

start
end
