# Importanción de librerías 

In [99]:
import tensorflow as tf
import sklearn
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import librosa
from python_speech_features import mfcc

In [100]:
tf.test.is_gpu_available()

True

# Generación de ficheros

Esta función se encarga de generar un archivo txt de manera que este contenga 2 elementos por fila ('name_file','etiqueta') de esta manera permitirá poder entrenar la red.

In [101]:
import numpy as np
import os
def generate_file_data(dir,name):
    directory=dir
    #el nombre de los archivos posee el primer dígito en el nombre de esta forma permitirá etiquetarlos.
    a={'0':'cero','1':'uno','2':'dos','3':'tres','4':'cuatro','5':'cinco','6':'seis','7':'siete','8':'ocho','9':'nueve'}
    da=os.listdir(directory)
    # ordena los archivos
    da.sort()
    file = open(dir+name+'.txt',"w")
    for filename in da:
        if '.wav' in filename:
            file.write(filename+','+a[filename[0]]+'\n')
    file.close() 
    # genera el fichero
    with open(directory+'/'+name+'.txt') as f:
        read_data = f.read()
        f.closed
    read_data=read_data.split('\n')
    read_data=read_data[0:len(read_data)-1]
    return read_data

# Encoding words with One Hot Encoding

In [102]:
from sklearn.preprocessing import OneHotEncoder
vocabulary_words=np.array(['cero','uno','dos','tres','cuatro','cinco','seis','siete','ocho','nueve'])

In [103]:
onehot_encoder = OneHotEncoder(handle_unknown='ignore',categories='auto')

In [104]:
onehot_encoder

OneHotEncoder(categorical_features=None, categories='auto',
       dtype=<class 'numpy.float64'>, handle_unknown='ignore',
       n_values=None, sparse=True)

In [105]:
vocabulary_words.reshape(-1,1)

array([['cero'],
       ['uno'],
       ['dos'],
       ['tres'],
       ['cuatro'],
       ['cinco'],
       ['seis'],
       ['siete'],
       ['ocho'],
       ['nueve']], dtype='<U6')

In [106]:
onehot_encoder.fit(X=vocabulary_words.reshape(-1,1))

OneHotEncoder(categorical_features=None, categories='auto',
       dtype=<class 'numpy.float64'>, handle_unknown='ignore',
       n_values=None, sparse=True)

In [107]:
onehot_encoder.categories_

[array(['cero', 'cinco', 'cuatro', 'dos', 'nueve', 'ocho', 'seis', 'siete',
        'tres', 'uno'], dtype='<U6')]

In [108]:
v=onehot_encoder.transform(vocabulary_words.reshape(-1,1)).toarray()

In [109]:
v

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]])

In [110]:
onehot_encoder.inverse_transform(v)

array([['cero'],
       ['uno'],
       ['dos'],
       ['tres'],
       ['cuatro'],
       ['cinco'],
       ['seis'],
       ['siete'],
       ['ocho'],
       ['nueve']], dtype='<U6')

In [111]:
def class_to_integer_encoded(n):# no srive
    return integer_encoded[n]

In [112]:
def encode(x):# tomará un array de string y lo transformada a encode
    return onehot_encoder.transform(x.reshape(-1,1)).toarray()

In [113]:
a=encode(np.array(['uno','dos']))

In [114]:
a

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]])

In [115]:
def decode(x):
    return onehot_encoder.inverse_transform(x)


In [116]:
decode(a)

array([['uno'],
       ['dos']], dtype='<U6')

In [117]:
def to_text(m,mfcc):
    label=m.predict_classes(mfcc)
    text=label_encoder.inverse_transform(label)
    text=text[0]
    return text

# MFCC 


In [125]:
def mfcc_features(DIR,list_dir):
    mfcc_audios=[]
    for dir in list_dir:
        wave, sr = librosa.load(DIR+dir, mono=True)
        features= librosa.feature.mfcc(wave, sr,n_mfcc=20)
        #features = sklearn.preprocessing.scale(features, axis=1)
        features=sklearn.preprocessing.normalize(features,axis=1)
        try:
            features=np.pad(features,((0,0),(0,160-len(features[0]))),mode='constant', constant_values=0)
        except OSError as err:
            print(dir)
        mfcc_audios.append(features)
    mfcc_audios=np.array(mfcc_audios)
    return mfcc_audios

In [126]:
def prepare_data(dir,name):
    file = open(dir+name)
    f=file.read()
    file.close()
    f=f.split('\n')
    f=f[0:len(f)-1]
    labels=[]
    names_audios=[]
    for i in f:
        j=i.split(',')
        names_audios.append(j[0])
        labels.append(j[1])
    labels=np.array(labels)
    onehot= encode(labels)
    mfcc=mfcc_features(dir,names_audios)
    print(name+' OK')
    return mfcc,onehot



In [127]:
class dataset:
    def __init__(self,data):
        self.i=0#para el shuffle
        self.data_dir=data
        self.shuffle=None
        self.dir_training=data+'/training/'
        self.dir_test=data+'/test/'
        self.training_set=None
        self.test_set=None
    def split_dataset(self):
        generate_file_data(self.dir_training,name='training')
        generate_file_data(self.dir_test,name='test')
        self.training_set=prepare_data(self.dir_training,'training.txt')
        self.test_set=prepare_data(self.dir_test,'test.txt')

    

In [128]:
d=dataset('data')
d.split_dataset()

training.txt OK
test.txt OK


In [129]:
d.dir_test

'data/test/'

In [130]:
d.training_set

(array([[[-0.19981247, -0.1874131 , -0.17823835, ...,  0.        ,
           0.        ,  0.        ],
         [ 0.00211842,  0.061868  ,  0.09546465, ...,  0.        ,
           0.        ,  0.        ],
         [ 0.00393586,  0.06430282,  0.06565746, ...,  0.        ,
           0.        ,  0.        ],
         ...,
         [-0.0059565 ,  0.02239813,  0.03174018, ...,  0.        ,
           0.        ,  0.        ],
         [-0.00131375,  0.06003447, -0.03365781, ...,  0.        ,
           0.        ,  0.        ],
         [-0.005851  ,  0.11509494,  0.02923995, ...,  0.        ,
           0.        ,  0.        ]],
 
        [[-0.1474977 , -0.14438292, -0.14035693, ...,  0.        ,
           0.        ,  0.        ],
         [ 0.        ,  0.02301636,  0.04979828, ...,  0.        ,
           0.        ,  0.        ],
         [ 0.        ,  0.03118334,  0.06120676, ...,  0.        ,
           0.        ,  0.        ],
         ...,
         [ 0.        , -0.0705908

In [131]:
len(d.training_set[0])

240

In [None]:
#[d.training_set[0][i].shape for i in range(len(d.training_set[0]))] #will display numofcep

In [132]:
np.size(d.training_set[0])

768000

In [133]:
d.test_set[0]

array([[[-0.17419944, -0.17313861, -0.16974107, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.00864419,  0.03525551, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.01998642,  0.07495819, ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [ 0.        , -0.07632924, -0.10600874, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        , -0.04517924, -0.05914144, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        , -0.05994333, -0.06392851, ...,  0.        ,
          0.        ,  0.        ]],

       [[-0.13191315, -0.12657437, -0.12563002, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.12235815,  0.11213216,  0.0905721 , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.349477  ,  0.179907  ,  0.0308347 , ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [ 0.09658694,  0.10798164,  0.12718308, ...,  

In [78]:
d.test_set[1]

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Red neuronal 

In [134]:
trainX, trainY = d.training_set[0],d.training_set[1]
testX, testY = d.test_set[0],d.test_set[1]


In [135]:
import sklearn

In [136]:
trainX=np.matrix.transpose(trainX,[0,2,1])
testX=np.matrix.transpose(testX,[0,2,1])

In [137]:
trainX

array([[[-0.19981247,  0.00211842,  0.00393586, ..., -0.0059565 ,
         -0.00131375, -0.005851  ],
        [-0.1874131 ,  0.061868  ,  0.06430282, ...,  0.02239813,
          0.06003447,  0.11509494],
        [-0.17823835,  0.09546465,  0.06565746, ...,  0.03174018,
         -0.03365781,  0.02923995],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]],

       [[-0.1474977 ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.14438292,  0.02301636,  0.03118334, ..., -0.07059087,
         -0.0478587 , -0.03327289],
        [-0.14035693,  0.04979828,  0.06120676, ..., -0.07623239,
         -0.04398861,  0.00130084],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  

In [138]:
testX

array([[[-0.17419944,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.17313861,  0.00864419,  0.01998642, ..., -0.07632924,
         -0.04517924, -0.05994333],
        [-0.16974107,  0.03525551,  0.07495819, ..., -0.10600874,
         -0.05914144, -0.06392851],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]],

       [[-0.13191315,  0.12235815,  0.349477  , ...,  0.09658694,
         -0.00473038,  0.15127921],
        [-0.12657437,  0.11213216,  0.179907  , ...,  0.10798164,
         -0.00725899,  0.18143073],
        [-0.12563002,  0.0905721 ,  0.0308347 , ...,  0.12718308,
          0.09728452,  0.22574607],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  

In [139]:
import tensorflow as tf
import time 
n_units=128
time_steps=160
n_inputs=20
batch_size=10
n_epochs=10
n_class=10

In [145]:
model=tf.keras.Sequential()
model.add(tf.keras.layers.LSTM(n_units, input_shape=(time_steps,n_inputs)))#,return_sequences=True))
#model.add(tf.keras)
#model.add(tf.keras.layers.LSTM(n_units))#,return_sequences=True))
model.add(tf.layers.Dense(n_class, activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:
ti=time.time()
for i in range(100):
    result=model.fit(trainX,trainY,batch_size=batch_size,epochs=n_epochs,validation_data=[testX,testY])
    model.save('Models/keras_training100')
tiempofinal=time.time()

Train on 240 samples, validate on 110 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 240 samples, validate on 110 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 240 samples, validate on 110 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 240 samples, validate on 110 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 240 samples, validate on 110 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 240 samples, validate on 110 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Train on 240 samples, validate on 110 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 240 samples, validate on 110 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 240 samples, validate on 110 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 240 samples, validate on 110 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 240 samples, validate on 110 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 240 samples, validate on 110 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


Epoch 10/10
Train on 240 samples, validate on 110 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 240 samples, validate on 110 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 240 samples, validate on 110 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 240 samples, validate on 110 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

In [142]:
print((tiempofinal-ti)/60)

181.43704609473545


In [39]:
trainX[0]

array([[-0.91332112, -1.20854392, -0.19381587, ...,  0.09185654,
         0.4319833 ,  0.45809148],
       [-0.88884379, -1.11266398, -0.04986061, ..., -0.4501449 ,
         0.08398442, -0.00812885],
       [-0.81044921, -0.81749539,  0.34608203, ..., -0.66089419,
        -0.02356124, -0.03912434],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [113]:
y=model.predict([[trainX[0]]])

In [112]:
model.predict([[trainX[0]]])

array([[0.09945323, 0.0985247 , 0.09981292, 0.09982746, 0.10115309,
        0.09877511, 0.09959589, 0.10086489, 0.10064428, 0.10134849]],
      dtype=float32)

In [116]:
label_encoder.inverse_transform([0])

array(['cero'], dtype='<U6')

In [117]:
to_text(model,[[trainX[0]]])

'cero'

In [118]:
y

array([[8.5458267e-01, 1.3367206e-05, 1.1010848e-04, 1.2222943e-06,
        1.5461472e-05, 3.2351786e-04, 6.4432106e-06, 1.4490633e-01,
        3.8986556e-05, 1.9558286e-06]], dtype=float32)

In [182]:
trainY[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])

In [120]:
class_result=y[0]>=y[0][np.argmax(y,axis=-1)]

In [192]:
y[0][9]

0.8600664

In [121]:
class_result

array([ True, False, False, False, False, False, False, False, False,
       False])

In [122]:
decode([class_result])

array(['cero'], dtype='<U6')

In [202]:
class_result=class_result.astype('int')

TypeError: predict_proba() missing 1 required positional argument: 'x'

In [229]:
import librosa
wave, sr = librosa.load('data/test/6_seg_v1.wav', mono=True)
features= librosa.feature.mfcc(wave, sr,n_mfcc=20)

In [230]:
wave

array([-1.0500123e-08,  1.5895614e-08, -2.1860259e-08, ...,
        8.4779126e-04,  6.8931724e-04,  7.1130751e-04], dtype=float32)

In [231]:
features = sklearn.preprocessing.scale(features, axis=1)
features=np.pad(features,((0,0),(0,160-len(features[0]))),mode='constant', constant_values=0)

In [232]:
f=np.matrix.transpose(np.array([features]),[0,2,1])

In [233]:
predict=m.predict_classes(f)

In [235]:
decode(np.eye(10)[predict])

array([['dos']], dtype='<U6')

In [78]:
np.argmax(predict)

3

In [220]:
label_encoder.inverse_transform(label)

array(['three'], dtype='<U5')

In [225]:
to_text(m,f)

'three'

# Load Model 

In [243]:
import tensorflow as tf
m=tf.keras.models.load_model('Models/keras_training')

In [4]:
from keras.utils import plot_model
plot_model(m, to_file='model.png')

In [244]:
m.predict_classes([[trainX[0]]])

array([0])

# Recording audios and predict them

In [232]:
import sounddevice as sd
import ipywidgets as widgets
from IPython.display import display

In [233]:
button = widgets.Button(description="Record")
display(button)

def on_button_clicked(b):
    print("Button clicked.")

button.on_click(on_button_clicked)


Button(description='Record', style=ButtonStyle())

In [240]:
duration = 10000  # seconds
fs=16000
myrecording = sd.rec(int(duration * fs), samplerate=fs, channels=1)

In [241]:
myrecording

array([[-3.3615484e-07],
       [ 8.7624744e-07],
       [-9.1520860e-07],
       ...,
       [ 0.0000000e+00],
       [ 0.0000000e+00],
       [ 0.0000000e+00]], dtype=float32)

In [258]:
import pyaudio
import wave
 
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 48000#44100
CHUNK = 1024
RECORD_SECONDS = 2
WAVE_OUTPUT_FILENAME = "file.wav"
 
audio = pyaudio.PyAudio()
 
# start Recording
stream = audio.open(format=FORMAT, channels=CHANNELS,
                rate=RATE, input=True,
                frames_per_buffer=CHUNK)
print ("recording...")
frames = []
 
for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
    data = stream.read(CHUNK)
    frames.append(data)
print ("finished recording")
 
 
# stop Recording
stream.stop_stream()
stream.close()
audio.terminate()
 
waveFile = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
waveFile.setnchannels(CHANNELS)
waveFile.setsampwidth(audio.get_sample_size(FORMAT))
waveFile.setframerate(RATE)
waveFile.writeframes(b''.join(frames))
waveFile.close()

recording...
finished recording


In [259]:
wave

<module 'wave' from '/home/visoc/anaconda3/envs/tf/lib/python3.6/wave.py'>

In [260]:
import librosa
wave, sr = librosa.load('file.wav', mono=True)
features= librosa.feature.mfcc(wave, sr,n_mfcc=20)

In [261]:
wave

array([0.01377955, 0.02141383, 0.01817723, ..., 0.00746789, 0.00617825,
       0.        ], dtype=float32)

In [262]:
features = sklearn.preprocessing.scale(features, axis=1)
features=np.pad(features,((0,0),(0,160-len(features[0]))),mode='constant', constant_values=0)
f=np.matrix.transpose(np.array([features]),[0,2,1])

In [263]:
label=m.predict_classes(f)

<bound method Sequential.get_config of <tensorflow.python.keras.engine.sequential.Sequential object at 0x7f4076c35128>>


In [265]:
decode(np.eye(10)[label])

array([['uno']], dtype='<U6')

In [35]:
label=class_to_integer_encoded(label[0]-1)

In [36]:
label

array([[6]])

In [37]:
label_encoder.inverse_transform(label)

array(['seis'], dtype='<U6')

In [177]:
import tensorflow as tf


In [None]:
model=tf.keras.Sequential()
#model.add(tf.layers.Dropout(0.5))
model.add(tf.keras.layers.RNN(_units, input_shape=(time_steps,n_inputs)))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.layers.Dense(n_class, activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])