# Keras tutorial

### Importando bibliotecas necessárias

In [1]:
import os
import librosa
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.utils import np_utils
from keras.layers import Dense, Dropout, Flatten, Conv1D, Input, MaxPooling1D
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
from keras import backend as K
K.clear_session()

Using TensorFlow backend.


### Alguns exemplos com visualização e manipulação de áudios
Opcional

Visualização de dados (exemplo)

# Set train_audio_path to the directory that holds the training dataset.
train_audio_path = './train/audio/'

# Audio clip used for this example.
example = 'yes/0a7c2a8d_nohash_0.wav'

samples, sample_rate = librosa.load(train_audio_path + example, sr = 16000)

# Plot the raw waveform against time (in seconds).
fig = plt.figure(figsize=(14, 8))
ax1 = fig.add_subplot(211)
ax1.set_title('Raw wave of' + train_audio_path + example) # title: raw wave of the example's path
ax1.set_xlabel('Time')
ax1.set_ylabel('Amplitude')
# One x value per sample, spanning the clip duration len(samples) / sample_rate.
# The earlier axis ended at sample_rate / len(samples) and used sample_rate
# points, which only lines up with `samples` when the clip is exactly 1 second.
ax1.plot(np.linspace(0, len(samples) / sample_rate, len(samples)), samples)


## Começo

### Pré processamento de áudio
Links:
https://towardsdatascience.com/urban-sound-classification-part-2-sample-rate-conversion-librosa-ba7bc88f209a
https://librosa.github.io/blog/2019/07/17/resample-on-load/

In [2]:
# Set train_audio_path to the directory that holds the training dataset.
train_audio_path = 'train/train/audio'

# Choose the labels according to the dataset (one sub-directory per label).
labels = ["yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go"]

# Clips are loaded at ORIG_SR, downsampled to TARGET_SR, and kept only when the
# result is exactly CLIP_LEN samples long (one second at TARGET_SR).
ORIG_SR = 16000
TARGET_SR = 8000
CLIP_LEN = TARGET_SR

all_wave = []   # resampled waveforms
all_label = []  # label of each waveform, same order as all_wave

for label in labels:
    label_dir = os.path.join(train_audio_path, label)
    # os.listdir order is filesystem-dependent; sorting keeps the sample order
    # (and therefore the seeded train/validation split) reproducible.
    waves = sorted(f for f in os.listdir(label_dir) if f.endswith('.wav'))
    for wav in tqdm(waves, desc=label):
        samples, sample_rate = librosa.load(os.path.join(label_dir, wav), sr=ORIG_SR)
        # The sample rates are passed by keyword: librosa >= 0.10 rejects them
        # as positional arguments, and older releases accept the keywords too.
        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=TARGET_SR)
        # Drop clips that are not exactly one second long.
        if len(samples) == CLIP_LEN:
            all_wave.append(samples)
            all_label.append(label)

# Encode the string labels as integers; classes[i] is the label name for code i.
le = LabelEncoder()
y = le.fit_transform(all_label)
classes = list(le.classes_)

# Multi-class classification: turn the integer codes into one-hot vectors.
y = np_utils.to_categorical(y, num_classes=len(labels))

# Conv1D expects 3-D input, so reshape (n_clips, 8000) -> (n_clips, 8000, 1).
all_wave = np.array(all_wave).reshape(-1, CLIP_LEN, 1)

  0%|                                                                                         | 0/2377 [00:00<?, ?it/s]

yes


100%|██████████████████████████████████████████████████████████████████████████████| 2377/2377 [00:49<00:00, 48.25it/s]
  0%|                                                                                         | 0/2375 [00:00<?, ?it/s]

no


100%|██████████████████████████████████████████████████████████████████████████████| 2375/2375 [00:47<00:00, 50.38it/s]
  0%|                                                                                         | 0/2375 [00:00<?, ?it/s]

up


100%|██████████████████████████████████████████████████████████████████████████████| 2375/2375 [00:48<00:00, 49.29it/s]
  0%|                                                                                         | 0/2359 [00:00<?, ?it/s]

down


100%|██████████████████████████████████████████████████████████████████████████████| 2359/2359 [00:49<00:00, 47.43it/s]
  0%|                                                                                         | 0/2353 [00:00<?, ?it/s]

left


100%|██████████████████████████████████████████████████████████████████████████████| 2353/2353 [00:47<00:00, 49.51it/s]
  0%|                                                                                         | 0/2367 [00:00<?, ?it/s]

right


100%|██████████████████████████████████████████████████████████████████████████████| 2367/2367 [00:47<00:00, 49.78it/s]
  0%|                                                                                         | 0/2367 [00:00<?, ?it/s]

on


100%|██████████████████████████████████████████████████████████████████████████████| 2367/2367 [00:46<00:00, 50.38it/s]
  0%|                                                                                         | 0/2357 [00:00<?, ?it/s]

off


100%|██████████████████████████████████████████████████████████████████████████████| 2357/2357 [00:47<00:00, 50.10it/s]
  0%|                                                                                         | 0/2380 [00:00<?, ?it/s]

stop


100%|██████████████████████████████████████████████████████████████████████████████| 2380/2380 [00:47<00:00, 50.13it/s]
  0%|                                                                                         | 0/2372 [00:00<?, ?it/s]

go


100%|██████████████████████████████████████████████████████████████████████████████| 2372/2372 [00:47<00:00, 50.34it/s]


### Separando os sets
Treinamento e validação

In [3]:
# Split the data with sklearn's train_test_split.
#   tr  = training set   (80% of the clips)
#   val = validation set (the remaining 20%)
# The split is stratified on the labels and seeded so it can be repeated.
features = np.array(all_wave)
targets = np.array(y)
x_tr, x_val, y_tr, y_val = train_test_split(
    features,
    targets,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=777,
)

### Modelando

#### Arquitetura
Para construir o modelo será usada a camada Conv1D do Keras. A Conv1D é uma camada convolucional unidimensional: ela aplica filtros ao longo do eixo do tempo do sinal de áudio.

#### Construindo o modelo
Link sobre earlystopping e modelcheckpoint: http://keras.io/callbacks/

In [4]:
# Input: one clip of 8000 samples with a single channel.
inputs = Input(shape=(8000, 1))

# Four Conv1D -> MaxPooling1D -> Dropout stages, listed as (filters, kernel_size).
conv = inputs
for n_filters, kernel_size in [(8, 13), (16, 11), (32, 9), (64, 7)]:
    conv = Conv1D(n_filters, kernel_size, padding='valid', activation='relu', strides=1)(conv)
    conv = MaxPooling1D(3)(conv)
    conv = Dropout(0.3)(conv)

# Flatten the feature maps before the fully connected layers.
conv = Flatten()(conv)

# Two Dense -> Dropout stages.
for n_units in (256, 128):
    conv = Dense(n_units, activation='relu')(conv)
    conv = Dropout(0.3)(conv)

# One softmax output per label.
outputs = Dense(len(labels), activation='softmax')(conv)

model = Model(inputs, outputs)
model.summary()

# Categorical cross-entropy is the loss for multi-class, one-hot targets.
# The metric is requested as 'acc' so the validation metric is logged as
# 'val_acc' on every Keras version: Keras >= 2.3 reports a metric under the
# exact string given here ('accuracy' would become 'val_accuracy'), while older
# releases always used 'acc'. With 'accuracy' the checkpoint below could never
# find its monitored value on Keras >= 2.3 and would not save anything.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

# EarlyStopping halts training once val_loss has not improved by at least
# min_delta for `patience` epochs; ModelCheckpoint writes the model to
# best_model.hdf5 whenever val_acc reaches a new maximum.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10, min_delta=0.0001)
mc = ModelCheckpoint('best_model.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='max')


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 8000, 1)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 7988, 8)           112       
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 2662, 8)           0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 2662, 8)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 2652, 16)          1424      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 884, 16)           0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 884, 16)           0   

## Treinando

Uma epoch é uma passagem completa de todo o dataset pela rede neural (uma passagem para frente, o forward pass, e uma para trás, o backward pass).
Um batch é um dos subconjuntos em que o dataset é dividido para o treino.
Batch size é o número de exemplos dentro de um batch.
Link: https://towardsdatascience.com/epoch-vs-iterations-vs-batch-size-4dfb9c7ce9c9

In [None]:
# Train with model.fit for at most 100 epochs; the callbacks (es, mc) are the
# ones created in the model-building cell.
best_model_path = 'best_model.hdf5'

# Remove a checkpoint left by a previous run so a stale file is never mistaken
# for the result of this training session.
if os.path.exists(best_model_path):
    os.remove(best_model_path)

history = model.fit(x_tr, y_tr, epochs=100, callbacks=[es, mc], batch_size=32, validation_data=(x_val, y_val))

# Fallback save (not in the original tutorial; thanks Vinicius). It only runs
# when the checkpoint callback wrote nothing: saving unconditionally would
# overwrite the best-epoch checkpoint with the last-epoch model.
if not os.path.exists(best_model_path):
    model.save(best_model_path)

Train on 17049 samples, validate on 4263 samples
Epoch 1/100






Epoch 2/100






Epoch 3/100






Epoch 4/100






Epoch 5/100






Epoch 6/100






Epoch 7/100






Epoch 8/100






Epoch 9/100






Epoch 10/100






Epoch 11/100






Epoch 12/100






Epoch 13/100






Epoch 14/100






Epoch 15/100






Epoch 16/100






Epoch 17/100






Epoch 18/100






Epoch 19/100






Epoch 20/100






Epoch 21/100






Epoch 22/100






Epoch 23/100






Epoch 24/100






Epoch 25/100






Epoch 26/100






Epoch 27/100






Epoch 28/100






Epoch 29/100






Epoch 30/100




## Usando o modelo

In [None]:
# Load the model stored in best_model.hdf5.
saved_model_path = 'best_model.hdf5'
model = load_model(saved_model_path)

In [None]:
# Prediction helper
def predict(audio):
    """Return the predicted label for a single audio clip.

    Uses the module-level ``model`` and ``classes`` objects.

    Args:
        audio: array-like (NumPy array, list, tuple, ...) holding exactly
            8000 samples in total.

    Returns:
        The entry of ``classes`` at the index of the highest predicted
        probability.

    Raises:
        ValueError: if ``audio`` cannot be reshaped to ``(1, 8000, 1)``.
    """
    # np.asarray lets plain sequences through; ndarrays are passed unchanged.
    # The model expects a batch axis and a channel axis: (1, 8000, 1).
    batch = np.asarray(audio).reshape(1, 8000, 1)
    prob = model.predict(batch)
    index = np.argmax(prob[0])
    return classes[index]

In [None]:
#