In [1]:
import numpy as np
import pandas as pd
import os
import librosa,librosa.display
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

In [2]:
path='../input/free-spoken-digits/free-spoken-digit-dataset-master/recordings/0_george_0.wav'
raw_data,framerate=librosa.load(path)
raw_data,framerate

(array([-0.04374877, -0.04589297, -0.03884229, ..., -0.00196383,
         0.00266265,  0.        ], dtype=float32),
 22050)

### We will be reading sound file with the help of librosa library which will convert sound file into time series y, represented as a one-dimensional NumPy floating point array.

In [3]:
data=pd.DataFrame(columns=['raw_data','duration','digit'])
dir_path='../input/free-spoken-digits/free-spoken-digit-dataset-master/recordings/'
for i in tqdm(os.listdir(dir_path)):
        raw_data,frame_rate=librosa.load(dir_path+i)
        duration=librosa.get_duration(raw_data,frame_rate)
        data.loc[len(data.index)]=[raw_data,duration,i.split('_')[0]] # We are appending label as it in file name

  0%|          | 0/3000 [00:00<?, ?it/s]

In [4]:
data.head()

Unnamed: 0,raw_data,duration,digit
0,"[-0.00035317737, -0.0003979482, -0.00042274845...",0.44254,0
1,"[0.000108883556, 0.00013303479, 0.00024854412,...",0.386757,2
2,"[0.0017647212, 0.0026649828, 0.0028954616, 0.0...",0.452789,4
3,"[5.169994e-05, 6.9254784e-05, 6.970848e-05, 6....",0.366893,1
4,"[0.0019327061, 0.002918903, 0.0038223783, 0.00...",0.515646,9


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data[['raw_data','duration']],data['digit'], test_size=0.3, random_state=45,stratify=data['digit'])

In [6]:
for i in range(0,101,10):
    print(i,' th percentile is ',np.percentile([len(i) for i in X_train['raw_data']],i))

0  th percentile is  3165.0
10  th percentile is  6074.8
20  th percentile is  7179.2
30  th percentile is  7879.199999999999
40  th percentile is  8615.8
50  th percentile is  9231.0
60  th percentile is  9929.8
70  th percentile is  10744.4
80  th percentile is  11726.0
90  th percentile is  13377.000000000004
100  th percentile is  48420.0


In [7]:
for i in range(90,101,1):
    print(i,' th percentile is ',np.percentile([len(i) for i in X_train['raw_data']],i))

90  th percentile is  13377.000000000004
91  th percentile is  13603.0
92  th percentile is  13804.880000000001
93  th percentile is  14066.910000000003
94  th percentile is  14298.8
95  th percentile is  14658.55
96  th percentile is  15069.32
97  th percentile is  15816.99
98  th percentile is  17488.04
99  th percentile is  20366.199999999997
100  th percentile is  48420.0


In [8]:
max_length=20366

### We are padding the sequence as we going to use LSTM

In [9]:
import tensorflow as tf
X_train_pad=tf.keras.preprocessing.sequence.pad_sequences(X_train['raw_data'],maxlen=max_length, dtype='float32')
X_test_pad=tf.keras.preprocessing.sequence.pad_sequences(X_test['raw_data'],maxlen=max_length, dtype='float32')
X_train_mask=np.where(X_train_pad>0.0,True,False)
X_test_mask=np.where(X_test_pad>0.0,True,False)

In [10]:
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model
from sklearn.metrics import f1_score

### Fourier Tranformation is computed on overlapping windowed segments of the signal, and we get what is called the spectrogram
### Hence we are converting our raw_data ie time series to spectogram
### Mel spectrogram is a spectrogram where the frequencies are converted to the mel scale

In [11]:
def convert_to_spectrogram(raw_data):
    '''converting to spectrogram'''
    spect = librosa.feature.melspectrogram(y=raw_data, n_mels=64) # n_mels as output shape
    mel_spect = librosa.power_to_db(S=spect, ref=np.max)
    return mel_spect

In [12]:
X_train_spectrogram=np.array([convert_to_spectrogram(np.array([float(i) for i in X_train_pad[k] ])) for k in range(len(X_train_pad)) ])
X_test_spectrogram=np.array([convert_to_spectrogram(np.array([float(i) for i in X_test_pad[k] ])) for k in range(len(X_test_pad)) ])

In [13]:
X_train_spectrogram.shape

(2100, 64, 40)

In [14]:
input_layer=Input(shape=(64,40), dtype=np.float32,name='input_layer')
lstm=LSTM(500,name='lstm_layer',return_sequences=True)(input_layer)
d1=Dense(120,activation='relu',name='dense1')(tf.math.reduce_mean(lstm, 2))
d2=Dense(60,activation='relu',name='dense2')(d1)
d3=Dense(10,activation='softmax',name='dense3')(d2)

In [15]:
model = Model(inputs=input_layer, outputs=d3)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (InputLayer)     [(None, 64, 40)]          0         
_________________________________________________________________
lstm_layer (LSTM)            (None, 64, 500)           1082000   
_________________________________________________________________
tf.math.reduce_mean (TFOpLam (None, 64)                0         
_________________________________________________________________
dense1 (Dense)               (None, 120)               7800      
_________________________________________________________________
dense2 (Dense)               (None, 60)                7260      
_________________________________________________________________
dense3 (Dense)               (None, 10)                610       
Total params: 1,097,670
Trainable params: 1,097,670
Non-trainable params: 0
___________________________________________________

In [16]:
def cal_f1(y_true,y_pred):
    return f1_score(y_true,y_pred,average='micro')
def micro_f1(y_true,y_prob):
    y_pred=tf.math.argmax(y_prob,axis=1)
    return tf.py_function(cal_f1,(y_true,y_pred),tf.double)

In [17]:
class LossHistory(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        if logs.get('val_micro_f1', -1)>0.97:
            self.model.stop_training=True

loss_history=LossHistory()

filepath="model_save/weights-{epoch:02d}-{micro_f1:.4f}-{val_micro_f1:.4f}.hdf5"
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=filepath, monitor='val_micro_f1',  verbose=1, save_best_only=True, mode='max')

In [18]:
opt= tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=opt, loss='sparse_categorical_crossentropy' ,metrics=['accuracy',micro_f1])

In [19]:
tf.keras.backend.clear_session()
model.fit(X_train_spectrogram,y_train.astype('int')\
           ,validation_data=(X_test_spectrogram,y_test.astype('int'))\
           ,batch_size=32,epochs=400\
           ,callbacks=[loss_history,checkpoint])

Epoch 1/400

Epoch 00001: val_micro_f1 improved from -inf to 0.26401, saving model to model_save/weights-01-0.1330-0.2640.hdf5
Epoch 2/400

Epoch 00002: val_micro_f1 improved from 0.26401 to 0.35776, saving model to model_save/weights-02-0.3346-0.3578.hdf5
Epoch 3/400

Epoch 00003: val_micro_f1 improved from 0.35776 to 0.46552, saving model to model_save/weights-03-0.4387-0.4655.hdf5
Epoch 4/400

Epoch 00004: val_micro_f1 improved from 0.46552 to 0.50862, saving model to model_save/weights-04-0.4867-0.5086.hdf5
Epoch 5/400

Epoch 00005: val_micro_f1 improved from 0.50862 to 0.54741, saving model to model_save/weights-05-0.5225-0.5474.hdf5
Epoch 6/400

Epoch 00006: val_micro_f1 improved from 0.54741 to 0.56573, saving model to model_save/weights-06-0.5540-0.5657.hdf5
Epoch 7/400

Epoch 00007: val_micro_f1 did not improve from 0.56573
Epoch 8/400

Epoch 00008: val_micro_f1 improved from 0.56573 to 0.60776, saving model to model_save/weights-08-0.5840-0.6078.hdf5
Epoch 9/400

Epoch 00009:

<tensorflow.python.keras.callbacks.History at 0x7f055c202150>

In [20]:
opt_res=os.listdir("model_save/")

In [21]:
result=pd.DataFrame()
epoch=[]
f1=[]
val_f1=[]
for i in opt_res:    
    epoch.append(i.split('-')[1])
    f1.append(i.split('-')[2])
    val_f1.append(i.split('-')[3][:6])
result['epoch']=epoch
result['f1']=f1
result['val_f1']=val_f1
values=result[result.epoch==str(result.epoch.astype('int').max())]

In [22]:
print("We have found optimum result at\nEpoch: ",values.iloc[0].epoch,"\nTrain F1 score: ",values.iloc[0].f1,"\nTest F1 score: ",values.iloc[0].val_f1)

We have found optimum result at
Epoch:  386 
Train F1 score:  0.9683 
Test F1 score:  0.9386
