In [124]:
from pathlib import Path
from scipy.io import wavfile
from sklearn import preprocessing
import tensorflow as tf
import extract_function as ef
import librosa
import librosa.display
import numpy as np
import IPython.display as ipd
import matplotlib
import matplotlib.pyplot as plt
from scipy.io import wavfile
from tensorflow import keras
import os

In [125]:
# Root folder of the dataset. `url` keeps its trailing slash because later
# cells build file paths from it by plain string concatenation.
url="/home/ak47/AI_proj/Data/"
entries = Path(url)
# Each sub-directory name is used as a speaker label. Skip stray files (the
# next cell calls iterdir() on every label and would fail on a non-directory)
# and sort, because iterdir() order is filesystem-dependent and the seeded
# train/val/test split is only reproducible for a fixed input order.
labels = sorted(entry.name for entry in entries.iterdir() if entry.is_dir())

In [126]:
# Collect every clip path as "<url><label>/<filename>". The speaker cell
# recovers the label by splitting on "/" and taking the second-to-last
# component, so the path layout must stay exactly this.
audio = []
for label in labels:
    clip_dir = Path(url + label)
    # Only regular files are passed on to the audio loader; sorting makes the
    # sample order (and therefore the seeded split) independent of the
    # filesystem's directory order.
    clip_names = sorted(f.name for f in clip_dir.iterdir() if f.is_file())
    audio.extend(url + label + "/" + name for name in clip_names)

In [127]:
# The speaker name is the parent directory of each clip, i.e. the
# second-to-last "/"-separated component of its path.
speakers = [tf.strings.split(clip_path, '/')[-2] for clip_path in audio]

# Map speaker names to integer class ids; the fitted encoder is kept so
# predictions can be translated back to names later.
speaker_encoder = preprocessing.LabelEncoder()
speaker_idx = speaker_encoder.fit_transform(
    [name_tensor.numpy().decode() for name_tensor in speakers]
)
encoded_speaker_ds = tf.data.Dataset.from_tensor_slices(speaker_idx)

# Number of distinct speakers, used to size the classifier's output layer.
unique_speakers = speaker_encoder.classes_.shape[0]

In [129]:
# Every clip is cropped or zero-padded to this many MFCC frames so that all
# examples share one shape.
max_frames = 196
mfcc_f = []
for clip_path in audio:
    # sr=None keeps the file's native sampling rate instead of resampling.
    # NOTE(review): this assumes all recordings share one sampling rate;
    # otherwise the same frame count covers different durations - confirm.
    wave, sample_rate = librosa.load(clip_path, mono=True, sr=None)
    # Pass audio and rate by keyword: librosa >= 0.10 made these arguments
    # keyword-only, so the positional form raises TypeError there.
    mfcc = librosa.feature.mfcc(y=wave, sr=sample_rate)
    # Crop long clips, then right-pad short ones with zeros along the frame axis.
    mfcc = mfcc[:, :max_frames]
    pad_width = max_frames - mfcc.shape[1]
    mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')
    # Append a trailing channel axis so each example is image-like for Conv2D.
    mfcc_f.append(tf.expand_dims(tf.convert_to_tensor(mfcc), 2))
audio_ds = tf.data.Dataset.from_tensor_slices(mfcc_f)

In [130]:
# Pair each MFCC feature tensor with its integer speaker id so that every
# dataset element is a (features, label) tuple.
complete_labeled_ds = tf.data.Dataset.zip((audio_ds, encoded_speaker_ds))

In [131]:
# Shape of one feature example, read from the dataset's static element spec
# (index 0 is the features half of each (features, label) pair). This avoids
# running the pipeline and leaking loop variables just to inspect a shape.
input_shape = complete_labeled_ds.element_spec[0].shape


In [132]:
labeled_ds = complete_labeled_ds
data_size = len(labeled_ds)

# 90 % of the samples train the model, 5 % validate it, and whatever is left
# after integer truncation becomes the test set.
train_size, val_size = int(data_size * 0.9), int(data_size * 0.05)
test_size = data_size - (train_size + val_size)

split_report = (
    ('all samples: {}', data_size),
    ('training samples: {}', train_size),
    ('validation samples: {}', val_size),
    ('test samples: {}', test_size),
)
for template, count in split_report:
    print(template.format(count))

all samples: 1147
training samples: 1032
validation samples: 57
test samples: 58


In [138]:
# create batched datasets
batch_size = 516
# Shuffle once with a fixed order before splitting. reshuffle_each_iteration
# must be False: with the default (True) the dataset is reshuffled every time
# it is iterated, so take()/skip() would select different elements on every
# epoch and for every split, letting validation/test samples leak into
# training. Starting from `complete_labeled_ds` keeps this cell idempotent
# when it is re-run.
labeled_ds = complete_labeled_ds.shuffle(
    data_size, seed=42, reshuffle_each_iteration=False
)
# Only the training split is reshuffled per epoch; the buffer covers the whole
# split so that the shuffle is uniform.
train_ds = labeled_ds.take(train_size).shuffle(train_size).batch(batch_size).prefetch(1)
val_ds = labeled_ds.skip(train_size).take(val_size).batch(batch_size).prefetch(1)
test_ds = labeled_ds.skip(train_size + val_size).take(test_size).batch(batch_size).prefetch(1)

In [139]:
def create_model(feature_shape=None, num_speakers=None):
    """Build and compile the speaker-classification CNN.

    The network stacks three Conv2D -> MaxPooling2D -> BatchNormalization
    stages (16, 32 and 64 filters) and feeds the flattened result through a
    dropout-regularised dense head that ends in a softmax over speakers.

    Args:
        feature_shape: Shape of a single input example. When omitted, the
            notebook-level ``input_shape`` is used.
        num_speakers: Number of output classes. When omitted, the
            notebook-level ``unique_speakers`` is used.

    Returns:
        A compiled ``keras.Model`` using sparse categorical cross-entropy
        (labels are integer class ids), the Adam optimizer and an accuracy
        metric reported under the name ``'acc'``.
    """
    # Fall back to the notebook globals so existing `create_model()` calls
    # keep working unchanged.
    if feature_shape is None:
        feature_shape = input_shape
    if num_speakers is None:
        num_speakers = unique_speakers

    dropout_rate = 0.25
    l2_strength = 0.001

    audio_input = keras.layers.Input(shape=feature_shape)
    features = audio_input
    # No `input_shape=` on the conv layers: in a functional model the shape
    # comes from the tensor each layer is called on, and the argument was
    # wrong for every layer after the first.
    for filters in (16, 32, 64):
        features = keras.layers.Conv2D(filters, kernel_size=(3, 3), padding='same',
                                       activation='relu')(features)
        features = keras.layers.MaxPooling2D(pool_size=(2, 2), strides=2)(features)
        features = keras.layers.BatchNormalization()(features)

    features = keras.layers.Flatten()(features)
    features = keras.layers.Dropout(dropout_rate)(features)
    features = keras.layers.Dense(
        num_speakers * 2, activation='relu',
        kernel_regularizer=keras.regularizers.l2(l2_strength))(features)
    features = keras.layers.Dropout(dropout_rate)(features)
    output = keras.layers.Dense(num_speakers, activation='softmax', name='speaker')(features)

    model = keras.Model(inputs=audio_input, outputs=output)
    model.compile(loss=keras.losses.sparse_categorical_crossentropy,
                  optimizer=keras.optimizers.Adam(),
                  metrics=['acc'])
    return model

In [140]:
# Flag consulted by the save cell: the model is written to disk only when True.
train_model = True
# Fresh, compiled (untrained) network.
model = create_model()

In [141]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 20, 196, 1)]      0         
                                                                 
 conv2d_15 (Conv2D)          (None, 20, 196, 16)       160       
                                                                 
 max_pooling2d_15 (MaxPoolin  (None, 10, 98, 16)       0         
 g2D)                                                            
                                                                 
 batch_normalization_15 (Bat  (None, 10, 98, 16)       64        
 chNormalization)                                                
                                                                 
 conv2d_16 (Conv2D)          (None, 10, 98, 32)        4640      
                                                                 
 max_pooling2d_16 (MaxPoolin  (None, 5, 49, 32)        0   

In [142]:
# Directory that receives the TensorBoard event files for this run.
run_logdir ="/home/ak47/AI_proj/logs"
# update_freq='batch' logs metrics after every batch rather than once per epoch.
tensorboard_cb = keras.callbacks.TensorBoard(log_dir=run_logdir, update_freq='batch')
# Train for 50 epochs, scoring the validation split after each one.
history = model.fit(
    x=train_ds,
    validation_data=val_ds,
    epochs=50,
    callbacks=[tensorboard_cb],
)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [143]:
# Score the model on the test batches; the result lists the loss followed by
# the metric configured at compile time ('acc').
model.evaluate(test_ds)



[1.4725784063339233, 0.4655172526836395]

In [144]:
# Target file for the serialized model (HDF5, per the .h5 extension).
model_name = 'spr_model.h5'
# Persist the model only when the training flag is set.
if train_model:
    model.save(filepath=model_name)

In [145]:

# Clips to classify with the trained model.
sample_file=["/home/ak47/Downloads/test1.wav","/home/ak47/Downloads/check.wav"]

sample_ds = tf.data.Dataset.from_tensor_slices(sample_file)
print(sample_ds)

# Must equal the frame count used when the training features were built,
# otherwise the inputs will not match the model's expected shape.
max_frames = 196
mfcc_f=[]
for i in sample_ds:
    print(i)
    file_name = bytes.decode(i.numpy())

    # sr=None keeps the file's native sampling rate instead of resampling.
    wave, sample_rate = librosa.load(file_name, mono=True, sr=None)

    # Pass audio and rate by keyword: librosa >= 0.10 made these arguments
    # keyword-only, so the positional form raises TypeError there.
    mfcc = librosa.feature.mfcc(y=wave, sr=sample_rate)
    # Crop long clips, then right-pad short ones with zeros along the frame axis.
    mfcc = mfcc[:, :max_frames]
    pad_width = max_frames - mfcc.shape[1]
    mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')
    # Append a trailing channel axis to match the model's input layout.
    mfcc_f.append(tf.expand_dims(tf.convert_to_tensor(mfcc), 2))
sample_input=tf.data.Dataset.from_tensor_slices(mfcc_f)
# One batch holding every sample, however many files are listed above.
sample_input=sample_input.batch(len(sample_file))


output = model.predict(sample_input)

# Most probable class per clip, translated back to the speaker's name.
speaker_ids = output.argmax(axis=1)
speakers = speaker_encoder.inverse_transform(speaker_ids)
print(speakers)
print(output)


<TensorSliceDataset shapes: (), types: tf.string>
tf.Tensor(b'/home/ak47/Downloads/test1.wav', shape=(), dtype=string)
tf.Tensor(b'/home/ak47/Downloads/check.wav', shape=(), dtype=string)
['Denny' 'Denny']
[[5.9425294e-02 4.9024072e-01 2.3539840e-01 1.0240349e-01 8.5406855e-02
  2.3742127e-03 1.3237508e-03 1.3275164e-02 1.0152159e-02]
 [1.7299768e-02 4.6901873e-01 3.7549508e-01 1.0517548e-02 1.1793719e-01
  1.5947690e-04 2.3788540e-04 3.6584647e-04 8.9685135e-03]]
