In [1]:
from scipy.io import wavfile
import os
import pickle
import numpy as np
import tensorflow as tf
import random
from tensorflow import keras
from pre_process import form_input_data

In [2]:
INT16_MAX = 32767
SAMPLE_RATE, _ = wavfile.read("../VCTK-Corpus/wav48/p225/p225_001.wav")
FRAME_SIZE = int(SAMPLE_RATE * 0.025)
NFFT = 512 
NFILT = 40
N_SPEAKER = 32
pre_emphasis = 0.97

spk_list = os.listdir("../VCTK-Corpus/wav48/")
utterance = {}
emphasized_data = []
validation_dataset = []
validation_data = []
validation_label = []
train_data = []
train_label = []
enrollment_dataset = []
verification_dataset = []

In [4]:
# Text-independent Data processing
for pid, speaker in enumerate(spk_list[0:N_SPEAKER]):
    utterance[speaker] = {}
    path = "../VCTK-Corpus/wav48/" + speaker
    utterance[speaker]['files'] = os.listdir(path)
    for count in range(10):
        file_path = "../VCTK-Corpus/wav48/" + speaker + "/" + utterance[speaker]['files'].pop(0)
        _, data = wavfile.read(file_path)         # requires tons of memory with many spekaers
        emphasized_signal = np.append(data[0], data[1:] - pre_emphasis * data[:-1])
        if count < 5:
            emphasized_data.append((emphasized_signal,pid))
        elif count < 10:
            validation_dataset.append((emphasized_signal,pid))

for entry in emphasized_data:
    form_input_data(entry, train_data, train_label)
for entry in validation_dataset:
    form_input_data(entry, validation_data, validation_label)

In [5]:
with open('trainning_data.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
    pickle.dump([train_data, train_label, validation_data, validation_label], f)
with open('utterance_list.pkl', 'wb') as f:
    pickle.dump([utterance, spk_list], f)

In [3]:
with open('trainning_data.pkl', 'rb') as f:  # Python 3: open(..., 'rb')
    train_data, train_label, validation_data, validation_label = pickle.load(f)

In [3]:
def form_input_data(entry, data_list, label_list):
    # 40 filter_banks + 30 frames left + 10 frames right
    data, spk = entry
    filter_banks = get_filter_banks(data)
    for n in range(30, len(filter_banks) - 10):
        frame = filter_banks[n-30: n+11].reshape(41*40)
        data_list.append(frame)
        label_list.append(spk)

def get_filter_banks(data):
    all_filter_banks = []
    nframes = int(data.size/FRAME_SIZE) + 1
    for n in range(nframes):
        frame = data[n*FRAME_SIZE : (n+1)*FRAME_SIZE]
        if frame.size < FRAME_SIZE:
            frame = np.concatenate((frame,np.zeros(FRAME_SIZE - frame.size, dtype=int)))
        all_filter_banks.append(extract_filter_banks(frame))
    return np.array(all_filter_banks)

In [4]:
inputs = keras.layers.Input(shape=(NFILT*41,))
dense1 = keras.layers.Dense(256, kernel_regularizer='l2', activation='relu')(inputs)
dense2 = keras.layers.Dense(256, kernel_regularizer='l2', activation='relu')(dense1)
dense3 = keras.layers.Dense(256, kernel_regularizer='l2', activation='relu')(dense2)
drop_out1 = keras.layers.Dropout(0.5)(dense3)
dense4 = keras.layers.Dense(256, kernel_regularizer='l2', activation='relu')(drop_out1)
drop_out2 = keras.layers.Dropout(0.5)(dense4)
outputs = keras.layers.Dense(N_SPEAKER, activation='softmax')(drop_out2)
model = keras.models.Model(inputs=inputs, outputs=outputs)

In [4]:
inputs = keras.layers.Input(shape=(NFILT*41,))
dense1 = keras.layers.Dense(256, activation='relu')(inputs)
dense2 = keras.layers.Dense(256, activation='relu')(dense1)
dense3 = keras.layers.Dense(256, activation='relu')(dense2)
dense4 = keras.layers.Dense(256, activation='relu')(dense3)
outputs = keras.layers.Dense(N_SPEAKER, activation='softmax')(dense4)
model = keras.models.Model(inputs=inputs, outputs=outputs)

In [5]:
# train model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(np.array(train_data), np.array(train_label), epochs=50, shuffle=True, validation_data=(np.array(validation_data),np.array(validation_label)))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x13dba9b18b0>

In [6]:
model.save("saved_model/my_model")

INFO:tensorflow:Assets written to: saved_model/my_model\assets


In [11]:
load_back = tf.keras.models.load_model("saved_model/my_model")
load_back.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 40)]              0         
_________________________________________________________________
dense (Dense)                (None, 256)               10496     
_________________________________________________________________
dense_1 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_2 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0     

In [27]:
# process evaluation data
_, data = wavfile.read("../VCTK-Corpus/wav48/p227/p227_021.wav")
'''
for sample in range(len(data)):
    data[sample] += random.randrange(-20,20)
'''
emphasized_signal = np.append(data[0], data[1:] - pre_emphasis * data[:-1])
evaluation_data = []
evaluation_label = []
long_frame_filter_banks((emphasized_signal,2), evaluation_data, evaluation_label)

In [28]:
layer_name = 'dropout_1'
intermediate_layer_model = keras.models.Model(inputs=model.input,
                                 outputs=model.get_layer(layer_name).output)
intermediate_output = intermediate_layer_model.predict(np.array(evaluation_data))

In [29]:
test_loss, test_acc = model.evaluate(np.array(evaluation_data),  np.array(evaluation_label), verbose=1) 



In [30]:
d_vector = np.zeros(256)
for out in intermediate_output:
    d_vector += out/sum(out)

In [33]:
_, data = wavfile.read("../VCTK-Corpus/wav48/p227/p227_025.wav")
'''
for sample in range(len(data)):
    data[sample] += random.randrange(-20,20)
'''
emphasized_signal = np.append(data[0], data[1:] - pre_emphasis * data[:-1])
evaluation_data = []
evaluation_label = []
long_frame_filter_banks((emphasized_signal,4), evaluation_data, evaluation_label)
intermediate_output = intermediate_layer_model.predict(np.array(evaluation_data))
d_eva = np.zeros(256)
for out in intermediate_output:
    d_eva += out/sum(out)

In [34]:
np.corrcoef(d_vector,d_eva)

array([[1.        , 0.97769567],
       [0.97769567, 1.        ]])

In [16]:
summed = np.zeros(5)
for out in intermediate_output:
    summed += out
print(summed)

[2.13762568e-04 3.38687006e-04 7.23687697e-04 7.58267330e+00
 4.54160504e+01]


In [23]:
print(intermediate_output.shape)
print(len(intermediate_output))
summed = np.zeros(256)
for d in intermediate_output:
    summed += d
print(summed)

(186, 3)
186


ValueError: operands could not be broadcast together with shapes (256,) (3,) (256,) 

In [18]:
print(frames * np.hamming(FRAME_SIZE*3))

NameError: name 'frames' is not defined