In [1]:
from scipy.io import wavfile
import os
import pickle
import numpy as np
import tensorflow as tf
from tensorflow import keras
from pre_process import form_input_data
pre_emphasis = 0.97  # coefficient of the pre-emphasis filter: y[t] = x[t] - pre_emphasis * x[t-1]

In [2]:
# Name of the layer whose activations are tapped (256 units per the model summary).
layer_name = 'dropout_1'

# Restore the saved Keras model from disk and print its layer table.
model = tf.keras.models.load_model("saved_model/my_model")
model.summary()

# Sub-model sharing the loaded model's input but ending at `layer_name`, so
# predict() returns that layer's activations rather than the final output.
tap_layer = model.get_layer(layer_name)
intermediate_layer_model = keras.models.Model(
    inputs=model.input,
    outputs=tap_layer.output,
)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 1640)]            0         
_________________________________________________________________
dense (Dense)                (None, 256)               420096    
_________________________________________________________________
dense_1 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_2 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0     

In [3]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a trusted 'utterance_list.pkl'.
with open('utterance_list.pkl', 'rb') as pkl_file:
    utterance, spk_list = pickle.load(pkl_file)

# Empty containers that the following cells fill in.
enrollment_data = []
enrollment_label = []
d_utterance_list = []

pid = 5  # pick 5 as a random speaker
speaker = spk_list[pid]
wav_root = "../VCTK-Corpus/wav48/"

# Collect five pre-emphasised enrollment waveforms, each paired with the speaker index.
enrollment_dataset = []
for utt_idx in range(5):
    # pop(0) removes the name from the speaker's file list, so every pass
    # takes the next unused file.
    wav_name = utterance[speaker]['files'].pop(0)
    # Whole file is read into memory; this gets expensive with many speakers.
    _, samples = wavfile.read(wav_root + speaker + "/" + wav_name)
    # Pre-emphasis: keep the first sample, then x[t] - pre_emphasis * x[t-1].
    first_sample = samples[0]
    filtered_rest = samples[1:] - pre_emphasis * samples[:-1]
    emphasized = np.append(first_sample, filtered_rest)
    enrollment_dataset.append((emphasized, pid))

In [4]:
# Build one d-vector per enrollment utterance, then average them into the
# speaker model `d_model`.
#
# The result list is created in this cell (instead of being appended to a list
# created elsewhere) so that re-running the cell does not accumulate duplicate
# d-vectors.
d_utterance_list = []  # per-utterance d-vectors, kept for future uncertainty measures
for entry in enrollment_dataset:
    # Fresh buffers for every utterance; form_input_data is given the lists to
    # fill (presumably it appends one input row per frame - confirm in pre_process).
    enrollment_data = []
    enrollment_label = []
    form_input_data(entry, enrollment_data, enrollment_label)
    intermediate_output = intermediate_layer_model.predict(np.array(enrollment_data))

    # Utterance d-vector: sum over rows of the activations, each row scaled by
    # its own sum. The arithmetic is kept exactly as in the evaluation cells so
    # enrollment and evaluation vectors remain comparable.
    # NOTE(review): a row whose activations sum to 0 would inject NaN/inf here.
    d_utterance = np.zeros(256)
    for out in intermediate_output:
        d_utterance += out/sum(out)
    d_utterance_list.append(d_utterance)

# Speaker model: element-wise mean of the utterance d-vectors.
d_model = np.mean(d_utterance_list, axis=0)


In [5]:
# Pre-process the evaluation utterance (same speaker as the enrollment set).
evaluation_data = []
evaluation_label = []
_, data = wavfile.read("../VCTK-Corpus/wav48/p230/p230_280.wav")

# Pre-emphasis: keep the first sample, then x[t] - pre_emphasis * x[t-1].
filtered_tail = data[1:] - pre_emphasis * data[:-1]
emphasized_signal = np.append(data[0], filtered_tail)
form_input_data((emphasized_signal, pid), evaluation_data, evaluation_label)

# Activations of the tapped layer for every input row of this utterance.
intermediate_output = intermediate_layer_model.predict(np.array(evaluation_data))

# Evaluation d-vector: running sum of the rows, each divided by its own sum.
d_eva = np.zeros(256)
for frame_activation in intermediate_output:
    frame_total = sum(frame_activation)
    d_eva = d_eva + frame_activation / frame_total

In [6]:
# Correlate the evaluation d-vector with each enrollment utterance d-vector.
# np.corrcoef returns the full 2x2 correlation matrix; the off-diagonal entry
# is the correlation between the two vectors.
for enrolled_vector in d_utterance_list:
    pair_corr = np.corrcoef(enrolled_vector, d_eva)
    print(pair_corr)

[[1.         0.89614739]
 [0.89614739 1.        ]]
[[1.         0.88919356]
 [0.88919356 1.        ]]
[[1.         0.88289639]
 [0.88289639 1.        ]]
[[1.         0.91427108]
 [0.91427108 1.        ]]
[[1.         0.90908563]
 [0.90908563 1.        ]]


In [7]:
# Correlation matrix between the averaged speaker model and the evaluation d-vector.
model_corr = np.corrcoef(d_model, d_eva)
print(model_corr)

[[1.         0.91814532]
 [0.91814532 1.        ]]


In [8]:
# Pre-process the evaluation utterance (a different speaker this time).
evaluation_data = []
evaluation_label = []
_, data = wavfile.read("../VCTK-Corpus/wav48/p237/p237_280.wav")

# Pre-emphasis: keep the first sample, then x[t] - pre_emphasis * x[t-1].
filtered_tail = data[1:] - pre_emphasis * data[:-1]
emphasized_signal = np.append(data[0], filtered_tail)
# NOTE(review): the label slot still carries `pid` of the enrolled speaker;
# evaluation_label is not read anywhere in this cell, so it looks harmless - confirm.
form_input_data((emphasized_signal, pid), evaluation_data, evaluation_label)

# Activations of the tapped layer for every input row of this utterance.
intermediate_output = intermediate_layer_model.predict(np.array(evaluation_data))

# Evaluation d-vector: running sum of the rows, each divided by its own sum.
d_eva = np.zeros(256)
for frame_activation in intermediate_output:
    frame_total = sum(frame_activation)
    d_eva = d_eva + frame_activation / frame_total

In [9]:
# Correlate the evaluation d-vector with each enrollment utterance d-vector.
# np.corrcoef returns the full 2x2 correlation matrix; the off-diagonal entry
# is the correlation between the two vectors.
for enrolled_vector in d_utterance_list:
    pair_corr = np.corrcoef(enrolled_vector, d_eva)
    print(pair_corr)

[[1.         0.41615877]
 [0.41615877 1.        ]]
[[1.         0.51021078]
 [0.51021078 1.        ]]
[[1.         0.52906224]
 [0.52906224 1.        ]]
[[1.         0.44008616]
 [0.44008616 1.        ]]
[[1.        0.3603832]
 [0.3603832 1.       ]]


In [10]:
# Correlation matrix between the averaged speaker model and the evaluation d-vector.
model_corr = np.corrcoef(d_model, d_eva)
print(model_corr)

[[1.         0.47151495]
 [0.47151495 1.        ]]
