In [1]:
import tensorflow as tf
import numpy as np
import sys

In [2]:
import EER_mod
import read_mod

In [3]:
# Open the list of speech files; each listed file holds a speech that was
# passed through openl3 and stored in .npy format.
fl_inp = open('S10_sp3_for_model.txt','r')

# Show the content of the list, one entry per line.
for listed_path in fl_inp:
    print(listed_path.strip())

# Rewind the handle so the list can be read again from the beginning.
# This is the last expression of the cell, so the new position (0) is displayed.
fl_inp.seek(0)

speeches/19-198-0000.npy
speeches/19-198-0001.npy
speeches/19-198-0002.npy
speeches/26-495-0000.npy
speeches/26-495-0001.npy
speeches/26-495-0002.npy
speeches/27-123349-0000.npy
speeches/27-123349-0001.npy
speeches/27-123349-0002.npy
speeches/32-21625-0000.npy
speeches/32-21625-0001.npy
speeches/32-21625-0002.npy
speeches/39-121914-0000.npy
speeches/39-121914-0001.npy
speeches/39-121914-0002.npy
speeches/40-121026-0000.npy
speeches/40-121026-0001.npy
speeches/40-121026-0002.npy
speeches/60-121082-0000.npy
speeches/60-121082-0001.npy
speeches/60-121082-0002.npy
speeches/78-368-0000.npy
speeches/78-368-0001.npy
speeches/78-368-0002.npy
speeches/83-11691-0000.npy
speeches/83-11691-0001.npy
speeches/83-11691-0002.npy
speeches/87-121553-0000.npy
speeches/87-121553-0001.npy
speeches/87-121553-0002.npy


0

In [4]:
# Load the .npy files named in the list and create labels from the file names:
# the number before the '-' sign stands for the speaker, e.g.,
# in 19-xxxx-xxx.npy, 19 is the speaker's ID.
# NOTE(review): the meaning of the four returned values is inferred from their
# names and later use; confirm against read_mod.read_speeches.
speech_set = read_mod.read_speeches(fl_inp)
n_S, n_speeches, emb, labels = speech_set

# total number of labelled data points
n_data = len(labels)

In [5]:
# Number of neurons in the input layer: the size of the second dimension
# of the embedding array.
n_in = emb.shape[1]
print('number of neurons in the input layer = ', n_in)

number of neurons in the input layer =  6144


In [6]:
# Shuffle the data.
# A fixed seed makes the permutation, and hence the train/test split derived
# from it, reproducible on Restart & Run All. It does not seed TensorFlow,
# so weight initialisation during training is still random.
SHUFFLE_SEED = 42
rng = np.random.default_rng(SHUFFLE_SEED)
rand_perm = rng.permutation(n_data)

# Apply the same permutation to both arrays so every label stays aligned
# with its embedding.
labels = labels[rand_perm]
emb = emb[rand_perm]

In [7]:
# Divide the data into test and training sets.
# The test set is the first 10% of the data points; the rest is the training set.
TEST_FRACTION = 0.1
n_test = int(n_data * TEST_FRACTION)
print('size of the testing set = ', n_test)

# leading n_test rows -> test set
data_test, labels_test = emb[:n_test], labels[:n_test]

# remaining rows -> training set
data_train, labels_train = emb[n_test:], labels[n_test:]

print('size of the training set = ', len(labels_train))

size of the testing set =  350
size of the training set =  3158


In [8]:
# Construct the model with only 1 hidden layer: a ReLU layer of n_hidden
# units followed by a layer with n_S outputs and no activation
# (presumably one logit per speaker — n_S comes from read_mod.read_speeches).
n_hidden = 512
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(n_hidden, activation='relu'))
model.add(tf.keras.layers.Dense(n_S))

# Compile the model. The last layer has no activation, so the loss is told
# to treat its outputs as logits.
logit_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=logit_loss, metrics=['accuracy'])

# Train the model.
# The fit call is kept as the last expression of the cell, so the History
# object it returns is displayed.
epochs=20
model.fit(data_train, labels_train, epochs=epochs, verbose=2)

Epoch 1/20
99/99 - 2s - loss: 4.9581 - accuracy: 0.4785 - 2s/epoch - 17ms/step
Epoch 2/20
99/99 - 1s - loss: 0.4893 - accuracy: 0.8838 - 1s/epoch - 14ms/step
Epoch 3/20
99/99 - 1s - loss: 0.2357 - accuracy: 0.9531 - 1s/epoch - 14ms/step
Epoch 4/20
99/99 - 1s - loss: 0.1233 - accuracy: 0.9870 - 1s/epoch - 13ms/step
Epoch 5/20
99/99 - 2s - loss: 0.0693 - accuracy: 0.9962 - 2s/epoch - 16ms/step
Epoch 6/20
99/99 - 1s - loss: 0.0483 - accuracy: 0.9991 - 1s/epoch - 15ms/step
Epoch 7/20
99/99 - 1s - loss: 0.0427 - accuracy: 0.9991 - 1s/epoch - 14ms/step
Epoch 8/20
99/99 - 1s - loss: 0.0312 - accuracy: 0.9987 - 1s/epoch - 14ms/step
Epoch 9/20
99/99 - 1s - loss: 0.0238 - accuracy: 1.0000 - 1s/epoch - 13ms/step
Epoch 10/20
99/99 - 1s - loss: 0.0176 - accuracy: 0.9997 - 1s/epoch - 13ms/step
Epoch 11/20
99/99 - 1s - loss: 0.0150 - accuracy: 0.9997 - 1s/epoch - 13ms/step
Epoch 12/20
99/99 - 1s - loss: 0.0117 - accuracy: 1.0000 - 1s/epoch - 13ms/step
Epoch 13/20
99/99 - 1s - loss: 0.0110 - accuracy:

<keras.callbacks.History at 0x7fe3a81e69d0>

In [9]:
# Test the model on the held-out test set.
# The model was compiled with metrics=['accuracy'], so evaluate() returns
# the loss followed by the accuracy.
test_loss, test_acc = model.evaluate(data_test,  labels_test)
print('test_loss = ', test_loss)
# The second value is the accuracy, so it is printed under its own label.
print('test_acc = ', test_acc)

test_loss =  0.024889269843697548
test_loss =  0.9942857027053833


In [11]:
# Create the new model for feature extraction: it reuses the layers of
# `model`, takes the same input, and outputs the activations of the layer
# before the last one.
feature_layer = model.layers[-2]
new_model = tf.keras.Model(inputs=model.input, outputs=feature_layer.output)

# Save the model for usage in speakers comparison.
# NOTE(review): the object written to disk is the full classifier `model`,
# not the feature extractor `new_model` — confirm this is what the
# comparison code expects to load.
model_name = f'model_S{n_S}_speeches{n_speeches}'
model.save(model_name)

INFO:tensorflow:Assets written to: model_S10_speeches30/assets


In [12]:
# Calculate EER on the test set.
# The feature-extractor outputs for the test points are handed to EER_mod.EER
# together with their labels; all data points from the test set will be
# compared with each other.
hidden_layer_pred = new_model.predict(data_test)

eer_result = EER_mod.EER(hidden_layer_pred, labels_test)
EER, thres = eer_result

# Report the EER as a percentage with one decimal and the threshold with three.
EER_percent = EER * 100
print('EER = ', f'{EER_percent: .1f}','%')
print('Threshold = ', f'{thres: .3f}')

EER =   3.1 %
Threshold =   0.981


In [13]:
# Save the name of the model, the threshold, and the list of speeches
# which were used for training.
# The output file is written inside a `with` block so it is flushed and
# closed as soon as everything has been written.
with open(model_name+'.dat','w') as fl_model:
    print('model_name = ', model_name, file = fl_model)
    print('thres = ', thres, file = fl_model)

    # Rewind the speech list and copy every entry into the model file.
    fl_inp.seek(0)
    for speech in fl_inp:
        print('speech = ', speech.strip(), file = fl_model)

# NOTE(review): fl_inp is intentionally left open here in case later cells
# still read from it; close it once it is no longer needed.