In [None]:
%%time
import os
import random
import numpy as np
from numpy import linalg as la
import pandas as pd
import scipy as sp
from scipy import signal
import librosa
from librosa import effects
import wave
import pyaudio

import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.layers import (Input, Conv2D,
    Flatten, LSTM, concatenate, TimeDistributed)
from tensorflow.keras.activations import relu
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.models import Model
from tensorflow.keras.losses import Loss

# Root directory of the training audio files
main = 'D:/Загрузки/train-clean/train/'

# Request deterministic TensorFlow ops and seed every random generator in use
os.environ['TF_CUDNN_DETERMINISM'] = '1'
os.environ['TF_DETERMINISTIC_OPS'] = '1'
np.random.seed(1)
tf.random.set_seed(1)
random.seed(1)

# For preventing failing during training on gpu.
# Iterating over the device list (instead of indexing gpus[0]) keeps this
# cell working on CPU-only machines, where the list is empty, and applies
# the same setting to every visible GPU.
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [None]:
%%time
# Walk the dataset tree and keep (directory, file names) pairs only for
# directories that directly contain at least one file
speak_with_dirs = [(root, files)
                   for root, _subdirs, files in os.walk(main)
                   if len(files) > 0]

In [None]:
# Get closest path = it's path to parent dir
parents = [directory for directory, _files in speak_with_dirs]

# Split every directory into (head, last component) with os.path so the
# code does not depend on the Windows-only '\\' separator: on POSIX the
# old split never matched, leaving the target equal to the whole path.
split_parents = [os.path.split(os.path.normpath(p)) for p in parents]

# Last dir in parent path = speaker's target
targets = [tail for _head, tail in split_parents]

# Paths without targets, always written with forward slashes
paths = [head.replace(os.sep, '/') for head, _tail in split_parents]

# Associate with speaker files
wavs = [files for _directory, files in speak_with_dirs]

In [None]:
# Keep only the files with the .flac extension
wavs = [[name for name in names if name.endswith('.flac')]
        for names in wavs]

In [None]:
# Attach to every file its directory path and its target
wavs = [
    [(parent_path, file_name, label) for file_name in file_names]
    for label, file_names, parent_path in zip(targets, wavs, paths)
]

In [None]:
import itertools

# Flatten the per-directory lists into one flat list of records
wavs = list(itertools.chain.from_iterable(wavs))
print(f'Count of audio files {len(wavs)}')

In [None]:
%%time
# One row per audio file
df_all = pd.DataFrame(wavs, columns=['Parent', 'File', 'Target'])

# Full path = Parent/Target/File
fulls = [
    '/'.join(parts)
    for parts in zip(df_all['Parent'], df_all['Target'], df_all['File'])
]
df_all['Full_path'] = fulls

In [None]:
# Number of records per target, most frequent first
target_vc = df_all['Target'].value_counts()

# Select n most popular speakers by id
count_speakers = 5
top_speaks = target_vc.iloc[:count_speakers]
top_ids = top_speaks.index

# Keep only the records of the selected speakers
df = df_all[df_all['Target'].isin(top_ids)]
print(f'{df.shape[0]} records by {len(top_ids)} speakers')

# Renumber the rows of the subset from zero
df = df.reset_index(drop=True)

Building the dataset of triplets.
Each sample includes an anchor, a positive and a negative example.

In [None]:
%%time
# a - Anchors and positives collection: one sub-frame per selected speaker
a = [df[df['Target'] == label] for label in top_ids]

In [None]:
%%time
df1 = pd.concat(a)
# Records count for each speaker
three_len = 30

# Collections of anchors and positives examples paths
anchors = []
positives = []

for label in top_ids:
    speaker_paths = df1.loc[df1['Target'] == label, 'Full_path']

    # First three_len records are anchors, the next three_len are positives
    anchors.append(speaker_paths.iloc[:three_len].to_numpy())
    positives.append(
        speaker_paths.iloc[three_len:2 * three_len].to_numpy())

print('Selected {} files'.format(three_len * count_speakers))

In [None]:
%%time
# Flatten the per-speaker arrays into one array of paths each
anchors = np.array(list(itertools.chain(*anchors)))
positives = np.array(list(itertools.chain(*positives)))

# Dataset length
samples_count = len(anchors)

# Select random samples of rest dataset
# to build impostor set.
# The pool is filtered by 'Target' rather than by index: the index of df
# was renumbered from zero, so it no longer identifies rows of df_all and
# an index-based filter would let the selected speakers leak into the
# negatives.
neg_df = df_all[~df_all['Target'].isin(top_ids)]
imposts = neg_df.sample(samples_count, random_state=5)
negatives = imposts['Full_path'].to_numpy()

In [None]:
# Turn every path collection into a single column
anchors = anchors.reshape(samples_count, 1)
positives = positives.reshape(samples_count, 1)
negatives = negatives.reshape(samples_count, 1)

# One row per triplet: (anchor, positive, negative)
dataset = np.hstack((anchors, positives, negatives))
anch_paths, pos_paths, neg_paths = dataset.T

# Flat paths of whole dataset
ds_all = dataset.ravel()

In [None]:
%%time
# Dataset default samples rate
samples_rate = 16_000
emph_alpha = 0.95

# Load every file at the target rate and apply pre-emphasis to it
audios = [
    effects.preemphasis(librosa.load(path, sr=samples_rate)[0],
                        coef=emph_alpha)
    for path in ds_all
]

In [None]:
# Framing and spectrogram parameters
frame_sec_size = 0.025
# NOTE: despite its name this value is used as the STFT hop length below
overlap_sec_size = 0.02
nfft = int(samples_rate * frame_sec_size)
win_len = nfft
hop_len = int(samples_rate * overlap_sec_size)
num_segments = 2
num_features = nfft // 2 + 1
hamming = signal.windows.hamming(win_len)

params_report = ('Window {}, overlap: {}\n'
                 'consecutive segments: {}\n'
                 'frequencies features: {}')
print(params_report.format(win_len, hop_len,
                           num_segments, num_features))

In [None]:
%%time
# Locate the non-silent intervals of every recording
split_kwargs = dict(frame_length=win_len, hop_length=hop_len, top_db=20)
no_silence = [effects.split(samples, **split_kwargs)
              for samples in audios]

In [None]:
%%time
# Glue the non-silent pieces of every recording back together
soundeds = [
    np.concatenate([samples[start:end] for start, end in intervals])
    for samples, intervals in zip(audios, no_silence)
]
cleared_lens = list(map(len, soundeds))

In [None]:
%%time
# Padding or truncating audios to threshold length
length_threshold = int(np.quantile(cleared_lens, 0.9))
pad_sounds = np.zeros((len(soundeds), length_threshold))

for row, sound in zip(pad_sounds, soundeds):
    # Slicing is a no-op for short recordings and truncates long ones;
    # the untouched tail of the row stays zero-padded
    kept = sound[:length_threshold]
    row[:len(kept)] = kept

# Group the recordings back into (anchor, positive, negative) triplets
d = pad_sounds.reshape((samples_count, 3, -1))
d.shape

In [None]:
# Length of every padded/truncated recording: samples divided by sample rate
audio_dur = length_threshold / samples_rate
print(f'Audio duration: {audio_dur}')

1. Spectrogram segmentation over time
   The function below splits a spectrogram array into overlapping segments,
   each made of num_segs consecutive frames of the full spectrogram

In [None]:
def segment_spectrogram(stft_data, num_segs, num_ftrs):
    """Cut a spectrogram into sliding windows of consecutive frames.

    The first ``num_segs - 1`` frames are prepended to the spectrogram so
    that one window is produced per original frame.

    Args:
        stft_data: 2-D array of shape (features, frames).
        num_segs: number of consecutive frames in one window.
        num_ftrs: number of feature rows of the output.

    Returns:
        float32 array of shape (windows, num_ftrs, num_segs, 1).
    """
    padded = np.concatenate(
        [stft_data[:, :num_segs - 1], stft_data], axis=1)
    n_windows = padded.shape[1] - num_segs + 1

    # Allocate directly in the final layout instead of transposing later
    windows = np.zeros((n_windows, num_ftrs, num_segs, 1))
    for start in range(n_windows):
        windows[start, :, :, 0] = padded[:, start:start + num_segs]

    return windows.astype(np.float32)

In [None]:
def get_spectrogram(wave, _nfft, _hop_len, _win_len, _window):
    """Return the standardized magnitude spectrogram of a signal.

    Args:
        wave: 1-D audio signal.
        _nfft: FFT size passed to ``librosa.stft`` as ``n_fft``.
        _hop_len: hop length between frames, in samples.
        _win_len: window length, in samples.
        _window: window specification passed to ``librosa.stft``.

    Returns:
        Magnitude spectrogram with its global mean subtracted and divided
        by its global standard deviation. When the standard deviation is
        zero (constant spectrogram, e.g. an all-zero signal) only the mean
        is subtracted, so the result is all zeros instead of NaN.
    """
    spec = np.abs(librosa.stft(wave, n_fft=_nfft, hop_length=_hop_len,
                               win_length=_win_len, window=_window))
    spec_mean = np.mean(spec)
    spec_std = np.std(spec)
    centered = spec - spec_mean
    if spec_std == 0:
        # Dividing by a zero deviation would fill the features with NaN
        return centered
    return centered / spec_std

In [None]:
%%time
# (triplets, frames, frequency features, segments, channel)
spec_shape = (len(d), length_threshold // hop_len + 1,
              num_features, num_segments, 1)
anch_specs = np.empty(spec_shape, np.float32)
pos_specs = np.empty(spec_shape, np.float32)
neg_specs = np.empty(spec_shape, np.float32)

# Output arrays in the same order as the members of a triplet
triplet_specs = (anch_specs, pos_specs, neg_specs)

for i, three in enumerate(d):
    for out_specs, member_wave in zip(triplet_specs, three):
        member_spec = get_spectrogram(member_wave, nfft, hop_len,
                                      win_len, hamming)
        out_specs[i] = segment_spectrogram(
            member_spec, num_segments, num_features)

print('Spectrogram shape {}'.format(spec_shape[1:]))

Spectrogram convolution class

In [None]:
class SpectrogramConvolution:
    """Two stacked time-distributed 2-D convolutions followed by flattening.

    Both convolutions use a single filter, a (5, 3) kernel, stride 1,
    ReLU activation and 'same' padding. Each one is wrapped in
    ``TimeDistributed``, and the result is passed through ``Flatten``.
    """

    def __init__(self, _shape):
        """Create the layers.

        Args:
            _shape: shape tuple; its entries after the first one are
                passed to the first convolution as ``input_shape``.
        """
        self.conv1 = Conv2D(filters=1, input_shape=_shape[1:],
                            kernel_size=(5, 3), strides=1,
                            activation=relu, padding='same')
        self.conv2 = Conv2D(filters=1, kernel_size=(5, 3),
                            strides=1, activation=relu, padding='same')
        
        # Wrap both convolutions in TimeDistributed
        self.distr = TimeDistributed(self.conv1)
        self.distr2 = TimeDistributed(self.conv2)
        self.flat = Flatten()

    def convolute(self, inputs):
        """Apply both time-distributed convolutions, then flatten.

        Args:
            inputs: batch fed to the first time-distributed convolution.

        Returns:
            Output of ``Flatten`` applied to the second convolution's result.
        """
        inputs = self.distr(inputs)
        inputs = self.distr2(inputs)
        return self.flat(inputs)

In [None]:
# Shape of one sample: the shape of anch_specs without its first axis
_input_shape = anch_specs.shape[1:]
print(f'Input to convolution: {_input_shape}')
# The same extractor instance is applied to anchors, positives and negatives
spec_conv = SpectrogramConvolution(_input_shape)

In [None]:
%%time
# Run the three parts of the dataset through the same convolution stack,
# in anchor, positive, negative order
conv_anchors, conv_pos, conv_neg = [
    spec_conv.convolute(specs)
    for specs in (anch_specs, pos_specs, neg_specs)
]

In [None]:
%%time
# Restore the time axis lost by flattening: (samples, frames, features)
timed_shape = (conv_anchors.shape[0], _input_shape[0], -1)

conv_anchors, conv_pos, conv_neg = [
    tf.reshape(flat_features, timed_shape).numpy()
    for flat_features in (conv_anchors, conv_pos, conv_neg)
]

In [None]:
class TripletLoss(Loss):
    """Triplet loss over predictions stacked along the first axis.

    ``y_pred`` must hold exactly three entries along axis 0: the anchor,
    positive and negative embeddings. ``y_true`` is ignored.
    """

    def __init__(self, margin):
        """Store the margin added to the distance difference."""
        self.margin = margin
        super().__init__()

    def call(self, y_true, y_pred):
        """Return the mean hinge of (pos_dist - neg_dist + margin)."""
        assert y_pred.shape[0] == 3
        anchor, positive, negative = tf.unstack(y_pred)

        # Squared Euclidean distances along the embedding axis
        pos_dist = tf.reduce_sum(tf.square(anchor - positive), axis=-1)
        neg_dist = tf.reduce_sum(tf.square(anchor - negative), axis=-1)

        hinge = tf.maximum(pos_dist - neg_dist + self.margin, 0.0)
        return tf.reduce_mean(hinge, axis=0)

In [None]:
def base_rnn(_shape, out_units):
    """Build the recurrent encoder used by every branch of the network.

    Args:
        _shape: shape of one input sample (without the batch axis).
        out_units: number of units of the second LSTM layer.

    Returns:
        Keras ``Model``: LSTM(16, sequences) -> LSTM(out_units) -> Flatten.
    """
    sequence_in = Input(shape=_shape, name='input')
    hidden = LSTM(16, return_sequences=True, name='seq2seq')(sequence_in)
    hidden = LSTM(out_units, name='seq2one')(hidden)
    embedding = Flatten(name='flattened')(hidden)
    return Model(inputs=sequence_in, outputs=embedding)


def build_siam(input_shape, out_units,
               optimizer, loss):
    """Build and compile the three-input network with a shared encoder.

    Args:
        input_shape: shape of one input sample (without the batch axis).
        out_units: embedding size produced by the shared encoder.
        optimizer: optimizer passed to ``Model.compile``.
        loss: loss passed to ``Model.compile``.

    Returns:
        Compiled Keras ``Model`` taking [anchor, positive, negative] inputs
        and returning the three embeddings stacked along a new axis 0.
    """
    triplet_inputs = [Input(shape=input_shape, name=role)
                      for role in ('anchor', 'positive', 'negative')]

    # One encoder instance, so all three branches share the same weights
    rnn = base_rnn(input_shape, out_units)
    embeddings = [rnn(branch_input) for branch_input in triplet_inputs]

    # Add a leading axis to each embedding and join them along it
    expanded = [tf.expand_dims(emb, axis=0) for emb in embeddings]
    output = concatenate(expanded, axis=0)

    model = Model(inputs=triplet_inputs, outputs=output)
    model.compile(optimizer=optimizer, loss=loss)
    return model

In [None]:
%%time
# Hyper-parameters of the network and of the loss
_out_units = 128
_margin = 0.15

# Shape of one convolved sample (without the batch axis)
siam_input = conv_anchors.shape[1:]
optim = SGD()
triplet = TripletLoss(margin=_margin)

siam = build_siam(
    input_shape=siam_input,
    out_units=_out_units,
    optimizer=optim,
    loss=triplet,
)

In [None]:
%%time
# Number of leading samples held out for testing
test_len = 0
train_data = [part[test_len:]
              for part in (conv_anchors, conv_pos, conv_neg)]

num_epochs = 20
batch_size = 16

# Dummy 'true' labels to provide gradients
dummy_labels = np.zeros(len(train_data[0]))

history = siam.fit(
    x=train_data,
    y=dummy_labels,
    epochs=num_epochs,
    batch_size=batch_size,
)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Training loss recorded after every epoch
loss = history.history['loss']
epochs = np.arange(1, len(loss) + 1)

fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(epochs, loss)
ax.set_xlabel('Epochs')
ax.set_ylabel('Triplet loss')
ax.grid()

Creating the database of encodings.
One vector per recognized speaker.

In [None]:
%%time
# Extract convolution vectors for each anchor
# Unique labels
user_labels = top_ids.to_numpy()
# Unique encodings (one-to-one): every three_len-th anchor, starting at 0
target_convs = conv_anchors[::three_len].copy()
target_convs.shape

In [None]:
%%time
## Zero-filled dummy variable to substitute pos and neg inputs
## Only anchor prediction will be used further.
## Zeros are used instead of np.empty so the model is never fed
## uninitialized memory.
dummy_x = np.zeros_like(target_convs)
verify_data = [target_convs, dummy_x, dummy_x]

## Predict vectors for convolution in database
## Using direct model call instead of predict due to very small amount of data
verify_vects = siam(verify_data)[0]

print('There are {}-length encoding for each of {} speakers'.format(
    verify_vects.shape[1], verify_vects.shape[0]
))

In [None]:
# Map every speaker label to its reference encoding
database = {user_labels[i]: verify_vects[i]
            for i in range(len(user_labels))}

In [None]:
## Verify one record with known database
def identity_verification(audio_conv, db: dict, model, threshold=1):
    """Find the database speaker closest to one recording.

    Args:
        audio_conv: 2-D feature array of a single recording.
        db: mapping from speaker label to its reference encoding.
        model: callable taking [anchor, positive, negative] batches and
            returning the three encodings stacked along axis 0.
        threshold: largest distance still accepted as a match; it is also
            the distance returned when nobody matches.

    Returns:
        Tuple ``(min_dist, verified, identity)``: the smallest distance
        found below the threshold (or the threshold itself), whether any
        speaker matched, and the matched label (``None`` if none).
    """
    assert len(audio_conv.shape) == 2

    batch = np.expand_dims(np.asarray(audio_conv), axis=0)
    # Zero-filled placeholders for the unused positive/negative inputs
    dummy_vect = np.zeros_like(batch)

    # Keep only the anchor encoding (index 0 of the stacked output);
    # comparing against the whole output would mix the placeholder
    # encodings into the distance.
    encoding = np.asarray(model([batch, dummy_vect, dummy_vect])[0])

    min_dist = threshold
    verified = False
    identity = None

    for label, db_enc in db.items():
        dist = la.norm(np.asarray(db_enc) - encoding)
        if dist < min_dist:
            min_dist = dist
            identity = label
            verified = True

    return min_dist, verified, identity

In [None]:
# Check the first anchor recording against the database
dist, is_verif, label = identity_verification(
    audio_conv=conv_anchors[0],
    db=database,
    model=siam,
)

In [None]:
%%time
# Loss of the trained model on the training triplets
eval_loss = siam.evaluate(train_data, dummy_labels)
eval_loss

In [None]:
%%time
# Held-out triplets: the first test_len samples
test_data = [conv_anchors[:test_len],
             conv_pos[:test_len],
             conv_neg[:test_len]]

if test_len > 0:
    # The model output stacks the three embeddings along axis 0, so the
    # whole test set is predicted as a single batch; presumably several
    # batches would be joined along that same axis and break the
    # 3-entry layout the loss expects.
    preds = siam.predict(test_data, batch_size=test_len)
    test_loss = triplet(None, preds)
    print(f'Test triplet loss: {test_loss.numpy():.4f}')
else:
    # An empty split (and batch_size=0) cannot be predicted
    print('Test split is empty (test_len = 0); skipping test evaluation')