<h2>Speaker Encoder</h2>

In [188]:
import os
from pathlib import Path
import sys
import pandas as pd
import librosa as lr
import librosa.display as ld
import glob
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import nnabla as nn
import nnabla.parametric_functions as PF
import nnabla.functions as F
import nnabla.solvers as S
import tensorflow as tf
from nnabla.utils.data_iterator import data_iterator_simple
import nnabla.monitor as M

In [241]:
############## Hyperparameters (to be moved to hparams.py)  #######################

### Directory Locations ###
#basedir = 'drive/My Drive/Colab Notebooks/SV2TTS'
data_dir = "./data/LJSpeech/"
label_dir = "./data/LJSpeech/labels/"
save_dir_mfcc = "./data/LJSpeech/mfcc/"
save_dir_transcripts = "./data/LJSpeech/transcripts/"

### FFT Parameters ###
sr = 22500
n_fft = 2048
hop_length = 512
n_mfcc = 13
mel_len = 290                      # frame length of mel spectrogram > Spectrogram is split into short-time frames
n_fft = 1024 
n_mels = 80                        # number of mel filters (number of Mel bands to generate)
hop_length = 256                   # audio samples between adjacent STFT columns
win_length = 1024                  # window length
mel_fmin = 0.0                     # minimum mel bank
mel_fmax = 8000                    # maximum mel bank
r = 3                              # number of frames generated on each timestep

### Model Parameters ###
batch_size = 20
lstm_layers = 3
lstm_hidden = 256
lstm_directions = 1
affine_hidden = 256
embed_size = 256

In [77]:
ys = np.load(label_dir + 's_id.npy')

In [78]:
len(ys)

9725

In [None]:
labels = np.utils.to_categorical(ys, )

In [27]:
xs = []
mfccs = sorted(glob.glob(save_dir_mfcc + '*.npy'))
for i in tqdm(range(len(mfccs))):
    xs.append(np.load(mfccs[i]))
    

100%|████████████████████████████████████████████████████████████████████████████| 9725/9725 [00:02<00:00, 3654.27it/s]


In [9]:
len(xs)

9725

In [256]:
def encoder_network(inputs, training = False):
    with nn.parameter_scope('encoder_network/lstm'):
        h = nn.Variable((lstm_layers, lstm_directions, batch_size, lstm_hidden))
        c = nn.Variable((lstm_layers, lstm_directions, batch_size, lstm_hidden))
        print("Encoder:")
        print(h.shape)
        
        y, hn, cn = PF.lstm(inputs, h, c, training = training)
    with nn.parameter_scope('encoder_network/dense'):
        out = PF.affine(hn[-1], affine_hidden)
        out = F.relu(out) 
        embeds = out/(np.linalg.norm(out,'fro'))
    return embeds

In [47]:
idx = {k: i for i, k in enumerate(sorted(set(ys)))}
print(idx)

{'LJ001': 0, 'LJ002': 1, 'LJ003': 2, 'LJ004': 3, 'LJ005': 4, 'LJ006': 5, 'LJ007': 6, 'LJ008': 7, 'LJ009': 8, 'LJ010': 9}


In [63]:
def get_one_hot(ys):
    idx = {k: i for i, k in enumerate(sorted(set(ys)))}
    labels = [idx[i] for i in ys]
    labels = np.array(labels)
    nb_classes = len(set(labels))
    res = np.eye(nb_classes)[np.array(labels).reshape(-1)]
    return res.reshape(list(labels.shape)+[nb_classes])

In [65]:
ys = get_one_hot(ys)

In [66]:
def load_func(i):
    return(xs[i], ys[i])

In [74]:
inputs = nn.utils.data_iterator.data_iterator_simple(load_func, 
                                                     len(ys), 
                                                     batch_size, 
                                                     shuffle = True, 
                                                     with_file_cache = False)

2020-12-06 17:18:40,365 [nnabla][INFO]: DataSource with shuffle(True)
2020-12-06 17:18:40,367 [nnabla][INFO]: Using DataSourceWithMemoryCache
2020-12-06 17:18:40,368 [nnabla][INFO]: DataSource with shuffle(True)
2020-12-06 17:18:40,369 [nnabla][INFO]: On-memory
2020-12-06 17:18:40,369 [nnabla][INFO]: Using DataIterator


In [75]:
xs.shape

(9725, 13, 141)

In [111]:
''' 
Get total utterances for each speaker
'''

utter_count = {k: 0 for k in sorted(set(ys))}
mfccs = {k:[] for k in sorted(set(ys))}

for i, s_id in enumerate(ys):
    utter_count[s_id]+=1
    mfccs[s_id].append(xs[i])


In [113]:
print(utter_count)
for key in mfccs:
    print(np.array(mfccs[key]).shape)

{'LJ001': 598, 'LJ002': 1130, 'LJ003': 1201, 'LJ004': 816, 'LJ005': 989, 'LJ006': 1049, 'LJ007': 850, 'LJ008': 1031, 'LJ009': 973, 'LJ010': 1088}
(598, 13, 141)
(1130, 13, 141)
(1201, 13, 141)
(816, 13, 141)
(989, 13, 141)
(1049, 13, 141)
(850, 13, 141)
(1031, 13, 141)
(973, 13, 141)
(1088, 13, 141)


In [117]:
mfccs['LJ001'][1]

array([[-3.021952  , -2.997819  , -3.0747917 , ..., -2.6599905 ,
        -2.8426325 , -2.770187  ],
       [-0.86400616, -0.7827282 , -0.6735879 , ...,  0.94493216,
         1.0947886 ,  1.4090455 ],
       [ 0.9714152 ,  1.0112687 ,  0.8982654 , ...,  0.5989028 ,
         0.6586964 ,  0.80930406],
       ...,
       [ 0.01241515, -0.00476354, -0.0360523 , ...,  0.06325874,
         0.10073408,  0.1603219 ],
       [ 0.5045489 ,  0.48247862,  0.43978587, ...,  0.17858744,
         0.16974679,  0.1903773 ],
       [ 0.12723784,  0.18586203,  0.19485734, ...,  0.04302476,
         0.07867512,  0.12346806]], dtype=float32)

In [201]:
'''
Create customized dataset
'''
n_speakers = len(set(utter_count)) #number of speakers
n_utterances = 2 #utterances per batch
max_utter = utter_count[max(utter_count, key=utter_count.get)] 

dataset = []
labels = []

## Batch_size =20 , each batch will contain 2 utterances from each of the 10 speakers
def split_dataset(xs, ys):
    
    for i in tqdm(range(max_utter)):
        
        for s_id in utter_count:
            max_idx = utter_count[s_id] #Allow repeating data
            
            ## Appending n_utterances for each speaker (in one batch)
            for j in range(n_utterances):
                data = mfccs[s_id][(i+j)%max_idx]
                dataset.append(data)
                labels.append(s_id)
                
                
    return dataset,labels

In [202]:
dataset, labels = split_dataset(xs,ys)

100%|██████████████████████████████████████████████████████████████████████████| 1201/1201 [00:00<00:00, 133794.40it/s]


In [242]:
def generate_batch(dataset,labels, batch_size):
    batch_data = dataset[:batch_size]
    batch_labels = labels[:batch_size]
    del dataset[:batch_size]
    del labels[:batch_size]
    dataset += batch_data
    labels += batch_labels
    
    return np.array(batch_data), np.array(batch_labels)

In [243]:
batch_size

20

In [173]:
sim_weight = nn.Variable([1], need_grad = True)
sim_weight.d = 10.0
sim_bias = nn.Variable([1], need_grad = True)
sim_bias.d = -5.0

In [163]:
def similarity_matrix(embeddings):
    """
        Computes the similarity matrix according the section 2.1 of GE2E.
        :param embeds: the embeddings as a tensor of shape (speakers_per_batch, 
        utterances_per_speaker, embedding_size)
        :return: the similarity matrix as a tensor of shape (speakers_per_batch,
        utterances_per_speaker, speakers_per_batch)
        """
    embeddings = np.reshape(embeddings, [n_speakers, n_utterances, embed_size])
    centroids_incl = embeddings.mean(axis = 1, keepdims = True) # one centroid per speaker
    centroids_incl = centroids_incl/np.linalg.norm(centroids_incl, axis = 2, keepdims = True)
    
    centroids_excl = np.sum(embeddings, axis = 1, keepdims = True) - embeddings
    centroids_excl /= (n_utterances - 1)
    centroids_excl = centroids_excl/np.linalg.norm(centroids_excl, axis = 2, keepdims = True)
    
    sim_matrix = np.zeros(n_speakers, n_utterances, n_speakers)
    mask_matrix = 1 - np.eye(n_speakers, dtype = np.int)
    
    for j in range(n_speakers):
        mask = np.where(mask_matrix[j])[0]
        sim_matrix[mask, :, j] = (embeddings[mask] * centroids_incl[j]).sum(axis=2)
        sim_matrix[j, :, j] = (embeddings[j] * centroids_excl[j]).sum(axis=1)
        
    sim_matrix = sim_matrix * sim_weight + sim_bias
    return sim_matrix

In [165]:
def get_one_hot(targets, nb_classes):
    res = np.eye(nb_classes)[np.array(targets).reshape(-1)]
    return res.reshape(list(targets.shape)+[nb_classes])

In [177]:
def get_loss(embeddings):
    sim_matrix = similarity_matrix(embeddings)
    sim_matrix = sim_matrix.reshape((n_speakers * n_utterances, n_speakers))
    ground_truth = np.repeat(np.arange(n_speakers), n_utterances)
    loss = F.softmax_cross_entropy(sim_matrix, get_one_hot(ground_truth, n_speakers))
    return loss

In [245]:
def forward(xs):
    embeds = encoder_network(xs, training = True)
    return embeds

In [250]:
n_batch = len(dataset)//batch_size
max_epochs = 1000

def train():
    monitor = M.Monitor('.')
    monitor_loss = M.MonitorSeries(
        "Training loss", monitor, interval=1000)
    monitor_time = M.MonitorTimeElapsed(
        "Training time", monitor, interval=1000)
    optimizer = S.RMSprop()
        
    for epoch in range(max_epochs):
        
        #Iterations per epoch
        
        for i in range(n_batch):
            xi = nn.Variable((batch_size, 13,141))            
            xi.d, yi = generate_batch(dataset,labels, batch_size)
            optimizer.zero_grad()
            embeddings = encoder_network(xi, True)
            loss = get_loss(embeddings)
            loss.backward()
            optimizer.update()
        
            # monitor
            itr = epoch * n_batch + i
            monitor_loss.add(itr, loss.d)
            monitor_time.add(itr)
        

In [257]:
train()

Encoder:
(3, 1, 20, 256)


RuntimeError: value error in nbla::LSTM<float>::setup_impl
c:\ci\builds\5czz_5xk\0\nnabla\builders\all\nnabla\src\nbla\function\./generic/lstm.cpp:48
Failed `hshape[2] == batch_size_`: Input h must be a 4 dimensional array with a shape of (num_layers, num_directions, batch_size, hidden_size).
