TACOTRON

In [1]:
from __future__ import print_function
import mxnet as mx
import numpy as np
from mxnet import nd, autograd
from IPython.display import clear_output
ctx= mx.cpu()
import csv
import codecs
import re
import audio_process
import traceback
import subprocess
import math
from params import Hyperparams as hp 


In [4]:

# command="ls train_data/*wav|while read i; do sox $i -n stat 2>&1|grep Leng|cut -d':'  -f 2|sed -r 's/\s//g'; done|awk 'BEGIN{max_len=0}{if($0>max_len)max_len=$0}END{print max_len}'"
# max_len=subprocess.Popen(command, shell=True, stdout=subprocess.PIPE).stdout.read().strip()
# max_audio_sec_len=math.ceil((float(max_len.decode("utf-8"))))

In [97]:
num_hidden = 256
reduction_factor=2
emb_size=256
batch_size=1


def generate_vocabulary(texts_list):    
    # get unique chars and put into a list
    return list(set(''.join(texts_list)))
    

def generate_chars2numbers_mappings(vocabulary):
    # create a chars <-> numbers mappings
    char2index = {char:i for i,char in enumerate(vocabulary)}
    index2char = {i:char for i,char in enumerate(vocabulary)}
    
    return char2index,index2char


def text2numbers(texts_list,char2index_mapping):
    numerical_texts=[]
    for text in texts_list:
        numerical_texts.append([char2index_mapping[char] for char in text])
    return numerical_texts

def open_data(input_file_path):
      
    texts, sound_files = [], []
    
    reader = csv.reader(codecs.open(input_file_path, 'rb', 'utf-8'))
    for row in reader:
        sound_filename, text = row
        sound_file = "train_data/" + sound_filename + ".wav"
        text = re.sub(r"[^ a-z']", "", text.strip().lower())
         
        texts.append(text)
        sound_files.append(sound_file)
             
    return texts, sound_files

def generate_train_eval_data(text_trainset, sound_labels):
    
    assert len(sound_labels) == len(text_trainset)
    
    print("Generating spectrograms")
    
    #tuples of wav and sr of that wav. wav is a 1D floats vector
    wavs_srs = [audio_process.load_wave(sound_clip) for sound_clip in sound_labels]
    longest_wav_sr = (max(wavs_srs, key= lambda wav: len(wav[0])))
    #save the longest audio file length
    max_samples_length=(len(longest_wav_sr[0]))
    print("max audio sample length:",max_samples_length)

    spectra_lin = np.zeros((len(sound_labels),1+(hp.n_fft//2),math.ceil(max_samples_length/hp.hop_length)))
    print("Padding and compute spectrum")
    for indx,wav_sr in enumerate(wavs_srs):
        wav = wav_sr[0]
        wav_length = len(wav)
#         print("wav l",w_length)
        diff = max_samples_length-wav_length
#         print("num of zeros to add",diff)
        padded = np.append(wav,np.zeros(diff))
        spectrum_lin, spectrum_mel=(audio_process.do_spectrogram(y=padded,sr=hp.sr))
#         print(padded_spectrum_lin.shape)
        spectra_lin[indx,:,:]=spectrum_lin[:,:]
    
    print("Processing text..")
    vocabulary = generate_vocabulary(texts_list)
    vocab_size=len(vocabulary)
    char2index,index2char = generate_chars2numbers_mappings(vocabulary)

    print("Converting text to integers..")
    texts_numerical = text2numbers(texts_list,char2index)
#   /D E L E T E M E/ 
    texts_numerical[4]=np.concatenate((texts_numerical[4],[8,9]))
#   /D E L E T E M E/

    print("Get the length of the longest sequence..")
    longest_sequence = (max(texts_numerical, key= lambda seq: len(seq)))
    longest_sequence_len=len(longest_sequence)
    print("Pad sequences..")
    
    def _padseq(seq,max_len):
        diff=max_len-len(seq)
        if diff>0: #SHITTY USELESS MXNET API. CANNOT CONCAT A NON-EMPTY WITH EMPTY ARRAY. EDIT: use numpy now. Still using this condition
            pad = np.zeros(diff)-1
            seq=np.append(seq,[pad])
        return seq
    
    padded_sequences = mx.nd.array(
        list(
            map(
                lambda seq: _padseq(seq,longest_sequence_len), texts_numerical
            )
        )
    )
        
    
    texts_one_hot=mx.ndarray.one_hot(padded_sequences,vocab_size)
    train_data = mx.ndarray.take(texts_one_hot,mx.nd.array([0,1]))
    train_label = []
    eval_data = []
    eval_label = []
 
    
    # get 10% of dataset as eval data
    random_eval_indxs = (np.random.randint(0, high=len(sound_labels), size=len(sound_labels)//10))
    all_indxs = np.arange(50)
    diff = np.setdiff1d(all_indxs,random_eval_indxs)
    print("diff",diff)
    
    random_train_indxs = len(sound_labels)-(np.random.randint(0,high=len(sound_labels),size=len(sound_labels)//10))
    print("random_eval_indxs",random_eval_indxs)
    print("random_train_indxs",random_train_indxs)
    
#     for i in range(size//10):
#         if i%10 == 0:
#             print("Processed %d samples" % i)
#         try:
#             if i in random_samples:
#                 eval_data=mx.ndarray.concat(eval_data,texts_one_hot[i])
#                 eval_label.append(spectra_lin[i])
#             else:
#                 train_data.append(texts_one_hot[i])
#                 train_label.append(spectra_lin[i])
                
#         except IndexError as e:
#             print(e)
#             print("i=%s, thing to add: %s %s"%(i,texts_one_hot[i],sound_labels[i]))

    print("Split train and eval data")
    print(eval_data)

    train_data = mx.nd.array(train_data)
    train_label = mx.nd.array(train_label)
    eval_data = mx.nd.array(eval_data)
    eval_label = mx.nd.array(eval_label)
    

    try:
        print("Populating traindata iterator")
        traindata_iterator = mx.io.NDArrayIter(data={'mel_spectrogram':train_data},
                                label={'linear_spectrogram':train_label},
                                batch_size=batch_size)
        print("Populating evaldata iterator")
        evaldata_iterator = mx.io.NDArrayIter(data={'mel_spectrogram':eval_data},
                                label={'linear_spectrogram':eval_label},
                                batch_size=batch_size)
    except Exception as e:
        print(e)
        traceback.print_exc()
    
    
    for batch in traindata_iterator:
        print(batch.data[0].asnumpy())
        print(batch.data[0].shape)

    #return traindata_iterator, evaldata_iterator


texts_list, sound_files_list = open_data('train_data/dataset.csv')
size=len(sound_files_list)

longest_word = 0
for text in texts_list:
    longest_word = max(longest_word,len(text))
generate_train_eval_data(texts_list, sound_files_list)

# vocabulary = generate_vocabulary(texts_list)
# vocab_size=len(vocabulary)
# char2index,index2char = generate_chars2numbers_mappings(vocabulary)

Generating spectrograms
max audio sample length: 17567
Padding and compute spectrum
Processing text..
Converting text to integers..
Get the length of the longest sequence..
Pad sequences..
diff [ 0  1  2  3  4  5  6  7  8  9 10 12 13 14 15 16 17 18 19 20 21 22 23 25 26
 27 28 29 30 31 32 34 35 36 37 38 39 40 41 42 43 44 45 46 47 49]
random_eval_indxs [11 33 48 24 11]
random_train_indxs [39 49 41  7 20]
Split train and eval data
[]
Populating traindata iterator
Populating evaldata iterator
batch_size needs to be smaller than data size.


Traceback (most recent call last):
  File "<ipython-input-97-c687b7873cb6>", line 148, in generate_train_eval_data
    batch_size=batch_size)
  File "/home/stefano/anaconda3/envs/mxnet/lib/python3.6/site-packages/mxnet/io.py", line 648, in __init__
    "batch_size needs to be smaller than data size."
AssertionError: batch_size needs to be smaller than data size.


MXNetError: [00:51:52] src/ndarray/ndarray.cc:102: Check failed: shape_[0] >= end (0 vs. 1) Slice end index out of range

Stack trace returned 10 entries:
[bt] (0) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x17dbbc) [0x7fda6f5cfbbc]
[bt] (1) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2174e77) [0x7fda715c6e77]
[bt] (2) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x217ea95) [0x7fda715d0a95]
[bt] (3) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/site-packages/mxnet/libmxnet.so(MXNDArraySlice+0x48) [0x7fda713d0918]
[bt] (4) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7fda95694ec0]
[bt] (5) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call+0x22d) [0x7fda9569487d]
[bt] (6) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x2ce) [0x7fda958a982e]
[bt] (7) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(+0x12265) [0x7fda958aa265]
[bt] (8) /home/stefano/anaconda3/envs/mxnet/bin/python(_PyObject_FastCallDict+0x8b) [0x7fda9c89a54b]
[bt] (9) /home/stefano/anaconda3/envs/mxnet/bin/python(+0x19f00e) [0x7fda9c92d00e]


In [30]:
"""
FC-256-ReLU → Dropout(0.5) → FC-128-ReLU → Dropout(0.5)
"""
def prenet_pass(data):
    fc1 = mx.symbol.FullyConnected(data=data, num_hidden=emb_size, name='prenet_fc1')
    act1 = mx.symbol.Activation(data=fc1, act_type='relu', name='prenet_act1')
    drop1 = mx.symbol.Dropout(act1, p=0.5, name='prenet_drop1')
    
    fc2 = mx.symbol.FullyConnected(data=drop1, num_hidden=emb_size//2, name='prenet_fc2')
    act2 = mx.symbol.Activation(data=fc2, act_type='relu', name='prenet_act2')
    prenet_output = mx.symbol.Dropout(act2, p=0.5, name='prenet_drop2')
    
    return prenet_output

In [31]:
# banco di filtri convolutivi. Vengono creati K filtri con kernel 1D di dimensione:k 
def conv1dBank(conv_input, K):
    conv=mx.sym.Convolution(data=conv_input, kernel=(1,1), num_filter=emb_size//2)
    (conv, mean, var) = mx.sym.BatchNorm(data=conv, output_mean_var=True)
    conv = mx.sym.Activation(data=conv, act_type='relu')
    for k in range(2, K+1):
        convi = mx.sym.Convolution(data=conv_input, kernel=(k,1), num_filter=emb_size//2)
        (convi, mean, var) = mx.sym.BatchNorm(data=convi, output_mean_var=True)
        convi = mx.sym.Activation(data=convi, act_type='relu')
        conv = mx.symbol.concat(conv,convi)
    return conv

In [32]:
# highway
def highway_layer(data):
    H= mx.symbol.Activation(
        data=mx.symbol.FullyConnected(data=data, num_hidden=emb_size//2, name="highway_fcH"),
        act_type="relu"
    )
    T= mx.symbol.Activation(
        data=mx.symbol.FullyConnected(data=data, num_hidden=emb_size//2, bias=mx.sym.Variable('bias'), name="highway_fcT"),
        act_type="sigmoid"
    )
    return  H * T + data * (1.0 - T)


In [33]:
# CBHG
def CBHG(data,K,proj1_size,proj2_size):
    #se si usa infer_shape su convbank dando la dimensione dell'input, viene dedotta la shape appunto 
    bank = conv1dBank(data,K)
    poold_bank = mx.sym.Pooling(data=bank, pool_type='max', kernel=(2, 1), stride=(1,1), name="CBHG_pool")

    proj1 = mx.sym.Convolution(data=poold_bank, kernel=(3,1), num_filter=proj1_size, name='CBHG_conv1')
    (proj1, proj1_mean, proj1_var) = mx.sym.BatchNorm(data=proj1, output_mean_var=True, name='CBHG_batch1')
    proj1 = mx.sym.Activation(data=proj1, act_type='relu', name='CBHG_act1')

    proj2 = mx.sym.Convolution(proj1, kernel=(3,1), num_filter=proj2_size, name='CBHG_conv2')
    (proj2, proj2_mean, proj2_var) = mx.sym.BatchNorm(data=proj2, output_mean_var=True, name='CBHG_batch2')
    
    residual= proj2 + data

    for i in range(4):
        residual = highway_layer(residual)
    highway_pass = residual
   
    bidirectional_gru_cell = mx.rnn.BidirectionalCell(
        mx.rnn.GRUCell(num_hidden=emb_size//2, prefix='CBHG_gru1'),
        mx.rnn.GRUCell(num_hidden=emb_size//2, prefix='CBHG_gru2'),
        output_prefix='CBHG_bi_'
    )
    outputs, states = bidirectional_gru_cell.unroll(1, inputs=highway_pass, merge_outputs=True)
    return outputs

In [34]:
# encoder
def encoder(data):
    embed_vector = mx.sym.Embedding(data=data, input_dim=longest_word, output_dim=emb_size, name='encoder_embed')
    prenet_output = prenet_pass(embed_vector)
    return CBHG(prenet_output,16, emb_size//2, emb_size//2)

In [35]:
text = mx.sym.Variable('text')

encoded = encoder(text)
graph=mx.viz.plot_network(
    encoded,
    save_format='pdf',
    title='encoder')
#graph.render()

In [36]:
# decoder
def decoder(input_spectrogram,context,reduction_factor):
    #embed_vector = mx.sym.Embedding(data=input_spectrogram, input_dim=80, output_dim=emb_size, name='decoder_embed')
    prenet_output = prenet_pass(input_spectrogram)
        
    stack = mx.rnn.SequentialRNNCell()
    stack.add(mx.rnn.GRUCell(num_hidden=emb_size,prefix='decoder_layer1_'))
    stack.add(mx.rnn.GRUCell(num_hidden=emb_size,prefix='decoder_layer2_'))
    
    residual_gru_stack = mx.rnn.ResidualCell(stack)
    
    gru_outputs,states = residual_gru_stack.unroll(length=1,
                                               inputs=prenet_output,
                                               begin_state=context,
                                               merge_outputs=True)

    predicted_frames = mx.symbol.Activation(
        data=mx.symbol.FullyConnected(data=gru_outputs, num_hidden=80*reduction_factor),
        act_type="relu"
    )
    
    return predicted_frames, states

In [41]:
def postprocess(input_mel_spectgrograms):
    linear_scale_spectrograms=CBHG(input_mel_spectgrograms,8,emb_size,80)
    return linear_scale_spectrograms

In [51]:
linear_spectrogram = mx.sym.Variable('linear_spectrogram')

spectrograms_count=5 #dummy value
decoder_state=[encoded,encoded]
predicted_frames=mx.sym.zeros((1,80))
full_frame=mx.sym.zeros((1,80))
mel_spectrogram = mx.sym.Variable('mel_spectrogram')

In [56]:
net = mx.sym.MAERegressionOutput(data=postprocess(mel_spectrogram), label=linear_spectrogram)

In [57]:
model = mx.mod.Module(symbol=net,
                      context=ctx,
                      data_names=['mel_spectrogram'],
                      label_names=['linear_spectrogram']
                     )

In [60]:
model.fit(traindata_iterator,
          eval_data=evaldata_iterator,
          optimizer=mx.optimizer.Adam,
          optimizer_params={'learning_rate': 0.1, 'momentum': 0.9},
          eval_metric='acc',
          num_epoch=8)

ValueError: Data provided by data_shapes don't match names specified by data_names ([DataDesc[_0_data,(2, 4),<class 'numpy.float32'>,NCHW], DataDesc[_1_data,(2, 4),<class 'numpy.float32'>,NCHW], DataDesc[_2_data,(2, 4),<class 'numpy.float32'>,NCHW], DataDesc[_3_data,(2, 4),<class 'numpy.float32'>,NCHW], DataDesc[_4_data,(2, 4),<class 'numpy.float32'>,NCHW], DataDesc[_5_data,(2, 4),<class 'numpy.float32'>,NCHW], DataDesc[_6_data,(2, 4),<class 'numpy.float32'>,NCHW], DataDesc[_7_data,(2, 4),<class 'numpy.float32'>,NCHW], DataDesc[_8_data,(2, 4),<class 'numpy.float32'>,NCHW], DataDesc[_9_data,(2, 4),<class 'numpy.float32'>,NCHW]] vs. ['mel_spectrogram'])

In [37]:
for i in range(spectrograms_count):
    predicted_frames,decoder_state = decoder(predicted_frames,decoder_state,reduction_factor)
    full_frame=mx.sym.concat(full_frame,predicted_frames)

spectral_magnitude=CBHG(full_frame, 8, emb_size, 80)

graph=mx.viz.plot_network(
    spectral_magnitude,
    save_format='pdf',
    title='decoder')
#graph.render()