TACOTRON

In [6]:
from __future__ import print_function
import mxnet as mx
import numpy as np
from mxnet import nd, autograd
from IPython.display import clear_output
ctx= mx.cpu()
import csv
import codecs
import re
import audio_process
import traceback
import subprocess
import math
from params import Hyperparams as hp 


In [7]:

# command="ls train_data/*wav|while read i; do sox $i -n stat 2>&1|grep Leng|cut -d':'  -f 2|sed -r 's/\s//g'; done|awk 'BEGIN{max_len=0}{if($0>max_len)max_len=$0}END{print max_len}'"
# max_len=subprocess.Popen(command, shell=True, stdout=subprocess.PIPE).stdout.read().strip()
# max_audio_sec_len=math.ceil((float(max_len.decode("utf-8"))))

In [30]:
num_hidden = 256
reduction_factor=2
emb_size=256
batch_size=10


def generate_vocabulary(texts_list):    
    # get unique chars and put into a list
    return list(set(''.join(texts_list)))
    

def generate_chars2numbers_mappings(vocabulary):
    # create a chars <-> numbers mappings
    char2index = {char:i for i,char in enumerate(vocabulary)}
    index2char = {i:char for i,char in enumerate(vocabulary)}
    
    return char2index,index2char


def text2numbers(texts_list,char2index_mapping):
    numerical_texts=[]
    for text in texts_list:
        numerical_texts.append([char2index_mapping[char] for char in text])
    return numerical_texts

def open_data(input_file_path):
      
    texts, sound_files = [], []
    
    reader = csv.reader(codecs.open(input_file_path, 'rb', 'utf-8'))
    for row in reader:
        sound_filename, text = row
        sound_file = "train_data/" + sound_filename + ".wav"
        text = re.sub(r"[^ a-z']", "", text.strip().lower())
         
        texts.append(text)
        sound_files.append(sound_file)
             
    return texts, sound_files
# Returns: one-hot-encoded-text, linear spectrum, mel spectrum
# Shapes: (data_length, ?, ?), (data_length, (n_fft/2)+1, ceil(max_audio_length))
def generate_text_spectra(text_trainset, sound_labels):
    
    assert len(sound_labels) == len(text_trainset)
    
    print("Generating spectrograms")
    
    #tuples of wav and sr of that wav. wav is a 1D floats vector
    wavs_srs = [audio_process.load_wave(sound_clip) for sound_clip in sound_labels]
    longest_wav_sr = (max(wavs_srs, key= lambda wav: len(wav[0])))
    #save the longest audio file length
    max_samples_length=(len(longest_wav_sr[0]))
    print("max audio sample length:",max_samples_length)

    #prepare the data structure for save all the spectra
    spectra_lin = mx.ndarray.zeros((len(sound_labels),1+(hp.n_fft//2),math.ceil(max_samples_length/hp.hop_length)))
    spectra_mel = mx.ndarray.zeros((len(sound_labels),hp.n_mels,math.ceil(max_samples_length/hp.hop_length)))
    print("Padding audio and compute mel and lin spectra..")
    for indx,wav_sr in enumerate(wavs_srs):
        wav = wav_sr[0]
        wav_length = len(wav)
#         print("wav l",w_length)
        diff = max_samples_length-wav_length
#         print("num of zeros to add",diff)
        padded = np.append(wav,np.zeros(diff))
        # get the spectrum from the padded sound
        spectrum_lin, spectrum_mel=(audio_process.do_spectrogram(y=padded,sr=hp.sr))
#         print(padded_spectrum_lin.shape)
        # save into the ndarray
        spectra_lin[indx,:,:]=spectrum_lin[:,:]
        spectra_mel[indx,:,:]=spectrum_mel[:,:]
    
    
    print("Processing text..")
    vocabulary = generate_vocabulary(text_trainset)
    vocab_size=len(vocabulary)
    char2index,index2char = generate_chars2numbers_mappings(vocabulary)

    print("Converting text to integers..")
    texts_numerical = text2numbers(text_trainset,char2index)
    # simulate a different sequence length
#   /D E L E T E M E/ 
    texts_numerical[4]=np.concatenate((texts_numerical[4],[8,9]))
#   /D E L E T E M E/

    longest_sequence = (max(texts_numerical, key= lambda seq: len(seq)))
    longest_sequence_len=len(longest_sequence)
    print("Pad sequences to",longest_sequence_len,"..")
    # helper function for the lambda expression
    def _padseq(seq,max_len):
        diff=max_len-len(seq)
        if diff>0: 
            # SHITTY USELESS MXNET API. CANNOT CONCAT A NON-EMPTY WITH EMPTY ARRAY. 
            # EDIT: use numpy now. Still using this condition for safety
            pad = np.zeros(diff)-1
            seq=np.append(seq,[pad])
        return seq
    
    padded_sequences = mx.nd.array(
        list(
            map(
                lambda seq: _padseq(seq,longest_sequence_len), texts_numerical
            )
        )
    )   
    
    texts_one_hot=mx.ndarray.one_hot(padded_sequences,vocab_size)
    
    return texts_one_hot, spectra_lin, spectra_mel

In [31]:
def get_iterators(data='train_data/dataset.csv'):
    texts_list, sound_files_list = open_data(data)
    size=len(sound_files_list)

    texts_one_hot, spectra_lin, spectra_mel = generate_text_spectra(texts_list, sound_files_list)

    # get 10% of dataset as eval data 
    eval_indxs = list(set(np.random.randint(0, high=size, size=size//10)))
    # remaining indexes for the train
    train_indxs = np.setdiff1d(np.arange(size),eval_indxs)

    print("I will take those for eval:",eval_indxs)
    print("..and the remaining for train:",train_indxs,"\n")

    #take from the array (1st arg) the indexes of the first dimension specified by the 2nd arg
    #train_txt take the one_hot matrices
    train_txt_data = mx.ndarray.take(texts_one_hot,mx.nd.array(train_indxs))
    eval_txt_data = mx.ndarray.take(texts_one_hot,mx.nd.array(eval_indxs))

    train_data = mx.ndarray.take(spectra_mel,mx.nd.array(train_indxs))
    train_label = mx.ndarray.take(spectra_lin,mx.nd.array(train_indxs))

    eval_data = mx.ndarray.take(spectra_mel,mx.nd.array(eval_indxs))
    eval_label = mx.ndarray.take(spectra_lin,mx.nd.array(eval_indxs))

    print("train data shape:",train_data.shape,"train label shape:",train_label.shape)
    print("eval data shape:", eval_data.shape,"eval label shape:",eval_label.shape,"\n")


    try:
        print("Populating traindata iterator")
        traindata_iterator = mx.io.NDArrayIter(data={'mel_spectrogram':train_data},
                                label={'linear_spectrogram':train_label},
                                batch_size=batch_size,
                                shuffle=True)
        print("Populating evaldata iterator")
        evaldata_iterator = mx.io.NDArrayIter(data={'mel_spectrogram':eval_data},
                                label={'linear_spectrogram':eval_label},
                                batch_size=batch_size)
    except Exception as e:
        print(e)
        traceback.print_exc()


#     for batch in traindata_iterator:
#         print(batch.data[0].asnumpy())
#         print(batch.data[0].shape)
    
    return traindata_iterator,evaldata_iterator

#print(get_iterators())



In [12]:
"""
FC-256-ReLU → Dropout(0.5) → FC-128-ReLU → Dropout(0.5)
"""
def prenet_pass(data):
    fc1 = mx.symbol.FullyConnected(data=data, num_hidden=emb_size, name='prenet_fc1')
    act1 = mx.symbol.Activation(data=fc1, act_type='relu', name='prenet_act1')
    drop1 = mx.symbol.Dropout(act1, p=0.5, name='prenet_drop1')
    
    fc2 = mx.symbol.FullyConnected(data=drop1, num_hidden=emb_size//2, name='prenet_fc2')
    act2 = mx.symbol.Activation(data=fc2, act_type='relu', name='prenet_act2')
    prenet_output = mx.symbol.Dropout(act2, p=0.5, name='prenet_drop2')
    
    return prenet_output

In [13]:
# banco di filtri convolutivi. Vengono creati K filtri con kernel 1D di dimensione:k 
def conv1dBank(conv_input, K):
    conv=mx.sym.Convolution(data=conv_input, kernel=(1,1), num_filter=emb_size//2)
    (conv, mean, var) = mx.sym.BatchNorm(data=conv, output_mean_var=True)
    conv = mx.sym.Activation(data=conv, act_type='relu')
    for k in range(2, K+1):
        convi = mx.sym.Convolution(data=conv_input, kernel=(k,1), num_filter=emb_size//2)
        (convi, mean, var) = mx.sym.BatchNorm(data=convi, output_mean_var=True)
        convi = mx.sym.Activation(data=convi, act_type='relu')
        conv = mx.symbol.concat(conv,convi)
    return conv

In [14]:
# highway
def highway_layer(data):
    H= mx.symbol.Activation(
        data=mx.symbol.FullyConnected(data=data, num_hidden=emb_size//2, name="highway_fcH"),
        act_type="relu"
    )
    T= mx.symbol.Activation(
        data=mx.symbol.FullyConnected(data=data, num_hidden=emb_size//2, bias=mx.sym.Variable('bias'), name="highway_fcT"),
        act_type="sigmoid"
    )
    return  H * T + data * (1.0 - T)


In [15]:
# CBHG
def CBHG(data,K,proj1_size,proj2_size):
    #se si usa infer_shape su convbank dando la dimensione dell'input, viene dedotta la shape appunto 
    bank = conv1dBank(data,K)
    poold_bank = mx.sym.Pooling(data=bank, pool_type='max', kernel=(2, 1), stride=(1,1), name="CBHG_pool")

    proj1 = mx.sym.Convolution(data=poold_bank, kernel=(3,1), num_filter=proj1_size, name='CBHG_conv1')
    (proj1, proj1_mean, proj1_var) = mx.sym.BatchNorm(data=proj1, output_mean_var=True, name='CBHG_batch1')
    proj1 = mx.sym.Activation(data=proj1, act_type='relu', name='CBHG_act1')

    proj2 = mx.sym.Convolution(proj1, kernel=(3,1), num_filter=proj2_size, name='CBHG_conv2')
    (proj2, proj2_mean, proj2_var) = mx.sym.BatchNorm(data=proj2, output_mean_var=True, name='CBHG_batch2')
    
    residual= proj2 + data

    for i in range(4):
        residual = highway_layer(residual)
    highway_pass = residual
   
    bidirectional_gru_cell = mx.rnn.BidirectionalCell(
        mx.rnn.GRUCell(num_hidden=emb_size//2, prefix='CBHG_gru1'),
        mx.rnn.GRUCell(num_hidden=emb_size//2, prefix='CBHG_gru2'),
        output_prefix='CBHG_bi_'
    )
    outputs, states = bidirectional_gru_cell.unroll(1, inputs=highway_pass, merge_outputs=True)
    return outputs

In [16]:
# encoder
def encoder(data):
    embed_vector = mx.sym.Embedding(data=data, input_dim=longest_word, output_dim=emb_size, name='encoder_embed')
    prenet_output = prenet_pass(embed_vector)
    return CBHG(prenet_output,16, emb_size//2, emb_size//2)

In [19]:
#text = mx.sym.Variable('text')

#encoded = encoder(text)
#graph=mx.viz.plot_network(
#    encoded,
#    save_format='pdf',
#    title='encoder')
#graph.render()

In [20]:
# decoder
def decoder(input_spectrogram,context,reduction_factor):
    #embed_vector = mx.sym.Embedding(data=input_spectrogram, input_dim=80, output_dim=emb_size, name='decoder_embed')
    prenet_output = prenet_pass(input_spectrogram)
        
    stack = mx.rnn.SequentialRNNCell()
    stack.add(mx.rnn.GRUCell(num_hidden=emb_size,prefix='decoder_layer1_'))
    stack.add(mx.rnn.GRUCell(num_hidden=emb_size,prefix='decoder_layer2_'))
    
    residual_gru_stack = mx.rnn.ResidualCell(stack)
    
    gru_outputs,states = residual_gru_stack.unroll(length=1,
                                               inputs=prenet_output,
                                               begin_state=context,
                                               merge_outputs=True)

    predicted_frames = mx.symbol.Activation(
        data=mx.symbol.FullyConnected(data=gru_outputs, num_hidden=80*reduction_factor),
        act_type="relu"
    )
    
    return predicted_frames, states

In [21]:
def postprocess(input_mel_spectgrograms):
    linear_scale_spectrograms=CBHG(input_mel_spectgrograms,8,emb_size,80)
    return linear_scale_spectrograms

In [23]:
linear_spectrogram = mx.sym.Variable('linear_spectrogram')

#spectrograms_count=5 #dummy value
#decoder_state=[encoded,encoded]
#predicted_frames=mx.sym.zeros((1,80))
full_frame=mx.sym.zeros((1,80))
mel_spectrogram = mx.sym.Variable('mel_spectrogram')

In [24]:
net = mx.sym.MAERegressionOutput(data=postprocess(mel_spectrogram), label=linear_spectrogram)

In [None]:
model = mx.mod.Module(symbol=net,
                      context=ctx,
                      data_names=['mel_spectrogram'],
                      label_names=['linear_spectrogram']
                     )

In [32]:
traindata_iterator, evaldata_iterator = get_iterators()

Generating spectrograms
max audio sample length: 23999
Padding audio and compute mel and lin spectra..
Processing text..
Converting text to integers..
Pad sequences to 7 ..
I will take those for eval: [33, 130, 35, 36, 100, 198, 39, 40, 41, 11, 172, 75, 109, 145, 182, 86, 120, 124, 61]
..and the remaining for train: [  0   1   2   3   4   5   6   7   8   9  10  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  34  37  38  42
  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60
  62  63  64  65  66  67  68  69  70  71  72  73  74  76  77  78  79  80
  81  82  83  84  85  87  88  89  90  91  92  93  94  95  96  97  98  99
 101 102 103 104 105 106 107 108 110 111 112 113 114 115 116 117 118 119
 121 122 123 125 126 127 128 129 131 132 133 134 135 136 137 138 139 140
 141 142 143 144 146 147 148 149 150 151 152 153 154 155 156 157 158 159
 160 161 162 163 164 165 166 167 168 169 170 171 173 174 175 176 177 178
 179 180 181 183 184 185 

In [33]:
item = traindata_iterator.next()
print(item)
traindata_iterator.reset()

DataBatch: data shapes: [(10, 80, 240)] label shapes: [(10, 1025, 240)]


In [28]:
model.fit(traindata_iterator,
          eval_data=evaldata_iterator,
          optimizer=mx.optimizer.Adam,
          optimizer_params={'learning_rate': 0.1, 'momentum': 0.9},
          eval_metric='acc',
          num_epoch=8)

RuntimeError: simple_bind error. Arguments:
mel_spectrogram: (1, 80, 240)
linear_spectrogram: (1, 1025, 240)
Error in operator convolution0: [14:35:36] src/operator/./convolution-inl.h:491: Check failed: dshp.ndim() == 4U (3 vs. 4) Input data should be 4D in batch-num_filter-y-x

Stack trace returned 10 entries:
[bt] (0) /home/ai_ja_nai/pischool/mxnet/venv3/local/lib/python3.5/site-packages/mxnet/libmxnet.so(+0x17dbbc) [0x7f3d2d685bbc]
[bt] (1) /home/ai_ja_nai/pischool/mxnet/venv3/local/lib/python3.5/site-packages/mxnet/libmxnet.so(+0x2299e00) [0x7f3d2f7a1e00]
[bt] (2) /home/ai_ja_nai/pischool/mxnet/venv3/local/lib/python3.5/site-packages/mxnet/libmxnet.so(+0x21f67b7) [0x7f3d2f6fe7b7]
[bt] (3) /home/ai_ja_nai/pischool/mxnet/venv3/local/lib/python3.5/site-packages/mxnet/libmxnet.so(+0x20180ee) [0x7f3d2f5200ee]
[bt] (4) /home/ai_ja_nai/pischool/mxnet/venv3/local/lib/python3.5/site-packages/mxnet/libmxnet.so(+0x201abc1) [0x7f3d2f522bc1]
[bt] (5) /home/ai_ja_nai/pischool/mxnet/venv3/local/lib/python3.5/site-packages/mxnet/libmxnet.so(+0x1ffe5c9) [0x7f3d2f5065c9]
[bt] (6) /home/ai_ja_nai/pischool/mxnet/venv3/local/lib/python3.5/site-packages/mxnet/libmxnet.so(+0x1fff084) [0x7f3d2f507084]
[bt] (7) /home/ai_ja_nai/pischool/mxnet/venv3/local/lib/python3.5/site-packages/mxnet/libmxnet.so(MXExecutorSimpleBind+0x2300) [0x7f3d2f4946b0]
[bt] (8) /home/ai_ja_nai/pischool/mxnet/venv3/lib/python3.5/lib-dynload/_ctypes.cpython-35m-x86_64-linux-gnu.so(ffi_call_unix64+0x4c) [0x7f3d5c2d0e20]
[bt] (9) /home/ai_ja_nai/pischool/mxnet/venv3/lib/python3.5/lib-dynload/_ctypes.cpython-35m-x86_64-linux-gnu.so(ffi_call+0x2eb) [0x7f3d5c2d088b]


In [37]:
for i in range(spectrograms_count):
    predicted_frames,decoder_state = decoder(predicted_frames,decoder_state,reduction_factor)
    full_frame=mx.sym.concat(full_frame,predicted_frames)

spectral_magnitude=CBHG(full_frame, 8, emb_size, 80)

graph=mx.viz.plot_network(
    spectral_magnitude,
    save_format='pdf',
    title='decoder')
#graph.render()