<h1>TACOTRON</h1>

In [2]:
from __future__ import print_function
import mxnet as mx
import numpy as np
from mxnet import nd, autograd
from IPython.display import clear_output
ctx= mx.cpu()
import csv
import codecs
import re
import audio_process
import traceback
import subprocess
import math
from params import Hyperparams as hp 


<h3> DATA SETUP </h3>

In [3]:
num_hidden = hp.embed_size
reduction_factor=hp.r
emb_size=hp.embed_size
batch_size=2

In [4]:
def generate_vocabulary(texts_list):    
    # get unique chars and put into a list
    return list(set(''.join(texts_list)))
    

def generate_chars2numbers_mappings(vocabulary):
    # create a chars <-> numbers mappings
    char2index = {char:i for i,char in enumerate(vocabulary)}
    index2char = {i:char for i,char in enumerate(vocabulary)}
    
    return char2index,index2char


def text2numbers(texts_list,char2index_mapping):
    numerical_texts=[]
    for text in texts_list:
        numerical_texts.append([char2index_mapping[char] for char in text])
    return numerical_texts

def open_data(input_file_path):
      
    texts, sound_files = [], []
    
    reader = csv.reader(codecs.open(input_file_path, 'rb', 'utf-8'))
    for row in reader:
        sound_filename, text = row
        sound_file = hp.sound_fpath +"/"+ sound_filename + ".wav"
        text = re.sub(r"[^ a-z']", "", text.strip().lower())
         
        texts.append(text)
        sound_files.append(sound_file)
             
    return texts, sound_files
# Returns: one-hot-encoded-text, linear spectrum, mel spectrum
# Shapes: (data_length, ?, ?) , (data_length, (n_fft/2)+1, ceil(max_audio_length/hop_size)), (data_length, n_mels, ceil(max_audio_length/hop_size))
def generate_text_spectra(texts_list, sound_labels):
    
    assert len(sound_labels) == len(texts_list)
    
    print("Generating spectrograms")
    
    #tuples of wav and sr of that wav. wav is a 1D floats vector
    wavs_srs = [audio_process.load_wave(sound_clip) for sound_clip in sound_labels]
    longest_wav_sr = (max(wavs_srs, key= lambda wav: len(wav[0])))
    #save the longest audio file length
    max_samples_length=(len(longest_wav_sr[0]))
    print("max audio sample length:",max_samples_length)

    #prepare the data structure for save all the spectra
    spectra_lin = mx.ndarray.zeros((len(sound_labels),1+(hp.n_fft//2),math.ceil(max_samples_length/hp.hop_length)))
    spectra_mel = mx.ndarray.zeros((len(sound_labels),hp.n_mels,math.ceil(max_samples_length/hp.hop_length)))
    print("Padding audio and compute mel and lin spectra..")
    for indx,wav_sr in enumerate(wavs_srs):
        wav = wav_sr[0]
        wav_length = len(wav)
#         print("wav l",w_length)
        diff = max_samples_length-wav_length
#         print("num of zeros to add",diff)
        padded = np.append(wav,np.zeros(diff))
        # get the spectrum from the padded sound
        spectrum_lin, spectrum_mel=(audio_process.do_spectrogram(y=padded,sr=hp.sr))
#         print(padded_spectrum_lin.shape)
        # save into the ndarray
        spectra_lin[indx,:,:]=spectrum_lin[:,:]
        spectra_mel[indx,:,:]=spectrum_mel[:,:]
    
    
    print("Processing text..")
    vocabulary = generate_vocabulary(texts_list)
    vocab_size=len(vocabulary)
    char2index,index2char = generate_chars2numbers_mappings(vocabulary)

    print("Converting text to integers..")
    texts_numerical = text2numbers(texts_list,char2index)
    # simulate a different sequence length
#   /D E L E T E M E/ 
    texts_numerical[4]=np.concatenate((texts_numerical[4],[8,9]))
#   /D E L E T E M E/

    longest_sequence = (max(texts_numerical, key= lambda seq: len(seq)))
    longest_sequence_len=len(longest_sequence)
    print("Pad sequences to",longest_sequence_len,"..")
    # helper function for the lambda expression
    def _padseq(seq,max_len):
        diff=max_len-len(seq)
        if diff>0: 
            # SHITTY USELESS MXNET API. CANNOT CONCAT A NON-EMPTY WITH EMPTY ARRAY. 
            # EDIT: use numpy now. Still using this condition for safety
            pad = np.zeros(diff)-1
            seq=np.append(seq,[pad])
        return seq
    
    padded_sequences = mx.nd.array(
        list(
            map(
                lambda seq: _padseq(seq,longest_sequence_len), texts_numerical
            )
        )
    )   
    
    texts_one_hot=mx.ndarray.one_hot(padded_sequences,vocab_size)
    
    return texts_one_hot, spectra_lin, spectra_mel

In [5]:
def get_iterators(data='train_data/dataset.csv'):
    texts_list, sound_files_list = open_data(data)
    size=len(sound_files_list)

    texts_one_hot, spectra_lin, spectra_mel = generate_text_spectra(texts_list, sound_files_list)

    # get 10% of dataset as eval data 
    eval_indxs = (np.random.randint(0, high=size, size=size//10))
    # remaining indexes for the train
    train_indxs = np.setdiff1d(np.arange(size),eval_indxs)

    print("I will take those for eval:",eval_indxs)
    print("..and the remaining for train:",train_indxs,"\n")

    #take from the array (1st arg) the indexes of the first dimension specified by the 2nd arg
    #train_txt take the one_hot matrices
    train_txt_data = mx.ndarray.take(texts_one_hot,mx.nd.array(train_indxs))
    eval_txt_data = mx.ndarray.take(texts_one_hot,mx.nd.array(eval_indxs))

    train_data = mx.ndarray.take(spectra_mel,mx.nd.array(train_indxs))
    train_label = mx.ndarray.take(spectra_lin,mx.nd.array(train_indxs))

    eval_data = mx.ndarray.take(spectra_mel,mx.nd.array(eval_indxs))
    eval_label = mx.ndarray.take(spectra_lin,mx.nd.array(eval_indxs))

    print("train data shape:",train_data.shape,"train label shape:",train_label.shape)
    print("eval data shape:", eval_data.shape,"eval label shape:",eval_label.shape,"\n")


    try:
        print("Populating traindata iterator")
        traindata_iterator = mx.io.NDArrayIter(data={'mel_spectrogram':train_data},
                                label={'linear_spectrogram':train_label},
                                batch_size=batch_size,
                                shuffle=True)
        print("Populating evaldata iterator")
        evaldata_iterator = mx.io.NDArrayIter(data={'mel_spectrogram':eval_data},
                                label={'linear_spectrogram':eval_label},
                                batch_size=batch_size)
    except Exception as e:
        print(e)
        traceback.print_exc()


#     for batch in traindata_iterator:
#         print(batch.data[0].asnumpy())
#         print(batch.data[0].shape)
    
    return traindata_iterator,evaldata_iterator


b=get_iterators(hp.text_file)[0].next()
print("Batch size=",batch_size,"due few data in dummy train set")
print(b)

Generating spectrograms
max audio sample length: 17567
Padding audio and compute mel and lin spectra..
Processing text..
Converting text to integers..
Pad sequences to 7 ..
I will take those for eval: [26 25 10  1  4]
..and the remaining for train: [ 0  2  3  5  6  7  8  9 11 12 13 14 15 16 17 18 19 20 21 22 23 24 27 28 29
 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49] 

train data shape: (45, 80, 176) train label shape: (45, 1025, 176)
eval data shape: (5, 80, 176) eval label shape: (5, 1025, 176) 

Populating traindata iterator
Populating evaldata iterator
Batch size= 2 due few data in dummy train set
DataBatch: data shapes: [(2, 80, 176)] label shapes: [(2, 1025, 176)]


<h3> Modules </h3>

<h4> Prenet </h4>

In [27]:
"""
FC-256-ReLU → Dropout(0.5) → FC-128-ReLU → Dropout(0.5)
"""
def prenet_pass(data):
    fc1 = mx.symbol.FullyConnected(data=data, num_hidden=emb_size, name='prenet_fc1',flatten=False)
    act1 = mx.symbol.Activation(data=fc1, act_type='relu', name='prenet_act1')
    drop1 = mx.symbol.Dropout(act1, p=0.5, name='prenet_drop1')
    
    fc2 = mx.symbol.FullyConnected(data=drop1, num_hidden=emb_size//2, name='prenet_fc2', flatten=False)
    act2 = mx.symbol.Activation(data=fc2, act_type='relu', name='prenet_act2')
    prenet_output = mx.symbol.Dropout(act2, p=0.5, name='prenet_drop2')
    
    return prenet_output

<h4> Convolution 1D Bank </h4>

In [24]:
# Convolution bank of K filter
def conv1dBank(conv_input, K):
    conv=mx.sym.Convolution(data=conv_input, kernel=(1,), num_filter=emb_size//2) #
    (conv, mean, var) = mx.sym.BatchNorm(data=conv, output_mean_var=True)
    conv = mx.sym.Activation(data=conv, act_type='relu')
    for k in range(2, K+1):
        convi = mx.sym.Convolution(data=conv_input, kernel=(k,), num_filter=emb_size//2)
        (convi, mean, var) = mx.sym.BatchNorm(data=convi, output_mean_var=True)
        convi = mx.sym.Activation(data=convi, act_type='relu')
        conv = mx.symbol.concat(conv,convi,dim=2)
    return conv

<h4> Highway </h4>

In [17]:
# highway
def highway_layer(data):
    H= mx.symbol.Activation(
        data=mx.symbol.FullyConnected(data=data, num_hidden=emb_size//2, name="highway_fcH"),
        act_type="relu"
    )
    T= mx.symbol.Activation(
        data=mx.symbol.FullyConnected(data=data, num_hidden=emb_size//2, bias=mx.sym.Variable('bias'), name="highway_fcT"),
        act_type="sigmoid"
    )
    return  H * T + data * (1.0 - T)


<h4> CBHG </h4>

In [18]:
# CBHG
def CBHG(data,K,proj1_size,proj2_size):
    bank = conv1dBank(data,K)
#     bank=mx.sym.Convolution(data=conv_input, kernel=(1,), num_filter=emb_size//2)
#     for k in range (2,K):
#         bank=mx.sym.Convolution(data=bank, kernel=(k,), num_filter=k*(emb_size//2))
    
    poold_bank = mx.sym.Pooling(data=bank, pool_type='max', kernel=(2,), stride=(1,), name="CBHG_pool")

    proj1 = mx.sym.Convolution(data=poold_bank, kernel=(3,), num_filter=proj1_size, name='CBHG_conv1')
    (proj1, proj1_mean, proj1_var) = mx.sym.BatchNorm(data=proj1, output_mean_var=True, name='CBHG_batch1')
    proj1 = mx.sym.Activation(data=proj1, act_type='relu', name='CBHG_act1')

    proj2 = mx.sym.Convolution(proj1, kernel=(3,), num_filter=proj2_size, name='CBHG_conv2')
    (proj2, proj2_mean, proj2_var) = mx.sym.BatchNorm(data=proj2, output_mean_var=True, name='CBHG_batch2')
    
    residual= proj2 + data #How can I do the residual sum with different shapes? 

    for i in range(4):
        residual = highway_layer(residual)
    highway_pass = residual
   
    bidirectional_gru_cell = mx.rnn.BidirectionalCell(
        mx.rnn.GRUCell(num_hidden=emb_size//2, prefix='CBHG_gru1'),
        mx.rnn.GRUCell(num_hidden=emb_size//2, prefix='CBHG_gru2'),
        output_prefix='CBHG_bi_'
    )
    outputs, states = bidirectional_gru_cell.unroll(1, inputs=highway_pass, merge_outputs=True)
    return residual


<h4> Encoder </h4>

In [19]:
# encoder
def encoder(data):
    embed_vector = mx.sym.Embedding(data=data, input_dim=longest_word, output_dim=emb_size, name='encoder_embed')
    prenet_output = prenet_pass(embed_vector)
    return CBHG(prenet_output,16, emb_size//2, emb_size//2)

<h4> Decoder (stub)</h4>

In [20]:
# decoder
def decoder(input_spectrogram,context,reduction_factor):
    #embed_vector = mx.sym.Embedding(data=input_spectrogram, input_dim=80, output_dim=emb_size, name='decoder_embed')
    prenet_output = prenet_pass(input_spectrogram)
        
    stack = mx.rnn.SequentialRNNCell()
    stack.add(mx.rnn.GRUCell(num_hidden=emb_size,prefix='decoder_layer1_'))
    stack.add(mx.rnn.GRUCell(num_hidden=emb_size,prefix='decoder_layer2_'))
    
    residual_gru_stack = mx.rnn.ResidualCell(stack)
    
    gru_outputs,states = residual_gru_stack.unroll(length=1,
                                               inputs=prenet_output,
                                               begin_state=context,
                                               merge_outputs=True)

    predicted_frames = mx.symbol.Activation(
        data=mx.symbol.FullyConnected(data=gru_outputs, num_hidden=80*reduction_factor),
        act_type="relu"
    )
    
    return predicted_frames, states

<h1> Test section </h1>

<h2> inference tests on CONV1BANK </h2>

In [134]:

conv1=mx.sym.Convolution(data=conv_input, kernel=(1,), num_filter=emb_size//2)
conv2=mx.sym.Convolution(data=conv_input, kernel=(2,), pad=(3,), num_filter=emb_size//2)
conv3=mx.sym.Convolution(data=conv_input, kernel=(3,) , num_filter=emb_size//2)
conv4=mx.sym.Convolution(data=conv_input, kernel=(4,) , num_filter=emb_size//2)
conv5=mx.sym.Convolution(data=conv_input, kernel=(5,) , num_filter=emb_size//2)
conv6=mx.sym.Convolution(data=conv_input, kernel=(6,) , num_filter=emb_size//2)
conv6=mx.sym.Convolution(data=conv_input, kernel=(7,) , num_filter=emb_size//2)
conv6=mx.sym.Convolution(data=conv_input, kernel=(8,) , num_filter=emb_size//2)


print(conv1.infer_shape(mel_spectrogram=mel_spectrogram_shape)[1])
print(conv2.infer_shape(mel_spectrogram=mel_spectrogram_shape)[1])
print(conv3.infer_shape(mel_spectrogram=mel_spectrogram_shape)[1])
print(conv4.infer_shape(mel_spectrogram=mel_spectrogram_shape)[1])
print(conv5.infer_shape(mel_spectrogram=mel_spectrogram_shape)[1])
print(conv6.infer_shape(mel_spectrogram=mel_spectrogram_shape)[1])


TypeError: __init__() missing 6 required positional arguments: 'data_names', 'data_shapes', 'data_gen', 'label_names', 'label_shapes', and 'label_gen'

In [99]:
# Convolution bank of K filter
def conv1dBank_1(conv_input, K):
    conv=mx.sym.Convolution(data=conv_input, kernel=(3,), num_filter=emb_size//2)
    #conv=mx.sym.swapaxes(conv,1,2)
    (conv, mean, var) = mx.sym.BatchNorm(data=conv, output_mean_var=True)
    conv = mx.sym.Activation(data=conv, act_type='relu')
    
    #conv2 = mx.sym.Convolution(data=conv_input, kernel=(2,), num_filter=emb_size//2)
    #conv = mx.symbol.concat(conv,conv2,dim=2)
    #conv3 = mx.sym.Convolution(data=conv_input, kernel=(3,), num_filter=emb_size//2)
#     for k in range(2, K+1):
#         convi = mx.sym.Convolution(data=conv_input, kernel=(k,), num_filter=emb_size//2)
#         convi=mx.sym.swapaxes(convi,1,2)
#         (convi, mean, var) = mx.sym.BatchNorm(data=convi, output_mean_var=True)
#         convi = mx.sym.Activation(data=convi, act_type='relu')
#         conv = mx.symbol.concat(conv,convi,dim=2)
    return conv
prenet_out = prenet_pass(mel_spectrogram)
c1b_args_shape, c1b_out_shape, c1b_aux_shape = conv1dBank_1(prenet_out,8).infer_shape(mel_spectrogram=mel_spectrogram_shape)
print("input shape for prenet :",mel_spectrogram_shape)
print("input shape for bank :",prenet_out.infer_shape(mel_spectrogram=mel_spectrogram_shape)[1])
print("conv1bank output shape:",c1b_out_shape) #[b,num_fil,out_width]
#1: 2,128,100|
#2: 2,128,99 |concat: 2,128,199|
#3: 2,128,98                   |concat:2,128,297
#4: 2,128,97...
#shape = {"in_conv1bnk" : in_conv1bnk_shape}
#mx.viz.plot_network(symbol=conv1dBank_1(in_conv1bnk,8), shape=shape)

input shape for prenet : (2, 100, 80)
input shape for bank : [(2, 100, 128)]
conv1bank output shape: [(2, 128, 126)]


<h2> inference tests on CBHG </h2>

<h2> inference tests on CBHG step by step until residual add</h2>

In [79]:
K=8
# conv=mx.sym.Convolution(data=in_cbhg, kernel=(1,), num_filter=emb_size//2)
# (conv, mean, var) = mx.sym.BatchNorm(data=conv, output_mean_var=True)
# conv = mx.sym.Activation(data=conv, act_type='relu')
# for k in range(2, K+1):
#     convi = mx.sym.Convolution(data=in_cbhg, kernel=(k,), num_filter=emb_size//2)
#     (convi, mean, var) = mx.sym.BatchNorm(data=convi, output_mean_var=True)
#     convi = mx.sym.Activation(data=convi, act_type='relu')
#     conv = mx.symbol.concat(conv,convi,dim=2)
in_cbhg = prenet_pass(mel_spectrogram)
bank=mx.sym.Convolution(data=in_cbhg, kernel=(1,), num_filter=emb_size//2)
for k in range (2,K+1):
    bank=mx.sym.Convolution(data=bank, kernel=(k,), num_filter=k*(emb_size//2))
    

poold_bank = mx.sym.Pooling(data=bank, pool_type='max', kernel=(2,), stride=(1,), name="CBHG_pool")

proj1 = mx.sym.Convolution(data=poold_bank, kernel=(3,), num_filter=256, name='CBHG_conv1')
(proj1, proj1_mean, proj1_var) = mx.sym.BatchNorm(data=proj1, output_mean_var=True, name='CBHG_batch1')
proj1 = mx.sym.Activation(data=proj1, act_type='relu', name='CBHG_act1')

proj2 = mx.sym.Convolution(proj1, kernel=(3,), num_filter=128, name='CBHG_conv2')
(proj2, proj2_mean, proj2_var) = mx.sym.BatchNorm(data=proj2, output_mean_var=True, name='CBHG_batch2')

residual= proj2 + mx.sym.swapaxes(in_cbhg,1,2)

print("conv:")
c_arg_shapes, c_out_shapes, c_aux_shapes=bank.infer_shape(mel_spectrogram=mel_spectrogram_shape)
#print("c_arg_shapes",c_arg_shapes)
print("c_out_shapes",c_out_shapes)
#print("c_aux_shapes",c_aux_shapes)

print("pool:")
pool_arg_shapes, pool_out_shapes, pool_aux_shapes=poold_bank.infer_shape(mel_spectrogram=mel_spectrogram_shape)
#print("pool_arg_shapes",pool_arg_shapes)
print("pool_out_shapes",pool_out_shapes)
#print("pool_aux_shapes",pool_aux_shapes)

print("proj1")
proj1_arg_shapes, proj1_out_shapes, proj1_aux_shapes = proj1.infer_shape(mel_spectrogram=mel_spectrogram_shape)
#print("proj1_arg_shapes",proj1_arg_shapes)
print("proj1_out_shapes",proj1_out_shapes)
#print("proj1_aux_shapes",proj1_aux_shapes)

print("proj2")
proj2_arg_shapes, proj2_out_shapes, proj2_aux_shapes =proj2.infer_shape(mel_spectrogram=mel_spectrogram_shape)
#print("proj2_arg_shapes",proj2_arg_shapes)
print("proj2_out_shapes",proj2_out_shapes)
#print("proj2_aux_shapes",proj2_aux_shapes)

print("residual")
residual_arg_shapes, residual_out_shapes, residual_aux_shapes = residual.infer_shape(mel_spectrogram=mel_spectrogram_shape)
print("residual_arg_shapes",residual_arg_shapes,"residual_out_shapes",residual_out_shapes,"residual_aux_shapes",residual_aux_shapes)

conv:
c_out_shapes [(2, 1024, 100)]
pool:
pool_out_shapes [(2, 1024, 99)]
proj1
proj1_out_shapes [(2, 256, 97)]
proj2
proj2_out_shapes [(2, 128, 95)]
residual
infer_shape error. Arguments:
  mel_spectrogram: (2, 100, 80)


MXNetError: Error in operator _plus10: [12:04:36] src/operator/nn/./../tensor/../elemwise_op_common.h:122: Check failed: assign(&dattr, (*vec)[i]) Incompatible attr in node _plus10 at 1-th input: expected (2,128,95), got (2,128,100)

Stack trace returned 10 entries:
[bt] (0) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x17dbbc) [0x7f6abaa69bbc]
[bt] (1) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x293a73) [0x7f6abab7fa73]
[bt] (2) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x294ab4) [0x7f6abab80ab4]
[bt] (3) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x20180ee) [0x7f6abc9040ee]
[bt] (4) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x201ac50) [0x7f6abc906c50]
[bt] (5) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/site-packages/mxnet/libmxnet.so(MXSymbolInferShape+0x1539) [0x7f6abc895289]
[bt] (6) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7f6adfd45ec0]
[bt] (7) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call+0x22d) [0x7f6adfd4587d]
[bt] (8) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x2ce) [0x7f6adff5a82e]
[bt] (9) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(+0x12265) [0x7f6adff5b265]


In [19]:
def postprocess(input_mel_spectgrograms):
    linear_scale_spectrograms=CBHG(input_mel_spectgrograms,8,hp.embed_size,hp.n_mels)
    return linear_scale_spectrograms

In [20]:
traindata_iterator, evaldata_iterator = get_iterators()
linear_spectrogram = mx.sym.Variable('linear_spectrogram')
mel_spectrogram = mx.sym.Variable('mel_spectrogram')

net = mx.sym.MAERegressionOutput(data=postprocess(mel_spectrogram), label=linear_spectrogram)
model = mx.mod.Module(symbol=net,
                      context=ctx,
                      data_names=['mel_spectrogram'],
                      label_names=['linear_spectrogram']
                     )

Generating spectrograms
max audio sample length: 17567
Padding audio and compute mel and lin spectra..
Processing text..
Converting text to integers..
Pad sequences to 7 ..
I will take those for eval: [ 2 35  9  2  4]
..and the remaining for train: [ 0  1  3  5  6  7  8 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
 28 29 30 31 32 33 34 36 37 38 39 40 41 42 43 44 45 46 47 48 49] 

train data shape: (46, 80, 176) train label shape: (46, 1025, 176)
eval data shape: (5, 80, 176) eval label shape: (5, 1025, 176) 

Populating traindata iterator
Populating evaldata iterator


In [21]:

model.fit(traindata_iterator,
          eval_data=evaldata_iterator,
          optimizer=mx.optimizer.Adam,
          optimizer_params={'learning_rate': 0.1, 'momentum': 0.9},
          eval_metric='acc',
          num_epoch=8)

RuntimeError: simple_bind error. Arguments:
mel_spectrogram: (2, 80, 176)
linear_spectrogram: (2, 1025, 176)
Error in operator _plus11: [16:29:23] src/operator/nn/./../tensor/../elemwise_op_common.h:122: Check failed: assign(&dattr, (*vec)[i]) Incompatible attr in node _plus11 at 1-th input: expected (2,80,1375), got (2,80,176)

Stack trace returned 10 entries:
[bt] (0) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x17dbbc) [0x7f5f135cfbbc]
[bt] (1) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x293a73) [0x7f5f136e5a73]
[bt] (2) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x294ab4) [0x7f5f136e6ab4]
[bt] (3) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x20180ee) [0x7f5f1546a0ee]
[bt] (4) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x201ac2c) [0x7f5f1546cc2c]
[bt] (5) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x1ffe5c9) [0x7f5f154505c9]
[bt] (6) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x1fff084) [0x7f5f15451084]
[bt] (7) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/site-packages/mxnet/libmxnet.so(MXExecutorSimpleBind+0x2300) [0x7f5f153de6b0]
[bt] (8) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7f5f3958bec0]
[bt] (9) /home/stefano/anaconda3/envs/mxnet/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call+0x22d) [0x7f5f3958b87d]


In [37]:
predicted_frames=mx.sym.zeros((1,80))
full_frame=mx.sym.zeros((1,80))

spectrograms_count=5 #dummy value
for i in range(spectrograms_count):
    predicted_frames,decoder_state = decoder(predicted_frames,decoder_state,reduction_factor)
    full_frame=mx.sym.concat(full_frame,predicted_frames)

spectral_magnitude=CBHG(full_frame, 8, emb_size, 80)

graph=mx.viz.plot_network(
    spectral_magnitude,
    save_format='pdf',
    title='decoder')
#graph.render()