TACOTRON

In [16]:
from __future__ import print_function
import mxnet as mx
import numpy as np
from mxnet import nd, autograd
from IPython.display import clear_output
# MXNet device context (CPU). Nothing in the visible cells references it yet.
ctx= mx.cpu()

In [6]:
def load_vocab():
    """Build the character <-> index lookup tables for the text vocabulary.

    The vocabulary is the fixed string ``"EG abcdefghijklmnopqrstuvwxyz'"``;
    each character's index is its position in that string. Per the original
    author's note, ``E`` stands for "empty" and ``G`` is to be ignored.

    Returns:
        A ``(char2idx, idx2char)`` tuple of dicts mapping character -> index
        and index -> character respectively.
    """
    symbols = "EG abcdefghijklmnopqrstuvwxyz'"
    idx2char = dict(enumerate(symbols))
    char2idx = {symbol: position for position, symbol in idx2char.items()}
    return char2idx, idx2char

In [7]:
"""
FC-256-ReLU → Dropout(0.5) → FC-128-ReLU → Dropout(0.5)
"""
def prenet_pass(data):
    """Build the two-layer pre-net: FC -> ReLU -> Dropout, applied twice.

    The first layer has ``emb_size`` hidden units and the second has
    ``emb_size // 2``; both use ReLU followed by dropout with p=0.5.
    ``emb_size`` is read from module scope at call time.

    Args:
        data: input MXNet symbol.

    Returns:
        The symbol for the output of the second dropout layer.
    """
    fc1 = mx.symbol.FullyConnected(data=data, num_hidden=emb_size)
    act1 = mx.symbol.Activation(data=fc1, act_type='relu')
    drop1 = mx.symbol.Dropout(act1, p=0.5)

    # The second layer must consume the first layer's output. Feeding `data`
    # here (as the code previously did) left the first FC/ReLU/Dropout stage
    # disconnected from the returned graph.
    fc2 = mx.symbol.FullyConnected(data=drop1, num_hidden=emb_size//2)
    act2 = mx.symbol.Activation(data=fc2, act_type='relu')
    prenet_output = mx.symbol.Dropout(act2, p=0.5)

    return prenet_output

In [8]:
# Convolutional filter bank: K filter sets are created, the k-th using a 1-D kernel of size k (k = 1..K).
def conv1dBank(conv_input, K):
    """Build a bank of convolutions with kernel heights 1..K over one input.

    Each branch is Convolution(kernel=(k, 1), emb_size // 2 filters) ->
    BatchNorm -> ReLU, and the branches are joined pairwise with
    ``mx.symbol.concat`` (default axis). ``emb_size`` is read from module
    scope at call time.

    NOTE(review): no ``pad`` is passed to the convolutions, so branches with
    different k presumably produce different lengths along the kernel axis;
    confirm with ``infer_shape`` that the concat is valid.

    Args:
        conv_input: input MXNet symbol shared by every branch.
        K: largest kernel height; branches are built for k = 1..K.

    Returns:
        The symbol concatenating all K branches.
    """
    def conv_bn_relu(width):
        # One branch of the bank; the mean/var outputs of BatchNorm are unused.
        branch = mx.sym.Convolution(data=conv_input, kernel=(width, 1), num_filter=emb_size//2)
        branch, _mean, _var = mx.sym.BatchNorm(data=branch, output_mean_var=True)
        return mx.sym.Activation(data=branch, act_type='relu')

    bank = conv_bn_relu(1)
    for width in range(2, K+1):
        bank = mx.symbol.concat(bank, conv_bn_relu(width))
    return bank

In [9]:
# highway
def highway_layer(data):
    """Build one highway layer: ``H * T + data * (1 - T)``.

    ``H`` is FC(emb_size // 2) -> ReLU and ``T`` (the gate) is
    FC(emb_size // 2) -> sigmoid, both computed from ``data``. ``emb_size``
    is read from module scope at call time.

    NOTE(review): the gate's bias is bound to a variable literally named
    'bias' on every call, so stacked highway layers presumably share that
    parameter; confirm this is intended.

    Args:
        data: input MXNet symbol.

    Returns:
        The symbol for the gated combination of the transform and the input.
    """
    hidden_units = emb_size//2

    transform_fc = mx.symbol.FullyConnected(data=data, num_hidden=hidden_units)
    transform = mx.symbol.Activation(data=transform_fc, act_type="relu")

    gate_fc = mx.symbol.FullyConnected(
        data=data,
        num_hidden=hidden_units,
        bias=mx.sym.Variable('bias'),
    )
    gate = mx.symbol.Activation(data=gate_fc, act_type="sigmoid")

    return transform * gate + data * (1.0 - gate)


In [10]:
def GRULayer(data, bidirectional_mode=True, size=None):
    """Build a single-layer GRU over ``data`` using ``mx.sym.RNN``.

    Args:
        data: input MXNet symbol.
        bidirectional_mode: passed to ``mx.sym.RNN`` as ``bidirectional``.
        size: GRU state size. When omitted (``None``), the module-level
            ``emb_size`` is looked up at call time.

    Returns:
        The ``mx.sym.RNN`` symbol.
    """
    # Resolve the default lazily: a `size=emb_size` default is evaluated when
    # the `def` runs, which fails with NameError if `emb_size` is assigned
    # later in the file and otherwise freezes whatever value it had then.
    if size is None:
        size = emb_size
    net = mx.sym.RNN(data=data, bidirectional=bidirectional_mode, mode='gru',
                     num_layers=1, state_size=size)
    return net

In [11]:
# CBHG
def CBHG(data,K,proj1_size,proj2_size):
    """Build the CBHG stack: conv bank, projections, highway layers, GRU.

    Pipeline, in order:
      1. ``conv1dBank(data, K)``
      2. max pooling, kernel (2, 1), stride (1, 1)
      3. Convolution (3, 1) with ``proj1_size`` filters -> BatchNorm -> ReLU
      4. Convolution (3, 1) with ``proj2_size`` filters -> BatchNorm
      5. residual addition with ``data``
      6. four ``highway_layer`` applications
      7. ``GRULayer`` with state size ``emb_size // 2``

    NOTE(review): neither the pooling nor the convolutions pass ``pad``, so
    the residual addition presumably needs a shape check; calling
    ``infer_shape`` on the conv bank with the input shape gives its output
    shape.

    Args:
        data: input MXNet symbol, also used as the residual branch.
        K: number of kernel sizes in the convolution bank.
        proj1_size: filter count of the first projection convolution.
        proj2_size: filter count of the second projection convolution.

    Returns:
        The GRU output symbol.
    """
    conv_bank = conv1dBank(data, K)
    pooled = mx.sym.Pooling(data=conv_bank, pool_type='max', kernel=(2, 1), stride=(1, 1))

    # First projection. BatchNorm also yields mean/var outputs, unused here.
    first_proj = mx.sym.Convolution(data=pooled, kernel=(3, 1), num_filter=proj1_size)
    first_proj, _mean1, _var1 = mx.sym.BatchNorm(data=first_proj, output_mean_var=True)
    first_proj = mx.sym.Activation(data=first_proj, act_type='relu')

    # Second projection has no activation after the batch norm.
    second_proj = mx.sym.Convolution(first_proj, kernel=(3, 1), num_filter=proj2_size)
    second_proj, _mean2, _var2 = mx.sym.BatchNorm(data=second_proj, output_mean_var=True)

    highway_out = second_proj + data
    for _ in range(4):
        highway_out = highway_layer(highway_out)

    return GRULayer(highway_out, size=emb_size//2)

In [12]:
# encoder
def encoder(data):
    """Build the encoder graph: embedding -> pre-net -> CBHG.

    Args:
        data: MXNet symbol holding character indices, as produced by the
            ``char2idx`` mapping of ``load_vocab()``.

    Returns:
        The symbol returned by ``CBHG(prenet_output, 16, emb_size // 2,
        emb_size // 2)``.
    """
    char2index, _ = load_vocab()
    vocab_size = len(char2index)

    # Embedding looks rows up by integer index, so it takes the indices
    # directly rather than a one-hot encoding of them, and its table needs
    # exactly one row per vocabulary entry (not a hard-coded 100).
    embed_vector = mx.sym.Embedding(data=data, input_dim=vocab_size, output_dim=emb_size)
    prenet_output = prenet_pass(embed_vector)
    return CBHG(prenet_output, 16, emb_size//2, emb_size//2)

In [15]:
# decoder
def decoder(data,encoder_context):
    """Build the decoder graph on top of the encoder output.

    Two unidirectional GRU layers of size ``emb_size`` are built from
    ``encoder_context``. Their outputs are summed with ``encoder_context``,
    projected by FC(emb_size) -> ReLU, and passed to
    ``CBHG(..., 8, emb_size, 80)``.

    NOTE(review): the pre-net output computed from ``data`` is not consumed
    by anything below, and both GRUs read ``encoder_context`` directly; this
    presumably waits on the attention mechanism. Confirm before relying on
    ``data`` having any effect.

    Args:
        data: decoder input symbol; only fed to the pre-net.
        encoder_context: symbol produced by the encoder.

    Returns:
        The symbol returned by the final CBHG stack.
    """
    # Retained so the graph-building calls match the original; result unused.
    _unused_prenet = prenet_pass(data)

    # TODO: attention over (pre-net output, encoder_context) is not wired in.
    first_gru = GRULayer(encoder_context, bidirectional_mode=False, size=emb_size)
    second_gru = GRULayer(encoder_context, bidirectional_mode=False, size=emb_size)

    combined = encoder_context + first_gru + second_gru
    projected = mx.symbol.FullyConnected(data=combined, num_hidden=emb_size)
    rnn_output = mx.symbol.Activation(data=projected, act_type="relu")

    return CBHG(rnn_output, 8, emb_size, 80)

In [22]:
# Hyper-parameters. `emb_size` is read by the layer-building functions at
# call time; `num_hidden` is not referenced by the visible cells.
num_hidden = 256

emb_size = 256


# Symbolic inputs: the character sequence and the target spectrogram.
text = mx.sym.Variable('text')
spectrogram = mx.sym.Variable('spectrogram')

# The encoder consumes the `text` variable declared above; the name `data`
# used here previously is not defined anywhere in this cell.
encoded = encoder(text)
spectrogram_output = decoder(spectrogram, encoded)



<h2> Attention model part </h2>

Need to figure out what is going on here.

"memory" part = data (embeddings, apples, whatever)-> prenet -> encoder-CBHG   

keithito implementation: 

      # Attention
      attention_cell = AttentionWrapper(
        DecoderPrenetWrapper(GRUCell(256), is_training),
        BahdanauAttention(256, encoder_outputs),
        alignment_history=True,
        output_attention=False)                                                  # [N, T_in, 256]
First argument: an RNN cell.
Second argument: the attention mechanism.

Both AttentionWrapper and BahdanauAttention come from the tf.contrib.seq2seq package.
    
The DecoderPrenetWrapper:
<div style="background-color:gray">
<code style="background-color:gray">
class DecoderPrenetWrapper(RNNCell):
  '''Runs RNN inputs through a prenet before sending them to the cell.'''
  
  bla bla bla
  
  <b>def call(self, inputs, state):
    prenet_out = prenet(inputs, self._is_training, scope='decoder_prenet')
    return self._cell(prenet_out, state)
  </b>  
  bla bla bla
</code></div>

So it just sends the data through the prenet before passing it to the wrapped cell. Data comes from encoder_cbhg.
    The AttentionWrapper wraps an attention mechanism (Bahdanau) that looks at a memory to query (the encoder output) and a net (GRU 256) that is the attention net.


In [14]:
#mx.viz.plot_network(encoder(data))