In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import hyperparams as hp
import numpy as np
import math
import glu
import positional_encoding

In [7]:
class Encoder(nn.Module):
    """
    Encoder Network
    """
    def __init__(self, para):
        """
        :param para: dictionary that contains all parameters
        """
        super(Encoder, self).__init__()
        #self.alpha = nn.Parameter(t.ones(1))
        
        self.emb_phone = nn.Embedding(para['phone_size'], para['emb_dim'])
        #full connected
        self.fc_1 = nn.Linear(para['emb_dim'], para['GLU_in_dim'])
        
        self.GLU = glu.GLU(para['num_layers'], para['hidden_size'], para['kernel_size'], para['dropout'], para['GLU_in_dim'])
        
        self.fc_2 = nn.Linear(para['hidden_size'], para['emb_dim'])
        
    def refine(self, align_phone):
        '''filter silence phone and repeat phone'''
        out = []
        length = []
        batch_size = align_phone.shape[0]
        max_length = align_phone.shape[1]
        before = 0
        for i in range(batch_size):
            line = []
            for j in range(max_length):
                if align_phone[i][j] == 1 or align_phone[i][j] == 0:      #silence phone or padding
                    continue
                elif align_phone[i][j] == before:   #the same with the former phone
                    continue
                else:
                    before = align_phone[i][j]
                    line.append(before)
            out.append(line)
            length.append(len(line))
        
        #pad 0
        seq_length = max(length)
        Data = np.zeros((batch_size, seq_length))
        for i in range(batch_size):
            for j in range(seq_length):
                if j < len(out[i]):
                    Data[i][j] = out[i][j]
                    
        return torch.from_numpy(Data).type(torch.LongTensor)
        
    def forward(self, input):
        """
        input dim: [batch_size, text_phone_length]
        output dim : [batch_size, text_phone_length, embedded_dim]
        """
        input = self.refine(input)
        print(input)
        embedded_phone = self.emb_phone(input)    # [src len, batch size, emb dim]
        print(embedded_phone.shape,embedded_phone)
        glu_out = self.GLU(self.fc_1(embedded_phone))
        print(glu_out.shape)
        glu_out = self.fc_2(torch.transpose(glu_out, 1, 2))
        print(glu_out.shape,glu_out)
        out = embedded_phone + glu_out
        print(out.shape,out)
        out = out *  math.sqrt(0.5)
        print(out.shape,out)
        return out


In [8]:
class Encoder_Postnet(nn.Module):
    """
    Encoder Postnet
    """
    def __init__(self):
        super(Encoder_Postnet, self, seq_length).__init__()
        #length of sequence = number of frames
        self.fc = nn.Linear(seq_length, seq_length)
         
    def aligner(encoder_out, align_phone):
        return
        
    def forward(self, encoder_out, align_phone, pitch, beats):
        aligner_out = aligner(encoder_out, align_phone)
        pitch = self.fc(pitch)
        out = aligner_out + pitch
        beats_avg = len(beats) / sum(beats)
        return


In [9]:
para = {'phone_size':67, 'emb_dim':256, 'GLU_in_dim':64, 'num_layers':6, 'kernel_size':3, 'hidden_size':64, 'dropout':0.1 }
encoder = Encoder(para)
phone = torch.tensor([[1,3,3,3,3,5,5,6,0,0,0],[1,1,1,4,2,2,2,3,7,1,1]])
out = encoder(phone)
#print(out.shape,out)

tensor([[3, 5, 6, 0],
        [4, 2, 3, 7]])
torch.Size([2, 4, 256]) tensor([[[ 4.7961e-01,  1.7784e-01, -1.9790e-01,  ...,  9.5143e-02,
           1.3601e-01,  6.3780e-01],
         [ 1.5068e+00, -1.4208e+00, -2.5689e-01,  ..., -4.8902e-01,
           2.7645e-01,  1.3198e+00],
         [-1.6917e+00,  2.9500e-01,  6.6839e-01,  ..., -3.9285e-01,
           9.0842e-01, -6.3534e-01],
         [ 1.4873e+00, -1.1569e+00, -9.7593e-01,  ...,  6.0804e-01,
          -4.3319e-02,  4.8037e-01]],

        [[ 3.8978e-01, -3.6042e-01,  5.8337e-01,  ..., -3.7904e-02,
          -6.6994e-01, -4.7379e-01],
         [-7.8871e-01,  2.1669e+00, -2.1239e-01,  ...,  1.3861e-03,
           7.2562e-01, -1.4388e+00],
         [ 4.7961e-01,  1.7784e-01, -1.9790e-01,  ...,  9.5143e-02,
           1.3601e-01,  6.3780e-01],
         [ 1.1055e-01, -2.7115e-01,  7.4965e-01,  ..., -4.9486e-01,
          -1.1627e+00,  4.7236e-01]]], grad_fn=<EmbeddingBackward>)
torch.Size([2, 64, 4])
torch.Size([2, 4, 256]) tensor([[[ 

In [10]:
beats = [[[0,1,0,0,0,1,0]],[[0,0,0,1,1,0,1]]]
pos = positional_encoding.PositionalEncoding(7)
out = pos(beats)
print(out)

NameError: name 'math' is not defined