In [1]:
# The only thing we want to change is the mode.
# However, to do this we have to run it using that script from the main.

In [2]:
import torch, pdb
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


class LinearNorm(torch.nn.Module):
    """Fully connected layer whose weight is Xavier-uniform initialised.

    Args:
        in_dim: size of the last dimension of the input.
        out_dim: size of the last dimension of the output.
        bias: whether the wrapped ``torch.nn.Linear`` has an additive bias.
        w_init_gain: nonlinearity name passed to
            ``torch.nn.init.calculate_gain`` to scale the initialisation.
    """

    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super().__init__()
        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)

        # Replace the default weight init; the bias keeps PyTorch's default.
        init_gain = torch.nn.init.calculate_gain(w_init_gain)
        torch.nn.init.xavier_uniform_(self.linear_layer.weight, gain=init_gain)

    def forward(self, x):
        """Apply the linear projection to ``x`` and return the result."""
        projected = self.linear_layer(x)
        return projected

class ConvNorm1d(torch.nn.Module):
    """1-D convolution whose weight is Xavier-uniform initialised.

    When ``padding`` is omitted it is set to ``dilation * (kernel_size - 1) / 2``,
    which requires an odd ``kernel_size`` (enforced with an assertion). With
    ``stride=1`` that padding keeps the output length equal to the input length.

    Args:
        in_channels / out_channels: channel counts of the convolution.
        kernel_size, stride, padding, dilation, bias: forwarded to
            ``torch.nn.Conv1d`` (``padding`` after the default is resolved).
        w_init_gain: nonlinearity name passed to
            ``torch.nn.init.calculate_gain`` to scale the initialisation.
    """

    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
                 padding=None, dilation=1, bias=True, w_init_gain='linear'):
        super().__init__()

        needs_default_padding = padding is None
        if needs_default_padding:
            # Symmetric padding is only well defined for odd kernels.
            assert kernel_size % 2 == 1
            padding = int(dilation * (kernel_size - 1) / 2)

        self.conv = torch.nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias,
        )

        # Replace the default weight init; the bias keeps PyTorch's default.
        init_gain = torch.nn.init.calculate_gain(w_init_gain)
        torch.nn.init.xavier_uniform_(self.conv.weight, gain=init_gain)

    def forward(self, signal):
        """Run the convolution on ``signal`` and return the result."""
        return self.conv(signal)

class ConvNorm2d(torch.nn.Module):
    """2-D convolution whose weight is Xavier-uniform initialised.

    When ``padding`` is omitted it is set to ``dilation * (kernel_size - 1) / 2``,
    which requires an odd integer ``kernel_size`` (enforced with an assertion).
    With ``stride=1`` that padding keeps the spatial size unchanged. An explicit
    ``padding`` (int or tuple) is passed through untouched.

    Args:
        in_channels / out_channels: channel counts of the convolution.
        kernel_size, stride, padding, dilation, bias: forwarded to
            ``torch.nn.Conv2d`` (``padding`` after the default is resolved).
        w_init_gain: nonlinearity name passed to
            ``torch.nn.init.calculate_gain`` to scale the initialisation.
    """

    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
                 padding=None, dilation=1, bias=True, w_init_gain='linear'):
        super().__init__()

        needs_default_padding = padding is None
        if needs_default_padding:
            # Symmetric padding is only well defined for odd kernels.
            assert kernel_size % 2 == 1
            padding = int(dilation * (kernel_size - 1) / 2)

        self.conv = torch.nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias,
        )

        # Replace the default weight init; the bias keeps PyTorch's default.
        init_gain = torch.nn.init.calculate_gain(w_init_gain)
        torch.nn.init.xavier_uniform_(self.conv.weight, gain=init_gain)

    def forward(self, signal):
        """Run the convolution on ``signal`` and return the result."""
        return self.conv(signal)

class ConvT2d(torch.nn.Module):
    """2-D transposed convolution whose weight is Xavier-uniform initialised.

    When ``padding`` is omitted it is set to ``dilation * (kernel_size - 1) / 2``,
    which requires an odd integer ``kernel_size`` (enforced with an assertion).
    With ``stride=1`` that padding keeps the spatial size unchanged.

    Args:
        in_channels / out_channels: channel counts of the transposed convolution.
        kernel_size, stride, padding, dilation, bias: forwarded to
            ``torch.nn.ConvTranspose2d`` (``padding`` after the default is resolved).
        w_init_gain: nonlinearity name passed to
            ``torch.nn.init.calculate_gain``; defaults to ``'relu'``.
    """

    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
                 padding=None, dilation=1, bias=True, w_init_gain='relu'):
        super(ConvT2d, self).__init__()
        if padding is None:
            # Symmetric padding is only well defined for odd kernels.
            assert(kernel_size % 2 == 1)
            padding = int(dilation * (kernel_size - 1) / 2)

        self.conv = torch.nn.ConvTranspose2d(in_channels, out_channels,
                                             kernel_size=kernel_size, stride=stride,
                                             padding=padding, dilation=dilation,
                                             bias=bias)

        # Replace the default weight init; the bias keeps PyTorch's default.
        # (The original statement ended in a stray comma, which wrapped the
        # call's result in a throw-away tuple.)
        torch.nn.init.xavier_uniform_(
            self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, signal):
        """Run the transposed convolution on ``signal`` and return the result."""
        conv_signal = self.conv(signal)
        return conv_signal

# "4.2. The Content Encoder"
class Encoder(nn.Module):
    """Content encoder: a 2-D conv stack followed by a bidirectional LSTM.

    The spectrogram and the (time-broadcast) speaker embedding are stacked
    along the feature axis of a one-channel image. Four blocks of
    3x3 conv -> batch norm -> ReLU -> max-pool(4, 1) widen the channels
    1 -> 64 -> 128 -> 256 -> 512 while shrinking only the feature axis.
    The result feeds a 2-layer bidirectional LSTM with ``dim_neck`` units
    per direction.

    Args:
        dim_neck: hidden size of each LSTM direction; every returned code has
            ``2 * dim_neck`` features.
        dim_emb: speaker-embedding size. Accepted for interface compatibility
            but not used when building the layers.
        freq: down-sampling period (in frames) used to pick the codes.
    """

    def __init__(self, dim_neck, dim_emb, freq):
        super(Encoder, self).__init__()
        self.dim_neck = dim_neck
        self.freq = freq

        # Channel widths at the boundaries of the four conv blocks.
        channels = [1, 64, 128, 256, 512]
        convolutions = []
        for in_ch, out_ch in zip(channels[:-1], channels[1:]):
            conv_layer = nn.Sequential(
                ConvNorm2d(in_ch, out_ch,
                           kernel_size=3, stride=1,
                           padding=1,
                           dilation=1, w_init_gain='relu'),
                nn.BatchNorm2d(out_ch),
                nn.ReLU(),
                # Pool the feature axis only; the time axis keeps its length.
                nn.MaxPool2d((4, 1)),
            )
            convolutions.append(conv_layer)
        self.convolutions = nn.ModuleList(convolutions)

        # Bidirectional, so the LSTM output has 2 * dim_neck features.
        self.lstm = nn.LSTM(channels[-1], dim_neck, 2, batch_first=True, bidirectional=True)

    def forward(self, x, c_org):
        """Encode ``x`` conditioned on the speaker embedding ``c_org``.

        Args:
            x: spectrogram batch, assumed (batch, time, n_feats) - TODO confirm
                against the data loader.
            c_org: speaker embedding, one 2-D row per batch item (batch, emb).

        Returns:
            A list of ``ceil(time / freq)`` tensors of shape
            (batch, 2 * dim_neck), one per block of ``freq`` frames.

        Note:
            The four (4, 1) poolings floor-divide the stacked feature axis by
            256 overall, and ``squeeze(2)`` only removes it when exactly one
            row is left, i.e. when ``256 <= n_feats + emb < 512``.
        """
        # (batch, time, feat) -> (batch, 1, feat, time)
        x = x.transpose(2, 1).unsqueeze(1)
        # Repeat the embedding over time so it can be stacked under the spectrogram.
        c_org = c_org.unsqueeze(-1).expand(-1, -1, x.size(-1)).unsqueeze(1)
        x = torch.cat((x, c_org), dim=2)

        for conv in self.convolutions:
            x = conv(x)

        # (batch, 512, 1, time) -> (batch, time, 512)
        x = x.squeeze(2).transpose(-1, -2)
        self.lstm.flatten_parameters()
        outputs, _ = self.lstm(x)

        # PyTorch concatenates the directions as [forward, backward] on the
        # last axis: the first dim_neck features are the forward pass.
        out_forward = outputs[:, :, :self.dim_neck]
        out_backward = outputs[:, :, self.dim_neck:]

        last_step = outputs.size(1) - 1
        codes = []
        # One code per block of self.freq frames: the forward state at the
        # block's last frame and the backward state at its first frame.
        for i in range(0, outputs.size(1), self.freq):
            # Clamp so a trailing partial block (time not a multiple of freq)
            # uses the final frame instead of indexing out of range.
            block_end = min(i + self.freq - 1, last_step)
            codes.append(torch.cat((out_forward[:, block_end, :], out_backward[:, i, :]), dim=-1))

        return codes

In [3]:
import pickle
# NOTE: machine-specific absolute path; point it at your own model_saves directory.
config_path = '/homes/bdoc3/my_data/autovc_data/vte-autovc/model_saves/vte_autovcTest_dim16/config.pkl'
# pickle can execute arbitrary code while loading - only open config files you trust.
# The context manager closes the file handle, which the bare open() call leaked.
with open(config_path, 'rb') as config_file:
    config = pickle.load(config_file)

In [4]:
import os, pdb, pickle, argparse, shutil, yaml
from data_loader import get_loader, pathSpecDataset
from torch.backends import cudnn
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader

# Speeds up training (cuDNN benchmark mode).
cudnn.benchmark = True

# The spectrogram parameters are stored in a YAML file inside config.spmel_dir.
spmel_params_path = config.spmel_dir +'/spmel_params.yaml'
with open(spmel_params_path) as File:
    # NOTE(review): FullLoader can build more than plain data types; only
    # read parameter files from a trusted source.
    spmel_params = yaml.load(File, Loader=yaml.FullLoader)

# Dataset and a shuffled loader over it (the last partial batch is kept).
vocalSet = pathSpecDataset(config, spmel_params)
vocalSet_loader = DataLoader(
    vocalSet,
    batch_size=config.batch_size,
    shuffle=True,
    drop_last=False,
)
#vcc_loader = get_loader(config)

# Placeholder run names log to a fixed scratch directory; any other name is
# appended to the default TensorBoard run directory as a suffix.
uses_placeholder_name = config.file_name in ('defaultName', 'deletable')
if uses_placeholder_name:
    writer = SummaryWriter('testRuns/test')
else:
    writer = SummaryWriter(comment = '_' +config.file_name)

In [5]:
from collections import OrderedDict
from vte_model import Vt_Embedder
# Resolve the target device first so everything below can use it
# (falls back to CPU when CUDA is unavailable).
device = torch.device(f'cuda:{config.which_cuda}' if torch.cuda.is_available() else 'cpu')

# Build the pretrained embedder and freeze it: it is only used for inference here.
vte = Vt_Embedder(config, spmel_params)
for param in vte.parameters():
    param.requires_grad = False
vte_optimizer = torch.optim.Adam(vte.parameters(), 0.0001)

# map_location lets a checkpoint saved on another GPU load on this machine.
vte_checkpoint = torch.load(config.emb_ckpt, map_location=device)

# Keep every weight except those of the classification head ('class_layer*').
new_state_dict = OrderedDict()
for key, val in vte_checkpoint['model_state_dict'].items():
    if key.startswith('class_layer'):
        continue
    new_state_dict[key] = val
vte.load_state_dict(new_state_dict)

# Move any optimizer state tensors to the same device as the model.
# (The original referenced an undefined name `which_cuda` and called .cuda()
# even when the CPU fallback was selected.)
for state in vte_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.to(device)

vte.to(device)
vte.eval()

Vt_Embedder(
  (conv_layer1): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv_layer2): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=4, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
  (conv_layer3): Sequential(
    (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
  (conv_layer4): Sequential(
    (0): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), pa

In [67]:
hist_arr = np.array([0,0,0])

# Pull one batch from the loader and move the spectrograms to the device.
data_iter = iter(vocalSet_loader)
x_real, style_idx, singer_idx = next(data_iter)
x_real = x_real.to(device)

# Split dim 1 of every example into config.chunk_num pieces, folding the
# pieces into the batch axis, and embed them with the frozen embedder.
n_chunks = config.chunk_num
x_real_chunked = x_real.view(x_real.shape[0]*n_chunks, x_real.shape[1]//n_chunks, -1)
emb_org = vte(x_real_chunked)
x = x_real

# The encoder returns a LIST of code tensors, one per block of config.freq frames.
encoder = Encoder(config.dim_neck, config.dim_emb, config.freq)
encoder.to(device)
codes = encoder(x, emb_org)

# Stretch every code over its share of the x.size(1) frames, then join the
# pieces along dim 1 into a single tensor.
tmp = []
for code in codes:
    frames_per_code = int(x.size(1)/len(codes))
    tmp.append(code.unsqueeze(1).expand(-1, frames_per_code, -1))
code_exp = torch.cat(tmp, dim=1)

# Append the embedding, repeated for every frame, to the stretched codes.
# The last dim becomes 2*dim_neck + embedding size; the recorded run below
# printed torch.Size([2, 192, 288]).
emb_per_frame = emb_org.unsqueeze(1).expand(-1, x.size(1), -1)
encoder_outputs = torch.cat((code_exp, emb_per_frame), dim=-1)
print(encoder_outputs.shape)

torch.Size([2, 192, 288])


In [77]:
class Decoder(nn.Module):
    """Decoder module: LSTM -> three transposed-conv blocks -> LSTM -> linear.

    Args:
        dim_neck: per-direction code size; the first LSTM takes
            ``dim_neck * 2 + dim_emb`` input features.
        dim_emb: speaker-embedding size.
        dim_pre: hidden size of the first LSTM and input size of the second.

    NOTE(review): the conv blocks are 2-D (1 -> 64 -> 128 -> 256 channels),
    but ``forward`` hands them the 3-D LSTM output without a channel axis and
    then feeds their output to ``lstm2``. The exploratory cell after this
    class adds the channel axis by hand, so ``forward`` looks out of sync
    with the layer stack - confirm before relying on it.
    """

    def __init__(self, dim_neck, dim_emb, dim_pre):
        super().__init__()

        self.lstm1 = nn.LSTM(dim_neck*2+dim_emb, dim_pre, 1, batch_first=True)

        # Channel widths at the boundaries of the three transposed-conv blocks.
        widths = (1, 64, 128, 256)
        blocks = [
            nn.Sequential(
                ConvT2d(n_in, n_out, kernel_size=5, stride=1, padding=2),
                nn.BatchNorm2d(n_out))
            for n_in, n_out in zip(widths[:-1], widths[1:])
        ]
        self.convolutions = nn.ModuleList(blocks)

        self.lstm2 = nn.LSTM(dim_pre, 1024, 2, batch_first=True)
        self.linear_projection = LinearNorm(1024, 80)

    def forward(self, x):
        """Run ``x`` through lstm1, the conv blocks (with ReLU), lstm2 and the projection."""
        hidden, _ = self.lstm1(x)
        hidden = hidden.transpose(1, 2)

        for block in self.convolutions:
            hidden = F.relu(block(hidden))
        hidden = hidden.transpose(1, 2)

        outputs, _ = self.lstm2(hidden)

        return self.linear_projection(outputs)

decoder = Decoder(config.dim_neck, config.dim_emb, config.dim_pre)
decoder.to(device)
print(encoder_outputs.shape)

# Step through the decoder layers by hand instead of calling decoder.forward.
lstm1_out, _ = decoder.lstm1(encoder_outputs)
# Swap the last two axes and insert a channel axis at dim 1 for the 2-D
# transposed convolutions.
outs = lstm1_out.transpose(1, 2).unsqueeze(1)
print(outs.shape)

for conv in decoder.convolutions:
    outs = F.relu(conv(outs))
print(outs.shape)
# decoder.lstm2 is deliberately not applied here.

torch.Size([2, 192, 288])
torch.Size([2, 1, 512, 192])
torch.Size([2, 256, 512, 192])


In [78]:
class Postnet(nn.Module):
    """Postnet: five 2-D convolutions with 5x5 kernels.

    Channels go 256 -> 128 -> 64 -> 64 -> 64 -> 1. The first four
    convolutions are each followed by batch norm and a max-pool over dim 2
    only (pool sizes 3, 2, 1, 1). The first and last convolutions do not pad
    dim 2, so each shortens it by 4; all convolutions keep dim 3 unchanged.
    ``forward`` applies tanh after each of the first four blocks and nothing
    after the last.
    """

    def __init__(self):
        super(Postnet, self).__init__()
        self.convolutions = nn.ModuleList()

        self.convolutions.append(
            nn.Sequential(
                ConvNorm2d(256, 128,
                           kernel_size=5, stride=1,
                           padding=(0, 2),
                           dilation=1, w_init_gain='tanh'),
                nn.BatchNorm2d(128),
                nn.MaxPool2d((3, 1))),
        )

        for i in range(1, 5 - 1):
            self.convolutions.append(
                nn.Sequential(
                    # Only the first of these blocks narrows the channels and pools.
                    # (The original `padding=(1,2) if i<1 else 2` could never take
                    # its first branch because i starts at 1.)
                    ConvNorm2d(128 if i == 1 else 64,
                               64,
                               kernel_size=5, stride=1, padding=2,
                               dilation=1, w_init_gain='tanh'),
                    nn.BatchNorm2d(64),
                    nn.MaxPool2d((2, 1) if i == 1 else (1, 1))),
            )

        self.convolutions.append(
            nn.Sequential(
                ConvNorm2d(64, 1,
                           kernel_size=5, stride=1,
                           padding=(0, 2),
                           dilation=1, w_init_gain='linear'))
        )

    def forward(self, x):
        """Apply the five blocks to ``x`` and return the single-channel result.

        Args:
            x: a 4-D input with 256 entries on dim 1 is used as is, matching
                how the blocks are driven in the exploratory cell below. A 3-D
                input still gets a channel axis inserted at dim 1, as before.
                NOTE(review): a one-channel input cannot match the 256-channel
                first block - confirm whether the 3-D path is still wanted.
        """
        if x.dim() == 3:
            x = x.unsqueeze(1)

        *hidden_blocks, final_block = self.convolutions
        for block in hidden_blocks:
            x = torch.tanh(block(x))

        return final_block(x)

postnet = Postnet()
postnet.to(device)

# Step through the postnet blocks one at a time, printing the shape after each.
x0 = torch.tanh(postnet.convolutions[0](outs))
print(x0.shape)
x1 = torch.tanh(postnet.convolutions[1](x0))
print(x1.shape)
x2 = torch.tanh(postnet.convolutions[2](x1))
print(x2.shape)
# Block 3, not block 2 a second time: the original reused index 2 here, so
# block 3 was never exercised (the shapes are the same either way).
x3 = torch.tanh(postnet.convolutions[3](x2))
print(x3.shape)

# The last block has no tanh.
x = postnet.convolutions[-1](x3)
x.shape

torch.Size([2, 128, 169, 192])
torch.Size([2, 64, 84, 192])
torch.Size([2, 64, 84, 192])
torch.Size([2, 64, 84, 192])


torch.Size([2, 1, 80, 192])