In [1]:
import sys
from os.path import dirname, join
from tqdm import tqdm, trange
from datetime import datetime

import torch
from torch.utils import data as data_utils
from torch.autograd import Variable
from torch import nn
from torch import optim
import torch.backends.cudnn as cudnn
from torch.utils import data as data_utils
from torch.utils.data.sampler import Sampler
import numpy as np
from numba import jit
_frontend = None  # to be set later

In [2]:
from nnmnkwii.datasets import FileSourceDataset, FileDataSource
from os.path import join, expanduser

In [3]:
from utils import generate_cloned_samples, Speech_Dataset
import dv3
from dv3 import build_deepvoice_3
from dv3.hparams import hparams, hparams_debug_string
from dv3.train import train as train_dv3
from dv3.train import TextDataSource,MelSpecDataSource,LinearSpecDataSource,\
                        PyTorchDataset,PartialyRandomizedSimilarTimeLengthSampler
from dv3.train import collate_fn
from dv3.deepvoice3_pytorch import frontend
from dv3.train import sequence_mask
from dv3.train import save_checkpoint as save_checkpoint_dv3
from dv3.train import save_states as save_states_dv3
from tensorboardX import SummaryWriter

In [4]:
from utils import generate_cloned_samples, Speech_Dataset
from SpeechEmbedding import Encoder
from train_encoder import get_cloned_voices,build_encoder,get_speaker_embeddings
from train_encoder import load_checkpoint as load_checkpoint_encoder
from train_encoder import save_checkpoint as save_checkpoint_encoder
from train_encoder import train as train_encoder

In [5]:
data_root = "./../data/vctk-preprocessed/"
speaker_id = None

dv3.train._frontend = getattr(frontend, hparams.frontend)


In [6]:
# Input dataset definitions
X = FileSourceDataset(TextDataSource(data_root, speaker_id))
Mel = FileSourceDataset(MelSpecDataSource(data_root, speaker_id))
Y = FileSourceDataset(LinearSpecDataSource(data_root, speaker_id))

In [7]:
use_cuda = False
import os
checkpoint_dv3 = None
checkpoint_encoder = None
speaker_id = None
preset_dv3 =None
checkpoint_dir = './checkpoint/'

data_root = "./../data/vctk-preprocessed/"
if data_root is None:
    data_root = join(dirname(__file__), "data", "ljspeech")



train_dv3_v = None
train_encoder_v = None


if not train_dv3_v and not train_encoder_v:
    print("Training whole model")
    train_dv3_v,train_encoder_v= True,True
if train_dv3_v:
    print("Training deep voice 3 model")
elif train_encoder_v:
    print("Training encoder model")
else:
    assert False, "must be specified wrong args"

os.makedirs(checkpoint_dir , exist_ok=True)

# Input dataset definitions
X = FileSourceDataset(TextDataSource(data_root, speaker_id))
Mel = FileSourceDataset(MelSpecDataSource(data_root, speaker_id))
Y = FileSourceDataset(LinearSpecDataSource(data_root, speaker_id))

# Prepare sampler
frame_lengths = Mel.file_data_source.frame_lengths
sampler = PartialyRandomizedSimilarTimeLengthSampler(frame_lengths, batch_size=hparams.batch_size)

# Dataset and Dataloader setup
dataset = PyTorchDataset(X, Mel, Y)
data_loader_dv3 = data_utils.DataLoader(
    dataset, batch_size=hparams.batch_size,
    num_workers=hparams.num_workers, sampler=sampler,
    collate_fn=collate_fn, pin_memory=hparams.pin_memory)
print("dataloader for dv3 prepared")

dv3.train._frontend = getattr(frontend, hparams.frontend)
model_dv3 = build_deepvoice_3(preset_dv3 , checkpoint_dv3)
print("Built dv3!")

if use_cuda:
    model_dv3 = model_dv3.cuda()

optimizer_dv3 = optim.Adam(model_dv3.get_trainable_parameters(),
                            lr=hparams.initial_learning_rate,
                            betas=(hparams.adam_beta1, hparams.adam_beta2),
                            eps=hparams.adam_eps,
                            weight_decay=hparams.weight_decay)

log_event_path = "log/run-test" + str(datetime.now()).replace(" ", "_")
print("Log event path for dv3: {}".format(log_event_path))
writer_dv3 = SummaryWriter(log_dir=log_event_path)

Training whole model
Training deep voice 3 model
dataloader for dv3 prepared
Built dv3!
Log event path for dv3: log/run-test2018-11-04_15:25:22.820168


In [10]:


# ENCODER
all_speakers = get_cloned_voices(model_dv3)
print("Cloning Texts are produced")

speaker_embed = get_speaker_embeddings(model_dv3)

encoder = build_encoder()

print("Encoder is built!")

speech_data_encoder = Speech_Dataset(all_speakers, speaker_embed)

criterion_encoder = nn.L1Loss()

optimizer_encoder = torch.optim.SGD(encoder.parameters(),lr=0.0006)

lambda1_encoder = lambda epoch: 0.6 if epoch%8000==7999 else 1#???????????
scheduler_encoder = torch.optim.lr_scheduler.LambdaLR(optimizer_encoder, lr_lambda=lambda1_encoder)

data_loader_encoder = data_utils.DataLoader(speech_data_encoder, batch_size=16, shuffle=True, drop_last=True)
# Training The Encoder
dataiter_encoder = iter(data_loader_encoder)

if use_cuda:
    encoder = encoder.cuda()

if checkpoint_encoder !=None and os.path.isfile(checkpoint_encoder):
    encoder, optimizer_encoder = load_checkpoint_encoder(encoder, optimizer_encoder)



Cloned_voices Loaded!
Cloning Texts are produced


NameError: name 'N_samples' is not defined

In [None]:
global_step = 0
global_epoch = 0
use_cuda = torch.cuda.is_available()
model_encoder = encoder
model_dv3 = model_dv3
init_lr_dv3 = 0.002
nepochs = 1

In [None]:
grad = {}
def save_grad(name):
    def hook(grad):
        grads[name] = grad
    return hook

# to remember the embeddings of the speakers
model_dv3.embed_speakers.weight.register_hook(save_grad('embeddings'))

if use_cuda:
    model_dv3 = model_dv3.cuda()
    model_encoder = model_encoder.cuda()
linear_dim = model_dv3.linear_dim
r = hparams.outputs_per_step
downsample_step = hparams.downsample_step
current_lr = init_lr_dv3

binary_criterion_dv3 = nn.BCELoss()

global global_step, global_epoch
while global_epoch < nepochs:
    running_loss = 0.0
    for step, (x, input_lengths, mel, y, positions, done, target_lengths,
               speaker_ids) \
            in tqdm(enumerate(data_loader_dv3)):


        model_dv3.zero_grad()
        encoder.zero_grad()

        #Declaring Requirements
        model_dv3.train()
        ismultispeaker = speaker_ids is not None
        # Learning rate schedule
        if hparams.lr_schedule is not None:
            lr_schedule_f = getattr(dv3.lrschedule, hparams.lr_schedule)
            current_lr = lr_schedule_f(
                init_lr_dv3, global_step, **hparams.lr_schedule_kwargs)
            for param_group in optimizer_dv3.param_groups:
                param_group['lr'] = current_lr
        optimizer_dv3.zero_grad()

        # Used for Position encoding
        text_positions, frame_positions = positions

        # Downsample mel spectrogram
        if downsample_step > 1:
            mel = mel[:, 0::downsample_step, :].contiguous()

        # Lengths
        input_lengths = input_lengths.long().numpy()
        decoder_lengths = target_lengths.long().numpy() // r // downsample_step

        voice_encoder = mel.view(mel.shape[0],1,mel.shape[1],mel.shape[2])
        # Feed data
        x, mel, y = Variable(x), Variable(mel), Variable(y)
        voice_encoder = Variable(voice_encoder)
        text_positions = Variable(text_positions)
        frame_positions = Variable(frame_positions)
        done = Variable(done)
        target_lengths = Variable(target_lengths)
        speaker_ids = Variable(speaker_ids) if ismultispeaker else None
        if use_cuda:
            x = x.cuda()
            text_positions = text_positions.cuda()
            frame_positions = frame_positions.cuda()
            y = y.cuda()
            mel = mel.cuda()
            voice_encoder = voice_encoder.cuda()
            done, target_lengths = done.cuda(), target_lengths.cuda()
            speaker_ids = speaker_ids.cuda() if ismultispeaker else None

        # Create mask if we use masked loss
        if hparams.masked_loss_weight > 0:
            # decoder output domain mask
            decoder_target_mask = sequence_mask(
                target_lengths / (r * downsample_step),
                max_len=mel.size(1)).unsqueeze(-1)
            if downsample_step > 1:
                # spectrogram-domain mask
                target_mask = sequence_mask(
                    target_lengths, max_len=y.size(1)).unsqueeze(-1)
            else:
                target_mask = decoder_target_mask
            # shift mask
            decoder_target_mask = decoder_target_mask[:, r:, :]
            target_mask = target_mask[:, r:, :]
        else:
            decoder_target_mask, target_mask = None, None
        print(voice_encoder.shape)
        #apply encoder model
        encoder_out = model_encoder(voice_encoder)


        model_dv3.embed_speakers.weight.data = (encoder_out).data
        
        print("set")
        # Apply dv3 model
        mel_outputs, linear_outputs, attn, done_hat = model_dv3(
                x, mel, speaker_ids=speaker_ids,
                text_positions=text_positions, frame_positions=frame_positions,
                input_lengths=input_lengths)
        
        
        print("foward dv3 done")
        break
        # Losses
        w = hparams.binary_divergence_weight

        # mel:
        mel_l1_loss, mel_binary_div = spec_loss(
                mel_outputs[:, :-r, :], mel[:, r:, :], decoder_target_mask)
        mel_loss = (1 - w) * mel_l1_loss + w * mel_binary_div

        # done:
        done_loss = binary_criterion(done_hat, done)

        # linear:
        n_priority_freq = int(hparams.priority_freq / (fs * 0.5) * linear_dim)
        linear_l1_loss, linear_binary_div = spec_loss(
                linear_outputs[:, :-r, :], y[:, r:, :], target_mask,
                priority_bin=n_priority_freq,
                priority_w=hparams.priority_freq_weight)
        linear_loss = (1 - w) * linear_l1_loss + w * linear_binary_div

        # Combine losses
        loss_dv3 = mel_loss + linear_loss + done_loss
        loss_dv3 = mel_loss + done_loss
        loss_dv3 = linear_loss

        # attention
        if hparams.use_guided_attention:
            soft_mask = guided_attentions(input_lengths, decoder_lengths,
                                          attn.size(-2),
                                          g=hparams.guided_attention_sigma)
            soft_mask = Variable(torch.from_numpy(soft_mask))
            soft_mask = soft_mask.cuda() if use_cuda else soft_mask
            attn_loss = (attn * soft_mask).mean()
            loss_dv3 += attn_loss

        if global_step > 0 and global_step % checkpoint_interval == 0:
            save_states_dv3(
                global_step, writer, mel_outputs, linear_outputs, attn,
                mel, y, input_lengths, checkpoint_dir)
            save_checkpoint_dv3(
                model, optimizer, global_step, checkpoint_dir, global_epoch,
                train_seq2seq, train_postnet)

        if global_step > 0 and global_step % hparams.eval_interval == 0:
            eval_model(global_step, writer, model, checkpoint_dir, ismultispeaker)

        # Update
        loss_dv3.backward()
        encoder_out.backward(grads['embeddings'])

        optimizer_dv3.step()
        optimizer_encoder.step()

        # if clip_thresh> 0:
        #     grad_norm = torch.nn.utils.clip_grad_norm(
        #         model.get_trainable_parameters(), clip_thresh)
        global_step += 1
        running_loss += loss.data[0]

    averaged_loss = running_loss / (len(data_loader))

    print("Loss: {}".format(running_loss / (len(data_loader))))

    global_epoch += 1


# dv3 loss function
# backward on that
mel_outputs.backward()

In [9]:
# linear_dim = model_dv3.linear_dim
r = hparams.outputs_per_step
downsample_step = hparams.downsample_step
for step, (x, input_lengths, mel, y, positions, done, target_lengths,
                   speaker_ids) in tqdm(enumerate(data_loader_dv3)):
    print(step)
    ismultispeaker = speaker_ids is not None
    text_positions, frame_positions = positions
    input_lengths = input_lengths.long().numpy()
    decoder_lengths = target_lengths.long().numpy() // r // downsample_step
    # Feed data
    voice_encoder = mel.view(mel.shape[0],1,mel.shape[1],mel.shape[2])
    x, mel, y = Variable(x), Variable(mel), Variable(y)
    voice_encoder = mel.view(mel.shape[0],1,mel.shape[1],mel.shape[2])
    text_positions = Variable(text_positions)
    frame_positions = Variable(frame_positions)
    done = Variable(done)
    target_lengths = Variable(target_lengths)
    speaker_ids = Variable(speaker_ids) if ismultispeaker else None
    
    print(step)
    print(x.shape)
    print(mel.shape)
    print(y.shape)
    print(voice_encoder.shape)
    print('-'*10)

    

1it [00:01,  1.54s/it]

0
0
torch.Size([16, 61])
torch.Size([16, 196, 80])
torch.Size([16, 196, 513])
torch.Size([16, 1, 196, 80])
----------
1
1
torch.Size([16, 57])
torch.Size([16, 180, 80])
torch.Size([16, 180, 513])
torch.Size([16, 1, 180, 80])
----------


4it [00:03,  1.05it/s]

2
2
torch.Size([16, 65])
torch.Size([16, 136, 80])
torch.Size([16, 136, 513])
torch.Size([16, 1, 136, 80])
----------
3
3
torch.Size([16, 41])
torch.Size([16, 132, 80])
torch.Size([16, 132, 513])
torch.Size([16, 1, 132, 80])
----------


Process Process-2:
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/mehul/machine_learning/Speech Processing/Neural-Voice-Cloning-with-Few-Samples-personal/venv/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 106, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/home/mehul/machine_learning/Speech Processing/Neural-Voice-Cloning-with-Few-Samples-personal/venv/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 106, in <listcomp>
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/home/mehul/machine_learning/Speech Processing/Neural-Voice-Cloning-With-Few-Samples/dv3/train.py", line 243, in __getitem__
    return text, self.Mel[idx], self.Y[idx], speaker_id
Process Process-1:
  File "/home/m

Traceback (most recent call last):
  File "/home/mehul/machine_learning/Speech Processing/Neural-Voice-Cloning-with-Few-Samples-personal/venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-d742eafd6b9e>", line 5, in <module>
    speaker_ids) in tqdm(enumerate(data_loader_dv3)):
  File "/home/mehul/machine_learning/Speech Processing/Neural-Voice-Cloning-with-Few-Samples-personal/venv/lib/python3.6/site-packages/tqdm/_tqdm.py", line 937, in __iter__
    for obj in iterable:
  File "/home/mehul/machine_learning/Speech Processing/Neural-Voice-Cloning-with-Few-Samples-personal/venv/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 330, in __next__
    idx, batch = self._get_batch()
  File "/home/mehul/machine_learning/Speech Processing/Neural-Voice-Cloning-with-Few-Samples-personal/venv/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 309, in

KeyboardInterrupt: 

Cloned_voices Loaded!
Cloning Texts are produced


In [11]:
from utils import Speech_Dataset
speech_data = Speech_Dataset(all_speakers, speaker_embed)
data_loader = data_utils.DataLoader(speech_data, batch_size=16, shuffle=True, drop_last=True)

(108, 23, 201, 80)


In [12]:
for i_element, element in enumerate(data_loader):
    voice, embed = element[0], element[1]
    print(voice.shape)

    


torch.Size([16, 23, 201, 80])
torch.Size([16, 23, 201, 80])
torch.Size([16, 23, 201, 80])
torch.Size([16, 23, 201, 80])
torch.Size([16, 23, 201, 80])
torch.Size([16, 23, 201, 80])


In [5]:
encoder = Encoder()

In [7]:
a = torch.randn(16,23 , 123 , 80)
a.size()

torch.Size([16, 23, 123, 80])

In [8]:
encoder(a)



torch.Size([16, 23])
torch.Size([16, 23])


NameError: name 'exit' is not defined