In [1]:
import os
import tgt
import glob
import tqdm
import json
import torch
import scipy
import random
import librosa
import sklearn
import speechbrain
import numpy as np
import pyworld as pw
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from text import _clean_text
from pathlib import Path
from IPython.display import Audio
from matplotlib.lines import Line2D
from torch.utils.tensorboard import SummaryWriter
from sklearn.preprocessing import StandardScaler
from speechbrain.lobes.models.FastSpeech2 import mel_spectogram

In [2]:
##############################################
# 1. Paths
##############################################
DATA_PATH           = '/workspace/data/EmoV-DB'
CORPUS_PATH         = '/workspace/montreal_forced_aligner/corpus'
TEXTGRID_PATH       = '/workspace/montreal_forced_aligner/aligned'
PREPROCESSED_PATH   = '/workspace/preprocessed'
EXPERIMENT_PATH     = '/workspace/experiments'


##############################################
# 2. Preprocessing
##############################################
NOISE_SYMBOL        = ' [noise] '
SPEAKERS            = ['bea', 'jenie', 'josh', 'sam']
EMOTIONS            = ['neutral', 'amused', 'angry', 'disgusted', 'sleepy']
SIL_PHONES          = ['sil', 'spn', 'sp', '']
VALID_TOKENS        = ['@'] + speechbrain.utils.text_to_sequence.valid_symbols + SIL_PHONES
PITCH_AVERAGING     = False
ENERGY_AVERAGING    = False
MATCH_TRANSCRIPT    = True


##############################################
# 3. Audio (optimized for vocoder)
##############################################
SAMPLING_RATE       = 16000
HOP_LENGTH          = 256
WIN_LENGTH          = 1024
N_FFT               = 1024
N_MELS              = 80
F_MIN               = 0.0
F_MAX               = 8000.0


##############################################
# 4. Training
##############################################
N_EPOCHS            = 100
MAX_ITERATIONS      = 50000
BATCH_SIZE          = 16
LEARNING_RATE       = 0.000001


##############################################
# 5. Model
##############################################
N_ENCODER_LAYERS    = 4
N_HEADS             = 2
HIDDEN_DIM          = 256
KERNEL_SIZE         = 9
DROPOUT             = 0.1
ALPHA               = 0.1       # mixup
BETA                = 1.0       # rank


##############################################
# 6. Miscellaneous
##############################################
MARKER              = ['o', '^', 's', 'd']
COLORS              = ['#7C00FE', '#F9E400', '#FFAF00', '#F5004F', '#00B2A9']

In [3]:
# train, test 데이터셋 분리
train_list, valid_list = [], []
for speaker in SPEAKERS:
    paths = glob.glob(os.path.join(PREPROCESSED_PATH, speaker, '*.npz'))
    random.shuffle(paths)

    n_train = int(len(paths) * 0.8)
    train_list.extend(paths[:n_train])
    valid_list.extend(paths[n_train:])


if not os.path.exists(os.path.join(PREPROCESSED_PATH, 'fs2_train.txt')):
    # train, valid 데이터셋을 파일로 저장
    with open(os.path.join(PREPROCESSED_PATH, 'fs2_train.txt'), 'w') as f:
        f.write('\n'.join(train_list) + '\n')

    with open(os.path.join(PREPROCESSED_PATH, 'fs2_valid.txt'), 'w') as f:
        f.write('\n'.join(valid_list) + '\n')
else:
    print('Skipping')

Skipping


In [4]:
def phoneme2sequence(phoneme):
    seq = [VALID_TOKENS.index(token) for token in phoneme]
    return seq

def sequence2phoneme(sequence):
    phoneme = [VALID_TOKENS[i] for i in sequence]
    return phoneme

In [5]:
class FastSpeech2Dataset(torch.utils.data.Dataset):

    def __init__(self, mode='train'):
        super(FastSpeech2Dataset, self).__init__()
        
        self.data_paths = []
        with open(os.path.join(PREPROCESSED_PATH, f'fs2_{mode}.txt'), 'r') as f:
            self.data_paths = [line.strip() for line in f.readlines()]
        
    def __len__(self):
        return len(self.data_paths)

    def __getitem__(self, idx):
        data_path = self.data_paths[idx]
        data = np.load(data_path, allow_pickle=True)
        
        # Load features
        mel = data['mel']
        pitch = data['pitch']
        energy = data['energy']
        duration = data['durations']
        phoneme = data['phones'].tolist()

        # metadata
        speaker = data['speaker'].item()
        emotion = data['emotion'].item()
        text = data['transcript'].item().replace(NOISE_SYMBOL.strip(), '').strip()
        audio_path = data['audio_path'].item()

        
        return {
            'mel': torch.FloatTensor(mel),
            'pitch': torch.FloatTensor(pitch),
            'energy': torch.FloatTensor(energy),
            'duration': torch.LongTensor(duration),
            'phoneme': torch.LongTensor(phoneme2sequence(phoneme)),
            'speaker': torch.tensor(SPEAKERS.index(speaker), dtype=torch.long),
            'emotion': torch.tensor(EMOTIONS.index(emotion), dtype=torch.long),
            'text': text,
            'audio_path': audio_path
        }


dataset = FastSpeech2Dataset(mode='train')
for data in tqdm.notebook.tqdm(dataset):
    print('Melspectrogram shape:', data['mel'].shape)
    print('Pitch shape:', data['pitch'].shape)
    print('Energy shape:', data['energy'].shape)
    print('Duration shape:', data['duration'].shape)
    print('Phoneme sequence:', data['phoneme'].shape)
    print('*Total duration:', data['duration'].sum().item())
    print('Speaker index:', data['speaker'])
    print('Emotion index:', data['emotion'])
    print('Text:', data['text'])
    break

  0%|          | 0/5498 [00:00<?, ?it/s]

Melspectrogram shape: torch.Size([80, 305])
Pitch shape: torch.Size([305])
Energy shape: torch.Size([305])
Duration shape: torch.Size([45])
Phoneme sequence: torch.Size([45])
*Total duration: 305
Speaker index: tensor(0)
Emotion index: tensor(4)
Text: the promoter's eyes were heavy, with little puffy bags under them.


In [6]:
class TextMelCollateWithAlignment:

    def __call__(self, batch):

        # Right zero-pad all one-hot text sequences to the max input length
        input_lengths, ids_sorted_decreasing = torch.sort(
            torch.LongTensor([len(x['phoneme']) for x in batch]),
            dim=0, descending=True)

    
        max_input_len = input_lengths[0]

        phoneme_padded = torch.LongTensor(len(batch), max_input_len)
        phoneme_padded.zero_()
        duration_padded = torch.LongTensor(len(batch), max_input_len)
        duration_padded.zero_()

        for i in range(len(ids_sorted_decreasing)):
            phoneme = batch[ids_sorted_decreasing[i]]['phoneme']
            phoneme_padded[i, :phoneme.size(0)] = phoneme
            duration = batch[ids_sorted_decreasing[i]]['duration']
            duration_padded[i, :duration.size(0)] = duration

        # Right zero-pad mel-spec
        num_mels = batch[0]['mel'].size(0)
        max_target_len = max([x['mel'].size(1) for x in batch])

        # include mel padded and gate padded
        mel_padded = torch.FloatTensor(len(batch), num_mels, max_target_len)
        mel_padded.zero_()
        pitch_padded = torch.FloatTensor(len(batch), max_target_len)
        pitch_padded.zero_()
        energy_padded = torch.FloatTensor(len(batch), max_target_len)
        energy_padded.zero_()
        output_lengths = torch.LongTensor(len(batch))
        labels, wavs = [], []
        speakers = torch.LongTensor(len(batch))

        for i in range(len(ids_sorted_decreasing)):
            idx = ids_sorted_decreasing[i]
            mel = batch[idx]['mel']
            pitch = batch[idx]['pitch']
            energy = batch[idx]['energy']
            mel_padded[i, :, :mel.size(1)] = mel
            pitch_padded[i, :pitch.size(0)] = pitch
            energy_padded[i, :energy.size(0)] = energy
            output_lengths[i] = mel.size(1)
            labels.append(batch[idx]['text'])
            wavs.append(batch[idx]['audio_path'])
            speakers[i] = batch[idx]['speaker']

        mel_padded = mel_padded.permute(0, 2, 1)
        return (
            phoneme_padded,
            speakers,
            input_lengths,
            mel_padded,
            pitch_padded,
            energy_padded,
            duration_padded,
            output_lengths,
            labels,
            wavs,
        )

dataset = FastSpeech2Dataset(mode='train')
collate_fn = TextMelCollateWithAlignment()
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=4,
    pin_memory=True,
    drop_last=True,
)

for batch in dataloader:
    phoneme, speakers, input_lengths, mel, pitch, energy, duration, output_lengths, labels, wavs = batch
    print('Phoneme shape:', phoneme.shape)
    print('Input lengths:', input_lengths)
    print('Mel shape:', mel.shape)
    print('Pitch shape:', pitch.shape)
    print('Energy shape:', energy.shape)
    print('Duration shape:', duration.shape)
    print('Output lengths:', output_lengths)
    print('Speakers:', speakers)  # Print first 5 speaker indices
    print('Labels:', labels[:5])  # Print first 5 labels
    print('Wavs:', wavs[:5])      # Print first 5 audio paths
    break

Phoneme shape: torch.Size([16, 58])
Input lengths: tensor([3, 3, 0, 2, 3, 3, 3, 2, 3, 0, 3, 0, 0, 2, 3, 3])
Mel shape: torch.Size([16])
Pitch shape: torch.Size([16, 506, 80])
Energy shape: torch.Size([16, 506])
Duration shape: torch.Size([16, 506])
Output lengths: tensor([[ 5,  4, 11, 13, 15,  5, 10,  5,  8, 26,  3,  7,  3,  7, 12, 25,  6, 12,
          5,  3,  5,  7,  2,  7,  3,  3,  3,  4, 13,  2,  3,  3,  5,  2, 10, 14,
         39,  4,  5,  4, 11, 10,  2,  9,  4,  8, 68,  7,  4,  2,  2,  6, 14, 13,
         10,  6,  4,  3],
        [ 2,  6,  3,  8,  5,  8,  5,  3,  3,  9,  7,  8, 19,  5,  5, 12, 18,  9,
         13, 10,  2, 13, 23,  2,  3,  5,  3, 28,  5,  2, 34,  7, 14,  4, 12, 36,
          7,  9,  6,  8,  7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0],
        [ 3,  2,  8,  3,  7,  6,  5,  2,  4,  5,  5, 11,  3, 13,  5,  9, 12,  5,
          5,  2,  3,  2,  9,  2,  4,  5,  4,  4,  2,  3,  5,  4,  2,  2,  3,  4,
          3,  3, 14,  0,  0,  0,  0

In [7]:
# move to designated device
def batch_to_device(batch, device):
    return (
        batch[0].to(device),  # phoneme
        batch[1].to(device),  # speaker
        batch[2].to(device),  # input_lengths
        batch[3].to(device),  # mel
        batch[4].to(device),  # pitch
        batch[5].to(device),  # energy
        batch[6].to(device),  # duration
        batch[7].to(device),  # output_lengths
        batch[8],             # labels (strings)
        batch[9],             # wavs (file paths)
    )

### FastSpeech2 Model.

In [8]:
"""
Neural network modules for the FastSpeech 2: Fast and High-Quality End-to-End Text to Speech
synthesis model
Authors
* Sathvik Udupa 2022
* Pradnya Kandarkar 2023
* Yingzhi Wang 2023
"""

import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from torch.nn.modules.loss import _Loss

from speechbrain.lobes.models.transformer.Transformer import (
    PositionalEncoding,
    TransformerEncoder,
    get_key_padding_mask,
    get_mask_from_lengths,
)
from speechbrain.nnet import CNN, linear
from speechbrain.nnet.embedding import Embedding
from speechbrain.nnet.losses import bce_loss
from speechbrain.nnet.normalization import LayerNorm

from speechbrain.lobes.models.FastSpeech2 import (
    EncoderPreNet,
    DurationPredictor,
    PostNet,
    upsample,
    average_over_durations,
    SSIMLoss,
)




class FastSpeech2(nn.Module):
    """The FastSpeech2 text-to-speech model.
    This class is the main entry point for the model, which is responsible
    for instantiating all submodules, which, in turn, manage the individual
    neural network layers
    Simplified STRUCTURE: input->token embedding ->encoder ->duration/pitch/energy predictor ->duration
    upsampler -> decoder -> output
    During training, teacher forcing is used (ground truth durations are used for upsampling)

    Arguments
    ---------
    enc_num_layers: int
        number of transformer layers (TransformerEncoderLayer) in encoder
    enc_num_head: int
        number of multi-head-attention (MHA) heads in encoder transformer layers
    enc_d_model: int
        the number of expected features in the encoder
    enc_ffn_dim: int
        the dimension of the feedforward network model
    enc_k_dim: int
        the dimension of the key
    enc_v_dim: int
        the dimension of the value
    enc_dropout: float
        Dropout for the encoder
    dec_num_layers: int
        number of transformer layers (TransformerEncoderLayer) in decoder
    dec_num_head: int
        number of multi-head-attention (MHA) heads in decoder transformer layers
    dec_d_model: int
        the number of expected features in the decoder
    dec_ffn_dim: int
        the dimension of the feedforward network model
    dec_k_dim: int
        the dimension of the key
    dec_v_dim: int
        the dimension of the value
    dec_dropout: float
        dropout for the decoder
    normalize_before: bool
        whether normalization should be applied before or after MHA or FFN in Transformer layers.
    ffn_type: str
        whether to use convolutional layers instead of feed forward network inside transformer layer.
    ffn_cnn_kernel_size_list: list of int
        conv kernel size of 2 1d-convs if ffn_type is 1dcnn
    n_char: int
        the number of symbols for the token embedding
    n_mels: int
        number of bins in mel spectrogram
    postnet_embedding_dim: int
       output feature dimension for convolution layers
    postnet_kernel_size: int
       postnet convolution kernel size
    postnet_n_convolutions: int
       number of convolution layers
    postnet_dropout: float
        dropout probability for postnet
    padding_idx: int
        the index for padding
    dur_pred_kernel_size: int
        the convolution kernel size in duration predictor
    pitch_pred_kernel_size: int
        kernel size for pitch prediction.
    energy_pred_kernel_size: int
        kernel size for energy prediction.
    variance_predictor_dropout: float
        dropout probability for variance predictor (duration/pitch/energy)

    Example
    -------
    >>> import torch
    >>> from speechbrain.lobes.models.FastSpeech2 import FastSpeech2
    >>> model = FastSpeech2(
    ...    enc_num_layers=6,
    ...    enc_num_head=2,
    ...    enc_d_model=384,
    ...    enc_ffn_dim=1536,
    ...    enc_k_dim=384,
    ...    enc_v_dim=384,
    ...    enc_dropout=0.1,
    ...    dec_num_layers=6,
    ...    dec_num_head=2,
    ...    dec_d_model=384,
    ...    dec_ffn_dim=1536,
    ...    dec_k_dim=384,
    ...    dec_v_dim=384,
    ...    dec_dropout=0.1,
    ...    normalize_before=False,
    ...    ffn_type='1dcnn',
    ...    ffn_cnn_kernel_size_list=[9, 1],
    ...    n_char=40,
    ...    n_mels=80,
    ...    postnet_embedding_dim=512,
    ...    postnet_kernel_size=5,
    ...    postnet_n_convolutions=5,
    ...    postnet_dropout=0.5,
    ...    padding_idx=0,
    ...    dur_pred_kernel_size=3,
    ...    pitch_pred_kernel_size=3,
    ...    energy_pred_kernel_size=3,
    ...    variance_predictor_dropout=0.5)
    >>> inputs = torch.tensor([
    ...     [13, 12, 31, 14, 19],
    ...     [31, 16, 30, 31, 0],
    ... ])
    >>> input_lengths = torch.tensor([5, 4])
    >>> durations = torch.tensor([
    ...     [2, 4, 1, 5, 3],
    ...     [1, 2, 4, 3, 0],
    ... ])
    >>> mel_post, postnet_output, predict_durations, predict_pitch, avg_pitch, predict_energy, avg_energy, mel_lens = model(inputs, durations=durations)
    >>> mel_post.shape, predict_durations.shape
    (torch.Size([2, 15, 80]), torch.Size([2, 5]))
    >>> predict_pitch.shape, predict_energy.shape
    (torch.Size([2, 5, 1]), torch.Size([2, 5, 1]))
    """

    def __init__(
        self,
        # encoder parameters
        enc_num_layers,
        enc_num_head,
        enc_d_model,
        enc_ffn_dim,
        enc_k_dim,
        enc_v_dim,
        enc_dropout,
        # decoder parameters
        dec_num_layers,
        dec_num_head,
        dec_d_model,
        dec_ffn_dim,
        dec_k_dim,
        dec_v_dim,
        dec_dropout,
        normalize_before,
        ffn_type,
        ffn_cnn_kernel_size_list,
        n_char,
        n_mels,
        postnet_embedding_dim,
        postnet_kernel_size,
        postnet_n_convolutions,
        postnet_dropout,
        padding_idx,
        dur_pred_kernel_size,
        pitch_pred_kernel_size,
        energy_pred_kernel_size,
        variance_predictor_dropout,
    ):
        super().__init__()
        self.enc_num_head = enc_num_head
        self.dec_num_head = dec_num_head
        self.padding_idx = padding_idx
        self.sinusoidal_positional_embed_encoder = PositionalEncoding(
            enc_d_model
        )
        self.sinusoidal_positional_embed_decoder = PositionalEncoding(
            dec_d_model
        )

        self.speaker_emb = Embedding(
            num_embeddings=len(SPEAKERS),
            embedding_dim=enc_d_model,
            # padding_idx=padding_idx,
        )
        self.encPreNet = EncoderPreNet(
            n_char, padding_idx, out_channels=enc_d_model
        )
        self.durPred = DurationPredictor(
            in_channels=enc_d_model,
            out_channels=enc_d_model,
            kernel_size=dur_pred_kernel_size,
            dropout=variance_predictor_dropout,
        )
        self.pitchPred = DurationPredictor(
            in_channels=enc_d_model,
            out_channels=enc_d_model,
            kernel_size=dur_pred_kernel_size,
            dropout=variance_predictor_dropout,
        )
        self.energyPred = DurationPredictor(
            in_channels=enc_d_model,
            out_channels=enc_d_model,
            kernel_size=dur_pred_kernel_size,
            dropout=variance_predictor_dropout,
        )
        self.pitchEmbed = CNN.Conv1d(
            in_channels=1,
            out_channels=enc_d_model,
            kernel_size=pitch_pred_kernel_size,
            padding="same",
            skip_transpose=True,
        )

        self.energyEmbed = CNN.Conv1d(
            in_channels=1,
            out_channels=enc_d_model,
            kernel_size=energy_pred_kernel_size,
            padding="same",
            skip_transpose=True,
        )
        self.encoder = TransformerEncoder(
            num_layers=enc_num_layers,
            nhead=enc_num_head,
            d_ffn=enc_ffn_dim,
            d_model=enc_d_model,
            kdim=enc_k_dim,
            vdim=enc_v_dim,
            dropout=enc_dropout,
            activation=nn.ReLU,
            normalize_before=normalize_before,
            ffn_type=ffn_type,
            ffn_cnn_kernel_size_list=ffn_cnn_kernel_size_list,
        )

        self.decoder = TransformerEncoder(
            num_layers=dec_num_layers,
            nhead=dec_num_head,
            d_ffn=dec_ffn_dim,
            d_model=dec_d_model,
            kdim=dec_k_dim,
            vdim=dec_v_dim,
            dropout=dec_dropout,
            activation=nn.ReLU,
            normalize_before=normalize_before,
            ffn_type=ffn_type,
            ffn_cnn_kernel_size_list=ffn_cnn_kernel_size_list,
        )

        self.linear = linear.Linear(n_neurons=n_mels, input_size=dec_d_model)
        self.postnet = PostNet(
            n_mel_channels=n_mels,
            postnet_embedding_dim=postnet_embedding_dim,
            postnet_kernel_size=postnet_kernel_size,
            postnet_n_convolutions=postnet_n_convolutions,
            postnet_dropout=postnet_dropout,
        )


    def forward(
        self,
        tokens,
        speakers,
        durations=None,
        pitch=None,
        energy=None,
        pace=1.0,
        pitch_rate=1.0,
        energy_rate=1.0,
    ):
        """forward pass for training and inference

        Arguments
        ---------
        tokens: torch.Tensor
            batch of input tokens
        durations: torch.Tensor
            batch of durations for each token. If it is None, the model will infer on predicted durations
        pitch: torch.Tensor
            batch of pitch for each frame. If it is None, the model will infer on predicted pitches
        energy: torch.Tensor
            batch of energy for each frame. If it is None, the model will infer on predicted energies
        pace: float
            scaling factor for durations
        pitch_rate: float
            scaling factor for pitches
        energy_rate: float
            scaling factor for energies

        Returns
        -------
        mel_post: torch.Tensor
            mel outputs from the decoder
        postnet_output: torch.Tensor
            mel outputs from the postnet
        predict_durations: torch.Tensor
            predicted durations of each token
        predict_pitch: torch.Tensor
            predicted pitches of each token
        avg_pitch: torch.Tensor
            target pitches for each token if input pitch is not None
            None if input pitch is None
        predict_energy: torch.Tensor
            predicted energies of each token
        avg_energy: torch.Tensor
            target energies for each token if input energy is not None
            None if input energy is None
        mel_length:
            predicted lengths of mel spectrograms
        """
        srcmask = get_key_padding_mask(tokens, pad_idx=self.padding_idx)
        srcmask_inverted = (~srcmask).unsqueeze(-1)

        # prenet & encoder
        token_feats = self.encPreNet(tokens)
        pos = self.sinusoidal_positional_embed_encoder(token_feats)
        token_feats = torch.add(token_feats, pos) * srcmask_inverted
        attn_mask = (
            srcmask.unsqueeze(-1)
            .repeat(self.enc_num_head, 1, token_feats.shape[1])
            .permute(0, 2, 1)
            .bool()
        )
        token_feats, _ = self.encoder(
            token_feats, src_mask=attn_mask, src_key_padding_mask=srcmask
        )
        token_feats = token_feats * srcmask_inverted

        # ADD SPEAKER EMBEDDING -- modification.
        token_feats = token_feats + self.speaker_emb(speakers).unsqueeze(1).expand(
            -1, token_feats.shape[1], -1
        )

        # duration predictor
        predict_durations = self.durPred(token_feats, srcmask_inverted).squeeze(
            -1
        )

        if predict_durations.dim() == 1:
            predict_durations = predict_durations.unsqueeze(0)
        if durations is None:
            dur_pred_reverse_log = torch.clamp(
                torch.special.expm1(predict_durations), 0
            )

        # pitch predictor
        avg_pitch = None
        predict_pitch = self.pitchPred(token_feats, srcmask_inverted)
        # use a pitch rate to adjust the pitch
        predict_pitch = predict_pitch * pitch_rate
        if pitch is not None:
            avg_pitch = average_over_durations(pitch.unsqueeze(1), durations)
            pitch = self.pitchEmbed(avg_pitch)
            avg_pitch = avg_pitch.permute(0, 2, 1)
        else:
            pitch = self.pitchEmbed(predict_pitch.permute(0, 2, 1))
        pitch = pitch.permute(0, 2, 1)
        token_feats = token_feats.add(pitch)

        # energy predictor
        avg_energy = None
        predict_energy = self.energyPred(token_feats, srcmask_inverted)
        # use an energy rate to adjust the energy
        predict_energy = predict_energy * energy_rate
        if energy is not None:
            avg_energy = average_over_durations(energy.unsqueeze(1), durations)
            energy = self.energyEmbed(avg_energy)
            avg_energy = avg_energy.permute(0, 2, 1)
        else:
            energy = self.energyEmbed(predict_energy.permute(0, 2, 1))
        energy = energy.permute(0, 2, 1)
        token_feats = token_feats.add(energy)

        # upsamples the durations
        spec_feats, mel_lens = upsample(
            token_feats,
            durations if durations is not None else dur_pred_reverse_log,
            pace=pace,
        )
        srcmask = get_mask_from_lengths(torch.tensor(mel_lens))
        srcmask = srcmask.to(spec_feats.device)
        srcmask_inverted = (~srcmask).unsqueeze(-1)
        attn_mask = (
            srcmask.unsqueeze(-1)
            .repeat(self.dec_num_head, 1, spec_feats.shape[1])
            .permute(0, 2, 1)
            .bool()
        )

        # decoder
        pos = self.sinusoidal_positional_embed_decoder(spec_feats)
        spec_feats = torch.add(spec_feats, pos) * srcmask_inverted

        output_mel_feats, memory, *_ = self.decoder(
            spec_feats, src_mask=attn_mask, src_key_padding_mask=srcmask
        )

        # postnet
        mel_post = self.linear(output_mel_feats) * srcmask_inverted
        postnet_output = self.postnet(mel_post) + mel_post
        return (
            mel_post,
            postnet_output,
            predict_durations,
            predict_pitch,
            avg_pitch,
            predict_energy,
            avg_energy,
            torch.tensor(mel_lens),
        )








class Loss(nn.Module):
    """Loss Computation

    Arguments
    ---------
    log_scale_durations: bool
        applies logarithm to target durations
    ssim_loss_weight: float
        weight for ssim loss
    duration_loss_weight: float
        weight for the duration loss
    pitch_loss_weight: float
        weight for the pitch loss
    energy_loss_weight: float
        weight for the energy loss
    mel_loss_weight: float
        weight for the mel loss
    postnet_mel_loss_weight: float
        weight for the postnet mel loss
    spn_loss_weight: float
        weight for spn loss
    spn_loss_max_epochs: int
        Max number of epochs
    """

    def __init__(
        self,
        log_scale_durations,
        ssim_loss_weight,
        duration_loss_weight,
        pitch_loss_weight,
        energy_loss_weight,
        mel_loss_weight,
        postnet_mel_loss_weight,
        spn_loss_weight=1.0,
        spn_loss_max_epochs=8,
    ):
        super().__init__()

        self.ssim_loss = SSIMLoss()
        self.mel_loss = nn.MSELoss()
        self.postnet_mel_loss = nn.MSELoss()
        self.dur_loss = nn.MSELoss()
        self.pitch_loss = nn.MSELoss()
        self.energy_loss = nn.MSELoss()
        self.log_scale_durations = log_scale_durations
        self.ssim_loss_weight = ssim_loss_weight
        self.mel_loss_weight = mel_loss_weight
        self.postnet_mel_loss_weight = postnet_mel_loss_weight
        self.duration_loss_weight = duration_loss_weight
        self.pitch_loss_weight = pitch_loss_weight
        self.energy_loss_weight = energy_loss_weight
        self.spn_loss_weight = spn_loss_weight
        self.spn_loss_max_epochs = spn_loss_max_epochs


    def forward(self, predictions, targets, current_epoch):
        """Computes the value of the loss function and updates stats

        Arguments
        ---------
        predictions: tuple
            model predictions
        targets: tuple
            ground truth data
        current_epoch: int
            The count of the current epoch.

        Returns
        -------
        loss: torch.Tensor
            the loss value
        """
        (
            mel_target,
            target_durations,
            target_pitch,
            target_energy,
            mel_length,
            phon_len,
            # spn_labels,
        ) = targets
        assert len(mel_target.shape) == 3
        (
            mel_out,
            postnet_mel_out,
            log_durations,
            predicted_pitch,
            average_pitch,
            predicted_energy,
            average_energy,
            mel_lens,
            # spn_preds,
        ) = predictions

        predicted_pitch = predicted_pitch.squeeze(-1)
        predicted_energy = predicted_energy.squeeze(-1)

        target_pitch = average_pitch.squeeze(-1)
        target_energy = average_energy.squeeze(-1)

        log_durations = log_durations.squeeze(-1)
        if self.log_scale_durations:
            log_target_durations = torch.log1p(target_durations.float())
        # change this to perform batch level using padding mask

        for i in range(mel_target.shape[0]):
            if i == 0:
                mel_loss = self.mel_loss(
                    mel_out[i, : mel_length[i], :],
                    mel_target[i, : mel_length[i], :],
                )
                postnet_mel_loss = self.postnet_mel_loss(
                    postnet_mel_out[i, : mel_length[i], :],
                    mel_target[i, : mel_length[i], :],
                )
                dur_loss = self.dur_loss(
                    log_durations[i, : phon_len[i]],
                    log_target_durations[i, : phon_len[i]].to(torch.float32),
                )
                pitch_loss = self.pitch_loss(
                    predicted_pitch[i, : mel_length[i]],
                    target_pitch[i, : mel_length[i]].to(torch.float32),
                )
                energy_loss = self.energy_loss(
                    predicted_energy[i, : mel_length[i]],
                    target_energy[i, : mel_length[i]].to(torch.float32),
                )
            else:
                mel_loss = mel_loss + self.mel_loss(
                    mel_out[i, : mel_length[i], :],
                    mel_target[i, : mel_length[i], :],
                )
                postnet_mel_loss = postnet_mel_loss + self.postnet_mel_loss(
                    postnet_mel_out[i, : mel_length[i], :],
                    mel_target[i, : mel_length[i], :],
                )
                dur_loss = dur_loss + self.dur_loss(
                    log_durations[i, : phon_len[i]],
                    log_target_durations[i, : phon_len[i]].to(torch.float32),
                )
                pitch_loss = pitch_loss + self.pitch_loss(
                    predicted_pitch[i, : mel_length[i]],
                    target_pitch[i, : mel_length[i]].to(torch.float32),
                )
                energy_loss = energy_loss + self.energy_loss(
                    predicted_energy[i, : mel_length[i]],
                    target_energy[i, : mel_length[i]].to(torch.float32),
                )
        ssim_loss = self.ssim_loss(mel_out, mel_target, mel_length)
        mel_loss = torch.div(mel_loss, len(mel_target))
        postnet_mel_loss = torch.div(postnet_mel_loss, len(mel_target))
        dur_loss = torch.div(dur_loss, len(mel_target))
        pitch_loss = torch.div(pitch_loss, len(mel_target))
        energy_loss = torch.div(energy_loss, len(mel_target))

        # spn_loss = bce_loss(spn_preds, spn_labels)
        # if current_epoch > self.spn_loss_max_epochs:
        #     self.spn_loss_weight = 0

        total_loss = (
            ssim_loss * self.ssim_loss_weight
            + mel_loss * self.mel_loss_weight
            + postnet_mel_loss * self.postnet_mel_loss_weight
            + dur_loss * self.duration_loss_weight
            + pitch_loss * self.pitch_loss_weight
            + energy_loss * self.energy_loss_weight
            # + spn_loss * self.spn_loss_weight
        )

        loss = {
            "total_loss": total_loss,
            "ssim_loss": ssim_loss * self.ssim_loss_weight,
            "mel_loss": mel_loss * self.mel_loss_weight,
            "postnet_mel_loss": postnet_mel_loss * self.postnet_mel_loss_weight,
            "dur_loss": dur_loss * self.duration_loss_weight,
            "pitch_loss": pitch_loss * self.pitch_loss_weight,
            "energy_loss": energy_loss * self.energy_loss_weight,
            # "spn_loss": spn_loss * self.spn_loss_weight,
        }
        return loss

## Trainer

In [9]:
import yaml

# Load configuration from YAML file
config_path = os.path.join('/workspace/emo_rank_tts/params.yaml')
with open(config_path, 'r') as file:
    config = yaml.safe_load(file)

In [13]:
from IPython.display import clear_output
from collections import defaultdict

# misc
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# dataset
dataset = FastSpeech2Dataset(mode='train')
collate_fn = TextMelCollateWithAlignment()
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=4,
    pin_memory=True,
    drop_last=True,
)

# model
model = FastSpeech2(**config['fastspeech2']['model']).to(device)

# optimizer
optim = torch.optim.Adam(
    model.parameters(),
    lr=1e-4
)

# loss
criterion = Loss(**config['fastspeech2']['loss']).to(device)


global_step = 0
for epoch in range(1, 400):

    epoch_avg_loss = defaultdict(float)

    for idx, batch in enumerate(tqdm.notebook.tqdm(dataloader)):
        batch = batch_to_device(batch, device)
        phoneme, speakers, phon_len, mel_target, target_pitch, target_energy, target_duration, mel_length, labels, wavs = batch
        global_step += 1

        # Forward pass
        predictions = model(phoneme, speakers, target_duration, target_pitch, target_energy)


        # Compute loss
        targets = (mel_target, target_duration, target_pitch, target_energy, mel_length, phon_len)

        optim.zero_grad()
        loss = criterion(predictions, targets, epoch)
        loss['total_loss'].backward()
        optim.step()

        # Accumulate loss
        for loss_name, loss_value in loss.items():
            epoch_avg_loss[loss_name] += loss_value

        # Print predicted mels
        if idx == 0:
            melspecs = predictions[0].cpu().detach().numpy()
            y_melspecs = mel_target.cpu().detach().numpy()
            all_melspecs = np.concatenate((melspecs, y_melspecs), axis=0)
            fig, axes = plt.subplots(4, 4, figsize=(16, 10))
            for ax_idx, (ax, mel) in enumerate(zip(axes.flatten(), all_melspecs)):
                ax.imshow(mel.T, aspect='auto', origin='lower', interpolation='none')
                if ax_idx < len(melspecs):
                    label = f"Pred {ax_idx + 1}"
                    color = 'blue'
                else:
                    label = f"GT {ax_idx - len(melspecs) + 1}"
                    color = 'red'

                ax.text(
                    0.95, 0.95, label,
                    horizontalalignment='right',
                    verticalalignment='top',
                    transform=ax.transAxes,
                    fontsize=12,
                    fontweight='bold',
                    color=color,
                )
                # ax.set_title('Generated Mel-Spectrogram')
                # ax.axis('off')
            plt.tight_layout()
            plt.savefig('epoch_{}.png'.format(epoch))
            plt.close()

        # end of epoch
    
    epoch_avg_loss = {k: v / len(dataloader) for k, v in epoch_avg_loss.items()}
    print("=" * 50)
    print("Epoch: {}".format(epoch))
    print("=" * 50)
    for loss_name, loss_value in epoch_avg_loss.items():
        print("{:<30s}{:>20.4f}".format(loss_name, loss_value))
    print("=" * 50, '\n\n')

  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 1
total_loss                                  8.6915
ssim_loss                                   0.4338
mel_loss                                    2.8304
postnet_mel_loss                            3.6387
dur_loss                                    0.3377
pitch_loss                                  0.9505
energy_loss                                 0.5005




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 2
total_loss                                  5.2044
ssim_loss                                   0.3431
mel_loss                                    1.3889
postnet_mel_loss                            2.0560
dur_loss                                    0.2241
pitch_loss                                  0.8657
energy_loss                                 0.3266




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 3
total_loss                                  4.5900
ssim_loss                                   0.3241
mel_loss                                    1.2164
postnet_mel_loss                            1.6996
dur_loss                                    0.2010
pitch_loss                                  0.8445
energy_loss                                 0.3043




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 4
total_loss                                  4.2138
ssim_loss                                   0.3098
mel_loss                                    1.1126
postnet_mel_loss                            1.4844
dur_loss                                    0.1876
pitch_loss                                  0.8282
energy_loss                                 0.2912




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 5
total_loss                                  3.9137
ssim_loss                                   0.3001
mel_loss                                    1.0298
postnet_mel_loss                            1.3160
dur_loss                                    0.1781
pitch_loss                                  0.8053
energy_loss                                 0.2843




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 6
total_loss                                  3.6648
ssim_loss                                   0.2884
mel_loss                                    0.9607
postnet_mel_loss                            1.1783
dur_loss                                    0.1697
pitch_loss                                  0.7911
energy_loss                                 0.2766




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 7
total_loss                                  3.4301
ssim_loss                                   0.2784
mel_loss                                    0.8991
postnet_mel_loss                            1.0601
dur_loss                                    0.1632
pitch_loss                                  0.7627
energy_loss                                 0.2665




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 8
total_loss                                  3.2336
ssim_loss                                   0.2685
mel_loss                                    0.8456
postnet_mel_loss                            0.9608
dur_loss                                    0.1597
pitch_loss                                  0.7392
energy_loss                                 0.2599




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 9
total_loss                                  3.0458
ssim_loss                                   0.2607
mel_loss                                    0.7964
postnet_mel_loss                            0.8749
dur_loss                                    0.1551
pitch_loss                                  0.7074
energy_loss                                 0.2512




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 10
total_loss                                  2.9043
ssim_loss                                   0.2533
mel_loss                                    0.7618
postnet_mel_loss                            0.8121
dur_loss                                    0.1509
pitch_loss                                  0.6801
energy_loss                                 0.2461




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 11
total_loss                                  2.7646
ssim_loss                                   0.2477
mel_loss                                    0.7255
postnet_mel_loss                            0.7553
dur_loss                                    0.1474
pitch_loss                                  0.6505
energy_loss                                 0.2382




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 12
total_loss                                  2.6295
ssim_loss                                   0.2411
mel_loss                                    0.6923
postnet_mel_loss                            0.7082
dur_loss                                    0.1441
pitch_loss                                  0.6128
energy_loss                                 0.2310




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 13
total_loss                                  2.5429
ssim_loss                                   0.2353
mel_loss                                    0.6697
postnet_mel_loss                            0.6771
dur_loss                                    0.1414
pitch_loss                                  0.5930
energy_loss                                 0.2263




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 14
total_loss                                  2.4506
ssim_loss                                   0.2304
mel_loss                                    0.6462
postnet_mel_loss                            0.6492
dur_loss                                    0.1387
pitch_loss                                  0.5683
energy_loss                                 0.2179




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 15
total_loss                                  2.3473
ssim_loss                                   0.2268
mel_loss                                    0.6224
postnet_mel_loss                            0.6236
dur_loss                                    0.1355
pitch_loss                                  0.5281
energy_loss                                 0.2109




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 16
total_loss                                  2.2595
ssim_loss                                   0.2228
mel_loss                                    0.5992
postnet_mel_loss                            0.5999
dur_loss                                    0.1326
pitch_loss                                  0.5011
energy_loss                                 0.2039




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 17
total_loss                                  2.1809
ssim_loss                                   0.2159
mel_loss                                    0.5817
postnet_mel_loss                            0.5822
dur_loss                                    0.1297
pitch_loss                                  0.4750
energy_loss                                 0.1964




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 18
total_loss                                  2.1309
ssim_loss                                   0.2147
mel_loss                                    0.5711
postnet_mel_loss                            0.5716
dur_loss                                    0.1268
pitch_loss                                  0.4570
energy_loss                                 0.1898




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 19
total_loss                                  2.0894
ssim_loss                                   0.2113
mel_loss                                    0.5667
postnet_mel_loss                            0.5672
dur_loss                                    0.1249
pitch_loss                                  0.4353
energy_loss                                 0.1841




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 20
total_loss                                  1.9949
ssim_loss                                   0.2053
mel_loss                                    0.5392
postnet_mel_loss                            0.5396
dur_loss                                    0.1223
pitch_loss                                  0.4108
energy_loss                                 0.1776




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 21
total_loss                                  1.9413
ssim_loss                                   0.2017
mel_loss                                    0.5253
postnet_mel_loss                            0.5257
dur_loss                                    0.1210
pitch_loss                                  0.3952
energy_loss                                 0.1724




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 22
total_loss                                  1.9051
ssim_loss                                   0.2001
mel_loss                                    0.5132
postnet_mel_loss                            0.5136
dur_loss                                    0.1186
pitch_loss                                  0.3918
energy_loss                                 0.1679




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 23
total_loss                                  1.8460
ssim_loss                                   0.1961
mel_loss                                    0.4997
postnet_mel_loss                            0.5003
dur_loss                                    0.1152
pitch_loss                                  0.3732
energy_loss                                 0.1614




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 24
total_loss                                  1.8001
ssim_loss                                   0.1942
mel_loss                                    0.4898
postnet_mel_loss                            0.4901
dur_loss                                    0.1134
pitch_loss                                  0.3572
energy_loss                                 0.1553




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 25
total_loss                                  1.7799
ssim_loss                                   0.1917
mel_loss                                    0.4840
postnet_mel_loss                            0.4843
dur_loss                                    0.1118
pitch_loss                                  0.3556
energy_loss                                 0.1525




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 26
total_loss                                  1.7450
ssim_loss                                   0.1901
mel_loss                                    0.4752
postnet_mel_loss                            0.4755
dur_loss                                    0.1100
pitch_loss                                  0.3457
energy_loss                                 0.1484




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 27
total_loss                                  1.6905
ssim_loss                                   0.1871
mel_loss                                    0.4600
postnet_mel_loss                            0.4602
dur_loss                                    0.1074
pitch_loss                                  0.3326
energy_loss                                 0.1432




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 28
total_loss                                  1.6453
ssim_loss                                   0.1838
mel_loss                                    0.4496
postnet_mel_loss                            0.4499
dur_loss                                    0.1052
pitch_loss                                  0.3171
energy_loss                                 0.1398




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 29
total_loss                                  1.6168
ssim_loss                                   0.1815
mel_loss                                    0.4407
postnet_mel_loss                            0.4409
dur_loss                                    0.1034
pitch_loss                                  0.3162
energy_loss                                 0.1341




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 30
total_loss                                  1.5655
ssim_loss                                   0.1772
mel_loss                                    0.4264
postnet_mel_loss                            0.4267
dur_loss                                    0.1014
pitch_loss                                  0.3040
energy_loss                                 0.1298




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 31
total_loss                                  1.5553
ssim_loss                                   0.1778
mel_loss                                    0.4297
postnet_mel_loss                            0.4300
dur_loss                                    0.0997
pitch_loss                                  0.2914
energy_loss                                 0.1267




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 32
total_loss                                  1.5469
ssim_loss                                   0.1765
mel_loss                                    0.4270
postnet_mel_loss                            0.4272
dur_loss                                    0.0987
pitch_loss                                  0.2932
energy_loss                                 0.1243




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 33
total_loss                                  1.4794
ssim_loss                                   0.1715
mel_loss                                    0.4062
postnet_mel_loss                            0.4064
dur_loss                                    0.0957
pitch_loss                                  0.2800
energy_loss                                 0.1197




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 34
total_loss                                  1.4606
ssim_loss                                   0.1700
mel_loss                                    0.4021
postnet_mel_loss                            0.4023
dur_loss                                    0.0939
pitch_loss                                  0.2751
energy_loss                                 0.1173




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 35
total_loss                                  1.4402
ssim_loss                                   0.1688
mel_loss                                    0.3998
postnet_mel_loss                            0.4000
dur_loss                                    0.0919
pitch_loss                                  0.2656
energy_loss                                 0.1140




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 36
total_loss                                  1.4056
ssim_loss                                   0.1667
mel_loss                                    0.3879
postnet_mel_loss                            0.3881
dur_loss                                    0.0907
pitch_loss                                  0.2598
energy_loss                                 0.1124




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 37
total_loss                                  1.4204
ssim_loss                                   0.1659
mel_loss                                    0.3930
postnet_mel_loss                            0.3932
dur_loss                                    0.0892
pitch_loss                                  0.2666
energy_loss                                 0.1124




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 38
total_loss                                  1.3823
ssim_loss                                   0.1630
mel_loss                                    0.3830
postnet_mel_loss                            0.3832
dur_loss                                    0.0879
pitch_loss                                  0.2570
energy_loss                                 0.1084




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 39
total_loss                                  1.3321
ssim_loss                                   0.1601
mel_loss                                    0.3687
postnet_mel_loss                            0.3689
dur_loss                                    0.0853
pitch_loss                                  0.2454
energy_loss                                 0.1036




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 40
total_loss                                  1.3333
ssim_loss                                   0.1584
mel_loss                                    0.3686
postnet_mel_loss                            0.3688
dur_loss                                    0.0846
pitch_loss                                  0.2505
energy_loss                                 0.1024




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 41
total_loss                                  1.3443
ssim_loss                                   0.1608
mel_loss                                    0.3797
postnet_mel_loss                            0.3799
dur_loss                                    0.0845
pitch_loss                                  0.2385
energy_loss                                 0.1009




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 42
total_loss                                  1.3032
ssim_loss                                   0.1586
mel_loss                                    0.3648
postnet_mel_loss                            0.3649
dur_loss                                    0.0821
pitch_loss                                  0.2327
energy_loss                                 0.1001




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 43
total_loss                                  1.2620
ssim_loss                                   0.1545
mel_loss                                    0.3511
postnet_mel_loss                            0.3513
dur_loss                                    0.0790
pitch_loss                                  0.2303
energy_loss                                 0.0958




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 44
total_loss                                  1.2622
ssim_loss                                   0.1549
mel_loss                                    0.3520
postnet_mel_loss                            0.3522
dur_loss                                    0.0793
pitch_loss                                  0.2301
energy_loss                                 0.0937




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 45
total_loss                                  1.2488
ssim_loss                                   0.1530
mel_loss                                    0.3472
postnet_mel_loss                            0.3540
dur_loss                                    0.0774
pitch_loss                                  0.2251
energy_loss                                 0.0921




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 46
total_loss                                  1.2556
ssim_loss                                   0.1509
mel_loss                                    0.3434
postnet_mel_loss                            0.3647
dur_loss                                    0.0764
pitch_loss                                  0.2293
energy_loss                                 0.0910




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 47
total_loss                                  1.2107
ssim_loss                                   0.1491
mel_loss                                    0.3355
postnet_mel_loss                            0.3384
dur_loss                                    0.0744
pitch_loss                                  0.2253
energy_loss                                 0.0880




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 48
total_loss                                  1.1915
ssim_loss                                   0.1480
mel_loss                                    0.3313
postnet_mel_loss                            0.3318
dur_loss                                    0.0737
pitch_loss                                  0.2196
energy_loss                                 0.0871




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 49
total_loss                                  1.1959
ssim_loss                                   0.1487
mel_loss                                    0.3359
postnet_mel_loss                            0.3361
dur_loss                                    0.0732
pitch_loss                                  0.2147
energy_loss                                 0.0873




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 50
total_loss                                  1.1825
ssim_loss                                   0.1484
mel_loss                                    0.3321
postnet_mel_loss                            0.3323
dur_loss                                    0.0720
pitch_loss                                  0.2126
energy_loss                                 0.0851




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 51
total_loss                                  1.1633
ssim_loss                                   0.1455
mel_loss                                    0.3269
postnet_mel_loss                            0.3271
dur_loss                                    0.0709
pitch_loss                                  0.2100
energy_loss                                 0.0830




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 52
total_loss                                  1.1260
ssim_loss                                   0.1428
mel_loss                                    0.3149
postnet_mel_loss                            0.3151
dur_loss                                    0.0687
pitch_loss                                  0.2042
energy_loss                                 0.0802




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 53
total_loss                                  1.1135
ssim_loss                                   0.1411
mel_loss                                    0.3109
postnet_mel_loss                            0.3111
dur_loss                                    0.0677
pitch_loss                                  0.2034
energy_loss                                 0.0794




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 54
total_loss                                  1.1130
ssim_loss                                   0.1417
mel_loss                                    0.3106
postnet_mel_loss                            0.3108
dur_loss                                    0.0671
pitch_loss                                  0.2041
energy_loss                                 0.0788




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 55
total_loss                                  1.1106
ssim_loss                                   0.1410
mel_loss                                    0.3097
postnet_mel_loss                            0.3099
dur_loss                                    0.0664
pitch_loss                                  0.2060
energy_loss                                 0.0776




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 56
total_loss                                  1.1016
ssim_loss                                   0.1407
mel_loss                                    0.3100
postnet_mel_loss                            0.3102
dur_loss                                    0.0660
pitch_loss                                  0.1974
energy_loss                                 0.0772




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 57
total_loss                                  1.0832
ssim_loss                                   0.1379
mel_loss                                    0.3038
postnet_mel_loss                            0.3040
dur_loss                                    0.0645
pitch_loss                                  0.1975
energy_loss                                 0.0755




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 58
total_loss                                  1.0657
ssim_loss                                   0.1366
mel_loss                                    0.2978
postnet_mel_loss                            0.2980
dur_loss                                    0.0629
pitch_loss                                  0.1965
energy_loss                                 0.0738




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 59
total_loss                                  1.0589
ssim_loss                                   0.1357
mel_loss                                    0.2951
postnet_mel_loss                            0.2952
dur_loss                                    0.0627
pitch_loss                                  0.1961
energy_loss                                 0.0740




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 60
total_loss                                  1.0630
ssim_loss                                   0.1367
mel_loss                                    0.2984
postnet_mel_loss                            0.2986
dur_loss                                    0.0622
pitch_loss                                  0.1939
energy_loss                                 0.0732




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 61
total_loss                                  1.0512
ssim_loss                                   0.1354
mel_loss                                    0.2947
postnet_mel_loss                            0.2948
dur_loss                                    0.0608
pitch_loss                                  0.1937
energy_loss                                 0.0717




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 62
total_loss                                  1.0316
ssim_loss                                   0.1339
mel_loss                                    0.2882
postnet_mel_loss                            0.2883
dur_loss                                    0.0606
pitch_loss                                  0.1901
energy_loss                                 0.0705




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 63
total_loss                                  1.0310
ssim_loss                                   0.1343
mel_loss                                    0.2902
postnet_mel_loss                            0.2904
dur_loss                                    0.0592
pitch_loss                                  0.1875
energy_loss                                 0.0695




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 64
total_loss                                  1.0210
ssim_loss                                   0.1337
mel_loss                                    0.2891
postnet_mel_loss                            0.2893
dur_loss                                    0.0585
pitch_loss                                  0.1816
energy_loss                                 0.0688




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 65
total_loss                                  0.9985
ssim_loss                                   0.1310
mel_loss                                    0.2807
postnet_mel_loss                            0.2808
dur_loss                                    0.0575
pitch_loss                                  0.1815
energy_loss                                 0.0670




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 66
total_loss                                  0.9880
ssim_loss                                   0.1297
mel_loss                                    0.2761
postnet_mel_loss                            0.2762
dur_loss                                    0.0572
pitch_loss                                  0.1815
energy_loss                                 0.0672




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 67
total_loss                                  1.0073
ssim_loss                                   0.1309
mel_loss                                    0.2815
postnet_mel_loss                            0.2817
dur_loss                                    0.0575
pitch_loss                                  0.1889
energy_loss                                 0.0668




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 68
total_loss                                  0.9882
ssim_loss                                   0.1300
mel_loss                                    0.2772
postnet_mel_loss                            0.2773
dur_loss                                    0.0569
pitch_loss                                  0.1813
energy_loss                                 0.0655




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 69
total_loss                                  0.9731
ssim_loss                                   0.1279
mel_loss                                    0.2725
postnet_mel_loss                            0.2726
dur_loss                                    0.0556
pitch_loss                                  0.1810
energy_loss                                 0.0635




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 70
total_loss                                  0.9715
ssim_loss                                   0.1281
mel_loss                                    0.2743
postnet_mel_loss                            0.2744
dur_loss                                    0.0547
pitch_loss                                  0.1774
energy_loss                                 0.0626




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 71
total_loss                                  0.9581
ssim_loss                                   0.1270
mel_loss                                    0.2675
postnet_mel_loss                            0.2676
dur_loss                                    0.0540
pitch_loss                                  0.1796
energy_loss                                 0.0623




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 72
total_loss                                  0.9575
ssim_loss                                   0.1271
mel_loss                                    0.2672
postnet_mel_loss                            0.2673
dur_loss                                    0.0541
pitch_loss                                  0.1785
energy_loss                                 0.0632




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 73
total_loss                                  0.9415
ssim_loss                                   0.1251
mel_loss                                    0.2629
postnet_mel_loss                            0.2630
dur_loss                                    0.0531
pitch_loss                                  0.1760
energy_loss                                 0.0613




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 74
total_loss                                  0.9496
ssim_loss                                   0.1249
mel_loss                                    0.2630
postnet_mel_loss                            0.2630
dur_loss                                    0.0538
pitch_loss                                  0.1823
energy_loss                                 0.0625




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 75
total_loss                                  0.9526
ssim_loss                                   0.1252
mel_loss                                    0.2646
postnet_mel_loss                            0.2647
dur_loss                                    0.0533
pitch_loss                                  0.1842
energy_loss                                 0.0607




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 76
total_loss                                  0.9424
ssim_loss                                   0.1244
mel_loss                                    0.2628
postnet_mel_loss                            0.2628
dur_loss                                    0.0523
pitch_loss                                  0.1810
energy_loss                                 0.0591




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 77
total_loss                                  0.9307
ssim_loss                                   0.1229
mel_loss                                    0.2584
postnet_mel_loss                            0.2585
dur_loss                                    0.0516
pitch_loss                                  0.1796
energy_loss                                 0.0596




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 78
total_loss                                  0.9199
ssim_loss                                   0.1221
mel_loss                                    0.2556
postnet_mel_loss                            0.2557
dur_loss                                    0.0521
pitch_loss                                  0.1755
energy_loss                                 0.0589




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 79
total_loss                                  0.9161
ssim_loss                                   0.1219
mel_loss                                    0.2563
postnet_mel_loss                            0.2564
dur_loss                                    0.0509
pitch_loss                                  0.1730
energy_loss                                 0.0576




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 80
total_loss                                  0.9394
ssim_loss                                   0.1232
mel_loss                                    0.2634
postnet_mel_loss                            0.2634
dur_loss                                    0.0513
pitch_loss                                  0.1790
energy_loss                                 0.0590




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 81
total_loss                                  0.9383
ssim_loss                                   0.1235
mel_loss                                    0.2646
postnet_mel_loss                            0.2647
dur_loss                                    0.0512
pitch_loss                                  0.1758
energy_loss                                 0.0586




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 82
total_loss                                  0.8802
ssim_loss                                   0.1198
mel_loss                                    0.2462
postnet_mel_loss                            0.2463
dur_loss                                    0.0487
pitch_loss                                  0.1639
energy_loss                                 0.0553




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 83
total_loss                                  0.8682
ssim_loss                                   0.1179
mel_loss                                    0.2421
postnet_mel_loss                            0.2421
dur_loss                                    0.0480
pitch_loss                                  0.1634
energy_loss                                 0.0547




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 84
total_loss                                  0.8723
ssim_loss                                   0.1177
mel_loss                                    0.2425
postnet_mel_loss                            0.2425
dur_loss                                    0.0483
pitch_loss                                  0.1664
energy_loss                                 0.0549




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 85
total_loss                                  0.8827
ssim_loss                                   0.1188
mel_loss                                    0.2461
postnet_mel_loss                            0.2462
dur_loss                                    0.0485
pitch_loss                                  0.1673
energy_loss                                 0.0558




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 86
total_loss                                  0.8825
ssim_loss                                   0.1181
mel_loss                                    0.2472
postnet_mel_loss                            0.2473
dur_loss                                    0.0483
pitch_loss                                  0.1668
energy_loss                                 0.0548




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 87
total_loss                                  0.8743
ssim_loss                                   0.1174
mel_loss                                    0.2443
postnet_mel_loss                            0.2444
dur_loss                                    0.0475
pitch_loss                                  0.1661
energy_loss                                 0.0546




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 88
total_loss                                  0.9061
ssim_loss                                   0.1216
mel_loss                                    0.2598
postnet_mel_loss                            0.2598
dur_loss                                    0.0482
pitch_loss                                  0.1621
energy_loss                                 0.0547




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 89
total_loss                                  0.8642
ssim_loss                                   0.1173
mel_loss                                    0.2427
postnet_mel_loss                            0.2428
dur_loss                                    0.0470
pitch_loss                                  0.1605
energy_loss                                 0.0538




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 90
total_loss                                  0.8457
ssim_loss                                   0.1148
mel_loss                                    0.2354
postnet_mel_loss                            0.2355
dur_loss                                    0.0464
pitch_loss                                  0.1618
energy_loss                                 0.0519




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 91
total_loss                                  0.8394
ssim_loss                                   0.1142
mel_loss                                    0.2339
postnet_mel_loss                            0.2340
dur_loss                                    0.0461
pitch_loss                                  0.1588
energy_loss                                 0.0524




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 92
total_loss                                  0.8394
ssim_loss                                   0.1138
mel_loss                                    0.2339
postnet_mel_loss                            0.2340
dur_loss                                    0.0459
pitch_loss                                  0.1598
energy_loss                                 0.0521




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 93
total_loss                                  0.8408
ssim_loss                                   0.1143
mel_loss                                    0.2345
postnet_mel_loss                            0.2346
dur_loss                                    0.0455
pitch_loss                                  0.1607
energy_loss                                 0.0513




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 94
total_loss                                  0.8378
ssim_loss                                   0.1140
mel_loss                                    0.2345
postnet_mel_loss                            0.2345
dur_loss                                    0.0451
pitch_loss                                  0.1589
energy_loss                                 0.0507




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 95
total_loss                                  0.8453
ssim_loss                                   0.1140
mel_loss                                    0.2347
postnet_mel_loss                            0.2348
dur_loss                                    0.0456
pitch_loss                                  0.1652
energy_loss                                 0.0511




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 96
total_loss                                  0.8407
ssim_loss                                   0.1137
mel_loss                                    0.2352
postnet_mel_loss                            0.2353
dur_loss                                    0.0449
pitch_loss                                  0.1611
energy_loss                                 0.0504




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 97
total_loss                                  0.8361
ssim_loss                                   0.1136
mel_loss                                    0.2327
postnet_mel_loss                            0.2328
dur_loss                                    0.0449
pitch_loss                                  0.1617
energy_loss                                 0.0502




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 98
total_loss                                  0.8198
ssim_loss                                   0.1124
mel_loss                                    0.2293
postnet_mel_loss                            0.2293
dur_loss                                    0.0441
pitch_loss                                  0.1556
energy_loss                                 0.0491




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 99
total_loss                                  0.8162
ssim_loss                                   0.1113
mel_loss                                    0.2275
postnet_mel_loss                            0.2276
dur_loss                                    0.0438
pitch_loss                                  0.1571
energy_loss                                 0.0489




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 100
total_loss                                  0.8177
ssim_loss                                   0.1117
mel_loss                                    0.2294
postnet_mel_loss                            0.2294
dur_loss                                    0.0434
pitch_loss                                  0.1552
energy_loss                                 0.0485




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 101
total_loss                                  0.8283
ssim_loss                                   0.1133
mel_loss                                    0.2339
postnet_mel_loss                            0.2339
dur_loss                                    0.0438
pitch_loss                                  0.1546
energy_loss                                 0.0488




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 102
total_loss                                  0.8083
ssim_loss                                   0.1113
mel_loss                                    0.2265
postnet_mel_loss                            0.2265
dur_loss                                    0.0432
pitch_loss                                  0.1531
energy_loss                                 0.0477




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 103
total_loss                                  0.8010
ssim_loss                                   0.1100
mel_loss                                    0.2234
postnet_mel_loss                            0.2235
dur_loss                                    0.0430
pitch_loss                                  0.1533
energy_loss                                 0.0480




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 104
total_loss                                  0.8074
ssim_loss                                   0.1104
mel_loss                                    0.2280
postnet_mel_loss                            0.2280
dur_loss                                    0.0426
pitch_loss                                  0.1515
energy_loss                                 0.0469




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 105
total_loss                                  0.8047
ssim_loss                                   0.1107
mel_loss                                    0.2265
postnet_mel_loss                            0.2265
dur_loss                                    0.0421
pitch_loss                                  0.1523
energy_loss                                 0.0465




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 106
total_loss                                  0.7892
ssim_loss                                   0.1089
mel_loss                                    0.2207
postnet_mel_loss                            0.2207
dur_loss                                    0.0420
pitch_loss                                  0.1511
energy_loss                                 0.0458




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 107
total_loss                                  0.7938
ssim_loss                                   0.1096
mel_loss                                    0.2224
postnet_mel_loss                            0.2225
dur_loss                                    0.0420
pitch_loss                                  0.1516
energy_loss                                 0.0457




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 108
total_loss                                  0.7909
ssim_loss                                   0.1086
mel_loss                                    0.2196
postnet_mel_loss                            0.2197
dur_loss                                    0.0420
pitch_loss                                  0.1548
energy_loss                                 0.0462




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 109
total_loss                                  0.7842
ssim_loss                                   0.1076
mel_loss                                    0.2165
postnet_mel_loss                            0.2165
dur_loss                                    0.0417
pitch_loss                                  0.1565
energy_loss                                 0.0454




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 110
total_loss                                  0.7783
ssim_loss                                   0.1069
mel_loss                                    0.2174
postnet_mel_loss                            0.2175
dur_loss                                    0.0414
pitch_loss                                  0.1500
energy_loss                                 0.0451




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 111
total_loss                                  0.7813
ssim_loss                                   0.1069
mel_loss                                    0.2162
postnet_mel_loss                            0.2163
dur_loss                                    0.0408
pitch_loss                                  0.1564
energy_loss                                 0.0446




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 112
total_loss                                  0.7814
ssim_loss                                   0.1075
mel_loss                                    0.2192
postnet_mel_loss                            0.2192
dur_loss                                    0.0406
pitch_loss                                  0.1510
energy_loss                                 0.0439




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 113
total_loss                                  0.8075
ssim_loss                                   0.1109
mel_loss                                    0.2307
postnet_mel_loss                            0.2307
dur_loss                                    0.0414
pitch_loss                                  0.1491
energy_loss                                 0.0447




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 114
total_loss                                  0.8226
ssim_loss                                   0.1123
mel_loss                                    0.2367
postnet_mel_loss                            0.2368
dur_loss                                    0.0416
pitch_loss                                  0.1508
energy_loss                                 0.0444




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 115
total_loss                                  0.7695
ssim_loss                                   0.1067
mel_loss                                    0.2164
postnet_mel_loss                            0.2164
dur_loss                                    0.0398
pitch_loss                                  0.1472
energy_loss                                 0.0430




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 116
total_loss                                  0.7458
ssim_loss                                   0.1042
mel_loss                                    0.2074
postnet_mel_loss                            0.2074
dur_loss                                    0.0393
pitch_loss                                  0.1454
energy_loss                                 0.0422




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 117
total_loss                                  0.7450
ssim_loss                                   0.1045
mel_loss                                    0.2068
postnet_mel_loss                            0.2068
dur_loss                                    0.0392
pitch_loss                                  0.1458
energy_loss                                 0.0419




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 118
total_loss                                  0.7567
ssim_loss                                   0.1054
mel_loss                                    0.2099
postnet_mel_loss                            0.2099
dur_loss                                    0.0395
pitch_loss                                  0.1494
energy_loss                                 0.0426




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 119
total_loss                                  0.7567
ssim_loss                                   0.1048
mel_loss                                    0.2105
postnet_mel_loss                            0.2105
dur_loss                                    0.0394
pitch_loss                                  0.1489
energy_loss                                 0.0427




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 120
total_loss                                  0.7595
ssim_loss                                   0.1049
mel_loss                                    0.2108
postnet_mel_loss                            0.2109
dur_loss                                    0.0393
pitch_loss                                  0.1511
energy_loss                                 0.0425




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 121
total_loss                                  0.7523
ssim_loss                                   0.1035
mel_loss                                    0.2092
postnet_mel_loss                            0.2092
dur_loss                                    0.0393
pitch_loss                                  0.1486
energy_loss                                 0.0426




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 122
total_loss                                  0.7470
ssim_loss                                   0.1035
mel_loss                                    0.2073
postnet_mel_loss                            0.2074
dur_loss                                    0.0392
pitch_loss                                  0.1480
energy_loss                                 0.0416




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 123
total_loss                                  0.7408
ssim_loss                                   0.1027
mel_loss                                    0.2064
postnet_mel_loss                            0.2064
dur_loss                                    0.0387
pitch_loss                                  0.1449
energy_loss                                 0.0416




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 124
total_loss                                  0.7452
ssim_loss                                   0.1043
mel_loss                                    0.2072
postnet_mel_loss                            0.2072
dur_loss                                    0.0382
pitch_loss                                  0.1471
energy_loss                                 0.0412




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 125
total_loss                                  0.7399
ssim_loss                                   0.1033
mel_loss                                    0.2070
postnet_mel_loss                            0.2070
dur_loss                                    0.0386
pitch_loss                                  0.1432
energy_loss                                 0.0409




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 126
total_loss                                  0.7473
ssim_loss                                   0.1037
mel_loss                                    0.2096
postnet_mel_loss                            0.2096
dur_loss                                    0.0385
pitch_loss                                  0.1450
energy_loss                                 0.0409




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 127
total_loss                                  0.7585
ssim_loss                                   0.1046
mel_loss                                    0.2145
postnet_mel_loss                            0.2146
dur_loss                                    0.0387
pitch_loss                                  0.1448
energy_loss                                 0.0413




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 128
total_loss                                  0.7307
ssim_loss                                   0.1020
mel_loss                                    0.2043
postnet_mel_loss                            0.2043
dur_loss                                    0.0378
pitch_loss                                  0.1429
energy_loss                                 0.0395




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 129
total_loss                                  0.7304
ssim_loss                                   0.1015
mel_loss                                    0.2011
postnet_mel_loss                            0.2012
dur_loss                                    0.0375
pitch_loss                                  0.1498
energy_loss                                 0.0394




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 130
total_loss                                  0.7362
ssim_loss                                   0.1023
mel_loss                                    0.2035
postnet_mel_loss                            0.2035
dur_loss                                    0.0380
pitch_loss                                  0.1490
energy_loss                                 0.0401




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 131
total_loss                                  0.7262
ssim_loss                                   0.1026
mel_loss                                    0.2027
postnet_mel_loss                            0.2027
dur_loss                                    0.0369
pitch_loss                                  0.1420
energy_loss                                 0.0393




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 132
total_loss                                  0.7206
ssim_loss                                   0.1012
mel_loss                                    0.2005
postnet_mel_loss                            0.2005
dur_loss                                    0.0370
pitch_loss                                  0.1424
energy_loss                                 0.0391




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 133
total_loss                                  0.7131
ssim_loss                                   0.0999
mel_loss                                    0.1987
postnet_mel_loss                            0.1987
dur_loss                                    0.0369
pitch_loss                                  0.1406
energy_loss                                 0.0382




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 134
total_loss                                  0.7199
ssim_loss                                   0.1005
mel_loss                                    0.1996
postnet_mel_loss                            0.1997
dur_loss                                    0.0371
pitch_loss                                  0.1440
energy_loss                                 0.0391




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 135
total_loss                                  0.7249
ssim_loss                                   0.1011
mel_loss                                    0.2026
postnet_mel_loss                            0.2026
dur_loss                                    0.0371
pitch_loss                                  0.1427
energy_loss                                 0.0389




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 136
total_loss                                  0.7213
ssim_loss                                   0.1005
mel_loss                                    0.1997
postnet_mel_loss                            0.1997
dur_loss                                    0.0370
pitch_loss                                  0.1456
energy_loss                                 0.0388




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 137
total_loss                                  0.7076
ssim_loss                                   0.0994
mel_loss                                    0.1969
postnet_mel_loss                            0.1969
dur_loss                                    0.0366
pitch_loss                                  0.1401
energy_loss                                 0.0378




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 138
total_loss                                  0.7084
ssim_loss                                   0.0994
mel_loss                                    0.1971
postnet_mel_loss                            0.1971
dur_loss                                    0.0365
pitch_loss                                  0.1401
energy_loss                                 0.0382




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 139
total_loss                                  0.7072
ssim_loss                                   0.0993
mel_loss                                    0.1963
postnet_mel_loss                            0.1964
dur_loss                                    0.0362
pitch_loss                                  0.1407
energy_loss                                 0.0383




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 140
total_loss                                  0.7049
ssim_loss                                   0.0988
mel_loss                                    0.1956
postnet_mel_loss                            0.1957
dur_loss                                    0.0362
pitch_loss                                  0.1400
energy_loss                                 0.0386




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 141
total_loss                                  0.6997
ssim_loss                                   0.0984
mel_loss                                    0.1957
postnet_mel_loss                            0.1958
dur_loss                                    0.0358
pitch_loss                                  0.1371
energy_loss                                 0.0368




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 142
total_loss                                  0.6989
ssim_loss                                   0.0987
mel_loss                                    0.1950
postnet_mel_loss                            0.1950
dur_loss                                    0.0355
pitch_loss                                  0.1381
energy_loss                                 0.0365




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 143
total_loss                                  0.6959
ssim_loss                                   0.0983
mel_loss                                    0.1941
postnet_mel_loss                            0.1941
dur_loss                                    0.0355
pitch_loss                                  0.1372
energy_loss                                 0.0366




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 144
total_loss                                  0.6974
ssim_loss                                   0.0978
mel_loss                                    0.1945
postnet_mel_loss                            0.1945
dur_loss                                    0.0357
pitch_loss                                  0.1381
energy_loss                                 0.0367




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 145
total_loss                                  0.6912
ssim_loss                                   0.0975
mel_loss                                    0.1926
postnet_mel_loss                            0.1926
dur_loss                                    0.0351
pitch_loss                                  0.1366
energy_loss                                 0.0367




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 146
total_loss                                  0.7178
ssim_loss                                   0.1000
mel_loss                                    0.2021
postnet_mel_loss                            0.2021
dur_loss                                    0.0362
pitch_loss                                  0.1400
energy_loss                                 0.0373




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 147
total_loss                                  0.7052
ssim_loss                                   0.0994
mel_loss                                    0.1986
postnet_mel_loss                            0.1986
dur_loss                                    0.0357
pitch_loss                                  0.1364
energy_loss                                 0.0366




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 148
total_loss                                  0.6828
ssim_loss                                   0.0966
mel_loss                                    0.1909
postnet_mel_loss                            0.1909
dur_loss                                    0.0347
pitch_loss                                  0.1341
energy_loss                                 0.0355




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 149
total_loss                                  0.6841
ssim_loss                                   0.0966
mel_loss                                    0.1888
postnet_mel_loss                            0.1888
dur_loss                                    0.0347
pitch_loss                                  0.1398
energy_loss                                 0.0354




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 150
total_loss                                  0.6858
ssim_loss                                   0.0965
mel_loss                                    0.1895
postnet_mel_loss                            0.1895
dur_loss                                    0.0350
pitch_loss                                  0.1398
energy_loss                                 0.0355




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 151
total_loss                                  0.6841
ssim_loss                                   0.0967
mel_loss                                    0.1890
postnet_mel_loss                            0.1890
dur_loss                                    0.0349
pitch_loss                                  0.1392
energy_loss                                 0.0354




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 152
total_loss                                  0.6775
ssim_loss                                   0.0960
mel_loss                                    0.1883
postnet_mel_loss                            0.1883
dur_loss                                    0.0347
pitch_loss                                  0.1353
energy_loss                                 0.0350




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 153
total_loss                                  0.6893
ssim_loss                                   0.0974
mel_loss                                    0.1925
postnet_mel_loss                            0.1925
dur_loss                                    0.0346
pitch_loss                                  0.1370
energy_loss                                 0.0352




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 154
total_loss                                  0.6842
ssim_loss                                   0.0973
mel_loss                                    0.1917
postnet_mel_loss                            0.1917
dur_loss                                    0.0344
pitch_loss                                  0.1344
energy_loss                                 0.0346




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 155
total_loss                                  0.6714
ssim_loss                                   0.0951
mel_loss                                    0.1862
postnet_mel_loss                            0.1862
dur_loss                                    0.0342
pitch_loss                                  0.1349
energy_loss                                 0.0347




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 156
total_loss                                  0.6678
ssim_loss                                   0.0953
mel_loss                                    0.1857
postnet_mel_loss                            0.1857
dur_loss                                    0.0340
pitch_loss                                  0.1329
energy_loss                                 0.0342




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 157
total_loss                                  0.6693
ssim_loss                                   0.0952
mel_loss                                    0.1859
postnet_mel_loss                            0.1859
dur_loss                                    0.0340
pitch_loss                                  0.1341
energy_loss                                 0.0343




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 158
total_loss                                  0.6774
ssim_loss                                   0.0957
mel_loss                                    0.1874
postnet_mel_loss                            0.1874
dur_loss                                    0.0347
pitch_loss                                  0.1375
energy_loss                                 0.0347




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 159
total_loss                                  0.6743
ssim_loss                                   0.0950
mel_loss                                    0.1861
postnet_mel_loss                            0.1861
dur_loss                                    0.0339
pitch_loss                                  0.1386
energy_loss                                 0.0346




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 160
total_loss                                  0.6614
ssim_loss                                   0.0938
mel_loss                                    0.1845
postnet_mel_loss                            0.1845
dur_loss                                    0.0333
pitch_loss                                  0.1322
energy_loss                                 0.0332




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 161
total_loss                                  0.6643
ssim_loss                                   0.0942
mel_loss                                    0.1850
postnet_mel_loss                            0.1850
dur_loss                                    0.0335
pitch_loss                                  0.1331
energy_loss                                 0.0334




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 162
total_loss                                  0.6673
ssim_loss                                   0.0944
mel_loss                                    0.1862
postnet_mel_loss                            0.1862
dur_loss                                    0.0334
pitch_loss                                  0.1332
energy_loss                                 0.0339




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 163
total_loss                                  0.6717
ssim_loss                                   0.0947
mel_loss                                    0.1870
postnet_mel_loss                            0.1870
dur_loss                                    0.0335
pitch_loss                                  0.1361
energy_loss                                 0.0334




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 164
total_loss                                  0.6744
ssim_loss                                   0.0952
mel_loss                                    0.1873
postnet_mel_loss                            0.1873
dur_loss                                    0.0338
pitch_loss                                  0.1367
energy_loss                                 0.0341




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 165
total_loss                                  0.6642
ssim_loss                                   0.0940
mel_loss                                    0.1845
postnet_mel_loss                            0.1845
dur_loss                                    0.0336
pitch_loss                                  0.1347
energy_loss                                 0.0329




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 166
total_loss                                  0.6668
ssim_loss                                   0.0943
mel_loss                                    0.1863
postnet_mel_loss                            0.1863
dur_loss                                    0.0332
pitch_loss                                  0.1339
energy_loss                                 0.0329




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 167
total_loss                                  0.6566
ssim_loss                                   0.0930
mel_loss                                    0.1818
postnet_mel_loss                            0.1818
dur_loss                                    0.0328
pitch_loss                                  0.1343
energy_loss                                 0.0328




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 168
total_loss                                  0.6484
ssim_loss                                   0.0925
mel_loss                                    0.1795
postnet_mel_loss                            0.1794
dur_loss                                    0.0326
pitch_loss                                  0.1322
energy_loss                                 0.0322




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 169
total_loss                                  0.6491
ssim_loss                                   0.0924
mel_loss                                    0.1801
postnet_mel_loss                            0.1801
dur_loss                                    0.0325
pitch_loss                                  0.1315
energy_loss                                 0.0325




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 170
total_loss                                  0.6563
ssim_loss                                   0.0933
mel_loss                                    0.1809
postnet_mel_loss                            0.1808
dur_loss                                    0.0330
pitch_loss                                  0.1349
energy_loss                                 0.0334




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 171
total_loss                                  0.6515
ssim_loss                                   0.0926
mel_loss                                    0.1808
postnet_mel_loss                            0.1808
dur_loss                                    0.0326
pitch_loss                                  0.1323
energy_loss                                 0.0325




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 172
total_loss                                  0.6616
ssim_loss                                   0.0941
mel_loss                                    0.1860
postnet_mel_loss                            0.1860
dur_loss                                    0.0328
pitch_loss                                  0.1306
energy_loss                                 0.0321




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 173
total_loss                                  0.6572
ssim_loss                                   0.0930
mel_loss                                    0.1815
postnet_mel_loss                            0.1815
dur_loss                                    0.0327
pitch_loss                                  0.1363
energy_loss                                 0.0322




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 174
total_loss                                  0.6481
ssim_loss                                   0.0927
mel_loss                                    0.1782
postnet_mel_loss                            0.1782
dur_loss                                    0.0325
pitch_loss                                  0.1344
energy_loss                                 0.0321




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 175
total_loss                                  0.6561
ssim_loss                                   0.0932
mel_loss                                    0.1815
postnet_mel_loss                            0.1815
dur_loss                                    0.0327
pitch_loss                                  0.1346
energy_loss                                 0.0326




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 176
total_loss                                  0.6601
ssim_loss                                   0.0936
mel_loss                                    0.1833
postnet_mel_loss                            0.1833
dur_loss                                    0.0331
pitch_loss                                  0.1343
energy_loss                                 0.0324




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 177
total_loss                                  0.6445
ssim_loss                                   0.0913
mel_loss                                    0.1779
postnet_mel_loss                            0.1779
dur_loss                                    0.0322
pitch_loss                                  0.1333
energy_loss                                 0.0318




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 178
total_loss                                  0.6585
ssim_loss                                   0.0935
mel_loss                                    0.1829
postnet_mel_loss                            0.1829
dur_loss                                    0.0325
pitch_loss                                  0.1347
energy_loss                                 0.0320




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 179
total_loss                                  0.6552
ssim_loss                                   0.0931
mel_loss                                    0.1810
postnet_mel_loss                            0.1810
dur_loss                                    0.0323
pitch_loss                                  0.1365
energy_loss                                 0.0314




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 180
total_loss                                  0.6373
ssim_loss                                   0.0910
mel_loss                                    0.1755
postnet_mel_loss                            0.1755
dur_loss                                    0.0319
pitch_loss                                  0.1325
energy_loss                                 0.0309




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 181
total_loss                                  0.6318
ssim_loss                                   0.0909
mel_loss                                    0.1744
postnet_mel_loss                            0.1744
dur_loss                                    0.0315
pitch_loss                                  0.1305
energy_loss                                 0.0300




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 182
total_loss                                  0.6345
ssim_loss                                   0.0908
mel_loss                                    0.1751
postnet_mel_loss                            0.1750
dur_loss                                    0.0317
pitch_loss                                  0.1314
energy_loss                                 0.0305




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 183
total_loss                                  0.6422
ssim_loss                                   0.0913
mel_loss                                    0.1777
postnet_mel_loss                            0.1777
dur_loss                                    0.0317
pitch_loss                                  0.1325
energy_loss                                 0.0313




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 184
total_loss                                  0.6456
ssim_loss                                   0.0916
mel_loss                                    0.1794
postnet_mel_loss                            0.1794
dur_loss                                    0.0317
pitch_loss                                  0.1320
energy_loss                                 0.0315




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 185
total_loss                                  0.6408
ssim_loss                                   0.0916
mel_loss                                    0.1767
postnet_mel_loss                            0.1767
dur_loss                                    0.0323
pitch_loss                                  0.1322
energy_loss                                 0.0313




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 186
total_loss                                  0.6356
ssim_loss                                   0.0907
mel_loss                                    0.1754
postnet_mel_loss                            0.1754
dur_loss                                    0.0319
pitch_loss                                  0.1317
energy_loss                                 0.0305




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 187
total_loss                                  0.6269
ssim_loss                                   0.0903
mel_loss                                    0.1736
postnet_mel_loss                            0.1736
dur_loss                                    0.0315
pitch_loss                                  0.1278
energy_loss                                 0.0301




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 188
total_loss                                  0.6362
ssim_loss                                   0.0905
mel_loss                                    0.1757
postnet_mel_loss                            0.1757
dur_loss                                    0.0317
pitch_loss                                  0.1321
energy_loss                                 0.0304




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 189
total_loss                                  0.6354
ssim_loss                                   0.0908
mel_loss                                    0.1762
postnet_mel_loss                            0.1762
dur_loss                                    0.0318
pitch_loss                                  0.1294
energy_loss                                 0.0310




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 190
total_loss                                  0.6330
ssim_loss                                   0.0905
mel_loss                                    0.1746
postnet_mel_loss                            0.1745
dur_loss                                    0.0316
pitch_loss                                  0.1311
energy_loss                                 0.0307




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 191
total_loss                                  0.6350
ssim_loss                                   0.0908
mel_loss                                    0.1776
postnet_mel_loss                            0.1776
dur_loss                                    0.0311
pitch_loss                                  0.1277
energy_loss                                 0.0301




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 192
total_loss                                  0.6359
ssim_loss                                   0.0907
mel_loss                                    0.1764
postnet_mel_loss                            0.1764
dur_loss                                    0.0314
pitch_loss                                  0.1307
energy_loss                                 0.0302




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 193
total_loss                                  0.6359
ssim_loss                                   0.0908
mel_loss                                    0.1756
postnet_mel_loss                            0.1757
dur_loss                                    0.0316
pitch_loss                                  0.1317
energy_loss                                 0.0306




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 194
total_loss                                  0.6407
ssim_loss                                   0.0909
mel_loss                                    0.1759
postnet_mel_loss                            0.1759
dur_loss                                    0.0317
pitch_loss                                  0.1359
energy_loss                                 0.0303




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 195
total_loss                                  0.6260
ssim_loss                                   0.0893
mel_loss                                    0.1718
postnet_mel_loss                            0.1717
dur_loss                                    0.0315
pitch_loss                                  0.1313
energy_loss                                 0.0303




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 196
total_loss                                  0.6183
ssim_loss                                   0.0888
mel_loss                                    0.1698
postnet_mel_loss                            0.1698
dur_loss                                    0.0309
pitch_loss                                  0.1296
energy_loss                                 0.0294




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 197
total_loss                                  0.6147
ssim_loss                                   0.0884
mel_loss                                    0.1699
postnet_mel_loss                            0.1699
dur_loss                                    0.0307
pitch_loss                                  0.1268
energy_loss                                 0.0290




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 198
total_loss                                  0.6226
ssim_loss                                   0.0893
mel_loss                                    0.1723
postnet_mel_loss                            0.1723
dur_loss                                    0.0310
pitch_loss                                  0.1279
energy_loss                                 0.0299




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 199
total_loss                                  0.6293
ssim_loss                                   0.0895
mel_loss                                    0.1738
postnet_mel_loss                            0.1738
dur_loss                                    0.0311
pitch_loss                                  0.1308
energy_loss                                 0.0302




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 200
total_loss                                  0.6186
ssim_loss                                   0.0885
mel_loss                                    0.1708
postnet_mel_loss                            0.1708
dur_loss                                    0.0309
pitch_loss                                  0.1281
energy_loss                                 0.0296




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 201
total_loss                                  0.6181
ssim_loss                                   0.0893
mel_loss                                    0.1702
postnet_mel_loss                            0.1702
dur_loss                                    0.0309
pitch_loss                                  0.1275
energy_loss                                 0.0299




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 202
total_loss                                  0.6142
ssim_loss                                   0.0883
mel_loss                                    0.1698
postnet_mel_loss                            0.1697
dur_loss                                    0.0305
pitch_loss                                  0.1267
energy_loss                                 0.0292




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 203
total_loss                                  0.6155
ssim_loss                                   0.0883
mel_loss                                    0.1698
postnet_mel_loss                            0.1698
dur_loss                                    0.0304
pitch_loss                                  0.1273
energy_loss                                 0.0298




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 204
total_loss                                  0.6159
ssim_loss                                   0.0878
mel_loss                                    0.1701
postnet_mel_loss                            0.1700
dur_loss                                    0.0307
pitch_loss                                  0.1282
energy_loss                                 0.0290




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 205
total_loss                                  0.6166
ssim_loss                                   0.0882
mel_loss                                    0.1708
postnet_mel_loss                            0.1708
dur_loss                                    0.0307
pitch_loss                                  0.1270
energy_loss                                 0.0292




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 206
total_loss                                  0.6131
ssim_loss                                   0.0874
mel_loss                                    0.1690
postnet_mel_loss                            0.1689
dur_loss                                    0.0305
pitch_loss                                  0.1282
energy_loss                                 0.0291




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 207
total_loss                                  0.6166
ssim_loss                                   0.0881
mel_loss                                    0.1702
postnet_mel_loss                            0.1702
dur_loss                                    0.0303
pitch_loss                                  0.1284
energy_loss                                 0.0294




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 208
total_loss                                  0.6212
ssim_loss                                   0.0887
mel_loss                                    0.1715
postnet_mel_loss                            0.1714
dur_loss                                    0.0304
pitch_loss                                  0.1303
energy_loss                                 0.0290




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 209
total_loss                                  0.6034
ssim_loss                                   0.0871
mel_loss                                    0.1671
postnet_mel_loss                            0.1670
dur_loss                                    0.0301
pitch_loss                                  0.1244
energy_loss                                 0.0278




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 210
total_loss                                  0.6035
ssim_loss                                   0.0871
mel_loss                                    0.1665
postnet_mel_loss                            0.1665
dur_loss                                    0.0301
pitch_loss                                  0.1252
energy_loss                                 0.0281




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 211
total_loss                                  0.6058
ssim_loss                                   0.0873
mel_loss                                    0.1670
postnet_mel_loss                            0.1669
dur_loss                                    0.0303
pitch_loss                                  0.1257
energy_loss                                 0.0286




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 212
total_loss                                  0.6127
ssim_loss                                   0.0876
mel_loss                                    0.1674
postnet_mel_loss                            0.1674
dur_loss                                    0.0302
pitch_loss                                  0.1312
energy_loss                                 0.0287




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 213
total_loss                                  0.6137
ssim_loss                                   0.0875
mel_loss                                    0.1688
postnet_mel_loss                            0.1687
dur_loss                                    0.0307
pitch_loss                                  0.1289
energy_loss                                 0.0291




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 214
total_loss                                  0.6062
ssim_loss                                   0.0872
mel_loss                                    0.1672
postnet_mel_loss                            0.1671
dur_loss                                    0.0298
pitch_loss                                  0.1266
energy_loss                                 0.0282




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 215
total_loss                                  0.5998
ssim_loss                                   0.0866
mel_loss                                    0.1652
postnet_mel_loss                            0.1652
dur_loss                                    0.0296
pitch_loss                                  0.1252
energy_loss                                 0.0279




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 216
total_loss                                  0.6030
ssim_loss                                   0.0868
mel_loss                                    0.1656
postnet_mel_loss                            0.1656
dur_loss                                    0.0296
pitch_loss                                  0.1276
energy_loss                                 0.0278




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 217
total_loss                                  0.6043
ssim_loss                                   0.0870
mel_loss                                    0.1661
postnet_mel_loss                            0.1661
dur_loss                                    0.0301
pitch_loss                                  0.1269
energy_loss                                 0.0282




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 218
total_loss                                  0.6044
ssim_loss                                   0.0865
mel_loss                                    0.1663
postnet_mel_loss                            0.1663
dur_loss                                    0.0298
pitch_loss                                  0.1279
energy_loss                                 0.0276




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 219
total_loss                                  0.5983
ssim_loss                                   0.0859
mel_loss                                    0.1650
postnet_mel_loss                            0.1650
dur_loss                                    0.0295
pitch_loss                                  0.1252
energy_loss                                 0.0276




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 220
total_loss                                  0.6041
ssim_loss                                   0.0863
mel_loss                                    0.1649
postnet_mel_loss                            0.1649
dur_loss                                    0.0297
pitch_loss                                  0.1304
energy_loss                                 0.0278




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 221
total_loss                                  0.5994
ssim_loss                                   0.0865
mel_loss                                    0.1646
postnet_mel_loss                            0.1645
dur_loss                                    0.0294
pitch_loss                                  0.1266
energy_loss                                 0.0277




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 222
total_loss                                  0.6000
ssim_loss                                   0.0862
mel_loss                                    0.1654
postnet_mel_loss                            0.1654
dur_loss                                    0.0294
pitch_loss                                  0.1258
energy_loss                                 0.0279




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 223
total_loss                                  0.5980
ssim_loss                                   0.0859
mel_loss                                    0.1651
postnet_mel_loss                            0.1650
dur_loss                                    0.0295
pitch_loss                                  0.1254
energy_loss                                 0.0270




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 224
total_loss                                  0.6310
ssim_loss                                   0.0902
mel_loss                                    0.1782
postnet_mel_loss                            0.1782
dur_loss                                    0.0303
pitch_loss                                  0.1256
energy_loss                                 0.0286




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 225
total_loss                                  0.6025
ssim_loss                                   0.0862
mel_loss                                    0.1671
postnet_mel_loss                            0.1671
dur_loss                                    0.0294
pitch_loss                                  0.1253
energy_loss                                 0.0274




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 226
total_loss                                  0.5854
ssim_loss                                   0.0853
mel_loss                                    0.1613
postnet_mel_loss                            0.1613
dur_loss                                    0.0290
pitch_loss                                  0.1221
energy_loss                                 0.0264




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 227
total_loss                                  0.5808
ssim_loss                                   0.0843
mel_loss                                    0.1602
postnet_mel_loss                            0.1601
dur_loss                                    0.0288
pitch_loss                                  0.1212
energy_loss                                 0.0261




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 228
total_loss                                  0.5842
ssim_loss                                   0.0852
mel_loss                                    0.1606
postnet_mel_loss                            0.1605
dur_loss                                    0.0289
pitch_loss                                  0.1224
energy_loss                                 0.0267




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 229
total_loss                                  0.5885
ssim_loss                                   0.0852
mel_loss                                    0.1624
postnet_mel_loss                            0.1624
dur_loss                                    0.0291
pitch_loss                                  0.1227
energy_loss                                 0.0267




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 230
total_loss                                  0.5978
ssim_loss                                   0.0861
mel_loss                                    0.1641
postnet_mel_loss                            0.1641
dur_loss                                    0.0294
pitch_loss                                  0.1264
energy_loss                                 0.0278




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 231
total_loss                                  0.5956
ssim_loss                                   0.0853
mel_loss                                    0.1634
postnet_mel_loss                            0.1633
dur_loss                                    0.0294
pitch_loss                                  0.1270
energy_loss                                 0.0271




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 232
total_loss                                  0.5860
ssim_loss                                   0.0842
mel_loss                                    0.1618
postnet_mel_loss                            0.1617
dur_loss                                    0.0289
pitch_loss                                  0.1227
energy_loss                                 0.0267




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 233
total_loss                                  0.5878
ssim_loss                                   0.0849
mel_loss                                    0.1616
postnet_mel_loss                            0.1615
dur_loss                                    0.0287
pitch_loss                                  0.1248
energy_loss                                 0.0262




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 234
total_loss                                  0.5912
ssim_loss                                   0.0849
mel_loss                                    0.1623
postnet_mel_loss                            0.1623
dur_loss                                    0.0289
pitch_loss                                  0.1261
energy_loss                                 0.0268




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 235
total_loss                                  0.5861
ssim_loss                                   0.0849
mel_loss                                    0.1612
postnet_mel_loss                            0.1611
dur_loss                                    0.0288
pitch_loss                                  0.1234
energy_loss                                 0.0266




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 236
total_loss                                  0.6123
ssim_loss                                   0.0866
mel_loss                                    0.1709
postnet_mel_loss                            0.1708
dur_loss                                    0.0295
pitch_loss                                  0.1271
energy_loss                                 0.0274




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 237
total_loss                                  0.5932
ssim_loss                                   0.0858
mel_loss                                    0.1636
postnet_mel_loss                            0.1636
dur_loss                                    0.0290
pitch_loss                                  0.1244
energy_loss                                 0.0269




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 238
total_loss                                  0.5846
ssim_loss                                   0.0844
mel_loss                                    0.1596
postnet_mel_loss                            0.1596
dur_loss                                    0.0286
pitch_loss                                  0.1260
energy_loss                                 0.0263




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 239
total_loss                                  0.5791
ssim_loss                                   0.0841
mel_loss                                    0.1583
postnet_mel_loss                            0.1583
dur_loss                                    0.0288
pitch_loss                                  0.1237
energy_loss                                 0.0260




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 240
total_loss                                  0.5786
ssim_loss                                   0.0837
mel_loss                                    0.1592
postnet_mel_loss                            0.1591
dur_loss                                    0.0287
pitch_loss                                  0.1218
energy_loss                                 0.0260




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 241
total_loss                                  0.5813
ssim_loss                                   0.0838
mel_loss                                    0.1597
postnet_mel_loss                            0.1596
dur_loss                                    0.0286
pitch_loss                                  0.1234
energy_loss                                 0.0263




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 242
total_loss                                  0.5832
ssim_loss                                   0.0837
mel_loss                                    0.1607
postnet_mel_loss                            0.1607
dur_loss                                    0.0286
pitch_loss                                  0.1234
energy_loss                                 0.0261




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 243
total_loss                                  0.5835
ssim_loss                                   0.0843
mel_loss                                    0.1600
postnet_mel_loss                            0.1599
dur_loss                                    0.0286
pitch_loss                                  0.1243
energy_loss                                 0.0265




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 244
total_loss                                  0.5829
ssim_loss                                   0.0842
mel_loss                                    0.1596
postnet_mel_loss                            0.1595
dur_loss                                    0.0285
pitch_loss                                  0.1250
energy_loss                                 0.0261




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 245
total_loss                                  0.5869
ssim_loss                                   0.0841
mel_loss                                    0.1604
postnet_mel_loss                            0.1604
dur_loss                                    0.0289
pitch_loss                                  0.1267
energy_loss                                 0.0262




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 246
total_loss                                  0.5948
ssim_loss                                   0.0859
mel_loss                                    0.1648
postnet_mel_loss                            0.1648
dur_loss                                    0.0287
pitch_loss                                  0.1242
energy_loss                                 0.0265




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 247
total_loss                                  0.5789
ssim_loss                                   0.0841
mel_loss                                    0.1594
postnet_mel_loss                            0.1593
dur_loss                                    0.0284
pitch_loss                                  0.1225
energy_loss                                 0.0252




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 248
total_loss                                  0.5928
ssim_loss                                   0.0853
mel_loss                                    0.1646
postnet_mel_loss                            0.1646
dur_loss                                    0.0286
pitch_loss                                  0.1235
energy_loss                                 0.0261




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 249
total_loss                                  0.5844
ssim_loss                                   0.0846
mel_loss                                    0.1608
postnet_mel_loss                            0.1607
dur_loss                                    0.0289
pitch_loss                                  0.1237
energy_loss                                 0.0258




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 250
total_loss                                  0.5732
ssim_loss                                   0.0827
mel_loss                                    0.1568
postnet_mel_loss                            0.1567
dur_loss                                    0.0282
pitch_loss                                  0.1236
energy_loss                                 0.0253




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 251
total_loss                                  0.5759
ssim_loss                                   0.0831
mel_loss                                    0.1573
postnet_mel_loss                            0.1572
dur_loss                                    0.0282
pitch_loss                                  0.1244
energy_loss                                 0.0259




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 252
total_loss                                  0.5745
ssim_loss                                   0.0831
mel_loss                                    0.1582
postnet_mel_loss                            0.1581
dur_loss                                    0.0282
pitch_loss                                  0.1212
energy_loss                                 0.0258




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 253
total_loss                                  0.5759
ssim_loss                                   0.0835
mel_loss                                    0.1581
postnet_mel_loss                            0.1580
dur_loss                                    0.0283
pitch_loss                                  0.1228
energy_loss                                 0.0250




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 254
total_loss                                  0.5736
ssim_loss                                   0.0830
mel_loss                                    0.1580
postnet_mel_loss                            0.1579
dur_loss                                    0.0281
pitch_loss                                  0.1213
energy_loss                                 0.0253




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 255
total_loss                                  0.5699
ssim_loss                                   0.0828
mel_loss                                    0.1570
postnet_mel_loss                            0.1569
dur_loss                                    0.0280
pitch_loss                                  0.1204
energy_loss                                 0.0248




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 256
total_loss                                  0.5700
ssim_loss                                   0.0823
mel_loss                                    0.1571
postnet_mel_loss                            0.1571
dur_loss                                    0.0281
pitch_loss                                  0.1205
energy_loss                                 0.0249




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 257
total_loss                                  0.5754
ssim_loss                                   0.0830
mel_loss                                    0.1570
postnet_mel_loss                            0.1569
dur_loss                                    0.0282
pitch_loss                                  0.1248
energy_loss                                 0.0255




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 258
total_loss                                  0.5797
ssim_loss                                   0.0838
mel_loss                                    0.1593
postnet_mel_loss                            0.1592
dur_loss                                    0.0282
pitch_loss                                  0.1239
energy_loss                                 0.0253




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 259
total_loss                                  0.5776
ssim_loss                                   0.0834
mel_loss                                    0.1590
postnet_mel_loss                            0.1589
dur_loss                                    0.0282
pitch_loss                                  0.1227
energy_loss                                 0.0255




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 260
total_loss                                  0.5721
ssim_loss                                   0.0830
mel_loss                                    0.1565
postnet_mel_loss                            0.1564
dur_loss                                    0.0281
pitch_loss                                  0.1227
energy_loss                                 0.0254




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 261
total_loss                                  0.5675
ssim_loss                                   0.0818
mel_loss                                    0.1557
postnet_mel_loss                            0.1556
dur_loss                                    0.0280
pitch_loss                                  0.1213
energy_loss                                 0.0251




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 262
total_loss                                  0.5721
ssim_loss                                   0.0829
mel_loss                                    0.1559
postnet_mel_loss                            0.1558
dur_loss                                    0.0281
pitch_loss                                  0.1247
energy_loss                                 0.0247




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 263
total_loss                                  0.5686
ssim_loss                                   0.0822
mel_loss                                    0.1550
postnet_mel_loss                            0.1549
dur_loss                                    0.0276
pitch_loss                                  0.1245
energy_loss                                 0.0244




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 264
total_loss                                  0.5660
ssim_loss                                   0.0817
mel_loss                                    0.1551
postnet_mel_loss                            0.1550
dur_loss                                    0.0278
pitch_loss                                  0.1219
energy_loss                                 0.0246




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 265
total_loss                                  0.5682
ssim_loss                                   0.0823
mel_loss                                    0.1554
postnet_mel_loss                            0.1553
dur_loss                                    0.0277
pitch_loss                                  0.1228
energy_loss                                 0.0248




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 266
total_loss                                  0.5757
ssim_loss                                   0.0827
mel_loss                                    0.1590
postnet_mel_loss                            0.1589
dur_loss                                    0.0282
pitch_loss                                  0.1214
energy_loss                                 0.0256




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 267
total_loss                                  0.5698
ssim_loss                                   0.0823
mel_loss                                    0.1559
postnet_mel_loss                            0.1558
dur_loss                                    0.0278
pitch_loss                                  0.1234
energy_loss                                 0.0246




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 268
total_loss                                  0.5639
ssim_loss                                   0.0819
mel_loss                                    0.1548
postnet_mel_loss                            0.1547
dur_loss                                    0.0277
pitch_loss                                  0.1201
energy_loss                                 0.0247




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 269
total_loss                                  0.5637
ssim_loss                                   0.0819
mel_loss                                    0.1547
postnet_mel_loss                            0.1546
dur_loss                                    0.0276
pitch_loss                                  0.1199
energy_loss                                 0.0250




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 270
total_loss                                  0.5654
ssim_loss                                   0.0818
mel_loss                                    0.1547
postnet_mel_loss                            0.1546
dur_loss                                    0.0278
pitch_loss                                  0.1215
energy_loss                                 0.0249




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 271
total_loss                                  0.5648
ssim_loss                                   0.0817
mel_loss                                    0.1541
postnet_mel_loss                            0.1540
dur_loss                                    0.0276
pitch_loss                                  0.1224
energy_loss                                 0.0249




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 272
total_loss                                  0.5636
ssim_loss                                   0.0815
mel_loss                                    0.1539
postnet_mel_loss                            0.1538
dur_loss                                    0.0276
pitch_loss                                  0.1223
energy_loss                                 0.0244




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 273
total_loss                                  0.5633
ssim_loss                                   0.0818
mel_loss                                    0.1541
postnet_mel_loss                            0.1539
dur_loss                                    0.0277
pitch_loss                                  0.1211
energy_loss                                 0.0247




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 274
total_loss                                  0.5613
ssim_loss                                   0.0809
mel_loss                                    0.1539
postnet_mel_loss                            0.1538
dur_loss                                    0.0275
pitch_loss                                  0.1205
energy_loss                                 0.0246




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 275
total_loss                                  0.5627
ssim_loss                                   0.0812
mel_loss                                    0.1533
postnet_mel_loss                            0.1532
dur_loss                                    0.0277
pitch_loss                                  0.1224
energy_loss                                 0.0248




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 276
total_loss                                  0.5610
ssim_loss                                   0.0813
mel_loss                                    0.1534
postnet_mel_loss                            0.1532
dur_loss                                    0.0275
pitch_loss                                  0.1213
energy_loss                                 0.0243




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 277
total_loss                                  0.5590
ssim_loss                                   0.0809
mel_loss                                    0.1530
postnet_mel_loss                            0.1529
dur_loss                                    0.0273
pitch_loss                                  0.1207
energy_loss                                 0.0241




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 278
total_loss                                  0.5622
ssim_loss                                   0.0816
mel_loss                                    0.1540
postnet_mel_loss                            0.1539
dur_loss                                    0.0275
pitch_loss                                  0.1207
energy_loss                                 0.0246




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 279
total_loss                                  0.5598
ssim_loss                                   0.0813
mel_loss                                    0.1532
postnet_mel_loss                            0.1531
dur_loss                                    0.0273
pitch_loss                                  0.1207
energy_loss                                 0.0243




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 280
total_loss                                  0.5549
ssim_loss                                   0.0811
mel_loss                                    0.1525
postnet_mel_loss                            0.1524
dur_loss                                    0.0273
pitch_loss                                  0.1178
energy_loss                                 0.0239




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 281
total_loss                                  0.5588
ssim_loss                                   0.0805
mel_loss                                    0.1527
postnet_mel_loss                            0.1526
dur_loss                                    0.0275
pitch_loss                                  0.1212
energy_loss                                 0.0243




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 282
total_loss                                  0.5600
ssim_loss                                   0.0803
mel_loss                                    0.1527
postnet_mel_loss                            0.1526
dur_loss                                    0.0275
pitch_loss                                  0.1229
energy_loss                                 0.0240




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 283
total_loss                                  0.5599
ssim_loss                                   0.0810
mel_loss                                    0.1531
postnet_mel_loss                            0.1531
dur_loss                                    0.0275
pitch_loss                                  0.1211
energy_loss                                 0.0241




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 284
total_loss                                  0.5536
ssim_loss                                   0.0802
mel_loss                                    0.1519
postnet_mel_loss                            0.1518
dur_loss                                    0.0272
pitch_loss                                  0.1189
energy_loss                                 0.0237




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 285
total_loss                                  0.5586
ssim_loss                                   0.0806
mel_loss                                    0.1526
postnet_mel_loss                            0.1525
dur_loss                                    0.0272
pitch_loss                                  0.1218
energy_loss                                 0.0241




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 286
total_loss                                  0.5492
ssim_loss                                   0.0800
mel_loss                                    0.1510
postnet_mel_loss                            0.1508
dur_loss                                    0.0268
pitch_loss                                  0.1173
energy_loss                                 0.0233




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 287
total_loss                                  0.5537
ssim_loss                                   0.0808
mel_loss                                    0.1514
postnet_mel_loss                            0.1513
dur_loss                                    0.0270
pitch_loss                                  0.1197
energy_loss                                 0.0236




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 288
total_loss                                  0.5817
ssim_loss                                   0.0833
mel_loss                                    0.1635
postnet_mel_loss                            0.1634
dur_loss                                    0.0276
pitch_loss                                  0.1200
energy_loss                                 0.0240




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 289
total_loss                                  0.5601
ssim_loss                                   0.0816
mel_loss                                    0.1553
postnet_mel_loss                            0.1552
dur_loss                                    0.0272
pitch_loss                                  0.1176
energy_loss                                 0.0233




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 290
total_loss                                  0.5469
ssim_loss                                   0.0801
mel_loss                                    0.1494
postnet_mel_loss                            0.1492
dur_loss                                    0.0270
pitch_loss                                  0.1182
energy_loss                                 0.0229




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 291
total_loss                                  0.5434
ssim_loss                                   0.0793
mel_loss                                    0.1483
postnet_mel_loss                            0.1482
dur_loss                                    0.0267
pitch_loss                                  0.1181
energy_loss                                 0.0228




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 292
total_loss                                  0.5490
ssim_loss                                   0.0799
mel_loss                                    0.1501
postnet_mel_loss                            0.1499
dur_loss                                    0.0271
pitch_loss                                  0.1186
energy_loss                                 0.0234




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 293
total_loss                                  0.5528
ssim_loss                                   0.0803
mel_loss                                    0.1506
postnet_mel_loss                            0.1505
dur_loss                                    0.0271
pitch_loss                                  0.1211
energy_loss                                 0.0233




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 294
total_loss                                  0.5506
ssim_loss                                   0.0801
mel_loss                                    0.1500
postnet_mel_loss                            0.1499
dur_loss                                    0.0270
pitch_loss                                  0.1202
energy_loss                                 0.0234




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 295
total_loss                                  0.5570
ssim_loss                                   0.0812
mel_loss                                    0.1530
postnet_mel_loss                            0.1528
dur_loss                                    0.0269
pitch_loss                                  0.1193
energy_loss                                 0.0239




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 296
total_loss                                  0.5576
ssim_loss                                   0.0806
mel_loss                                    0.1524
postnet_mel_loss                            0.1523
dur_loss                                    0.0270
pitch_loss                                  0.1215
energy_loss                                 0.0239




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 297
total_loss                                  0.5451
ssim_loss                                   0.0797
mel_loss                                    0.1489
postnet_mel_loss                            0.1487
dur_loss                                    0.0269
pitch_loss                                  0.1181
energy_loss                                 0.0228




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 298
total_loss                                  0.5446
ssim_loss                                   0.0793
mel_loss                                    0.1489
postnet_mel_loss                            0.1487
dur_loss                                    0.0267
pitch_loss                                  0.1181
energy_loss                                 0.0229




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 299
total_loss                                  0.5487
ssim_loss                                   0.0792
mel_loss                                    0.1500
postnet_mel_loss                            0.1499
dur_loss                                    0.0269
pitch_loss                                  0.1192
energy_loss                                 0.0236




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 300
total_loss                                  0.5469
ssim_loss                                   0.0791
mel_loss                                    0.1499
postnet_mel_loss                            0.1497
dur_loss                                    0.0268
pitch_loss                                  0.1183
energy_loss                                 0.0232




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 301
total_loss                                  0.5458
ssim_loss                                   0.0792
mel_loss                                    0.1493
postnet_mel_loss                            0.1492
dur_loss                                    0.0266
pitch_loss                                  0.1188
energy_loss                                 0.0227




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 302
total_loss                                  0.5469
ssim_loss                                   0.0794
mel_loss                                    0.1495
postnet_mel_loss                            0.1493
dur_loss                                    0.0265
pitch_loss                                  0.1192
energy_loss                                 0.0230




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 303
total_loss                                  0.5464
ssim_loss                                   0.0794
mel_loss                                    0.1494
postnet_mel_loss                            0.1493
dur_loss                                    0.0265
pitch_loss                                  0.1187
energy_loss                                 0.0230




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 304
total_loss                                  0.5445
ssim_loss                                   0.0790
mel_loss                                    0.1493
postnet_mel_loss                            0.1491
dur_loss                                    0.0267
pitch_loss                                  0.1172
energy_loss                                 0.0232




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 305
total_loss                                  0.5475
ssim_loss                                   0.0800
mel_loss                                    0.1494
postnet_mel_loss                            0.1493
dur_loss                                    0.0268
pitch_loss                                  0.1189
energy_loss                                 0.0230




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 306
total_loss                                  0.5440
ssim_loss                                   0.0789
mel_loss                                    0.1489
postnet_mel_loss                            0.1487
dur_loss                                    0.0267
pitch_loss                                  0.1179
energy_loss                                 0.0229




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 307
total_loss                                  0.5503
ssim_loss                                   0.0794
mel_loss                                    0.1492
postnet_mel_loss                            0.1491
dur_loss                                    0.0271
pitch_loss                                  0.1219
energy_loss                                 0.0236




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 308
total_loss                                  0.5453
ssim_loss                                   0.0790
mel_loss                                    0.1491
postnet_mel_loss                            0.1489
dur_loss                                    0.0270
pitch_loss                                  0.1183
energy_loss                                 0.0230




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 309
total_loss                                  0.5414
ssim_loss                                   0.0790
mel_loss                                    0.1477
postnet_mel_loss                            0.1476
dur_loss                                    0.0265
pitch_loss                                  0.1183
energy_loss                                 0.0223




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 310
total_loss                                  0.5390
ssim_loss                                   0.0786
mel_loss                                    0.1475
postnet_mel_loss                            0.1474
dur_loss                                    0.0264
pitch_loss                                  0.1169
energy_loss                                 0.0222




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 311
total_loss                                  0.5465
ssim_loss                                   0.0791
mel_loss                                    0.1498
postnet_mel_loss                            0.1497
dur_loss                                    0.0264
pitch_loss                                  0.1188
energy_loss                                 0.0227




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 312
total_loss                                  0.5477
ssim_loss                                   0.0797
mel_loss                                    0.1498
postnet_mel_loss                            0.1496
dur_loss                                    0.0265
pitch_loss                                  0.1192
energy_loss                                 0.0229




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 313
total_loss                                  0.5503
ssim_loss                                   0.0800
mel_loss                                    0.1518
postnet_mel_loss                            0.1516
dur_loss                                    0.0267
pitch_loss                                  0.1176
energy_loss                                 0.0227




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 314
total_loss                                  0.5454
ssim_loss                                   0.0791
mel_loss                                    0.1496
postnet_mel_loss                            0.1494
dur_loss                                    0.0267
pitch_loss                                  0.1183
energy_loss                                 0.0224




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 315
total_loss                                  0.5372
ssim_loss                                   0.0781
mel_loss                                    0.1467
postnet_mel_loss                            0.1466
dur_loss                                    0.0262
pitch_loss                                  0.1175
energy_loss                                 0.0222




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 316
total_loss                                  0.5373
ssim_loss                                   0.0783
mel_loss                                    0.1458
postnet_mel_loss                            0.1456
dur_loss                                    0.0263
pitch_loss                                  0.1191
energy_loss                                 0.0222




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 317
total_loss                                  0.5384
ssim_loss                                   0.0784
mel_loss                                    0.1464
postnet_mel_loss                            0.1462
dur_loss                                    0.0264
pitch_loss                                  0.1185
energy_loss                                 0.0225




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 318
total_loss                                  0.5375
ssim_loss                                   0.0783
mel_loss                                    0.1468
postnet_mel_loss                            0.1467
dur_loss                                    0.0263
pitch_loss                                  0.1170
energy_loss                                 0.0225




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 319
total_loss                                  0.5407
ssim_loss                                   0.0783
mel_loss                                    0.1471
postnet_mel_loss                            0.1470
dur_loss                                    0.0263
pitch_loss                                  0.1190
energy_loss                                 0.0230




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 320
total_loss                                  0.5416
ssim_loss                                   0.0783
mel_loss                                    0.1485
postnet_mel_loss                            0.1484
dur_loss                                    0.0263
pitch_loss                                  0.1177
energy_loss                                 0.0224




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 321
total_loss                                  0.5379
ssim_loss                                   0.0783
mel_loss                                    0.1473
postnet_mel_loss                            0.1471
dur_loss                                    0.0263
pitch_loss                                  0.1159
energy_loss                                 0.0229




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 322
total_loss                                  0.5463
ssim_loss                                   0.0793
mel_loss                                    0.1512
postnet_mel_loss                            0.1511
dur_loss                                    0.0265
pitch_loss                                  0.1157
energy_loss                                 0.0225




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 323
total_loss                                  0.5522
ssim_loss                                   0.0799
mel_loss                                    0.1526
postnet_mel_loss                            0.1524
dur_loss                                    0.0266
pitch_loss                                  0.1180
energy_loss                                 0.0228




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 324
total_loss                                  0.5369
ssim_loss                                   0.0783
mel_loss                                    0.1456
postnet_mel_loss                            0.1456
dur_loss                                    0.0265
pitch_loss                                  0.1188
energy_loss                                 0.0222




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 325
total_loss                                  0.5323
ssim_loss                                   0.0774
mel_loss                                    0.1442
postnet_mel_loss                            0.1442
dur_loss                                    0.0262
pitch_loss                                  0.1183
energy_loss                                 0.0220




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 326
total_loss                                  0.5340
ssim_loss                                   0.0777
mel_loss                                    0.1449
postnet_mel_loss                            0.1447
dur_loss                                    0.0263
pitch_loss                                  0.1185
energy_loss                                 0.0219




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 327
total_loss                                  0.5416
ssim_loss                                   0.0785
mel_loss                                    0.1469
postnet_mel_loss                            0.1468
dur_loss                                    0.0264
pitch_loss                                  0.1202
energy_loss                                 0.0228




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 328
total_loss                                  0.5503
ssim_loss                                   0.0791
mel_loss                                    0.1498
postnet_mel_loss                            0.1497
dur_loss                                    0.0267
pitch_loss                                  0.1225
energy_loss                                 0.0226




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 329
total_loss                                  0.5391
ssim_loss                                   0.0775
mel_loss                                    0.1461
postnet_mel_loss                            0.1460
dur_loss                                    0.0263
pitch_loss                                  0.1211
energy_loss                                 0.0221




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 330
total_loss                                  0.5334
ssim_loss                                   0.0775
mel_loss                                    0.1444
postnet_mel_loss                            0.1442
dur_loss                                    0.0259
pitch_loss                                  0.1198
energy_loss                                 0.0215




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 331
total_loss                                  0.5319
ssim_loss                                   0.0775
mel_loss                                    0.1444
postnet_mel_loss                            0.1442
dur_loss                                    0.0258
pitch_loss                                  0.1187
energy_loss                                 0.0212




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 332
total_loss                                  0.5334
ssim_loss                                   0.0778
mel_loss                                    0.1455
postnet_mel_loss                            0.1453
dur_loss                                    0.0258
pitch_loss                                  0.1175
energy_loss                                 0.0214




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 333
total_loss                                  0.5329
ssim_loss                                   0.0778
mel_loss                                    0.1455
postnet_mel_loss                            0.1454
dur_loss                                    0.0259
pitch_loss                                  0.1168
energy_loss                                 0.0216




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 334
total_loss                                  0.5323
ssim_loss                                   0.0771
mel_loss                                    0.1452
postnet_mel_loss                            0.1450
dur_loss                                    0.0259
pitch_loss                                  0.1175
energy_loss                                 0.0216




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 335
total_loss                                  0.5323
ssim_loss                                   0.0776
mel_loss                                    0.1451
postnet_mel_loss                            0.1449
dur_loss                                    0.0260
pitch_loss                                  0.1169
energy_loss                                 0.0217




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 336
total_loss                                  0.5329
ssim_loss                                   0.0772
mel_loss                                    0.1449
postnet_mel_loss                            0.1447
dur_loss                                    0.0261
pitch_loss                                  0.1181
energy_loss                                 0.0219




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 337
total_loss                                  0.5340
ssim_loss                                   0.0780
mel_loss                                    0.1450
postnet_mel_loss                            0.1448
dur_loss                                    0.0259
pitch_loss                                  0.1186
energy_loss                                 0.0216




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 338
total_loss                                  0.5406
ssim_loss                                   0.0782
mel_loss                                    0.1477
postnet_mel_loss                            0.1475
dur_loss                                    0.0264
pitch_loss                                  0.1183
energy_loss                                 0.0224




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 339
total_loss                                  0.5387
ssim_loss                                   0.0785
mel_loss                                    0.1473
postnet_mel_loss                            0.1471
dur_loss                                    0.0264
pitch_loss                                  0.1170
energy_loss                                 0.0224




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 340
total_loss                                  0.5268
ssim_loss                                   0.0771
mel_loss                                    0.1433
postnet_mel_loss                            0.1431
dur_loss                                    0.0258
pitch_loss                                  0.1159
energy_loss                                 0.0216




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 341
total_loss                                  0.5275
ssim_loss                                   0.0771
mel_loss                                    0.1433
postnet_mel_loss                            0.1432
dur_loss                                    0.0258
pitch_loss                                  0.1165
energy_loss                                 0.0216




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 342
total_loss                                  0.5278
ssim_loss                                   0.0768
mel_loss                                    0.1437
postnet_mel_loss                            0.1436
dur_loss                                    0.0259
pitch_loss                                  0.1158
energy_loss                                 0.0220




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 343
total_loss                                  0.5290
ssim_loss                                   0.0773
mel_loss                                    0.1441
postnet_mel_loss                            0.1440
dur_loss                                    0.0259
pitch_loss                                  0.1162
energy_loss                                 0.0215




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 344
total_loss                                  0.5326
ssim_loss                                   0.0774
mel_loss                                    0.1447
postnet_mel_loss                            0.1445
dur_loss                                    0.0259
pitch_loss                                  0.1183
energy_loss                                 0.0217




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 345
total_loss                                  0.5311
ssim_loss                                   0.0772
mel_loss                                    0.1454
postnet_mel_loss                            0.1452
dur_loss                                    0.0258
pitch_loss                                  0.1162
energy_loss                                 0.0214




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 346
total_loss                                  0.5284
ssim_loss                                   0.0768
mel_loss                                    0.1441
postnet_mel_loss                            0.1439
dur_loss                                    0.0259
pitch_loss                                  0.1161
energy_loss                                 0.0216




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 347
total_loss                                  0.5310
ssim_loss                                   0.0768
mel_loss                                    0.1436
postnet_mel_loss                            0.1435
dur_loss                                    0.0261
pitch_loss                                  0.1192
energy_loss                                 0.0218




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 348
total_loss                                  0.5292
ssim_loss                                   0.0770
mel_loss                                    0.1433
postnet_mel_loss                            0.1432
dur_loss                                    0.0260
pitch_loss                                  0.1183
energy_loss                                 0.0214




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 349
total_loss                                  0.5250
ssim_loss                                   0.0767
mel_loss                                    0.1429
postnet_mel_loss                            0.1427
dur_loss                                    0.0257
pitch_loss                                  0.1160
energy_loss                                 0.0210




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 350
total_loss                                  0.5260
ssim_loss                                   0.0767
mel_loss                                    0.1432
postnet_mel_loss                            0.1430
dur_loss                                    0.0259
pitch_loss                                  0.1156
energy_loss                                 0.0215




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 351
total_loss                                  0.5276
ssim_loss                                   0.0768
mel_loss                                    0.1432
postnet_mel_loss                            0.1431
dur_loss                                    0.0258
pitch_loss                                  0.1172
energy_loss                                 0.0215




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 352
total_loss                                  0.5293
ssim_loss                                   0.0772
mel_loss                                    0.1442
postnet_mel_loss                            0.1440
dur_loss                                    0.0258
pitch_loss                                  0.1169
energy_loss                                 0.0213




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 353
total_loss                                  0.5299
ssim_loss                                   0.0764
mel_loss                                    0.1436
postnet_mel_loss                            0.1434
dur_loss                                    0.0259
pitch_loss                                  0.1192
energy_loss                                 0.0214




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 354
total_loss                                  0.5442
ssim_loss                                   0.0783
mel_loss                                    0.1503
postnet_mel_loss                            0.1502
dur_loss                                    0.0262
pitch_loss                                  0.1171
energy_loss                                 0.0221




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 355
total_loss                                  0.5336
ssim_loss                                   0.0775
mel_loss                                    0.1458
postnet_mel_loss                            0.1456
dur_loss                                    0.0259
pitch_loss                                  0.1175
energy_loss                                 0.0212




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 356
total_loss                                  0.5269
ssim_loss                                   0.0766
mel_loss                                    0.1429
postnet_mel_loss                            0.1427
dur_loss                                    0.0256
pitch_loss                                  0.1179
energy_loss                                 0.0213




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 357
total_loss                                  0.5190
ssim_loss                                   0.0758
mel_loss                                    0.1405
postnet_mel_loss                            0.1403
dur_loss                                    0.0255
pitch_loss                                  0.1163
energy_loss                                 0.0206




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 358
total_loss                                  0.5243
ssim_loss                                   0.0762
mel_loss                                    0.1413
postnet_mel_loss                            0.1412
dur_loss                                    0.0257
pitch_loss                                  0.1187
energy_loss                                 0.0211




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 359
total_loss                                  0.5223
ssim_loss                                   0.0757
mel_loss                                    0.1422
postnet_mel_loss                            0.1420
dur_loss                                    0.0256
pitch_loss                                  0.1158
energy_loss                                 0.0210




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 360
total_loss                                  0.5238
ssim_loss                                   0.0761
mel_loss                                    0.1426
postnet_mel_loss                            0.1424
dur_loss                                    0.0258
pitch_loss                                  0.1160
energy_loss                                 0.0210




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 361
total_loss                                  0.5421
ssim_loss                                   0.0786
mel_loss                                    0.1490
postnet_mel_loss                            0.1489
dur_loss                                    0.0261
pitch_loss                                  0.1182
energy_loss                                 0.0215




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 362
total_loss                                  0.5291
ssim_loss                                   0.0769
mel_loss                                    0.1443
postnet_mel_loss                            0.1441
dur_loss                                    0.0256
pitch_loss                                  0.1171
energy_loss                                 0.0210




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 363
total_loss                                  0.5205
ssim_loss                                   0.0763
mel_loss                                    0.1415
postnet_mel_loss                            0.1413
dur_loss                                    0.0254
pitch_loss                                  0.1154
energy_loss                                 0.0206




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 364
total_loss                                  0.5232
ssim_loss                                   0.0765
mel_loss                                    0.1426
postnet_mel_loss                            0.1425
dur_loss                                    0.0256
pitch_loss                                  0.1150
energy_loss                                 0.0209




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 365
total_loss                                  0.5284
ssim_loss                                   0.0768
mel_loss                                    0.1435
postnet_mel_loss                            0.1433
dur_loss                                    0.0258
pitch_loss                                  0.1178
energy_loss                                 0.0212




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 366
total_loss                                  0.5193
ssim_loss                                   0.0755
mel_loss                                    0.1412
postnet_mel_loss                            0.1410
dur_loss                                    0.0255
pitch_loss                                  0.1153
energy_loss                                 0.0207




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 367
total_loss                                  0.5262
ssim_loss                                   0.0766
mel_loss                                    0.1439
postnet_mel_loss                            0.1437
dur_loss                                    0.0256
pitch_loss                                  0.1151
energy_loss                                 0.0213




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 368
total_loss                                  0.5376
ssim_loss                                   0.0775
mel_loss                                    0.1472
postnet_mel_loss                            0.1470
dur_loss                                    0.0259
pitch_loss                                  0.1180
energy_loss                                 0.0220




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 369
total_loss                                  0.5255
ssim_loss                                   0.0767
mel_loss                                    0.1436
postnet_mel_loss                            0.1435
dur_loss                                    0.0254
pitch_loss                                  0.1153
energy_loss                                 0.0211




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 370
total_loss                                  0.5264
ssim_loss                                   0.0770
mel_loss                                    0.1442
postnet_mel_loss                            0.1441
dur_loss                                    0.0256
pitch_loss                                  0.1147
energy_loss                                 0.0208




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 371
total_loss                                  0.5168
ssim_loss                                   0.0755
mel_loss                                    0.1403
postnet_mel_loss                            0.1401
dur_loss                                    0.0254
pitch_loss                                  0.1151
energy_loss                                 0.0205




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 372
total_loss                                  0.5168
ssim_loss                                   0.0756
mel_loss                                    0.1398
postnet_mel_loss                            0.1396
dur_loss                                    0.0254
pitch_loss                                  0.1158
energy_loss                                 0.0206




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 373
total_loss                                  0.5161
ssim_loss                                   0.0753
mel_loss                                    0.1402
postnet_mel_loss                            0.1400
dur_loss                                    0.0253
pitch_loss                                  0.1144
energy_loss                                 0.0209




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 374
total_loss                                  0.5183
ssim_loss                                   0.0757
mel_loss                                    0.1413
postnet_mel_loss                            0.1411
dur_loss                                    0.0254
pitch_loss                                  0.1144
energy_loss                                 0.0204




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 375
total_loss                                  0.5214
ssim_loss                                   0.0754
mel_loss                                    0.1411
postnet_mel_loss                            0.1409
dur_loss                                    0.0254
pitch_loss                                  0.1179
energy_loss                                 0.0207




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 376
total_loss                                  0.5179
ssim_loss                                   0.0758
mel_loss                                    0.1409
postnet_mel_loss                            0.1407
dur_loss                                    0.0253
pitch_loss                                  0.1144
energy_loss                                 0.0207




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 377
total_loss                                  0.5184
ssim_loss                                   0.0761
mel_loss                                    0.1413
postnet_mel_loss                            0.1411
dur_loss                                    0.0253
pitch_loss                                  0.1142
energy_loss                                 0.0204




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 378
total_loss                                  0.5196
ssim_loss                                   0.0759
mel_loss                                    0.1411
postnet_mel_loss                            0.1410
dur_loss                                    0.0254
pitch_loss                                  0.1155
energy_loss                                 0.0207




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 379
total_loss                                  0.5198
ssim_loss                                   0.0755
mel_loss                                    0.1404
postnet_mel_loss                            0.1402
dur_loss                                    0.0254
pitch_loss                                  0.1174
energy_loss                                 0.0209




  0%|          | 0/687 [00:00<?, ?it/s]

Epoch: 380
total_loss                                  0.5178
ssim_loss                                   0.0751
mel_loss                                    0.1401
postnet_mel_loss                            0.1399
dur_loss                                    0.0253
pitch_loss                                  0.1165
energy_loss                                 0.0210




  0%|          | 0/687 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Experiment: Intensity extractor test - `2025-06-05`

- <span style="color:red">fix #1</span>: Rank model의 intensity extractor 추정 **Done**
- <span style="color:red">fix #2</span>: Rank loss를 Rank model의 외부의 별도 class로 지정 **Done**
- <span style="color:red">fix #3</span>: Rank model의 output 수정: `H_i, H_j, h_i, h_j, r_i, r_j` 의 값을 반환하도록 설정 **Done**
- <span style="color:red">fix #4</span>: Rank model 별도의 intensity extractor class 생성 **Done**

---

- <span style="color:blue">imp #1</span>: intensity extractor의 output $\mathbf{I}$ 에 대해 phoneme-wise하게 평균을 취하여 크기 변경: [$B$, $T_{mel}$, $H$] $\rightarrow$ [$B$, $T_{phone}$, $H$]
- <span style="color:blue">imp #2</span>: Speaker ID를 사용할 것인지 speaker embedding을 사용할 것인지 실험을 통해 도출 (논문에서는 speaker id)
- <span style="color:blue">imp #3</span>: `phoneme_encoder_output`과 `intensity_representation`, `speaker_id` 를 concat하여 variance adaptor의 입력으로 feed.
- <span style="color:blue">imp #4</span>: 추론시에는 intensity_representation을 명시적으로 구할 수 없어, manual label을 사용 -> manual label을 구하기 위한 clustering 필요 ($N$-level averaging)

---

- Rank model 재학습 후, train dataset에 대한 intensity score 추출
- Intensity score을 $N$ 개로 bucketize (min - median - max).
- Speaker 별, emotion 별

In [None]:
rank_model = torch.load('/workspace/experiments/exp_3/best_model.pth')
intensity_extractor = rankm_model.intensity_extractor.to(device)

# imp#1
# -- textgrid? -> trimming 된 것 어떻게 처리할 것?
# intensity extractor의 결과 I의 time dimension이 phoneme sequence duration과 일치
# 각 phoneme sequence duration에 대해 I의 평균을 구한다.

start_idx = 0

averaged_intensity = []
for d in duration:
    phoneme_averaged = intensity[start_idx:start_idx + d].mean(dim=0)
    averaged_intensity.append(phoneme_averaged)
    start_idx += d






# fine_grained_emo_tts -> train dataset에서 manual intensity 추출출

