In [1]:
import sys
import os

# Make the parent directory importable so the project modules can be found.
module_path = os.path.abspath("..")
already_on_path = module_path in sys.path
if not already_on_path:
    sys.path.append(module_path)

In [22]:
import hifigan
from model import FastSpeech2, ScheduledOptim
import yaml
import torch
import torch.nn as nn
from dataset import Dataset

# Locations of the three YAML configuration files (machine-specific absolute paths).
model_yml ="C:\\Users\\derec\\OneDrive\\Documents\\F-VOICE\\config\\F-VOICE\\model.yaml"
preprocess_yml = "C:\\Users\\derec\\OneDrive\\Documents\\F-VOICE\\config\\F-VOICE\\preprocess.yaml"
train_yml = "C:\\Users\\derec\\OneDrive\\Documents\\F-VOICE\\config\\F-VOICE\\train.yaml"


def _load_yaml(path):
    """Parse the YAML file at ``path`` and return its contents.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes (the bare ``yaml.load(open(...))`` form leaves it open).
    """
    with open(path, "r") as f:
        return yaml.load(f, Loader=yaml.FullLoader)


model_config = _load_yaml(model_yml)
preprocess_config = _load_yaml(preprocess_yml)
train_config = _load_yaml(train_yml)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [54]:
from torch.utils.data import Dataset
from text import text_to_sequence
import numpy as np
from utils.tools import pad_1D, pad_2D
from torch.utils.data import DataLoader
from utils.tools import to_device,log,synth_one_sample

#Modificar cmudict de acuerdo al diccionario generado de MFA
class Dataset(Dataset):
    """Dataset of preprocessed utterances (phonemes plus acoustic features).

    NOTE: the class keeps the name ``Dataset`` because the rest of the notebook
    refers to it by that name; it therefore shadows the
    ``torch.utils.data.Dataset`` base class imported just above it.

    Each item is a dict with the keys ``id``, ``text``, ``raw_text``, ``mel``,
    ``pitch``, ``energy`` and ``duration``. The feature arrays are read from
    ``<preprocessed_path>/<feature>/<basename>-<feature>.npy``.

    Args:
        filename: Path of the metadata file; one ``basename|text|raw_text``
            entry per line.
        dataset_name: Name of the dataset (stored, not otherwise used here).
        preprocessed_path: Root directory of the preprocessed feature files.
        cleaners: Cleaner names forwarded to ``text_to_sequence``.
        batch_size: Size of the groups produced by ``collate_fn``.
        sort: If True, ``collate_fn`` orders samples by decreasing text length
            before grouping them.
        drop_last: If True, ``collate_fn`` discards the final incomplete group.
    """

    def __init__(self, filename,dataset_name,preprocessed_path,cleaners,batch_size,sort=False, drop_last=False):
        self.dataset_name = dataset_name
        self.preprocessed_path = preprocessed_path
        self.cleaners = cleaners
        self.batch_size = batch_size
        self.basename, self.text, self.raw_text = self.process_meta(filename)
        self.sort = sort
        self.drop_last = drop_last

    def __len__(self):
        """Return the number of metadata entries."""
        return len(self.text)

    def _load_feature(self, feature, basename):
        """Load ``<preprocessed_path>/<feature>/<basename>-<feature>.npy``."""
        path = os.path.join(
            self.preprocessed_path,
            feature,
            "{}-{}.npy".format(basename, feature),
        )
        return np.load(path)

    def __getitem__(self, idx):
        """Return the sample dict for entry ``idx`` (features are read from disk)."""
        basename = self.basename[idx]
        phone = np.array(text_to_sequence(self.text[idx], self.cleaners))

        return {
            "id": basename,
            "text": phone,
            "raw_text": self.raw_text[idx],
            "mel": self._load_feature("mel", basename),
            "pitch": self._load_feature("pitch", basename),
            "energy": self._load_feature("energy", basename),
            "duration": self._load_feature("duration", basename),
        }

    def process_meta(self, filename):
        """Parse the metadata file into parallel lists.

        Blank lines (for example a trailing empty line) are skipped.

        Returns:
            ``(names, texts, raw_texts)`` - three lists of equal length.

        Raises:
            ValueError: If a non-blank line does not have exactly three
                ``|``-separated fields.
        """
        name = []
        text = []
        raw_text = []
        with open(filename, "r", encoding="utf-8") as f:
            for lineno, line in enumerate(f, start=1):
                line = line.strip("\n")
                if not line.strip():
                    continue
                fields = line.split("|")
                if len(fields) != 3:
                    raise ValueError(
                        "{}:{}: expected 3 '|'-separated fields, got {}".format(
                            filename, lineno, len(fields)
                        )
                    )
                n, t, r = fields
                name.append(n)
                text.append(t)
                raw_text.append(r)
        return name, text, raw_text

    def reprocess(self, data, idxs):
        """Assemble the samples ``data[i] for i in idxs`` into one padded batch.

        Returns:
            ``(ids, raw_texts, texts, text_lens, max_text_len, mels, mel_lens,
            max_mel_len, pitches, energies, durations)`` where the sequence
            entries have been padded with ``pad_1D`` / ``pad_2D``.
        """
        samples = [data[idx] for idx in idxs]

        ids = [s["id"] for s in samples]
        raw_texts = [s["raw_text"] for s in samples]
        texts = [s["text"] for s in samples]
        mels = [s["mel"] for s in samples]

        # Lengths must be taken before padding.
        text_lens = np.array([text.shape[0] for text in texts])
        mel_lens = np.array([mel.shape[0] for mel in mels])

        return (
            ids,
            raw_texts,
            pad_1D(texts),
            text_lens,
            max(text_lens),
            pad_2D(mels),
            mel_lens,
            max(mel_lens),
            pad_1D([s["pitch"] for s in samples]),
            pad_1D([s["energy"] for s in samples]),
            pad_1D([s["duration"] for s in samples]),
        )

    def collate_fn(self, data):
        """Split the samples handed over by the DataLoader into batches.

        The samples are cut into groups of ``self.batch_size`` (optionally
        after sorting by decreasing text length); the incomplete remainder is
        kept as a final group unless ``drop_last`` is set.

        Returns:
            A list with one ``reprocess`` result per group.
        """
        if self.sort:
            len_arr = np.array([d["text"].shape[0] for d in data])
            idx_arr = np.argsort(-len_arr)
        else:
            idx_arr = np.arange(len(data))

        # Number of indices that fit into complete groups.
        n_full = len(idx_arr) - (len(idx_arr) % self.batch_size)
        tail = idx_arr[n_full:]
        groups = idx_arr[:n_full].reshape((-1, self.batch_size)).tolist()
        if not self.drop_last and len(tail) > 0:
            groups.append(tail.tolist())

        return [self.reprocess(data, group) for group in groups]
    
class TextDataset(Dataset):
    """Text-only dataset (no acoustic features).

    The base class here is whatever ``Dataset`` is bound to when this
    statement runs; every method this class uses is defined below.

    The metadata format matches the one read by the training ``Dataset`` in
    this notebook: one ``basename|text|raw_text`` entry per line. Lines with a
    fourth, speaker column (``basename|speaker|text|raw_text``) are accepted
    too; the speaker field is ignored.

    Args:
        filepath: Path of the metadata file.
        cleaners: Cleaner names forwarded to ``text_to_sequence``.
    """

    def __init__(self, filepath, cleaners):
        self.cleaners = cleaners

        self.basename, self.text, self.raw_text = self.process_meta(
            filepath
        )

    def __len__(self):
        """Return the number of metadata entries."""
        return len(self.text)

    def __getitem__(self, idx):
        """Return ``(basename, phoneme_ids, raw_text)`` for entry ``idx``."""
        basename = self.basename[idx]
        raw_text = self.raw_text[idx]
        phone = np.array(text_to_sequence(self.text[idx], self.cleaners))

        return (basename, phone, raw_text)

    def process_meta(self, filename):
        """Parse the metadata file into parallel lists.

        Blank lines are skipped.

        Returns:
            ``(names, texts, raw_texts)`` - three lists of equal length.

        Raises:
            ValueError: If a non-blank line has neither three nor four
                ``|``-separated fields.
        """
        name = []
        text = []
        raw_text = []
        with open(filename, "r", encoding="utf-8") as f:
            for lineno, line in enumerate(f, start=1):
                line = line.strip("\n")
                if not line.strip():
                    continue
                fields = line.split("|")
                if len(fields) == 4:
                    # basename|speaker|text|raw_text: drop the speaker column.
                    fields = [fields[0], fields[2], fields[3]]
                if len(fields) != 3:
                    raise ValueError(
                        "{}:{}: expected 3 or 4 '|'-separated fields, got {}".format(
                            filename, lineno, len(fields)
                        )
                    )
                n, t, r = fields
                name.append(n)
                text.append(t)
                raw_text.append(r)
        return name, text, raw_text

    def collate_fn(self, data):
        """Collate ``__getitem__`` tuples into one padded batch.

        Returns:
            ``(ids, raw_texts, texts, text_lens, max_text_len)`` with ``texts``
            padded by ``pad_1D``.
        """
        ids = [d[0] for d in data]
        texts = [d[1] for d in data]
        raw_texts = [d[2] for d in data]
        # Lengths must be taken before padding.
        text_lens = np.array([text.shape[0] for text in texts])

        texts = pad_1D(texts)

        return ids, raw_texts, texts, text_lens, max(text_lens)

In [55]:
filename = "metadata.csv"  # File that lists the utterance names and their metadata
dataset_name = "F-VOICE"
preprocessed_path = "C:\\Users\\derec\\OneDrive\\Documents\\F-VOICE\\notebook_experimentos\\dereckpreprocessed"
cleaners = ["spanish_cleaners"]
batch_size = 16
trainset = ".\\dereckpreprocessed\\train.txt"
valset = ".\\dereckpreprocessed\\val.txt"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
stats_path = "C:\\Users\\derec\\OneDrive\\Documents\\F-VOICE\\notebook_experimentos\\dereckpreprocessed\\stats.json"
graph_path ="\\graficos"

# Dataset built from the full metadata file
dataset = Dataset(filename, dataset_name, preprocessed_path, cleaners, batch_size)

# Train / validation splits. The group size handed to the dataset is the single
# `batch_size` setting above instead of a repeated literal.
train_dataset = Dataset(
        trainset, dataset_name,preprocessed_path,cleaners,batch_size=batch_size, sort=True, drop_last=True)
val_dataset = Dataset(
        valset, dataset_name,preprocessed_path,cleaners,batch_size=batch_size, sort=True, drop_last=True)


# Each DataLoader item is a *list* of batches (see Dataset.collate_fn); the
# training loader hands over 4 * batch_size samples at a time.
train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size * 4,
        shuffle=True,
        collate_fn=train_dataset.collate_fn,
    )
val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=val_dataset.collate_fn,
    )


def count_batches(data_loader):
    """Return how many batches one full pass over ``data_loader`` yields.

    Every item of the loader is a list of batches, so the list lengths are
    summed. The batches are only counted; they are not moved to the device.
    """
    return sum(len(batchs) for batchs in data_loader)


print("Training set  with size {} is composed of {} batches.".format(len(train_dataset), count_batches(train_loader)))
print("Validation set  with size {} is composed of {} batches.".format(len(val_dataset), count_batches(val_loader)))


# NOTE(review): the training cell iterates over this `loader` (built from
# `filename`), not over `train_loader` - confirm that this is intended.
loader = DataLoader(
        dataset,
        batch_size=batch_size * 4,
        shuffle=True,
        collate_fn=dataset.collate_fn,
    )

Training set  with size 300 is composed of 18 batches.
Validation set  with size 100 is composed of 6 batches.


In [None]:
# Load the pre-trained FastSpeech2 model
from model import FastSpeech2Loss

restore_step = 100000  # Last step of the pre-trained model
# Directory that holds the "<step>.pth.tar" checkpoints. The training cell
# saves new checkpoints into this same directory, so the checkpoint to restore
# is read from it with the same naming scheme.
ckpt_path = "checkpoint"
ckpt = torch.load(
    os.path.join(ckpt_path, "{}.pth.tar".format(restore_step)),
    map_location=device,  # lets the checkpoint load on a CPU-only machine too
)

model = FastSpeech2(preprocess_config, model_config).to(device)
model.load_state_dict(ckpt["model"])

# Build the optimizer first and restore its state afterwards: chaining
# `.load_state_dict(...)` onto the constructor would bind `optimizer` to that
# call's return value instead of to the optimizer object.
optimizer = ScheduledOptim(model, train_config, model_config, restore_step)
optimizer.load_state_dict(ckpt["optimizer"])

# The model is about to be fine-tuned, so it is left in training mode.
model.train()

model = nn.DataParallel(model)

# Freeze the encoder parameters
for param in model.module.encoder.parameters():
    param.requires_grad = False

# Freeze the pitch and energy embeddings
model.module.variance_adaptor.pitch_embedding.weight.requires_grad = False
model.module.variance_adaptor.energy_embedding.weight.requires_grad = False

# Total parameter count (frozen parameters included).
num_param = sum(param.numel() for param in model.parameters())
Loss = FastSpeech2Loss(preprocess_config, model_config).to(device)
print("Number of FastSpeech2 Parameters:", num_param)


In [56]:
def evaluate(model, step,valname,dataset_name,train_config ,preprocess_config,model_config, logger=None, vocoder=None, preprocessed_path=None, cleaners=None):
    """Run ``model`` over a validation metadata file and report the mean losses.

    Args:
        model: Model to evaluate; it is called as ``model(*batch[2:])``.
        step: Training step used in the message and in the log tags.
        valname: Path of the validation metadata file.
        dataset_name: Dataset name forwarded to ``Dataset``.
        train_config: Training config; ``["optimizer"]["batch_size"]`` is read.
        preprocess_config: Preprocessing config (loss, synthesis, sampling rate).
        model_config: Model config (loss, synthesis).
        logger: Optional logger handed to ``log``; when given, the losses plus
            one figure and two audio clips from the last batch are logged.
        vocoder: Vocoder forwarded to ``synth_one_sample``.
        preprocessed_path: Root directory of the preprocessed features. When
            omitted, the notebook-level ``preprocessed_path`` variable is used.
        cleaners: Text cleaner names. When omitted, the notebook-level
            ``cleaners`` variable is used.

    Returns:
        A one-line summary string with the six mean losses.
    """
    # `Dataset` needs the feature directory, the cleaners and a batch size.
    # Callers that do not pass the first two get the notebook-level settings.
    if preprocessed_path is None:
        preprocessed_path = globals()["preprocessed_path"]
    if cleaners is None:
        cleaners = globals()["cleaners"]

    # Get dataset
    batch_size = train_config["optimizer"]["batch_size"]
    dataset = Dataset(
        valname,
        dataset_name,
        preprocessed_path,
        cleaners,
        batch_size,
        sort=False,
        drop_last=False,
    )
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=dataset.collate_fn,
    )

    # Get loss function
    Loss = FastSpeech2Loss(preprocess_config, model_config).to(device)

    # Evaluation: accumulate every loss weighted by the number of samples in
    # its batch so the final division yields per-sample means.
    loss_sums = [0] * 6
    for batchs in loader:
        for batch in batchs:
            batch = to_device(batch, device)
            with torch.no_grad():
                # Forward
                output = model(*(batch[2:]))

                # Compute the losses
                losses = Loss(batch, output)

                for i in range(len(losses)):
                    loss_sums[i] += losses[i].item() * len(batch[0])

    loss_means = [loss_sum / len(dataset) for loss_sum in loss_sums]

    message = "Validation Step {}, Total Loss: {:.4f}, Mel Loss: {:.4f}, Mel PostNet Loss: {:.4f}, Pitch Loss: {:.4f}, Energy Loss: {:.4f}, Duration Loss: {:.4f}".format(
        step, *loss_means
    )

    if logger is not None:
        # `batch` / `output` still refer to the last batch of the loop above.
        fig, wav_reconstruction, wav_prediction, tag = synth_one_sample(
            batch,
            output,
            vocoder,
            model_config,
            preprocess_config,
        )

        log(logger, step, losses=loss_means)
        log(
            logger,
            fig=fig,
            tag="Validation/step_{}_{}".format(step, tag),
        )
        sampling_rate = preprocess_config["preprocessing"]["audio"]["sampling_rate"]
        log(
            logger,
            audio=wav_reconstruction,
            sampling_rate=sampling_rate,
            tag="Validation/step_{}_{}_reconstructed".format(step, tag),
        )
        log(
            logger,
            audio=wav_prediction,
            sampling_rate=sampling_rate,
            tag="Validation/step_{}_{}_synthesized".format(step, tag),
        )

    return message

VOCODER 


In [None]:
# The pre-trained HiFi-GAN model must be loaded
import json

# Both files live in the local "hifigan" directory; os.path.join keeps the
# two paths in one consistent, platform-independent form.
hifigan_dir = "hifigan"
with open(os.path.join(hifigan_dir, "config.json"), "r") as f:
    config = hifigan.AttrDict(json.load(f))
vocoder = hifigan.Generator(config)

ckpt = torch.load(
    os.path.join(hifigan_dir, "generator_universal.pth.tar"),
    map_location=device,  # lets the checkpoint load on a CPU-only machine too
)
vocoder.load_state_dict(ckpt["generator"])
vocoder.eval()
vocoder.remove_weight_norm()
vocoder.to(device)


Tensorboard logger

In [None]:
from torch.utils.tensorboard import SummaryWriter

# The log directories must be non-empty (os.makedirs("") raises
# FileNotFoundError) and distinct, otherwise the training and validation
# "log.txt" files written later would be one and the same file.
# Adjust `log_root` to wherever the logs should be stored.
log_root = "log"
train_log_path = os.path.join(log_root, "train")
val_log_path = os.path.join(log_root, "val")
os.makedirs(train_log_path, exist_ok=True)
os.makedirs(val_log_path, exist_ok=True)
train_logger = SummaryWriter(train_log_path)
val_logger = SummaryWriter(val_log_path)

Entrenamiento


In [None]:
# Counters: training resumes right after the restored step.
step, epoch = restore_step + 1, 1

# Optimizer-related settings from the training config.
optimizer_cfg = train_config["optimizer"]
grad_acc_step = optimizer_cfg["grad_acc_step"]
grad_clip_thresh = optimizer_cfg["grad_clip_thresh"]

# Step intervals / limits from the training config.
step_cfg = train_config["step"]
total_step, log_step, save_step, synth_step, val_step = (
    step_cfg[key]
    for key in ("total_step", "log_step", "save_step", "synth_step", "val_step")
)

In [None]:
# Import the progress-bar class itself: `import tqdm` only binds the module,
# which cannot be called as `tqdm(...)`.
from tqdm import tqdm

outer_bar = tqdm(total=total_step, desc="Training", position=0)
outer_bar.n = restore_step
outer_bar.update()

# NOTE(review): this loop iterates over `loader`; confirm that it, rather than
# `train_loader`, is the loader meant for training.
training_done = False
while not training_done:
    inner_bar = tqdm(total=len(loader), desc="Epoch {}".format(epoch), position=1)
    for batchs in loader:
        for batch in batchs:
            batch = to_device(batch, device)

            # Forward
            output = model(*(batch[2:]))

            # Calculate Loss
            losses = Loss(batch, output)
            total_loss = losses[0]

            # Backward (the loss is scaled for gradient accumulation)
            total_loss = total_loss / grad_acc_step
            total_loss.backward()
            if step % grad_acc_step == 0:
                # Clipping gradients to avoid gradient explosion
                nn.utils.clip_grad_norm_(model.parameters(), grad_clip_thresh)

                # Update weights
                optimizer.step_and_update_lr()
                optimizer.zero_grad()

            if step % log_step == 0:
                losses = [l.item() for l in losses]
                message1 = "Step {}/{}, ".format(step, total_step)
                message2 = "Total Loss: {:.4f}, Mel Loss: {:.4f}, Mel PostNet Loss: {:.4f}, Pitch Loss: {:.4f}, Energy Loss: {:.4f}, Duration Loss: {:.4f}".format(
                    *losses
                )

                with open(os.path.join(train_log_path, "log.txt"), "a") as f:
                    f.write(message1 + message2 + "\n")

                outer_bar.write(message1 + message2)

                log(train_logger, step, losses=losses)

            if step % synth_step == 0:
                fig, wav_reconstruction, wav_prediction, tag = synth_one_sample(
                    batch,
                    output,
                    vocoder,
                    model_config = model_config,
                    preprocess_config=preprocess_config,
                )

                log(
                    train_logger,
                    fig=fig,
                    tag="Training/step_{}_{}".format(step, tag),
                )
                sampling_rate = preprocess_config["preprocessing"]["audio"][
                    "sampling_rate"
                ]
                log(
                    train_logger,
                    audio=wav_reconstruction,
                    sampling_rate=sampling_rate,
                    tag="Training/step_{}_{}_reconstructed".format(step, tag),
                )
                log(
                    train_logger,
                    audio=wav_prediction,
                    sampling_rate=sampling_rate,
                    tag="Training/step_{}_{}_synthesized".format(step, tag),
                )

            if step % val_step == 0:
                # Validate in eval mode, then switch back to training mode.
                model.eval()
                message = evaluate(model, step,valset,dataset_name,train_config ,preprocess_config,model_config, val_logger, vocoder)
                with open(os.path.join(val_log_path, "log.txt"), "a") as f:
                    f.write(message + "\n")
                outer_bar.write(message)

                model.train()

            if step % save_step == 0:
                # Make sure the checkpoint directory exists before writing.
                os.makedirs(ckpt_path, exist_ok=True)
                torch.save(
                    {
                        "model": model.module.state_dict(),
                        "optimizer": optimizer._optimizer.state_dict(),
                    },
                    os.path.join(
                       ckpt_path,
                        "{}.pth.tar".format(step),
                    ),
                )

            # Stop by leaving the loops instead of calling quit(), which would
            # end the notebook session. `>=` also stops a run that was resumed
            # at or beyond `total_step`.
            if step >= total_step:
                training_done = True
                break
            step += 1
            outer_bar.update(1)

        if training_done:
            break
        inner_bar.update(1)

    inner_bar.close()
    if not training_done:
        epoch += 1

outer_bar.close()
