# Welcome to the CeVOX ENUNU/NNSVS Training Notebook!

*Last update: `2022-12-10 12:25 (YYYY-MM-DD hh[24]:mm GMT+8)`* 

**NOTE:** This notebook uses NNSVS v0.0.3 to stay compatible with ENUNU. Hopefully ENUNU catches up soon, but I'm not gonna get my hopes up that much!

This notebook is made by DogeyVOX

# Check Setup

In [None]:
#@title Check GPU Type

#@markdown Google usually gives T4s now so there's nothing to really worry about but hey !

# List the attached GPU(s) by name, then print the full nvidia-smi report.
!nvidia-smi -L
!nvidia-smi

In [None]:
#@title Mount Google Drive

#@markdown This makes things easy so just do it.

# Force a clean (re)mount: unmount any existing Drive mount, remove the old
# mount point, then mount Drive again at /content/drive.
from google.colab import drive
drive.flush_and_unmount()
# NOTE(review): this recursively deletes /content/drive. It presumably relies on
# the unmount above having succeeded, so that only the leftover mount point is
# removed and not Drive contents - confirm.
!rm -rf /content/drive
drive.mount('/content/drive')
print('Done!')

# Preparation

In [None]:
#@title # Step 1: Install training kit

#@markdown You obviously just run this once.

from IPython.display import clear_output

print('Installing prerequisites.')
!rm -rf /content/sample_data
!python -m pip install --upgrade wheel
!apt-get install p7zip-full
%pip install numpy cython utaupy tqdm pydub pyyaml natsort mlflow optuna hydra-optuna-sweeper
%pip install git+https://github.com/MattShannon/bandmat
clear_output()

print('Installing PyTorch.')
%pip install torch==1.10.0+cu113 torchvision==0.11.1+cu113 torchaudio===0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
%pip install hydra-core<1.1
clear_output()

print('Downloading ENUNU training kit.')
!git clone https://github.com/UtaUtaUtau/enunu_training_kit/
clear_output()

import yaml

def load_yaml(location):
    """Read a YAML file and return the parsed document.

    Parameters
    ----------
    location : str
        Path of the YAML file to read.

    Returns
    -------
    object
        Whatever ``yaml.safe_load`` produces for the file's content.
    """
    # Read as UTF-8 explicitly so loading matches write_yaml, which always
    # writes UTF-8, instead of depending on the runtime's default encoding.
    with open(location, encoding='utf8') as f:
        return yaml.safe_load(f)

def write_yaml(data, location):
    """Serialise ``data`` as YAML into the file at ``location`` (written as UTF-8).

    The dump is requested in block style (``default_flow_style=False``), with
    unicode allowed and without key sorting, so keys keep the order of ``data``.
    """
    dump_options = dict(default_flow_style=False, allow_unicode=True, sort_keys=False)
    with open(location, mode='w', encoding='utf8') as stream:
        yaml.dump(data, stream, **dump_options)

def represent_none(self, _):
    """YAML representer that emits ``None`` as a null-tagged scalar with empty text."""
    null_tag = 'tag:yaml.org,2002:null'
    return self.represent_scalar(null_tag, '')

# Dump None values through represent_none.
yaml.add_representer(type(None), represent_none)

# Root of the training kit and the per-model config folders below it.
enunu_base = '/content/enunu_training_kit/train'
acoustic_base = f'{enunu_base}/conf/train/acoustic'
duration_base = f'{enunu_base}/conf/train/duration'
timelag_base = f'{enunu_base}/conf/train/timelag'
postfilter_base = f'{enunu_base}/conf/train/postfilter'

# Top-level kit configs.
config = load_yaml(f'{enunu_base}/config.yaml')
enuconfig = load_yaml(f'{enunu_base}/enuconfig.yaml')

# Acoustic model / data / train configs.
acoustic = load_yaml(f'{acoustic_base}/model/acoustic_custom.yaml')
acoustic_data = load_yaml(f'{acoustic_base}/data/myconfig.yaml')
acoustic_train = load_yaml(f'{acoustic_base}/train/myconfig.yaml')

# Duration model / data / train configs.
duration = load_yaml(f'{duration_base}/model/duration_custom.yaml')
duration_data = load_yaml(f'{duration_base}/data/myconfig.yaml')
duration_train = load_yaml(f'{duration_base}/train/myconfig.yaml')

# Timelag model / data / train configs.
timelag = load_yaml(f'{timelag_base}/model/timelag_custom.yaml')
timelag_data = load_yaml(f'{timelag_base}/data/myconfig.yaml')
timelag_train = load_yaml(f'{timelag_base}/train/myconfig.yaml')

# Postfilter configs (separate mgc and bap model/train files, shared data file).
postfilter_mgc = load_yaml(f'{postfilter_base}/model/postfilter_mgc.yaml')
postfilter_bap = load_yaml(f'{postfilter_base}/model/postfilter_bap.yaml')
postfilter_data = load_yaml(f'{postfilter_base}/data/myconfig.yaml')
postfilter_mgc_train = load_yaml(f'{postfilter_base}/train/mgc.yaml')
postfilter_bap_train = load_yaml(f'{postfilter_base}/train/bap.yaml')

# Install the pinned NNSVS release (v0.0.3) from its GitHub tag archive,
# then clear the installer output.
print('Installing NNSVS')
%pip install https://github.com/nnsvs/nnsvs/archive/refs/tags/v0.0.3.zip
clear_output()
print('Done!')

Done!


In [None]:
#@title # Step 2: Decompress dataset

#@markdown This'll deal with COMMON archive types so don't worry about anything.

#@markdown Supported types: `.rar`, `.zip`, `.tar`, `.tar.gz`, `.tar.bz2`, `.7z`

#@markdown ---
#@markdown Empty dataset folder if ig u made a mistake.

empty_dataset_folder = False #@param {type: "boolean"}

#@markdown File location stuff. You know the drill. It's case sensitive.
dataset_loc = '/content/drive/MyDrive/database.*' #@param {type: "string"}
enunu_loc = '/content/enunu_training_kit/train/singing_database'

#@markdown ---

#@markdown Sample rate is a requirement now so
sample_rate = "44100" #@param [44100, 48000]

# Store the selected sample rate (as an int) in both kit configs and save them.
# `config`, `enuconfig`, `enunu_base` and `write_yaml` come from the Step 1 cell.
config['sample_rate'] = int(sample_rate)
enuconfig['sample_rate'] = int(sample_rate)
write_yaml(config, enunu_base + '/config.yaml')
write_yaml(enuconfig, enunu_base + '/enuconfig.yaml')

import os

# Optionally wipe a previously extracted dataset before extracting again.
if empty_dataset_folder:
    !rm -rf "$enunu_loc"

# Make sure the extraction target exists.
if not os.path.exists(enunu_loc):
    !mkdir "$enunu_loc"

# Pick the extractor from the (case-sensitive) file extension. Anything that
# matches none of the listed suffixes falls through to 7za.
if dataset_loc.endswith('.rar'):
    !unrar x "$dataset_loc" "$enunu_loc"
elif dataset_loc.endswith('.zip'):
    !unzip "$dataset_loc" -d "$enunu_loc"
elif dataset_loc.endswith('.tar'):
    !tar -xf "$dataset_loc" -C "$enunu_loc"
elif dataset_loc.endswith('.tar.gz'):
    !tar -xzf "$dataset_loc" -C "$enunu_loc"
elif dataset_loc.endswith('.tar.bz2'):
    !tar -xjf "$dataset_loc" -C "$enunu_loc"
else:
    !7za x "$dataset_loc" -o$enunu_loc

print('Done!')

# Training Options/Parameters

These are lowkey not optional so yeah.

In [None]:
#@title # Step 3: Set Language

#@markdown Set the language for your model or something.

#@markdown Mode guide:
#@markdown - Default: Default Japanese
#@markdown - Custom: Get from a directory
#@markdown - Archive: Get from an archive. `.zip` only
#@markdown - GitHub: Get from a GitHub repository
mode = "Custom" #@param ["Default", "Custom", "Archive", "GitHub"]

#@markdown ---
#@markdown ### The location of the custom .hed and .table

#@markdown Can be a .zip location, a directory, or a GitHub repo.
lang_loc = '/content/drive/MyDrive/whatever.*' #@param {type: "string"}

#@markdown ---
#@markdown ### Filenames of .hed and .table

hed_file = 'Custom_HED.hed' #@param {type: "string"}
table_file = 'dict.table' #@param {type: "string"}

#@markdown ---
#@markdown ### Vowels of the language

vowel_list = 'a, i, u, e, o, A, I, U, E, O, N' #@param {type: "string"}
# Raw comma split only; surrounding whitespace is stripped further down in this
# cell, and the list is replaced entirely when mode is 'Default'.
vowels = vowel_list.split(',')

import os, shutil
from nnmnkwii.io import hts

def find_language_files(location):
    """Copy the configured .hed and .table files from ``location`` into the training kit.

    Walks ``location`` recursively. Every file whose name equals the global
    ``hed_file`` is copied into ``enunu_base + '/hed'`` and every file whose
    name equals the global ``table_file`` is copied into ``enunu_base + '/dic'``.
    A warning is printed for each of the two names that is never found, so a
    wrong path or filename is visible right away instead of only surfacing
    later when the file is read.

    Parameters
    ----------
    location : str
        Directory to search.
    """
    targets = (
        (hed_file, enunu_base + '/hed'),
        (table_file, enunu_base + '/dic'),
    )
    found = set()
    for root, _dirs, files in os.walk(location):
        for name in files:
            for wanted, destination in targets:
                if name == wanted:
                    shutil.copy(os.path.join(root, name), destination)
                    found.add(wanted)

    # os.walk yields nothing for a missing directory, so report misses explicitly.
    for wanted, _destination in targets:
        if wanted not in found:
            print(f"Warning: '{wanted}' was not found under '{location}'.")

if mode == 'Default':
    hed_file = 'jp_qst_crazy_mono_013_enunu_221D.hed'
    table_file = 'kana2phonemes_002_oto2lab.table'
    vowels = ['a', 'i', 'u', 'e', 'o', 'A', 'I', 'U', 'E', 'O', 'N']
elif mode == 'Custom':
    find_language_files(lang_loc)
elif mode == 'Archive':
    !unzip "$lang_loc" -d /content/custom_lang
    find_language_files('/content/custom_lang')
    !rm -rf /content/custom_lang
elif mode == 'GitHub':
    !git clone "$lang_loc" /content/custom_lang
    find_language_files('/content/custom_lang')
    !rm -rf /content/custom_lang
    
in_dim = 0

binary, continuous = hts.load_question_set(enunu_base + '/hed/' + hed_file)
in_dim = len(binary) + len(continuous)

print('in_dim: ', in_dim)

for i in range(len(vowels)):
    vowels[i] = "'" + vowels[i].strip().replace("'", "\\'") + "'"

script = open(enunu_base + '/stage0/compare_mono_align_and_mono_score.py').readlines()

with open(enunu_base + '/stage0/compare_mono_align_and_mono_score.py', 'w', encoding = 'utf8') as f:
    for i in script:
        if i.startswith('VOWELS = '):
            f.write('VOWELS = {' + ', '.join(vowels) + '}\n')
        else:
            f.write(i)

config['table_path'] = 'dic/' + table_file
config['question_path'] = 'hed/' + hed_file
enuconfig['table_path'] = 'dic/' + table_file
enuconfig['question_path'] = 'hed/' + hed_file

acoustic['netG']['in_dim'] = in_dim + 4
duration['netG']['in_dim'] = in_dim
timelag['netG']['in_dim'] = in_dim

write_yaml(acoustic, acoustic_base + '/model/acoustic_custom.yaml')
write_yaml(duration, duration_base + '/model/duration_custom.yaml')
write_yaml(timelag, timelag_base + '/model/timelag_custom.yaml')

write_yaml(config, enunu_base + '/config.yaml')
write_yaml(enuconfig, enunu_base + '/enuconfig.yaml')

In [None]:
#@title # Step 4: Change Model Type

#@markdown Some of these models might not work in ENUNU, but I just added them cuz why not !

# Model class names; each is appended to 'nnsvs.model.' later in this cell.
acoustic_model = 'Conv1dResnet' #@param ["Conv1dResnet", "FeedForwardNet", "LSTMRNN", "LSTMRNNSAR", "Conv1dResnetSAR", "MDN", "MDNv2", "RMDN", "FFConvLSTM", "VariancePredictor", "ResF0Conv1dResnet", "ResSkipF0FFConvLSTM", "ResF0VariancePredictor"]
duration_model = 'MDN' #@param ["LSTMRNN", "LSTMRNNSAR", "FeedForwardNet", "MDN", "MDNv2", "RMDN"]
timelag_model = 'MDN' #@param ["FeedForwardNet", "MDN", "MDNv2", "RMDN"]

#@markdown ---

#@markdown ### Vibrato Modeling

#@markdown Yeah. Vibrato... idk how to explain it.

# Selects the acoustic feature layout (stream sizes / out_dim) set later in this cell.
vibrato_mode = 'none' #@param ["none", "sine", "diff"]

#@markdown ---

#@markdown ### Initialization Type

#@markdown Acoustic might train better with an init_type.

# 'none' removes the init_type key from the model config instead of writing it.
acoustic_init = 'kaiming_normal' #@param ["none", "kaiming_normal", "xavier_normal"]
duration_init = 'none' #@param ["none", "kaiming_normal", "xavier_normal"]
timelag_init = 'none' #@param ["none", "kaiming_normal", "xavier_normal"]

#@markdown ---

#@markdown ### Feats Criterion/Loss Function

#@markdown L1/MAE loss seems to work better generally.

#@markdown **Disclaimer:** MAE loss seems to drop vibrato. This isn't completely confirmed so you're free to try anyways.

# Written to feats_criterion in each model's train config.
acoustic_criterion = 'mae' #@param ["mse", "mae"]
duration_criterion = 'mae' #@param ["mse", "mae"]
timelag_criterion = 'mae' #@param ["mse", "mae"]

#@markdown ---

#@markdown This option is for `Conv1dResnet`, `VariancePredictor` and ResF0 models only.

use_mdn = False #@param {type : "boolean"}

#@markdown ---

#@markdown ###This part is for ResF0 models only.

#@markdown Specify which part of the HED file is the central silences. It's automatically dealt with for the Japanese Default HED tho so no worries in that case.

# Name of the binary question used to locate in_rest_idx for ResF0 models.
silences = 'C-Phone_Silences' #@param {type : "string"}

#@markdown Okay apparently this is pitch regularization.
pitch_reg = 1.0 #@param {type : "number"}
#@markdown This tells how big the smoothing is or smn.
pitch_reg_decay_size = 15 #@param {type : "integer"}

# Point each network config at the selected NNSVS model class.
for net_conf, model_name in (
    (acoustic, acoustic_model),
    (duration, duration_model),
    (timelag, timelag_model),
):
    net_conf['netG']['_target_'] = f'nnsvs.model.{model_name}'

# Record the chosen loss function in each train config.
for train_conf, criterion in (
    (acoustic_train, acoustic_criterion),
    (duration_train, duration_criterion),
    (timelag_train, timelag_criterion),
):
    train_conf['feats_criterion'] = criterion

# Write the init type, or drop the key altogether when 'none' was selected.
for net_conf, init_choice in (
    (acoustic, acoustic_init),
    (duration, duration_init),
    (timelag, timelag_init),
):
    if init_choice == 'none':
        net_conf['netG'].pop('init_type', None)
    else:
        net_conf['netG']['init_type'] = init_choice

#Set use_mdn for models that use them, remove them if not.
supports_use_mdn = (
    acoustic_model in ['Conv1dResnet', 'VariancePredictor']
    or acoustic_model.startswith('Res')
)
# Only honour the use_mdn checkbox for models that take that option. Otherwise
# a ticked box would still add dim_wise (and disable anomaly detection) for a
# model whose config should carry neither key.
mdn_enabled = use_mdn and supports_use_mdn
if use_mdn and not supports_use_mdn:
    print(f'use_mdn is ignored for {acoustic_model}.')

if supports_use_mdn:
    acoustic['netG']['use_mdn'] = use_mdn
else:
    acoustic['netG'].pop('use_mdn', None)

#Set things for MDN acoustic
# MDN-style output (an MDN model class, or use_mdn on a supporting model)
# gets dim_wise and runs with anomaly detection off; everything else drops
# dim_wise and keeps anomaly detection on.
if 'MDN' in acoustic_model or mdn_enabled:
    acoustic['netG']['dim_wise'] = True
    acoustic_train['use_detect_anomaly'] = False
else:
    acoustic['netG'].pop('dim_wise', None)
    acoustic_train['use_detect_anomaly'] = True

#Set bidirectional for LSTM/RNN models, remove if not.
for net_conf, model_name in (
    (acoustic, acoustic_model),
    (duration, duration_model),
    (timelag, timelag_model),
):
    if 'LSTM' in model_name or model_name == 'RMDN':
        net_conf['netG']['bidirectional'] = True
    else:
        # Drop a stale flag left over from a previously selected model.
        net_conf['netG'].pop('bidirectional', None)

#Deal with ResF0 models.
if acoustic_model.startswith('Res'):
    # The default Japanese question set uses its own name for the silence question.
    if mode == 'Default':
        silences = 'p4_C-Phone_Muon'

    # Index of the first binary question named `silences` (0 when none matches).
    in_rest_idx = next(
        (idx for idx, question in enumerate(binary) if question[0] == silences),
        0,
    )
    # Index of the first continuous question whose name starts with 'e1',
    # offset by the number of binary questions (0 when none matches).
    in_lf0_idx = next(
        (idx + len(binary) for idx, question in enumerate(continuous) if question[0].startswith('e1')),
        0,
    )

    print(f'in_rest_idx: {in_rest_idx}\nin_lf0_idx: {in_lf0_idx}')

    config['relative_f0'] = False

    acoustic_data.update({
        'sample_rate': int(sample_rate),
        'in_lf0_idx': in_lf0_idx,
        'in_rest_idx': in_rest_idx,
        'out_lf0_idx': 180,
    })

    acoustic['netG'].update({
        'in_lf0_idx': in_lf0_idx,
        'out_lf0_idx': 180,
        'in_lf0_min': None,
        'in_lf0_max': None,
        'out_lf0_mean': None,
        'out_lf0_scale': None,
    })

    acoustic_train.update({
        'pitch_reg_weight': pitch_reg,
        'pitch_reg_decay_size': pitch_reg_decay_size,
        'use_detect_anomaly': False,
    })
else:
    # Not a ResF0 model: strip every ResF0-only key from the three configs.
    for target_conf, resf0_keys in (
        (acoustic_data, ['sample_rate', 'in_lf0_idx', 'in_rest_idx', 'out_lf0_idx']),
        (acoustic['netG'], ['in_lf0_idx', 'out_lf0_idx', 'in_lf0_min', 'in_lf0_max', 'out_lf0_mean', 'out_lf0_scale']),
        (acoustic_train, ['pitch_reg_weight', 'pitch_reg_decay_size']),
    ):
        for key in resf0_keys:
            target_conf.pop(key, None)

    config['relative_f0'] = True

# Set vibrato mode stuff
# Each preset: (acoustic_features name, extra acoustic stream sizes,
# extra has_dynamic_features flags, acoustic out_dim, extra postfilter stream sizes).
# The extras are appended to the four base streams shared by every mode.
vibrato_presets = {
    'none': ('static_deltadelta', [], [], 199, []),
    'sine': ('static_deltadelta_sinevib', [6, 1], [True, False], 206, [2, 1]),
}
diffvib_preset = ('static_deltadelta_diffvib', [3], [True], 202, [1])

# Any mode other than 'none' and 'sine' is treated as 'diff'.
feature_name, extra_sizes, extra_dynamic, acoustic_out_dim, extra_pf_sizes = vibrato_presets.get(
    vibrato_mode, diffvib_preset
)
stream_count = 4 + len(extra_sizes)

config['acoustic_features'] = feature_name
acoustic['stream_sizes'] = [180, 3, 1, 15] + extra_sizes
acoustic['has_dynamic_features'] = [True, True, False, True] + extra_dynamic
acoustic['netG']['out_dim'] = acoustic_out_dim

# The two postfilters share the stream layout and differ only in which single
# stream is flagged in adv_streams: index 0 for mgc, index 3 for bap.
for pf_model, pf_train, adv_index in (
    (postfilter_mgc, postfilter_mgc_train, 0),
    (postfilter_bap, postfilter_bap_train, 3),
):
    pf_model['stream_sizes'] = [60, 1, 1, 5] + extra_pf_sizes
    pf_model['has_dynamic_features'] = [False] * stream_count
    pf_train['adv_streams'] = [idx == adv_index for idx in range(stream_count)]

# Save every config this cell may have changed back into the training kit.
for document, destination in (
    (config, enunu_base + '/config.yaml'),
    (acoustic_data, acoustic_base + '/data/myconfig.yaml'),
    (acoustic_train, acoustic_base + '/train/myconfig.yaml'),
    (duration_train, duration_base + '/train/myconfig.yaml'),
    (timelag_train, timelag_base + '/train/myconfig.yaml'),
    (acoustic, acoustic_base + '/model/acoustic_custom.yaml'),
    (duration, duration_base + '/model/duration_custom.yaml'),
    (timelag, timelag_base + '/model/timelag_custom.yaml'),
    (postfilter_mgc, postfilter_base + '/model/postfilter_mgc.yaml'),
    (postfilter_mgc_train, postfilter_base + '/train/mgc.yaml'),
    (postfilter_bap, postfilter_base + '/model/postfilter_bap.yaml'),
    (postfilter_bap_train, postfilter_base + '/train/bap.yaml'),
):
    write_yaml(document, destination)


In [None]:
#@title # Step 5: Change Training Parameters

#@markdown ---
#@markdown ### Resume from checkpoint
#@markdown If resuming from a checkpoint duh.
resume_from_checkpoint = False #@param {type: "boolean"}
resume_loc = '/content/the_model' #@param {type: "string"}

#@markdown ---
#@markdown ### Segmentation and Checkpoint Intervals
#@markdown How much the data is split and how many checkpoints are saved while training (roughly)
middle_frequency = 3 #@param {type: "integer"}
checkpoints = 8 #@param {type: "integer"}
# Guard against zero/negative input; the value is used as a divisor below.
checkpoints = max(checkpoints, 1)

#@markdown ---
#@markdown ### F0/Pitch Estimation Settings
#@markdown Range of F0 detection. This is in Hz but you can reference [this](https://pages.mtu.edu/~suits/notefreqs.html) for the pitches.
f0_floor = 150 #@param {type: "number"}
f0_ceil = 700 #@param {type: "number"}

#@markdown This is for avoiding conflicts in F0 estimation in voiced/unvoiced parts. Requires your hed file to have `C-VUV_Voiced` and `C-VUV_Unvoiced`
correct_vuv = True #@param {type: "boolean"}

# Each *_interval below is the number of epochs between checkpoints. It is
# clamped to at least 1 because `epochs // checkpoints` is 0 whenever more
# checkpoints than epochs are requested, and 0 is not a usable interval.

#@markdown ---
#@markdown ### Acoustic Settings
acoustic_epochs = 64 #@param {type: "integer"}
acoustic_hidden_dim = 256 #@param {type: "integer"}
acoustic_num_layers = 6 #@param {type: "integer"}
acoustic_dropout = 0.1 #@param {type: "slider", min: 0, max: 1, step: 0.1}
acoustic_batch_size = 8 #@param {type: "integer"}
acoustic_interval = max(1, acoustic_epochs // checkpoints)

#@markdown ---
#@markdown ### Duration Settings
duration_epochs = 128 #@param {type: "integer"}
duration_hidden_dim = 256 #@param {type: "integer"}
duration_num_layers = 3 #@param {type: "integer"}
duration_dropout = 0.5 #@param {type: "slider", min: 0, max: 1, step: 0.1}
duration_batch_size = 8 #@param {type: "integer"}
duration_interval = max(1, duration_epochs // checkpoints)

#@markdown ---
#@markdown ### Timelag Settings
timelag_epochs = 128 #@param {type: "integer"}
timelag_hidden_dim = 256 #@param {type: "integer"}
timelag_num_layers = 3 #@param {type: "integer"}
timelag_dropout = 0.5 #@param {type: "slider", min: 0, max: 1, step: 0.1}
timelag_batch_size = 8 #@param {type: "integer"}
timelag_interval = max(1, timelag_epochs // checkpoints)

#@markdown ---
#@markdown ### Postfilter Settings
postfilter_epochs = 256 #@param {type: "integer"}
postfilter_batch_size = 8 #@param {type: "integer"}
postfilter_interval = max(1, postfilter_epochs // checkpoints)

# Resume from the given experiment directory, or train from scratch (None).
config['pretrained_expdir'] = resume_loc if resume_from_checkpoint else None

config['stage0']['middle_frequency'] = middle_frequency

# F0 estimation settings.
config.update(f0_floor=f0_floor, f0_ceil=f0_ceil, correct_vuv=correct_vuv)

write_yaml(config, f'{enunu_base}/config.yaml')

acoustic_data['batch_size'] = acoustic_batch_size

acoustic_net = acoustic['netG']

# FFConvLSTM models take their sizes under different keys than the other
# models, so write one key set and remove the other.
plain_dims = {
    'hidden_dim': acoustic_hidden_dim,
    'num_layers': acoustic_num_layers,
}
ffconvlstm_dims = {
    'ff_hidden_dim': acoustic_hidden_dim,
    'conv_hidden_dim': acoustic_hidden_dim,
    'lstm_hidden_dim': acoustic_hidden_dim,
    'num_lstm_layers': acoustic_num_layers,
}
if acoustic_model.endswith('FFConvLSTM'):
    stale_dims, wanted_dims = plain_dims, ffconvlstm_dims
else:
    stale_dims, wanted_dims = ffconvlstm_dims, plain_dims

for key in stale_dims:
    acoustic_net.pop(key, None)
acoustic_net.update(wanted_dims)

# The models listed here end up without a dropout key; all others get the chosen value.
if acoustic_model in ['Conv1dResnet', 'Conv1dResnetSAR', 'MDN', 'ResF0Conv1dResnet']:
    acoustic_net.pop('dropout', None)
else:
    acoustic_net['dropout'] = acoustic_dropout

acoustic_train['nepochs'] = acoustic_epochs
acoustic_train['checkpoint_epoch_interval'] = acoustic_interval

# Duration and timelag take the same set of options, so apply them in one loop.
for data_conf, net_conf, train_conf, batch, hidden, layers, drop, epochs, interval in (
    (duration_data, duration, duration_train, duration_batch_size, duration_hidden_dim,
     duration_num_layers, duration_dropout, duration_epochs, duration_interval),
    (timelag_data, timelag, timelag_train, timelag_batch_size, timelag_hidden_dim,
     timelag_num_layers, timelag_dropout, timelag_epochs, timelag_interval),
):
    data_conf['batch_size'] = batch
    net_conf['netG'].update(hidden_dim=hidden, num_layers=layers, dropout=drop)
    train_conf.update(nepochs=epochs, checkpoint_epoch_interval=interval)

# Both postfilters (mgc and bap) use the same epoch count and checkpoint interval.
postfilter_data['batch_size'] = postfilter_batch_size
for train_conf in (postfilter_mgc_train, postfilter_bap_train):
    train_conf['nepochs'] = postfilter_epochs
    train_conf['checkpoint_epoch_interval'] = postfilter_interval

# Save every data/train/model config touched by this cell.
for document, destination in (
    (acoustic_data, acoustic_base + '/data/myconfig.yaml'),
    (acoustic_train, acoustic_base + '/train/myconfig.yaml'),
    (acoustic, acoustic_base + '/model/acoustic_custom.yaml'),
    (duration_data, duration_base + '/data/myconfig.yaml'),
    (duration_train, duration_base + '/train/myconfig.yaml'),
    (duration, duration_base + '/model/duration_custom.yaml'),
    (timelag_data, timelag_base + '/data/myconfig.yaml'),
    (timelag_train, timelag_base + '/train/myconfig.yaml'),
    (timelag, timelag_base + '/model/timelag_custom.yaml'),
    (postfilter_data, postfilter_base + '/data/myconfig.yaml'),
    (postfilter_mgc_train, postfilter_base + '/train/mgc.yaml'),
    (postfilter_bap_train, postfilter_base + '/train/bap.yaml'),
):
    write_yaml(document, destination)

# Training
Finally...

In [None]:
#@title # Tensorboard

#@markdown Run this if you wanna see funny graphs I guess. If it's not showing, here's what you do:

#@markdown ## Chromium
#@markdown - Enable third party cookies.

#@markdown ## Firefox
#@markdown - Disable Enhanced Tracking for Google Colab.

#@markdown **TIP:** You can set a reload interval if you click the settings at the top to get updates every 30 seconds or so
%load_ext tensorboard
import datetime
from tensorboard import notebook
!mkdir /content/enunu_training_kit/train/tensorboard/
%tensorboard --logdir /content/enunu_training_kit/train/tensorboard/


In [None]:
#@title # Step 6: Data Prep, Feature Extraction, Train main models

#@markdown YEAAYEAAYEAAAEEAAAAYYYEEAAAAAYYOOOOUUUUUUWOoooaahhh

#@markdown This only trains acoustic, duration and timelag

starting_stage = 0 #@param {type: "slider", min: 0, max: 5, step: 1}
stopping_stage = 5 #@param {type: "slider", min: 0, max: 5, step: 1}

#@markdown ---
#@markdown NO SPACES !!! OR JAPANESE CHARACTERS !!!
singer_name = 'Unnamed' #@param {type: "string"}

config['spk'] = singer_name
config['tag'] = 'CeVOX_Cantano_Al_Kit'
enuconfig['stats_dir'] = f'dump/{singer_name}/norm'
enuconfig['model_dir'] = f'exp/{singer_name}_CeVOX_Cantano_Al_Kit'

write_yaml(config, enunu_base + '/config.yaml')
write_yaml(enuconfig, enunu_base + '/enuconfig.yaml')

%cd "/content/enunu_training_kit/train"
if acoustic_model.startswith('Res'):
    print('ResF0 Mode')
    !bash run_resf0.sh --stage $starting_stage --stop_stage $stopping_stage
else:
    !bash run.sh --stage $starting_stage --stop_stage $stopping_stage

In [None]:
#@title # Step 7: Train postfilter

#@markdown YEAAYEAAYEAAAEEAAAAYYYEEAAAAAYYOOOOUUUUUUWOoooaahhh part 2

#@markdown For postfilter. Trains slowly !!!!!

starting_stage = 7 #@param {type: "slider", min: 7, max: 10, step: 1}
stopping_stage = 10 #@param {type: "slider", min: 7, max: 10, step: 1}

%cd "/content/enunu_training_kit/train"
if acoustic_model.startswith('Res'):
    print('ResF0 Mode')
    !bash run_resf0.sh --stage $starting_stage --stop_stage $stopping_stage
else:
    !bash run.sh --stage $starting_stage --stop_stage $stopping_stage

In [None]:
#@title # Step 8: Package Model
# Run stage 99 of the recipe, then zip the resulting release folder.
# NOTE(review): run.sh is used here for every model type, while Steps 6 and 7
# switch to run_resf0.sh for ResF0 models - confirm stage 99 of run.sh is the
# right one for those models too.
%cd "/content/enunu_training_kit/train"
!bash run.sh --stage 99 --stop_stage 99
%cd /content/
store_on_drive = True #@param {type: "boolean"}
clean_up_model = True #@param {type: "boolean"}
from datetime import datetime, timezone
import glob

# UTC timestamp for the archive name. It contains a space, which is why the
# archive path is quoted in the shell commands below.
time_now = datetime.now(timezone.utc).strftime('%Y-%m-%d %H-%M-%S')

archive_name = f'{singer_name}_{time_now}'

# Optional clean-up of the release copy: delete checkpoint*/epoch* files and
# strip init_type from every packaged model.yaml.
if clean_up_model:
    !rm -f /content/enunu_training_kit/train/release/{singer_name}_---/exp/{singer_name}_CeVOX_Cantano_Al_Kit/*/checkpoint*
    !rm -f /content/enunu_training_kit/train/release/{singer_name}_---/exp/{singer_name}_CeVOX_Cantano_Al_Kit/*/epoch*
    for model_loc in glob.glob(enunu_base + f'/release/{singer_name}_---/exp/{singer_name}_CeVOX_Cantano_Al_Kit/*/model.yaml'):
        model_yaml = load_yaml(model_loc)
        for k in ['init_type']:
            if k in model_yaml['netG'].keys():
                del model_yaml['netG'][k]
        write_yaml(model_yaml, model_loc)

# Zip the release folder from inside release/ so the archive holds a relative path.
%cd /content/enunu_training_kit/train/release
!zip -r "/content/{archive_name}.zip" ./{singer_name}_---

# Optionally move the archive to Google Drive. `os` is imported in an earlier
# cell (Step 2 / Step 3), and Drive must already be mounted.
if store_on_drive:
    if not os.path.exists('/content/drive/MyDrive/NNSVS_Release_Models'):
        !mkdir /content/drive/MyDrive/NNSVS_Release_Models
    
    !mv -v "/content/{archive_name}.zip" /content/drive/MyDrive/NNSVS_Release_Models

#clear_output()
print('Done!')