# CeVOX DiffSinger Training Notebook

![Ah shit, here we go again.](https://cdn.discordapp.com/attachments/455377490399985676/1141634804337606676/hq720.png)

# Setup Drive and stuff

In [None]:
#@title Install DiffSinger

# Clones the DiffSinger repository, installs its Python requirements and
# downloads three pretrained helper archives into `checkpoints`.
# `clear_output()` wipes the cell output before each stage, so only the output
# of the stage that is currently running stays visible.

from IPython.display import clear_output

clear_output()
print('Cloning DiffSinger')
!git clone https://github.com/openvpi/DiffSinger
# Later cells use paths relative to this directory (e.g. `configs/`, `data`, `scripts/`)
%cd DiffSinger

clear_output()
print('Installing requirements')
%pip install -r requirements.txt
# onnxruntime is pinned to one exact version
%pip install onnxruntime==1.14.0

# Each download below follows the same pattern: fetch the zip, extract it into
# `checkpoints`, then delete the zip.
clear_output()
print('Downloading OpenVPI NSF-HiFiGAN')
!wget https://github.com/openvpi/vocoders/releases/download/nsf-hifigan-44.1k-hop512-128bin-2024.02/nsf_hifigan_44.1k_hop512_128bin_2024.02.zip
!unzip nsf_hifigan_44.1k_hop512_128bin_2024.02.zip -d checkpoints
!rm nsf_hifigan_44.1k_hop512_128bin_2024.02.zip

clear_output()
print('Downloading OpenVPI RMVPE')
!wget https://github.com/yxlllc/RMVPE/releases/download/230917/rmvpe.zip
!unzip rmvpe.zip -d checkpoints
!rm rmvpe.zip

clear_output()
print('Downloading OpenVPI Harmonic Separator')
!wget https://github.com/yxlllc/vocal-remover/releases/download/hnsep_240512/hnsep_240512.zip
!unzip hnsep_240512.zip -d checkpoints
!rm hnsep_240512.zip

clear_output()
print('Done')

In [None]:
#@title Mount Google Drive

#@markdown This still makes things easy so just do it.
#@markdown ## Note: Because of weird path processing reasons, the Google Drive folder will be mounted in /content/DiffSinger/drive instead of the usual /content/drive

from google.colab import drive
# Unmount any existing Drive mount and delete the old mount-point directory
# before mounting again (presumably so that re-running this cell starts clean).
drive.flush_and_unmount()
!rm -rf /content/DiffSinger/drive
drive.mount('/content/DiffSinger/drive')
print('Done!')

# Raw Data Preparation

In [None]:
#@title Decompress dataset

#@markdown Make sure it's an actual DiffSinger one. I am not adding automated DB making here sowwy... You need to refine that as much as you can

#@markdown Only accepting `.7z` or whatever `p7zip` reads sowwy part two...

#@markdown Make sure each speaker is in a folder... basically

#@markdown ```
#@markdown Archive
#@markdown +---speaker1
#@markdown |   | transcriptions.csv
#@markdown |   \---wavs
#@markdown |       <wav-files/>
#@markdown +---speaker2
#@markdown |   | transcriptions.csv
#@markdown |   \---wavs
#@markdown |       <wav-files/>
#@markdown ```

#@markdown it'll just assume that it's multispeaker when there's multiple folders of course.

#@markdown oh also if you have a dictionary just upload it directly to `DiffSinger/dictionaries` thx.

import glob

dataset_loc = '/content/DiffSinger/drive/MyDrive/dataset.7z' #@param {type: "string"}

!7za x "$dataset_loc" -o"data"

folders = glob.glob('data/*')
folders.sort()

In [None]:
#@title Decompress binaries (Skip if you haven't done binarization)

#@markdown This is to skip binarization hopefully.

#@markdown If you made binaries outside of this notebook, make sure they're in folders like `acoustic_bin` and `variance_bin` because that's how I formatted them.

binary_loc = '/content/DiffSinger/drive/MyDrive/binaries.7z' #@param {type: "string"}

# Extracted into `data`, the same directory the acoustic/variance settings cells
# point `binary_data_dir` at (`data/acoustic_bin`, `data/variance_bin`).
!7za x "$binary_loc" -o"data"

# Data Processing Settings

You might still need to run these even if you have binaries.

In [None]:
#@title General Settings

# Form values shared by the acoustic and variance configuration cells below.

#@markdown ### Dataset Stuff
#@markdown `exp_name` is just for the model folder names now. Speaker names are taken from the folders in the archive.
exp_name = 'model' #@param {type: "string"}
num_test_samples = 3 #@param {type: "integer"}
dictionary = 'dictionary.txt' #@param {type: "string"}

#@markdown ---

#@markdown ### Variance stuff

#@markdown These options need to be present on both models so they're right here.

# Written to the acoustic config as `use_*_embed` and to the variance config as `predict_*`
use_energy = False #@param {type: "boolean"}
use_breathiness = False #@param {type: "boolean"}
use_tension = False #@param {type: "boolean"}
use_voicing = False #@param {type: "boolean"}

import yaml
import os
import copy

def load_yaml(location):
    """Parse the YAML file at ``location`` and return the resulting object.

    The file is opened explicitly as UTF-8 so it round-trips with
    ``write_yaml``, which writes UTF-8 with ``allow_unicode=True``; relying on
    the platform default encoding could fail on non-ASCII content.

    Args:
        location: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file content.
    """
    with open(location, encoding='utf8') as f:
        return yaml.safe_load(f)

def write_yaml(data, location):
    """Serialise ``data`` as block-style YAML into the file at ``location``.

    The file is overwritten and written as UTF-8; non-ASCII characters are kept
    verbatim and mapping keys are not re-sorted.
    """
    dump_options = {
        'default_flow_style': False,
        'allow_unicode': True,
        'sort_keys': False,
    }
    with open(location, 'w', encoding='utf8') as out_file:
        yaml.dump(data, out_file, **dump_options)

def represent_none(self, _):
    """YAML representer callback that writes ``None`` as an empty null scalar.

    ``self`` is the object providing ``represent_scalar``; the value being
    represented (second argument) is ignored.
    """
    null_tag = 'tag:yaml.org,2002:null'
    empty_text = ''
    return self.represent_scalar(null_tag, empty_text)

def get_test_prefixes(folder, n=5, id=None):
    """Pick the last ``n`` recordings of a speaker folder as test items.

    Args:
        folder: Speaker directory that contains a ``wavs`` sub-directory.
        n: How many items to pick. Zero or a negative value picks nothing.
        id: Optional speaker index; when given, every name is returned as
            ``"<id>:<name>"``.

    Returns:
        The file names (no directory, no ``.wav`` extension) of the last ``n``
        wav files in sorted path order, optionally prefixed with ``id``.
    """
    if n <= 0:
        # `wavs[-0:]` is the whole list, so "no test samples" needs its own branch
        return []

    wavs = sorted(glob.glob(os.path.join(folder, 'wavs/*.wav')))
    names = [os.path.splitext(os.path.basename(path))[0] for path in wavs[-n:]]
    if id is None:
        return names
    return [f'{id}:{name}' for name in names]

# Load the repository's base config; it is modified in memory below and written
# back to the same file at the end of this section.
base = load_yaml('configs/base.yaml')

#@markdown ---

#@markdown ### Batching stuff (for processing)

#@markdown This is the best Colab can do. They are not known for having the best CPUs...
num_workers = 2 #@param {type: "slider", min: 0, max: 2, step: 1}

# `num_workers` is reused later for `binarization_args.num_workers` in both model configs
base['ds_workers'] = num_workers

#@markdown ---

#@markdown ### Pitch Estimator stuff

#@markdown `parselmouth` is fast, `rmvpe` is fast but may have range issues, `harvest` is slow but very accurate

pitch_estimator = 'harvest' #@param ["parselmouth", "rmvpe", "harvest"]

#@markdown Change this if you're using `harvest`. `rmvpe` doesn't read this.

f0_min = 65 #@param {type: "number"}
f0_max = 1100 #@param {type: "number"}

base['pe'] = pitch_estimator
base['f0_min'] = f0_min
base['f0_max'] = f0_max

# Only the rmvpe estimator gets a checkpoint path; the other choices leave `pe_ckpt` untouched
if pitch_estimator == 'rmvpe':
    base['pe_ckpt'] = 'checkpoints/rmvpe/model.pt'

#@markdown ---

#@markdown ### Harmonic Separator stuff

#@markdown Harmonic separation is done for tension and voicing parameters. You can ignore this freely if you don't plan to have tension/voicing.

#@markdown `vr` uses the new AI-based harmonic separator, `world` uses the old WORLD based harmonic separator.

hnsep = 'vr' #@param ['vr', 'world']

base['hnsep'] = hnsep

#@markdown ---

#@markdown ### Training Precision stuff

#@markdown T4s and V100s don't support `bf16-mixed`

precision = '16-mixed' #@param ["32-true", "64-true", "16-mixed", "bf16-mixed"]

base['pl_trainer_precision'] = precision

# Overwrites the base config in place with the values chosen above
write_yaml(base, 'configs/base.yaml')

# A previous single-speaker run of this cell leaves `folders` as one path
# string (see the end of this block). Turn it back into a list so the cell can
# be re-run without treating the string's characters as speakers.
if isinstance(folders, str):
    folders = [folders]
if not folders:
    raise RuntimeError("No speaker folders found in 'data'. Run the 'Decompress dataset' cell first.")

num_spk = len(folders)
use_spk_id = num_spk > 1
# Speaker names are the names of the extracted dataset folders
speakers = [os.path.basename(folder) for folder in folders]

test_prefixes = []
if use_spk_id:
    # Multi-speaker: every test item is prefixed with the index of its speaker folder
    for spk_index, folder in enumerate(folders):
        test_prefixes.extend(get_test_prefixes(folder, n=num_test_samples, id=spk_index))
else:
    # Single speaker: the config cells below store `folders` as `raw_data_dir`,
    # which in this case is one plain path string instead of a list.
    folders = folders[0]
    test_prefixes = get_test_prefixes(folders, n=num_test_samples)


In [None]:
#@title Acoustic Settings

#@markdown I won't be putting much (a lie) I think you should just directly edit the YAML (still true despite that).

# Reads `configs/acoustic.yaml`, overrides it with the form values of this cell
# and of the General Settings cell, and writes it back to the same file.
# Requires the General Settings cell to have run (uses `folders`, `speakers`,
# `test_prefixes`, `num_workers`, `dictionary`, the `use_*` flags and the YAML helpers).

# From here on, `None` values are dumped as empty scalars by `yaml.dump`
yaml.add_representer(type(None), represent_none)

acoustic = load_yaml('configs/acoustic.yaml')

acoustic['raw_data_dir'] = folders
acoustic['binary_data_dir'] = 'data/acoustic_bin'

acoustic['speakers'] = speakers
acoustic['test_prefixes'] = test_prefixes
acoustic['use_spk_id'] = use_spk_id
acoustic['num_spk'] = num_spk
acoustic['binarization_args']['num_workers'] = num_workers
acoustic['dictionary'] = 'dictionaries/' + dictionary
acoustic['use_energy_embed'] = use_energy
acoustic['use_breathiness_embed'] = use_breathiness
acoustic['use_tension_embed'] = use_tension
acoustic['use_voicing_embed'] = use_voicing

#@markdown ---

#@markdown ### Augmentation Stuff
#@markdown I honestly still don't fully know what this is for (other than data augmentation and the silly embeds it can give you).

#@markdown The scale is for what percentage of the dataset is used for augmentation, so the default is 100% aka the whole dataset.
#@markdown

#@markdown #### **Pitch Shifting Augmentation**
#@markdown Adds more pitch or smn. The default scale here is for `random`, the default scale for `fixed` is `0.75`. `random` enables you to have a gender parameter working. `fixed` apparently makes speakers so it might not be ideal for multispeaker.
pitch_augmentation = 'none' #@param ["none", "fixed", "random"]
shift_scale = 1.0 #@param {type: "slider", min: 0, max: 1, step: 0.01}

#@markdown

#@markdown #### **Time Stretching Augmentation**
#@markdown It... time stretches or smn man idk. Idek what a continuous velocity parameter would be like.
time_stretching = False #@param {type: "boolean"}
stretch_scale = 1.0 #@param {type: "slider", min: 0, max: 1, step: 0.01}

# At most one pitch-shifting mode is enabled; `use_key_shift_embed` is switched
# on only for `random`. `shift_scale` is written only to the mode that is enabled.
if pitch_augmentation == 'fixed':
    acoustic['augmentation_args']['fixed_pitch_shifting']['enabled'] = True
    acoustic['augmentation_args']['random_pitch_shifting']['enabled'] = False
    acoustic['augmentation_args']['fixed_pitch_shifting']['scale'] = shift_scale
    acoustic['use_key_shift_embed'] = False
elif pitch_augmentation == 'random':
    acoustic['augmentation_args']['fixed_pitch_shifting']['enabled'] = False
    acoustic['augmentation_args']['random_pitch_shifting']['enabled'] = True
    acoustic['augmentation_args']['random_pitch_shifting']['scale'] = shift_scale
    acoustic['use_key_shift_embed'] = True
elif pitch_augmentation == 'none':
    acoustic['augmentation_args']['fixed_pitch_shifting']['enabled'] = False
    acoustic['augmentation_args']['random_pitch_shifting']['enabled'] = False
    acoustic['use_key_shift_embed'] = False

# Time stretching and the speed embedding are toggled together
acoustic['augmentation_args']['random_time_stretching']['enabled'] = time_stretching
acoustic['augmentation_args']['random_time_stretching']['scale'] = stretch_scale
acoustic['use_speed_embed'] = time_stretching

#@markdown ---

#@markdown ### Diffusion Stuff
#@markdown Diffusion Type is how the diffusion process is done really. Switching it to `reflow` by default instead of `ddpm`

acoustic_diff_type = 'reflow' #@param ["ddpm", "reflow"]
acoustic['diffusion_type'] = acoustic_diff_type

#@markdown Shallow diffusion is basically a hybrid solution of rendering acoustic. It's faster and better quality (says in the paper).

use_shallow_diffusion = True #@param {type : "boolean"}
acoustic['use_shallow_diffusion'] = use_shallow_diffusion

#@markdown `k_step` is the number of diffusion steps that the diffusion side will take. 200 is enough but 400 is kept cuz it's the default

#@markdown `reflow` does not use `k_step` but I'll be converting it to its equivalent `t_start`, which is why this is a slider now.
k_step = 400 #@param {type: "slider", min: 0, max: 1000, step: 1}
k_step = int(k_step)
# Both the step-count keys and the fractional keys are always written,
# regardless of the chosen diffusion type.
acoustic['K_step'] = k_step
acoustic['K_step_infer'] = k_step
# NOTE(review): this maps K_step to T_start as k_step / 1000. Confirm against
# the DiffSinger configuration docs that this is the intended equivalence and
# not 1 - k_step / 1000.
acoustic['T_start'] = k_step / 1000
acoustic['T_start_infer'] = k_step / 1000

#@markdown ---

#@markdown ### Batching Stuff (for training)

max_batch_size = 12 #@param {type: "integer"}

#@markdown This simulates a larger batch size while having lower memory cost, but makes training "slower" since not every batch makes a training step.

#@markdown You can put 4 to simulate the original 48 max batch size...
accumulate_grad_batches = 1 #@param {type: "integer"}

acoustic['max_batch_size'] = max_batch_size
acoustic['accumulate_grad_batches'] = accumulate_grad_batches

#@markdown ---

#@markdown ### Learning Rate Stuff

#@markdown If your batch size is lower, make your `step_size` higher to accommodate for the slower learning time. Maybe `gamma` too.
#@markdown `reflow` is faster at learning than `ddpm`

learning_rate = 0.0004 #@param {type: "number"}
step_size = 50000 #@param {type: "integer"}
gamma = 0.5 #@param {type: "slider", min: 0, max: 1, step: 0.01}

acoustic['optimizer_args']['lr'] = learning_rate
acoustic['lr_scheduler_args']['step_size'] = step_size
acoustic['lr_scheduler_args']['gamma'] = gamma

#@markdown ---

#@markdown ### Loss Stuff

#@markdown `reflow` seems to be better with L1 loss.

acoustic_loss_type = 'l2' #@param ["l1", "l2"]
acoustic['main_loss_type'] = acoustic_loss_type

# Overwrites the acoustic config in place
write_yaml(acoustic, 'configs/acoustic.yaml')

In [None]:
#@title Variance Settings

#@markdown Same deal as acoustic

#@markdown ---

# Reads `configs/variance.yaml`, overrides it with the form values of this cell
# and of the General Settings cell, and writes it back to the same file.
# Requires the General Settings cell to have run (uses `folders`, `speakers`,
# `test_prefixes`, `num_workers`, `dictionary`, the `use_*` flags and the YAML helpers).
variance = load_yaml('configs/variance.yaml')

variance['raw_data_dir'] = folders
variance['binary_data_dir'] = 'data/variance_bin'

variance['speakers'] = speakers
variance['test_prefixes'] = test_prefixes
variance['use_spk_id'] = use_spk_id
variance['num_spk'] = num_spk
variance['dictionary'] = 'dictionaries/' + dictionary
variance['binarization_args']['num_workers'] = num_workers

#@markdown **Tip:** Duration trains better with pitch/energy/breathiness, but pitch/energy/breathiness trains better without duration.

predict_duration = True #@param {type: "boolean"}

#@markdown ---

#@markdown ### Diffusion stuff

#@markdown Same deal with acoustic.

variance_diff_type = 'reflow' #@param ["ddpm", "reflow"]
variance['diffusion_type'] = variance_diff_type

#@markdown ---

#@markdown ### Pitch Generation stuff

#@markdown Pitch prediction is kinda okay if you train it with L1 loss.

predict_pitch = False #@param {type: "boolean"}

#@markdown These are new things for pitch modelling.
#@markdown If I understand correctly, melody encoder is mostly really just distinguishing rests from notes,
#@markdown and glide embeds are a new thing that even SlurCutter doesn't even support adding it yet.

#@markdown Only melody encoder is supported now I think? Not sure with glide embeds.

use_melody_encoder = False #@param {type: "boolean"}
use_glide_embed = False #@param {type: "boolean"}

# The energy/breathiness/tension/voicing switches come from the General Settings
# cell, where the same flags also drive the acoustic `use_*_embed` options.
variance['predict_dur'] = predict_duration
variance['predict_pitch'] = predict_pitch
variance['predict_energy'] = use_energy
variance['predict_breathiness'] = use_breathiness
variance['predict_tension'] = use_tension
variance['predict_voicing'] = use_voicing
variance['use_melody_encoder'] = use_melody_encoder
variance['use_glide_embed'] = use_glide_embed

#@markdown ---

#@markdown ### Batching Stuff (for training)

#@markdown Lower this if you're training pitch/energy/breathiness. Something like 16 or 20.
# These names are also assigned by the Acoustic Settings cell; the values set
# here replace them in the notebook namespace.
max_batch_size = 48 #@param {type: "integer"}

accumulate_grad_batches = 1 #@param {type: "integer"}

variance['max_batch_size'] = max_batch_size
variance['accumulate_grad_batches'] = accumulate_grad_batches

#@markdown ---

#@markdown ### Learning Rate Stuff

#@markdown You can leave this alone but **I highly recommend putting higher learning rates and higher step sizes when training with pitch/energy/breathiness.**
#@markdown Something like `lr = 0.001, step_size = 50000` works. Diffusion just has a rough start.

#@markdown If you also increase step size you should decrease gamma too. Maybe to like... `0.6`.

learning_rate = 0.0006 #@param {type: "number"}
step_size = 12000 #@param {type: "integer"}
gamma = 0.75 #@param {type: "slider", min: 0, max: 1, step: 0.01}

variance['optimizer_args']['lr'] = learning_rate
variance['lr_scheduler_args']['step_size'] = step_size
variance['lr_scheduler_args']['gamma'] = gamma

#@markdown ---

#@markdown ### Loss Stuff

#@markdown You can leave this alone. MSE = L2 btw.

duration_loss_type = 'mse' #@param ["mse", "huber"]
#@markdown I think this is only used when using Pitch Diffusion and Energy/Breathiness prediction. It likes L1 a lot.
variance_loss_type = 'l2' #@param ["l1", "l2"]

variance['main_loss_type'] = variance_loss_type
variance['dur_prediction_args']['loss_type'] = duration_loss_type

# Overwrites the variance config in place
write_yaml(variance, 'configs/variance.yaml')

# Actual Data Processing

In [None]:
#@title Binarize Variance
# Runs the repository's binarization script with the variance config written by the Variance Settings cell
!python scripts/binarize.py --config configs/variance.yaml

In [None]:
#@title Binarize Acoustic
# Runs the repository's binarization script with the acoustic config written by the Acoustic Settings cell
!python scripts/binarize.py --config configs/acoustic.yaml

In [None]:
#@title Compress binaries

#@markdown This saves a `binaries.7z` in your root folder in Google Drive a.k.a it'll just be outside everything
# Archive from inside `data` so the archive holds `acoustic_bin` and
# `variance_bin` at its top level, then return to the previous directory.
%cd data
!7za a binaries.7z acoustic_bin variance_bin
!mv binaries.7z /content/DiffSinger/drive/MyDrive/
%cd ..

# Training

## Note: I highly recommend saving to Drive. You don't really wanna lose the models.

In [None]:
#@title Launch Tensorboard

#@markdown I hope Tensorboard doesn't have the same issues as it did back then but just in case... Do this if it doesn't load.

#@markdown ## Chromium
#@markdown - Enable third party cookies.

#@markdown ## Firefox
#@markdown - Disable Enhanced Tracking for Google Colab.

#@markdown **TIP:** You can set a reload interval if you click the settings at the top to get updates every 30 seconds or so

checkpoints_in_drive = True #@param {type: "boolean"}

%load_ext tensorboard
# Point TensorBoard at the Drive checkpoint folder used by the training cells
# when they save to Drive, otherwise at the local `checkpoints` folder.
if checkpoints_in_drive:
    %tensorboard --logdir /content/DiffSinger/drive/MyDrive/DiffSinger_Checkpoints
else:
    %tensorboard --logdir /content/DiffSinger/checkpoints

In [None]:
#@title Train Variance
import os
import yaml

# Folder on the mounted Google Drive; this cell trains into `<DRIVE_FOLDER>/<exp_name>_variance`
DRIVE_FOLDER = '/content/DiffSinger/drive/MyDrive/DiffSinger_Checkpoints'
save_to_drive = True #@param {type: "boolean"}

def load_yaml(location):
    """Parse the YAML file at ``location`` and return the resulting object.

    The file is opened explicitly as UTF-8 so it round-trips with
    ``write_yaml``, which writes UTF-8 with ``allow_unicode=True``; relying on
    the platform default encoding could fail on non-ASCII content.

    Args:
        location: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file content.
    """
    with open(location, encoding='utf8') as f:
        return yaml.safe_load(f)

def write_yaml(data, location):
    """Serialise ``data`` as block-style YAML into the file at ``location``.

    The file is overwritten and written as UTF-8; non-ASCII characters are kept
    verbatim and mapping keys are not re-sorted.
    """
    dump_options = {
        'default_flow_style': False,
        'allow_unicode': True,
        'sort_keys': False,
    }
    with open(location, 'w', encoding='utf8') as out_file:
        yaml.dump(data, out_file, **dump_options)

def represent_none(self, _):
    """YAML representer callback that writes ``None`` as an empty null scalar.

    ``self`` is the object providing ``represent_scalar``; the value being
    represented (second argument) is ignored.
    """
    null_tag = 'tag:yaml.org,2002:null'
    empty_text = ''
    return self.represent_scalar(null_tag, empty_text)

def resolve_config(config):
    """Load ``config`` and flatten its ``base_config`` inheritance into one dict.

    Keys set in ``config`` itself win over inherited ones; among several base
    configs the first one that provides a key wins. Only top-level keys are
    merged (nested mappings are taken whole), and ``base_config`` is removed
    from the result.

    Args:
        config: Path of the YAML file to resolve.

    Returns:
        The resolved configuration dictionary.
    """
    res = load_yaml(config)
    if 'base_config' in res:
        bases = res.pop('base_config') or []
        # A single base config may be written as a plain string instead of a
        # list; iterating that string would yield single characters.
        if isinstance(bases, str):
            bases = [bases]
        for base_path in bases:
            for key, value in resolve_config(base_path).items():
                res.setdefault(key, value)
    return res

if save_to_drive:
    if not os.path.exists(DRIVE_FOLDER):
        os.makedirs(DRIVE_FOLDER)

    final_config_path = os.path.join(DRIVE_FOLDER, f'{exp_name}_variance/config.yaml')
    if not os.path.exists(final_config_path):
        dir = os.path.dirname(final_config_path)
        os.makedirs(dir)
        final_config = resolve_config('configs/variance.yaml')
        write_yaml(final_config, final_config_path)
    !python scripts/train.py --config configs/variance.yaml --reset --hparams work_dir={DRIVE_FOLDER}/{exp_name}_variance
else:
    !python scripts/train.py --config configs/variance.yaml --exp_name {exp_name}_variance --reset

In [None]:
#@title Train Acoustic
import os

# Folder on the mounted Google Drive; this cell trains into `<DRIVE_FOLDER>/<exp_name>_acoustic`
DRIVE_FOLDER = '/content/DiffSinger/drive/MyDrive/DiffSinger_Checkpoints'
save_to_drive = True #@param {type: "boolean"}

def load_yaml(location):
    """Parse the YAML file at ``location`` and return the resulting object.

    The file is opened explicitly as UTF-8 so it round-trips with
    ``write_yaml``, which writes UTF-8 with ``allow_unicode=True``; relying on
    the platform default encoding could fail on non-ASCII content.

    Args:
        location: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file content.
    """
    with open(location, encoding='utf8') as f:
        return yaml.safe_load(f)

def write_yaml(data, location):
    """Serialise ``data`` as block-style YAML into the file at ``location``.

    The file is overwritten and written as UTF-8; non-ASCII characters are kept
    verbatim and mapping keys are not re-sorted.
    """
    dump_options = {
        'default_flow_style': False,
        'allow_unicode': True,
        'sort_keys': False,
    }
    with open(location, 'w', encoding='utf8') as out_file:
        yaml.dump(data, out_file, **dump_options)

def represent_none(self, _):
    """YAML representer callback that writes ``None`` as an empty null scalar.

    ``self`` is the object providing ``represent_scalar``; the value being
    represented (second argument) is ignored.
    """
    null_tag = 'tag:yaml.org,2002:null'
    empty_text = ''
    return self.represent_scalar(null_tag, empty_text)

def resolve_config(config):
    """Load ``config`` and flatten its ``base_config`` inheritance into one dict.

    Keys set in ``config`` itself win over inherited ones; among several base
    configs the first one that provides a key wins. Only top-level keys are
    merged (nested mappings are taken whole), and ``base_config`` is removed
    from the result.

    Args:
        config: Path of the YAML file to resolve.

    Returns:
        The resolved configuration dictionary.
    """
    res = load_yaml(config)
    if 'base_config' in res:
        bases = res.pop('base_config') or []
        # A single base config may be written as a plain string instead of a
        # list; iterating that string would yield single characters.
        if isinstance(bases, str):
            bases = [bases]
        for base_path in bases:
            for key, value in resolve_config(base_path).items():
                res.setdefault(key, value)
    return res

if save_to_drive:
    if not os.path.exists(DRIVE_FOLDER):
        os.makedirs(DRIVE_FOLDER)

    final_config_path = os.path.join(DRIVE_FOLDER, f'{exp_name}_acoustic/config.yaml')
    if not os.path.exists(final_config_path):
        dir = os.path.dirname(final_config_path)
        os.makedirs(dir)
        final_config = resolve_config('configs/acoustic.yaml')
        write_yaml(final_config, final_config_path)
    !python scripts/train.py --config configs/acoustic.yaml --reset --hparams work_dir={DRIVE_FOLDER}/{exp_name}_acoustic
else:
    !python scripts/train.py --config configs/acoustic.yaml --exp_name {exp_name}_acoustic --reset

# Export to ONNX

You will have to package the exported models for OpenUtau yourself, sowwy... Just use [this](https://github.com/xunmengshe/OpenUtau/wiki/Voicebank-Development) as a guide. Also, the models need to be in `DiffSinger/checkpoints` if you're exporting.

## NOTE: If you know how to run DiffSinger locally, please just use local DiffSinger to export. ONNX export works ONLY with PyTorch 1.13.

In [None]:
#@title Install PyTorch 1.13

#@markdown If you still wanna export ONNX here in Colab...

# Replaces the installed torch packages with pinned CUDA 11.7 builds from the PyTorch wheel index
%pip install -U torch==1.13.0+cu117 torchvision==0.14.0+cu117 torchaudio==0.13.0 --extra-index-url https://download.pytorch.org/whl/cu117

In [None]:
#@title Move Drive checkpoints
!cp -dpR /content/DiffSinger/drive/MyDrive/DiffSinger_Checkpoints /content/DiffSinger/checkpoints

In [None]:
#@title Exporter
model_type = 'acoustic' #@param ["acoustic", "variance"]
# The experiment name passed to the export script is `<speaker_name>_<model_type>`.
# The training cells of this notebook name their experiments `<exp_name>_acoustic`
# and `<exp_name>_variance`, so `speaker_name` presumably has to be set to the
# same value as `exp_name` (whose default is 'model', not 'speaker1').
speaker_name = 'speaker1' #@param {type: "string"}
!python scripts/export.py {model_type} --exp {speaker_name}_{model_type}