<a href="https://colab.research.google.com/github/StruckX/struck-notebooks/blob/main/DDSP_SVC_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DDSP-SVC Notebook
Made by Struck | [DDSP-SVC GitHub](https://github.com/yxlllc/DDSP-SVC)

# Setup

In [None]:
#@markdown ## Check Device
#@markdown Check if you're using a GPU.
# Shell out to NVIDIA's status tool to show which GPU (if any) this runtime has.
!nvidia-smi

In [None]:
#@markdown ## Mount Drive
#@markdown Run this if you want to save your files to Google Drive.
from google.colab import drive
# Mount point used by the default paths of later cells (/content/drive/MyDrive/...).
drive.mount('/content/drive')

In [None]:
#@markdown ## Install DDSP-SVC
#@markdown Clone the repo and install dependencies

import os
from IPython.display import clear_output

!git clone https://github.com/yxlllc/DDSP-SVC

%cd -q /content/DDSP-SVC

!apt install p7zip-rar
!pip install --upgrade -q pip setuptools numpy numba
!pip install -q pyworld praat-parselmouth torchcrepe einops local_attention wave fairseq transformers tensorboardX

!mkdir -p "pretrain/hubert" "pretrain/contentvec" &> /dev/null
!wget -nc -q --show-progress -O "pretrain/hubert/hubert-soft-0d54a1f4.pt" "https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt"
!wget -nc -q --show-progress -O "pretrain/contentvec/checkpoint_best_legacy_500.pt" "https://github.com/fishaudio/fish-diffusion/releases/download/v1.12/content-vec-best-legacy-500.pt"

clear_output()

print("Done! Restrating runtime...");

os.system(f"sleep 1; kill {os.getpid()}")

# Training

**Disclaimer:** Please make sure to only train DDSP-SVC models with legally obtained authorized data, and do not use these models and any audio they synthesize for illegal purposes. The author of this repository *(and notebook)* is not responsible for any infringement, fraud and other illegal acts caused by the use of these model checkpoints and audio. ([DDSP-SVC README](https://github.com/yxlllc/DDSP-SVC/blob/master/README.md))

In [None]:
#@markdown ## Extract Dataset
#@markdown Run only if you're making a new model or want to preprocess again.

#@markdown All archives that `7z` supports such as `7z`, `zip`, `rar`, etc. can be used.
#@markdown <details>
#@markdown <summary>Your files inside the archive must look <b>EXACTLY</b> like this (<a>click me!</a>)</summary>
#@markdown
#@markdown ![Dataset Structure](https://cdn.discordapp.com/attachments/1061982906895057009/1112176630597697576/simplified-dataset.png)<br>
#@markdown **Optional:** You can manually add your own validation audios in `data/val/audio`, it must look the same as above. It is recommended to have at least ≤5 audios per speaker.
#@markdown </details>

%cd -q /content/DDSP-SVC

archive_path = "/content/model-dataset.7z" #@param {type: "string"}

!rm -r ./data/train/audio

print("Extracting audios...")
!7z x -o"data/train/audio" '{archive_path}' -bso0 -bsp1

print("Done!")

In [None]:
#@markdown ## Extract Preprocessed Dataset
#@markdown Run this instead if you already preprocessed your dataset.

%cd -q /content/DDSP-SVC

archive_path = "/content/drive/MyDrive/preprocessed-dataset.7z" #@param {type: "string"}
config_path = "/content/drive/MyDrive/combsub-sins-etc.yaml" #@param {type: "string"}

!rm -r ./data/*

print("Extracting preprocessed dataset...")
!7z x '{archive_path}' -o"data" -bso0 -bsp1
!cp '{config_path}' configs/

print("Done!")

In [None]:

#@markdown ## Configurations

import yaml

#@markdown Model config:
model_name = "model-ai" #@param {type: "string"}

synthesizer = "combsub" #@param ["combsub","combsub-old", "sins"]

pitch_extractor = "crepe" #@param ["crepe", "parselmouth", "harvest", "dio"]

number_of_speaker = 1 #@param {type:"integer"}

encoder = "contentvec768l12" #@param ["hubertsoft", "contentvec", "contentvec768", "contentvec768l12"]

#@markdown Save checkpoints to a directory:
use_save_dir = False #@param {type: "boolean"}
save_dir_path = "exp/model-ai" #@param {type: "string"}

if encoder == "hubertsoft":
  encoder_ckpt = "pretrain/hubert/hubert-soft-0d54a1f4.pt"
  enc_out_ch = 256
else:
  encoder_ckpt = "pretrain/contentvec/checkpoint_best_legacy_500.pt"
  enc_out_ch = 256 if encoder == "contentvec" else 768

with open(f"configs/{synthesizer}.yaml", "r+") as f:
  y = yaml.safe_load(f)
  y['data']['f0_extractor'] = pitch_extractor
  y['data']['encoder'] = encoder
  y['data']['encoder_out_channels'] = enc_out_ch
  y['data']['encoder_ckpt'] = encoder_ckpt
  y['model']['n_spk'] = number_of_speaker if number_of_speaker > 0 else 1
  y['env']['expdir'] = save_dir_path if use_save_dir else f"exp/{model_name}"
  f.seek(0)
  f.write(yaml.dump(y, default_flow_style=False, sort_keys=False))
  f.truncate()

print("Done!")

In [None]:
#@markdown ## Start Preprocess
#@markdown Skip if you already have a preprocessed data.

import os

save_to_drive = True #@param {type: "boolean"}

if os.path.exists("data/val/audio"):
  if not any(files.endswith('.wav') for files in os.listdir("data/val/audio")):
    !python draw.py
else:
  !python draw.py

!python preprocess.py -c configs/{synthesizer}.yaml

if save_to_drive:
  model_drive_path = f'/content/drive/MyDrive/DDSP-SVC/data/{model_name}'
  !mkdir -p '{model_drive_path}'
  !7z a -mx=1 '{model_drive_path}/preprocessed_dataset.7z' ./data/*
  !cp 'configs/{synthesizer}.yaml' '{model_drive_path}'
  print(f"Done! Files saved at {model_drive_path}")
else:
  !7z a -mx=1 '/content/{model_name}_data/preprocessed_dataset.7z' ./data/*
  !cp 'configs/{synthesizer}.yaml' '/content/{model_name}_data'
  print(f"Done! Files saved at /content/{model_name}_data")

In [None]:
#@markdown ## Start Tensorboard
#@markdown Visualize your model's progress. Note that predicted audios aren't enhanced, only in inference.

import IPython

%reload_ext tensorboard

if use_save_dir:
  %tensorboard --logdir '{save_dir_path}'
else:
  %tensorboard --logdir 'exp/{model_name}'

display(IPython.display.HTML('''
<button onclick="window.open(document.querySelector('iframe').src, '__blank')">Open in New Tab</button> 
<button onclick="document.querySelector('iframe').style.display = 'none'">Hide TensorBoard</button>
<button onclick="document.querySelector('iframe').style.display = 'block'">Show TensorBoard</button> 
'''))

In [None]:
#@markdown ## Start Training
#@markdown Train! Train! Train!

!python train.py -c configs/{synthesizer}.yaml

# Inference

In [None]:
#@markdown ## Voice Conversion
#@markdown Main Settings:

%cd -q /content/DDSP-SVC

input_audio = "/content/drive/input.wav" #@param {type: "string"}

model_pt = "/content/drive/model_0.pt" #@param {type: "string"}

output_audio = "/content/output.flac" #@param {type: "string"}

keychange = 0 #@param {type:"integer"}

pitch_extractor = "crepe" #@param ["crepe", "parselmouth", "harvest", "dio"]

#@markdown Speaker Settings:

speaker_id = 1 #@param {type:"integer"}

mix_speaker = False #@param {type: "boolean"}

mix_ratio = "1:0.50, 2:0.50" #@param {type: "string"}

#@markdown Enhancer Settings:

use_enhancer = True #@param {type: "boolean"}

enhancer = "OpenVPI" #@param ["OpenVPI", "fishaudio"]

adaptive_key = 0 #@param {type:"integer"}

if not 'installed_enhancer' in globals():
  installed_enhancer = "None"

if use_enhancer and not installed_enhancer == enhancer:
  !rm -r pretrain/nsf_hifigan
  if enhancer == "OpenVPI":
    !wget -q --show-progress -O /content/nsf_hifigan.zip "https://github.com/openvpi/vocoders/releases/download/nsf-hifigan-v1/nsf_hifigan_20221211.zip"
  else:
    !wget -q --show-progress -O /content/nsf_hifigan.zip "https://github.com/fishaudio/fish-diffusion/releases/download/v2.0.0/nsf_hifigan-stable-v1.zip"
  !7z x /content/nsf_hifigan.zip -o"pretrain" -bso0 -bsp1
  !rm /content/nsf_hifigan.zip
  installed_enhancer = enhancer

infer_flags = f'''\
-i "{input_audio}" \
-m "{model_pt}" \
-o "{output_audio}" \
-k {keychange} \
-pe {pitch_extractor} \
{f"-mix '{{{mix_ratio}}}'" if mix_speaker else f"-id {speaker_id}"} \
{f"-eak {adaptive_key}" if use_enhancer else "-e false"}
'''

!python main.py {infer_flags}

In [None]:
#@markdown ## Download Output

from IPython.display import display, Audio

preview_only = True #@param {type:"boolean"}

# `output_audio` is the path set in the Voice Conversion cell, which must be run first.
if preview_only:
  # Read inside a `with` block so the file handle is closed as soon as the
  # bytes are loaded, instead of being left open.
  with open(output_audio, "rb") as audio_file:
    audio = audio_file.read()
  display(Audio(audio))
else:
  # Colab-only helper that triggers a browser download of the file.
  from google.colab import files
  files.download(output_audio)