[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/SociallyIneptWeeb/AniVoiceChanger/blob/main/AniVoiceChanger_colab.ipynb)

In [None]:
#@title Mount Drive

# Mount Google Drive at /content/drive. Later cells read the dataset zip and
# previously trained model files from /content/drive/MyDrive and write backups there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#@title Clone repository
# Turn the current working directory into a git checkout of the RVC WebUI at one
# pinned commit. Only that commit is fetched (--depth=1) and the working tree is
# hard-reset to it.
!git init
!git remote add origin https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI.git
!git fetch origin 195a14e5c51ec02774c4d1961d0a4b67755e25c8 --depth=1
!git reset --hard FETCH_HEAD

In [None]:
#@title Set Mode and Parameters
#@markdown ## Mode
#@markdown To run the WebUI for training a voice model, set the mode to Training.

#@markdown To run the voice changer server for main_colab.py to connect to, set the mode to Inference.
MODE = 'Training' #@param ['Training', 'Inference']

#@markdown If MODE: Training, specify the path to a zip file containing the voice clips in your google drive to be used for training. If MODE: Inference, ignore.
DATASET = 'char_voice_lines.zip' #@param {type:"string"}
DATASET = '/content/drive/MyDrive/' + DATASET

if MODE == 'Training':
  !mkdir -p dataset
  !unzip -d dataset -B {DATASET}
  # rename duplicate filenames in dataset
  !ls -a /content/dataset/
  !rename 's/(\w+)\.(\w+)~(\d*)/$1_$3.$2/' /content/dataset/*.*~*

#@markdown ## Upload a trained model
#@markdown Only fill the below fields if you would like to continue training a previously trained model or use it for inference. Specify the name and epoch number of the model to be used or trained. The folder containing the trained files in your google drive will be used.

MODELNAME = ""  #@param {type:"string"}
MODELEPOCH = 2333333  #@param {type:"integer"}
if MODELNAME:
  !mkdir -p /content/logs/{MODELNAME}
  !cp /content/drive/MyDrive/{MODELNAME}_files/*.index /content/logs/{MODELNAME}/
  !cp /content/drive/MyDrive/{MODELNAME}_files/{MODELNAME}.pth /content/weights/
  if MODE == 'Training':
    !cp /content/drive/MyDrive/{MODELNAME}_files/D_{MODELEPOCH}.pth /content/logs/{MODELNAME}/
    !cp /content/drive/MyDrive/{MODELNAME}_files/G_{MODELEPOCH}.pth /content/logs/{MODELNAME}/
    !cp /content/drive/MyDrive/{MODELNAME}_files/*.npy /content/logs/{MODELNAME}/

In [None]:
#@title Install requirements
# Python dependencies are read from requirements.txt in the working directory.
# The aria2 package provides the aria2c downloader used by the download cells.
!pip install -r requirements.txt
!apt -y install -qq aria2

In [None]:
#@title Download pretrained models

# v1
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D32k.pth -d /content/pretrained -o D32k.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D40k.pth -d /content/pretrained -o D40k.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D48k.pth -d /content/pretrained -o D48k.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G32k.pth -d /content/pretrained -o G32k.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G40k.pth -d /content/pretrained -o G40k.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G48k.pth -d /content/pretrained -o G48k.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D32k.pth -d /content/pretrained -o f0D32k.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D40k.pth -d /content/pretrained -o f0D40k.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D48k.pth -d /content/pretrained -o f0D48k.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G32k.pth -d /content/pretrained -o f0G32k.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G40k.pth -d /content/pretrained -o f0G40k.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G48k.pth -d /content/pretrained -o f0G48k.pth

# v2
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D40k.pth -d /content/pretrained_v2 -o D40k.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G40k.pth -d /content/pretrained_v2 -o G40k.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D40k.pth -d /content/pretrained_v2 -o f0D40k.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G40k.pth -d /content/pretrained_v2 -o f0G40k.pth

In [None]:
#@title Download Vocal Separation model
# Fetch the HP2 and HP5 weight files from the uvr5_weights folder of the Hugging Face
# repository into /content/uvr5_weights, keeping their original file names.
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2-人声vocals+非人声instrumentals.pth -d /content/uvr5_weights -o HP2-人声vocals+非人声instrumentals.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5-主旋律人声vocals+其他instrumentals.pth -d /content/uvr5_weights -o HP5-主旋律人声vocals+其他instrumentals.pth

In [None]:
#@title Download hubert_base model
# Saved as /content/hubert_base.pt. The inference cell opens 'hubert_base.pt' by
# relative path, so it expects /content to be the working directory.
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -d /content -o hubert_base.pt

In [None]:
#@title Run WebUI for Training, skip if Inference
if MODE == 'Training':
  # Optional TensorBoard view, disabled by default.
  # NOTE(review): the logdir below differs from /content/logs, which the other
  # cells use for model files - confirm the path before enabling.
  # %load_ext tensorboard
  # %tensorboard --logdir /content/Retrieval-based-Voice-Conversion-WebUI/logs
  # Start the WebUI script from the cloned repository with its Colab flag.
  !python3 infer-web.py --colab --pycmd python3

In [None]:
#@title Manually back up the trained model files to Google Drive for Mode: "Training"
#@markdown MODELNAME should be the EXPERIMENT_NAME that you typed.

#@markdown If the name of the model is john, and in the logs/john folder is a file called D_2333333.pth, set the MODELNAME: john and MODELEPOCH: 2333333

if MODE == 'Training':

#@markdown Model name
  MODELNAME = ""  #@param {type:"string"}
  if MODELNAME:
#@markdown Epoch number
    MODELEPOCH = 2333333  #@param {type:"integer"}
#@markdown Save intermediate models if you would like to continue training the model later
    SAVE_INTERMEDIATE = True  #@param {type:"boolean"}

    !mkdir -p /content/drive/MyDrive/{MODELNAME}_files

    if SAVE_INTERMEDIATE:
      !cp /content/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/drive/MyDrive/{MODELNAME}_files/
      !cp /content/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/drive/MyDrive/{MODELNAME}_files/
      !cp /content/logs/{MODELNAME}/total_*.npy /content/drive/MyDrive/{MODELNAME}_files/

    !cp /content/logs/{MODELNAME}/added_*.index /content/drive/MyDrive/{MODELNAME}_files/
    !cp /content/weights/{MODELNAME}.pth /content/drive/MyDrive/{MODELNAME}_files/

# Inference

Run the cells below only when MODE is set to Inference.

When prompted `Proceed (Y/n)?`, click beside it, type `Y` and press `Enter`.

If the output contains `WARNING: The following packages were previously imported in this runtime: [numpy] You must restart the runtime in order to use newly installed versions.`, click `Restart Runtime` and then continue with the next cell.

In [None]:
#@title Install specific numpy version. If needed, click Restart Runtime before running the bottom two cells.
if MODE == 'Inference':
  # pip asks for confirmation ("Proceed (Y/n)?") before removing the installed numpy.
  !pip uninstall numpy
  # Install the pinned numpy version used for the inference cells.
  !pip install numpy==1.23.5

In [None]:
# MODE is undefined if the cell that sets it has not been run in this kernel
# (for example after a runtime restart); fall back to Inference in that case.
try:
  MODE
except NameError:
  MODE = 'Inference'

#@title Set your NGROK_AUTH_TOKEN.
if MODE == 'Inference':
  # flask-ngrok2 provides run_with_ngrok, imported by the inference server cell.
  !pip install flask-ngrok2 -q

#@markdown Obtain your Ngrok auth token from [here](https://dashboard.ngrok.com/get-started/your-authtoken)
  NGROK_AUTH_TOKEN = '' #@param {type:"string"}

In [None]:
# Abort this cell early when the notebook is not in Inference mode, so the
# server below is never started by accident during a training session.
if MODE != 'Inference':
  raise Exception('Mode is not set to Inference.')


#@title Run RVC Inference server
import json
import sys
import wave
from pathlib import Path

BASE_DIR = Path('/content')
sys.path.append(str(BASE_DIR))

import torch
from multiprocessing import cpu_count
from flask_ngrok2 import run_with_ngrok
from flask import Flask, request, send_file

from vc_infer_pipeline import VC
from infer_pack.models import (
    SynthesizerTrnMs256NSFsid,
    SynthesizerTrnMs256NSFsid_nono,
    SynthesizerTrnMs768NSFsid,
    SynthesizerTrnMs768NSFsid_nono,
)
from my_utils import load_audio
from fairseq import checkpoint_utils
from scipy.io import wavfile


# Scratch files shared by all requests: the uploaded audio is saved to
# INPUT_VOICE_PATH and the converted audio is written to OUTPUT_VOICE_PATH.
INPUT_VOICE_PATH = 'input.mp3'
OUTPUT_VOICE_PATH = 'output.wav'
# Name of the voice model currently loaded; updated from the `model` request argument.
MODEL_NAME = ''
# Torch device string used for the HuBERT model and the voice model.
DEVICE = 'cuda:0'
# Checkpoint of the currently loaded voice model, as returned by torch.load (None until one is loaded).
cpt = None


class Config:
  """Device and precision settings for the inference pipeline.

  Constructing an instance probes the runtime (CUDA, then MPS, then CPU),
  adjusts ``device`` / ``is_half`` accordingly and stores the four ``x_*``
  values returned by :meth:`device_config`.
  """

  def __init__(self, device, is_half):
    self.device = device
    self.is_half = is_half
    self.n_cpu = 0
    self.gpu_name = None
    self.gpu_mem = None
    self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()

  @staticmethod
  def _replace_in_file(path, old, new):
    """Rewrite the file at ``path`` with every ``old`` replaced by ``new``."""
    with open(path, "r") as f:
      text = f.read().replace(old, new)
    with open(path, "w") as f:
      f.write(text)

  def _patch_preprocess_script(self):
    """Replace "3.7" with "3.0" inside trainset_preprocess_pipeline_print.py."""
    self._replace_in_file("trainset_preprocess_pipeline_print.py", "3.7", "3.0")

  def _configure_cuda(self):
    """Fill in gpu_name / gpu_mem and fall back to single precision where required."""
    gpu_index = int(self.device.split(":")[-1])
    self.gpu_name = torch.cuda.get_device_name(gpu_index)
    upper_name = self.gpu_name.upper()

    needs_single_precision = (
      ("16" in self.gpu_name and "V100" not in upper_name)
      or "P40" in upper_name
      or any(series in self.gpu_name for series in ("1060", "1070", "1080"))
    )
    if needs_single_precision:
      print("16 series/10 series P40 forced single precision")
      self.is_half = False
      # Replace every "true" with "false" in the per-sample-rate config files.
      for config_file in ["32k.json", "40k.json", "48k.json"]:
        self._replace_in_file(f"configs/{config_file}", "true", "false")
      self._patch_preprocess_script()
    else:
      # The name is only kept for cards that forced single precision.
      self.gpu_name = None

    # Total device memory in GiB, offset by 0.4 before truncating to an int.
    self.gpu_mem = int(
      torch.cuda.get_device_properties(gpu_index).total_memory
      / 1024
      / 1024
      / 1024
      + 0.4
    )
    if self.gpu_mem <= 4:
      self._patch_preprocess_script()

  def device_config(self) -> tuple:
    """Probe the hardware and return ``(x_pad, x_query, x_center, x_max)``.

    Side effects: may change ``device``, ``is_half``, ``gpu_name``,
    ``gpu_mem`` and ``n_cpu``, and may rewrite config files on disk when a
    CUDA card forces single precision or has at most 4 GB of memory.
    """
    if torch.cuda.is_available():
      self._configure_cuda()
    elif torch.backends.mps.is_available():
      print("No supported N-card found, use MPS for inference")
      self.device = "mps"
    else:
      print("No supported N-card found, use CPU for inference")
      self.device = "cpu"
      # NOTE(review): half precision is switched ON for the CPU fallback - confirm this is intended.
      self.is_half = True

    if self.n_cpu == 0:
      self.n_cpu = cpu_count()

    # Cards with at most 4 GB override the precision-based presets.
    if self.gpu_mem is not None and self.gpu_mem <= 4:
      return 1, 5, 30, 32

    if self.is_half:
      # 6G memory config
      return 3, 10, 60, 65

    # 5G memory config
    return 1, 6, 38, 41


# Shared settings object: half precision is requested, but Config may change
# device and is_half after probing the hardware.
CONFIG = Config(DEVICE, True)


def load_hubert():
  """Load the HuBERT model from ``hubert_base.pt`` and prepare it for inference.

  The checkpoint is opened by relative path, moved to ``DEVICE`` and cast to
  the precision recorded in ``CONFIG.is_half``. ``Config`` can switch half
  precision off for some GPUs; following that setting keeps this model in the
  same precision as the ``CONFIG`` object that is handed to ``VC``.

  Returns:
    The first model of the loaded ensemble, in eval mode.
  """
  models, _saved_cfg, _task = checkpoint_utils.load_model_ensemble_and_task(['hubert_base.pt'], suffix='', )
  hubert = models[0].to(DEVICE)

  # Follow the probed precision instead of always forcing fp16.
  hubert = hubert.half() if CONFIG.is_half else hubert.float()

  hubert.eval()
  return hubert

# Loaded once when the cell runs and reused by every inference request.
HUBERT_MODEL = load_hubert()


def get_vc(device, is_half, config):
  """Load the voice model named by the global ``MODEL_NAME``.

  The checkpoint ``<BASE_DIR>/weights/<MODEL_NAME>.pth`` is read, the matching
  synthesizer is built and a ``VC`` pipeline object is created. The results are
  published through the module globals ``cpt``, ``version``, ``net_g``,
  ``tgt_sr`` and ``vc``. The globals are only assigned after the whole load
  succeeded, so a failure never leaves them half-updated.

  Args:
    device: torch device the synthesizer is moved to.
    is_half: cast the synthesizer to half precision when true, float otherwise.
    config: settings object passed on to ``VC``.

  Returns:
    None. When the weights file does not exist a message is printed and the
    globals are left untouched.

  Raises:
    ValueError: if the checkpoint declares a version other than "v1" or "v2".
  """
  global cpt, version, net_g, tgt_sr, vc
  model_path = BASE_DIR / 'weights' / f'{MODEL_NAME}.pth'
  if not model_path.exists():
    print(f'The model {model_path} does not exist. Please ensure that you have filled in the proper MODEL_NAME in your .env file.')
    return None

  model_path = str(model_path)
  print(f'loading pth {model_path}')
  # NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
  checkpoint = torch.load(model_path, map_location='cpu')
  # The last config entry is used as the output sample rate.
  sample_rate = checkpoint["config"][-1]
  # Third-from-last config entry is set to the row count of the emb_g table
  # (presumably the number of speakers - confirm against the model definition).
  checkpoint["config"][-3] = checkpoint["weight"]["emb_g.weight"].shape[0]
  if_f0 = checkpoint.get("f0", 1)
  model_version = checkpoint.get("version", "v1")

  # version -> (class used when f0 == 1, class used otherwise)
  synthesizers = {
    "v1": (SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono),
    "v2": (SynthesizerTrnMs768NSFsid, SynthesizerTrnMs768NSFsid_nono),
  }
  if model_version not in synthesizers:
    # Previously this fell through and silently reused whatever net_g was loaded before.
    raise ValueError(f'Unsupported model version {model_version!r} in {model_path}')

  with_f0_cls, without_f0_cls = synthesizers[model_version]
  if if_f0 == 1:
    model = with_f0_cls(*checkpoint["config"], is_half=is_half)
  else:
    model = without_f0_cls(*checkpoint["config"])

  # enc_q is removed before loading the weights; strict=False tolerates the mismatch.
  del model.enc_q
  print(model.load_state_dict(checkpoint["weight"], strict=False))
  model.eval().to(device)

  model = model.half() if is_half else model.float()

  pipeline = VC(sample_rate, config)

  # Publish everything in one step now that nothing can fail any more.
  cpt, version, net_g, tgt_sr, vc = checkpoint, model_version, model, sample_rate, pipeline


def rvc_infer(pitch_change, pitch_extraction_algo, volume_envelope, index_rate):
  """Convert ``INPUT_VOICE_PATH`` with the loaded model and write ``OUTPUT_VOICE_PATH``.

  Uses the module globals set by ``get_vc`` (``cpt``, ``vc``, ``net_g``,
  ``tgt_sr``, ``version``) together with ``HUBERT_MODEL``.

  Args:
    pitch_change: pitch shift value forwarded to the pipeline.
    pitch_extraction_algo: name of the pitch extraction method forwarded to the pipeline.
    volume_envelope: volume envelope value forwarded to the pipeline.
    index_rate: index rate value forwarded to the pipeline.
  """
  # The feature index is optional: a model without a logs folder (or without
  # an .index file in it) is run with an empty index path instead of crashing.
  logs_dir = BASE_DIR / 'logs' / MODEL_NAME
  index_path = ''
  if logs_dir.is_dir():
    # Sorted so the same file is chosen every time when several are present.
    index_files = sorted(entry for entry in logs_dir.iterdir() if entry.suffix == '.index')
    if index_files:
      index_path = str(index_files[0])

  # vc single
  audio = load_audio(INPUT_VOICE_PATH, 16000)
  times = [0, 0, 0]
  if_f0 = cpt.get('f0', 1)
  # Positional arguments follow VC.pipeline's signature in vc_infer_pipeline.py.
  audio_opt = vc.pipeline(HUBERT_MODEL, net_g, 0, audio, INPUT_VOICE_PATH, times, pitch_change, pitch_extraction_algo, index_path, index_rate, if_f0, 3, tgt_sr, 0, volume_envelope, version, 0.33, f0_file=None)
  wavfile.write(OUTPUT_VOICE_PATH, tgt_sr, audio_opt)


# Flask application for the inference server, registered with flask_ngrok2
# using the NGROK_AUTH_TOKEN entered in the previous cell.
app = Flask(__name__)
run_with_ngrok(app, auth_token=NGROK_AUTH_TOKEN)

@app.route('/', methods=['GET'])
def test():
  """Connectivity check: always answers with the same JSON status document."""
  payload = dict(status='OK', message='Test')
  return json.dumps(payload)


@app.route('/infer', methods=['POST'])
def infer():
  """Convert the uploaded audio with the requested voice model.

  Query arguments: ``model`` (model name), ``pitch`` (int), ``algo`` (pitch
  extraction method), ``volume`` (float) and ``index_rate`` (float). The audio
  is expected as the multipart file ``audio_file``.

  Returns:
    The converted audio as ``audio/wav``, or a JSON error document with status
    400 (bad arguments) or 404 (model could not be loaded).
  """
  global MODEL_NAME, cpt

  def error(message, status):
    return json.dumps({'status': 'ERROR', 'message': message}), status

  model_name = request.args.get('model')
  # The name is spliced into filesystem paths, so only a plain file name is accepted.
  if not model_name or model_name in ('.', '..') or Path(model_name).name != model_name:
    return error('Missing or invalid "model" argument.', 400)

  try:
    pitch_change = int(request.args.get('pitch'))
    volume_envelope = float(request.args.get('volume'))
    index_rate = float(request.args.get('index_rate'))
  except (TypeError, ValueError):
    return error('"pitch", "volume" and "index_rate" must be numeric.', 400)
  pitch_extraction_algo = request.args.get('algo')
  audio_data = request.files['audio_file']

  # Reload when another model is requested or when no checkpoint is loaded
  # (first request, or a previous load attempt failed).
  if MODEL_NAME != model_name or cpt is None:
    # Reset instead of `del cpt`: deleting would unbind the global and make
    # every later `cpt` lookup raise NameError after one failed load.
    cpt = None
    MODEL_NAME = model_name
    # Precision follows CONFIG so the synthesizer matches the settings given to VC.
    get_vc(DEVICE, CONFIG.is_half, CONFIG)
    if cpt is None:
      # Nothing was loaded; forget the name so the next request tries again.
      MODEL_NAME = ''
      return error(f'Model "{model_name}" could not be loaded.', 404)

  audio_data.save(INPUT_VOICE_PATH)
  rvc_infer(pitch_change, pitch_extraction_algo, volume_envelope, index_rate)
  return send_file(OUTPUT_VOICE_PATH, mimetype="audio/wav")

# Start the Flask server; this call keeps the cell running while the server is up.
app.run()