# 下载GitHub仓库

In [None]:
# Clone the vits_chinese repository under /root/autodl-tmp and make the
# checkout the working directory for the commands that follow.
%cd /root/autodl-tmp
# NOTE(review): on a re-run the target directory presumably already exists, so
# `git clone` will report an error and the existing checkout is entered instead.
!git clone https://github.com/TronFlyn/vits_chinese
%cd vits_chinese

In [None]:
# Fetch and merge upstream changes into the repository in the current
# working directory.
!git pull

# 查看 NVIDIA 显卡（GPU）状态

In [None]:
# Print the NVIDIA driver / GPU status table; the inference code in this
# notebook moves the model and tensors to CUDA, so a visible GPU is required.
!nvidia-smi

# 安装依赖

In [None]:
# Install the Python requirements into the environment of the running kernel.
# `%pip` is used instead of `!sudo pip` because a pip resolved through sudo may
# belong to a different interpreter than the one this notebook executes on,
# which would leave the packages un-importable from the kernel.
%cd /root/autodl-tmp/vits_chinese
%pip install -r requirements.txt
# espeak is a system package, so it is installed through apt rather than pip.
!sudo apt-get install espeak -y

# 处理数据包

In [None]:
# Copy the dataset archive mxj.zip from /root/autodl-tmp into the repository
# directory.
!cp /root/autodl-tmp/mxj.zip /root/autodl-tmp/vits_chinese/mxj.zip

In [None]:
# Extract mxj.zip into the mxj/ sub-directory of the repository
# (-q: quiet output, -d: extraction directory).
%cd /root/autodl-tmp/vits_chinese
!unzip -q -d /root/autodl-tmp/vits_chinese/mxj mxj.zip

# 文本预处理

In [None]:
# Make the repository root the working directory of the Python process and
# print it as a confirmation.
import os
path = "/root/autodl-tmp/vits_chinese"
os.chdir(path)
print(os.getcwd())

In [None]:
# Build the monotonic_align extension in place (setup.py build_ext --inplace),
# then step back up to the parent directory.
%cd /root/autodl-tmp/vits_chinese/monotonic_align
!python setup.py build_ext --inplace
%cd ..

In [None]:
# Set the working directory to the repository root by absolute path, so the
# relative paths used by the following commands resolve regardless of where the
# previous cell left off, and print it as a confirmation.
import os
path = "/root/autodl-tmp/vits_chinese"
os.chdir(path)
print(os.getcwd())

In [None]:
# Run preprocess.py over the two filelists (mxj.txt and mxj_val.txt).
# NOTE(review): --text_index 1 presumably selects the text field of each
# filelist line and --text_cleaners names the cleaner applied to it — confirm
# against preprocess.py.
!python preprocess.py --text_index 1 --text_cleaners chinese_cleaners1 --filelists /root/autodl-tmp/vits_chinese/mxj_text/mxj.txt /root/autodl-tmp/vits_chinese/mxj_text/mxj_val.txt

# 训练

In [None]:
# Launch training with the configs/mxj.json configuration.
# NOTE(review): the checkpoint loaded later in this notebook lives under
# logs/mxj/, so -m mxj presumably names that run directory — confirm against
# train.py.
!python train.py -c configs/mxj.json -m mxj

# 合成语音

In [None]:
# In the "Runtime" menu choose "Restart runtime" (i.e. restart the kernel),
# then start again from this cell to run inference and output the audio.
# The working directory is set to the repository root first so that the
# project-local imports that follow can be resolved.
import os
path = "/root/autodl-tmp/vits_chinese"
os.chdir(path)
print(os.getcwd())

%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence

from scipy.io.wavfile import write


def get_text(text, hps):
    """Turn a raw sentence into a tensor of symbol ids.

    Args:
        text: The sentence to convert.
        hps: Hyper-parameter object; ``hps.data.text_cleaners`` and
            ``hps.data.add_blank`` are read.

    Returns:
        A ``torch.LongTensor`` built from the id sequence produced by
        ``text_to_sequence``.
    """
    sequence = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        # Presumably places the id 0 between neighbouring symbols — confirm
        # against commons.intersperse.
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)

In [None]:
# Load the hyper-parameters from configs/mxj.json, the same configuration file
# that was passed to train.py.
hps = utils.get_hparams_from_file("/root/autodl-tmp/vits_chinese/configs/mxj.json")

In [None]:
# Build the synthesizer network from the loaded hyper-parameters and move it to
# the GPU.
net_g = SynthesizerTrn(
    len(symbols),  # size of the text symbol table
    hps.data.filter_length // 2 + 1,  # presumably the number of spectrogram bins — TODO confirm
    hps.train.segment_size // hps.data.hop_length,  # training segment size expressed in hops
    **hps.model).cuda()
# Switch to evaluation mode; the return value is deliberately discarded.
_ = net_g.eval()

# Load the trained weights into net_g; None is passed in the third position
# (presumably the optimizer slot, which inference does not need — confirm).
# NOTE(review): G_10000.pth is hard-coded; point this at a checkpoint that
# actually exists under logs/mxj/.
_ = utils.load_checkpoint("/root/autodl-tmp/vits_chinese/logs/mxj/G_10000.pth", net_g, None)

In [None]:
# Convert the sentence to a symbol-id tensor, run the model on the GPU without
# gradient tracking, and play the resulting waveform inline.
stn_tst = get_text("你这变态，就这么喜欢被我用脚踩吗！", hps)
with torch.no_grad():
    # Add a leading batch dimension of size 1.
    x_tst = stn_tst.cuda().unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    # NOTE(review): 1024 is a hard-coded speaker id; whether the model uses it
    # (and whether it is in range) depends on the model config — confirm.
    sid = torch.LongTensor([1024]).cuda()
    audio = (
        net_g.infer(
            x_tst,
            x_tst_lengths,
            sid=sid,
            noise_scale=0.667,
            noise_scale_w=0.8,
            length_scale=1,
        )[0][0, 0]
        .data.cpu()
        .float()
        .numpy()
    )
# Render an audio player at the sampling rate given in the config.
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate))