In [27]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence

from scipy.io.wavfile import write


def get_text(text, hps):
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm

## MB-iSTFT-VITS

In [28]:
hps = utils.get_hparams_from_file("./configs/ljs_mb_istft_vits.json")

In [49]:
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model)
_ = net_g.eval()

_ = utils.load_checkpoint("./logs/ljs_mb_istft_vits/G_65000.pth", net_g, None)
# _ = utils.load_checkpoint("logs/ljs_mb_istft_vits/G_1000000.pth", net_g, None)

Mutli-band iSTFT VITS


In [60]:
import time
def tts(txt,device="cuda"):
    
    stn_tst = get_text(txt, hps)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        t1 = time.time()
        audio = net_g.to(device).infer(x_tst.to(device), x_tst_lengths.to(device), noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].cpu().data.float().numpy()
        t2 = time.time()
        print("推理时间：", (t2-t1),"s")
    ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

In [74]:
tts("[JA]なんでこんなに慣れてんのよ。私の方が先に好きだったのに[JA]",device="cuda")
tts("[JA]こんにちは。私わあやちねねです。こんにちは。私わあやちねねです。こんにちは。私わあやちねねです。[JA]",device="cuda")


推理时间： 0.06965231895446777 s


推理时间： 0.09937572479248047 s


In [78]:
tts("[JA]なんでこんなに慣れてんのよ。私の方が先に好きだったのに[JA]",device="cpu")
tts("[JA]こんにちは。私わあやちねねです。こんにちは。私わあやちねねです。こんにちは。私わあやちねねです。[JA]",device="cpu")


推理时间： 0.2594280242919922 s


推理时间： 0.4499993324279785 s
