In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import torch
import torchaudio
import commons
import utils
import data_utils
from models import SynthesizerTrn
from hubert_model import hubert_soft
from scipy.io.wavfile import write

In [2]:
# Hyper-parameter JSON and generator checkpoint used by the cells below.
config, g_checkpoint = (
    "./configs/ljs_base.json",
    "logs/ljs/G_36000.pth",
)

In [3]:
# Resolve the compute device first so the models created below can be placed on it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyper-parameters parsed from the JSON file named in `config`.
hps = utils.get_hparams_from_file(config)

# Unit encoder restored from its checkpoint file.
hubert = hubert_soft("checkpoint/hubert-soft.pt")
# convert() moves its input to `device` before calling hubert.units, so the
# encoder has to live on the same device or the call fails with a device
# mismatch on CUDA machines. The isinstance guard keeps this a no-op if
# hubert_soft ever returns something that is not a torch module.
if isinstance(hubert, torch.nn.Module):
    hubert = hubert.to(device)

In [4]:
# Build the generator on `device`. The two positional arguments are derived
# from the hyper-parameters (filter_length // 2 + 1 and
# segment_size // hop_length); every entry of hps.model is forwarded as a
# keyword argument.
net_g = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model
).to(device)
# Put the generator in evaluation mode; as a mid-cell statement the return
# value is not echoed, so no throw-away assignment is needed here.
net_g.eval()

# Restore the generator weights from `g_checkpoint`. The third argument is
# None (presumably the optimizer slot -- TODO confirm against utils). The
# result is bound to `_` so the cell does not echo it.
_ = utils.load_checkpoint(g_checkpoint, net_g, None)

INFO:root:Loaded checkpoint 'logs/ljs/G_36000.pth' (iteration 34)


In [5]:
def convert(path, noise_scale=.667, noise_scale_w=0.8, length_scale=1):
    """Convert the utterance stored at ``path`` and play input and result.

    The file is loaded with torchaudio, down-mixed to one channel, resampled
    to ``hps.data.sampling_rate`` when its native rate differs, encoded with
    ``hubert.units`` and decoded with ``net_g.infer``. The size of the unit
    tensor is printed, then two audio players are displayed: the (resampled)
    input first, the converted output second. Nothing is returned.

    Args:
        path: Audio file to convert; anything ``torchaudio.load`` accepts.
        noise_scale: Forwarded to ``net_g.infer`` (default matches the
            previously hard-coded value).
        noise_scale_w: Forwarded to ``net_g.infer`` (default matches the
            previously hard-coded value).
        length_scale: Forwarded to ``net_g.infer`` (default matches the
            previously hard-coded value).
    """
    target_sr = hps.data.sampling_rate
    with torch.inference_mode():
        # torchaudio.load returns a (channels, samples) tensor by default.
        source, sr = torchaudio.load(path)
        if source.size(0) > 1:
            # Down-mix stereo/multi-channel files so the batch built below is
            # (1, 1, samples) instead of (1, channels, samples).
            # NOTE(review): assumes hubert.units expects a single-channel
            # waveform batch -- confirm against hubert_model.
            source = source.mean(dim=0, keepdim=True)
        if sr != target_sr:
            source = torchaudio.functional.resample(source, sr, target_sr)
        source = source.unsqueeze(0)  # add the batch dimension

        unit = hubert.units(source.to(device))
        print(unit.size())
        x_tst_lengths = torch.LongTensor([unit.size(1)]).to(device)

        converted = net_g.infer(
            unit,
            x_tst_lengths,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
        )[0][0, 0]

        # ipd.Audio(normalize=False) raises ValueError for samples outside
        # [-1, 1]; resampling overshoot or the generator output can exceed that
        # range slightly, so clamp both signals. In-range samples are unchanged.
        audio = converted.cpu().float().clamp(-1.0, 1.0).numpy()
        source_audio = source.squeeze().clamp(-1.0, 1.0).numpy()

    ipd.display(ipd.Audio(source_audio, rate=target_sr, normalize=False))
    ipd.display(ipd.Audio(audio, rate=target_sr, normalize=False))

### 英語訓練データ
### データ数：１２９９８
### Epoch: ３４

In [6]:
# English
path = "val/en1.mp3"
convert(path)

torch.Size([1, 574, 256])


In [7]:
# Japanese
path = "val/jp1.mp3"
convert(path)

torch.Size([1, 417, 256])


In [8]:
# Validation data: English female speaker
path = "Dummy1/LJ034-0048.wav"
convert(path)

torch.Size([1, 261, 256])


In [9]:
# Validation data: Japanese male speaker
path = "jp1/meian_5708.wav"
convert(path)

torch.Size([1, 304, 256])


In [10]:
# Another Japanese female speaker -> English female speaker of the training data
# NOTE(review): absolute, machine-specific path; the cell only runs where this
# directory exists.
path = "/home/sunyuqiang/ml/data/jvs_ver1/jvs010/parallel100/wav24kHz16bit/VOICEACTRESS100_004.wav"
convert(path)

torch.Size([1, 439, 256])


In [11]:
# Another Japanese male speaker -> English female speaker of the training data
# NOTE(review): absolute, machine-specific path; the cell only runs where this
# directory exists.
path = "/home/sunyuqiang/ml/data/jvs_ver1/jvs012/parallel100/wav24kHz16bit/VOICEACTRESS100_004.wav"
convert(path)

torch.Size([1, 449, 256])


In [12]:
# Another Japanese female speaker -> English female speaker of the training data
# NOTE(review): absolute, machine-specific path; the cell only runs where this
# directory exists.
path = "/home/sunyuqiang/ml/data/jvs_ver1/jvs010/parallel100/wav24kHz16bit/VOICEACTRESS100_005.wav"
convert(path)

torch.Size([1, 858, 256])
