In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import torch
import torchaudio
import commons
import utils
import data_utils
from models import SynthesizerTrn
from hubert_model import hubert_soft
from scipy.io.wavfile import write

## LJ Speech

In [2]:
# Select the compute device first so every model below can be placed on it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters for the LJ Speech configuration.
hps = utils.get_hparams_from_file("./configs/ljs_base.json")

# convert() passes its input to `hubert.units` after moving it to `device`,
# so the encoder has to live on the same device; otherwise a CUDA machine
# hits a CPU/GPU tensor mismatch.
# NOTE(review): assumes hubert_soft returns a torch.nn.Module (so `.to` is
# available and is a no-op if the model is already on `device`) — confirm
# against hubert_model.
hubert = hubert_soft("checkpoint/hubert-soft.pt").to(device)

In [3]:
# Build the synthesizer from the loaded hyperparameters, move it to the
# selected device and switch it to evaluation mode in a single chained
# expression (`eval()` hands back the module itself).
net_g = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model,
).to(device).eval()

# Restore the saved checkpoint into net_g. Binding the result to `_` keeps
# the cell from echoing the return value as its output.
_ = utils.load_checkpoint("logs/ljs/G_36000.pth", net_g, None)

INFO:root:Loaded checkpoint 'logs/ljs/G_36000.pth' (iteration 34)


In [4]:
def convert(path):
    """Convert the audio file at ``path`` and play the input and the result.

    The file is loaded with torchaudio, down-mixed to a single channel,
    resampled to ``hps.data.sampling_rate`` when its rate differs, encoded
    with ``hubert.units`` and passed to ``net_g.infer``. The prepared input
    and the generated waveform are then shown as two inline audio players.

    Args:
        path: Path of an audio file readable by ``torchaudio.load``.

    Returns:
        None. Both clips are displayed, nothing is returned, so calling this
        as the last expression of a cell prints no extra output.
    """
    target_sr = hps.data.sampling_rate

    with torch.inference_mode():
        source, sr = torchaudio.load(path)

        # torchaudio.load yields (channels, time). Average multi-channel
        # files down to one channel so the unsqueeze below always produces
        # (1, 1, time) instead of (1, channels, time).
        # NOTE(review): assumes hubert.units wants a single-channel batch —
        # confirm against hubert_model.
        if source.size(0) > 1:
            source = source.mean(dim=0, keepdim=True)

        if sr != target_sr:
            source = torchaudio.functional.resample(source, sr, target_sr)

        source = source.unsqueeze(0)
        unit = hubert.units(source.to(device))
        x_tst_lengths = torch.tensor([unit.size(1)], dtype=torch.long, device=device)

        generated = net_g.infer(
            unit,
            x_tst_lengths,
            noise_scale=.667,
            noise_scale_w=0.8,
            length_scale=1,
        )[0][0, 0]

        # ipd.Audio(normalize=False) raises ValueError for samples outside
        # [-1, 1]; resampling and synthesis can overshoot slightly, so clamp
        # both clips before handing them to the player.
        source_wave = source.squeeze().clamp(-1.0, 1.0)
        audio = generated.float().clamp(-1.0, 1.0).cpu().numpy()

    ipd.display(ipd.Audio(source_wave, rate=target_sr, normalize=False))
    ipd.display(ipd.Audio(audio, rate=target_sr, normalize=False))

In [5]:
# Run the conversion on val/en1.mp3; convert() displays the input clip
# followed by the generated clip.
path = "val/en1.mp3"
convert(path)

In [9]:
# Run the conversion on val/jp1.mp3; convert() displays the input clip
# followed by the generated clip.
path = "val/jp1.mp3"
convert(path)

In [11]:
# Run the conversion on val/cn1.mp3; convert() displays the input clip
# followed by the generated clip.
path = "val/cn1.mp3"
convert(path)

In [12]:
# Run the conversion on val/meian_0005.wav; convert() displays the input
# clip followed by the generated clip.
path = "val/meian_0005.wav"
convert(path)

In [15]:
# Run the conversion on Dummy1/LJ001-0022.wav; convert() displays the input
# clip followed by the generated clip.
# NOTE(review): the file name looks like an LJ Speech utterance id, so this
# is presumably an in-domain sample — confirm.
path = "Dummy1/LJ001-0022.wav"
convert(path)