In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import torch
import torchaudio
import commons
import utils
import data_utils
from models import SynthesizerTrn
from hubert_model import hubert_soft
from scipy.io.wavfile import write

In [2]:
# Hyperparameter JSON; parsed by utils.get_hparams_from_file in the next cell.
config = "./configs/en50jp50.json"
# Generator checkpoint that utils.load_checkpoint restores into net_g below.
g_checkpoint = "logs/en50jp50/G_48000.pth"

## LJ Speech

In [3]:
# Pick the compute device first so every model created below can be placed on it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters parsed from the JSON file named by `config`.
hps = utils.get_hparams_from_file(config)

# convert() hands the unit encoder tensors that live on `device`, so the encoder
# has to be on the same device or the forward pass fails on CUDA machines.
# NOTE(review): assumes hubert_soft returns a torch.nn.Module (so .to() exists) — confirm.
hubert = hubert_soft("checkpoint/hubert-soft.pt").to(device)

In [4]:
# Build the generator from the loaded hyperparameters, move it to `device`,
# switch it to eval mode, and then restore the trained weights.
net_g = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,  # presumably the one-sided spectrogram bin count — confirm
    hps.train.segment_size // hps.data.hop_length,  # segment size expressed in hop-length steps
    **hps.model,
)
net_g = net_g.to(device)
net_g.eval()  # not the last expression of the cell, so nothing is echoed

# Third argument is None here (presumably the optimizer slot — confirm against utils);
# binding the result to `_` keeps the return value out of the cell output.
_ = utils.load_checkpoint(g_checkpoint, net_g, None)

INFO:root:Loaded checkpoint 'logs/en50jp50/G_48000.pth' (iteration 64)


In [5]:
def convert(path, noise_scale=.667, noise_scale_w=0.8, length_scale=1):
    """Convert one audio file with the loaded generator and show two players.

    The file is loaded, downmixed to a single channel, resampled to
    ``hps.data.sampling_rate`` when necessary, encoded with ``hubert.units``
    and passed through ``net_g.infer``. The source clip and the converted clip
    are then displayed as inline audio widgets.

    Args:
        path: Audio file to read with ``torchaudio.load``.
        noise_scale: Forwarded to ``net_g.infer`` (default matches the
            previously hard-coded value).
        noise_scale_w: Forwarded to ``net_g.infer``.
        length_scale: Forwarded to ``net_g.infer``.

    Returns:
        None. Results are displayed, not returned, so that calling this as the
        last statement of a cell does not echo an array.
    """
    with torch.inference_mode():
        source, sr = torchaudio.load(path)

        # torchaudio.load yields (channels, frames). Averaging the channels keeps
        # the tensor passed to hubert.units at the same (1, 1, frames) shape that
        # mono files already produce, instead of (1, channels, frames).
        if source.size(0) > 1:
            source = source.mean(dim=0, keepdim=True)

        if sr != hps.data.sampling_rate:
            source = torchaudio.functional.resample(source, sr, hps.data.sampling_rate)

        source = source.unsqueeze(0)
        unit = hubert.units(source.to(device))
        print(unit.size())

        x_tst_lengths = torch.LongTensor([unit.size(1)]).to(device)
        converted = net_g.infer(
            unit,
            x_tst_lengths,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
        )[0][0, 0]

        # ipd.Audio(normalize=False) raises ValueError for samples outside
        # [-1, 1]; resampling or synthesis can overshoot slightly, so clamp
        # both signals for playback rather than failing the cell.
        source_audio = source.squeeze().clamp(-1.0, 1.0).cpu().numpy()
        audio = converted.detach().cpu().float().clamp(-1.0, 1.0).numpy()

    ipd.display(ipd.Audio(source_audio, rate=hps.data.sampling_rate, normalize=False))
    ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

In [6]:
# Run the conversion on val/en1.mp3; convert() prints the unit tensor size
# and displays the source and converted audio players.
path = "val/en1.mp3"
convert(path)

torch.Size([1, 574, 256])


In [11]:
# Run the conversion on val/cn1.mp3.
path = "val/cn1.mp3"
convert(path)

In [12]:
# Run the conversion on val/meian_0005.wav.
path = "val/meian_0005.wav"
convert(path)

In [9]:
# Run the conversion on val/meian_0004.wav.
path = "val/meian_0004.wav"
convert(path)

torch.Size([1, 638, 256])


In [15]:
# Run the conversion on Dummy1/LJ001-0022.wav.
path = "Dummy1/LJ001-0022.wav"
convert(path)

In [8]:
# Same input as the earlier val/en1.mp3 cell: this repeats that conversion.
# NOTE(review): duplicate cell; consider removing one of the two.
path = "val/en1.mp3"
convert(path)

torch.Size([1, 574, 256])
