In [None]:
import librosa
import numpy as np
import torch
# from audio_processing import dynamic_range_decompression
# from audio_processing import dynamic_range_compression
from modules.FastDiff.module.FastDiff_model import FastDiff
from utils import audio
from modules.FastDiff.module.util import compute_hyperparams_given_schedule, sampling_given_noise_schedule
import IPython.display as ipd

# Load the pretrained FastDiff vocoder (download the checkpoint to this folder first).
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code -- only load checkpoints obtained from a trusted source.
# map_location="cpu" keeps deserialization independent of the device the
# checkpoint was saved from; load_state_dict then copies the weights into the
# CUDA-resident model.
state_dict = torch.load(
    "pretrained_models/LJSpeech/model_ckpt_steps_500000.ckpt",
    map_location="cpu",
)["state_dict"]["model"]
model = FastDiff().cuda()
model.load_state_dict(state_dict)
# Inference only: put the network in eval mode so any train-time-only layers
# (e.g. dropout, if the model has them) are disabled.
model.eval()

# hparams (do not change)
# STFT analysis settings used for the mel-spectrogram extraction.
fft_size, hop_size, win_length = 1024, 256, 1024
window = "hann"
# Mel filterbank settings; an fmin/fmax of -1 is treated downstream as
# "use 0 / sample_rate / 2".
num_mels = 80
fmin, fmax = 80, 7600
# Floor applied before taking log10 of the mel energies.
eps = 1e-6
sample_rate = 22050

In [None]:
# get diffusion schedule: 1000 linearly spaced values from 1e-6 to 0.01, on the GPU
train_noise_schedule = torch.linspace(1e-06, 0.01, 1000).cuda()
diffusion_hyperparams = compute_hyperparams_given_schedule(train_noise_schedule)

# map diffusion hyperparameters to gpu
# (only these entries are moved; any other entries are left untouched)
for key in ("beta", "alpha", "sigma"):
    if key in diffusion_hyperparams:
        diffusion_hyperparams[key] = diffusion_hyperparams[key].cuda()

# load noise schedule for 6 sampling steps (recommended)
# To use it, uncomment the two lines below and comment out the 4-step schedule.
#noise_schedule = torch.FloatTensor([1.7838445955931093e-06, 2.7984189728158526e-05, 0.00043231004383414984,
                                  # 0.006634317338466644, 0.09357017278671265, 0.6000000238418579]).cuda()
# load noise schedule for 4 sampling steps
noise_schedule = torch.tensor(
    [3.2176e-04, 2.5743e-03, 2.5376e-02, 7.0414e-01], dtype=torch.float32
).cuda()

In [None]:
# Direct inference from waveforms #
# Builds the log-mel spectrogram that conditions the vocoder.

# Resample to the configured rate on load so the audio matches the STFT/mel hparams.
wav, _ = librosa.load('egs/audios/LJ001-0001_gt.wav', sr=sample_rate)
# get amplitude spectrogram
x_stft = librosa.stft(wav, n_fft=fft_size, hop_length=hop_size,
                      win_length=win_length, window=window, pad_mode="constant")
spc = np.abs(x_stft)  # (n_bins, T)

# get mel basis
# -1 is a sentinel: fall back to 0 for fmin and to sample_rate / 2 for fmax.
fmin = 0 if fmin == -1 else fmin
fmax = sample_rate / 2 if fmax == -1 else fmax
# Pass everything by keyword: librosa >= 0.10 makes these parameters
# keyword-only, and the keyword form also works on older releases.
mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=fft_size, n_mels=num_mels,
                                fmin=fmin, fmax=fmax)
mel_spc = mel_basis @ spc
# Floor at eps before the log so silent bins do not produce -inf.
log_mel = np.log10(np.maximum(eps, mel_spc))  # (n_mel_bins, T)
mel = torch.from_numpy(log_mel).cuda()

In [None]:
# Number of output samples: mel frames (last axis of `mel`) times hop_size.
audio_length = mel.shape[-1] * hop_size
pred_wav = sampling_given_noise_schedule(
    model, (1, 1, audio_length), diffusion_hyperparams, noise_schedule,
    condition=mel, ddim=False, return_sequence=False)

# Scale so the largest absolute sample is 1, then flatten to a 1-D float32
# NumPy array on the CPU for saving and playback.
pred_wav = pred_wav / pred_wav.abs().max()
pred_wav = pred_wav.view(-1).cpu().float().numpy()
# Use the shared sample_rate for both the saved file and the player so the
# two can never disagree.
audio.save_wav(pred_wav, 'egs/audios/test.wav', sample_rate)
ipd.Audio(pred_wav, rate=sample_rate)