In [None]:
# Print the Python version of the runtime the notebook is attached to.
!python -V

In [None]:
#@markdown **Step 1:** Check which GPU you've been allocated.

# First list the visible GPUs one per line, then print the full nvidia-smi
# status table for the runtime.
!nvidia-smi -L
!nvidia-smi

In [None]:
#@markdown **Step 2:** Download dependencies.
%tensorflow_version 2.x
import os

custom_lists = [
    #"https://gist.githubusercontent.com/SortAnon/997cda157954a189259c9876fd804e53/raw/example_models.json",
]

!apt-get install sox libsndfile1 ffmpeg
!pip install tensorflow==2.4.1 dash==1.21.0 dash-bootstrap-components==0.13.0 jupyter-dash==0.4.0 psola wget unidecode pysptk frozendict torchvision==0.9.1 torchaudio==0.8.1 torchtext==0.9.1 torch_stft kaldiio pydub pyannote.audio g2p_en pesq pystoi crepe resampy ffmpeg-python torchcrepe einops taming-transformers-rom1504==0.0.6 tensorflow-hub gdown --upgrade
!python -m pip install git+https://github.com/SortAnon/NeMo.git
if not os.path.exists("hifi-gan"):
    !git clone -q --recursive https://github.com/SortAnon/hifi-gan
!git clone -q https://github.com/SortAnon/ControllableTalkNet
os.chdir("/content/ControllableTalkNet")
!git archive --output=./files.tar --format=tar HEAD
os.chdir("/content")
!tar xf ControllableTalkNet/files.tar
!rm -rf ControllableTalkNet

# PESQ fix
!python -m pip uninstall -y pesq
!python -m pip uninstall -y numpy
!python -m pip install numpy==1.19.5
!python -m pip --no-cache-dir install --no-build-isolation --no-binary :all: pesq==0.0.2

os.chdir("/content/model_lists")
for c in custom_lists:
    !wget "{c}"
os.chdir("/content")


# Download pretrained model

In [None]:
# Download the pretrained model archive from Google Drive in two steps:
#   1. the inner wget requests the file page, saves the session cookies to
#      /tmp/cookies.txt and pipes the page through sed to pull out the
#      `confirm=` token;
#   2. the outer wget repeats the request with that token and the saved
#      cookies, writing the result to Character_Talknet.zip in the current
#      directory.
# The cookie file is removed afterwards, but only if the download succeeded (`&&`).
# NOTE(review): `sudo` looks unnecessary here; `gdown` is installed in Step 2
# and may be a simpler way to fetch the same file id — confirm before changing.
!sudo wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=17vecivV8k2zEBmwAHLSFN-8UoLkJK5LQ' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=17vecivV8k2zEBmwAHLSFN-8UoLkJK5LQ" -O Character_Talknet.zip && rm -rf /tmp/cookies.txt

In [None]:
!unzip /content/Character_Talknet.zip

In [6]:
# Recursively grant read/write/execute to every user on everything under /content.
# NOTE(review): 777 is far broader than needed; acceptable only on a throwaway
# runtime — confirm whether this step is required at all.
!chmod -R 777 /content/

# Define class and functions

In [7]:
import json
import os
import sys
import numpy as np
import resampy
import scipy
import torch

sys.path.append("hifi-gan")
from denoiser import Denoiser
from env import AttrDict
from meldataset import MAX_WAV_VALUE, mel_spectrogram
from models import Generator


class vHiFiGAN:
    """HiFi-GAN vocoder wrapper with denoising and a super-resolution pass.

    The generator is loaded once in ``__init__`` and reused by ``vocode``
    (spectrogram -> waveform) and ``superres`` (resample + add generated
    high-frequency content). Both paths run the generator output through the
    ``Denoiser`` with a fixed strength of 35.
    """

    def __init__(self, model_path, conf_name, device):
        """Load the generator weights and build the denoiser.

        Args:
            model_path: Checkpoint readable by ``torch.load`` that stores the
                generator weights under the ``"generator"`` key.
            conf_name: Base name (without ``.json``) of a config file located
                in the ``hifi-gan`` directory.
            device: Device specification accepted by ``torch.device``.
        """
        # Load HiFi-GAN
        conf = os.path.join("hifi-gan", conf_name + ".json")
        with open(conf) as f:
            json_config = json.load(f)
        self.h = AttrDict(json_config)
        torch.manual_seed(self.h.seed)
        self.hifigan = Generator(self.h).to(torch.device(device))
        state_dict_g = torch.load(model_path, map_location=torch.device(device))
        self.hifigan.load_state_dict(state_dict_g["generator"])
        self.hifigan.eval()
        self.hifigan.remove_weight_norm()
        self.denoiser = Denoiser(self.hifigan, mode="normal")
        self.device = device

    @staticmethod
    def _to_int16(samples):
        """Cast samples to ``np.int16``, saturating instead of wrapping around.

        A bare ``astype(np.int16)`` wraps out-of-range values, which turns
        peaks into loud clicks; clipping first keeps them at full scale.
        """
        limits = np.iinfo(np.int16)
        return np.clip(samples, limits.min, limits.max).astype(np.int16)

    def vocode(self, spect):
        """Turn a spectrogram tensor into denoised audio.

        Args:
            spect: Spectrogram tensor accepted by the generator (cast to
                float before use).

        Returns:
            A tuple ``(audio_int16, audio_tensor)``: the denoised waveform as
            a flat ``np.int16`` array (saturated to the int16 range), and the
            same, unclipped waveform as a detached CPU tensor.
        """
        # Inference only: the result is detached anyway, so skip building the
        # autograd graph regardless of how the caller invokes us.
        with torch.no_grad():
            y_g_hat = self.hifigan(spect.float())
            audio = y_g_hat.squeeze()
            audio = audio * MAX_WAV_VALUE
            audio_denoised = self.denoiser(audio.view(1, -1), strength=35)[:, 0]
        audio_denoised = audio_denoised.detach().cpu()
        return (
            self._to_int16(audio_denoised.numpy().reshape(-1)),
            audio_denoised,
        )

    def superres(self, audio, original_sr):
        """Resample audio to the model rate and mix in generated high frequencies.

        Args:
            audio: Input samples as a NumPy array (assumed 1-D and on the
                int16 amplitude scale — TODO confirm against callers).
            original_sr: Sampling rate of ``audio``.

        Returns:
            A tuple ``(sr_mix, sampling_rate)``: the resampled signal plus the
            high-pass-filtered generator output as a floating-point array
            (it is not cast back to int16), and ``self.h.sampling_rate``.
        """
        # `import scipy` alone does not guarantee that the `signal` submodule
        # is loaded, so import it explicitly before using it.
        import scipy.signal

        # Resampling
        wave = resampy.resample(
            audio,
            original_sr,
            self.h.sampling_rate,
            filter="sinc_window",
            window=scipy.signal.windows.hann,
            num_zeros=8,
        )
        wave_out = self._to_int16(wave)

        # Super-res: re-synthesize the resampled signal from its own mel
        # spectrogram. No gradients are needed for this inference pass.
        wave = wave / MAX_WAV_VALUE
        wave = torch.FloatTensor(wave).to(torch.device(self.device))
        with torch.no_grad():
            new_mel = mel_spectrogram(
                wave.unsqueeze(0),
                self.h.n_fft,
                self.h.num_mels,
                self.h.sampling_rate,
                self.h.hop_size,
                self.h.win_size,
                self.h.fmin,
                self.h.fmax,
            )
            y_g_hat2 = self.hifigan(new_mel)
            audio2 = y_g_hat2.squeeze()
            audio2 = audio2 * MAX_WAV_VALUE
            audio2_denoised = self.denoiser(audio2.view(1, -1), strength=35)[:, 0]

        # High-pass filter, mixing and denormalizing
        audio2_denoised = audio2_denoised.detach().cpu().numpy().reshape(-1)
        b = scipy.signal.firwin(
            101, cutoff=10500, fs=self.h.sampling_rate, pass_zero=False
        )
        y = scipy.signal.lfilter(b, [1.0], audio2_denoised)
        y *= 4.0  # superres strength
        y_out = self._to_int16(y)
        y_padded = np.zeros(wave_out.shape)
        # Copy only the overlapping part so a generated signal that is longer
        # than the resampled one is truncated instead of raising.
        overlap = min(y_out.shape[0], y_padded.shape[0])
        y_padded[:overlap] = y_out[:overlap]
        sr_mix = wave_out + y_padded
        return sr_mix, self.h.sampling_rate


In [None]:
import torch
from nemo.collections.tts.models import (
    HifiGanModel,
    TalkNetDursModel,
    TalkNetPitchModel,
    TalkNetSpectModel,
)

# Restore the spectrogram generator, then register the pitch and duration
# predictors on it as submodules.
spec_gen = TalkNetSpectModel.restore_from("TalkNetSpect.nemo")
spec_gen.add_module(
    "_pitch_model",
    TalkNetPitchModel.restore_from("TalkNetPitch.nemo"),
)
spec_gen.add_module(
    "_durs_model",
    TalkNetDursModel.restore_from("TalkNetDurs.nemo"),
)

# Vocoder: the `hifiganmodel` checkpoint with the `config_v1` HiFi-GAN config,
# placed on the first CUDA device.
vocoder = vHiFiGAN("hifiganmodel", conf_name="config_v1", device="cuda:0")


In [9]:
def infer(str_input):
    """Synthesize speech for a text string.

    The text is tokenized with ``spec_gen.parse``, turned into a spectrogram
    with ``spec_gen.generate_spectrogram`` and vocoded with ``vocoder.vocode``;
    all three steps run with gradients disabled.

    Args:
        str_input: Text to synthesize.

    Returns:
        A tuple ``(spectrogram, audio)``. A tensor spectrogram is moved to the
        CPU and converted to NumPy, and a 3-D spectrogram is reduced to its
        first entry. ``audio`` is whatever ``vocoder.vocode`` returns first,
        converted to NumPy if it is a tensor.
    """
    with torch.no_grad():
        tokens = spec_gen.parse(str_input)
        spectrogram = spec_gen.generate_spectrogram(tokens=tokens)
        audio, _audio_tensor = vocoder.vocode(spectrogram)

    if isinstance(spectrogram, torch.Tensor):
        spectrogram = spectrogram.to('cpu').numpy()
    # Drop the leading dimension when the spectrogram is 3-D.
    if spectrogram is not None and len(spectrogram.shape) == 3:
        spectrogram = spectrogram[0]

    if isinstance(audio, torch.Tensor):
        audio = audio.to('cpu').numpy()
    return spectrogram, audio

# Result

In [13]:
import IPython.display as ipd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.pyplot import imshow
from PIL import Image

# Ask for a sentence and synthesize it.
text_to_generate = input("Input text to synthesize: ")
spec, audio = infer(text_to_generate)

# Keep the Audio object as the cell's last expression so the notebook renders
# the player; playback starts automatically at 22050 Hz.
ipd.Audio(data=audio, rate=22050, autoplay=True)

Input text to synthesize: The trees of the forest grow by forming new layers of wood directly under the bark.
