In [1]:
!nvidia-smi -L
!nvidia-smi

GPU 0: NVIDIA GeForce GTX 1650 (UUID: GPU-e3b7af2d-96f6-a76f-9060-1227760cafdf)
Sun Mar 27 22:57:21 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 497.09       Driver Version: 497.09       CUDA Version: 11.5     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0  On |                  N/A |
| 32%   38C    P8     8W /  75W |   1005MiB /  4096MiB |      9%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------

In [2]:
!gdown https://drive.google.com/uc?id=1c8fhNEo1GDlXTFzmne4cfOUo3htPWoJe

^C


In [4]:
!unzip LPL_TalkNet.zip

"unzip" �� ���� ����७��� ��� ���譥�
��������, �ᯮ��塞�� �ணࠬ��� ��� ������ 䠩���.
Downloading...
From: https://drive.google.com/uc?id=1c8fhNEo1GDlXTFzmne4cfOUo3htPWoJe
To: c:\Users\stymlice\Documents\GitHub\labTalkNet\LPL_TalkNet.zip

  0%|          | 0.00/107M [00:00<?, ?B/s]
  0%|          | 524k/107M [00:00<01:42, 1.04MB/s]
  1%|          | 1.05M/107M [00:00<01:02, 1.69MB/s]
  1%|▏         | 1.57M/107M [00:00<00:44, 2.40MB/s]
  2%|▏         | 2.10M/107M [00:00<00:34, 3.01MB/s]
  2%|▏         | 2.62M/107M [00:01<00:32, 3.21MB/s]
  3%|▎         | 3.15M/107M [00:01<00:28, 3.67MB/s]
  3%|▎         | 3.67M/107M [00:01<00:25, 4.01MB/s]
  4%|▍         | 4.19M/107M [00:01<00:24, 4.22MB/s]
  4%|▍         | 4.72M/107M [00:01<00:26, 3.91MB/s]
  5%|▍         | 5.24M/107M [00:01<00:24, 4.18MB/s]
  5%|▌         | 5.77M/107M [00:02<00:45, 2.23MB/s]
  6%|▌         | 6.29M/107M [00:02<00:42, 2.38MB/s]
  8%|▊         | 8.39M/107M [00:02<00:19, 4.97MB/s]
  9%|▉         | 9.44M/107M [00:02<00:23, 4.07MB/s]

In [2]:
import json
import os
import sys
import numpy as np
import resampy
import scipy
import torch

sys.path.append("hifi-gan")
from denoiser import Denoiser
from env import AttrDict
from meldataset import MAX_WAV_VALUE, mel_spectrogram
from models import Generator


class vHiFiGAN:
    def __init__(self, model_path, conf_name, device):
        # Load HiFi-GAN
        conf = os.path.join("hifi-gan", conf_name + ".json")
        with open(conf) as f:
            json_config = json.loads(f.read())
        self.h = AttrDict(json_config)
        torch.manual_seed(self.h.seed)
        self.hifigan = Generator(self.h).to(torch.device(device))
        state_dict_g = torch.load(model_path, map_location=torch.device(device))
        self.hifigan.load_state_dict(state_dict_g["generator"])
        self.hifigan.eval()
        self.hifigan.remove_weight_norm()
        self.denoiser = Denoiser(self.hifigan, mode="normal")
        self.device = device

    def vocode(self, spect):
        y_g_hat = self.hifigan(spect.float())
        audio = y_g_hat.squeeze()
        audio = audio * MAX_WAV_VALUE
        audio_denoised = self.denoiser(audio.view(1, -1), strength=35)[:, 0]
        return (
            audio_denoised.detach().cpu().numpy().reshape(-1).astype(np.int16),
            audio_denoised.detach().cpu(),
        )

    def superres(self, audio, original_sr):
        # Resampling
        wave = resampy.resample(
            audio,
            original_sr,
            self.h.sampling_rate,
            filter="sinc_window",
            window=scipy.signal.windows.hann,
            num_zeros=8,
        )
        wave_out = wave.astype(np.int16)

        # Super-res
        wave = wave / MAX_WAV_VALUE
        wave = torch.FloatTensor(wave).to(torch.device(self.device))
        new_mel = mel_spectrogram(
            wave.unsqueeze(0),
            self.h.n_fft,
            self.h.num_mels,
            self.h.sampling_rate,
            self.h.hop_size,
            self.h.win_size,
            self.h.fmin,
            self.h.fmax,
        )
        y_g_hat2 = self.hifigan(new_mel)
        audio2 = y_g_hat2.squeeze()
        audio2 = audio2 * MAX_WAV_VALUE
        audio2_denoised = self.denoiser(audio2.view(1, -1), strength=35)[:, 0]

        # High-pass filter, mixing and denormalizing
        audio2_denoised = audio2_denoised.detach().cpu().numpy().reshape(-1)
        b = scipy.signal.firwin(
            101, cutoff=10500, fs=self.h.sampling_rate, pass_zero=False
        )
        y = scipy.signal.lfilter(b, [1.0], audio2_denoised)
        y *= 4.0  # superres strength
        y_out = y.astype(np.int16)
        y_padded = np.zeros(wave_out.shape)
        y_padded[: y_out.shape[0]] = y_out
        sr_mix = wave_out + y_padded
        return sr_mix, self.h.sampling_rate


In [3]:
import torch
from nemo.collections.tts.models import TalkNetSpectModel, TalkNetDursModel, TalkNetPitchModel,HifiGanModel

spec_gen = TalkNetSpectModel.restore_from('TalkNetSpect.nemo')
spec_gen.add_module('_pitch_model', TalkNetPitchModel.restore_from('TalkNetPitch.nemo'))
spec_gen.add_module('_durs_model', TalkNetDursModel.restore_from('TalkNetDurs.nemo'))
vocoder = vHiFiGAN('hifiganmodel', "config_v1", "cuda:0")

[NeMo W 2022-03-27 21:35:09 modelPT:139] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.asr.data.audio_to_text.AudioToCharWithDursF0Dataset
      manifest_filepath: trainfiles.json
      max_duration: null
      min_duration: 0.1
      int_values: false
      load_audio: true
      normalize: false
      sample_rate: 22050
      trim: false
      durs_file: /content/drive/My Drive/LPL/durations.pt
      f0_file: /content/drive/My Drive/LPL/f0s.pt
      blanking: true
      vocab:
        notation: phonemes
        punct: true
        spaces: true
        stresses: false
        add_blank_at: last
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 16
      num_workers: 4
    
[NeMo W 2022-03-27 21:35:09 modelPT:146] If you intend to do validation, please call the ModelP

[NeMo I 2022-03-27 21:35:09 features:252] PADDING: 1
[NeMo I 2022-03-27 21:35:09 features:269] STFT using torch
[NeMo I 2022-03-27 21:35:10 modelPT:439] Model TalkNetSpectModel was successfully restored from TalkNetSpect.nemo.


[NeMo W 2022-03-27 21:35:11 modelPT:139] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.asr.data.audio_to_text.AudioToCharWithDursF0Dataset
      manifest_filepath: trainfiles.json
      max_duration: null
      min_duration: 0.1
      int_values: false
      load_audio: false
      normalize: false
      sample_rate: 22050
      trim: false
      durs_file: /content/drive/My Drive/LPL/durations.pt
      f0_file: /content/drive/My Drive/LPL/f0s.pt
      blanking: true
      vocab:
        notation: phonemes
        punct: true
        spaces: true
        stresses: false
        add_blank_at: last
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 16
      num_workers: 4
    
[NeMo W 2022-03-27 21:35:11 modelPT:146] If you intend to do validation, please call the Model

[NeMo I 2022-03-27 21:35:11 modelPT:439] Model TalkNetPitchModel was successfully restored from TalkNetPitch.nemo.


[NeMo W 2022-03-27 21:35:11 modelPT:139] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.asr.data.audio_to_text.AudioToCharWithDursF0Dataset
      manifest_filepath: trainfiles.json
      max_duration: null
      min_duration: 0.1
      int_values: false
      load_audio: false
      normalize: false
      sample_rate: 22050
      trim: false
      durs_file: /content/drive/My Drive/LPL/durations.pt
      f0_file: /content/drive/My Drive/LPL/f0s.pt
      blanking: true
      vocab:
        notation: phonemes
        punct: true
        spaces: true
        stresses: false
        add_blank_at: last
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 64
      num_workers: 4
    
[NeMo W 2022-03-27 21:35:11 modelPT:146] If you intend to do validation, please call the Model

[NeMo I 2022-03-27 21:35:11 modelPT:439] Model TalkNetDursModel was successfully restored from TalkNetDurs.nemo.
Removing weight norm...


      fft_window = pad_center(fft_window, filter_length)
    


In [4]:
def infer(str_input):
    with torch.no_grad():
        parsed = spec_gen.parse(str_input)
        gen_spec_kwargs = {}
           
        spectrogram = spec_gen.generate_spectrogram(tokens=parsed, **gen_spec_kwargs)
        #audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)
        audio, audio_torch = vocoder.vocode(spectrogram)
#        audio = vocoder._bias_denoise(audio, spectrogram).squeeze(1)
    if spectrogram is not None:
        if isinstance(spectrogram, torch.Tensor):
            spectrogram = spectrogram.to('cpu').numpy()
        if len(spectrogram.shape) == 3:
            spectrogram = spectrogram[0]
    if isinstance(audio, torch.Tensor):
        audio = audio.to('cpu').numpy()
    return spectrogram, audio

In [26]:
text_to_generate = input("Input text to synthesize: ")
spec, audio = infer(text_to_generate)
import IPython.display as ipd
import numpy as np
from scipy.io.wavfile import write

audio_path = 'C:/Users/stymlice/Documents/GitHub/labTalkNet/output.wav'
write(audio_path, 22050, audio)
ipd.Audio(audio, rate=22050, autoplay=True)
