# Text-to-Speech with Flowtron and Waveglow

This is an English multi-speaker TTS demo trained on LibriTTS, built on the open-source project [NVIDIA/flowtron](https://github.com/NVIDIA/flowtron).

For other deep-learning Colab notebooks, visit [tugstugi/dl-colab-notebooks](https://github.com/tugstugi/dl-colab-notebooks).

## Install Flowtron and Waveglow

In [None]:
#@title
%tensorflow_version 1.x
import os
from os.path import exists, join, basename, splitext

git_repo_url = 'https://github.com/NVIDIA/flowtron.git'
project_name = splitext(basename(git_repo_url))[0]
if not exists(project_name):
  # clone and install
  !git clone -q --recursive {git_repo_url}
  !pip install -q librosa unidecode gdown
  
os.chdir(project_name)
from flowtron import Flowtron
from data import Data

import sys
sys.path.insert(0, 'tacotron2')
sys.path.insert(0, 'tacotron2/waveglow')
from glow import WaveGlow

from IPython.display import Audio
import matplotlib
import matplotlib.pylab as plt
plt.rcParams["axes.grid"] = False

TensorFlow 1.x selected.
[K     |████████████████████████████████| 245kB 9.2MB/s 
[?25h

## Download pretrained models

In [None]:
flowtron_pretrained_model = 'flowtron_libritts.pt'
if not exists(flowtron_pretrained_model):
  !gdown https://drive.google.com/uc?id=1KhJcPawFgmfvwV7tQAOeC253rYstLrs8
waveglow_pretrained_model = 'waveglow_256channels_universal_v5.pt'
if not exists(waveglow_pretrained_model):
  !gdown https://drive.google.com/uc?id=1rpK8CzAAirq9sWZhe9nlfvxMF1dRgFbF

Downloading...
From: https://drive.google.com/uc?id=1KhJcPawFgmfvwV7tQAOeC253rYstLrs8
To: /content/flowtron/flowtron_libritts.pt
244MB [00:01, 154MB/s]
Downloading...
From: https://drive.google.com/uc?id=1rpK8CzAAirq9sWZhe9nlfvxMF1dRgFbF
To: /content/flowtron/waveglow_256channels_universal_v5.pt
676MB [00:02, 228MB/s]


In [None]:
# Install torchaudio into the runtime. The version is not pinned; the recorded
# output of this cell shows pip resolving to torchaudio 0.7.2.
# NOTE(review): none of the cells visible here import torchaudio directly --
# confirm whether this install is still needed.
!pip install torchaudio

Collecting torchaudio
[?25l  Downloading https://files.pythonhosted.org/packages/37/16/ecdb9eb09ec6b8133d6c9536ea9e49cd13c9b5873c8488b8b765a39028da/torchaudio-0.7.2-cp37-cp37m-manylinux1_x86_64.whl (7.6MB)
[K     |████████████████████████████████| 7.6MB 6.7MB/s 
Installing collected packages: torchaudio
Successfully installed torchaudio-0.7.2


In [None]:
import json
import torch
import numpy as np
import pdb
import timeit

# Fixed seeds for the CPU and CUDA generators; cuDNN stays enabled but its
# benchmark mode is switched off.
torch.manual_seed(1234)
torch.cuda.manual_seed(1234)
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = False

# read config (context manager so the file handle is closed right away)
with open('config.json') as config_file:
    config = json.load(config_file)
data_config = config["data_config"]
model_config = config["model_config"]
model_config['n_speakers'] = 123 # there are 123 speakers
data_config['training_files'] = 'filelists/libritts_train_clean_100_audiopath_text_sid_shorterthan10s_atleast5min_train_filelist.txt'
# The training file list is reused for validation.
data_config['validation_files'] = data_config['training_files']

# load waveglow
# SECURITY NOTE: torch.load unpickles the checkpoint, which can execute
# arbitrary code -- only load checkpoints obtained from a trusted source.
# The model is moved to the GPU, put in eval mode and cast to fp16 in one chain
# (each of these nn.Module methods returns the module itself).
waveglow = torch.load(waveglow_pretrained_model)['model'].cuda().eval().half()
# The convinv layers are cast back to fp32 after the global half() cast.
for k in waveglow.convinv:
    k.float()

# load flowtron
model = Flowtron(**model_config).cuda()
# Weights are deserialized on the CPU and then copied into the GPU-resident model.
state_dict = torch.load(flowtron_pretrained_model, map_location='cpu')['state_dict']
model.load_state_dict(state_dict)
_ = model.eval()

# Data takes the file list positionally, so the two file-list keys are
# filtered out of the keyword arguments.
ignore_keys = ['training_files', 'validation_files']
trainset = Data(data_config['training_files'], **{k: v for k, v in data_config.items() if k not in ignore_keys})



Number of speakers : 123
Number of speakers : 123


In [None]:
from scipy.io.wavfile import write
import timeit

# NOTE: load_mel is defined in a later cell of this notebook (the one that
# builds the TacotronSTFT); that cell has to be executed before this one.
mels2 = load_mel('/content/LJ001-0153real.wav')
mels2 = mels2.cuda()

# CUDA kernels are launched asynchronously, so synchronize before reading the
# clock on both sides; otherwise the timer can stop before the GPU has finished.
# no_grad() avoids building an autograd graph for pure inference.
torch.cuda.synchronize()
start = timeit.default_timer()
with torch.no_grad():
    audio = waveglow.infer(mels2.half(), sigma=0.8).float()
torch.cuda.synchronize()
stop = timeit.default_timer()
print('Time: ', stop - start)

audio = audio.cpu().numpy()[0]
# normalize audio for now (skipped for an all-zero signal to avoid 0/0)
peak = np.abs(audio).max()
if peak > 0:
    audio = audio / peak
# Optional: uncomment to save the generated waveform to disk.
#write("LJ001-0153glow.wav", 22050, audio)
Audio(audio, rate=22050)

Time:  0.3266595850000158


In [None]:
# Total number of elements across all parameter tensors of the loaded WaveGlow model.
param_counts = [weights.numel() for weights in waveglow.parameters()]
print(sum(param_counts))

87879272


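Set up the mel-spectrogram extraction (`TacotronSTFT` and the `load_mel` helper). The WaveGlow inference cell above calls `load_mel`, so run this cell first: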

In [None]:
import torch
# NOTE(review): Denoiser, STFT and ipd are imported here but are not referenced
# in the cells visible in this part of the notebook.
# These modules are presumably resolved through the 'tacotron2' and
# 'tacotron2/waveglow' entries added to sys.path earlier -- TODO confirm.
from denoiser import Denoiser
from layers import TacotronSTFT, STFT
from hparams import create_hparams
import IPython.display as ipd
# Hyper-parameter object; its fields configure the STFT below and the
# waveform scaling in load_mel.
hparams = create_hparams()

# Load mels
from utils import load_wav_to_torch
# Module-level STFT front-end shared by load_mel; every argument comes from hparams.
stft = TacotronSTFT(hparams.filter_length, hparams.hop_length, hparams.win_length,
                    hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
                    hparams.mel_fmax)
def load_mel(path):
    """Load an audio file and return its mel spectrogram.

    Args:
        path: Path of the audio file, passed to ``load_wav_to_torch``.

    Returns:
        The result of ``stft.mel_spectrogram`` for the waveform divided by
        ``hparams.max_wav_value``, with a leading axis added via ``unsqueeze(0)``.

    Raises:
        ValueError: If the file's sampling rate differs from
            ``stft.sampling_rate``.
    """
    audio, sampling_rate = load_wav_to_torch(path)
    if sampling_rate != stft.sampling_rate:
        # The message has three placeholders, so it needs three arguments;
        # with only two, str.format raised IndexError and masked this error.
        raise ValueError("{} {} SR doesn't match target {} SR".format(
            path, sampling_rate, stft.sampling_rate))
    audio_norm = audio / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    # No gradients are needed for feature extraction.
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = stft.mel_spectrogram(audio_norm)
    return melspec

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

