In [1]:
!pip install wget

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Running setup.py bdist_wheel for wget ... [?25ldone
[?25h  Stored in directory: /Users/ben/Library/Caches/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


# Deep Convolutional Networks with Guided Attention

In [27]:
from os.path import exists

if not exists('pytorch-dc-tts'):
    !git clone --quiet https://github.com/tugstugi/pytorch-dc-tts

!pip install --ignore-installed torch>=0.4 librosa

[31mxarray 0.14.1 has requirement pandas>=0.24, but you'll have pandas 0.23.4 which is incompatible.[0m
[31mtorchvision 0.6.1 has requirement torch==1.5.1, but you'll have torch 1.6.0 which is incompatible.[0m
[31mbotocore 1.12.70 has requirement urllib3<1.25,>=1.20, but you'll have urllib3 1.25.10 which is incompatible.[0m


In [4]:
import wget

# Pretrained LJSpeech checkpoints as (source URL, local filename) pairs:
# the text2mel weights first, then the SSRN weights.
CHECKPOINT_DOWNLOADS = (
    ('https://www.dropbox.com/s/4t13ugxzzgnocbj/step-300K.pth', 'ljspeech-text2mel.pth'),
    ('https://www.dropbox.com/s/gw4aqrgcvccmg0g/step-100K.pth', 'ljspeech-ssrn.pth'),
)

# Fetch only what is not already on disk.
for checkpoint_url, checkpoint_path in CHECKPOINT_DOWNLOADS:
    if exists(checkpoint_path):
        continue
    wget.download(checkpoint_url, checkpoint_path)

In [2]:
import sys
# Put the cloned 'pytorch-dc-tts' checkout on the import path. This must run before
# the hparams / audio / models / datasets imports below, which presumably resolve to
# modules inside that checkout -- confirm against the repository layout.
sys.path.append('pytorch-dc-tts')
import numpy as np
import torch
import IPython
from IPython.display import Audio
from hparams import HParams as hp
from audio import save_to_wav
from models import Text2Mel, SSRN
from datasets.lj_speech import vocab, idx2char, get_test_data

In [None]:
# Inference only: switch autograd off globally for the rest of the notebook.
torch.set_grad_enabled(False)

# NOTE(security): torch.load unpickles the file, and these checkpoints hold whole
# pickled objects (hence the .state_dict() call on the result). Unpickling can run
# arbitrary code, so only load checkpoints obtained from a trusted source.
#
# map_location='cpu' lets a checkpoint that was saved from a GPU load on a machine
# without CUDA; the models below are never moved off the CPU, so the resulting
# weights are the same either way.
text2mel = Text2Mel(vocab)
text2mel.load_state_dict(
    torch.load('ljspeech-text2mel.pth', map_location='cpu').state_dict()
)
text2mel = text2mel.eval()  # evaluation mode

ssrn = SSRN()
ssrn.load_state_dict(
    torch.load('ljspeech-ssrn.pth', map_location='cpu').state_dict()
)
ssrn = ssrn.eval()  # evaluation mode

In [None]:
# Input sentences for the synthesis loop; each one is rendered to its own
# numbered wav file (1.wav, 2.wav, ...), in this order.
SENTENCES = [
    "The horse raced past the barn fell.",
    "The old man the boat.",
    "The florist sent the flowers was pleased.",
    "The cotton clothing is made of grows in Mississippi.",
    "The sour drink from the ocean.",
    "Have the students who failed the exam take the supplementary.",
    "We painted the wall with cracks.",
    "The girl told the story cried.",
    "The raft floated down the river sank.",
    "Fat people eat accumulates.",
]

In [None]:
# Synthesize the sentences one at a time because there is a batch processing bug!

# Vocabulary index of the 'E' (end-of-sentence) marker; constant, so look it up once.
eos_index = vocab.index('E')

for i, sentence in enumerate(SENTENCES):
    # Drop every character whose lowercase form is not in the vocabulary.
    normalized_sentence = "".join(c for c in sentence if c.lower() in vocab)
    print(normalized_sentence)

    sentences = [normalized_sentence]
    max_N = len(normalized_sentence)
    L = torch.from_numpy(get_test_data(sentences, max_N))
    # A single all-zero frame: the initial decoder input, and re-prepended to the
    # generated frames after every step.
    zeros = torch.from_numpy(np.zeros((1, hp.n_mels, 1), np.float32))
    Y = zeros

    # Generate step by step, for at most hp.max_T steps.
    for t in range(hp.max_T):
        _, Y_t, A = text2mel(L, Y, monotonic_attention=True)
        Y = torch.cat((zeros, Y_t), -1)
        # Arg-max over the last column of A for the single batch item --
        # presumably the text position attended to at the newest step; confirm
        # against Text2Mel's return contract.
        _, attention = torch.max(A[0, :, -1], 0)
        attention = attention.item()
        if L[0, attention] == eos_index:  # EOS
            break

    _, Z = ssrn(Y)

    Z = Z.cpu().detach().numpy()
    # Files are numbered from 1, in the order of SENTENCES.
    wav_path = '%d.wav' % (i + 1)
    save_to_wav(Z[0, :, :].T, wav_path)
    IPython.display.display(Audio(wav_path, rate=hp.sr))

# WaveGAN

In [2]:
import wget
from os.path import exists

# Pre-trained WaveGAN files as (remote URL, local filename) pairs. The two
# checkpoint files are saved under the 'model.ckpt' prefix and the graph
# definition as 'infer.meta' -- the names the restore cell loads.
WAVEGAN_FILES = [
    ('https://s3.amazonaws.com/wavegan-v1/models/timit.ckpt.index',
     'model.ckpt.index'),
    ('https://s3.amazonaws.com/wavegan-v1/models/timit.ckpt.data-00000-of-00001',
     'model.ckpt.data-00000-of-00001'),
    ('https://s3.amazonaws.com/wavegan-v1/models/timit_infer.meta',
     'infer.meta'),
]

# Skip files that are already present, matching the guard used for the text2mel /
# SSRN checkpoints. Without it every re-run downloads everything again, and
# wget.download does not overwrite an existing target: it stores the new copy
# under a numbered name (e.g. 'infer (1).meta') instead.
for url, filename in WAVEGAN_FILES:
    if not exists(filename):
        wget.download(url, filename)

'infer.meta'

In [4]:
import tensorflow as tf

# Rebuild the generator graph from 'infer.meta' and restore its weights into a new
# session. Statement order matters: the default graph is reset first, the meta graph
# is imported into it, and only then is the session created and the checkpoint
# restored.
# NOTE(review): reset_default_graph / get_default_graph / InteractiveSession are
# TF1-style top-level names; presumably they are only reachable via tf.compat.v1
# under TensorFlow 2 -- confirm which version this notebook targets.
tf.reset_default_graph()
# import_meta_graph returns the saver that is used for the restore below.
saver = tf.train.import_meta_graph('infer.meta')
# Keep a handle on the graph; the generation cell looks tensors up by name on it.
graph = tf.get_default_graph()
sess = tf.InteractiveSession()
# 'model.ckpt' is the checkpoint prefix shared by the downloaded
# 'model.ckpt.index' and 'model.ckpt.data-00000-of-00001' files.
saver.restore(sess, 'model.ckpt')

INFO:tensorflow:Restoring parameters from model.ckpt


In [5]:
import numpy as np
import PIL.Image
from IPython.display import display, Audio
import time as time

# Generation settings, named once so the batch size cannot drift between the
# sampling step and the playback loop.
N_SAMPLES = 2        # number of clips generated in one batch
LATENT_DIM = 100     # length of each latent vector fed to the 'z:0' tensor
SAMPLE_RATE = 16000  # rate passed to the Audio player

# Sample latent vectors uniformly from [-1, 1).
# NOTE(review): no random seed is set in this cell, so every run yields different
# clips -- seed numpy beforehand if reproducible output is wanted.
_z = (np.random.rand(N_SAMPLES, LATENT_DIM) * 2.) - 1.

# Generate: look up the input placeholder and the two output tensors by name.
z = graph.get_tensor_by_name('z:0')
G_z = graph.get_tensor_by_name('G_z:0')[:, :, 0]  # keep only index 0 of the last axis
G_z_spec = graph.get_tensor_by_name('G_z_spec:0')

start = time.time()
_G_z, _G_z_spec = sess.run([G_z, G_z_spec], {z: _z})
print('Finished! (Took {} seconds)'.format(time.time() - start))

# One audio player per generated clip.
for clip in _G_z:
    display(Audio(clip, rate=SAMPLE_RATE))

Finished! (Took 0.21904587745666504 seconds)
