In [1]:
# Mount Google Drive at /content/drive so that files stored under MyDrive
# can be accessed through the normal file system.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd /content/drive/MyDrive/GST-Tacotron-master

/content/drive/MyDrive/GST-Tacotron-master


In [3]:
pip install pypinyin

Collecting pypinyin
  Downloading pypinyin-0.46.0-py2.py3-none-any.whl (1.3 MB)
[?25l[K     |▎                               | 10 kB 21.1 MB/s eta 0:00:01[K     |▌                               | 20 kB 13.3 MB/s eta 0:00:01[K     |▊                               | 30 kB 9.9 MB/s eta 0:00:01[K     |█                               | 40 kB 9.0 MB/s eta 0:00:01[K     |█▎                              | 51 kB 4.4 MB/s eta 0:00:01[K     |█▌                              | 61 kB 5.2 MB/s eta 0:00:01[K     |█▊                              | 71 kB 5.5 MB/s eta 0:00:01[K     |██                              | 81 kB 5.5 MB/s eta 0:00:01[K     |██▎                             | 92 kB 6.1 MB/s eta 0:00:01[K     |██▌                             | 102 kB 5.2 MB/s eta 0:00:01[K     |██▉                             | 112 kB 5.2 MB/s eta 0:00:01[K     |███                             | 122 kB 5.2 MB/s eta 0:00:01[K     |███▎                            | 133 kB 5.2 MB/s eta 0:00:

In [4]:
!pip3 install -r requirements.txt

Collecting audioread==2.1.5
  Downloading audioread-2.1.5.tar.gz (15 kB)
Collecting bypy==1.6.4
  Downloading bypy-1.6.4-py2.py3-none-any.whl (239 kB)
[K     |████████████████████████████████| 239 kB 5.2 MB/s 
[?25hCollecting certifi==2018.4.16
  Downloading certifi-2018.4.16-py2.py3-none-any.whl (150 kB)
[K     |████████████████████████████████| 150 kB 55.0 MB/s 
Collecting cycler==0.10.0
  Downloading cycler-0.10.0-py2.py3-none-any.whl (6.5 kB)
Collecting cymem==1.31.2
  Downloading cymem-1.31.2.tar.gz (33 kB)
Collecting cytoolz==0.8.2
  Downloading cytoolz-0.8.2.tar.gz (386 kB)
[K     |████████████████████████████████| 386 kB 57.2 MB/s 
[?25hCollecting decorator==4.3.0
  Downloading decorator-4.3.0-py2.py3-none-any.whl (9.2 kB)
Collecting dill==0.2.7.1
  Downloading dill-0.2.7.1.tar.gz (64 kB)
[K     |████████████████████████████████| 64 kB 2.9 MB/s 
[?25hCollecting idna==2.6
  Downloading idna-2.6-py2.py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 

In [12]:
from utils import *
from Data import get_eval_data
from Hyperparameters import Hyperparameters as hp
import torch
from scipy.io.wavfile import write
from Network import *

from pypinyin import lazy_pinyin, Style

# Device used for the model and for every input tensor in this notebook;
# inference is run on the CPU.
device = torch.device('cpu')

def spectrogram2wav(mag):
    """Turn a normalized magnitude spectrogram into a waveform.

    The input is transposed, mapped from the clipped [0, 1] range back to
    decibels with ``hp.max_db`` and ``hp.ref_db``, converted to linear
    amplitude, inverted with :func:`griffin_lim`, passed through the inverse
    of the pre-emphasis filter (``hp.preemphasis``) and finally trimmed with
    ``librosa.effects.trim``.

    Args:
        mag: Normalized magnitude spectrogram as a NumPy array
            (presumably (time, frequency) before the transpose - TODO confirm).

    Returns:
        The reconstructed waveform as a ``float32`` NumPy array.
    """
    # Undo the normalization: clipped [0, 1] values -> decibels.
    db = (np.clip(mag.T, 0, 1) * hp.max_db) - hp.max_db + hp.ref_db

    # Decibels -> linear amplitude, i.e. 10 ** (dB / 20).
    amplitude = np.power(10.0, db * 0.05)

    # Reconstruct a signal, then invert the pre-emphasis filter.
    waveform = griffin_lim(amplitude)
    waveform = signal.lfilter([1], [1, -hp.preemphasis], waveform)

    # Only the trimmed signal is needed; the second return value is dropped.
    trimmed, _ = librosa.effects.trim(waveform)
    return trimmed.astype(np.float32)


def griffin_lim(spectrogram):
    """Estimate a time-domain signal from a magnitude spectrogram.

    Runs ``hp.n_iter`` rounds of the Griffin-Lim iteration: the current
    complex estimate is inverted with ``invert_spectrogram``, re-analysed
    with ``librosa.stft``, and the resulting phase is combined with the
    original magnitudes.

    Args:
        spectrogram: Linear-amplitude magnitude spectrogram (NumPy array).
            It is deep-copied, so the caller's array is not modified.

    Returns:
        The real part of the final inverted signal.
    """
    X_best = copy.deepcopy(spectrogram)
    for _ in range(hp.n_iter):
        X_t = invert_spectrogram(X_best)
        # n_fft / hop_length are passed by keyword: recent librosa releases
        # accept only the signal positionally and raise TypeError otherwise.
        est = librosa.stft(
            X_t,
            n_fft=2 * hp.n_fft,
            hop_length=hp.hop_length,
            win_length=hp.win_length,
        )
        # Unit-magnitude phase; the floor avoids division by zero.
        phase = est / np.maximum(1e-8, np.abs(est))
        X_best = spectrogram * phase
    X_t = invert_spectrogram(X_best)
    return np.real(X_t)


def synthesis(model, eval_text):
    """Synthesize ``eval_text`` once per reference speaker.

    The text is converted with :func:`_pinyin`, then for every reference
    recording the model is run and its predicted magnitude spectrogram is
    turned into audio with :func:`spectrogram2wav`.

    Args:
        model: Network called as ``model(text, GO, ref_mels)`` and returning
            three values, the second being the magnitude spectrogram.
            It is switched to evaluation mode.
        eval_text: Text to synthesize.

    Returns:
        dict mapping speaker name (``'nannan'``, ``'xiaofeng'``,
        ``'donaldduck'``) to the generated waveform.
    """
    eval_text = _pinyin(eval_text)

    model.eval()

    # Speaker name -> reference recording that provides the style.
    ref_wavs = {
        'nannan': 'ref_wav/nannan.wav',
        'xiaofeng': 'ref_wav/xiaofeng.wav',
        'donaldduck': 'ref_wav/donaldduck.wav',
    }

    wavs = {}

    # Inference only: disabling autograd avoids building a graph that is
    # never used and reduces memory consumption.
    with torch.no_grad():
        for speaker, ref_wav in ref_wavs.items():
            text, GO, ref_mels = get_eval_data(eval_text, ref_wav)
            text = text.to(device)
            GO = GO.to(device)
            ref_mels = ref_mels.to(device)

            # Only the magnitude spectrogram is needed to produce audio.
            _, mag_hat, _ = model(text, GO, ref_mels)
            mag_hat = mag_hat.squeeze().detach().cpu().numpy()
            wavs[speaker] = spectrogram2wav(mag_hat)

    return wavs


def load_model(checkpoint_path):
    """Build a ``Tacotron`` network and load weights from a checkpoint.

    Args:
        checkpoint_path: Path of a file containing a state dict readable by
            ``torch.load``.

    Returns:
        The ``Tacotron`` instance, placed on ``device``, with the checkpoint
        weights loaded.
    """
    def _keep_storage(storage, location):
        # Return each storage unchanged instead of moving it to the device
        # recorded in the checkpoint.
        return storage

    net = Tacotron().to(device)
    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
    state_dict = torch.load(checkpoint_path, map_location=_keep_storage)
    net.load_state_dict(state_dict)
    return net


def _pinyin(s):
    """Convert text to a space-separated pinyin string.

    The text is split into tokens with ``lazy_pinyin`` (``Style.TONE2``).
    Tokens that are exactly a single space are dropped; in every other token
    only digits, lowercase ASCII letters and spaces are kept. The tokens are
    joined with single spaces and every run of consecutive spaces is
    collapsed to one.

    Args:
        s: Input text.

    Returns:
        The cleaned pinyin string. Leading or trailing spaces produced by
        tokens that became empty are not stripped.
    """
    allowed = '0123456789abcdefghijklmnopqrstuvwxyz '

    syllables = [
        ''.join(ch for ch in token if ch in allowed)
        for token in lazy_pinyin(s, style=Style.TONE2)
        if token != ' '
    ]
    joined = ' '.join(syllables)

    # Drop a space whenever the following character is also a space, so
    # only the last space of each run survives.
    last = len(joined) - 1
    return ''.join(
        ch for pos, ch in enumerate(joined)
        if not (ch == ' ' and pos < last and joined[pos + 1] == ' ')
    )



In [16]:
import os

# Example sentences; only the last one is synthesized (the earlier ones were
# previously assigned to `text` and immediately overwritten).
EXAMPLE_TEXTS = (
    '''毛主席是中国的红太阳''',
    "刘易斯汉密尔顿赢得摩纳哥大奖赛",
    "太阳从东方升起",
)
text = EXAMPLE_TEXTS[-1]

model = load_model('checkpoint/epoch100.pt')
wavs = synthesis(model, text)

# `write` does not create missing directories, so make sure the output
# folder exists before saving one file per speaker.
os.makedirs('samples', exist_ok=True)
for speaker, wav in wavs.items():
    write('samples/{}.wav'.format(speaker), hp.sr, wav)



In [None]:
# Scratch check (cell was never executed): length of a 32-character symbol
# string - presumably a character vocabulary; TODO confirm or delete.
len("PE abcdefghijklmnopqrstuvwxyz'.?")