In [1]:
# Show the GPUs available on this machine together with their current load.
!nvidia-smi
# Restrict CUDA to the device with index 1 for this kernel. This is set here,
# before torch is imported in the next cell.
# NOTE: keep comments off the %env line itself, otherwise they become part of the value.
%env CUDA_VISIBLE_DEVICES=1
# Echo the variable back to confirm the value that is now in effect.
%env CUDA_VISIBLE_DEVICES

Mon Apr 26 11:36:20 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 465.19.01    CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA Tesla V1...  On   | 00000000:37:00.0 Off |                    0 |
| N/A   73C    P0   216W / 250W |   4131MiB / 16160MiB |     92%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA Tesla V1...  On   | 00000000:AF:00.0 Off |                    0 |
| N/A   32C    P0    42W / 250W |      0MiB / 16160MiB |      0%      Default |
|       

'1'

In [2]:
import random
import os
import numpy as np
import torch
# Single source of truth for the seed used by every random number generator below.
SEED = 12

# Python's built-in RNG.
random.seed(SEED)
# NOTE: PYTHONHASHSEED is read when an interpreter starts, so this assignment does not
# change string hashing in the already running kernel; it is only inherited by
# processes launched from this notebook.
os.environ['PYTHONHASHSEED'] = str(SEED)
# NumPy's legacy global RNG.
np.random.seed(SEED)
# PyTorch default (CPU) generator.
torch.manual_seed(SEED)
# Seed every visible CUDA device, not only the current one.
torch.cuda.manual_seed_all(SEED)

In [3]:
############################################## Tacotron 2 main script ##################################################

# Import libraries
from scipy.io.wavfile import write
from tqdm import tqdm

def gen_tacotron2(transcript, trans_id, save_path, tacotron2, waveglow, rate=22050):
    """ Function: Generate speech from 1 input transcript using Tacotron 2 + WaveGlow.
        Input:    transcript - text to synthesize
                  trans_id   - transcript ID, used as the name of the output file
                  save_path  - existing folder in which the audio file is written
                  tacotron2  - model providing text_to_sequence(text, cleaners) and infer(sequence)
                  waveglow   - vocoder providing infer(mel)
                  rate       - sample rate written to the .wav header (default 22050, the value
                               the original code used; presumably the rate of the pretrained
                               models - TODO confirm)
        Output:   Returns nothing but saves the audio as '<save_path>/<trans_id>.wav' """

    # Run on whatever device the Tacotron 2 weights live on. Fall back to 'cuda'
    # (the previous hard-coded behaviour) when the model exposes no parameters.
    first_param = next(tacotron2.parameters(), None)
    device = first_param.device if first_param is not None else 'cuda'

    # Preprocessing text: encode the transcript and add a leading batch dimension of 1
    encoded = tacotron2.text_to_sequence(transcript, ['english_cleaners'])
    sequence = torch.tensor(encoded, device=device).unsqueeze(0)

    # Generate speech using models (inference only, so no gradients are tracked)
    with torch.no_grad():
        _, mel, _, _ = tacotron2.infer(sequence)
        audio = waveglow.infer(mel)

    # Take the first (only) item of the batch and move it to host memory for scipy
    audio_numpy = audio[0].detach().cpu().numpy()

    write(os.path.join(save_path, trans_id + '.wav'), rate, audio_numpy)

In [4]:
""" Program description: Generate speech using Tacotron 2. Tacotron 2 is pretrained on the entire LJSpeech dataset
                         (see more at: https://github.com/NVIDIA/tacotron2). The speech is generated from a training
                         split of the LibriSpeech dataset. Next, we train an ASR model on the generated dataset (see
                         more at: https://github.com/borgholt/asr). The performance of the ASR model is evaluated on 
                         a test split of the LibriSpeech dataset. 

                         The result of this program is (1) a synthesized speech dataset, (2) a parameter save of the
                         ASR model and (3) the best ASR WER, printed to the stdout stream. """

""" Part 1: Define dataset file-IDs """
# NOTE(review): train_IDs is described as the LibriSpeech *train* split but currently
# points at 'test-clean' - confirm this is intended.
train_IDs = 'test-clean'           # LibriSpeech split whose transcripts are synthesized
test_IDs = "dev-clean"           # LibriSpeech test  (for validating ASR)
save_folder = 'tacotron2'          # sub-folder that receives the generated .wav files

# Create the output folder if it is missing. exist_ok=True makes re-running the cell
# safe without a separate (racy) existence check, and missing parents are created too.
os.makedirs("/work3/s194278/fagprojekt/LibriSpeech/"+save_folder, exist_ok=True)

""" Part 2: Generate speech using Tacotron 2 """
# Get all transcript IDs and call the generator routine for each transcript one by one
def split_line(line):
    """ Function: Split one transcript line into its ID and its text.
        Input:    A line of the form '<id> <text>'
        Output:   Tuple (id, text): everything before the first space, and the remainder
                  with trailing whitespace removed """
    # partition cuts at the first space only, so spaces inside the text are preserved
    utt_id, _, remainder = line.partition(" ")
    return utt_id, remainder.rstrip()

# Read the transcript list for the selected split: one '<id> <text>' entry per line.
with open('/work3/s194278/fagprojekt/LibriSpeech/'+train_IDs+'.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Map transcript ID -> transcript text. Blank lines are skipped so that they do not
# create an entry with an empty transcript that would later be sent to synthesis.
trans_dict = dict(split_line(line) for line in lines if line.strip())

# Both pretrained networks are fetched from the same torch.hub repository
hub_repo = 'nvidia/DeepLearningExamples:torchhub'

# Tacotron 2: load pretrained weights, move to the GPU and switch to evaluation mode
tacotron2 = torch.hub.load(hub_repo, 'nvidia_tacotron2').to('cuda').eval()

# WaveGlow: load pretrained weights, strip weight normalisation, then move to the GPU
# and switch to evaluation mode
waveglow = torch.hub.load(hub_repo, 'nvidia_waveglow')
waveglow = waveglow.remove_weightnorm(waveglow).to('cuda').eval()


# Folder that receives one .wav file per transcript ID
save_path = "/work3/s194278/fagprojekt/LibriSpeech/" + save_folder

# Walk over every transcript; the progress bar counts all of them, including skipped ones
for trans_id, transcript in tqdm(trans_dict.items()):
    # Only transcripts of at most 128 characters are synthesized; longer ones are left out
    if len(transcript) <= 128:
        gen_tacotron2(transcript, trans_id, save_path, tacotron2, waveglow)

Using cache found in /zhome/22/c/137477/.cache/torch/hub/nvidia_DeepLearningExamples_torchhub
Using cache found in /zhome/22/c/137477/.cache/torch/hub/nvidia_DeepLearningExamples_torchhub
 21%|██        | 540/2620 [08:08<30:48,  1.13it/s]  



100%|██████████| 2620/2620 [43:11<00:00,  1.01it/s] 
