# NeMo Thai TTS Tutorial

## Download Tsync2 Corpus

In [None]:
!wget https://github.com/korakot/corpus/releases/download/v1.0/AIFORTHAI-TSync2Corpus.zip

In [None]:
!unzip AIFORTHAI-TSync2Corpus.zip

## Create menifest for NeMo

NeMo menifest format <br>
```json
{
    "audio_filepath": "TSync2_clean/wav/tsync2_noon_48_1058_trim.wav",
    "text": "ปลด แล้ว โปสเตอร์ หวิว เก้า นางเอก", 
    "duration": 2.507755102040816
}
{
    "audio_filepath": "TSync2_clean/wav/tsync2_noon_50_2872_trim.wav",
    "text": "แต่ หลง ไข่ กรุ๊ป ขอ ต่อรอง ลง เหลือ ราคา สาม ร้อย ถึง สี่ ร้อย ล้าน เหรียญ", 
    "duration": 5.526349206349207
}
```

In [3]:
import os
import glob

import librosa
import numpy as np

from pythainlp.tokenize import word_tokenize
from pythainlp.util import normalize, isthaichar

from sklearn.model_selection import train_test_split
from scipy.io import wavfile
import tqdm
import multiprocessing as mp


def clean_text(text):
    norm_text = normalize(text)
    norm_text = ''.join(char for char in norm_text if isthaichar(char))
    words = word_tokenize(norm_text, engine='newmm')
    words = [words[i-1] if words[i] == 'ๆ' else words[i] for i in range(len(words))]
    return ' '.join(words)

def __process(inp):
    try:
        filepath, out_dir = inp
        transcript_path = filepath.replace('/wav/', '/wrd_ph/').replace('.wav', '.txt')
        text = open(transcript_path).read().split('\n')[0].split('|')
        text = ' '.join(text)
        text = clean_text(text)
        

        filename = filepath.split('/')[-1]
        out_file_path = os.path.join(out_dir, "wav", filename.replace('.wav', '_trim.wav'))

        y, sr = librosa.load(filepath)
        y_trim, _ = librosa.effects.trim(y, top_db=30)
        duration = librosa.get_duration(y_trim, sr)

        wavfile.write(out_file_path, sr, (y_trim*32768).astype('int16'))
        return '{"audio_filepath": "'+out_file_path+'", \
                           "text": "'+text.strip()+'", "duration": '+str(duration)+'}\n'
    except Exception as e:
        pass

def create_menifest(inp_dir, out_dir, num_workers = 30):
    file_list = glob.glob(inp_dir+'/wav/*.wav')
    
    if not os.path.exists(out_dir):
        os.makedirs(os.path.join(out_dir, 'wav'))
        
    entries = []
    with mp.Pool(num_workers) as p:
        results = p.imap(__process, [(filepath, out_dir) for filepath in file_list])
        for result in tqdm.tqdm(results, total=len(file_list)):
            entries.append(result)
           
    train_files, test_files = train_test_split(entries, test_size=0.05, random_state=42)

    print("Number of sentence in train set : ", len(train_files))
    print("Number of sentence in test set : ", len(test_files))
    
    with open(os.path.join(out_dir, 'tsync2_train.json'), 'w') as tr:
        for train_file in train_files:
            if train_file!='' and train_file is not None:
                tr.write(train_file)
    
    with open(os.path.join(out_dir, 'tsync2_test.json'), 'w') as tt:
        for test_file in test_files:
            if test_file!='' and test_file is not None:
                tt.write(test_file)

create_menifest("TSync2", "TSync2_clean")

100%|██████████| 2710/2710 [00:43<00:00, 61.81it/s] 

Number of sentence in train set :  2574
Number of sentence in test set :  136





## Train

In [11]:
import pytorch_lightning as pl

from nemo.collections.common.callbacks import LogEpochTimeCallback
from nemo.collections.tts.models import Tacotron2Model, WaveGlowModel
from omegaconf import OmegaConf, DictConfig

from nemo.utils.exp_manager import exp_manager

In [4]:
cfg = OmegaConf.load('tacotron2_th.yaml')

trainer = pl.Trainer(**cfg.trainer)

exp_manager(trainer, cfg.get("exp_manager", None))

model = Tacotron2Model(cfg=cfg.model, trainer=trainer)

# Let's add a few more callbacks
lr_logger = pl.callbacks.LearningRateMonitor()
epoch_time_logger = LogEpochTimeCallback()
trainer.callbacks.extend([lr_logger, epoch_time_logger])
# Call lightning trainer's fit() to train the model
trainer.fit(model)

[NeMo W 2021-06-21 03:10:53 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################

GPU available: True, used: True
I0621 03:10:54.096405 140288176682816 distributed.py:53] GPU available: True, used: True
TPU available: None, using: 0 TPU cores
I0621 03:10:54.097816 140288176682816 distributed.py:53] TPU available: None, using: 0 TPU cores
Multi-processing is handled by Slurm.
I0621 03:10:54.098776 140288176682816 distributed.py:53] Multi-processing is handled by Slurm.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
I0621 03:10:54.099539 140288

[NeMo I 2021-06-21 03:10:54 exp_manager:208] Experiments will be logged at /work/tsync2_tacotron2_logs/Tacotron2
[NeMo I 2021-06-21 03:10:54 exp_manager:548] TensorboardLogger has been set up
[NeMo I 2021-06-21 03:10:54 collections:173] Dataset loaded with 2573 files totalling 3.80 hours
[NeMo I 2021-06-21 03:10:54 collections:174] 0 files were filtered totalling 0.00 hours
[NeMo I 2021-06-21 03:10:54 collections:173] Dataset loaded with 136 files totalling 0.20 hours
[NeMo I 2021-06-21 03:10:54 collections:174] 0 files were filtered totalling 0.00 hours
[NeMo I 2021-06-21 03:10:54 features:240] PADDING: 16
[NeMo I 2021-06-21 03:10:54 features:256] STFT using torch


initializing ddp: GLOBAL_RANK: 0, MEMBER: 1/1
I0621 03:10:55.309100 140288176682816 ddp_plugin.py:89] initializing ddp: GLOBAL_RANK: 0, MEMBER: 1/1


[NeMo I 2021-06-21 03:10:57 modelPT:685] Optimizer config = Adam (
    Parameter Group 0
        amsgrad: False
        betas: (0.9, 0.999)
        eps: 1e-08
        lr: 0.001
        weight_decay: 1e-06
    )
[NeMo I 2021-06-21 03:10:57 lr_scheduler:621] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x7f95a006de10>" 
    will be used during training (effective maximum steps = 1620) - 
    Parameters : 
    (min_lr: 1.0e-06
    max_steps: 1620
    )


Set SLURM handle signals.
I0621 03:10:57.394197 140288176682816 slurm_connector.py:81] Set SLURM handle signals.

  | Name                       | Type               | Params
------------------------------------------------------------------
0 | audio_to_melspec_precessor | FilterbankFeatures | 0     
1 | text_embedding             | Embedding          | 35.8 K
2 | encoder                    | Encoder            | 5.5 M 
3 | decoder                    | Decoder            | 18.3 M
4 | postnet                    | Postnet            | 4.3 M 
5 | loss                       | Tacotron2Loss      | 0     
------------------------------------------------------------------
28.2 M    Trainable params
0         Non-trainable params
28.2 M    Total params
I0621 03:10:57.397764 140288176682816 lightning.py:1346] 
  | Name                       | Type               | Params
------------------------------------------------------------------
0 | audio_to_melspec_precessor | FilterbankFeatures | 0   

Validation sanity check: 0it [00:00, ?it/s]

[NeMo W 2021-06-21 03:10:58 patch_utils:50] torch.stft() signature has been updated for PyTorch 1.7+
    Please update PyTorch to remain compatible with later versions of NeMo.
      normalized, onesided, return_complex)
    


Training: 0it [00:00, ?it/s]

    Please use self.log(...) inside the lightningModule instead.
    
    # log on a step or aggregate epoch metric to the logger and/or progress bar
    # (inside LightningModule)
    self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
    
    
Saving latest checkpoint...
I0621 03:20:25.034493 140288176682816 distributed.py:53] Saving latest checkpoint...


MisconfigurationException: ModelCheckpoint(monitor='val_loss') not found in the returned metrics: ['loss', 'training_loss']. HINT: Did you call self.log('val_loss', tensor) in the LightningModule?

## Inference

In [28]:
model = Tacotron2Model.restore_from("Tacotron2.nemo")
model.to('cuda:0')

# Load vocoder
vocoder = WaveGlowModel.from_pretrained(model_name="tts_waveglow_268m")
vocoder.to('cuda:0')

token_input = model.parse('ภาษาไทย ง่าย นิด เดียว') # map character to index
spec_gen = model.generate_spectrogram(tokens=token_input.to('cuda:0')) # generate spectrogram
audio = vocoder.convert_spectrogram_to_audio(spec=spec_gen) # convert spectrogram to waveform

[NeMo W 2021-06-21 19:49:50 modelPT:133] Please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.asr.data.audio_to_text.AudioToCharDataset
      manifest_filepath: /data/TSync2/tsync2-train-cut.json
      max_duration: null
      min_duration: 0.1
      trim: false
      int_values: false
      normalize: false
      sample_rate: 22050
      parser: base
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 32
      num_workers: 30
    
[NeMo W 2021-06-21 19:49:50 modelPT:140] Please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    dataset:
      _target_: nemo.collections.asr.data.audio_to_text.AudioToCharDataset
      manifest_filepath: /data/TSync2/tsync2-validation.json
 

[NeMo I 2021-06-21 19:49:50 features:240] PADDING: 16
[NeMo I 2021-06-21 19:49:50 features:256] STFT using torch
[NeMo I 2021-06-21 19:49:51 modelPT:376] Model Tacotron2Model was successfully restored from Tacotron2.nemo.
[NeMo I 2021-06-21 19:49:51 cloud:56] Found existing object /root/.cache/torch/NeMo/NeMo_1.0.0rc1/tts_waveglow_268m/4e349bb935476653130e3d61d859ca82/tts_waveglow_268m.nemo.
[NeMo I 2021-06-21 19:49:51 cloud:62] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.0.0rc1/tts_waveglow_268m/4e349bb935476653130e3d61d859ca82/tts_waveglow_268m.nemo
[NeMo I 2021-06-21 19:49:51 common:654] Instantiating model from pre-trained checkpoint
[NeMo I 2021-06-21 19:49:58 features:240] PADDING: 16
[NeMo I 2021-06-21 19:49:58 features:249] STFT using conv
[NeMo I 2021-06-21 19:50:01 modelPT:376] Model WaveGlowModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.0.0rc1/tts_waveglow_268m/4e349bb935476653130e3d61d859ca82/tts_waveglow_268m.nemo.


In [29]:
import IPython.display as ipd

ipd.Audio(audio.cpu().numpy(), rate=22050)