# NeMo Thai TTS Tutorial

## Download TSync2 Corpus

In [None]:
# -nc (no-clobber): skip the download when the archive is already present,
# so re-running the notebook does not fetch the corpus again.
!wget -nc https://github.com/korakot/corpus/releases/download/v1.0/AIFORTHAI-TSync2Corpus.zip

In [None]:
# -n: never overwrite files that were already extracted (avoids unzip's
#     interactive "replace?" prompt when the cell is re-run).
# -q: quiet, so the cell output is not flooded with the file listing.
!unzip -n -q AIFORTHAI-TSync2Corpus.zip

## Create manifest for NeMo

NeMo manifest format: one JSON object per line in the actual file (expanded below for readability) <br>
```json
{
    "audio_filepath": "TSync2_clean/wav/tsync2_noon_48_1058_trim.wav",
    "text": "ปลด แล้ว โปสเตอร์ หวิว เก้า นางเอก", 
    "duration": 2.507755102040816
}
{
    "audio_filepath": "TSync2_clean/wav/tsync2_noon_50_2872_trim.wav",
    "text": "แต่ หลง ไข่ กรุ๊ป ขอ ต่อรอง ลง เหลือ ราคา สาม ร้อย ถึง สี่ ร้อย ล้าน เหรียญ", 
    "duration": 5.526349206349207
}
```

In [None]:
import os
import glob

import librosa
import numpy as np

from pythainlp.tokenize import word_tokenize
from pythainlp.util import normalize, isthaichar

from sklearn.model_selection import train_test_split
from scipy.io import wavfile
import tqdm
import multiprocessing as mp


def clean_text(text):
    """Normalize a Thai transcript and return it as space-separated words.

    The text is passed through ``normalize``, every character for which
    ``isthaichar`` is false is dropped (presumably spaces, digits and Latin
    letters among them), the remainder is tokenized with the ``newmm``
    engine, and each repetition mark ``ๆ`` is replaced by a copy of the word
    that precedes it.

    Args:
        text: Raw transcript string.

    Returns:
        The cleaned words joined by single spaces; an empty string when no
        word survives the cleaning.
    """
    norm_text = normalize(text)
    norm_text = ''.join(char for char in norm_text if isthaichar(char))
    words = word_tokenize(norm_text, engine='newmm')

    expanded = []
    for word in words:
        if word != 'ๆ':
            expanded.append(word)
        elif expanded:
            # Repeat the last real word. Looking at the expanded list (not
            # words[i-1]) means consecutive marks keep repeating that word
            # instead of leaving a literal 'ๆ' in the output.
            expanded.append(expanded[-1])
        # A mark with nothing before it has no word to repeat and is dropped;
        # indexing words[-1] here would wrongly pick up the *last* word.
    return ' '.join(expanded)

def __process(inp):
    """Trim one recording and build its NeMo manifest line.

    The transcript is read from the path obtained by replacing ``/wav/`` with
    ``/wrd_ph/`` and ``.wav`` with ``.txt`` in the audio path. Only its first
    line is used: it is split on ``|``, re-joined with spaces and cleaned with
    ``clean_text``. The audio is loaded with librosa, leading/trailing
    silence below 30 dB is trimmed, and the result is written as 16-bit PCM
    to ``<out_dir>/wav/<name>_trim.wav``.

    Args:
        inp: ``(filepath, out_dir)`` tuple (a single argument so the function
            can be mapped over a multiprocessing pool).

    Returns:
        One JSON object followed by a newline, with the keys
        ``audio_filepath``, ``text`` and ``duration``; ``None`` when the file
        could not be processed (the reason is printed, not raised, so one bad
        file does not abort the whole run).
    """
    # Local import keeps this cell self-contained without touching the
    # notebook's import block.
    import json

    filepath = None
    try:
        filepath, out_dir = inp
        transcript_path = filepath.replace('/wav/', '/wrd_ph/').replace('.wav', '.txt')
        # Context manager closes the handle; explicit UTF-8 so decoding the
        # Thai transcript does not depend on the machine's locale.
        with open(transcript_path, encoding='utf-8') as transcript_file:
            first_line = transcript_file.read().split('\n')[0]
        text = clean_text(' '.join(first_line.split('|')))

        filename = os.path.basename(filepath)
        out_file_path = os.path.join(out_dir, "wav", filename.replace('.wav', '_trim.wav'))

        y, sr = librosa.load(filepath)
        y_trim, _ = librosa.effects.trim(y, top_db=30)
        # Keyword arguments: newer librosa releases reject positional y/sr.
        duration = librosa.get_duration(y=y_trim, sr=sr)

        # Clip before the cast: a sample of exactly 1.0 scales to 32768,
        # which does not fit in int16 and would wrap to -32768.
        pcm = np.clip(y_trim * 32768, -32768, 32767).astype('int16')
        wavfile.write(out_file_path, sr, pcm)

        entry = {
            "audio_filepath": out_file_path,
            "text": text.strip(),
            "duration": float(duration),
        }
        # json.dumps escapes quotes/backslashes correctly; ensure_ascii=False
        # keeps the Thai text readable in the manifest.
        return json.dumps(entry, ensure_ascii=False) + '\n'
    except Exception as e:
        # Best-effort: report and skip this file instead of failing silently.
        print("Skipping {!r}: {}: {}".format(filepath, type(e).__name__, e))
        return None

def create_menifest(inp_dir, out_dir, num_workers = 30):
    """Trim every recording in the corpus and write train/test manifests.

    Every ``<inp_dir>/wav/*.wav`` file is handed to ``__process`` in a pool
    of ``num_workers`` processes. The manifest lines that come back are split
    95% / 5% with a fixed seed and written to ``<out_dir>/tsync2_train.json``
    and ``<out_dir>/tsync2_test.json``, one JSON object per line.

    Args:
        inp_dir: Corpus root; audio is looked up in its ``wav`` sub-folder.
        out_dir: Output root; trimmed audio goes to ``<out_dir>/wav``.
        num_workers: Number of worker processes in the pool.

    Raises:
        ValueError: If no ``.wav`` file is found, or if no file could be
            processed successfully.
    """
    # Sorted so that the seeded split below is reproducible: glob returns
    # files in arbitrary, filesystem-dependent order.
    file_list = sorted(glob.glob(inp_dir+'/wav/*.wav'))
    if not file_list:
        raise ValueError("No .wav files found matching " + inp_dir + '/wav/*.wav')

    # exist_ok=True also covers the case where out_dir exists but its 'wav'
    # sub-folder does not.
    os.makedirs(os.path.join(out_dir, 'wav'), exist_ok=True)

    entries = []
    with mp.Pool(num_workers) as p:
        results = p.imap(__process, [(filepath, out_dir) for filepath in file_list])
        for result in tqdm.tqdm(results, total=len(file_list)):
            # Failed files come back as None; drop them before splitting so
            # the split sizes and the printed counts reflect real entries.
            if result:
                entries.append(result)

    skipped = len(file_list) - len(entries)
    if skipped:
        print("Number of files skipped : ", skipped)
    if not entries:
        raise ValueError("None of the " + str(len(file_list)) + " files could be processed")

    train_files, test_files = train_test_split(entries, test_size=0.05, random_state=42)

    print("Number of sentence in train set : ", len(train_files))
    print("Number of sentence in test set : ", len(test_files))

    # Explicit UTF-8 so the Thai text is written the same way on any locale.
    with open(os.path.join(out_dir, 'tsync2_train.json'), 'w', encoding='utf-8') as tr:
        tr.writelines(train_files)

    with open(os.path.join(out_dir, 'tsync2_test.json'), 'w', encoding='utf-8') as tt:
        tt.writelines(test_files)

# Build the trimmed audio and both manifests from the extracted corpus folder.
create_menifest(inp_dir="TSync2", out_dir="TSync2_clean")

## Train

In [None]:
import pytorch_lightning as pl

from nemo.collections.common.callbacks import LogEpochTimeCallback
from nemo.collections.tts.models import Tacotron2Model, WaveGlowModel
from omegaconf import OmegaConf, DictConfig

from nemo.utils.exp_manager import exp_manager

In [None]:
# Load the training configuration; its `trainer`, `exp_manager` and `model`
# sections are read below.
cfg = OmegaConf.load('tacotron2_th.yaml')

# Build the Lightning trainer from the `trainer` section of the config.
trainer = pl.Trainer(**cfg.trainer)

# Hand the trainer to NeMo's exp_manager; None is passed when the config has
# no `exp_manager` section.
# NOTE(review): presumably this sets up logging/checkpoint directories and so
# has to run before training starts — confirm against the NeMo documentation.
exp_manager(trainer, cfg.get("exp_manager", None))

# Instantiate Tacotron2 from the `model` section and attach it to the trainer.
model = Tacotron2Model(cfg=cfg.model, trainer=trainer)

# Register two extra callbacks on the already-built trainer: Lightning's
# learning-rate monitor and NeMo's LogEpochTimeCallback.
lr_logger = pl.callbacks.LearningRateMonitor()
epoch_time_logger = LogEpochTimeCallback()
trainer.callbacks.extend([lr_logger, epoch_time_logger])
# Call lightning trainer's fit() to train the model
trainer.fit(model)

## Inference

In [None]:
# Restore the trained Tacotron2 checkpoint and move it to the first GPU.
model = Tacotron2Model.restore_from("Tacotron2.nemo").to('cuda:0')

# Fetch the pretrained WaveGlow vocoder and move it to the same GPU.
vocoder = WaveGlowModel.from_pretrained(model_name="tts_waveglow_268m").to('cuda:0')

# Pipeline: text -> token indices -> spectrogram -> waveform.
token_input = model.parse('ภาษาไทย ง่าย นิด เดียว')
spec_gen = model.generate_spectrogram(tokens=token_input.to('cuda:0'))
audio = vocoder.convert_spectrogram_to_audio(spec=spec_gen)

In [None]:
import IPython.display as ipd

# detach() drops any autograd history first: Tensor.numpy() raises a
# RuntimeError on a tensor that requires grad, and nothing above runs the
# models under torch.no_grad().
ipd.Audio(audio.detach().cpu().numpy(), rate=22050)