## Preprocess Dataset

In [1]:
import sys

# Extra import location; presumably a local checkout of the TTS sources.
TTS_REPO_PATH = "/home/voice-cloning-finetune/TTS"

# Append only when missing so that re-running this cell does not stack
# duplicate entries on sys.path. The path is appended (not inserted at the
# front), so an already-installed `TTS` package is still found first.
if TTS_REPO_PATH not in sys.path:
    sys.path.append(TTS_REPO_PATH)

In [2]:
import os
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples

ModuleNotFoundError: No module named 'trainer'

In [None]:
# Describe the dataset: root folder, metadata file, formatter name and language code.
ds_path = "/home/voice-cloning-finetune/ThaiMultiSpeech"
dataset_config = BaseDatasetConfig(
    formatter="tms_formatter",
    meta_file_train="metadata.txt",
    path=ds_path,
    language="th",
)

# Load the samples described by dataset_config, split into train and eval sets.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

In [None]:
# Preview the first five entries of train_samples.
train_samples[:5]

## Train GlowTTS Model

In [5]:
# GlowTTSConfig: all model related values for training, validating and testing.
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.tts.models.glow_tts import GlowTTS
from TTS.config.shared_configs import BaseAudioConfig
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.speakers import SpeakerManager
from trainer import Trainer, TrainerArgs

import re

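Append the following code (uncommented) to `/usr/local/lib/python3.9/dist-packages/TTS/tts/utils/text/cleaners.py` so that the `thai_cleaners` text cleaner named in the training config below can be resolved: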

In [6]:
# Reference copy of the Thai cleaner, kept commented out in this notebook.
# NOTE(review): `lowercase`, `replace_symbols` and `remove_aux_symbols` are not
# defined or imported in this notebook; presumably they come from the cleaners
# module this snippet is meant to be pasted into. Confirm before uncommenting.
# from pythainlp.tokenize import word_tokenize

# def thai_cleaners(text):
#     """Pipeline for Thai text"""
#     text = lowercase(text)
#     text = replace_symbols(text, lang=None)
#     text = remove_aux_symbols(text)
#     text = ' '.join(word_tokenize(text, keep_whitespace=False))
#     return text

In [7]:
# Example of Thai cleaners on transcription sample
# thai_cleaners(train_samples[0]['text'])

In [8]:
# Directory for this training run; the same value is handed to the model
# config and to the Trainer further down.
output_path = "glowtts_train_dir"

# exist_ok=True replaces the separate exists() check: the cell stays safe to
# re-run and there is no gap between checking and creating. If the path
# exists but is not a directory, this raises FileExistsError instead of
# silently continuing.
os.makedirs(output_path, exist_ok=True)

In [9]:
# Audio settings handed to the model config: 22050 Hz sample rate, resampling switched on.
audio_config = BaseAudioConfig(
    sample_rate=22050,
    resample=True,
)

In [10]:
# GlowTTS training configuration. Keyword arguments are grouped by concern;
# their order does not affect the resulting config.
config = GlowTTSConfig(
    # Data, audio and output location
    datasets=[dataset_config],
    audio=audio_config,
    output_path=output_path,
    # Text processing: phonemization disabled, text cleaned by the function
    # registered under the name "thai_cleaners"
    text_cleaner="thai_cleaners",
    use_phonemes=False,
    # Multi-speaker training
    use_speaker_embedding=True,
    # Batch sizes and data-loader workers
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    # Training schedule and numerics
    epochs=100,
    run_eval=True,
    test_delay_epochs=-1,
    mixed_precision=True,
    training_seed=42,
    # Logging and checkpointing cadence
    print_step=25,
    print_eval=False,
    save_step=1000,
)

In [None]:
# Build the audio processor from the training config.
ap = AudioProcessor.init_from_config(config)
# To override the sample rate for a custom audio dataset, uncomment:
# ap.sample_rate = 22050

In [12]:
# Build the tokenizer from the config. init_from_config returns a
# (tokenizer, config) pair, so `config` is rebound to the returned object
# and every later cell uses that one.
tokenizer, config = TTSTokenizer.init_from_config(config)

In [13]:
# Multi-speaker setup: the speaker manager maps speaker-id to speaker-name
# for the model and the data loader. Speaker ids are taken from the
# "speaker_name" key of all loaded samples (train and eval together), and
# the resulting speaker count is written back into the model config.
speaker_manager = SpeakerManager()
speaker_manager.set_ids_from_data(
    train_samples + eval_samples,
    parse_key="speaker_name",
)
config.num_speakers = speaker_manager.num_speakers

In [None]:
# Reload the samples, this time taking the eval-split parameters from
# `config`; this rebinds train_samples / eval_samples for the Trainer cell.
# NOTE(review): speaker_manager was populated from the earlier split.
# Presumably both splits cover the same set of speakers; confirm.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

In [None]:
# Build the GlowTTS model from the config, audio processor, tokenizer and speaker manager.
model = GlowTTS(config, ap, tokenizer, speaker_manager=speaker_manager)

In [None]:
# Wire everything into the Trainer: default TrainerArgs, the model config,
# the output directory, the model, and the train/eval sample lists.
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

In [None]:
# Start the training run with the Trainer configured above.
trainer.fit()