## Preprocess Dataset

In [1]:
import sys

# Add a machine-specific directory to the module search path before the `TTS` imports in
# the following cells (presumably a local checkout of the TTS repository — TODO confirm).
# NOTE(review): `append` puts this entry last on sys.path, so a `TTS` package found in an
# earlier entry (e.g. site-packages) would be imported instead — verify which copy is used.
sys.path.append("/home/voice-cloning-finetune/TTS")

In [2]:
import os
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples

In [3]:
# Dataset configuration for the ThaiMultiSpeech corpus.
# The dataset root defaults to the original location but can be overridden with the
# THAI_MULTISPEECH_DIR environment variable, so the notebook is not tied to one machine.
ds_path = os.environ.get("THAI_MULTISPEECH_DIR", "/home/voice-cloning-finetune/ThaiMultiSpeech")
if not os.path.isdir(ds_path):
    # Fail early with an actionable message instead of an error from deeper in the loader.
    raise FileNotFoundError(
        f"Dataset directory not found: {ds_path!r}. "
        "Set THAI_MULTISPEECH_DIR to the dataset root."
    )

dataset_config = BaseDatasetConfig(
    formatter="tms_formatter",
    meta_file_train="metadata.txt",
    path=ds_path,
    language="th",
)

# Parse the metadata file and split the result into training and evaluation samples.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

 | > Found 2390 files in /home/voice-cloning-finetune/ThaiMultiSpeech


In [4]:
# Show the first five parsed training samples as a sanity check of the formatter output.
train_samples[:5]

[{'text': 'พอเข้ามาในห้องคุณสาย แม่ก็หยิบหมอนออกมาจากหลังตู้ใบหนึ่ง เอามาวางลงกับพื้นกระดาน แล้วแม่ก็เสื่อมตัวลงนอนอีกสักครู่ก็หลับอย่างสบาย ส่วนคุณสายก็กลับมานั่งที่เดิม หยิบผ้าห่มที่ซักแล้วมากองหนึ่ง บอกว่าเป็นของเสด็จ แล้วก็เรียกร้องให้เข้าไปนั่งใกล้ ๆ แล้วบอกให้ช่วยกันจีบ โดยคุณสายทําให้ดูก่อน แล้วก็ให้พลอยลองทําดูบ้าง',
  'audio_file': '/home/voice-cloning-finetune/ThaiMultiSpeech/wavs/เจ้าคุณพระ_sample0_chunk179.wav',
  'speaker_name': 'เจ้าคุณพระ',
  'root_path': '/home/voice-cloning-finetune/ThaiMultiSpeech',
  'language': 'th',
  'audio_unique_name': '#wavs/เจ้าคุณพระ_sample0_chunk179'},
 {'text': 'การดึงเสื้อสวมเข้าทางหัวของเขาเท่าไหร่ เสื้อก็ยิ่งหดเล็กลงเล็กลง จนกระทั่งในที่สุดมันเล็กจนแทบจะใส่ให้ตุ๊กตาหุ่นมือได้พอดี ซึ่งแน่นอนย่อมไม่พอดีกับแฮร์รี่ ป้าเพชรธูเนียสรุปว่า มันคงหดตอนที่เธอเอาไปซัก และแฮร์รี่ก็โล่งใจที่ไม่ถูกลงโทษ',
  'audio_file': '/home/voice-cloning-finetune/ThaiMultiSpeech/wavs/Understand Thai_sample1_chunk134.wav',
  'speaker_name': 'Understand Thai',
  'roo

## Train GlowTTS Model

In [5]:
# GlowTTS training imports: model config and model, audio processing, tokenizer, speaker manager and trainer.
from TTS.tts.configs.shared_configs import CharactersConfig
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.tts.models.glow_tts import GlowTTS
from TTS.config.shared_configs import BaseAudioConfig
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.speakers import SpeakerManager
from trainer import Trainer, TrainerArgs

import re

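Append the following code to `/usr/local/lib/python3.9/dist-packages/TTS/tts/utils/text/cleaners.py` (use the `cleaners.py` of the TTS copy that is actually imported). The training config below sets `text_cleaner="thai_cleaners"`, so this function must exist in that module before training starts. It requires the `pythainlp` package.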

In [6]:
# Reference snippet only: this cell is entirely commented out and defines nothing here.
# The helpers it calls (`lowercase`, `replace_symbols`, `remove_aux_symbols`) are not defined
# in this notebook; presumably they are provided by the cleaners module the snippet is
# appended to — TODO confirm. `word_tokenize` comes from the `pythainlp` package.

# from pythainlp.tokenize import word_tokenize

# def thai_cleaners(text):
#     """Pipeline for Thai text"""
#     text = lowercase(text)
#     text = replace_symbols(text, lang=None)
#     text = remove_aux_symbols(text)
#     text = ' '.join(word_tokenize(text, keep_whitespace=False))
#     return text

In [7]:
# Example: apply the Thai cleaner to the first training transcription.
# Left commented out because `thai_cleaners` is not defined in this notebook's namespace.
# thai_cleaners(train_samples[0]['text'])

In [8]:
# Directory that receives checkpoints, logs and the run's config.
output_path = "glowtts_train_dir"
# exist_ok=True makes the cell idempotent and avoids the check-then-create race of
# testing os.path.exists() before calling os.makedirs().
os.makedirs(output_path, exist_ok=True)

In [9]:
# Audio settings handed to the training config: a 22.05 kHz sample rate with resampling enabled.
audio_config = BaseAudioConfig(
    resample=True,
    sample_rate=22050,
)

In [10]:
# Character-level vocabulary (no phonemes): Latin letters, ASCII digits and Thai code points.
# NOTE(review): U+0E2F is not part of the set below — confirm that the omission is intentional.
char_config = CharactersConfig(
    characters=(
        # Latin letters and ASCII digits
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz"
        "0123456789"
        # Thai code points U+0E01 .. U+0E2E
        "\u0e01\u0e02\u0e03\u0e04\u0e05\u0e06\u0e07\u0e08\u0e09\u0e0a\u0e0b\u0e0c\u0e0d\u0e0e\u0e0f"
        "\u0e10\u0e11\u0e12\u0e13\u0e14\u0e15\u0e16\u0e17\u0e18\u0e19\u0e1a\u0e1b\u0e1c\u0e1d\u0e1e\u0e1f"
        "\u0e20\u0e21\u0e22\u0e23\u0e24\u0e25\u0e26\u0e27\u0e28\u0e29\u0e2a\u0e2b\u0e2c\u0e2d\u0e2e"
        # Thai code points U+0E30 .. U+0E3A
        "\u0e30\u0e31\u0e32\u0e33\u0e34\u0e35\u0e36\u0e37\u0e38\u0e39\u0e3a"
        # Thai code points U+0E40 .. U+0E4F
        "\u0e40\u0e41\u0e42\u0e43\u0e44\u0e45\u0e46\u0e47\u0e48\u0e49\u0e4a\u0e4b\u0e4c\u0e4d\u0e4e\u0e4f"
        # Thai digits U+0E50 .. U+0E59
        "\u0e50\u0e51\u0e52\u0e53\u0e54\u0e55\u0e56\u0e57\u0e58\u0e59"
    ),
    punctuations="!'(),-.:;? ",
    phonemes="",
    # Special symbols
    pad="_",
    bos="*",
    eos="&",
    blank=None,
    # Vocabulary post-processing flags
    is_unique=True,
    is_sorted=True,
)

In [11]:
# GlowTTS training configuration, grouped by purpose.
config = GlowTTSConfig(
    # --- data, audio and text front-end ---
    datasets=[dataset_config],
    audio=audio_config,
    characters=char_config,
    # `thai_cleaners` is looked up by name, so it has to exist in the TTS cleaners module.
    text_cleaner="thai_cleaners",
    use_phonemes=False,
    use_speaker_embedding=True,
    # --- batching and data loading ---
    batch_size=4,
    eval_batch_size=2,
    num_loader_workers=1,
    num_eval_loader_workers=1,
    # --- schedule ---
    epochs=100,
    run_eval=True,
    test_delay_epochs=-1,
    mixed_precision=True,
    training_seed=42,
    # --- logging and checkpoints ---
    output_path=output_path,
    print_step=25,
    print_eval=False,
    save_step=1000,
)

In [12]:
# Build the audio processor from the training config.
ap = AudioProcessor.init_from_config(config)
# For a custom audio dataset with a different sample rate, override it here:
# ap.sample_rate = 22050

 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:True
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


In [13]:
# Build the text tokenizer from the config. `init_from_config` returns a
# (tokenizer, config) pair, so `config` is rebound to the returned config object.
tokenizer, config = TTSTokenizer.init_from_config(config)

In [14]:
# Multi-speaker setup: register the speaker names found in both the training and the
# evaluation samples with a SpeakerManager (used by the model and the data loader),
# then record the resulting speaker count on the config.
all_samples = train_samples + eval_samples
speaker_manager = SpeakerManager()
speaker_manager.set_ids_from_data(all_samples, parse_key="speaker_name")
config.num_speakers = speaker_manager.num_speakers

In [15]:
# Load the samples again, this time taking the eval-split settings from the training config.
# This rebinds `train_samples` / `eval_samples` created by the earlier loading cell.
split_options = {
    "eval_split_max_size": config.eval_split_max_size,
    "eval_split_size": config.eval_split_size,
}
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, **split_options)

 | > Found 2390 files in /home/voice-cloning-finetune/ThaiMultiSpeech


In [16]:
# Instantiate GlowTTS from the config, audio processor, tokenizer and speaker manager.
model = GlowTTS(
    config,
    ap,
    tokenizer,
    speaker_manager=speaker_manager,
)

 > Init speaker_embedding layer.


In [17]:
# Create the trainer with default TrainerArgs; run artifacts are written under `output_path`.
trainer_args = TrainerArgs()
trainer = Trainer(
    trainer_args,
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

fatal: not a git repository (or any parent up to mount point /home)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).
fatal: not a git repository (or any parent up to mount point /home)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).
  return torch._C._cuda_getDeviceCount() > 0
 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: True
 | > Precision: fp16
 | > Num. of CPUs: 8
 | > Num. of Torch Threads: 4
 | > Torch seed: 42
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=glowtts_train_dir/run-October-21-2024_08+44AM-0000000

 > Model has 32340433 parameters


 > `speakers.pth` is saved to glowtts_train_dir/run-October-21-2024_08+44AM-0000000/speakers.pth.
 > `speakers_file` is updated in the config.json.


In [None]:
# Run the training loop.
# NOTE(review): the recorded log shows the first steps being skipped because `train_step()`
# returned `None`; presumably this is GlowTTS's data-dependent initialisation phase rather
# than a failure — TODO confirm that real steps follow.
trainer.fit()


[4m[1m > EPOCH: 0/100[0m
 --> glowtts_train_dir/run-October-21-2024_08+44AM-0000000




> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: False
| > Number of instances : 2367



[1m > TRAINING (2024-10-21 08:44:36) [0m


 | > Preprocessing samples
 | > Max text length: 551
 | > Min text length: 5
 | > Avg text length: 275.29446556822984
 | 
 | > Max audio length: 1487988
 | > Min audio length: 208538
 | > Avg audio length: 1372787.1081537812
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.



[1m   --> TIME: 2024-10-21 08:44:56 -- STEP: 0/592 -- GLOBAL_STEP: 0[0m
     | > current_lr: 2.5e-07 
     | > step_time: 18.5631  (18.563103675842285)
     | > loader_time: 1.2964  (1.296417474746704)

 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
  with autocast(enabled=False):  # avoid mixed_precision in criterion
