In [1]:
import os

from trainer import Trainer, TrainerArgs

from TTS.tts.configs.glow_tts_config import GlowTTSConfig

from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

In [2]:
output_path = "train"
dataset_path = "LJSpeech-1.1"

In [3]:

dataset_config = BaseDatasetConfig(
    formatter="ljspeech", meta_file_train="metadata.csv", path=dataset_path
)

In [4]:
config = GlowTTSConfig(
    batch_size=32,
    eval_batch_size=8,
    num_loader_workers=8,
    num_eval_loader_workers=8,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=40,
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    print_step=25,
    print_eval=True,
    mixed_precision=True,
    output_path=output_path,
    datasets=[dataset_config],
)

In [5]:
ap = AudioProcessor.init_from_config(config)

tokenizer, config = TTSTokenizer.init_from_config(config)

In [6]:
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size
)

In [7]:
model = GlowTTS(config, ap, tokenizer, speaker_manager=None)

In [None]:
trainer = Trainer(
    TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)

 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: True
 | > Precision: fp16
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 16
 | > Num. of Torch Threads: 8


 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=train/run-February-22-2025_01+01AM-9b6e3e6

 > Model has 28610449 parameters


: 

In [None]:
trainer.fit()


[4m[1m > EPOCH: 0/10[0m
 --> train/run-February-22-2025_01+01AM-9b6e3e6

[1m > TRAINING (2025-02-22 01:01:20) [0m
d͡ʒoʊsɪf di. nɪkɔl,
Character '͡' not found in the vocabulary. Discarding it.
ðə ɪŋkaʊntɚ ɪn ðə lʌnt͡ʃɹum.
Character '͡' not found in the vocabulary. Discarding it.
noʊ soʊld͡ʒɚz iðɚ.
Character '͡' not found in the vocabulary. Discarding it.
æftɚ fɚðɚ kwɛst͡ʃənɪŋ
Character '͡' not found in the vocabulary. Discarding it.
ðə soʊld͡ʒɚz ðɛn?
Character '͡' not found in the vocabulary. Discarding it.
ɪn kwɪɡliz d͡ʒʌd͡ʒmənt,
Character '͡' not found in the vocabulary. Discarding it.
ðə dʌt͡ʃəs əv kɛnt.
Character '͡' not found in the vocabulary. Discarding it.
hɚ kæptən wəz d͡ʒɑn smɪθ,
Character '͡' not found in the vocabulary. Discarding it.

[1m   --> TIME: 2025-02-22 01:01:28 -- STEP: 0/811 -- GLOBAL_STEP: 0[0m
     | > current_lr: 2.5e-07 
     | > step_time: 4.5945  (4.594481706619263)
     | > loader_time: 3.4383  (3.438286781311035)

 [!] `train_step()` retuned `None`

In [None]:
# resume training

trainer_args = TrainerArgs(
    restore_path=os.path.join(output_path, "run-February-22-2025_01+01AM-9b6e3e6/best_model.pth")
)

In [9]:
trainer = Trainer(
    trainer_args, config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)

 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: True
 | > Precision: fp16
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 16
 | > Num. of Torch Threads: 8
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False


 > Start Tensorboard: tensorboard --logdir=train/run-February-22-2025_02+19AM-9b6e3e6
 > Restoring from best_model.pth ...
 > Restoring Model...
 > Restoring Optimizer...
 > Restoring Scaler...
 > Model restored from step 6488

 > Model has 28610449 parameters


In [10]:
trainer.fit()


[4m[1m > EPOCH: 0/40[0m
 --> train/run-February-22-2025_02+19AM-9b6e3e6

[1m > TRAINING (2025-02-22 02:19:54) [0m

[1m   --> TIME: 2025-02-22 02:20:04 -- STEP: 11/406 -- GLOBAL_STEP: 6500[0m
     | > loss: 1.3386030197143555  (1.357455242763866)
     | > log_mle: 0.3208548426628113  (0.3191657066345215)
     | > loss_dur: 1.017748236656189  (1.0382895469665527)
     | > amp_scaler: 32768.0  (32768.0)
     | > grad_norm: tensor(2.6704, device='cuda:0')  (tensor(2.6417, device='cuda:0'))
     | > current_lr: 2.5e-07 
     | > step_time: 0.3937  (0.47186381166631525)
     | > loader_time: 0.0045  (0.004741213538429954)


[1m   --> TIME: 2025-02-22 02:20:20 -- STEP: 36/406 -- GLOBAL_STEP: 6525[0m
     | > loss: 1.287428855895996  (1.3216145667764876)
     | > log_mle: 0.3273215889930725  (0.31601093212763476)
     | > loss_dur: 0.9601072669029236  (1.0056036346488528)
     | > amp_scaler: 32768.0  (32768.0)
     | > grad_norm: tensor(2.3782, device='cuda:0')  (tensor(2.5160, devi




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.00377635657787323 [0m(+0.00012503564357757568)
     | > avg_loss:[92m 0.058226386085152626 [0m(-0.011003616265952601)
     | > avg_log_mle:[92m -0.19087673723697662 [0m(-0.006618469953536987)
     | > avg_loss_dur:[92m 0.24910312332212925 [0m(-0.0043851463124156)

 > BEST MODEL : train/run-February-22-2025_02+19AM-9b6e3e6/best_model_21105.pth

[4m[1m > EPOCH: 36/40[0m
 --> train/run-February-22-2025_02+19AM-9b6e3e6

[1m > TRAINING (2025-02-22 05:23:18) [0m

[1m   --> TIME: 2025-02-22 05:23:31 -- STEP: 20/406 -- GLOBAL_STEP: 21125[0m
     | > loss: 0.09701956808567047  (0.08118630349636077)
     | > log_mle: -0.1416633129119873  (-0.15810806751251222)
     | > loss_dur: 0.23868288099765778  (0.239294371008873)
     | > amp_scaler: 8192.0  (8192.0)
     | > grad_norm: tensor(7.9840, device='cuda:0')  (tensor(6.2664, device='cuda:0'))
     | > current_lr: 9e-06 
     | > step_time: 0.4227  (0.49135193824768064)




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.005947694182395935 [0m(+0.0012499988079071045)
     | > avg_loss:[91m 0.040165191516280174 [0m(+0.0004118774086236954)
     | > avg_log_mle:[91m -0.19419001787900925 [0m(+0.00394662469625473)
     | > avg_loss_dur:[92m 0.23435520939528942 [0m(-0.003534747287631035)


[4m[1m > EPOCH: 39/40[0m
 --> train/run-February-22-2025_02+19AM-9b6e3e6

[1m > TRAINING (2025-02-22 05:39:33) [0m

[1m   --> TIME: 2025-02-22 05:39:36 -- STEP: 2/406 -- GLOBAL_STEP: 22325[0m
     | > loss: 0.03577302396297455  (0.044322676956653595)
     | > log_mle: -0.18397462368011475  (-0.1769341230392456)
     | > loss_dur: 0.2197476476430893  (0.2212567999958992)
     | > amp_scaler: 8192.0  (8192.0)
     | > grad_norm: tensor(6.5903, device='cuda:0')  (tensor(5.0262, device='cuda:0'))
     | > current_lr: 9.75e-06 
     | > step_time: 0.3241  (0.527601957321167)
     | > loader_time: 0.004  (0.025082826614379883)


[1m   --> TIME: 2025-




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.004724264144897461 [0m(-0.0012234300374984741)
     | > avg_loss:[92m 0.025690163485705853 [0m(-0.014475028030574322)
     | > avg_log_mle:[92m -0.20592039823532104 [0m(-0.011730380356311798)
     | > avg_loss_dur:[92m 0.2316105617210269 [0m(-0.0027446476742625237)

 > BEST MODEL : train/run-February-22-2025_02+19AM-9b6e3e6/best_model_22729.pth
