In [None]:
import os

# Trainer: Where the ✨️ happens.
# TrainerArgs: Defines the set of arguments of the Trainer.
from trainer import Trainer, TrainerArgs

# GlowTTSConfig: all model related values for training, validating and testing.
from TTS.tts.configs.glow_tts_config import GlowTTSConfig

# BaseDatasetConfig: defines name, formatter and path of the dataset.
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

# Root folder for all training artifacts (dataset, phoneme cache, run outputs).
# It resolves to the directory two levels above the current working directory,
# so it depends on where the notebook/script is launched from.
output_path = os.path.abspath(os.path.join(os.pardir, os.pardir))

# DEFINE DATASET CONFIG
# LJSpeech is the target dataset; it is expected in the `LJSpeech-1.1/` folder under `output_path`.
# A plain Dict describing the dataset also works when it is handed to a custom formatter.
dataset_config = BaseDatasetConfig(
    path=os.path.join(output_path, "LJSpeech-1.1/"),
    meta_file_train="metadata.csv",
    formatter="ljspeech",
)

# INITIALIZE THE TRAINING CONFIGURATION
# All model, data and schedule settings live in a single GlowTTSConfig
# (every config class inherits the BaseTTSConfig).
config = GlowTTSConfig(
    # Locations: run outputs, dataset definition(s) and the phoneme cache folder.
    output_path=output_path,
    datasets=[dataset_config],
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    # Text front-end: train on phonemes rather than raw characters.
    use_phonemes=True,
    phoneme_language="en-us",
    text_cleaner="phoneme_cleaners",
    # Batching and data loading.
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    # Training schedule and precision.
    epochs=1000,
    run_eval=True,
    test_delay_epochs=-1,
    mixed_precision=True,
    # Console logging.
    print_step=25,
    print_eval=False,
)

# INITIALIZE THE AUDIO PROCESSOR
# The audio processor is used for feature extraction and audio I/O.
# It mainly serves the dataloader and the training loggers.
# It is built from `config`, so the audio settings come from the config object above.
ap = AudioProcessor.init_from_config(config)

# INITIALIZE THE TOKENIZER
# The tokenizer converts text to sequences of token IDs.
# If characters are not defined in the config, default characters are written into it.
# NOTE: `config` is rebound to the config returned here, so every later step
# (model, trainer) must use this updated object; keep this call before them.
tokenizer, config = TTSTokenizer.init_from_config(config)

# LOAD DATA SAMPLES
# Read the dataset described by `dataset_config` and split it into train and eval samples.
# The layout of each sample is decided by the dataset formatter; a custom sample loader
# or a custom formatter passed to `load_tts_samples` can be used instead.
# See `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split_size=config.eval_split_size,
    eval_split_max_size=config.eval_split_max_size,
    eval_split=True,
)

# INITIALIZE THE MODEL
# GlowTTS is built from the config (model details such as the number of layers and
# the embedding size), the audio processor and the tokenizer created above.
# No speaker manager is supplied (`speaker_manager=None`); it is used by multi-speaker models.
model = GlowTTS(
    config,
    ap,
    tokenizer,
    speaker_manager=None,
)

# INITIALIZE THE TRAINER
# The Trainer offers a generic API to train all the 🐸TTS models, including perks such as
# mixed-precision and distributed training. It is created with default TrainerArgs,
# the config, the output folder, the model and the train/eval samples.
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    eval_samples=eval_samples,
    train_samples=train_samples,
    model=model,
)

# Start the training loop... 3,2,1... 🚀
trainer.fit()

 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 | > Found 13100 files in /root/tts-output/LJSpeech-1.1


 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: True
 | > Precision: fp16
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 24
 | > Num. of Torch Threads: 12
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=/root/tts-output/run-July-25-2023_05+10PM-0000000

 > Model has 28610257 parameters

[4m[1m > EPOCH: 0/1000[0m
 --> /root/tts-output/run-July-25-2023_05+10PM-0000000


[*] Pre-computing phonemes...


  1%|▌                                                                               | 86/12969 [00:01<03:54, 54.99it/s]

ðə kəmˈɪʃən bᵻlˈiːvz ðætðə fˈækts ʌvðɪ ɐsˌæsᵻnˈeɪʃən ʌv pɹˈɛzɪdənt kˈɛnədi pˈɔɪnt tə sˈɜːʔn̩ mˈɛʒɚz wˈɪtʃ,
 [!] Character '̩' not found in the vocabulary. Discarding it.


 16%|████████████▍                                                                 | 2062/12969 [00:41<03:57, 45.86it/s]

ˈɪntʊ ðə “kɹˈeɪɾɚ” dˈʌɡ ˈaʊt ɪnðə mˈɪdəl, pˈoːɹ ðə spˈʌndʒ, wˈɔːɹm wˈɔːɾɚ, ðə məlˈæsᵻz, ænd sˈoʊdə dɪsˈɑːlvd ɪn hˈɑːt wˈɔːɾɚ.
 [!] Character '“' not found in the vocabulary. Discarding it.
ˈɪntʊ ðə “kɹˈeɪɾɚ” dˈʌɡ ˈaʊt ɪnðə mˈɪdəl, pˈoːɹ ðə spˈʌndʒ, wˈɔːɹm wˈɔːɾɚ, ðə məlˈæsᵻz, ænd sˈoʊdə dɪsˈɑːlvd ɪn hˈɑːt wˈɔːɾɚ.
 [!] Character '”' not found in the vocabulary. Discarding it.


100%|█████████████████████████████████████████████████████████████████████████████| 12969/12969 [04:22<00:00, 49.37it/s]




> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: espeak
	| > 3 not found characters:
	| > ̩
	| > “
	| > ”
| > Number of instances : 12969



[1m > TRAINING (2023-07-25 17:15:09) [0m


 | > Preprocessing samples
 | > Max text length: 188
 | > Min text length: 13
 | > Avg text length: 100.90014650319993
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 24499.0
 | > Avg audio length: 144984.29755570978
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.


  return F.conv1d(input, weight, bias, self.stride,

[1m   --> TIME: 2023-07-25 17:15:14 -- STEP: 0/406 -- GLOBAL_STEP: 0[0m
     | > current_lr: 2.5e-07 
     | > step_time: 3.9744  (3.974362373352051)
     | > loader_time: 1.449  (1.4489879608154297)

 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.

[1m   --> TIME: 2023-07-25 17:15:25 -- STEP: 25/406 -- GLOBAL_



> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: espeak
	| > 3 not found characters:
	| > ̩
	| > “
	| > ”
| > Number of instances : 131
 | > Preprocessing samples
 | > Max text length: 174
 | > Min text length: 20
 | > Avg text length: 100.76335877862596
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 34739.0
 | > Avg audio length: 144033.41221374046
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time: 0.11794063448905945 [0m(+0)
     | > avg_loss: 3.403439462184906 [0m(+0)
     | > avg_log_mle: 0.8592553213238716 [0m(+0)
     | > avg_loss_dur: 2.5441841781139374 [0m(+0)



 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time: 0.11794063448905945 [0m(+0.0)
     | > avg_loss: 3.403439462184906 [0m(+0.0)
     | > avg_log_mle: 0.8592553213238716 [0m(+0.0)
     | > avg_loss_dur: 2.5441841781139374 [0m(+0.0)

 > BEST MODEL : /root/tts-output/run-July-25-2023_05+10PM-0000000/best_model_406.pth

[4m[1m > EPOCH: 1/1000[0m
 --> /root/tts-output/run-July-25-2023_05+10PM-0000000

[1m > TRAINING (2023-07-25 17:19:38) [0m

[1m   --> TIME: 2023-07-25 17:19:48 -- STEP: 19/406 -- GLOBAL_STEP: 425[0m
     | > loss: 3.340238094329834  (3.39993731599105)
     | > log_mle: 0.8539333343505859  (0.8522215516943681)
     | > loss_dur: 2.486304759979248  (2.547715764296682)
     | > amp_scaler: 16384.0  (16384.0)
     | > grad_norm: tensor(9.0524, device='cuda:0')  (tensor(9.0390, device='cuda:0'))
     | > current_lr: 2.5e-07 
     | > step_time: 0.3899  (0.4352801849967555)
     | > loader_time: 0.0046  (0.004071059979890522)


[1m   --> TIME: 2023-07-25 17:19:

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time: 0.07795578241348267 [0m(+0.0)
     | > avg_loss: 3.271086573600769 [0m(+0.0)
     | > avg_log_mle: 0.8498746827244759 [0m(+0.0)
     | > avg_loss_dur: 2.421211898326874 [0m(+0.0)

 > BEST MODEL : /root/tts-output/run-July-25-2023_05+10PM-0000000/best_model_812.pth

[4m[1m > EPOCH: 2/1000[0m
 --> /root/tts-output/run-July-25-2023_05+10PM-0000000

[1m > TRAINING (2023-07-25 17:23:37) [0m

[1m   --> TIME: 2023-07-25 17:23:45 -- STEP: 13/406 -- GLOBAL_STEP: 825[0m
     | > loss: 3.227701187133789  (3.2774199889256406)
     | > log_mle: 0.8378933668136597  (0.8436014147905203)
     | > loss_dur: 2.389807939529419  (2.4338185787200928)
     | > amp_scaler: 16384.0  (16384.0)
     | > grad_norm: tensor(7.9824, device='cuda:0')  (tensor(8.0358, device='cuda:0'))
     | > current_lr: 5e-07 
     | > step_time: 0.3046  (0.42044813816363996)
     | > loader_time: 0.0052  (0.004366232798649714)


[1m   --> TIME: 2023-07-25 17:23