# Tworzenie modelu TTS



In [None]:
# Mount Google Drive at /content/drive; later cells read and write the
# training directory below that mount point.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
## Install Coqui TTS
! pip install -U pip
! pip install TTS

Collecting pip
  Downloading pip-23.3.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-23.3.1
Collecting TTS
  Downloading TTS-0.21.3-cp310-cp310-manylinux1_x86_64.whl.metadata (22 kB)
Collecting scikit-learn>=1.3.0 (from TTS)
  Downloading scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting anyascii>=0.3.0 (from TTS)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting pysbd>=0.3.4 (from TTS)
  Downloading pysbd-0.3.4-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Przygotowanie datasetu



In [None]:
import os

# BaseDatasetConfig: defines name, formatter and path of the dataset.
from TTS.tts.configs.shared_configs import BaseDatasetConfig

# Root directory on the mounted Drive; the dataset archive, the phoneme cache
# and the training runs are all placed below it by later cells.
output_path = "/content/drive/MyDrive/original/tts_train_dir"
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs(output_path, exist_ok=True)


In [None]:
# Download and extract LJSpeech dataset.

!wget -O $output_path/LJSpeech-1.1.tar.bz2 https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
!tar -xf $output_path/LJSpeech-1.1.tar.bz2 -C $output_path

--2023-12-06 01:36:27--  https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
Resolving data.keithito.com (data.keithito.com)... 24.199.73.137
Connecting to data.keithito.com (data.keithito.com)|24.199.73.137|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2748572632 (2.6G) [text/plain]
Saving to: ‘/content/drive/MyDrive/original/tts_train_dir/LJSpeech-1.1.tar.bz2’


2023-12-06 01:37:05 (70.1 MB/s) - ‘/content/drive/MyDrive/original/tts_train_dir/LJSpeech-1.1.tar.bz2’ saved [2748572632/2748572632]



In [None]:
# Dataset description handed to the loader: the "ljspeech" formatter reading
# metadata.csv from the LJSpeech-1.1 directory below output_path.
dataset_config = BaseDatasetConfig(
    formatter="ljspeech",
    meta_file_train="metadata.csv",
    path=os.path.join(output_path, "LJSpeech-1.1/"),
)
print(dataset_config)

BaseDatasetConfig(formatter='ljspeech', dataset_name='', path='/content/drive/MyDrive/original/tts_train_dir/LJSpeech-1.1/', meta_file_train='metadata.csv', ignored_speakers=None, language='', phonemizer='', meta_file_val='', meta_file_attn_mask='')


## Trening modelu

Ustawienie konfiguracji


In [None]:
# GlowTTSConfig: all model related values for training, validating and testing.
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
config = GlowTTSConfig(
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=100,
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    print_step=25,
    print_eval=False,
    mixed_precision=True,
    output_path=output_path,
    datasets=[dataset_config],
    save_step=1000,
)

Inicjalizacja procesora audio

In [None]:
from TTS.utils.audio import AudioProcessor

# Build the audio processor from the settings carried by `config`.
ap = AudioProcessor.init_from_config(config)
# For a custom audio dataset, override the sample rate here, e.g.:
# ap.sample_rate = 22050


 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


In [None]:
from TTS.tts.utils.text.tokenizer import TTSTokenizer

# init_from_config returns the tokenizer together with a config object;
# `config` is rebound to that returned object for all later cells.
tokenizer, config = TTSTokenizer.init_from_config(config)

In [None]:
from TTS.tts.datasets import load_tts_samples

# Load the dataset samples and split off an evaluation set; the split size
# and its upper bound are taken from `config`.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_size=config.eval_split_size,
    eval_split_max_size=config.eval_split_max_size,
)

 | > Found 13100 files in /content/drive/MyDrive/original/tts_train_dir/LJSpeech-1.1


Inicjalizacja modelu

In [None]:
from TTS.tts.models.glow_tts import GlowTTS

# Single-speaker setup: no speaker manager is passed to the model.
model = GlowTTS(
    config,
    ap,
    tokenizer,
    speaker_manager=None,
)

In [None]:
from trainer import Trainer, TrainerArgs

# Wire the model, the config and both sample sets into a Trainer that uses
# default TrainerArgs and writes its run directory below output_path.
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: True
 | > Precision: fp16
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 2
 | > Num. of Torch Threads: 1
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=/content/drive/MyDrive/original/tts_train_dir/run-December-06-2023_01+44AM-0000000

 > Model has 28610257 parameters


### Rozpoczęcie treningu




In [None]:
# Start training. In the recorded run one epoch (406 steps) took roughly ten
# minutes and the config asks for 100 epochs, so expect a long-running cell.
# The first epoch also pre-computes phonemes for all training samples (~8 min).
trainer.fit()


[4m[1m > EPOCH: 0/100[0m
 --> /content/drive/MyDrive/original/tts_train_dir/run-December-06-2023_01+44AM-0000000


[*] Pre-computing phonemes...


  0%|          | 3/12969 [00:01<1:52:47,  1.92it/s]

ɪnstɛd əv weɪtɪŋ ðɛɹ, ɔzwɔld əpɛɹəntli wɛnt æz fɑɹ əweɪ æz hi kʊd ænd bɔɹdɪd ðə fɚst oʊk klɪf bʌs wɪt͡ʃ keɪm əlɔŋ
 [!] Character '͡' not found in the vocabulary. Discarding it.


 16%|█▌        | 2059/12969 [01:55<06:56, 26.22it/s]

ɪntu ðə “kɹeɪtɚ” dʌɡ aʊt ɪn ðə mɪdəl, pɔɹ ðə spʌnd͡ʒ, wɔɹm wɔtɚ, ðə məlæsɪz, ænd soʊdə dɪzɑlvd ɪn hɑt wɔtɚ.
 [!] Character '“' not found in the vocabulary. Discarding it.
ɪntu ðə “kɹeɪtɚ” dʌɡ aʊt ɪn ðə mɪdəl, pɔɹ ðə spʌnd͡ʒ, wɔɹm wɔtɚ, ðə məlæsɪz, ænd soʊdə dɪzɑlvd ɪn hɑt wɔtɚ.
 [!] Character '”' not found in the vocabulary. Discarding it.


100%|██████████| 12969/12969 [08:13<00:00, 26.26it/s]




> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
	| > 3 not found characters:
	| > ͡
	| > “
	| > ”
| > Number of instances : 12969



[1m > TRAINING (2023-12-06 01:53:19) [0m


 | > Preprocessing samples
 | > Max text length: 188
 | > Min text length: 13
 | > Avg text length: 100.90014650319993
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 24499.0
 | > Avg audio length: 144984.29755570978
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.



[1m   --> TIME: 2023-12-06 01:53:36 -- STEP: 0/406 -- GLOBAL_STEP: 0[0m
     | > current_lr: 2.5e-07 
     | > step_time: 10.6426  (10.642576694488525)
     | > loader_time: 5.6152  (5.615214109420776)

 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.

[1m   --> TIME: 2023-12-06 01:53:57 -- STEP: 25/406 -- GLOBAL_STEP: 25[0m
     | > loss: 3.6917357444763184  (3



> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
	| > 3 not found characters:
	| > ͡
	| > “
	| > ”
| > Number of instances : 131
 | > Preprocessing samples
 | > Max text length: 174
 | > Min text length: 20
 | > Avg text length: 100.76335877862596
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 34739.0
 | > Avg audio length: 144033.41221374046
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.
 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time: 0.00674670934677124 [0m(+0)
     | > avg_loss: 3.513145923614502 [0m(+0)
     | > avg_log_mle: 0.759749561548233 [0m(+0)
     | > avg_loss_dur: 2.7533963918685913 [0m(+0)

 > BEST MODEL : /content/drive/MyDrive/original/tts_train_dir/run-December-06-2023_01+44AM-0000000/best_model_406.pth

[4m[1m > EPOCH: 1/100[0m
 --> /content/drive/MyDrive/original/tts_train_dir/run-December-06-2023_01+44AM-0000000

[1m > TRAINING (2023-12-06 02:03:40) [0m

[1m   --> TIME: 2023-12-06 02:04:04 -- STEP: 19/406 -- GLOBAL_STEP: 425[0m
     | > loss: 3.4858903884887695  (3.5221318571191085)
     | > log_mle: 0.7617048025131226  (0.7579264076132524)
     | > loss_dur: 2.7241857051849365  (2.764205443231683)
     | > amp_scaler: 16384.0  (16384.0)
     | > grad_norm: tensor(9.7684, device='cuda:0')  (tensor(9.6743, device='cuda:0'))
     | > current_lr: 2.5e-07 
     | > step_time: 0.9686  (1.013396601927908)
     | > loader_time: 0.0059  

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.006984710693359375 [0m(+0.00023800134658813477)
     | > avg_loss:[92m 3.320562958717346 [0m(-0.19258296489715576)
     | > avg_log_mle:[92m 0.7483132779598236 [0m(-0.011436283588409424)
     | > avg_loss_dur:[92m 2.572249710559845 [0m(-0.18114668130874634)

 > BEST MODEL : /content/drive/MyDrive/original/tts_train_dir/run-December-06-2023_01+44AM-0000000/best_model_812.pth

[4m[1m > EPOCH: 2/100[0m
 --> /content/drive/MyDrive/original/tts_train_dir/run-December-06-2023_01+44AM-0000000

[1m > TRAINING (2023-12-06 02:14:24) [0m

[1m   --> TIME: 2023-12-06 02:14:42 -- STEP: 13/406 -- GLOBAL_STEP: 825[0m
     | > loss: 3.2669517993927  (3.3698046024029074)
     | > log_mle: 0.7410008907318115  (0.7470268836388221)
     | > loss_dur: 2.5259509086608887  (2.6227777371039758)
     | > amp_scaler: 16384.0  (16384.0)
     | > grad_norm: tensor(8.5569, device='cuda:0')  (tensor(8.7048, device='cuda:0'))
     | > curr

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.007891803979873657 [0m(+0.0009070932865142822)
     | > avg_loss:[92m 2.9907763302326202 [0m(-0.32978662848472595)
     | > avg_log_mle:[92m 0.7172440364956856 [0m(-0.03106924146413803)
     | > avg_loss_dur:[92m 2.27353236079216 [0m(-0.29871734976768494)

 > BEST MODEL : /content/drive/MyDrive/original/tts_train_dir/run-December-06-2023_01+44AM-0000000/best_model_1218.pth

[4m[1m > EPOCH: 3/100[0m
 --> /content/drive/MyDrive/original/tts_train_dir/run-December-06-2023_01+44AM-0000000

[1m > TRAINING (2023-12-06 02:25:05) [0m

[1m   --> TIME: 2023-12-06 02:25:17 -- STEP: 7/406 -- GLOBAL_STEP: 1225[0m
     | > loss: 3.026080846786499  (3.112809113093785)
     | > log_mle: 0.7211859226226807  (0.718170131955828)
     | > loss_dur: 2.3048949241638184  (2.39463894707816)
     | > amp_scaler: 16384.0  (16384.0)
     | > grad_norm: tensor(6.0849, device='cuda:0')  (tensor(6.2423, device='cuda:0'))
     | > current

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.006057292222976685 [0m(-0.0018345117568969727)
     | > avg_loss:[92m 2.8965582847595215 [0m(-0.09421804547309875)
     | > avg_log_mle:[92m 0.6587095484137535 [0m(-0.05853448808193207)
     | > avg_loss_dur:[92m 2.2378487586975098 [0m(-0.03568360209465027)

 > BEST MODEL : /content/drive/MyDrive/original/tts_train_dir/run-December-06-2023_01+44AM-0000000/best_model_1624.pth

[4m[1m > EPOCH: 4/100[0m
 --> /content/drive/MyDrive/original/tts_train_dir/run-December-06-2023_01+44AM-0000000

[1m > TRAINING (2023-12-06 02:35:49) [0m

[1m   --> TIME: 2023-12-06 02:35:55 -- STEP: 1/406 -- GLOBAL_STEP: 1625[0m
     | > loss: 2.964743137359619  (2.964743137359619)
     | > log_mle: 0.6646634340286255  (0.6646634340286255)
     | > loss_dur: 2.300079584121704  (2.300079584121704)
     | > amp_scaler: 16384.0  (16384.0)
     | > grad_norm: tensor(5.2000, device='cuda:0')  (tensor(5.2000, device='cuda:0'))
     | > curr

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.010662168264389038 [0m(+0.0046048760414123535)
     | > avg_loss:[92m 2.5343425571918488 [0m(-0.36221572756767273)
     | > avg_log_mle:[92m 0.5874077007174492 [0m(-0.07130184769630432)
     | > avg_loss_dur:[92m 1.9469348788261414 [0m(-0.2909138798713684)

 > BEST MODEL : /content/drive/MyDrive/original/tts_train_dir/run-December-06-2023_01+44AM-0000000/best_model_2030.pth

[4m[1m > EPOCH: 5/100[0m
 --> /content/drive/MyDrive/original/tts_train_dir/run-December-06-2023_01+44AM-0000000

[1m > TRAINING (2023-12-06 02:46:37) [0m

[1m   --> TIME: 2023-12-06 02:47:03 -- STEP: 20/406 -- GLOBAL_STEP: 2050[0m
     | > loss: 2.48616361618042  (2.6068166971206663)
     | > log_mle: 0.6098449230194092  (0.6022091299295426)
     | > loss_dur: 1.8763188123703003  (2.0046075642108914)
     | > amp_scaler: 16384.0  (16384.0)
     | > grad_norm: tensor(4.3723, device='cuda:0')  (tensor(4.5662, device='cuda:0'))
     | > cu

In [None]:
import locale

# Show which text encoding the kernel currently reports as preferred.
print(locale.getpreferredencoding())

UTF-8


In [None]:
import locale


def getpreferredencoding(do_setlocale=True):
    """Return "UTF-8" unconditionally; `do_setlocale` is accepted but ignored."""
    return "UTF-8"


# Swap the stdlib lookup for the stub above, so every later call to
# locale.getpreferredencoding() in this kernel reports "UTF-8".
setattr(locale, "getpreferredencoding", getpreferredencoding)

In [None]:
# Download the config, the best checkpoint and the dataset metadata of the
# most recent training run.
# The run directory is resolved from `output_path` instead of a hard-coded
# run name, which goes stale with every new training session.
import glob
import os

from google.colab import files

run_dirs = sorted(glob.glob(os.path.join(output_path, "run-*")), key=os.path.getmtime)
if not run_dirs:
    raise FileNotFoundError(f"No training runs found under {output_path!r}")
latest_run_dir = run_dirs[-1]  # newest by modification time

artifact_paths = [
    os.path.join(latest_run_dir, "config.json"),
    os.path.join(latest_run_dir, "best_model.pth"),
    os.path.join(output_path, "LJSpeech-1.1", "metadata.csv"),
]
# Fail up front with the full list of missing files rather than part-way
# through the downloads.
missing_paths = [path for path in artifact_paths if not os.path.isfile(path)]
if missing_paths:
    raise FileNotFoundError("Missing artifacts: " + ", ".join(missing_paths))

for artifact_path in artifact_paths:
    files.download(artifact_path)

FileNotFoundError: ignored

In [None]:
# Zip the most recent training run and download the archive.
# shutil.make_archive replaces the shell `zip` call, so this cell no longer
# depends on shell commands working in the current kernel, and the run
# directory is resolved from `output_path` instead of a stale hard-coded name.
import glob
import os
import shutil

from google.colab import files

run_dirs = sorted(glob.glob(os.path.join(output_path, "run-*")), key=os.path.getmtime)
if not run_dirs:
    raise FileNotFoundError(f"No training runs found under {output_path!r}")
latest_run_dir = run_dirs[-1]  # newest by modification time
run_name = os.path.basename(latest_run_dir)

# Creates /content/<run_name>.zip containing a single top-level <run_name>/ folder.
run_archive = shutil.make_archive(
    os.path.join("/content", run_name),
    "zip",
    root_dir=os.path.dirname(latest_run_dir),
    base_dir=run_name,
)
files.download(run_archive)

NotImplementedError: ignored

In [None]:
!pip install tensorboard
!tensorboard --logdir=tts_train_dir

## Testowanie modelu


In [None]:
import glob, os

# Use the same absolute training directory the runs were written to; a
# relative "tts_train_dir" depends on the current working directory and does
# not match the Drive location used for training.
output_path = "/content/drive/MyDrive/original/tts_train_dir"
# Checkpoints and config files of every run, one directory level below output_path.
ckpts = sorted(glob.glob(os.path.join(output_path, "*", "*.pth")))
configs = sorted(glob.glob(os.path.join(output_path, "*", "*.json")))


In [None]:
 !tts --text "Text for TTS" \
      --model_path "/content/tts_train_dir/run-December-03-2023_05+06PM-0000000/best_model.pth" \
      --config_path "/content/tts_train_dir/run-December-03-2023_05+06PM-0000000/config.json" \
      --out_path out.wav

In [None]:
import IPython.display

# Render an inline audio player for the synthesized file.
IPython.display.Audio("out.wav")