# Saarthi TTS Training

In [1]:
import os
from glob import glob
from trainer import Trainer, TrainerArgs
import torch
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig
from TTS.tts.utils.languages import LanguageManager
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.config import load_config
from TTS.tts.models import setup_model

In [2]:
output_path = './TTS_Training'

mailabs_path = "hindi_temp/**"
dataset_paths = glob(mailabs_path)
dataset_config = [
    BaseDatasetConfig(name="mailabs", meta_file_train=None, path=path, language=path.split("/")[-1])
    for path in dataset_paths
]

In [3]:
phonems = '|k_|K_|g_|j_|J_|X_|x_f_K_k_G_g_|y_w_h_C_c_J_i_y_Y_V_q_x_X_N_S_r_T_t_D_d_n~_n_`I_I_P_p_B_b_m_z_s_Aː_A_Iː_i_u_Uː_R_Oː_O_eː_e_Eː_oː_o_M_h_a_`a_v_j_l_`l'
dev = ''.join([chr(i) for i in range(0x0900, 0x097F)])

In [4]:
audio_config = VitsAudioConfig(
    sample_rate=16000,
    win_length=1024,
    hop_length=256,
    num_mels=80,
    mel_fmin=0,
    mel_fmax=None,
)

vitsArgs = VitsArgs(
    use_speaker_embedding=True,
    speaker_embedding_channels=512,
    use_language_embedding=True,
    embedded_language_dim=512,
    use_sdp=False,
    num_speakers=50,
    # freeze_encoder=True,
    # freeze_PE=True,
    # freeze_DP=True,
    # freeze_flow_decoder=True,
    # freeze_waveform_decoder=True

)

In [5]:
config = VitsConfig(
    model_args=vitsArgs,
    audio=audio_config,
    speaker_embedding_channels=512,
    run_name="hindi_female",
    batch_size=48,
    eval_batch_size=16,
    batch_group_size=0,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    save_step=100,
    save_n_checkpoints=10,
    text_cleaner=None,
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    compute_input_seq_cache=False,
    use_language_weighted_sampler=False,
    print_eval=False,
    mixed_precision=False,
    # sort_by_audio_len=True,
    min_audio_len=16 * 128 * 4,
    max_audio_len=160000,
    output_path=output_path,
    datasets=dataset_config,
    characters=CharactersConfig(
        pad="<PAD>",
        eos="<EOS>",
        bos="<BOS>",
        blank="<BLNK>",
        characters= dev,
        punctuations= ",?.!;:'‘¡",
        phonemes= dev,
        
    ),
    test_sentences=[
        [
            "हाय क्या मैं ओंकार से बात कर रहा हूँ ?",
            "saarthi_oria_male",
            None,
            "odia",
        ],
    ],
    num_speakers=50,
    use_speaker_embedding = True,

)

In [6]:
config.from_dict(config.to_dict())

# init audio processor
ap = AudioProcessor(**config.audio.to_dict())

# load training samples
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

# print(eval_samples)
speaker_manager = SpeakerManager()
# speaker_manager.load_ids_from_file('model_config/speakers.pth')
# speaker_manager = SpeakerManager(speaker_id_file_path='model_config/speakers.pth')
speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name")
config.model_args.num_speakers = 50

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 | > Found 1501 files in /root/Documents/Audio-Encoder-Pretraining-main/voice_coder/TTS_Training/hindi_temp/hindi


In [7]:
language_manager = LanguageManager(config=config)
config.model_args.num_languages = 11
# language_manager.load_ids_from_file('model_config/language_ids.json')
tokenizer, config = TTSTokenizer.init_from_config(config)

vocab is  ['i', 'y', 'ɨ', 'ʉ', 'ɯ', 'u', 'ɪ', 'ʏ', 'ʊ', 'e', 'ø', 'ɘ', 'ə', 'ɵ', 'ɤ', 'o', 'ɛ', 'œ', 'ɜ', 'ɞ', 'ʌ', 'ɔ', 'æ', 'ɐ', 'a', 'ɶ', 'ɑ', 'ɒ', 'ᵻ', 'ʘ', 'ɓ', 'ǀ', 'ɗ', 'ǃ', 'ʄ', 'ǂ', 'ɠ', 'ǁ', 'ʛ', 'p', 'b', 't', 'd', 'ʈ', 'ɖ', 'c', 'ɟ', 'k', 'ɡ', 'q', 'ɢ', 'ʔ', 'ɴ', 'ŋ', 'ɲ', 'ɳ', 'n', 'ɱ', 'm', 'ʙ', 'r', 'ʀ', 'ⱱ', 'ɾ', 'ɽ', 'ɸ', 'β', 'f', 'v', 'θ', 'ð', 's', 'z', 'ʃ', 'ʒ', 'ʂ', 'ʐ', 'ç', 'ʝ', 'x', 'ɣ', 'χ', 'ʁ', 'ħ', 'ʕ', 'h', 'ɦ', 'ɬ', 'ɮ', 'ʋ', 'ɹ', 'ɻ', 'j', 'ɰ', 'l', 'ɭ', 'ʎ', 'ʟ', 'ˈ', 'ˌ', 'ː', 'ˑ', 'ʍ', 'w', 'ɥ', 'ʜ', 'ʢ', 'ʡ', 'ɕ', 'ʑ', 'ɺ', 'ɧ', 'ʲ', 'ɚ', '˞', 'ɫ']
final Vocab:  ['<PAD>', '<EOS>', '<BOS>', '<BLNK>', 'a', 'b', 'c', 'd', 'e', 'f', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'æ', 'ç', 'ð', 'ø', 'ħ', 'ŋ', 'œ', 'ǀ', 'ǁ', 'ǂ', 'ǃ', 'ɐ', 'ɑ', 'ɒ', 'ɓ', 'ɔ', 'ɕ', 'ɖ', 'ɗ', 'ɘ', 'ə', 'ɚ', 'ɛ', 'ɜ', 'ɞ', 'ɟ', 'ɠ', 'ɡ', 'ɢ', 'ɣ', 'ɤ', 'ɥ', 'ɦ', 'ɧ', 'ɨ', 'ɪ', 'ɫ', 'ɬ', 'ɭ', 'ɮ', 'ɯ', 'ɰ', 'ɱ', 'ɲ', 'ɳ', 'ɴ', 'ɵ

In [8]:
model = Vits(config, ap, tokenizer, speaker_manager, language_manager)
# cp = torch.load('model_config/checkpoint_1433000.pth', 
#                 map_location=torch.device('cuda'))
# model_weights = cp['model'].copy()
# for key in list(model_weights.keys()):
#   if "speaker_encoder" in key:
#     del model_weights[key]
# model.load_state_dict(model_weights)


Number of speakers:  50
 > initialization of speaker-embedding layers.
 > initialization of language-embedding layers.


In [9]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print("Number of parameters ", count_parameters(model))

Number of parameters  115227645


In [None]:
trainer = Trainer(
    TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)

trainer.fit()

2023-06-15 10:11:14,803.803 DEBUG local:  open file: /root/Documents/Audio-Encoder-Pretraining-main/voice_coder/TTS_Training/./TTS_Training/hindi_female-June-15-2023_10+11AM-f6af544/config.json
2023-06-15 10:11:16,979.979 DEBUG local:  open file: /root/Documents/Audio-Encoder-Pretraining-main/voice_coder/TTS_Training/./TTS_Training/hindi_female-June-15-2023_10+11AM-f6af544/speakers.pth
2023-06-15 10:11:16,982.982 DEBUG local:  open file: /root/Documents/Audio-Encoder-Pretraining-main/voice_coder/TTS_Training/./TTS_Training/hindi_female-June-15-2023_10+11AM-f6af544/language_ids.json


 > `speakers.pth` is saved to ./TTS_Training/hindi_female-June-15-2023_10+11AM-f6af544/speakers.pth.
 > `speakers_file` is updated in the config.json.
 > `language_ids.json` is saved to ./TTS_Training/hindi_female-June-15-2023_10+11AM-f6af544/language_ids.json.
 > `language_ids_file` is updated in the config.json.
[*] Pre-computing phonemes...


  0%|                                                  | 0/1486 [00:00<?, ?it/s]

 [!] Character 'f' not found in the vocabulary. Discarding it.
 [!] Character 'Aː' not found in the vocabulary. Discarding it.
 [!] Character 'y' not found in the vocabulary. Discarding it.
 [!] Character 'a' not found in the vocabulary. Discarding it.
 [!] Character 'r' not found in the vocabulary. Discarding it.
 [!] Character ' ' not found in the vocabulary. Discarding it.
 [!] Character 'v' not found in the vocabulary. Discarding it.
 [!] Character 'l' not found in the vocabulary. Discarding it.
 [!] Character 'k' not found in the vocabulary. Discarding it.
 [!] Character 'eː' not found in the vocabulary. Discarding it.
 [!] Character 'i' not found in the vocabulary. Discarding it.
 [!] Character '' not found in the vocabulary. Discarding it.
 [!] Character 'p' not found in the vocabulary. Discarding it.
 [!] Character 'h' not found in the vocabulary. Discarding it.
 [!] Character 'c' not found in the vocabulary. Discarding it.
 [!] Character 'Eː' not found in the vocabulary. Disca

  0%|                                          | 4/1486 [00:00<02:26, 10.12it/s]

 [!] Character 'M' not found in the vocabulary. Discarding it.
 [!] Character 't' not found in the vocabulary. Discarding it.
 [!] Character 'm' not found in the vocabulary. Discarding it.
 [!] Character 'z' not found in the vocabulary. Discarding it.
 [!] Character 'n' not found in the vocabulary. Discarding it.
 [!] Character 'B' not found in the vocabulary. Discarding it.
 [!] Character 'u' not found in the vocabulary. Discarding it.
 [!] Character 'J' not found in the vocabulary. Discarding it.
 [!] Character 'b' not found in the vocabulary. Discarding it.
 [!] Character 'T' not found in the vocabulary. Discarding it.
 [!] Character 'C' not found in the vocabulary. Discarding it.
 [!] Character 'j' not found in the vocabulary. Discarding it.
 [!] Character 'V' not found in the vocabulary. Discarding it.
 [!] Character 'D' not found in the vocabulary. Discarding it.
 [!] Character '|y' not found in the vocabulary. Discarding it.
 [!] Character 'K' not found in the vocabulary. Discar

  0%|▏                                         | 6/1486 [00:00<02:40,  9.22it/s]

 [!] Character '|x' not found in the vocabulary. Discarding it.
 [!] Character '|X' not found in the vocabulary. Discarding it.
 [!] Character 'G' not found in the vocabulary. Discarding it.
 [!] Character 'S' not found in the vocabulary. Discarding it.
 [!] Character 'N' not found in the vocabulary. Discarding it.
 [!] Character 'R' not found in the vocabulary. Discarding it.


  1%|▎                                        | 11/1486 [00:00<01:51, 13.28it/s]

 [!] Character 'n~' not found in the vocabulary. Discarding it.
 [!] Character 'w' not found in the vocabulary. Discarding it.


  1%|▌                                        | 21/1486 [00:02<02:36,  9.35it/s]

 [!] Character '|g' not found in the vocabulary. Discarding it.


  2%|▉                                        | 32/1486 [00:02<02:02, 11.90it/s]

 [!] Character 'Y' not found in the vocabulary. Discarding it.
 [!] Character '|k' not found in the vocabulary. Discarding it.


  3%|█                                        | 40/1486 [00:03<02:25,  9.97it/s]

 [!] Character '|K' not found in the vocabulary. Discarding it.


  5%|█▉                                       | 68/1486 [00:06<02:18, 10.22it/s]

 [!] Character 'X' not found in the vocabulary. Discarding it.
 [!] Character '\xa0h' not found in the vocabulary. Discarding it.


  6%|██▌                                      | 93/1486 [00:08<02:01, 11.44it/s]

 [!] Character '`a' not found in the vocabulary. Discarding it.
 [!] Character 'e' not found in the vocabulary. Discarding it.


 10%|███▊                                    | 142/1486 [00:12<01:50, 12.16it/s]

 [!] Character '़f' not found in the vocabulary. Discarding it.


 51%|████████████████████▍                   | 758/1486 [01:06<01:02, 11.67it/s]

 [!] Character '़|x' not found in the vocabulary. Discarding it.


 64%|█████████████████████████▋              | 952/1486 [01:23<00:45, 11.68it/s]