# Saarthi TTS Training

In [2]:
import os
from glob import glob
from trainer import Trainer, TrainerArgs
import torch
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig
from TTS.tts.utils.languages import LanguageManager
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.config import load_config
from TTS.tts.models import setup_model

In [3]:
# Root output directory; it is also used for the phoneme cache and handed to the trainer.
output_path = './TTS_Training'

# Every directory directly under mallu_temp/ is treated as one dataset whose
# language name is the directory name.  glob() is called without
# recursive=True, so "**" matches a single path level, exactly like "*".
mailabs_path = "mallu_temp/**"

# Keep directories only (stray files such as archives or .DS_Store would
# otherwise become bogus datasets) and sort so the dataset order is stable
# across runs and file systems.
dataset_paths = sorted(path for path in glob(mailabs_path) if os.path.isdir(path))

dataset_config = [
    BaseDatasetConfig(
        name="mailabs",
        meta_file_train=None,
        path=path,
        # basename() is separator-agnostic, unlike splitting on "/".
        language=os.path.basename(os.path.normpath(path)),
    )
    for path in dataset_paths
]

In [4]:
# Phoneme inventory written as a single "_"-delimited string.  The same string
# is passed as both `characters` and `phonemes` to CharactersConfig when the
# training config is built.
# NOTE(review): the symbols "i" and "h" each appear twice in this list;
# presumably the tokenizer de-duplicates them when building the vocabulary --
# confirm against the tokenizer implementation in use.
phonems = '|k_|K_|g_|j_|J_|X_|x_f_K_k_G_g_|y_w_h_C_c_J_i_y_Y_V_q_x_X_N_S_r_T_t_D_d_n~_n_`I_I_P_p_B_b_m_z_s_Aː_A_Iː_i_u_Uː_R_Oː_O_eː_e_Eː_oː_o_M_h_a_`a_v_j_l_`l'

In [5]:
# Audio front-end settings: 16 kHz input, 1024-sample window, 256-sample hop,
# 80 mel bands with no explicit upper frequency bound.
audio_config = VitsAudioConfig(
    sample_rate=16000,
    hop_length=256,
    win_length=1024,
    num_mels=80,
    mel_fmin=0,
    mel_fmax=None,
)

# Model-level switches: speaker and language embeddings are both enabled,
# each with 512 channels, for a 50-speaker table.
# The freeze_* options (freeze_encoder, freeze_PE, freeze_DP,
# freeze_flow_decoder, freeze_waveform_decoder) are intentionally not set.
vitsArgs = VitsArgs(
    num_speakers=50,
    use_speaker_embedding=True,
    speaker_embedding_channels=512,
    use_language_embedding=True,
    embedded_language_dim=512,
    use_sdp=False,
)

In [6]:
# Full training configuration.  Arguments are grouped by concern; every value
# is the same as before, only the ordering and layout differ.
config = VitsConfig(
    # --- model / audio -----------------------------------------------------
    model_args=vitsArgs,
    audio=audio_config,
    num_speakers=50,
    use_speaker_embedding=True,
    speaker_embedding_channels=512,
    # --- run bookkeeping ---------------------------------------------------
    run_name="odia_male_dp",
    output_path=output_path,
    epochs=1000,
    save_step=100,
    save_n_checkpoints=10,
    mixed_precision=False,
    # --- batching / data loading -------------------------------------------
    datasets=dataset_config,
    batch_size=48,
    eval_batch_size=16,
    batch_group_size=0,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    use_language_weighted_sampler=False,
    # Audio length bounds, in samples (lower bound is 32 * 256 * 4).
    min_audio_len=32768,
    max_audio_len=160000,
    # --- evaluation / test synthesis ---------------------------------------
    run_eval=True,
    print_eval=False,
    test_delay_epochs=-1,
    test_sentences=[
        # [text, speaker name, third slot left as None, language name]
        ["हाय क्या मैं ओंकार से बात कर रहा हूँ ?", "saarthi_oria_male", None, "odia"],
    ],
    # --- text processing ---------------------------------------------------
    text_cleaner=None,
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    compute_input_seq_cache=False,
    # The custom phoneme inventory is supplied as both the character set and
    # the phoneme set.
    characters=CharactersConfig(
        pad="<PAD>",
        eos="<EOS>",
        bos="<BOS>",
        blank="<BLNK>",
        characters=phonems,
        phonemes=phonems,
        punctuations=",?.!;:'‘¡",
    ),
)

In [7]:
# Round-trip the config through its dict representation.
# NOTE(review): presumably this normalises the custom CharactersConfig into
# plain config attributes before the config is consumed below -- confirm.
config.from_dict(config.to_dict())

# Audio processor built from the audio section of the config.
ap = AudioProcessor(**config.audio.to_dict())

# Load the dataset metadata and split it into train / eval samples, using the
# split sizes carried by the config.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

# Speaker ids are taken from the saved mapping file rather than derived from
# the loaded samples (the alternative would be
# speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name")).
# The manager is constructed exactly once: previously an empty manager was
# created, loaded from this same file, and then immediately replaced by a
# second manager, so the file was read twice for no effect.
speaker_manager = SpeakerManager(speaker_id_file_path='model_config/speakers.pth')

# Fixed size of the speaker table.
# NOTE(review): presumably must match the checkpoint restored later -- confirm.
config.model_args.num_speakers = 50

2023-06-07 07:46:04,978.978 DEBUG local:  open file: /root/Documents/Audio-Encoder-Pretraining-main/voice_coder/TTS_Custom/model_config/speakers.pth
2023-06-07 07:46:04,980.980 DEBUG local:  open file: /root/Documents/Audio-Encoder-Pretraining-main/voice_coder/TTS_Custom/model_config/speakers.pth


 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
> File mallu_temp/malayalam/malayalam_male_saarthi/wavs/1.wav does not exist!
> File mallu_temp/malayalam/malayalam_male_saarthi/wavs/2.wav does not exist!
> File mallu_temp/malayalam/malayalam_male_saarthi/wavs/3.wav does not exist!
> File mallu_temp/mal

In [7]:
# Build the language manager from the training config.
language_manager = LanguageManager(config=config)
# Hard-coded number of languages for the model.
# NOTE(review): presumably has to agree with the number of entries in
# language_ids.json and with the checkpoint's language table -- confirm.
config.model_args.num_languages = 11
# Load the language id mapping from disk.
# NOTE(review): presumably this replaces whatever ids the manager derived from
# the config in its constructor -- confirm.
language_manager.load_ids_from_file('model_config/language_ids.json')
# init_from_config returns the tokenizer together with a config object, which
# is rebound to `config` here; later cells use that returned config.
tokenizer, config = TTSTokenizer.init_from_config(config)

2023-06-05 10:09:20,290.290 DEBUG local:  open file: /root/Documents/Audio-Encoder-Pretraining-main/voice_coder/TTS_Custom/model_config/language_ids.json


vocab is  ['iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧʲɚ˞ɫ']
final Vocab:  ['<PAD>', '<EOS>', '<BOS>', '<BLNK>', 'iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧʲɚ˞ɫ', '!', "'", '(', ')', ',', '-', '.', ':', ';', '?', ' ']
vocab is  ['|k', '|K', '|g', '|j', '|J', '|X', '|x', 'f', 'K', 'k', 'G', 'g', '|y', 'w', 'h', 'C', 'c', 'J', 'i', 'y', 'Y', 'V', 'q', 'x', 'X', 'N', 'S', 'r', 'T', 't', 'D', 'd', 'n~', 'n', '`I', 'I', 'P', 'p', 'B', 'b', 'm', 'z', 's', 'Aː', 'A', 'Iː', 'i', 'u', 'Uː', 'R', 'Oː', 'O', 'eː', 'e', 'Eː', 'oː', 'o', 'M', 'h', 'a', '`a', 'v', 'j', 'l', '`l']
final Vocab:  ['<PAD>', '<EOS>', '<BOS>', '<BLNK>', 'A', 'Aː', 'B', 'C', 'D', 'Eː', 'G', 'I', 'Iː', 'J', 'K', 'M', 'N', 'O', 'Oː', 'P', 'R', 'S', 'T', 'Uː', 'V', 'X', 'Y', '`I', '`a', '`l', 'a', 'b', 'c', 'd', 'e', 'eː', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'n~', 'o', 'oː', 'p',

In [8]:
# Instantiate the VITS model with the processors and managers prepared above.
model = Vits(config, ap, tokenizer, speaker_manager, language_manager)

# Restore pretrained weights.
# SECURITY NOTE: torch.load unpickles the file, which can execute arbitrary
# code -- only load checkpoints that come from a trusted source.
# The checkpoint is mapped to CPU so that this cell also works on machines
# without a GPU and so the checkpoint's tensors do not occupy GPU memory for
# as long as `cp` stays alive; load_state_dict copies the values into the
# model's own parameters regardless of the device they were loaded on.
cp = torch.load('model_config/checkpoint_1433000.pth',
                map_location=torch.device('cpu'))

# Skip every entry belonging to the speaker encoder; only the remaining
# weights are loaded into the model.
model_weights = {
    key: value
    for key, value in cp['model'].items()
    if "speaker_encoder" not in key
}

# Left as the last expression so the notebook shows the key-matching report.
model.load_state_dict(model_weights)


Number of speakers:  50
 > initialization of speaker-embedding layers.
 > initialization of language-embedding layers.


<All keys matched successfully>

In [9]:
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Iterates over ``model.parameters()`` and adds up ``numel()`` for every
    parameter whose ``requires_grad`` flag is truthy; parameters with the flag
    unset are ignored.  Returns 0 when there are no such parameters.
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

# Report how many trainable parameters the restored model has.
trainable_param_count = count_parameters(model)
print("Number of parameters ", trainable_param_count)

Number of parameters  115220477


In [None]:
# Assemble the trainer with default TrainerArgs, the training config, the
# output directory, the restored model and the train / eval sample lists.
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

# Start the training loop.
trainer.fit()