In [1]:
import os

from trainer import Trainer, TrainerArgs

from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import Vits, VitsAudioConfig
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from multiprocessing import freeze_support

def resolve_run_paths(base_dir=None):
    """Return the output and dataset directories used by this recipe.

    Args:
        base_dir: Directory that contains the ``outputs`` and ``dataset``
            folders. Defaults to the current working directory.

    Returns:
        An ``(output_path, dataset_path)`` tuple of absolute, normalised
        paths (no ``..`` segments, no trailing separator).
    """
    root = os.path.abspath(base_dir or os.curdir)
    return os.path.join(root, "outputs"), os.path.join(root, "dataset")


if __name__ == '__main__':
    # Has no effect unless the program runs as a frozen Windows executable.
    freeze_support()

    # Run artifacts go to ./outputs; the LJSpeech-formatted corpus is read
    # from the sibling ./dataset directory.
    output_path, dataset_path = resolve_run_paths()

    dataset_config = BaseDatasetConfig(
        formatter="ljspeech",
        meta_file_train="metadata.csv",
        path=dataset_path,
    )
    audio_config = VitsAudioConfig(
        sample_rate=22050,
        win_length=1024,
        hop_length=256,
        num_mels=80,
        mel_fmin=0,
        mel_fmax=None,
    )

    config = VitsConfig(
        audio=audio_config,
        run_name="vits_ljspeech",
        batch_size=1,
        eval_batch_size=16,
        batch_group_size=1,
        num_loader_workers=2,
        num_eval_loader_workers=4,
        run_eval=True,
        test_delay_epochs=-1,
        epochs=1,
        text_cleaner="english_cleaners",
        use_phonemes=True,
        phoneme_language="en-us",
        phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
        compute_input_seq_cache=True,
        print_step=25,
        print_eval=True,
        mixed_precision=False,
        output_path=output_path,
        datasets=[dataset_config],
        cudnn_benchmark=False,
    )

    # INITIALIZE THE AUDIO PROCESSOR
    # The audio processor handles feature extraction and audio I/O.
    # It mainly serves the dataloader and the training loggers.
    ap = AudioProcessor.init_from_config(config)

    # INITIALIZE THE TOKENIZER
    # The tokenizer converts text to sequences of token IDs.
    # `config` is updated with the default characters if none are defined.
    tokenizer, config = TTSTokenizer.init_from_config(config)

    # LOAD DATA SAMPLES
    # Each sample is a list of `[text, audio_file_path, speaker_name]`.
    # You can define a custom sample loader returning the list of samples,
    # or define a custom formatter and pass it to `load_tts_samples`.
    # Check `TTS.tts.datasets.load_tts_samples` for more details.
    train_samples, eval_samples = load_tts_samples(
        dataset_config,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
    )

    # Init the model (no speaker manager is passed).
    model = Vits(config, ap, tokenizer, speaker_manager=None)

    # Init the trainer.
    trainer = Trainer(
        TrainerArgs(),
        config,
        output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
    )

    # NOTE(review): the captured Windows run ended with
    # "PermissionError: [WinError 32]" on trainer_0_log.txt shortly after
    # step 0. Presumably the run directory was being removed while the
    # trainer's log file was still open, which would mean an earlier failure
    # is the real cause -- confirm against the full traceback.
    trainer.fit()

    # Write a final checkpoint once training has returned.
    trainer.save_checkpoint()

 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 | > Found 278 files in C:\TextToSpeech\TTS\rory\dataset


 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: False
 | > Precision: float32
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 20
 | > Num. of Torch Threads: 14
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=c:\TextToSpeech\TTS\rory\outputs\vits_ljspeech-June-01-2024_08+24PM-f1b64b72
  from .autonotebook import tqdm as notebook_tqdm

 > Model has 83059180 parameters

[4m[1m > EPOCH: 0/1[0m
 --> c:\TextToSpeech\TTS\rory\outputs\vits_ljspeech-June-01-2024_08+24PM-f1b64b72

[1m > TRAINING (2024-06-01 20:24:49) [0m




> DataLoader initialization
| > Tokenizer:
	| > add_blank: True
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
| > Number of instances : 276
 | > Preprocessing samples
 | > Max text length: 174
 | > Min text length: 18
 | > Avg text length: 102.9963768115942
 | 
 | > Max audio length: 222387.0
 | > Min audio length: 32691.0
 | > Avg audio length: 143258.884057971
 | > Num. instances discarded samples: 0
 | > Batch group size: 1.


Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\native\SpectralOps.cpp:878.)
  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]
  return F.conv1d(input, weight, bias, self.stride,
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass

[1m   --> TIME: 2024-06-01 20:25:08 -- STEP: 0/276 -- GLOBAL_STEP: 0[0m
     | > loss_disc: 6.085947036743164  (6.085947036743164)
     | > loss_disc_real_0: 1.0116182565689087  (1.0116182565689087)
     | > loss_disc_real_1: 1.0209587812423706  (1.0209587812423706)
     | > loss_disc_real_2: 1.0361289978027344  (1.0361289978027344)
     | > loss_disc_real_3: 0.9905913472175598  (0.9905913472175598)
     | > loss_disc_real_4: 1.0160038471221924  (1.0160038471221924)
     | > loss_disc_real_5: 1.010072827339172

PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'c:/TextToSpeech/TTS/rory/outputs/vits_ljspeech-June-01-2024_08+24PM-f1b64b72\\trainer_0_log.txt'