# Notes

## Setup notes
- If installing TTS package on a venv, install propper cuda enabled torch otherwise default torch will be installed, preventing cuda from being used.
- Go to "TTS\tts\layers\tortoise\arch_utils.py" replace references of LogitWarper to LogitsProcessor
- Go to "TTS\tts\models\xtts.py then to function get_compatible_checkpoint_state_dict. On line 714: checkpoint = load_fsspec(model_path, map_location=torch.device("cpu"))["model"]. Add the argument 'weights_only = False": checkpoint = load_fsspec(model_path, map_location=torch.device("cpu"), weights_only = False)["model"]


In [1]:
'''Imports'''
from trainer import Trainer, TrainerArgs
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
from TTS.utils.manage import ModelManager
import torch
import sys
import os
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
'''Display device used'''
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(device)
print(torch.version.cuda)           # PyTorch's CUDA version (linked at build time)
print(torch.cuda.is_available())    # Should be True
print(torch.cuda.get_device_name()) # Your GPU's name

cuda:0
12.4
True
NVIDIA GeForce RTX 2070


In [10]:
'''DOWNLOADS'''
# Get XTTS files
CHECKPOINT_PATH = './XTTS-files/'
os.makedirs(CHECKPOINT_PATH, exist_ok=True)

# DVAE files
DVAE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"

# Set the path to the downloaded files
DVAE_CHECKPOINT = os.path.join(CHECKPOINT_PATH, os.path.basename(DVAE_LINK))
MEL_NORM_FILE = os.path.join(CHECKPOINT_PATH, os.path.basename(MEL_NORM_LINK))

# DVAE download if not exists
if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
    print(" > Downloading DVAE files!")
    ModelManager._download_model_files([MEL_NORM_LINK, DVAE_LINK], CHECKPOINT_PATH, progress_bar=True)

# XTTS v2.0 checkpoint
TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"

# Transfer learning parameters. NOTE: Sets base model to use
TOKENIZER_FILE = os.path.join(CHECKPOINT_PATH, os.path.basename(TOKENIZER_FILE_LINK))  # vocab.json
XTTS_CHECKPOINT = os.path.join(CHECKPOINT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK))  # model.pth

# XTTS v2.0 download if not exists
if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
    print(" > Downloading XTTS v2.0 files!")
    ModelManager._download_model_files(
        [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK], CHECKPOINT_PATH, progress_bar=True
    )
print("Paths set.")

Paths set.


In [11]:
'''DATA LOADING'''
# Set lang
LANGUAGE ='en'
# Set to folder name that contains metadata.csv and wavs dir (with the .wav examples)
DATASET= "example_1"
training_dir = f'./datasets/{DATASET}/' # change to folder w/ training examples

# Dataset uses ljspeech format
dataset_config = BaseDatasetConfig(
    formatter="ljspeech",
    meta_file_train="metadata.csv", # metadata file w/ transcriptions
    language=LANGUAGE,
    path=training_dir
)

# Turn off eval split. Will evaluate manually
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_size=0.02 # Might change
)


'''MODIFY'''
# Audio config
audio_config = XttsAudioConfig(sample_rate=16000, dvae_sample_rate=16000, output_sample_rate=24000) 

# Speaker Reference: Match theses to the test sentences
SPEAKER_TEXT = [
    "for a hundred yards, then, curving, was lost to view. Doubtless there was an outpost farther along. The other bank of the stream was open ground.",
    "A gentle acclivity, topped with a stockade of vertical tree trunks, loopholed for rifles, with a single embrasure through, which protruded the muzzle of a brass cannon,",
    "The lady had now brought the water, which the soldier drank. He thanked her ceremoniously, bowed to her husband, and rode away. An hour later, after nightfall,"


]
SPEAKER_REFERENCE = [
    F"datasets/{DATASET}/wavs/chunk_0009.wav",
    F"datasets/{DATASET}/wavs/chunk_0010.wav",
    F"datasets/{DATASET}/wavs/chunk_0054.wav"
]

 | > Found 140 files in C:\Users\12017\Desktop\NJIT\DS677_852_Project\src\datasets\example_1


In [12]:
'''Set Model arguments'''
model_args = GPTArgs(
    max_conditioning_length=143677, # Audio used for conditioning latents should be less than this 
    min_conditioning_length=66150, # and more than this
    debug_loading_failures=True,
    max_wav_length=255995, # Set >= longest audio in dataset  
    max_text_length=200, 
    mel_norm_file=MEL_NORM_FILE,
    dvae_checkpoint=DVAE_CHECKPOINT,
    xtts_checkpoint=XTTS_CHECKPOINT,  
    tokenizer_file=TOKENIZER_FILE,
    gpt_num_audio_tokens=1026, 
    gpt_start_audio_token=1024,
    gpt_stop_audio_token=1025,
    gpt_use_masking_gt_prompt_approach=True,
    gpt_use_perceiver_resampler=True,
)

In [13]:
'''Set up configuration file'''
'''TRAINING CONFIG'''
OUT_PATH = './training_outputs/'

RUN_NAME = 'owl_speech_test'
# RUN_NAME = f"xttsv2_finetune_{datetime.now().strftime('%Y%m%d_%H%M')}"
PROJECT_NAME = 'Owl Finetune'
# DASHBOARD_LOGGER = 'wandb'
LOGGER_URI = None

OPTIMIZER_WD_ONLY_ON_WEIGHTS = True  

BATCH_SIZE = 3 # 4 is common

config = GPTTrainerConfig(
    run_eval=True,
    epochs = 4, # assuming you want to end training manually w/ keyboard interrupt
    output_path=OUT_PATH,
    model_args=model_args,
    run_name=RUN_NAME,
    project_name=PROJECT_NAME,
    run_description="""
        GPT XTTS training
        """,
    # dashboard_logger=DASHBOARD_LOGGER,
    wandb_entity=None,
    # logger_uri=LOGGER_URI,
    audio=audio_config,
    batch_size=BATCH_SIZE,
    batch_group_size=48,
    eval_batch_size=BATCH_SIZE,
    num_loader_workers=0, # On Windows, num_loader_workers > 0 can break multiprocessing in PyTorch
    eval_split_max_size=256, 
    print_step=50, 
    plot_step=100, 
    log_model_step=1000, 
    save_step=1000, # Needs to be an int
    save_n_checkpoints=3, # Rotate last 3 checkpoints
    save_checkpoints=True,
    print_eval=True,
    optimizer="AdamW",
    optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
    optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
    lr=5e-06,  
    lr_scheduler="MultiStepLR",
    lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
    test_sentences=[ 
        {
            "text": SPEAKER_TEXT[0],
            "speaker_wav": SPEAKER_REFERENCE[0], 
            "language": LANGUAGE,
        },
        {
            "text": SPEAKER_TEXT[1],
            "speaker_wav": SPEAKER_REFERENCE[1],
            "language": LANGUAGE,
        },
        {
            "text": SPEAKER_TEXT[2],
            "speaker_wav": SPEAKER_REFERENCE[2],
            "language": LANGUAGE,
        }
        
    ],
) 

In [14]:
'''Set up Trainer'''
# Init model 
model = GPTTrainer.init_from_config(config)

# Init Trainer
GRAD_ACUMM_STEPS = 252
START_WITH_EVAL = True  

trainer = Trainer(
    TrainerArgs(
        restore_path=None, # Change to model path if resuming
        skip_train_epoch=False,
        start_with_eval=START_WITH_EVAL,
        grad_accum_steps=GRAD_ACUMM_STEPS,
    ),
    config,
    output_path=OUT_PATH,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: False
 | > Precision: float32
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 16
 | > Num. of Torch Threads: 1
 | > Torch seed: 1
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=./training_outputs/owl_speech_test-April-12-2025_04+01PM-dd3b630


>> DVAE weights restored from: ./XTTS-files/dvae.pth



 > Model has 518442047 parameters


In [15]:
'''TRAINING: manual interupts will set model to output saves at given checkpoints'''
trainer.fit()


[4m[1m > EPOCH: 0/4[0m
 --> ./training_outputs/owl_speech_test-April-12-2025_04+01PM-dd3b630

[1m > EVALUATION [0m



 > Filtering invalid eval samples!!
 > Total eval samples after filtering: 2


[1m   --> STEP: 0[0m
     | > loss_text_ce: 0.024175841361284256  (0.024175841361284256)
     | > loss_mel_ce: 4.230667591094971  (4.230667591094971)
     | > loss: 4.254843235015869  (4.254843235015869)

GPT2InferenceModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected 

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time: 0.0937800407409668 [0m(+0)
     | > avg_loss_text_ce: 0.024175841361284256 [0m(+0)
     | > avg_loss_mel_ce: 4.230667591094971 [0m(+0)
     | > avg_loss: 4.254843235015869 [0m(+0)


[4m[1m > EPOCH: 1/4[0m
 --> ./training_outputs/owl_speech_test-April-12-2025_04+01PM-dd3b630

[1m > TRAINING (2025-04-12 16:01:49) [0m


 > Sampling by language: dict_keys(['en'])



[1m   --> TIME: 2025-04-12 16:01:54 -- STEP: 0/46 -- GLOBAL_STEP: 0[0m
     | > loss_text_ce: 0.023332759737968445  (0.023332759737968445)
     | > loss_mel_ce: 4.199986934661865  (4.199986934661865)
     | > loss: 0.016759205609560013  (0.016759205609560013)
     | > current_lr: 5e-06 
     | > step_time: 5.4952  (5.495172739028931)
     | > loader_time: 0.1356  (0.13561296463012695)


[1m > EVALUATION [0m

[1m   --> STEP: 0[0m
     | > loss_text_ce: 0.024010371416807175  (0.024010371416807175)
     | > loss_mel_ce: 4.11047887802124  (4.11047887802124)
     | > loss: 4.134489059448242  (4.134489059448242)



 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.0628664493560791 [0m(-0.030913591384887695)
     | > avg_loss_text_ce:[92m 0.024010371416807175 [0m(-0.0001654699444770813)
     | > avg_loss_mel_ce:[92m 4.11047887802124 [0m(-0.12018871307373047)
     | > avg_loss:[92m 4.134489059448242 [0m(-0.12035417556762695)

 > BEST MODEL : ./training_outputs/owl_speech_test-April-12-2025_04+01PM-dd3b630\best_model_46.pth

[4m[1m > EPOCH: 2/4[0m
 --> ./training_outputs/owl_speech_test-April-12-2025_04+01PM-dd3b630

[1m > TRAINING (2025-04-12 16:10:18) [0m

[1m   --> TIME: 2025-04-12 16:11:13 -- STEP: 4/46 -- GLOBAL_STEP: 50[0m
     | > loss_text_ce: 0.023366305977106094  (0.023232044652104378)
     | > loss_mel_ce: 4.010820388793945  (4.0120755434036255)
     | > loss: 0.016008678823709488  (0.01601312612183392)
     | > current_lr: 5e-06 
     | > step_time: 2.5745  (2.7844334840774536)
     | > loader_time: 0.0953  (0.0937570333480835)



error loading ./datasets/example_1/wavs\﻿chunk_0000.wav (<class 'soundfile.LibsndfileError'>, LibsndfileError(2, "Error opening './datasets/example_1/wavs\\\\\\ufeffchunk_0000.wav': "), <traceback object at 0x00000261B5CD8B80>)



[1m > EVALUATION [0m

[1m   --> STEP: 0[0m
     | > loss_text_ce: 0.023904411122202873  (0.023904411122202873)
     | > loss_mel_ce: 4.023954391479492  (4.023954391479492)
     | > loss: 4.047858715057373  (4.047858715057373)



 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.06782031059265137 [0m(+0.004953861236572266)
     | > avg_loss_text_ce:[92m 0.023904411122202873 [0m(-0.00010596029460430145)
     | > avg_loss_mel_ce:[92m 4.023954391479492 [0m(-0.08652448654174805)
     | > avg_loss:[92m 4.047858715057373 [0m(-0.08663034439086914)

 > BEST MODEL : ./training_outputs/owl_speech_test-April-12-2025_04+01PM-dd3b630\best_model_92.pth

[4m[1m > EPOCH: 3/4[0m
 --> ./training_outputs/owl_speech_test-April-12-2025_04+01PM-dd3b630

[1m > TRAINING (2025-04-12 16:21:05) [0m

[1m   --> TIME: 2025-04-12 16:23:19 -- STEP: 8/46 -- GLOBAL_STEP: 100[0m
     | > loss_text_ce: 0.02607664465904236  (0.023935667937621474)
     | > loss_mel_ce: 4.002424716949463  (4.033668577671051)
     | > loss: 0.015986118465662003  (0.01610160490963608)
     | > current_lr: 5e-06 
     | > step_time: 2.5733  (4.16097429394722)
     | > loader_time: 0.0908  (0.09121331572532654)


[1m > EVALUATION [0m

[1m

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.07081103324890137 [0m(+0.00299072265625)
     | > avg_loss_text_ce:[92m 0.023824071511626244 [0m(-8.033961057662964e-05)
     | > avg_loss_mel_ce:[92m 3.9446892738342285 [0m(-0.07926511764526367)
     | > avg_loss:[92m 3.968513250350952 [0m(-0.0793454647064209)

 > BEST MODEL : ./training_outputs/owl_speech_test-April-12-2025_04+01PM-dd3b630\best_model_138.pth
