# Notes

## Setup notes
- If installing the TTS package in a venv, install the proper CUDA-enabled torch; otherwise the default torch will be installed, preventing CUDA from being used.
- Go to "TTS\tts\layers\tortoise\arch_utils.py" and replace references to LogitsWarper with LogitsProcessor.
- Go to "TTS\tts\models\xtts.py", then to the function get_compatible_checkpoint_state_dict. Line 714 reads: checkpoint = load_fsspec(model_path, map_location=torch.device("cpu"))["model"]. Add the argument "weights_only=False" so that it becomes: checkpoint = load_fsspec(model_path, map_location=torch.device("cpu"), weights_only=False)["model"]


In [1]:
'''Imports'''
from trainer import Trainer, TrainerArgs
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
from TTS.utils.manage import ModelManager
import torch
import sys
import os
from datetime import datetime
import wandb
from trainer.logging.wandb_logger import WandbLogger

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
'''Display device used'''
# Pick the first CUDA GPU when torch can see one; otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = "cuda:0" if cuda_available else "cpu"
print(device)
print(torch.version.cuda)  # CUDA version this torch build targets (None on CPU-only builds)
print(cuda_available)
# get_device_name() raises when no CUDA device is usable, so only query it on
# the GPU path; the CPU fallback chosen above must not crash this cell.
print(torch.cuda.get_device_name() if cuda_available else "No CUDA device available")

cuda:0
12.4
True
NVIDIA GeForce RTX 2070


In [3]:
'''DOWNLOADS'''
# Local folder that holds every pretrained XTTS artifact used below
CHECKPOINT_PATH = './XTTS-files/'
os.makedirs(CHECKPOINT_PATH, exist_ok=True)

# Remote locations of the DVAE weights and the mel-normalisation statistics
DVAE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"

# Local targets: the URL's file name placed under CHECKPOINT_PATH
DVAE_CHECKPOINT = os.path.join(CHECKPOINT_PATH, os.path.basename(DVAE_LINK))
MEL_NORM_FILE = os.path.join(CHECKPOINT_PATH, os.path.basename(MEL_NORM_LINK))

# Request both DVAE files again unless the complete pair is already on disk
dvae_urls = [MEL_NORM_LINK, DVAE_LINK]
if not (os.path.isfile(DVAE_CHECKPOINT) and os.path.isfile(MEL_NORM_FILE)):
    print(" > Downloading DVAE files!")
    ModelManager._download_model_files(dvae_urls, CHECKPOINT_PATH, progress_bar=True)

# Remote locations of the XTTS v2.0 tokenizer vocabulary and model checkpoint
TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"

# Transfer-learning inputs. NOTE: these two paths select the base model to fine-tune
TOKENIZER_FILE = os.path.join(CHECKPOINT_PATH, os.path.basename(TOKENIZER_FILE_LINK))  # vocab.json
XTTS_CHECKPOINT = os.path.join(CHECKPOINT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK))  # model.pth

# Same rule as for the DVAE pair: download both unless both are already present
xtts_urls = [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK]
if not (os.path.isfile(TOKENIZER_FILE) and os.path.isfile(XTTS_CHECKPOINT)):
    print(" > Downloading XTTS v2.0 files!")
    ModelManager._download_model_files(xtts_urls, CHECKPOINT_PATH, progress_bar=True)
print("Paths set.")

Paths set.


In [5]:
'''DATA LOADING'''
# Language code attached to the dataset and to the test sentences
LANGUAGE = 'en'
# Folder name under ./datasets/ that contains metadata.csv and the wavs dir (with the .wav examples)
DATASET = "noramlized_personal_voice"
training_dir = f'./datasets/{DATASET}/'  # point this at the folder with the training examples

# The dataset is described in ljspeech format
dataset_config = BaseDatasetConfig(
    path=training_dir,
    language=LANGUAGE,
    formatter="ljspeech",
    meta_file_train="metadata.csv",  # metadata file with the transcriptions
)

# Load the samples with the evaluation split enabled (eval_split=True)
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split_size=0.02,  # might change
    eval_split=True,
)


'''MODIFY'''
# Audio settings handed to the trainer configuration
audio_config = XttsAudioConfig(
    sample_rate=22050,
    dvae_sample_rate=22050,
    output_sample_rate=24000,
)

# Sentences synthesised as test outputs during training.
# A single speaker reference clip is used for all of them; the clip does not
# need to contain the same words as the sentences.
SPEAKER_TEXT = [
    "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
    "This cake is great. It's so delicious and moist.",
]
SPEAKER_REFERENCE = f"datasets/{DATASET}/wavs/chunk_0016.wav"


 | > Found 703 files in C:\Users\12017\Desktop\NJIT\DS677_852_Project\src\datasets\noramlized_personal_voice


In [6]:
'''Set Model arguments'''
# Audio lengths below are expressed in samples: 22050 samples == 1 second at
# the 22050 Hz rate this notebook passes to XttsAudioConfig(sample_rate=22050).
SAMPLES_PER_SECOND = 22050

model_args = GPTArgs(
    max_conditioning_length=SAMPLES_PER_SECOND * 11,  # reference (conditioning) audio: at most 11 seconds
    min_conditioning_length=SAMPLES_PER_SECOND * 3,  # reference (conditioning) audio: at least 3 seconds
    debug_loading_failures=True,  # presumably the source of the "error loading ..." lines in the training log; confirm
    max_wav_length=SAMPLES_PER_SECOND * 11,  # training clips: at most 11 seconds
    # Text length is a limit on the tokenised text, not on audio samples. The
    # previous value (22050*3) was an audio-length figure and effectively
    # disabled this limit; 200 is the value used by the Coqui XTTS v2
    # fine-tuning recipe.
    max_text_length=200,
    mel_norm_file=MEL_NORM_FILE,
    dvae_checkpoint=DVAE_CHECKPOINT,
    xtts_checkpoint=XTTS_CHECKPOINT,  # base model to fine-tune from
    tokenizer_file=TOKENIZER_FILE,
    gpt_num_audio_tokens=1026,
    gpt_start_audio_token=1024,
    gpt_stop_audio_token=1025,
    gpt_use_masking_gt_prompt_approach=True,
    gpt_use_perceiver_resampler=True,
)

In [7]:
'''Set up configuration file'''
'''TRAINING CONFIG'''
# Root folder for everything the trainer writes (checkpoints, logs)
OUT_PATH = './training_outputs/'

# Timestamped run name so repeated runs do not collide
RUN_NAME = f"xttsv2_finetune_{datetime.now().strftime('%Y%m%d_%H%M')}"
PROJECT_NAME = "XTTS-v2 Finetune"
DASHBOARD_LOGGER = 'wandb'
LOGGER_URI = None

OPTIMIZER_WD_ONLY_ON_WEIGHTS = True

BATCH_SIZE = 3  # 4 is common

config = GPTTrainerConfig(
    run_eval=True,
    epochs=10,  # upper bound; training can also be ended manually with a keyboard interrupt
    output_path=OUT_PATH,
    model_args=model_args,
    run_name=RUN_NAME,
    project_name=PROJECT_NAME,
    run_description="""
        GPT XTTS training
        """,
    dashboard_logger=DASHBOARD_LOGGER,
    wandb_entity=None,
    logger_uri=LOGGER_URI,
    audio=audio_config,
    batch_size=BATCH_SIZE,
    batch_group_size=48,
    eval_batch_size=BATCH_SIZE,
    num_loader_workers=0,  # On Windows, num_loader_workers > 0 can break multiprocessing in PyTorch
    eval_split_max_size=256,
    print_step=50,
    plot_step=100,
    log_model_step=1000,
    save_step=1000,  # Needs to be an int
    save_n_checkpoints=3,  # Rotate last 3 checkpoints
    save_checkpoints=True,
    print_eval=True,
    optimizer="AdamW",
    optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
    optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
    lr=5e-06,
    lr_scheduler="MultiStepLR",
    # The first milestone (50000 * 18) is far larger than the epoch count above,
    # so the learning rate is expected to stay at `lr` for a run of this length.
    lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
    # One test sentence per entry of SPEAKER_TEXT, all sharing the same speaker
    # reference clip and language; adding or removing sentences in SPEAKER_TEXT
    # no longer requires editing this list.
    test_sentences=[
        {
            "text": sentence,
            "speaker_wav": SPEAKER_REFERENCE,
            "language": LANGUAGE,
        }
        for sentence in SPEAKER_TEXT
    ],
)

In [8]:
'''Set up Trainer'''
# Build the model to train from the training configuration
model = GPTTrainer.init_from_config(config)

# Gradient accumulation is derived from BATCH_SIZE so that
# GRAD_ACUMM_STEPS * BATCH_SIZE stays at (or just above) 252 even when
# BATCH_SIZE is changed; with BATCH_SIZE = 3 this evaluates to 84.
TARGET_EFFECTIVE_BATCH_SIZE = 252
GRAD_ACUMM_STEPS = max(1, -(-TARGET_EFFECTIVE_BATCH_SIZE // BATCH_SIZE))  # ceiling division
START_WITH_EVAL = True  # run an evaluation pass before the first training epoch

trainer = Trainer(
    TrainerArgs(
        restore_path=None,  # Change to model path if resuming
        skip_train_epoch=False,
        start_with_eval=START_WITH_EVAL,
        grad_accum_steps=GRAD_ACUMM_STEPS,
    ),
    config,
    output_path=OUT_PATH,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: False
 | > Precision: float32
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 16
 | > Num. of Torch Threads: 1
 | > Torch seed: 1
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


>> DVAE weights restored from: ./XTTS-files/dvae.pth


[34m[1mwandb[0m: Currently logged in as: [33msliverwall[0m ([33msliverwall-new-jersey-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin



 > Model has 518442047 parameters


In [9]:
'''TRAINING: manual interupts will set model to output saves at given checkpoints'''
# Start the training run on the `trainer` object built in the previous cell.
# NOTE(review): interrupt handling is implemented inside Trainer.fit(), not here;
# presumably a keyboard interrupt stops the run and leaves the checkpoints that
# were already written in the output folder. Confirm against the trainer library.
trainer.fit()


[4m[1m > EPOCH: 0/10[0m
 --> ./training_outputs/xttsv2_finetune_20250504_1250-May-04-2025_12+50PM-ca1939c

[1m > EVALUATION [0m



 > Filtering invalid eval samples!!
 > Total eval samples after filtering: 14


[1m   --> STEP: 0[0m
     | > loss_text_ce: 0.021762222051620483  (0.021762222051620483)
     | > loss_mel_ce: 3.8056249618530273  (3.8056249618530273)
     | > loss: 3.8273870944976807  (3.8273870944976807)

[1m   --> STEP: 1[0m
     | > loss_text_ce: 0.021934326738119125  (0.021934326738119125)
     | > loss_mel_ce: 3.791006326675415  (3.791006326675415)
     | > loss: 3.8129405975341797  (3.8129405975341797)

[1m   --> STEP: 2[0m
     | > loss_text_ce: 0.02519591525197029  (0.023565120995044708)
     | > loss_mel_ce: 4.018796443939209  (3.904901385307312)
     | > loss: 4.043992519378662  (3.928466558456421)

[1m   --> STEP: 3[0m
     | > loss_text_ce: 0.021754387766122818  (0.022961543252070744)
     | > loss_mel_ce: 3.4101994037628174  (3.7400007247924805)
     | > loss: 3.4319539070129395  (3.7629623413085938)

[1m   --> STEP: 4[0m
     | > loss_text_ce: 0.0275751780718565  (0.024114951957017183)
     | > loss_mel_ce: 4.1670823097229  (3.8467711210250854)
     | > loss:

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time: 0.0227736234664917 [0m(+0)
     | > avg_loss_text_ce: 0.024114951957017183 [0m(+0)
     | > avg_loss_mel_ce: 3.8467711210250854 [0m(+0)
     | > avg_loss: 3.8708860874176025 [0m(+0)


[4m[1m > EPOCH: 1/10[0m
 --> ./training_outputs/xttsv2_finetune_20250504_1250-May-04-2025_12+50PM-ca1939c

[1m > TRAINING (2025-05-04 12:50:30) [0m


 > Sampling by language: dict_keys(['en'])



[1m   --> TIME: 2025-05-04 12:50:36 -- STEP: 0/230 -- GLOBAL_STEP: 0[0m
     | > loss_text_ce: 0.02736741304397583  (0.02736741304397583)
     | > loss_mel_ce: 3.9518797397613525  (3.9518797397613525)
     | > loss: 0.047371990978717804  (0.047371990978717804)
     | > current_lr: 5e-06 
     | > step_time: 6.2011  (6.2011120319366455)
     | > loader_time: 0.0203  (0.020256996154785156)

[34m[1mwandb[0m: Adding directory to artifact (.\training_outputs\xttsv2_finetune_20250504_1250-May-04-2025_12+50PM-ca1939c)... Done. 0.0s

[1m   --> TIME: 2025-05-04 12:57:19 -- STEP: 50/230 -- GLOBAL_STEP: 50[0m
     | > loss_text_ce: 0.02770291455090046  (0.025397119112312794)
     | > loss_mel_ce: 3.9957215785980225  (3.9501606369018556)
     | > loss: 0.04789791256189346  (0.04732806928455831)
     | > current_lr: 5e-06 
     | > step_time: 2.3159  (2.897572021484375)
     | > loader_time: 0.0254  (0.026508593559265138)


[1m   --> TIME: 2025-05-04 13:05:28 -- STEP: 100/230 -- GLOBAL_STE

error loading ./datasets/noramlized_personal_voice/wavs\chunk_0365.wav (<class 'AssertionError'>, AssertionError('UNK token found in  Another useful contribution is that deep player functions are very fast as many key operations are coded in C++.\n -> [en]another useful contribution is that deep player functions are very fast as many key operations are coded in c.'), <traceback object at 0x00000202CE227880>)



[1m   --> TIME: 2025-05-04 13:23:41 -- STEP: 200/230 -- GLOBAL_STEP: 200[0m
     | > loss_text_ce: 0.02432379499077797  (0.025138790719211112)
     | > loss_mel_ce: 4.0928053855896  (3.873010665178299)
     | > loss: 0.049013447016477585  (0.0464065419882536)
     | > current_lr: 5e-06 
     | > step_time: 10.0056  (7.49384029507637)
     | > loader_time: 0.0295  (0.030196043252944945)



Ignoring sample ./datasets/noramlized_personal_voice/wavs\chunk_0365.wav because it was already ignored before !!



[1m > EVALUATION [0m

[1m   --> STEP: 0[0m
     | > loss_text_ce: 0.02171185426414013  (0.02171185426414013)
     | > loss_mel_ce: 3.6262032985687256  (3.6262032985687256)
     | > loss: 3.6479151248931885  (3.6479151248931885)

[1m   --> STEP: 1[0m
     | > loss_text_ce: 0.021888677030801773  (0.021888677030801773)
     | > loss_mel_ce: 3.6042394638061523  (3.6042394638061523)
     | > loss: 3.6261281967163086  (3.6261281967163086)

[1m   --> STEP: 2[0m
     | > loss_text_ce: 0.02515198476612568  (0.023520330898463726)
     | > loss_mel_ce: 3.7666802406311035  (3.685459852218628)
     | > loss: 3.791832208633423  (3.7089802026748657)

[1m   --> STEP: 3[0m
     | > loss_text_ce: 0.02175145037472248  (0.022930704057216644)
     | > loss_mel_ce: 3.1950371265411377  (3.5219856103261313)
     | > loss: 3.2167885303497314  (3.544916311899821)

[1m   --> STEP: 4[0m
     | > loss_text_ce: 0.02749786339700222  (0.024072493892163038)
     | > loss_mel_ce: 3.901393413543701  (3.6168

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.01312243938446045 [0m(-0.00965118408203125)
     | > avg_loss_text_ce:[92m 0.024072493892163038 [0m(-4.245806485414505e-05)
     | > avg_loss_mel_ce:[92m 3.6168375611305237 [0m(-0.22993355989456177)
     | > avg_loss:[92m 3.640910029411316 [0m(-0.22997605800628662)

 > BEST MODEL : ./training_outputs/xttsv2_finetune_20250504_1250-May-04-2025_12+50PM-ca1939c\best_model_230.pth

[4m[1m > EPOCH: 2/10[0m
 --> ./training_outputs/xttsv2_finetune_20250504_1250-May-04-2025_12+50PM-ca1939c

[1m > TRAINING (2025-05-04 13:29:09) [0m

[1m   --> TIME: 2025-05-04 13:33:11 -- STEP: 20/230 -- GLOBAL_STEP: 250[0m
     | > loss_text_ce: 0.024602022022008896  (0.02593605639412999)
     | > loss_mel_ce: 3.70546817779541  (3.766446256637573)
     | > loss: 0.04440559819340706  (0.045147409290075304)
     | > current_lr: 5e-06 
     | > step_time: 3.2492  (3.0704188108444215)
     | > loader_time: 0.0339  (0.020787596702575684)



error loading ./datasets/noramlized_personal_voice/wavs\chunk_0658.wav (<class 'AssertionError'>, AssertionError('UNK token found in  You can see that the algorithm converged and obtained an estimate of μ and σ. We can also try to\n -> [en]you can see that the algorithm converged and obtained an estimate of μ and . we can also try to'), <traceback object at 0x00000202CE227740>)



[1m   --> TIME: 2025-05-04 14:01:46 -- STEP: 170/230 -- GLOBAL_STEP: 400[0m
     | > loss_text_ce: 0.027420639991760254  (0.02499215723398853)
     | > loss_mel_ce: 3.7945311069488525  (3.696958381989423)
     | > loss: 0.04549942910671234  (0.04430893571937785)
     | > current_lr: 5e-06 
     | > step_time: 2.6851  (3.8320623173433197)
     | > loader_time: 0.0311  (0.019707243582781624)


[1m   --> TIME: 2025-05-04 14:11:31 -- STEP: 220/230 -- GLOBAL_STEP: 450[0m
     | > loss_text_ce: 0.02541688084602356  (0.025006731362505388)
     | > loss_mel_ce: 3.582221269607544  (3.688327723199671)
     | > loss: 0.042948074638843536  (0.04420636330138554)
     | > current_lr: 5e-06 
     | > step_time: 2.9321  (3.995974132147703)
     | > loader_time: 0.015  (0.020250595699657092)


[1m > EVALUATION [0m

[1m   --> STEP: 0[0m
     | > loss_text_ce: 0.02164589613676071  (0.02164589613676071)
     | > loss_mel_ce: 3.5339882373809814  (3.5339882373809814)
     | > loss: 3.55563402175903

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.015694618225097656 [0m(+0.002572178840637207)
     | > avg_loss_text_ce:[92m 0.024018140975385904 [0m(-5.435291677713394e-05)
     | > avg_loss_mel_ce:[92m 3.5193036794662476 [0m(-0.09753388166427612)
     | > avg_loss:[92m 3.5433218479156494 [0m(-0.0975881814956665)

 > BEST MODEL : ./training_outputs/xttsv2_finetune_20250504_1250-May-04-2025_12+50PM-ca1939c\best_model_460.pth

[4m[1m > EPOCH: 3/10[0m
 --> ./training_outputs/xttsv2_finetune_20250504_1250-May-04-2025_12+50PM-ca1939c

[1m > TRAINING (2025-05-04 14:13:40) [0m

[1m   --> TIME: 2025-05-04 14:20:47 -- STEP: 40/230 -- GLOBAL_STEP: 500[0m
     | > loss_text_ce: 0.026652071624994278  (0.024855676898732783)
     | > loss_mel_ce: 3.651787519454956  (3.6210572481155396)
     | > loss: 0.043790947645902634  (0.04340372625738381)
     | > current_lr: 5e-06 
     | > step_time: 2.8766  (2.786069840192795)
     | > loader_time: 0.017  (0.015826195478439335

Ignoring sample ./datasets/noramlized_personal_voice/wavs\chunk_0658.wav because it was already ignored before !!



[1m   --> TIME: 2025-05-04 14:38:20 -- STEP: 140/230 -- GLOBAL_STEP: 600[0m
     | > loss_text_ce: 0.028105083853006363  (0.02516503373959235)
     | > loss_mel_ce: 3.5037200450897217  (3.637849572726658)
     | > loss: 0.04204553738236427  (0.04360731744340488)
     | > current_lr: 5e-06 
     | > step_time: 3.0286  (2.906244703701564)
     | > loader_time: 0.0129  (0.015637636184692383)


[1m   --> TIME: 2025-05-04 14:48:09 -- STEP: 190/230 -- GLOBAL_STEP: 650[0m
     | > loss_text_ce: 0.02202760800719261  (0.025075115959503146)
     | > loss_mel_ce: 3.9938790798187256  (3.632510040935717)
     | > loss: 0.047808416187763214  (0.043542681242290306)
     | > current_lr: 5e-06 
     | > step_time: 3.1356  (3.0220538227181675)
     | > loader_time: 0.0145  (0.015413416059393632)


[1m > EVALUATION [0m

[1m   --> STEP: 0[0m
     | > loss_text_ce: 0.021546468138694763  (0.021546468138694763)
     | > loss_mel_ce: 3.474316120147705  (3.474316120147705)
     | > loss: 3.49586248397

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.015017509460449219 [0m(-0.0006771087646484375)
     | > avg_loss_text_ce:[92m 0.02394903264939785 [0m(-6.910832598805428e-05)
     | > avg_loss_mel_ce:[92m 3.4690057039260864 [0m(-0.05029797554016113)
     | > avg_loss:[92m 3.492954730987549 [0m(-0.050367116928100586)

 > BEST MODEL : ./training_outputs/xttsv2_finetune_20250504_1250-May-04-2025_12+50PM-ca1939c\best_model_690.pth

[4m[1m > EPOCH: 4/10[0m
 --> ./training_outputs/xttsv2_finetune_20250504_1250-May-04-2025_12+50PM-ca1939c

[1m > TRAINING (2025-05-04 14:55:57) [0m


Ignoring sample ./datasets/noramlized_personal_voice/wavs\chunk_0365.wav because it was already ignored before !!



[1m   --> TIME: 2025-05-04 14:58:40 -- STEP: 10/230 -- GLOBAL_STEP: 700[0m
     | > loss_text_ce: 0.025397874414920807  (0.024878767505288123)
     | > loss_mel_ce: 3.4831349849700928  (3.5722805976867678)
     | > loss: 0.04176824912428856  (0.042823326960206035)
     | > current_lr: 5e-06 
     | > step_time: 3.3223  (4.431906676292419)
     | > loader_time: 0.0141  (0.01723039150238037)



Ignoring sample ./datasets/noramlized_personal_voice/wavs\chunk_0658.wav because it was already ignored before !!



[1m   --> TIME: 2025-05-04 15:07:10 -- STEP: 60/230 -- GLOBAL_STEP: 750[0m
     | > loss_text_ce: 0.02430034801363945  (0.024692826842268308)
     | > loss_mel_ce: 3.7291338443756104  (3.5705307801564534)
     | > loss: 0.04468373954296112  (0.042800281755626196)
     | > current_lr: 5e-06 
     | > step_time: 2.5908  (3.0129100084304805)
     | > loader_time: 0.015  (0.016045248508453364)



Ignoring sample ./datasets/noramlized_personal_voice/wavs\chunk_0365.wav because it was already ignored before !!



[1m   --> TIME: 2025-05-04 15:17:22 -- STEP: 110/230 -- GLOBAL_STEP: 800[0m
     | > loss_text_ce: 0.02265559323132038  (0.02464684805070812)
     | > loss_mel_ce: 3.2587788105010986  (3.5692092765461316)
     | > loss: 0.039064694195985794  (0.04278400225395506)
     | > current_lr: 5e-06 
     | > step_time: 2.452  (4.038204546408218)
     | > loader_time: 0.013  (0.017167594216086653)


[1m   --> TIME: 2025-05-04 15:34:29 -- STEP: 160/230 -- GLOBAL_STEP: 850[0m
     | > loss_text_ce: 0.022869858890771866  (0.0247204422717914)
     | > loss_mel_ce: 3.276261329650879  (3.56926601678133)
     | > loss: 0.03927537053823471  (0.042785553936846556)
     | > current_lr: 5e-06 
     | > step_time: 2.4996  (6.637287424504754)
     | > loader_time: 0.014  (0.016998481750488282)


[1m   --> TIME: 2025-05-04 15:57:16 -- STEP: 210/230 -- GLOBAL_STEP: 900[0m
     | > loss_text_ce: 0.02160361409187317  (0.02463118647713037)
     | > loss_mel_ce: 3.6851212978363037  (3.554648460660662)
     

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.016025543212890625 [0m(+0.0010080337524414062)
     | > avg_loss_text_ce:[92m 0.023872357793152332 [0m(-7.667485624551773e-05)
     | > avg_loss_mel_ce:[92m 3.43339079618454 [0m(-0.03561490774154663)
     | > avg_loss:[92m 3.4572631120681763 [0m(-0.03569161891937256)

 > BEST MODEL : ./training_outputs/xttsv2_finetune_20250504_1250-May-04-2025_12+50PM-ca1939c\best_model_920.pth

[4m[1m > EPOCH: 5/10[0m
 --> ./training_outputs/xttsv2_finetune_20250504_1250-May-04-2025_12+50PM-ca1939c

[1m > TRAINING (2025-05-04 16:03:29) [0m


Ignoring sample ./datasets/noramlized_personal_voice/wavs\chunk_0365.wav because it was already ignored before !!



[1m   --> TIME: 2025-05-04 16:11:50 -- STEP: 30/230 -- GLOBAL_STEP: 950[0m
     | > loss_text_ce: 0.027785643935203552  (0.025221229158341886)
     | > loss_mel_ce: 3.2591328620910645  (3.5236709912618003)
     | > loss: 0.03912997990846634  (0.042248717571298285)
     | > current_lr: 5e-06 
     | > step_time: 6.6432  (4.568212175369263)
     | > loader_time: 0.0169  (0.016440550486246753)


[1m   --> TIME: 2025-05-04 16:27:53 -- STEP: 80/230 -- GLOBAL_STEP: 1000[0m
     | > loss_text_ce: 0.02577793039381504  (0.024781629256904126)
     | > loss_mel_ce: 3.5102763175964355  (3.5418633997440336)
     | > loss: 0.042095884680747986  (0.04246006067842245)
     | > current_lr: 5e-06 
     | > step_time: 3.1617  (5.2335094720125195)
     | > loader_time: 0.0156  (0.015938907861709588)


 > CHECKPOINT : ./training_outputs/xttsv2_finetune_20250504_1250-May-04-2025_12+50PM-ca1939c\checkpoint_1000.pth
[34m[1mwandb[0m: Adding directory to artifact (.\training_outputs\xttsv2_finetune_2025

Ignoring sample ./datasets/noramlized_personal_voice/wavs\chunk_0365.wav because it was already ignored before !!
Ignoring sample ./datasets/noramlized_personal_voice/wavs\chunk_0658.wav because it was already ignored before !!



[1m   --> TIME: 2025-05-04 16:47:48 -- STEP: 180/230 -- GLOBAL_STEP: 1100[0m
     | > loss_text_ce: 0.026017239317297935  (0.02480155466538336)
     | > loss_mel_ce: 3.395176887512207  (3.5048231535487706)
     | > loss: 0.040728501975536346  (0.042019342486229204)
     | > current_lr: 5e-06 
     | > step_time: 3.0859  (4.1044195175170906)
     | > loader_time: 0.0156  (0.019807656606038414)


[1m > EVALUATION [0m

[1m   --> STEP: 0[0m
     | > loss_text_ce: 0.021300693973898888  (0.021300693973898888)
     | > loss_mel_ce: 3.392547845840454  (3.392547845840454)
     | > loss: 3.413848638534546  (3.413848638534546)

[1m   --> STEP: 1[0m
     | > loss_text_ce: 0.021670367568731308  (0.021670367568731308)
     | > loss_mel_ce: 3.340406894683838  (3.340406894683838)
     | > loss: 3.362077236175537  (3.362077236175537)

[1m   --> STEP: 2[0m
     | > loss_text_ce: 0.024777373299002647  (0.023223870433866978)
     | > loss_mel_ce: 3.5950794219970703  (3.467743158340454)
     | >

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.01534879207611084 [0m(-0.0006767511367797852)
     | > avg_loss_text_ce:[92m 0.023796888068318367 [0m(-7.54697248339653e-05)
     | > avg_loss_mel_ce:[92m 3.405165135860443 [0m(-0.02822566032409668)
     | > avg_loss:[92m 3.428961992263794 [0m(-0.028301119804382324)

 > BEST MODEL : ./training_outputs/xttsv2_finetune_20250504_1250-May-04-2025_12+50PM-ca1939c\best_model_1150.pth

[4m[1m > EPOCH: 6/10[0m
 --> ./training_outputs/xttsv2_finetune_20250504_1250-May-04-2025_12+50PM-ca1939c

[1m > TRAINING (2025-05-04 17:00:08) [0m

[1m   --> TIME: 2025-05-04 17:00:22 -- STEP: 0/230 -- GLOBAL_STEP: 1150[0m
     | > loss_text_ce: 0.02570449374616146  (0.02570449374616146)
     | > loss_mel_ce: 3.381763219833374  (3.381763219833374)
     | > loss: 0.040565092116594315  (0.040565092116594315)
     | > current_lr: 5e-06 
     | > step_time: 3.2972  (3.2971622943878174)
     | > loader_time: 0.0535  (0.0534818172454834)


Ignoring sample ./datasets/noramlized_personal_voice/wavs\chunk_0365.wav because it was already ignored before !!



[1m   --> TIME: 2025-05-04 17:47:14 -- STEP: 100/230 -- GLOBAL_STEP: 1250[0m
     | > loss_text_ce: 0.02580801211297512  (0.024547864757478235)
     | > loss_mel_ce: 3.4040565490722656  (3.4696395659446715)
     | > loss: 0.04083172231912613  (0.04159747023135424)
     | > current_lr: 5e-06 
     | > step_time: 2.6232  (19.781923756599426)
     | > loader_time: 0.0138  (0.015003643035888671)


[1m   --> TIME: 2025-05-04 17:55:39 -- STEP: 150/230 -- GLOBAL_STEP: 1300[0m
     | > loss_text_ce: 0.02818452939391136  (0.0247337241222461)
     | > loss_mel_ce: 3.487704277038574  (3.477954489390055)
     | > loss: 0.0418558195233345  (0.04169866998990377)
     | > current_lr: 5e-06 
     | > step_time: 2.7975  (14.102913273175563)
     | > loader_time: 0.0145  (0.014870279630025228)



Ignoring sample ./datasets/noramlized_personal_voice/wavs\chunk_0658.wav because it was already ignored before !!



[1m   --> TIME: 2025-05-04 18:05:41 -- STEP: 200/230 -- GLOBAL_STEP: 1350[0m
     | > loss_text_ce: 0.02705996297299862  (0.024762125415727487)
     | > loss_mel_ce: 3.715608835220337  (3.4769335424900056)
     | > loss: 0.044555582106113434  (0.04168685395270586)
     | > current_lr: 5e-06 
     | > step_time: 28.1773  (11.593655514717106)
     | > loader_time: 0.0333  (0.014937872886657716)


[1m > EVALUATION [0m

[1m   --> STEP: 0[0m
     | > loss_text_ce: 0.02118760533630848  (0.02118760533630848)
     | > loss_mel_ce: 3.36683988571167  (3.36683988571167)
     | > loss: 3.3880274295806885  (3.3880274295806885)

[1m   --> STEP: 1[0m
     | > loss_text_ce: 0.021616017445921898  (0.021616017445921898)
     | > loss_mel_ce: 3.318708896636963  (3.318708896636963)
     | > loss: 3.340324878692627  (3.340324878692627)

[1m   --> STEP: 2[0m
     | > loss_text_ce: 0.024686098098754883  (0.02315105777233839)
     | > loss_mel_ce: 3.5688610076904297  (3.4437849521636963)
     | > l

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.013132691383361816 [0m(-0.0022161006927490234)
     | > avg_loss_text_ce:[92m 0.023726385086774826 [0m(-7.050298154354095e-05)
     | > avg_loss_mel_ce:[92m 3.3821418285369873 [0m(-0.02302330732345581)
     | > avg_loss:[92m 3.4058682322502136 [0m(-0.023093760013580322)

 > BEST MODEL : ./training_outputs/xttsv2_finetune_20250504_1250-May-04-2025_12+50PM-ca1939c\best_model_1380.pth

[4m[1m > EPOCH: 7/10[0m
 --> ./training_outputs/xttsv2_finetune_20250504_1250-May-04-2025_12+50PM-ca1939c

[1m > TRAINING (2025-05-04 18:15:56) [0m

[1m   --> TIME: 2025-05-04 18:19:19 -- STEP: 20/230 -- GLOBAL_STEP: 1400[0m
     | > loss_text_ce: 0.028840254992246628  (0.024920130241662265)
     | > loss_mel_ce: 3.3603217601776123  (3.418428194522858)
     | > loss: 0.04034716635942459  (0.04099224228411913)
     | > current_lr: 5e-06 
     | > step_time: 2.814  (2.732638144493103)
     | > loader_time: 0.0149  (0.01477283239364

Ignoring sample ./datasets/noramlized_personal_voice/wavs\chunk_0365.wav because it was already ignored before !!



[1m   --> TIME: 2025-05-04 18:36:53 -- STEP: 70/230 -- GLOBAL_STEP: 1450[0m
     | > loss_text_ce: 0.026961294934153557  (0.02493534314313105)
     | > loss_mel_ce: 3.1085729598999023  (3.397600906235831)
     | > loss: 0.03732778877019882  (0.04074447974562645)
     | > current_lr: 5e-06 
     | > step_time: 3.6804  (5.856028996195111)
     | > loader_time: 0.012  (0.034839085170200894)



Ignoring sample ./datasets/noramlized_personal_voice/wavs\chunk_0658.wav because it was already ignored before !!



[1m   --> TIME: 2025-05-04 18:47:59 -- STEP: 120/230 -- GLOBAL_STEP: 1500[0m
     | > loss_text_ce: 0.022760486230254173  (0.02490349393337965)
     | > loss_mel_ce: 3.5500545501708984  (3.390504084030787)
     | > loss: 0.0425335131585598  (0.04065961465239525)
     | > current_lr: 5e-06 
     | > step_time: 3.0519  (5.3779821554819724)
     | > loader_time: 0.0153  (0.027723751465479538)


[1m   --> TIME: 2025-05-04 18:55:50 -- STEP: 170/230 -- GLOBAL_STEP: 1550[0m
     | > loss_text_ce: 0.025320759043097496  (0.024805147409000812)
     | > loss_mel_ce: 3.4310619831085205  (3.3968994673560635)
     | > loss: 0.04114741459488869  (0.04073457941412926)
     | > current_lr: 5e-06 
     | > step_time: 3.116  (4.631322923828573)
     | > loader_time: 0.0119  (0.023954008607303394)


[1m   --> TIME: 2025-05-04 19:04:22 -- STEP: 220/230 -- GLOBAL_STEP: 1600[0m
     | > loss_text_ce: 0.022752178832888603  (0.02474394377151673)
     | > loss_mel_ce: 3.4637115001678467  (3.4089270234107

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.01638394594192505 [0m(+0.0032512545585632324)
     | > avg_loss_text_ce:[92m 0.023657579440623522 [0m(-6.880564615130424e-05)
     | > avg_loss_mel_ce:[92m 3.36228609085083 [0m(-0.019855737686157227)
     | > avg_loss:[92m 3.3859437108039856 [0m(-0.019924521446228027)

 > BEST MODEL : ./training_outputs/xttsv2_finetune_20250504_1250-May-04-2025_12+50PM-ca1939c\best_model_1610.pth

[4m[1m > EPOCH: 8/10[0m
 --> ./training_outputs/xttsv2_finetune_20250504_1250-May-04-2025_12+50PM-ca1939c

[1m > TRAINING (2025-05-04 19:06:17) [0m


Ignoring sample ./datasets/noramlized_personal_voice/wavs\chunk_0365.wav because it was already ignored before !!
Ignoring sample ./datasets/noramlized_personal_voice/wavs\chunk_0658.wav because it was already ignored before !!



[1m   --> TIME: 2025-05-04 19:13:35 -- STEP: 40/230 -- GLOBAL_STEP: 1650[0m
     | > loss_text_ce: 0.02771531604230404  (0.02523341653868556)
     | > loss_mel_ce: 3.492427349090576  (3.4282394111156465)
     | > loss: 0.041906461119651794  (0.041112772654742)
     | > current_lr: 5e-06 
     | > step_time: 2.7807  (2.899973380565643)
     | > loader_time: 0.0153  (0.014112502336502075)



Ignoring sample ./datasets/noramlized_personal_voice/wavs\chunk_0365.wav because it was already ignored before !!



[1m   --> TIME: 2025-05-04 19:25:37 -- STEP: 90/230 -- GLOBAL_STEP: 1700[0m
     | > loss_text_ce: 0.02241230010986328  (0.025063377805054188)
     | > loss_mel_ce: 3.4081647396087646  (3.425644490453932)
     | > loss: 0.04084020480513573  (0.04107985616558126)
     | > current_lr: 5e-06 
     | > step_time: 2.8391  (3.058094284269545)
     | > loader_time: 0.013  (0.014034957355923123)


[1m   --> TIME: 2025-05-04 19:34:45 -- STEP: 140/230 -- GLOBAL_STEP: 1750[0m
     | > loss_text_ce: 0.025143105536699295  (0.024848589793379817)
     | > loss_mel_ce: 3.331425189971924  (3.42670236996242)
     | > loss: 0.039959147572517395  (0.041089892972792884)
     | > current_lr: 5e-06 
     | > step_time: 3.0654  (3.012140360900334)
     | > loader_time: 0.013  (0.013746367182050433)


[1m   --> TIME: 2025-05-04 19:43:44 -- STEP: 190/230 -- GLOBAL_STEP: 1800[0m
     | > loss_text_ce: 0.021639417856931686  (0.024664794261518283)
     | > loss_mel_ce: 3.269710063934326  (3.411451092519258)

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.01584911346435547 [0m(-0.0005348324775695801)
     | > avg_loss_text_ce:[92m 0.02358994260430336 [0m(-6.763683632016182e-05)
     | > avg_loss_mel_ce:[92m 3.344573736190796 [0m(-0.01771235466003418)
     | > avg_loss:[92m 3.3681636452674866 [0m(-0.017780065536499023)

 > BEST MODEL : ./training_outputs/xttsv2_finetune_20250504_1250-May-04-2025_12+50PM-ca1939c\best_model_1840.pth

[4m[1m > EPOCH: 9/10[0m
 --> ./training_outputs/xttsv2_finetune_20250504_1250-May-04-2025_12+50PM-ca1939c

[1m > TRAINING (2025-05-04 19:55:10) [0m

[1m   --> TIME: 2025-05-04 19:57:05 -- STEP: 10/230 -- GLOBAL_STEP: 1850[0m
     | > loss_text_ce: 0.024348394945263863  (0.02541128396987915)
     | > loss_mel_ce: 3.4443140029907227  (3.401384687423706)
     | > loss: 0.04129360243678093  (0.040795191377401355)
     | > current_lr: 5e-06 
     | > step_time: 2.937  (2.759142756462097)
     | > loader_time: 0.0131  (0.01450901031494140

Ignoring sample ./datasets/noramlized_personal_voice/wavs\chunk_0365.wav because it was already ignored before !!
Ignoring sample ./datasets/noramlized_personal_voice/wavs\chunk_0658.wav because it was already ignored before !!
Ignoring sample ./datasets/noramlized_personal_voice/wavs\chunk_0658.wav because it was already ignored before !!



[1m   --> TIME: 2025-05-04 20:50:19 -- STEP: 210/230 -- GLOBAL_STEP: 2050[0m
     | > loss_text_ce: 0.02499692142009735  (0.02472384764502445)
     | > loss_mel_ce: 3.545748710632324  (3.39621425583249)
     | > loss: 0.04250887781381607  (0.04072545429780367)
     | > current_lr: 5e-06 
     | > step_time: 11.4648  (6.09997319493975)
     | > loader_time: 0.0159  (0.019097761880783815)


[1m > EVALUATION [0m

[1m   --> STEP: 0[0m
     | > loss_text_ce: 0.020889604464173317  (0.020889604464173317)
     | > loss_mel_ce: 3.302748441696167  (3.302748441696167)
     | > loss: 3.3236379623413086  (3.3236379623413086)

[1m   --> STEP: 1[0m
     | > loss_text_ce: 0.021463457494974136  (0.021463457494974136)
     | > loss_mel_ce: 3.26389217376709  (3.26389217376709)
     | > loss: 3.285355567932129  (3.285355567932129)

[1m   --> STEP: 2[0m
     | > loss_text_ce: 0.024410953745245934  (0.022937205620110035)
     | > loss_mel_ce: 3.5183804035186768  (3.3911362886428833)
     | > loss

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.01636296510696411 [0m(+0.0005138516426086426)
     | > avg_loss_text_ce:[92m 0.02352036675438285 [0m(-6.957584992051125e-05)
     | > avg_loss_mel_ce:[92m 3.3293262720108032 [0m(-0.015247464179992676)
     | > avg_loss:[92m 3.352846622467041 [0m(-0.015317022800445557)

 > BEST MODEL : ./training_outputs/xttsv2_finetune_20250504_1250-May-04-2025_12+50PM-ca1939c\best_model_2070.pth
[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
EvalStats/avg_loader_time,█▁▃▂▃▃▁▃▃
EvalStats/avg_loss,█▅▃▃▂▂▂▁▁
EvalStats/avg_loss_mel_ce,█▅▃▃▂▂▂▁▁
EvalStats/avg_loss_text_ce,█▇▇▆▅▄▃▂▁
TrainEpochStats/avg_grad_norm,▁▁▁▁▁▁▁▁
TrainEpochStats/avg_loader_time,█▄▂▂▃▂▄▁
TrainEpochStats/avg_loss,█▅▄▃▂▂▁▁
TrainEpochStats/avg_loss_mel_ce,█▅▄▃▂▂▁▁
TrainEpochStats/avg_loss_text_ce,█▆▆▃▄▄▃▁
TrainEpochStats/avg_step_time,▅▂▁▆▂█▂▁

0,1
EvalStats/avg_loader_time,0.01585
EvalStats/avg_loss,3.36816
EvalStats/avg_loss_mel_ce,3.34457
EvalStats/avg_loss_text_ce,0.02359
TrainEpochStats/avg_grad_norm,0.0
TrainEpochStats/avg_loader_time,0.01378
TrainEpochStats/avg_loss,0.04092
TrainEpochStats/avg_loss_mel_ce,3.41265
TrainEpochStats/avg_loss_text_ce,0.02454
TrainEpochStats/avg_step_time,3.15302
