# XTTS Finetuning for Maltese

## Requirements
- Python 3.8 or higher
- PyTorch 1.7.0 or higher (the setup cell below installs the pinned build `torch==2.5.1+cu121`)

In [None]:
# @title Takes 6min and you need to restart the session at the end
# Colab environment setup: mounts Google Drive, clones the project and installs its dependencies.
# Restart the session once this cell has finished (see the title) before running the cells below.


# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Clone the repository
# %cd moves the notebook's working directory into the cloned folder, so the relative
# requirements.txt used below is the one from FineTuning/NewLanguage.
!git clone https://github.com/Wubpooz/Malta-TTS.git
%cd Malta-TTS/FineTuning/NewLanguage

# Upgrade pip first, then install the project's own requirements.
!pip install --upgrade pip
!pip install -r requirements.txt

# Download NLTK and Spacy models
!python -c "import nltk; nltk.download('punkt')"
!python -m spacy download en_core_web_sm


# Disabled alternative setup (ffmpeg reinstall + torchcodec pin), kept for reference:
# !apt-get remove -y ffmpeg
# !pip uninstall -y torchcodec
# !apt-get update
# !apt-get install -y ffmpeg
# !pip install torch==2.5.1+cu121 torchaudio==2.5.1 torchcodec==0.1.* --index-url https://download.pytorch.org/whl/cu121

# Install the TTS package (version not pinned).
!pip install TTS

# Replace whatever torch stack the previous installs left behind with the pinned CUDA 12.1 builds.
# NOTE(review): presumably this is why the session must be restarted afterwards — confirm.
!pip uninstall -y torch torchaudio torchvision
!pip install torch==2.5.1+cu121 torchaudio==2.5.1 torchvision --index-url https://download.pytorch.org/whl/cu121
# NOTE(review): transformers is pinned, presumably for compatibility with TTS — confirm before bumping.
!pip install transformers==4.38.2

## Data Preparation

In [None]:
# @title Takes 35min initially, but once it's saved to GDrive it's instant
# EXCRUCIATINGLY SLOW, 35min for saving 4900 audio files => use concurrency if available


import os
import shutil
import pandas as pd
from tqdm.auto import tqdm
from datasets import load_dataset
from concurrent.futures import ThreadPoolExecutor

# Put the system library directory on LD_LIBRARY_PATH (original note: so torchcodec can find ffmpeg — unconfirmed).
# - Read the variable with .get(): a plain `+=` raises KeyError when it is not set at all.
# - Skip the append when the directory is already listed, so re-running this cell does not keep growing the value.
# NOTE(review): changes made after the interpreter has started are generally only seen by
# child processes, not by libraries loaded into this kernel — confirm this has the intended effect.
_system_lib_dir = "/usr/lib/x86_64-linux-gnu/"
_current_ld_path = os.environ.get('LD_LIBRARY_PATH', '')
if _system_lib_dir not in _current_ld_path.split(":"):
  os.environ['LD_LIBRARY_PATH'] = f"{_current_ld_path}:{_system_lib_dir}" if _current_ld_path else _system_lib_dir

# def save_single_file(example, output_dir, i):
#     audio_filename = example['audio']['path']
#     audio_bytes = example['audio']['bytes']
#     text = example['normalized_text']
#     speaker_id = example['speaker_id']

#     output_audio_path = os.path.join(output_dir, audio_filename)

#     with open(output_audio_path, 'wb') as f:
#       f.write(audio_bytes) # is it 24kHz ?

#     return {
#         'audio_file': os.path.join("wavs", audio_filename),
#         'text': text,
#         'speaker_name': speaker_id
#     }

# def save_dataset_split_concurent(split, filename):
#     data = []
#     with ThreadPoolExecutor(max_workers=16) as executor:
#         futures = [executor.submit(save_single_file, example, wavs_dir, i) for i, example in enumerate(ds[split])]

#         for future in tqdm(futures, desc=f"Processing {split} split"):
#             data.append(future.result())

#     df = pd.DataFrame(data)
#     df.to_csv(os.path.join(output_dir, filename), sep="|", index=False)
#     print(f"Saved {len(df)} files to {filename}")


def save_dataset_split(split, filename, save_audio=True):
  """Export one split of the global dataset `ds` as a pipe-separated metadata CSV.

  Each example of `ds[split]` becomes one row holding the audio file name with its
  extension stripped, the normalized text (stored under both `text` and
  `normalized_text`) and the speaker id. When `save_audio` is truthy, the raw audio
  bytes are also written into `wavs_dir` under the example's original file name.

  Relies on the module-level names `ds`, `wavs_dir` and `output_dir`.

  Args:
    split: key of the split inside `ds`.
    filename: name of the CSV file created inside `output_dir`.
    save_audio: whether the audio bytes are written to disk as well.
  """
  rows = []
  for item in tqdm(ds[split].to_list(), desc=f"Processing {split} split"):
    audio_info = item['audio']
    wav_name = audio_info['path']
    wav_bytes = audio_info['bytes']
    transcript = item['normalized_text']
    speaker = item['speaker_id']

    if save_audio:
      wav_path = os.path.join(wavs_dir, wav_name)
      with open(wav_path, 'wb') as wav_file:
        wav_file.write(wav_bytes)  # TODO is it 24kHz ?

    # Rows follow the (extended) LJSpeech layout. Per the original author's note the
    # loader fails on names that carry an extension and adds the folder prefix and
    # ".wav" suffix itself, so only the bare stem is stored.
    stem = os.path.splitext(wav_name)[0]
    rows.append(dict(
      audio_file=stem,
      text=transcript,
      normalized_text=transcript,
      speaker_name=speaker,
    ))

  df = pd.DataFrame(rows)
  csv_path = os.path.join(output_dir, filename)
  df.to_csv(csv_path, sep="|", index=False)
  print(f"Saved {len(df)} files to {filename}")



# Export the dataset to Google Drive once; later runs reuse what is already there.
output_dir = "/content/drive/MyDrive/XTTS_Maltese_Data"
wavs_dir = os.path.join(output_dir, "wavs")

# Skip only when the audio folder AND both metadata files exist: the fine-tuning cell
# reads metadata_train.csv as well as metadata_eval.csv.
dataset_ready = os.path.isdir(wavs_dir) and all(
  os.path.exists(os.path.join(output_dir, name))
  for name in ("metadata_train.csv", "metadata_eval.csv")
)

if dataset_ready:
  print(f"Processed dataset already exists at {output_dir}")
else:
  print("Loading dataset from Hugging Face...")
  ds = load_dataset("Bluefir/MASRI_HEADSET_v2")

  # Decide whether audio has to be written BEFORE creating the folder. Checking after
  # os.makedirs() always finds the folder, which disabled audio export on a fresh run.
  # NOTE(review): a non-empty folder is treated as a complete export; after an
  # interrupted run, delete the wavs folder to force a full re-export.
  wavs_already_exported = os.path.isdir(wavs_dir) and len(os.listdir(wavs_dir)) > 0
  os.makedirs(wavs_dir, exist_ok=True)  # also creates output_dir
  save_audio = not wavs_already_exported

  print("Preparing and saving dataset files...")
  save_dataset_split("train", "metadata_train.csv", save_audio)
  save_dataset_split("test", "metadata_eval.csv", save_audio)
  print("Dataset preparation complete (saved to Google Drive too).")

In [None]:
# @title Dataset distribution (text length and audio duration)
import os
import io
import tempfile
import soundfile as sf
from datasets import load_dataset
from datasets import load_dataset, Audio

# Don't decode audio — keep the raw encoded bytes so durations can be read from them directly
ds = load_dataset("Bluefir/MASRI_HEADSET_v2")
ds = ds.cast_column("audio", Audio(decode=False))

text_lengths = []     # characters per normalized transcript
audio_durations = []  # seconds per clip

for split in ["train", "test"]:
    print(f"Processing split: {split}")
    for example in ds[split]:
        # Text length
        text_lengths.append(len(example["normalized_text"]))

        # Open the bytes in memory (SoundFile accepts file-like objects) instead of
        # writing one temporary file per example; duration = frame count / sample rate.
        with sf.SoundFile(io.BytesIO(example["audio"]["bytes"])) as f:
            audio_durations.append(f.frames / f.samplerate)

# min()/max() and the averages below would raise on empty lists, so report that case explicitly.
if text_lengths and audio_durations:
    print(f"Text length range: {min(text_lengths)} - {max(text_lengths)} characters")
    print(f"Audio duration range: {min(audio_durations):.2f} - {max(audio_durations):.2f} seconds")
    print(f"Average text length: {sum(text_lengths)/len(text_lengths):.2f} characters")
    print(f"Average audio duration: {sum(audio_durations)/len(audio_durations):.2f} seconds")
else:
    print("No examples found in the train/test splits; nothing to summarize.")

## Finetuning

In [None]:
# Define paths and parameters
# Metadata CSVs written by the data-preparation cell (pipe-separated, one row per clip).
metadata_train_path = "/content/drive/MyDrive/XTTS_Maltese_Data/metadata_train.csv"
metadata_eval_path = "/content/drive/MyDrive/XTTS_Maltese_Data/metadata_eval.csv"
# Folder on Google Drive handed to the training CLI as --output_path.
output_path = "/content/drive/MyDrive/XTTS_Maltese_Training/output"
# Language code handed to the CLI via --metadatas and --language (the run logs report it as Maltese).
language_code = "mt"
# Handed to the CLI as --extended_vocab_size.
extended_vocab_size_param = 100000

# Environment variables for this kernel, set through the %env magic.
# TOKENIZERS_PARALLELISM: the huggingface/tokenizers fork warning in the logs asks for it to be set explicitly.
# NOTE(review): OMP_NUM_THREADS=1 presumably limits CPU threading and PYTORCH_CUDA_ALLOC_CONF
# presumably reduces CUDA memory fragmentation — confirm both are still wanted.
%env TOKENIZERS_PARALLELISM=false
%env OMP_NUM_THREADS=1
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True


import os
import sys

# Streams captured when this cell runs; output_redirect() restores them before doing anything else.
_original_stdout = sys.stdout
_original_stderr = sys.stderr
# Log file currently receiving the tee'd output, or None while redirection is off.
_log_file = None


class _Tee(object):
  """Minimal file-like object that copies every write to several streams."""

  def __init__(self, *streams):
    self.streams = streams

  def write(self, data):
    for s in self.streams:
      s.write(data)
      s.flush()  # flush right away so the log stays current if the run is interrupted
    return len(data)

  def flush(self):
    for s in self.streams:
      s.flush()

  def __getattr__(self, name):
    # Only reached for attributes not defined on _Tee (encoding, isatty, ...): answer from
    # the first stream so code that inspects sys.stdout/sys.stderr keeps working.
    if name == "streams":
      raise AttributeError(name)  # not initialised yet; avoid infinite recursion
    return getattr(self.streams[0], name)


def output_redirect(redirect=True, log_path="/content/drive/MyDrive/XTTS_Maltese_Training/output/full_training.log"):
  """Turn tee-ing of stdout/stderr into a log file on or off.

  Args:
    redirect: True to mirror everything written to sys.stdout / sys.stderr into
      `log_path` (the file is truncated first); False to restore the original
      streams and close the log file.
    log_path: file that receives the mirrored output. Missing parent folders are
      created. Defaults to the path that was previously hard-coded here.

  Calling it repeatedly is safe: the previously opened log file is always closed
  and the original streams are restored before a new tee is installed.
  """
  global _log_file

  # Always start from the untouched streams so a second enable neither nests tees
  # nor leaks the file handle opened by the previous one.
  sys.stdout = _original_stdout
  sys.stderr = _original_stderr
  if _log_file:
    _log_file.close()
    _log_file = None

  if not redirect:
    return

  log_dir = os.path.dirname(log_path)
  if log_dir:
    os.makedirs(log_dir, exist_ok=True)
  # "w" clears any previous log; buffering=1 keeps the file line-buffered
  _log_file = open(log_path, "w", buffering=1)

  sys.stdout = _Tee(_original_stdout, _log_file)
  sys.stderr = _Tee(_original_stderr, _log_file)


# Make sure the output folder exists, then run from the project folder so the relative
# script name new_language_training_cli.py used below resolves.
os.makedirs(output_path, exist_ok=True)
%cd /content/Malta-TTS/FineTuning/NewLanguage


# NOTE(review): called with False, so tee-ing to the log file is switched off for this run
# (the call after training passes False as well). Pass True here if the log is wanted — confirm intent.
output_redirect(False)

print(f"Finetuning for {language_code}")



# Launch the training CLI; the {name} placeholders are filled in from the Python variables
# defined at the top of this cell.
# --max_audio_length is in samples: 176400 is 8.0 s if the rate implied by the
# "255995 = 11.6s" note below (about 22,050 Hz) applies — TODO confirm.
# NOTE(review): --custom_model=custom_model_name looks like a placeholder value — confirm.
# NOTE(review): the logs recorded further down report 8 epochs, batch size 3 and grad_acumm 84,
# so they come from runs with different settings than the ones passed here.
!python new_language_training_cli.py \
    --is_download \
    --is_tokenizer_extension \
    --output_path "{output_path}" \
    --metadatas "{metadata_train_path},{metadata_eval_path},{language_code}" \
    --num_epochs 1 \
    --batch_size 1 \
    --grad_acumm 48 \
    --max_audio_length 176400 \
    --max_text_length 200 \
    --weight_decay 1e-2 \
    --lr 5e-6 \
    --save_step 5000 \
    --custom_model=custom_model_name \
    --version=main \
    --metadata_path "{metadata_train_path}" \
    --language "{language_code}" \
    --extended_vocab_size {extended_vocab_size_param}


# 35min/epoch on one T4 with batch_size=1, grad_acumm=48, audio_length=176400, max_text=200, weight=1e-2, save_step=5000

# Restore the original stdout/stderr and close the log file if one was opened.
output_redirect(False)

# Default values are:
# batch-size: 3
# grad_acc: 84
# max_audio: 255995 = 11.6s
# save_step: 10_000
# epoch: 10 => 100
# --multi-gpu

print("Finetuning process completed!")

env: TOKENIZERS_PARALLELISM=false
env: OMP_NUM_THREADS=1
env: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
/content/Malta-TTS/FineTuning/NewLanguage
Finetuning for mt
Step 1: Downloading XTTS base model files.
 > Downloading XTTS v-main files...
 > Downloading XTTS config file...
0.00iB [00:00, ?iB/s] > XTTS model files downloaded successfully!
Step 2: Extending the XTTS tokenizer with the new language.
Original tokenizer loaded with 21161 tokens.
Training new tokenizer with 3983 texts...
[2K[00:00:00] Tokenize words                 ██████████████████ 10066    /    10066
[2K[00:00:00] Count pairs                    ██████████████████ 10066    /    10066
[2K[00:00:00] Compute merges                 ██████████████████ 15307    /    15307
New tokenizer trained with 15345 tokens.
Merging tokenizers from /content/drive/MyDrive/XTTS_Maltese_Training/output/old_tokenizer/ and /content/drive/MyDrive/XTTS_Maltese_Training/output/new_tokenizer/ into /content/drive/MyDrive/XTTS_Maltese_Tra

### Logs

```
# Define paths and parameters
metadata_train_path = "/content/drive/MyDrive/XTTS_Maltese_Data/metadata_train.csv"
metadata_eval_path = "/content/drive/MyDrive/XTTS_Maltese_Data/metadata_eval.csv"
output_path = "/content/drive/MyDrive/XTTS_Maltese_Training/output"
language_code = "mt"
extended_vocab_size_param = 100000

os.makedirs(output_path, exist_ok=True)

%cd /content/Malta-TTS/FineTuning/NewLanguage

/content/Malta-TTS/FineTuning/NewLanguage
Step 1: Downloading XTTS base model files.
 > Downloading XTTS v-main files...
 > Downloading XTTS config file...
0.00iB [00:00, ?iB/s] > XTTS model files downloaded successfully!
Step 2: Extending the XTTS tokenizer with the new language.
Original tokenizer loaded with 21161 tokens.
Training new tokenizer with 3984 texts...
[00:00:00] Tokenize words                 ██████████████████ 10067    /    10067
[00:00:00] Count pairs                    ██████████████████ 10067    /    10067
[00:00:00] Compute merges                 ██████████████████ 15308    /    15308
New tokenizer trained with 15346 tokens.
Merging tokenizers from /content/drive/MyDrive/XTTS_Maltese_Training/output/old_tokenizer/ and /content/drive/MyDrive/XTTS_Maltese_Training/output/new_tokenizer/ into /content/drive/MyDrive/XTTS_Maltese_Training/output/merged_tokenizer/
Old tokenizer vocabulary size: 21161
New tokenizer vocabulary size: 15346
Combined vocabulary size: 21161
Combined vocabulary saved to /content/drive/MyDrive/XTTS_Maltese_Training/output/merged_tokenizer/vocab.json
Tokenizer has been successfully extended and saved to /content/drive/MyDrive/XTTS_Maltese_Training/output/vocab.json
Updating the XTTS checkpoint...
Cleaning checkpoint: /content/drive/MyDrive/XTTS_Maltese_Training/output/model.pth
/content/Malta-TTS/FineTuning/NewLanguage/tokenizer_extension.py:131: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  checkpoint = torch.load(xtts_checkpoint_path, map_location="cpu")
Cleaned checkpoint saved.
Updating the XTTS config file...
Step 3: Starting GPT training.
 > Training XTTS model for Maltese with 1 datasets, 8 epochs, batch size 3, grad_acumm 84, output path: /content/drive/MyDrive/XTTS_Maltese_Training/output/run/training
 > Using the following datasets:
/content/drive/MyDrive/XTTS_Maltese_Data/metadata_train.csv /content/drive/MyDrive/XTTS_Maltese_Data/metadata_eval.csv mt
 > Downloading XTTS model files...
 > Downloading XTTS v-main files...
 > Downloading XTTS config file...

4.37kiB [01:03, 69.2iB/s]
 > XTTS model files downloaded successfully!
 > XTTS model files downloaded successfully!
Setting up model arguments...
/usr/local/lib/python3.11/dist-packages/TTS/utils/io.py:54: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  return torch.load(f, map_location=map_location, **kwargs)
/usr/local/lib/python3.11/dist-packages/TTS/tts/layers/tortoise/arch_utils.py:336: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  self.mel_norms = torch.load(f)
/usr/local/lib/python3.11/dist-packages/TTS/tts/layers/xtts/trainer/gpt_trainer.py:185: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  dvae_checkpoint = torch.load(self.args.dvae_checkpoint, map_location=torch.device("cpu"))
>> DVAE weights restored from: /content/drive/MyDrive/XTTS_Maltese_Training/output/dvae.pth
Loading datasets...
 | > Found 3983 files in /content/drive/MyDrive/XTTS_Maltese_Data
 > Loaded 3983 training samples and 996 evaluation samples.
 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: False
 | > Precision: float32
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 2
 | > Num. of Torch Threads: 1
 | > Torch seed: 1
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
2025-08-11 08:47:24.121184: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
E0000 00:00:1754902044.384214    2626 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754902044.456423    2626 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1754902044.985120    2626 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1754902044.985179    2626 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1754902044.985190    2626 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1754902044.985197    2626 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
 > Start Tensorboard: tensorboard --logdir=/content/drive/MyDrive/XTTS_Maltese_Training/output/run/training/GPT_XTTS_FT-August-11-2025_08+47AM-3fb6ef8

 > Model has 548111567 parameters
Starting training...

 > EPOCH: 0/8
 --> /content/drive/MyDrive/XTTS_Maltese_Training/output/run/training/GPT_XTTS_FT-August-11-2025_08+47AM-3fb6ef8
 > Sampling by language: dict_keys(['mt'])
/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py:617: UserWarning: This DataLoader will create 8 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.
  warnings.warn(

 > TRAINING (2025-08-11 08:47:30)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
```



```
/content/Malta-TTS/FineTuning/NewLanguage
Step 1: Downloading XTTS base model files.
 > Downloading XTTS v-main files...
 > Downloading XTTS config file...
0.00iB [00:00, ?iB/s] > XTTS model files downloaded successfully!
Step 2: Extending the XTTS tokenizer with the new language.
Original tokenizer loaded with 21161 tokens.
Training new tokenizer with 3983 texts...
[00:00:00] Tokenize words                 ██████████████████ 10066    /    10066
[00:00:00] Count pairs                    ██████████████████ 10066    /    10066
[00:00:00] Compute merges                 ██████████████████ 15307    /    15307
New tokenizer trained with 15345 tokens.
Merging tokenizers from /content/drive/MyDrive/XTTS_Maltese_Training/output/old_tokenizer/ and /content/drive/MyDrive/XTTS_Maltese_Training/output/new_tokenizer/ into /content/drive/MyDrive/XTTS_Maltese_Training/output/merged_tokenizer/
Old tokenizer vocabulary size: 21161
New tokenizer vocabulary size: 15345
Combined vocabulary size: 21161
Combined vocabulary saved to /content/drive/MyDrive/XTTS_Maltese_Training/output/merged_tokenizer/vocab.json
Tokenizer has been successfully extended and saved to /content/drive/MyDrive/XTTS_Maltese_Training/output/vocab.json
Updating the XTTS checkpoint...
Cleaning checkpoint: /content/drive/MyDrive/XTTS_Maltese_Training/output/model.pth
/content/Malta-TTS/FineTuning/NewLanguage/tokenizer_extension.py:130: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  checkpoint = torch.load(xtts_checkpoint_path, map_location="cpu")
Cleaned checkpoint saved.
Updating the XTTS config file...
Step 3: Starting GPT training.
 > Training XTTS model for Maltese with 1 datasets, 8 epochs, batch size 3, grad_acumm 84, output path: /content/drive/MyDrive/XTTS_Maltese_Training/output/training
 > Using the following datasets:
/content/drive/MyDrive/XTTS_Maltese_Data/metadata_train.csv /content/drive/MyDrive/XTTS_Maltese_Data/metadata_eval.csv mt
 > Downloading XTTS model files...
 > Downloading XTTS v-main files...
 > Downloading XTTS config file...

4.37kiB [00:16, 259iB/s]
 > XTTS model files downloaded successfully!
 > XTTS model files downloaded successfully!
Setting up model arguments...

```





```
env: TOKENIZERS_PARALLELISM=false
env: OMP_NUM_THREADS=1
/content/Malta-TTS/FineTuning/NewLanguage
Step 1: Downloading XTTS base model files.
 > Downloading XTTS v-main files...
 > Downloading XTTS config file...
0.00iB [00:00, ?iB/s] > XTTS model files downloaded successfully!
Step 2: Extending the XTTS tokenizer with the new language.
Original tokenizer loaded with 21161 tokens.
Training new tokenizer with 3983 texts...
[00:00:00] Tokenize words                 ██████████████████ 10066    /    10066
[00:00:00] Count pairs                    ██████████████████ 10066    /    10066
[00:00:00] Compute merges                 ██████████████████ 15307    /    15307
New tokenizer trained with 15345 tokens.
Merging tokenizers from /content/drive/MyDrive/XTTS_Maltese_Training/output/old_tokenizer/ and /content/drive/MyDrive/XTTS_Maltese_Training/output/new_tokenizer/ into /content/drive/MyDrive/XTTS_Maltese_Training/output/merged_tokenizer/
Old tokenizer vocabulary size: 21161
New tokenizer vocabulary size: 15345
Combined vocabulary size: 21161
Combined vocabulary saved to /content/drive/MyDrive/XTTS_Maltese_Training/output/merged_tokenizer/vocab.json
Tokenizer has been successfully extended and saved to /content/drive/MyDrive/XTTS_Maltese_Training/output/vocab.json
Updating the XTTS checkpoint...
Cleaning checkpoint: /content/drive/MyDrive/XTTS_Maltese_Training/output/model.pth
/content/Malta-TTS/FineTuning/NewLanguage/tokenizer_extension.py:131: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  checkpoint = torch.load(xtts_checkpoint_path, map_location="cpu")
Cleaned checkpoint saved.
Updating the XTTS config file...
Updated config file saved to /content/drive/MyDrive/XTTS_Maltese_Training/output/config.json. Added new language: mt
Step 3: Starting GPT training.
 > Training XTTS model for Maltese with 1 datasets, 8 epochs, batch size 3, grad_acumm 84, output path: /content/drive/MyDrive/XTTS_Maltese_Training/output/training
 > Using the following datasets:
/content/drive/MyDrive/XTTS_Maltese_Data/metadata_train.csv /content/drive/MyDrive/XTTS_Maltese_Data/metadata_eval.csv mt
 > Downloading XTTS model files...
 > Downloading XTTS v-main files...
 > Downloading XTTS config file...

4.37kiB [00:44, 98.1iB/s]
 > XTTS model files downloaded successfully!
 > XTTS model files downloaded successfully!
Setting up model arguments...
/usr/local/lib/python3.11/dist-packages/TTS/utils/io.py:54: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  return torch.load(f, map_location=map_location, **kwargs)
/usr/local/lib/python3.11/dist-packages/TTS/tts/layers/tortoise/arch_utils.py:336: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  self.mel_norms = torch.load(f)
/usr/local/lib/python3.11/dist-packages/TTS/tts/layers/xtts/trainer/gpt_trainer.py:185: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  dvae_checkpoint = torch.load(self.args.dvae_checkpoint, map_location=torch.device("cpu"))
>> DVAE weights restored from: /content/drive/MyDrive/XTTS_Maltese_Training/output/dvae.pth
Loading datasets...
 | > Found 3984 files in /content/drive/MyDrive/XTTS_Maltese_Data
 > Loaded 3984 training samples and 997 evaluation samples.
 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: False
 | > Precision: float32
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 2
 | > Num. of Torch Threads: 1
 | > Torch seed: 1
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
2025-08-12 13:58:09.694672: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
E0000 00:00:1755007089.960026    3292 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755007090.035688    3292 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755007090.567483    3292 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755007090.567517    3292 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755007090.567523    3292 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755007090.567527    3292 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
 > Start Tensorboard: tensorboard --logdir=/content/drive/MyDrive/XTTS_Maltese_Training/output/training/GPT_XTTS_FT-August-12-2025_01+58PM-87ed0ce

 > Model has 548111567 parameters
Starting training...

 > EPOCH: 0/8
 --> /content/drive/MyDrive/XTTS_Maltese_Training/output/training/GPT_XTTS_FT-August-12-2025_01+58PM-87ed0ce
 > Sampling by language: dict_keys(['mt'])

 > TRAINING (2025-08-12 13:58:15)

   --> TIME: 2025-08-12 13:58:36 -- STEP: 0/1328 -- GLOBAL_STEP: 0
     | > loss_text_ce: 0.09944003075361252  (0.09944003075361252)
     | > loss_mel_ce: 5.0861430168151855  (5.0861430168151855)
     | > loss: 0.06173313409090042  (0.06173313409090042)
     | > current_lr: 5e-06
     | > step_time: 1.673  (1.6729779243469238)
     | > loader_time: 18.7882  (18.78819513320923)
```





```
env: TOKENIZERS_PARALLELISM=false
env: OMP_NUM_THREADS=1
env: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
/content/Malta-TTS/FineTuning/NewLanguage
Finetuning for mt
Step 1: Downloading XTTS base model files.
 > Downloading XTTS v-main files...
 > Downloading XTTS config file...
0.00iB [00:00, ?iB/s] > XTTS model files downloaded successfully!
Step 2: Extending the XTTS tokenizer with the new language.
Original tokenizer loaded with 21161 tokens.
Training new tokenizer with 3983 texts...
[00:00:00] Tokenize words                 ██████████████████ 10066    /    10066
[00:00:00] Count pairs                    ██████████████████ 10066    /    10066
[00:00:00] Compute merges                 ██████████████████ 15307    /    15307
New tokenizer trained with 15345 tokens.
Merging tokenizers from /content/drive/MyDrive/XTTS_Maltese_Training/output/old_tokenizer/ and /content/drive/MyDrive/XTTS_Maltese_Training/output/new_tokenizer/ into /content/drive/MyDrive/XTTS_Maltese_Training/output/merged_tokenizer/
Old tokenizer vocabulary size: 21161
New tokenizer vocabulary size: 15345
Combined vocabulary size: 21161
Combined vocabulary saved to /content/drive/MyDrive/XTTS_Maltese_Training/output/merged_tokenizer/vocab.json
Tokenizer has been successfully extended and saved to /content/drive/MyDrive/XTTS_Maltese_Training/output/vocab.json
Updating the XTTS checkpoint...
Cleaning checkpoint: /content/drive/MyDrive/XTTS_Maltese_Training/output/model.pth
/content/Malta-TTS/FineTuning/NewLanguage/tokenizer_extension.py:131: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  checkpoint = torch.load(xtts_checkpoint_path, map_location="cpu")
Cleaned checkpoint saved.
Updating the XTTS config file...
Updated config file saved to /content/drive/MyDrive/XTTS_Maltese_Training/output/config.json. Added new language: mt
Step 3: Starting GPT training.
 > Training XTTS model for Maltese with 1 datasets, 8 epochs, batch size 1, grad_acumm 48, output path: /content/drive/MyDrive/XTTS_Maltese_Training/output/training
 > Using the following datasets:
/content/drive/MyDrive/XTTS_Maltese_Data/metadata_train.csv /content/drive/MyDrive/XTTS_Maltese_Data/metadata_eval.csv mt
 > Downloading XTTS model files...
 > Downloading XTTS v-main files...
 > Downloading XTTS config file...

4.37kiB [00:13, 313iB/s]
 > XTTS model files downloaded successfully!
 > XTTS model files downloaded successfully!
Setting up model arguments...
/usr/local/lib/python3.11/dist-packages/TTS/utils/io.py:54: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  return torch.load(f, map_location=map_location, **kwargs)
/usr/local/lib/python3.11/dist-packages/TTS/tts/layers/tortoise/arch_utils.py:336: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  self.mel_norms = torch.load(f)
/usr/local/lib/python3.11/dist-packages/TTS/tts/layers/xtts/trainer/gpt_trainer.py:185: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  dvae_checkpoint = torch.load(self.args.dvae_checkpoint, map_location=torch.device("cpu"))
>> DVAE weights restored from: /content/drive/MyDrive/XTTS_Maltese_Training/output/dvae.pth
Loading datasets...
 | > Found 3984 files in /content/drive/MyDrive/XTTS_Maltese_Data
 > Loaded 3984 training samples and 997 evaluation samples.
 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: False
 | > Precision: float32
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 2
 | > Num. of Torch Threads: 1
 | > Torch seed: 1
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
2025-08-12 14:16:02.650844: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
E0000 00:00:1755008162.732418    8041 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755008162.756114    8041 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755008162.806003    8041 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755008162.806032    8041 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755008162.806039    8041 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755008162.806046    8041 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
 > Start Tensorboard: tensorboard --logdir=/content/drive/MyDrive/XTTS_Maltese_Training/output/training/GPT_XTTS_FT-August-12-2025_02+16PM-87ed0ce

 > Model has 548111567 parameters
Starting training...

 > EPOCH: 0/8
 --> /content/drive/MyDrive/XTTS_Maltese_Training/output/training/GPT_XTTS_FT-August-12-2025_02+16PM-87ed0ce
 > Sampling by language: dict_keys(['mt'])

 > TRAINING (2025-08-12 14:16:08)

   --> TIME: 2025-08-12 14:16:11 -- STEP: 0/3984 -- GLOBAL_STEP: 0
     | > loss_text_ce: 0.09928756207227707  (0.09928756207227707)
     | > loss_mel_ce: 6.890477180480957  (6.890477180480957)
     | > loss: 0.14562010765075684  (0.14562010765075684)
     | > current_lr: 5e-06
     | > step_time: 0.794  (0.7939767837524414)
     | > loader_time: 2.2878  (2.2877612113952637)


   --> TIME: 2025-08-12 14:16:38 -- STEP: 50/3984 -- GLOBAL_STEP: 50
     | > loss_text_ce: 0.09945293515920639  (0.09942027702927589)
     | > loss_mel_ce: 6.973715782165527  (nan)
     | > loss: 0.14735768735408783  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.1831  (0.16856659889221196)
     | > loader_time: 0.0104  (0.31308802127838137)


   --> TIME: 2025-08-12 14:16:52 -- STEP: 100/3984 -- GLOBAL_STEP: 100
     | > loss_text_ce: 0.09928271174430847  (0.09940921388566494)
     | > loss_mel_ce: 4.668110370635986  (nan)
     | > loss: 0.09932069480419159  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.1284  (0.16634904384613045)
     | > loader_time: 0.0085  (0.17005764722824096)


   --> TIME: 2025-08-12 14:17:05 -- STEP: 150/3984 -- GLOBAL_STEP: 150
     | > loss_text_ce: 0.0993969514966011  (0.09939988871415456)
     | > loss_mel_ce: 6.246588230133057  (nan)
     | > loss: 0.13220803439617157  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.1358  (0.16444286823272714)
     | > loader_time: 0.0076  (0.12666624704996743)


   --> TIME: 2025-08-12 14:17:17 -- STEP: 200/3984 -- GLOBAL_STEP: 200
     | > loss_text_ce: 0.09927795082330704  (0.09938823904842138)
     | > loss_mel_ce: 6.259814739227295  (nan)
     | > loss: 0.13248109817504883  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.0993  (0.1629643666744233)
     | > loader_time: 0.0115  (0.09805421829223632)


   --> TIME: 2025-08-12 14:17:39 -- STEP: 250/3984 -- GLOBAL_STEP: 250
     | > loss_text_ce: 0.099264957010746  (0.09938302928209305)
     | > loss_mel_ce: 5.111857891082764  (nan)
     | > loss: 0.1085650622844696  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.1409  (0.16213764381408696)
     | > loader_time: 0.0115  (0.11901351642608642)


   --> TIME: 2025-08-12 14:18:02 -- STEP: 300/3984 -- GLOBAL_STEP: 300
     | > loss_text_ce: 0.09940383583307266  (0.09938052833080291)
     | > loss_mel_ce: 4.946959972381592  (nan)
     | > loss: 0.1051325798034668  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.1354  (0.1589464704195659)
     | > loader_time: 0.5438  (0.14166177272796632)


   --> TIME: 2025-08-12 14:18:29 -- STEP: 350/3984 -- GLOBAL_STEP: 350
     | > loss_text_ce: 0.09936606884002686  (0.09937650371875084)
     | > loss_mel_ce: 5.004878044128418  (nan)
     | > loss: 0.10633842647075653  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.1961  (0.15912888663155692)
     | > loader_time: 0.4374  (0.16278553826468345)


   --> TIME: 2025-08-12 14:18:51 -- STEP: 400/3984 -- GLOBAL_STEP: 400
     | > loss_text_ce: 0.0991375595331192  (0.09937523253262044)
     | > loss_mel_ce: 3.144663095474243  (nan)
     | > loss: 0.06757918000221252  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.175  (0.15892186701297756)
     | > loader_time: 0.5783  (0.1683490478992463)


   --> TIME: 2025-08-12 14:19:15 -- STEP: 450/3984 -- GLOBAL_STEP: 450
     | > loss_text_ce: 0.09929818660020828  (0.09937064692378046)
     | > loss_mel_ce: 4.862645626068115  (nan)
     | > loss: 0.10337382555007935  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.2494  (0.16169842455122202)
     | > loader_time: 0.0061  (0.17523845407697897)


   --> TIME: 2025-08-12 14:19:40 -- STEP: 500/3984 -- GLOBAL_STEP: 500
     | > loss_text_ce: 0.09924977272748947  (0.09936700002849104)
     | > loss_mel_ce: 4.890869617462158  (nan)
     | > loss: 0.10396082699298859  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.1445  (0.16023208856582638)
     | > loader_time: 0.0094  (0.1858855328559876)


   --> TIME: 2025-08-12 14:20:06 -- STEP: 550/3984 -- GLOBAL_STEP: 550
     | > loss_text_ce: 0.09932360798120499  (0.09936148482290183)
     | > loss_mel_ce: nan  (nan)
     | > loss: nan  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.2032  (0.16064514420249246)
     | > loader_time: 0.0066  (0.19378360791639848)


   --> TIME: 2025-08-12 14:20:29 -- STEP: 600/3984 -- GLOBAL_STEP: 600
     | > loss_text_ce: 0.09934771060943604  (0.09935668869564931)
     | > loss_mel_ce: 4.994697093963623  (nan)
     | > loss: 0.10612593591213226  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.1254  (0.1609055916468303)
     | > loader_time: 0.0056  (0.1958686021963756)


   --> TIME: 2025-08-12 14:20:50 -- STEP: 650/3984 -- GLOBAL_STEP: 650
     | > loss_text_ce: 0.09927915036678314  (0.09935260821993534)
     | > loss_mel_ce: 4.099350452423096  (nan)
     | > loss: 0.08747144788503647  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.102  (0.16041925833775458)
     | > loader_time: 0.0056  (0.19444965142470158)


   --> TIME: 2025-08-12 14:21:13 -- STEP: 700/3984 -- GLOBAL_STEP: 700
     | > loss_text_ce: 0.09928202629089355  (0.09934935531445913)
     | > loss_mel_ce: 5.430572986602783  (nan)
     | > loss: 0.1152053102850914  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.2143  (0.16049770627702994)
     | > loader_time: 1.0631  (0.19604456629071937)


   --> TIME: 2025-08-12 14:21:35 -- STEP: 750/3984 -- GLOBAL_STEP: 750
     | > loss_text_ce: 0.09934276342391968  (0.0993479255537192)
     | > loss_mel_ce: 4.6960320472717285  (nan)
     | > loss: 0.0999036431312561  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.1152  (0.16187055238087977)
     | > loader_time: 0.0105  (0.1946799637476605)


   --> TIME: 2025-08-12 14:21:59 -- STEP: 800/3984 -- GLOBAL_STEP: 800
     | > loss_text_ce: 0.09938839077949524  (0.09934662770479917)
     | > loss_mel_ce: 5.075809955596924  (nan)
     | > loss: 0.10781663656234741  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.2478  (0.1629818463325501)
     | > loader_time: 0.352  (0.19665489494800587)


   --> TIME: 2025-08-12 14:22:21 -- STEP: 850/3984 -- GLOBAL_STEP: 850
     | > loss_text_ce: 0.09924538433551788  (0.09934283975292654)
     | > loss_mel_ce: 5.049042701721191  (nan)
     | > loss: 0.10725601017475128  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.2122  (0.1631670407687917)
     | > loader_time: 0.006  (0.19645514460170987)


   --> TIME: 2025-08-12 14:22:42 -- STEP: 900/3984 -- GLOBAL_STEP: 900
     | > loss_text_ce: 0.0993143618106842  (0.09933980262113942)
     | > loss_mel_ce: 4.41800594329834  (nan)
     | > loss: 0.09411083906888962  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.1074  (0.16330833832422897)
     | > loader_time: 0.6636  (0.19477371162838422)


   --> TIME: 2025-08-12 14:23:03 -- STEP: 950/3984 -- GLOBAL_STEP: 950
     | > loss_text_ce: 0.09945549070835114  (0.09933572740146988)
     | > loss_mel_ce: 4.069403171539307  (nan)
     | > loss: 0.08685122430324554  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.2057  (0.16301529809048307)
     | > loader_time: 0.0087  (0.19448888803783232)


   --> TIME: 2025-08-12 14:23:26 -- STEP: 1000/3984 -- GLOBAL_STEP: 1000
     | > loss_text_ce: 0.0992567166686058  (0.09933151377737522)
     | > loss_mel_ce: 5.088573932647705  (nan)
     | > loss: 0.10807980597019196  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.145  (0.16290694594383245)
     | > loader_time: 0.0058  (0.19553615546226516)


   --> TIME: 2025-08-12 14:23:52 -- STEP: 1050/3984 -- GLOBAL_STEP: 1050
     | > loss_text_ce: 0.09929149597883224  (0.09932853177899406)
     | > loss_mel_ce: 3.373035430908203  (nan)
     | > loss: 0.07234014570713043  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.102  (0.1629888452802386)
     | > loader_time: 0.0065  (0.19963491916656506)


   --> TIME: 2025-08-12 14:24:12 -- STEP: 1100/3984 -- GLOBAL_STEP: 1100
     | > loss_text_ce: 0.0992569625377655  (0.09932512729005381)
     | > loss_mel_ce: 5.114090919494629  (nan)
     | > loss: 0.10861141979694366  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.279  (0.16413203196092085)
     | > loader_time: 0.447  (0.19629223563454376)


   --> TIME: 2025-08-12 14:24:31 -- STEP: 1150/3984 -- GLOBAL_STEP: 1150
     | > loss_text_ce: 0.09925679117441177  (0.09932210992859761)
     | > loss_mel_ce: 4.760846138000488  (nan)
     | > loss: 0.10125215351581573  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.2052  (0.164254375955333)
     | > loader_time: 0.0636  (0.1937816545237666)


   --> TIME: 2025-08-12 14:24:54 -- STEP: 1200/3984 -- GLOBAL_STEP: 1200
     | > loss_text_ce: 0.09885226935148239  (0.09931814010565487)
     | > loss_mel_ce: 5.438046455383301  (nan)
     | > loss: 0.11535205692052841  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.1098  (0.16417876005172746)
     | > loader_time: 0.0055  (0.19487470130125686)


   --> TIME: 2025-08-12 14:25:16 -- STEP: 1250/3984 -- GLOBAL_STEP: 1250
     | > loss_text_ce: 0.09906523674726486  (0.09931496726870545)
     | > loss_mel_ce: 5.379488468170166  (nan)
     | > loss: 0.11413653939962387  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.1064  (0.1647187438964844)
     | > loader_time: 0.0094  (0.1936907978057862)


   --> TIME: 2025-08-12 14:25:35 -- STEP: 1300/3984 -- GLOBAL_STEP: 1300
     | > loss_text_ce: 0.09925433248281479  (0.09931252147715834)
     | > loss_mel_ce: 4.310297966003418  (nan)
     | > loss: 0.091865673661232  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.1043  (0.1648214312700125)
     | > loader_time: 0.0077  (0.19157507584645203)


   --> TIME: 2025-08-12 14:25:57 -- STEP: 1350/3984 -- GLOBAL_STEP: 1350
     | > loss_text_ce: 0.09925287216901779  (0.09930909138586795)
     | > loss_mel_ce: 4.419900417327881  (nan)
     | > loss: 0.09414902329444885  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.126  (0.1650100881082041)
     | > loader_time: 0.0076  (0.1913108412424724)


   --> TIME: 2025-08-12 14:26:16 -- STEP: 1400/3984 -- GLOBAL_STEP: 1400
     | > loss_text_ce: 0.09921146184206009  (0.09930578429784102)
     | > loss_mel_ce: 5.3632636070251465  (nan)
     | > loss: 0.11380156874656677  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.2113  (0.16516511559486394)
     | > loader_time: 0.1599  (0.18995167255401618)


   --> TIME: 2025-08-12 14:26:37 -- STEP: 1450/3984 -- GLOBAL_STEP: 1450
     | > loss_text_ce: 0.09918704628944397  (0.09930209123882762)
     | > loss_mel_ce: 5.265410900115967  (nan)
     | > loss: 0.11176245659589767  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.2146  (0.16576877610436805)
     | > loader_time: 0.0081  (0.18876476879777587)


   --> TIME: 2025-08-12 14:26:55 -- STEP: 1500/3984 -- GLOBAL_STEP: 1500
     | > loss_text_ce: 0.09904791414737701  (0.09929870647192009)
     | > loss_mel_ce: 4.15358829498291  (nan)
     | > loss: 0.08859659731388092  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.2475  (0.16635495408376053)
     | > loader_time: 0.0493  (0.18529991420110073)


   --> TIME: 2025-08-12 14:27:19 -- STEP: 1550/3984 -- GLOBAL_STEP: 1550
     | > loss_text_ce: 0.09911007434129715  (0.09929572561575528)
     | > loss_mel_ce: 4.9388041496276855  (nan)
     | > loss: 0.10495655238628387  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.2169  (0.16639484097880694)
     | > loader_time: 0.1706  (0.18663279994841553)


   --> TIME: 2025-08-12 14:27:40 -- STEP: 1600/3984 -- GLOBAL_STEP: 1600
     | > loss_text_ce: 0.09810701757669449  (0.09929270564578481)
     | > loss_mel_ce: 3.950199842453003  (nan)
     | > loss: 0.08433973044157028  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.0924  (0.16641944229602806)
     | > loader_time: 0.0107  (0.1864498174190522)


   --> TIME: 2025-08-12 14:27:58 -- STEP: 1650/3984 -- GLOBAL_STEP: 1650
     | > loss_text_ce: 0.09910426288843155  (0.09928868659969539)
     | > loss_mel_ce: 4.9734625816345215  (nan)
     | > loss: 0.1056784838438034  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.1686  (0.1663196997209028)
     | > loader_time: 0.0068  (0.18411164717240774)


   --> TIME: 2025-08-12 14:28:16 -- STEP: 1700/3984 -- GLOBAL_STEP: 1700
     | > loss_text_ce: 0.09913639724254608  (0.09928555809837937)
     | > loss_mel_ce: 5.142073154449463  (nan)
     | > loss: 0.10919186472892761  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.2019  (0.16651707747403308)
     | > loader_time: 0.0057  (0.18191664162804105)


   --> TIME: 2025-08-12 14:28:33 -- STEP: 1750/3984 -- GLOBAL_STEP: 1750
     | > loss_text_ce: 0.0995119959115982  (0.09928312161139086)
     | > loss_mel_ce: 3.8471474647521973  (nan)
     | > loss: 0.08222207427024841  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.1022  (0.16663083771296902)
     | > loader_time: 0.0076  (0.17942268303462444)


   --> TIME: 2025-08-12 14:28:51 -- STEP: 1800/3984 -- GLOBAL_STEP: 1800
     | > loss_text_ce: 0.09920361638069153  (0.09928011961281306)
     | > loss_mel_ce: 4.469452857971191  (nan)
     | > loss: 0.09518034756183624  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.1817  (0.16710487418704556)
     | > loader_time: 0.0095  (0.1772287559509278)


   --> TIME: 2025-08-12 14:29:09 -- STEP: 1850/3984 -- GLOBAL_STEP: 1850
     | > loss_text_ce: 0.09947633743286133  (0.09927763602620852)
     | > loss_mel_ce: 4.063547134399414  (nan)
     | > loss: 0.08672966063022614  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.0979  (0.1667776948052483)
     | > loader_time: 0.007  (0.17560865363559214)


   --> TIME: 2025-08-12 14:29:27 -- STEP: 1900/3984 -- GLOBAL_STEP: 1900
     | > loss_text_ce: 0.09901562333106995  (0.0992737999401595)
     | > loss_mel_ce: 4.578951835632324  (nan)
     | > loss: 0.09745766222476959  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.218  (0.16648219384645155)
     | > loader_time: 0.4737  (0.17457472663176693)


   --> TIME: 2025-08-12 14:29:43 -- STEP: 1950/3984 -- GLOBAL_STEP: 1950
     | > loss_text_ce: 0.09910154342651367  (0.09927026824691362)
     | > loss_mel_ce: 3.8459911346435547  (nan)
     | > loss: 0.08218943327665329  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.1278  (0.16680658095922218)
     | > loader_time: 0.1  (0.17140764089731075)


   --> TIME: 2025-08-12 14:30:01 -- STEP: 2000/3984 -- GLOBAL_STEP: 2000
     | > loss_text_ce: 0.09903059154748917  (0.09926672193780547)
     | > loss_mel_ce: 4.320775508880615  (nan)
     | > loss: 0.092079296708107  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.1075  (0.16698893833160394)
     | > loader_time: 0.007  (0.16954512476921088)


   --> TIME: 2025-08-12 14:30:19 -- STEP: 2050/3984 -- GLOBAL_STEP: 2050
     | > loss_text_ce: 0.09922541677951813  (0.09926187506536163)
     | > loss_mel_ce: 5.077208518981934  (nan)
     | > loss: 0.10784237831830978  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.2421  (0.16700738255570569)
     | > loader_time: 0.021  (0.16851480739872635)


   --> TIME: 2025-08-12 14:30:39 -- STEP: 2100/3984 -- GLOBAL_STEP: 2100
     | > loss_text_ce: 0.099253810942173  (0.09925926876210038)
     | > loss_mel_ce: 4.917597770690918  (nan)
     | > loss: 0.10451774299144745  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.1345  (0.16725592658633273)
     | > loader_time: 0.3623  (0.16754365080878852)


   --> TIME: 2025-08-12 14:30:54 -- STEP: 2150/3984 -- GLOBAL_STEP: 2150
     | > loss_text_ce: 0.09915103763341904  (0.09925505162671563)
     | > loss_mel_ce: 4.742849349975586  (nan)
     | > loss: 0.10087501257658005  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.248  (0.16705307539119266)
     | > loader_time: 0.0082  (0.16513582052186485)


   --> TIME: 2025-08-12 14:31:09 -- STEP: 2200/3984 -- GLOBAL_STEP: 2200
     | > loss_text_ce: 0.0994110107421875  (0.09924949416382751)
     | > loss_mel_ce: 3.498537063598633  (nan)
     | > loss: 0.07495725154876709  (nan)
     | > current_lr: 5e-06
     | > step_time: 0.1215  (0.16682647553357197)
     | > loader_time: 0.0176  (0.1631722355972637)
```



## Inference

In [None]:
output_path = "/content/drive/MyDrive/XTTS_Maltese_Training/output"
xtts_checkpoint = os.path.join(output_path, "xtts.pth")
xtts_config = os.path.join(output_path, "config.json")
xtts_vocab = os.path.join(output_path, "vocab.json")

tts_text = "Il-kelma Maltija 'bonġu' tfisser 'good morning'."
speaker_audio_file = "/content/drive/MyDrive/XTTS_Maltese_Data/wavs/MSRHS_F_02_P03U015_0046.wav"
lang = "mt"
output_file = "output_maltese.wav"

!python inference.py \
    --xtts_checkpoint="{xtts_checkpoint}" \
    --xtts_config="{xtts_config}" \
    --xtts_vocab="{xtts_vocab}" \
    --tts_text="{tts_text}" \
    --speaker_audio_file="{speaker_audio_file}" \
    --lang="{lang}" \
    --output_file="{output_file}"


from IPython.display import Audio
Audio(output_file)

### Inference parameters
text: The text to be synthesized.

language: The language of the text to be synthesized.

gpt_cond_latent: The latent vector you get with get_conditioning_latents. (You can cache for faster inference with same speaker)

speaker_embedding: The speaker embedding you get with get_conditioning_latents. (You can cache for faster inference with same speaker)

temperature: The softmax temperature of the autoregressive model. Defaults to 0.65.

length_penalty: A length penalty applied to the autoregressive decoder. Higher settings cause the model to produce more terse outputs. Defaults to 1.0.

repetition_penalty: A penalty that prevents the autoregressive decoder from repeating itself during decoding. Can be used to reduce the incidence of long silences or “uhhhhhhs”, etc. Defaults to 2.0.

top_k: Lower values mean the decoder produces more “likely” (aka boring) outputs. Defaults to 50.

top_p: Lower values mean the decoder produces more “likely” (aka boring) outputs. Defaults to 0.8.

speed: The speed rate of the generated audio. Defaults to 1.0. (can produce artifacts if far from 1.0)

enable_text_splitting: Whether to split the text into sentences and generate audio for each sentence. It allows you to have infinite input length but might lose important context between sentences. Defaults to True.