# XTTS Finetuning for Maltese

**Author: [Mathieu Waharte](mailto:mathieu.waharte@universite-paris-saclay.fr)**

## Requirements
- Python <3.11 for the original Coqui TTS package, or >=3.11 for the TTS port
- HF_TOKEN
- Around 4GB for original files + 5GB per checkpoint

Installation takes around 6 min (you may need to restart the session once it finishes).

In [5]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/
%rm -rf Malta-TTS
!git clone https://github.com/Wubpooz/Malta-TTS.git

%cd Malta-TTS/FineTuning/NewLanguage
!pip install --upgrade pip
!pip install -r requirements.txt
!pip install tf-keras tensorflow-decision-forests tensorflow-text --upgrade

!python -c "import stanza; stanza.download('mt')"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content
Cloning into 'Malta-TTS'...
remote: Enumerating objects: 1026, done.[K
remote: Counting objects: 100% (291/291), done.[K
remote: Compressing objects: 100% (191/191), done.[K
remote: Total 1026 (delta 197), reused 192 (delta 100), pack-reused 735 (from 1)[K
Receiving objects: 100% (1026/1026), 9.66 MiB | 10.57 MiB/s, done.
Resolving deltas: 100% (636/636), done.
/content/Malta-TTS/FineTuning/NewLanguage
Ignoring TTS: markers 'python_version < "3.11"' don't match your environment
Ignoring trainer: markers 'python_version < "3.11"' don't match your environment
Ignoring coqpit: markers 'python_version < "3.11"' don't match your environment
Ignoring transformers: markers 'python_version < "3.11"' don't match your environment
Collecting torch==2.5.1 (from -r requirements.txt (line 2))
  Downloading torch-2.5.1-cp312-cp312-manylinux1_x86_64.whl.metadata

## Data Preparation

In [21]:
# @title Save Metadata and Resample audio (takes 2:30min with 16 workers)

%cd /content/Malta-TTS/FineTuning/NewLanguage/

from prepare_maltese_dataset import load_and_resample

load_and_resample(output_dir='/content/drive/MyDrive/XTTS_Maltese_Data', dataset='Bluefir/MASRI_HEADSET_v2', sampling_rate=22050, num_workers=16, save_audio=False)

/content/Malta-TTS/FineTuning/NewLanguage
Loading dataset from Hugging Face...
Resampling to 22050Hz and saving...
Processing split: train (workers=16)


Resampling train: 100%|██████████| 3983/3983 [00:00<00:00, 4346.66it/s]


Saved 3983 entries to metadata_train.csv
Processing split: test (workers=16)


Resampling test: 100%|██████████| 996/996 [00:00<00:00, 4185.85it/s]


Saved 996 entries to metadata_eval.csv
Dataset saved!


In [2]:
# @title Dataset repartition
from prepare_maltese_dataset import dataset_repartition

# Report text-length and audio-duration statistics for the dataset splits.
masri_dataset = 'Bluefir/MASRI_HEADSET_v2'
dataset_repartition(masri_dataset)

Processing split: train
Processing split: test
Text length range: 1 - 188 characters
Audio duration range: 0.62 - 10.89 seconds
Average text length: 65.99 characters
Average audio duration: 4.82 seconds


## Finetuning

In [1]:
# @title Output Redirection

import os
import sys


def _unwrap_tee(stream):
  """Return the stream underneath any Tee installed by an earlier run of this cell."""
  while getattr(stream, "_is_output_tee", False):
    stream = stream.streams[0]
  return stream


# Remember the real notebook streams. Unwrapping keeps these correct even when
# this cell is re-executed while redirection is still active (otherwise the Tee
# itself would be recorded as the "original" stream).
_original_stdout = _unwrap_tee(sys.stdout)
_original_stderr = _unwrap_tee(sys.stderr)
# Keep a handle left open by a previous run of this cell so that the next
# output_redirect() call can close it instead of leaking it.
_log_file = globals().get("_log_file")


class _Tee(object):
  """File-like object that duplicates every write to several streams."""

  # Marker used by _unwrap_tee(); an isinstance check would not survive a
  # re-run of this cell because the class object is re-created each time.
  _is_output_tee = True

  def __init__(self, *streams):
    self.streams = streams

  def write(self, data):
    """Write ``data`` to every stream, flushing each one immediately."""
    for s in self.streams:
      s.write(data)
      s.flush()
    # Text streams report the number of characters written.
    return len(data)

  def flush(self):
    """Flush every underlying stream."""
    for s in self.streams:
      s.flush()

  def __getattr__(self, name):
    # Only reached for attributes not defined above (isatty, encoding, ...):
    # answer on behalf of the first (primary) stream.
    if name == "streams":
      raise AttributeError(name)
    return getattr(self.streams[0], name)


def output_redirect(output_path: str, redirect: bool = True):
  """Mirror stdout/stderr into ``<output_path>/full_training.log``, or stop doing so.

  Args:
    output_path: Directory that holds ``full_training.log``. It is created if
      missing. Ignored when ``redirect`` is False.
    redirect: True to start a fresh (truncated) log and tee both streams into
      it; False to restore the original streams and close the log.

  Calling this repeatedly is safe: any log file opened by a previous call is
  closed before a new one is opened.
  """
  global _log_file

  # Always go back to the real streams first so a second "enable" call neither
  # stacks Tees nor leaks the previously opened log file.
  sys.stdout = _original_stdout
  sys.stderr = _original_stderr
  if _log_file is not None:
    _log_file.close()
    _log_file = None
  if not redirect:
    return

  os.makedirs(output_path, exist_ok=True)
  log_path = os.path.join(output_path, "full_training.log")
  # "w" truncates any earlier log; line buffering keeps the file current while
  # training runs; explicit UTF-8 so non-ASCII text is logged regardless of locale.
  _log_file = open(log_path, "w", buffering=1, encoding="utf-8")

  sys.stdout = _Tee(_original_stdout, _log_file)
  sys.stderr = _Tee(_original_stderr, _log_file)

In [24]:
%env OUTPUT_PATH=/content/drive/MyDrive/XTTS_Maltese_Training
%env META_TRAIN=/content/drive/MyDrive/XTTS_Maltese_Data/metadata_train.csv
%env META_EVAL=/content/drive/MyDrive/XTTS_Maltese_Data/metadata_eval.csv
%env LANG=mt
%env TOKENIZERS_PARALLELISM=false
%env OMP_NUM_THREADS=1
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

import os
output_path = os.environ.get("OUTPUT_PATH")
metadata_train_path = os.environ.get("META_TRAIN")
metadata_eval_path = os.environ.get("META_EVAL")
language_code = os.environ.get("LANG")
if not os.path.exists(output_path):
  os.makedirs(output_path, exist_ok=True)
output_redirect(output_path, True)


# 35min/epoch on one T4 with batch_size=1, grad_acumm=48, audio_length=255995, max_text=200, weight=1e-2
print(f"Finetuning for {language_code}...")
%cd /content/Malta-TTS/FineTuning/NewLanguage
!python new_language_training_cli.py \
    --is_download \
    --is_tokenizer_extension \
    --output_path $OUTPUT_PATH \
    --metadatas "$META_TRAIN,$META_EVAL,$LANG" \
    --num_epochs 1 \
    --batch_size 3 \
    --grad_acumm 48 \
    --max_audio_length 255995 \
    --max_text_length 200 \
    --weight_decay 1e-2 \
    --lr 5e-6 \
    --save_step 50000 \
    --version=main \
    --language $LANG \
    --forgetting_mitigation "none" # or LORA or Freeze
# --multi-gpu

# lr = 5e-7, weight_decay = 0.1 for avoiding forgetting


print("Finetuning process completed!")
output_redirect(output_path, False)

env: OUTPUT_PATH=/content/drive/MyDrive/XTTS_Maltese_Training
env: META_TRAIN=/content/drive/MyDrive/XTTS_Maltese_Data/metadata_train.csv
env: META_EVAL=/content/drive/MyDrive/XTTS_Maltese_Data/metadata_eval.csv
env: LANG=mt
env: TOKENIZERS_PARALLELISM=false
env: OMP_NUM_THREADS=1
env: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
Finetuning for mt...
/content/Malta-TTS/FineTuning/NewLanguage
2025-09-02 08:55:31.707960: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756803331.736909   43984 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756803331.746040   43984 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1756803331.791442   43984 

## Inference

In [12]:
import os
output_path = os.environ.get("OUTPUT_PATH") or "/content/drive/MyDrive/XTTS_Maltese_Training"
xtts_checkpoint = os.path.join(output_path, "training/", "GPT_XTTS_FT-September-02-2025_08+57AM-6f60f78/", "best_model.pth")
xtts_checkpoint = os.path.join(output_path, "training/", "GPT_XTTS_FT-September-02-2025_08+21AM-6f60f78/", "checkpoint_77.pth")
xtts_config = os.path.join(output_path, "config.json")
xtts_vocab = os.path.join(output_path, "vocab.json")

tts_text = "Il-kelma Maltija 'bonġu' tfisser 'good morning'."
speaker_audio_file = "/content/drive/MyDrive/XTTS_Maltese_Data/wavs/MSRHS_M_11_P24U082_0147.wav"
lang = "mt"

# tts_text = "Hi, how are you?"
# speaker_audio_file = "/content/drive/MyDrive/english_speaker.mp3"
# lang="en"
output_file = os.path.join(output_path, "output_"+lang+".wav")

%cd /content/Malta-TTS/FineTuning/NewLanguage
!python inference.py \
    --xtts_checkpoint="{xtts_checkpoint}" \
    --xtts_config="{xtts_config}" \
    --xtts_vocab="{xtts_vocab}" \
    --tts_text="{tts_text}" \
    --speaker_audio_file="{speaker_audio_file}" \
    --lang="{lang}" \
    --output_file="{output_file}" \
    --temperature 0.7 \
    --length_penalty 1.0 \
    --repetition_penalty 10.0 \
    --top_k 50 \
    --top_p 0.8 \

from IPython.display import Audio
Audio(output_file, rate=24000)

Ff
/content/Malta-TTS/FineTuning/NewLanguage
Traceback (most recent call last):
  File "/content/Malta-TTS/FineTuning/NewLanguage/inference.py", line 1, in <module>
    import compatibility
  File "/content/Malta-TTS/FineTuning/NewLanguage/compatibility.py", line 5, in <module>
    import TTS
  File "/usr/local/lib/python3.12/dist-packages/TTS/__init__.py", line 3, in <module>
    from TTS.utils.generic_utils import is_pytorch_at_least_2_4
  File "/usr/local/lib/python3.12/dist-packages/TTS/utils/generic_utils.py", line 12, in <module>
    import torch
  File "/usr/local/lib/python3.12/dist-packages/torch/__init__.py", line 2486, in <module>
    from torch import _meta_registrations
  File "/usr/local/lib/python3.12/dist-packages/torch/_meta_registrations.py", line 10, in <module>
    from torch._decomp import (
  File "/usr/local/lib/python3.12/dist-packages/torch/_decomp/__init__.py", line 250, in <module>
    import torch._refs
  File "/usr/local/lib/python3.12/dist-packages/torch/_

ValueError: could not convert string to float: '/content/drive/MyDrive/XTTS_Maltese_Training/output_mt.wav'

### Widget use

In [11]:
from IPython.display import display
import ipywidgets as widgets

# Editable text area pre-filled with a Maltese sample sentence.
# The widget is bound to `text` so that other cells can read `text.value`.
text = widgets.Textarea(
    description='Text:',
    placeholder='Type something',
    value="Il-kelma Maltija 'bonġu' tfisser 'good morning'.",
    disabled=False,
)
display(text)

Textarea(value="Il-kelma Maltija 'bonġu' tfisser 'good morning'.", description='Text:', placeholder='Type some…

In [None]:
# Show the current content of the `text` Textarea widget (run the widget cell first).
print(text.value)