# XTTS Finetuning for Maltese

Author: [Mathieu Waharte](mailto:mathieu.waharte@universite-paris-saclay.fr)

## Requirements
- Python <3.11 for the original Coqui TTS and >=3.11 for the TTS port
- HF_TOKEN
- Around 4GB for original files + 5GB per checkpoint

Installation takes around 6min (you may need to restart the session at the end)

In [2]:
# Mount Google Drive; later cells read and write under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

# Start from a clean checkout: remove any previous clone before cloning again.
%cd /content/
%rm -rf Malta-TTS
!git clone https://github.com/Wubpooz/Malta-TTS.git

# Install the dependencies listed in the repository's requirements file,
# then upgrade the extra TensorFlow packages.
%cd Malta-TTS/FineTuning/NewLanguage
!pip install --upgrade pip
!pip install -r requirements.txt
!pip install tf-keras tensorflow-decision-forests tensorflow-text --upgrade

# Download the Stanza resources for Maltese ('mt').
!python -c "import stanza; stanza.download('mt')"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content
Cloning into 'Malta-TTS'...
remote: Enumerating objects: 1049, done.
remote: Counting objects: 100% (59/59), done.
remote: Compressing objects: 100% (25/25), done.
remote: Total 1049 (delta 52), reused 35 (delta 34), pack-reused 990 (from 1)
Receiving objects: 100% (1049/1049), 9.69 MiB | 6.80 MiB/s, done.
Resolving deltas: 100% (664/664), done.
/content/Malta-TTS/FineTuning/NewLanguage
Ignoring TTS: markers 'python_version < "3.11"' don't match your environment
Ignoring trainer: markers 'python_version < "3.11"' don't match your environment
Ignoring coqpit: markers 'python_version < "3.11"' don't match your environment
Ignoring transformers: markers 'python_version < "3.11"' don't match your environment
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 367kB [00:00, 85.6MB/s]        
20

## Data Preparation

In [None]:
# @title Save Metadata and Resample audio (takes 2:30min with 16 workers)

%cd /content/Malta-TTS/FineTuning/NewLanguage/

from prepare_maltese_dataset import load_and_resample

load_and_resample(output_dir='/content/drive/MyDrive/XTTS_Maltese_Data', dataset='Bluefir/MASRI_HEADSET_v2', sampling_rate=22050, num_workers=16, save_audio=False)

/content/Malta-TTS/FineTuning/NewLanguage
Loading dataset from Hugging Face...
Resampling to 22050Hz and saving...
Processing split: train (workers=16)


Resampling train: 100%|██████████| 3983/3983 [00:00<00:00, 4346.66it/s]


Saved 3983 entries to metadata_train.csv
Processing split: test (workers=16)


Resampling test: 100%|██████████| 996/996 [00:00<00:00, 4185.85it/s]


Saved 996 entries to metadata_eval.csv
Dataset saved!


In [None]:
# @title Dataset repartition
from prepare_maltese_dataset import dataset_repartition

# Hugging Face dataset whose splits are analysed.
masri_dataset = 'Bluefir/MASRI_HEADSET_v2'
dataset_repartition(masri_dataset)

Processing split: train
Processing split: test
Text length range: 1 - 188 characters
Audio duration range: 0.62 - 10.89 seconds
Average text length: 65.99 characters
Average audio duration: 4.82 seconds


## Finetuning

In [5]:
# @title Output Redirection

import os
import sys

# Streams captured once, when this cell runs, so they can always be restored.
_original_stdout = sys.stdout
_original_stderr = sys.stderr
# Log file currently receiving a copy of the output, or None when disabled.
_log_file = None


class _Tee(object):
  """File-like object that duplicates every write to several streams."""

  def __init__(self, *streams):
    self.streams = streams

  def write(self, data):
    """Write ``data`` to every stream, flushing each so output stays in sync."""
    for s in self.streams:
      s.write(data)
      s.flush()
    return len(data)

  def flush(self):
    """Flush every underlying stream."""
    for s in self.streams:
      s.flush()

  def __getattr__(self, name):
    # Delegate everything else (isatty, encoding, fileno, ...) to the first
    # stream so code that probes sys.stdout keeps working while redirected.
    if name == "streams":
      raise AttributeError(name)
    return getattr(self.streams[0], name)


def output_redirect(output_path: str, redirect: bool = True):
  """Enable or disable mirroring of stdout/stderr into a log file.

  Args:
    output_path: Directory holding ``full_training.log``. It is created when
      missing (only when ``redirect`` is True).
    redirect: When True, stdout and stderr are replaced by tees that write to
      both the original stream and a freshly truncated log file. When False,
      the original streams are restored and the log file is closed.
  """
  global _log_file

  # Always undo any previous redirection first: this makes repeated calls safe
  # and prevents leaking the previously opened log file handle.
  sys.stdout = _original_stdout
  sys.stderr = _original_stderr
  if _log_file is not None:
    _log_file.close()
    _log_file = None

  if not redirect:
    return

  os.makedirs(output_path, exist_ok=True)
  log_path = os.path.join(output_path, "full_training.log")
  # Mode "w" truncates an existing log; buffering=1 keeps it line-buffered.
  _log_file = open(log_path, "w", buffering=1, encoding="utf-8")

  sys.stdout = _Tee(_original_stdout, _log_file)
  sys.stderr = _Tee(_original_stderr, _log_file)

In [None]:
# Environment variables set before the training code is imported.
%env TOKENIZERS_PARALLELISM=false
%env OMP_NUM_THREADS=1
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

import os

# Drive locations: training outputs and the metadata CSVs used for training/eval.
output_path = "/content/drive/MyDrive/XTTS_Maltese_Training"
metadata_train_path = "/content/drive/MyDrive/XTTS_Maltese_Data/metadata_train.csv"
metadata_eval_path = "/content/drive/MyDrive/XTTS_Maltese_Data/metadata_eval.csv"
language_code = "mt"
if not os.path.exists(output_path):
  os.makedirs(output_path, exist_ok=True)
# NOTE(review): redirect=False restores the plain streams, so nothing is mirrored
# to full_training.log during the run — pass True here if the log is wanted; confirm intent.
output_redirect(output_path, False)

# Measured ~35 min/epoch on one T4 with batch_size=1, grad_acumm=48,
# max_audio_length=255995, max_text_length=200, weight_decay=1e-2
print(f"Finetuning for {language_code}...")
%cd /content/Malta-TTS/FineTuning/NewLanguage

from new_language_training_cli import main
from types import SimpleNamespace

# Memory optimization based on available VRAM?
# import torch
# if torch.cuda.is_available():
  # gpu_info = torch.cuda.get_device_properties(0)
  # print(f"GPU detected: {gpu_info.name}")
  # vram_gb = gpu_info.total_memory / 1024**3
  # print(f"   Memory: {vram_gb:.1f} GB")
  # print(f"   Compute Capability: {gpu_info.major}.{gpu_info.minor}")
  # if vram_gb < 16:
  #   print("⚠️ Low VRAM detected. Adjusting settings...")
  #   batch_size = 2
  #   gradient_accumulation = 126
  #   mixed_precision = True
  #   gradient_checkpointing = True
  # elif vram_gb < 24:
  #   batch_size = 3
  #   gradient_accumulation = 84
  #   mixed_precision = True
  # else:
  #   batch_size = 6
  #   gradient_accumulation = 42

# Training configuration bundled as an attribute namespace and handed to main().
args = SimpleNamespace(
    is_download=True,
    output_path=output_path,
    metadatas=[(metadata_train_path, metadata_eval_path, language_code)],
    num_epochs=1,
    batch_size=1,
    grad_acumm=48,
    min_frequency=2,
    max_new_tokens=1000,
    max_audio_length=255995,
    max_text_length=200,
    weight_decay=1e-2,
    lr=5e-6,
    save_step=50000,
    version="main",
    language=language_code,
    forgetting_mitigation="none",  # or "LORA" or "Freeze"
    multi_gpu=False,
    optimizations=False,
    tf32=False,
)

# NOTE(review): the recorded run of this cell failed with
# "TypeError: main() takes 0 positional arguments but 1 was given", so the imported
# main() does not accept `args` positionally — confirm how it expects its configuration.
# The five returned values are reused by the Inference cell.
xtts_checkpoint, xtts_vocab, xtts_config, trainer_out_path, speaker_ref = main(args)
# Suggested alternative to limit forgetting: lr = 5e-7, weight_decay = 0.1


print("Finetuning process completed!")
# Restore the original stdout/stderr and close the log file if one was opened.
output_redirect(output_path, False)

env: TOKENIZERS_PARALLELISM=false
env: OMP_NUM_THREADS=1
env: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
Finetuning for mt...
/content/Malta-TTS/FineTuning/NewLanguage
[('/content/drive/MyDrive/XTTS_Maltese_Data/metadata_train.csv', '/content/drive/MyDrive/XTTS_Maltese_Data/metadata_eval.csv', 'mt')]


TypeError: main() takes 0 positional arguments but 1 was given

## Inference

In [None]:
%cd /content/Malta-TTS/FineTuning/NewLanguage

import os
from inference import inference
import torchaudio

# Write results next to the checkpoint when one is set; otherwise fall back to
# the training folder on Drive.
# NOTE(review): xtts_checkpoint is only assigned by the Finetuning cell (or by the
# commented-out paths below); if neither ran, this line raises NameError.
output_path = os.path.dirname(xtts_checkpoint) if xtts_checkpoint else "/content/drive/MyDrive/XTTS_Maltese_Training"

# Temporary file paths: uncomment to run inference on an existing checkpoint
# without re-running the Finetuning cell.
# output_path = "/content/drive/MyDrive/XTTS_Maltese_Training"
# xtts_checkpoint = os.path.join(output_path, "training/", "GPT_XTTS_FT-September-03-2025_10+17AM-741ac52/", "checkpoint_35.pth")
# xtts_config = os.path.join(output_path, "config.json")
# xtts_vocab = os.path.join(output_path, "vocab.json")

# Parameters
language_code = "mt"
# Reference speaker recording: the Maltese sample for "mt", the English one otherwise.
# (The comparison must be `==`; a single `=` here is a SyntaxError.)
speaker_audio_file = (
  "/content/drive/MyDrive/XTTS_Maltese_Data/wavs/MSRHS_M_11_P24U082_0147.wav"
  if language_code == "mt"
  else "/content/drive/MyDrive/english_speaker.wav"
)

# Sample sentences per language code; the first one of the selected language is synthesised.
test_sentences = {
  'mt': [
    "Il-kelma Maltija 'bonġu' tfisser 'good morning'.",
    "Bonġu, kif int illum?",
    "Il-ħajja hija sabiħa.",
    "Grazzi ħafna tal-għajnuna tiegħek.",
    "Il-Milied qed joqrob malajr.",
    "Nispera li jkollok ġurnata tajba."
  ],
  'en': [
    "Hello, how are you today?",
    "This is a test of the new model.",
    "The weather is beautiful today.",
    "Thank you for your help.",
    "I hope you have a great day."
  ]
}
sentences_list = test_sentences.get(language_code)
# Reject both an unknown language code and an empty sentence list before indexing.
if not sentences_list:
  raise ValueError(f"No test sentences found for language code '{language_code}'")
sentences = sentences_list[0]


# Decoding settings forwarded to inference().
temperature, length_penalty, repetition_penalty = 0.7, 1.0, 10.0
top_k, top_p = 50, 0.8
LORA_trained = False

generation_settings = {
  "temperature": temperature,
  "length_penalty": length_penalty,
  "repetition_penalty": repetition_penalty,
  "top_k": top_k,
  "top_p": top_p,
  "LORA_trained": LORA_trained,
}

print("Starting inference...")
audio_waveform = inference(
  xtts_checkpoint=xtts_checkpoint,
  xtts_config=xtts_config,
  xtts_vocab=xtts_vocab,
  tts_text=sentences,
  speaker_audio_file=speaker_audio_file,
  lang_code=language_code,
  **generation_settings,
)
print("Inference completed!")

# Save the waveform as output_<language>.wav inside output_path, creating the
# folder first when it does not exist yet.
output_file = os.path.join(output_path, f"output_{language_code}.wav")
output_dir = os.path.dirname(output_file)
if not os.path.exists(output_dir):
  os.makedirs(output_dir, exist_ok=True)
torchaudio.save(output_file, audio_waveform, sample_rate=24000)
print(f"Audio saved to {output_file}")

# Last expression of the cell: renders an audio player for the saved file.
from IPython.display import Audio
Audio(output_file, rate=24000)

/content/Malta-TTS/FineTuning/NewLanguage
Starting inference...
Loading model...
Config loaded.
Initializing model...
Loading checkpoint...
Detected standard model weights. Loading as base model...
Model loaded successfully!


  cadena = re.sub('\s+',' ',cadena)


Added char_limits for 'mt' language.
mt added to tokenizer.py!
Applied custom tokenizer.
Computing speaker latents...
Speaker latents computed successfully!
Processing text...
Split into 1 sentences.
Final text chunks: 1
  Chunk 1: 47 chars - 'Il-kelma Maltija 'bonġu' tfisser 'good morning''
Running inference...


Processing sentences: 100%|██████████| 1/1 [00:25<00:00, 25.08s/it]


Inference successful!
Inference completed!
Audio saved to /content/drive/MyDrive/XTTS_Maltese_Training/output_mt.wav


  return torch.tensor(wav_chunks[0]).unsqueeze(0) #TODO remove torch.tensor?


## Further Improvements

In [None]:
# @title Model Downloading
# Offer `output_file` as a browser download when the notebook runs in Colab.
try:
  # Keep only the import inside the try so that errors raised by the download
  # itself are not mistaken for "not running in Colab".
  from google.colab import files
except ImportError:
  # Outside Colab there is no download helper; say so instead of failing silently.
  print("google.colab is not available; skipping download.")
else:
  files.download(output_file)
  print("📥 Download started!")

### Widget use

In [16]:
import ipywidgets as widgets
from IPython.display import display

# Free-text input box pre-filled with a Maltese sample sentence.
# Other widget types (Dropdown, Slider, ...) are available as well.
textarea_options = {
  "value": "Il-kelma Maltija 'bonġu' tfisser 'good morning'.",
  "placeholder": 'Type something',
  "description": 'Text:',
  "disabled": False,
}
text = widgets.Textarea(**textarea_options)
display(text)

Textarea(value="Il-kelma Maltija 'bonġu' tfisser 'good morning'.", description='Text:', placeholder='Type some…

In [None]:
# Print the current content of the `text` Textarea widget.
print(text.value)