# XTTS Finetuning for Maltese

## Requirements
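- Python 3.10 (note: the Colab log below downloaded a `cp311` wheel, i.e. the run shown here actually used Python 3.11)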

Setup takes around 6 minutes; restart the session (runtime) once it finishes so the newly installed packages are picked up.

In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Clone the repository
!git clone https://github.com/Wubpooz/Malta-TTS.git
%cd Malta-TTS/FineTuning/NewLanguage

!pip install --upgrade pip
!pip install -r requirements.txt


# !python -m spacy download en_core_web_sm

# !pip install TTS

# !pip uninstall -y torch torchaudio torchvision
# !pip install torch==2.5.1+cu121 torchaudio==2.5.1 torchvision --index-url https://download.pytorch.org/whl/cu121
# !pip install transformers==4.38.2

Mounted at /content/drive
Cloning into 'Malta-TTS'...
remote: Enumerating objects: 633, done.[K
remote: Counting objects: 100% (242/242), done.[K
remote: Compressing objects: 100% (174/174), done.[K
remote: Total 633 (delta 151), reused 152 (delta 66), pack-reused 391 (from 1)[K
Receiving objects: 100% (633/633), 7.07 MiB | 12.53 MiB/s, done.
Resolving deltas: 100% (345/345), done.
/content/Malta-TTS/FineTuning/NewLanguage
Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.2
Collecting torch==2.5.1 (from -r requirements.txt (line 2))
  Downloading torch-2.5.1-cp311-cp311-manylinux1_x86_64.

## Parameters

In [1]:
# Code of the language being added; forwarded to the training CLI via
# --language and as the third field of --metadatas.
language_code = "mt"
# Pipe-separated metadata files produced by the "Data Preparation" cell.
metadata_train_path = "/content/drive/MyDrive/XTTS_Maltese_Data/metadata_train.csv"
metadata_eval_path = "/content/drive/MyDrive/XTTS_Maltese_Data/metadata_eval.csv"
# Root folder for training artifacts; the inference cell reads config.json,
# vocab.json and the training/ run folders from here.
# NOTE: output_redirect() hard-codes a log path inside this same folder — keep them in sync.
output_path = "/content/drive/MyDrive/XTTS_Maltese_Training/output"
# Passed to the training CLI as --extended_vocab_size.
extended_vocab_size_param = 100000

# Environment variables for this kernel and the processes it launches.
# NOTE(review): presumably these silence tokenizer fork warnings, limit OpenMP
# threads and reduce CUDA allocator fragmentation — confirm against the
# respective library documentation.
%env TOKENIZERS_PARALLELISM=false
%env OMP_NUM_THREADS=1
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

env: TOKENIZERS_PARALLELISM=false
env: OMP_NUM_THREADS=1
env: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True


## Data Preparation

In [2]:
# @title Takes 35min initially, but once it's saved to GDrive it's instant
# EXCRUCIATINGLY SLOW: 35min to save ~4900 audio files => use concurrency if available


import os
import shutil
import pandas as pd
from tqdm.auto import tqdm
from datasets import load_dataset
from concurrent.futures import ThreadPoolExecutor

# Set the library path so torchcodec can find ffmpeg ?
# NOTE(review): the dynamic loader normally reads LD_LIBRARY_PATH at process
# start, so this presumably only affects child processes — confirm it is needed.
# Use .get() so an unset variable does not raise KeyError, and skip the append
# when the directory is already listed so re-running the cell does not pile up
# duplicate entries.
_ffmpeg_lib_dir = "/usr/lib/x86_64-linux-gnu/"
_current_ld_path = os.environ.get('LD_LIBRARY_PATH', '')
if _ffmpeg_lib_dir not in _current_ld_path.split(":"):
  # No leading ":" when the variable was empty (an empty entry means "current directory").
  os.environ['LD_LIBRARY_PATH'] = f"{_current_ld_path}:{_ffmpeg_lib_dir}" if _current_ld_path else _ffmpeg_lib_dir

# def save_single_file(example, output_dir, i):
#     audio_filename = example['audio']['path']
#     audio_bytes = example['audio']['bytes']
#     text = example['normalized_text']
#     speaker_id = example['speaker_id']

#     output_audio_path = os.path.join(output_dir, audio_filename)

#     with open(output_audio_path, 'wb') as f:
#       f.write(audio_bytes) # is it 24kHz ?

#     return {
#         'audio_file': os.path.join("wavs", audio_filename),
#         'text': text,
#         'speaker_name': speaker_id
#     }

# def save_dataset_split_concurent(split, filename):
#     data = []
#     with ThreadPoolExecutor(max_workers=16) as executor:
#         futures = [executor.submit(save_single_file, example, wavs_dir, i) for i, example in enumerate(ds[split])]

#         for future in tqdm(futures, desc=f"Processing {split} split"):
#             data.append(future.result())

#     df = pd.DataFrame(data)
#     df.to_csv(os.path.join(output_dir, filename), sep="|", index=False)
#     print(f"Saved {len(df)} files to {filename}")


def save_dataset_split(split, filename, save_audio=True):
  """Export one dataset split as an LJSpeech-style (extended) metadata CSV.

  Relies on the module-level names `ds` (the loaded dataset), `wavs_dir`
  (where audio files are written) and `output_dir` (where the CSV is written).

  Args:
    split: Name of the split to read from `ds`.
    filename: Name of the CSV file created inside `output_dir`.
    save_audio: When truthy, each example's raw audio bytes are also written
      into `wavs_dir` under the example's original file name.
  """
  rows = []
  records = ds[split].to_list()
  for record in tqdm(records, desc=f"Processing {split} split"):
    audio_info = record['audio']
    wav_name = audio_info['path']
    raw_audio = audio_info['bytes']
    transcript = record['normalized_text']
    speaker = record['speaker_id']

    if save_audio:
      wav_path = os.path.join(wavs_dir, wav_name)
      with open(wav_path, 'wb') as wav_file:
        wav_file.write(raw_audio) #TODO is it 24kHz ?

    # LJSpeech format (extended).
    # /!\ audio_file must be a bare file name WITHOUT extension, otherwise
    # loading fails: the loader itself prepends wav/ and appends .wav.
    stem = os.path.splitext(wav_name)[0]

    rows.append(dict(
      audio_file=stem,
      text=transcript,
      normalized_text=transcript,
      speaker_name=speaker,
    ))

  frame = pd.DataFrame(rows)
  csv_path = os.path.join(output_dir, filename)
  frame.to_csv(csv_path, sep="|", index=False)
  print(f"Saved {len(frame)} files to {filename}")



# Everything produced by this cell lives in this Google Drive folder.
output_dir = "/content/drive/MyDrive/XTTS_Maltese_Data"
wavs_dir = os.path.join(output_dir, "wavs")

# The dataset counts as prepared only when the audio folder AND both metadata
# files are present (the eval CSV is required by the training cell as well).
_required_paths = [
  wavs_dir,
  os.path.join(output_dir, "metadata_train.csv"),
  os.path.join(output_dir, "metadata_eval.csv"),
]
if all(os.path.exists(path) for path in _required_paths):
  print(f"Processed dataset already exists at {output_dir}")
else:
  print("Loading dataset from Hugging Face...")
  ds = load_dataset("Bluefir/MASRI_HEADSET_v2")

  # Decide whether audio must be written BEFORE creating the folder. Checking
  # for existence right after os.makedirs() is always true, which previously
  # forced save_audio=False and meant no WAV file was ever written.
  # Audio is skipped only when the folder already contains files.
  # NOTE: a partially filled folder (interrupted run) is treated as complete;
  # delete it to force a full re-export.
  save_audio = not (os.path.isdir(wavs_dir) and os.listdir(wavs_dir))
  os.makedirs(wavs_dir, exist_ok=True)  # also creates output_dir

  print("Preparing and saving dataset files...")
  save_dataset_split("train", "metadata_train.csv", save_audio)
  save_dataset_split("test", "metadata_eval.csv", save_audio)
  print("Dataset preparation complete (saved to Google Drive too).")

Processed dataset already exists at /content/drive/MyDrive/XTTS_Maltese_Data


In [3]:
# @title Dataset repartition
import os
import io
import tempfile
import soundfile as sf
from datasets import load_dataset
from datasets import load_dataset, Audio

# Don't decode audio — just keep metadata (each example exposes the raw encoded bytes)
ds = load_dataset("Bluefir/MASRI_HEADSET_v2")
ds = ds.cast_column("audio", Audio(decode=False))

# One entry per example, accumulated over both splits.
text_lengths = []
audio_durations = []

for split in ["train", "test"]:
    print(f"Processing split: {split}")
    for example in ds[split]:
        # Text length (in characters)
        text_lengths.append(len(example["normalized_text"]))

        # Read the duration straight from the in-memory bytes: soundfile
        # accepts file-like objects, so there is no need to write every clip
        # to a temporary file on disk first.
        audio_bytes = example["audio"]["bytes"]
        with sf.SoundFile(io.BytesIO(audio_bytes)) as f:
            duration = len(f) / f.samplerate  # frames / sample rate = seconds
            audio_durations.append(duration)

print(f"Text length range: {min(text_lengths)} - {max(text_lengths)} characters")
print(f"Audio duration range: {min(audio_durations):.2f} - {max(audio_durations):.2f} seconds")
print(f"Average text length: {sum(text_lengths)/len(text_lengths):.2f} characters")
print(f"Average audio duration: {sum(audio_durations)/len(audio_durations):.2f} seconds")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00002.parquet:   0%|          | 0.00/310M [00:00<?, ?B/s]

data/train-00001-of-00002.parquet:   0%|          | 0.00/303M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/155M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3983 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/996 [00:00<?, ? examples/s]

Processing split: train
Processing split: test
Text length range: 1 - 188 characters
Audio duration range: 0.62 - 10.89 seconds
Average text length: 65.99 characters
Average audio duration: 4.82 seconds


## Finetuning

In [3]:
import sys
# Streams captured when this cell runs; output_redirect(False) restores them.
_original_stdout = sys.stdout
_original_stderr = sys.stderr
# Currently open log file, or None when no redirection is active.
_log_file = None


class _Tee(object):
  """Minimal write-only stream that duplicates everything to several streams."""

  def __init__(self, *streams):
    self.streams = streams

  def write(self, data):
    for s in self.streams:
      s.write(data)
      s.flush()  # keep the log current even if the session dies mid-run
    return len(data)

  def flush(self):
    for s in self.streams:
      s.flush()

  def __getattr__(self, name):
    # Only reached for attributes not defined above (isatty, encoding, ...):
    # delegate to the primary stream so callers probing sys.stdout keep working.
    if name == "streams":
      raise AttributeError(name)
    return getattr(self.streams[0], name)


def output_redirect(redirect=True, log_path="/content/drive/MyDrive/XTTS_Maltese_Training/output/full_training.log"):
  """Mirror stdout/stderr into a log file, or undo that mirroring.

  Args:
    redirect: True to start teeing sys.stdout/sys.stderr into `log_path`
      (the file is truncated first); False to restore the original streams
      and close the log file.
    log_path: Destination log file. Its parent directory is created when
      missing. Defaults to the training log on Google Drive.
  """
  global _log_file
  if not redirect:
    sys.stdout = _original_stdout
    sys.stderr = _original_stderr
    if _log_file:
      _log_file.close()
      _log_file = None
    return

  import os
  # A second redirect without a restore in between must not leak the old handle.
  if _log_file:
    _log_file.close()
    _log_file = None

  # The log folder may not exist yet (e.g. first run on a fresh Drive).
  log_dir = os.path.dirname(log_path)
  if log_dir:
    os.makedirs(log_dir, exist_ok=True)

  # "w" truncates any previous log; buffering=1 makes the file line-buffered.
  _log_file = open(log_path, "w", buffering=1, encoding="utf-8")

  sys.stdout = _Tee(_original_stdout, _log_file)
  sys.stderr = _Tee(_original_stderr, _log_file)

In [5]:
output_redirect(True)


os.makedirs(output_path, exist_ok=True)
%cd /content/Malta-TTS/FineTuning/NewLanguage

print(f"Finetuning for {language_code}")
!python new_language_training_cli.py \
    --is_download \
    --is_tokenizer_extension \
    --output_path "{output_path}" \
    --metadatas "{metadata_train_path},{metadata_eval_path},{language_code}" \
    --num_epochs 1 \
    --batch_size 1 \
    --grad_acumm 48 \
    --max_audio_length 176400 \
    --max_text_length 200 \
    --weight_decay 1e-2 \
    --lr 5e-6 \
    --save_step 5000 \
    --custom_model=custom_model_name \
    --version=main \
    --metadata_path "{metadata_train_path}" \
    --language "{language_code}" \
    --extended_vocab_size {extended_vocab_size_param}

# 35min/epoch on one T4 with batch_size=1, grad_acumm=48, audio_length=176400, max_text=200, weight=1e-2, save_step=5000

# Default values are:
# batch-size: 3
# grad_acc: 84
# max_audio: 255995 = 11.6s
# save_step: 10_000
# epoch: 10 => 100
# --multi-gpu

print("Finetuning process completed!")
output_redirect(False)

/content/Malta-TTS/FineTuning/NewLanguage
Finetuning for mt
Step 1: Downloading XTTS base model files.
 > Downloading DVAE files!
  0% 0.00/1.07k [00:00<?, ?iB/s]
100% 1.07k/1.07k [00:00<00:00, 5.80kiB/s]

  1% 1.67M/211M [00:00<00:12, 16.7MiB/s][A
  4% 8.46M/211M [00:00<00:04, 46.8MiB/s][A
  8% 16.7M/211M [00:00<00:03, 63.2MiB/s][A
 12% 25.2M/211M [00:00<00:02, 71.8MiB/s][A
 16% 33.8M/211M [00:00<00:02, 76.8MiB/s][A
 20% 42.4M/211M [00:00<00:02, 79.9MiB/s][A
 24% 50.4M/211M [00:00<00:02, 76.1MiB/s][A
 28% 58.8M/211M [00:00<00:01, 78.3MiB/s][A
 32% 67.2M/211M [00:00<00:01, 80.0MiB/s][A
 36% 75.5M/211M [00:01<00:01, 81.0MiB/s][A
 40% 83.9M/211M [00:01<00:01, 81.7MiB/s][A
 44% 92.3M/211M [00:01<00:01, 82.6MiB/s][A
 48% 101M/211M [00:01<00:01, 82.2MiB/s] [A
 52% 109M/211M [00:01<00:01, 77.3MiB/s][A
 55% 117M/211M [00:01<00:01, 72.5MiB/s][A
 59% 125M/211M [00:01<00:01, 75.4MiB/s][A
 63% 133M/211M [00:01<00:01, 74.0MiB/s][A
 67% 140M/211M [00:01<00:00, 75.4MiB/s][A
 71% 14

## Inference

In [9]:
import os
xtts_checkpoint = os.path.join(output_path, "training", "GPT_XTTS_FT-August-15-2025_10+43AM-ab8c660", "best_model.pth")
xtts_config = os.path.join(output_path, "config.json")
xtts_vocab = os.path.join(output_path, "vocab.json")

tts_text = "Il-kelma Maltija 'bonġu' tfisser 'good morning'."
speaker_audio_file = "/content/drive/MyDrive/XTTS_Maltese_Data/wavs/MSRHS_F_01_P02U004_0058.wav"
lang = "mt"
output_file = "output_maltese.wav"

# xtts_checkpoint = "/content/drive/MyDrive/XTTS_Maltese_Training/output/model.pth"
tts_text = "Hi, how are you?"
speaker_audio_file = "/content/drive/MyDrive/english_speaker.mp3"
lang="en"
output_file = "output_english.wav"

%cd /content/Malta-TTS/FineTuning/NewLanguage
!python inference.py \
    --xtts_checkpoint="{xtts_checkpoint}" \
    --xtts_config="{xtts_config}" \
    --xtts_vocab="{xtts_vocab}" \
    --tts_text="{tts_text}" \
    --speaker_audio_file="{speaker_audio_file}" \
    --lang="{lang}" \
    --output_file="{output_file}" \
    --temperature 0.7 \
    --length_penalty 1.0 \
    --repetition_penalty 10.0 \
    --top_k 50 \
    --top_p 0.8

from IPython.display import Audio
Audio(output_file, rate=24000)

/content/Malta-TTS/FineTuning/NewLanguage
Starting inference...
[2025-08-15 11:46:54,971] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-08-15 11:46:57,109] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
Loading model...
Config loaded.
Initializing model...
Loading checkpoint...
  return torch.load(f, map_location=map_location, **kwargs)
Added char_limits for 'mt' language.
Model loaded successfully!
Computing speaker latents...
Speaker latents computed successfully!
Running inference...
Processing sentences:   0% 0/1 [00:00<?, ?it/s]Audio time for sentence 1: 1.00 seconds
Processing sentences: 100% 1/1 [00:04<00:00,  4.29s/it]
Inference successful!
  return torch.tensor(wav_chunks[0]).unsqueeze(0)
Inference completed!
Audio saved to output_english.wav


#### ChatGPT


Great question — and Maltese is a perfect example of what “portable TTS” really needs: robustness with tiny models and smart data tricks. Here’s a practical, end-to-end blueprint you can follow, with options from “works on a Raspberry Pi/phone” to “bigger but still deployable”.

1) Choose a portable-friendly architecture

Pick models that are fast, non-autoregressive (or nearly so), and quantization-friendly.

Good options
	•	Piper (Glow-TTS/HiFi-GAN–style, C++ runtime): tiny, fast, proven on ARM; easy to train your own voices.
	•	VITS / FastPitch + HiFi-GAN (light): great quality; prune + quantize for edge.
	•	Distilled WaveRNN vocoder (if you need ultra-low CPU without GPU).
	•	On-device runtimes: ONNX Runtime (mobile/ARM), Core ML (iOS), TFLite (Android), or a pure C/C++ runtime (Piper).

Avoid for edge (unless you’ll distill later): large autoregressive GPT-style TTS blocks or heavyweight diffusion vocoders.

2) Data for Maltese (+ cross-lingual leverage)

Under-resourced means you win with smart transfer.
	•	Primary Maltese speech: any studio/close-talk set you have (even a few hours can work for single-speaker). If you have crowd speech, clean it hard.
	•	Augment with multilingual: mix in Italian/Arabic/English to help prosody and loanwords. Share a phoneme inventory across languages so Maltese borrows benefit.
	•	Text: scrape/news/Parl Maltese, normalize (numbers, dates, abbreviations), and de-duplicate ruthlessly.

Augmentations
	•	Speed perturb (0.9×/1.0×/1.1×), room IRs, light noise, pitch shift ≤ ±100 cents (sparingly), SpecAugment on spectrograms for the acoustic model.
	•	Add code-switch examples (Maltese + English/Italian) — realistic for Malta and improves robustness.

3) Front-end: text normalization + G2P that actually works for Maltese

Maltese orthography is close to phonemic, but you still need rules:
	•	Build a rule-based G2P first (grapheme→phoneme mappings + stress heuristics).
	•	Maintain a lexicon for exceptions, names, loanwords, and abbreviations.
	•	Use a unified phoneme set (IPA or X-SAMPA). Make sure the vocoder/acoustic model uses the same symbol IDs across languages.
	•	Fallback path: if a token is OOV and ambiguous, back off to graphemes (models learn this surprisingly well) or byte-level pieces.

4) Training recipes (concrete)

A. Small single-speaker Maltese voice (fastest path)
	•	Model: FastPitch (acoustic) + HiFi-GAN (vocoder, V1 light).
	•	Hours: 2–10 h clean, single speaker.
	•	Steps
	1.	Train HiFi-GAN on multilingual data first (transferable), then fine-tune on your Maltese speaker (50k–200k steps).
	2.	Train FastPitch on phonemes; use duration/pitch predictors; batch size small (e.g., 16–32 on a single GPU).
	3.	Early stop by MOS proxy: ASR-CER on TTS–>ASR, and an external prosody score.

B. Multispeaker Maltese (+ neighbors) for robustness
	•	Model: VITS (multispeaker) with speaker embeddings; or Piper multispeaker recipe.
	•	Data: combine Maltese with Italian/Arabic/English; ensure at least ~30–60 min per speaker for stability.
	•	Loss tricks: feature matching (for HiFi-GAN), duration loss with stochastic duration predictor (VITS), mild speaker-mixup.

C. Distill/convert for portability
	•	Export acoustic to ONNX (opset 17+), export vocoder to ONNX or keep a C++ HiFi-GAN.
	•	Quantize:
	•	Dynamic INT8 for linears/conv1d (ONNX Runtime).
	•	For mobile GPUs/NNAPI/Core ML: try 16-bit float or 8-bit weight-only.
	•	Aim: < 100 MB total (acoustic ≤ 30–60 MB, vocoder ≤ 40 MB).

5) Making it truly portable (CPU-only, ARM)

Targets
	•	Raspberry Pi 4/5: ~1–2× real-time for 22 kHz with quantized FastPitch + light HiFi-GAN.
	•	Android mid-range: real-time using NNAPI / GPU delegate or plain CPU with INT8.
	•	iOS: Core ML conversion (float16) typically real-time.

Implementation tips
	•	Streaming synthesis: chunk text (clauses), synth acoustic chunks, stream vocoder frames as they’re ready.
	•	Use smaller hop size (256) at 22.05 kHz to balance latency/naturalness.
	•	Pre-warm models on app start (first inference JIT costs).
	•	Cache phonemized text and punctuation normalization.

6) Maltese specifics that help quality
	•	Handle Maltese letters (ċ, ġ, għ, ħ, ż) carefully in normalization.
	•	Prosody: Maltese stress tends toward the penultimate syllable — add a heuristic to your G2P (and learn residual stress via FastPitch).
	•	Loanwords: keep dual lexicon entries (native vs. borrowed pronunciation); pick by context (neighbor tokens, language ID tags).
	•	Code-switch tags: mark spans like <lang=en>, <lang=it> to signal different phonotactics without swapping models.

7) Evaluation & QA (cheap but effective)
	•	Objective: ASR-CER of TTS audio using a strong multilingual ASR (Whisper small/distilled) — lower is better.
	•	Prosody: F0 variance vs. references; pause placement around punctuation.
	•	Subjective: 5–10 native Maltese listeners; MOS + ABX against ground truth; include code-switch sentences and numerics/dates.
	•	Robustness set: typos, all-caps, abbreviations, URLs, telephone numbers, currency, times.

8) A minimal training & export sketch (framework-agnostic)


##### Steps
```bash
# 1) Train vocoder (HiFi-GAN light) on multilingual + fine-tune on Maltese
python train_hifigan.py --config hifigan_light.yaml --data data_multilingual
python finetune_hifigan.py --checkpoint ckpt_hifigan.pth --data data_mt_speaker

# 2) Train FastPitch on phonemes
python train_fastpitch.py --config fastpitch_mt.yaml --train-metadata metadata_mt_phoneme.csv

# 3) Export to ONNX
python export_fastpitch_onnx.py --checkpoint fastpitch_mt.pth --out fastpitch_mt.onnx
python export_hifigan_onnx.py --checkpoint hifigan_mt.pth --out hifigan_mt.onnx

# 4) Quantize (dynamic int8)
python onnx_quantize.py --in fastpitch_mt.onnx --out fastpitch_mt_int8.onnx --mode dynamic
python onnx_quantize.py --in hifigan_mt.onnx --out hifigan_mt_int8.onnx --mode dynamic
```

9) Packaging for apps
	•	Android: bundle ONNX models, run with ONNX Runtime (NNAPI/GPU EP if available). Kotlin wrapper; stream PCM to AudioTrack.
	•	iOS: convert to Core ML (coremltools), or use ONNX Runtime Mobile; stream to AVAudioEngine.
	•	Desktop/IoT: Piper or your own C++ runner; one static binary + two model files.

10) If you already have a big multilingual TTS (e.g., XTTS)
	•	Use it to bootstrap: generate high-quality Maltese pseudo-data (careful: avoid style collapse), then distill into a small FastPitch/VITS student on phonemes.
	•	Keep the big model server-side for rare names or tricky code-switches; default to on-device small model; fall back to server only when the small model flags low confidence.

⸻

If you want, I can tailor this into a step-by-step Piper or VITS config for Maltese, including a ready-made phoneme map and a starter lexicon (with stress marks and typical loanword variants).

https://www.reddit.com/r/mlscaling/comments/1gxakk3/did_a_quick_comparison_of_various_tts_models/?utm_source=chatgpt.com