# XTTS Finetuning for Maltese

## Requirements
- Python 3.10

The setup cell below takes around 6 minutes to run; restart the session once it has finished.

In [1]:
# Mount Google Drive (later cells read and write under /content/drive/MyDrive)
from google.colab import drive
drive.mount('/content/drive')

# Clone the repository (any previous checkout is removed first so the clone starts clean)
%rm -rf Malta-TTS
!git clone https://github.com/Wubpooz/Malta-TTS.git
%cd Malta-TTS/FineTuning/NewLanguage

# Upgrade pip, then install the dependencies listed in the repository's requirements.txt
# NOTE(review): the recorded output shows the TTS and trainer requirements being ignored
# because of a `python_version < "3.11"` marker — confirm TTS gets installed on this runtime.
!pip install --upgrade pip
!pip install -r requirements.txt

# Install spaCy/Stanza and download the Stanza models for Maltese ('mt')
!pip install spacy stanza spacy-stanza
!python -c "import stanza; stanza.download('mt')"
# !python -m spacy download en_core_web_sm

# For Python >= 3.11 TTS installation:
# !pip install coqui-tts # https://github.com/idiap/coqui-ai-TTS
# or clone the git repository and install the dependencies directly:
# !git clone https://github.com/coqui-ai/TTS
# !pip install -e .[all]

Mounted at /content/drive
Cloning into 'Malta-TTS'...
remote: Enumerating objects: 809, done.[K
remote: Counting objects: 100% (74/74), done.[K
remote: Compressing objects: 100% (50/50), done.[K
remote: Total 809 (delta 40), reused 55 (delta 24), pack-reused 735 (from 1)[K
Receiving objects: 100% (809/809), 7.13 MiB | 17.91 MiB/s, done.
Resolving deltas: 100% (479/479), done.
/content/Malta-TTS/FineTuning/NewLanguage
Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.2
Ignoring TTS: markers 'python_version < "3.11"' don't match your environment
Ignoring trainer: markers 'python_version < "

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 367kB [00:00, 72.1MB/s]        
2025-08-20 12:51:21 INFO: Downloading default packages for language: mt (Maltese) ...
Downloading https://huggingface.co/stanfordnlp/stanza-mt/resolve/v1.6.0/models/default.zip: 100% 142M/142M [00:01<00:00, 78.5MB/s]
2025-08-20 12:51:24 INFO: Finished downloading models and saved to /root/stanza_resources.


## Parameters

In [None]:
# Language code passed to the training CLI (--language and the third field of --metadatas)
language_code = "mt"
# Pipe-separated metadata files produced by the "Data Preparation" section
# NOTE(review): the folder is named "20KHz" but the data preparation cell resamples to 22050 Hz — confirm the name is only a label.
metadata_train_path = "/content/drive/MyDrive/XTTS_Maltese_Data_20KHz/metadata_train.csv"
metadata_eval_path = "/content/drive/MyDrive/XTTS_Maltese_Data_20KHz/metadata_eval.csv"
# Directory given to the training CLI as --output_path; the inference cell reads config.json and vocab.json from it
output_path = "/content/drive/MyDrive/XTTS_Maltese_Training/output"
# Forwarded to the training CLI as --extended_vocab_size
extended_vocab_size_param = 100000

# Environment variables set for this session before the training/inference commands are launched
%env TOKENIZERS_PARALLELISM=false
%env OMP_NUM_THREADS=1
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

## Data Preparation

In [None]:
# @title Save Metadata and Resample audio (takes 2:30min)
import os
import io
import pandas as pd
from tqdm.auto import tqdm
from datasets import load_dataset, Audio
from concurrent.futures import ThreadPoolExecutor
import librosa
import soundfile as sf

# os.environ['LD_LIBRARY_PATH'] += ":/usr/lib/x86_64-linux-gnu/"

# Sample rate (Hz) that save_and_resample converts every clip to before writing it
TARGET_SAMPLE_RATE = 22050  # XTTS requirement
# Number of worker threads used by process_split's ThreadPoolExecutor
NUM_WORKERS = 16

def save_and_resample(example, output_dir, resample=True, save_audio=True):
  """Write one dataset example to disk as a WAV file and build its metadata row.

  The encoded audio is decoded from memory, optionally resampled to
  TARGET_SAMPLE_RATE and written once to `<output_dir>/<base_name>.wav`.
  The undecoded bytes are no longer written afterwards: when the dataset
  filename already ends in `.wav` that second write targeted the same file
  and replaced the resampled audio with the original.

  Args:
    example: Dataset row. Reads example['audio']['path'],
      example['audio']['bytes'], example['normalized_text'] and
      example['speaker_id'].
    output_dir: Directory that receives the WAV file.
    resample: When True, audio whose sample rate differs from
      TARGET_SAMPLE_RATE is resampled before being written.
    save_audio: When False no file is written; only the metadata row is built.

  Returns:
    dict with the keys 'audio_file', 'text', 'normalized_text' and
    'speaker_name'.
  """
  audio_filename = example['audio']['path']
  audio_bytes = example['audio']['bytes']
  text = example['normalized_text']
  speaker_id = example['speaker_id']

  base_name = os.path.splitext(os.path.basename(audio_filename))[0]

  if save_audio:
    out_path = os.path.join(output_dir, f"{base_name}.wav")

    # Decode the Hugging Face bytes in memory
    with io.BytesIO(audio_bytes) as f:
      y, sr = sf.read(f)

    # Resample if needed. soundfile returns (frames,) or (frames, channels),
    # so time is always axis 0 (librosa's default would be the last axis).
    if resample and sr != TARGET_SAMPLE_RATE:
      y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SAMPLE_RATE, axis=0)
      sr = TARGET_SAMPLE_RATE

    sf.write(out_path, y, sr)

  # Use LJSpeech format (extended)
  # /!\ audio_file shouldn't have an extension, else loading fails. It should also be a bare
  # filename: the loader adds wavs/ before and .wav after.
  return {
    'audio_file': base_name,
    'text': text,
    'normalized_text': text,
    'speaker_name': speaker_id
  }

def process_split(split_name, csv_filename, output_wavs_dir, ds, resample=True, save_audio=True, metadata_dir=None):
  """Export one dataset split: write its audio files and a pipe-separated metadata CSV.

  Every example of `ds[split_name]` is handed to save_and_resample on a pool
  of NUM_WORKERS threads; the returned rows are collected in submission order.

  Args:
    split_name: Key of the split inside `ds` (e.g. "train").
    csv_filename: File name of the metadata CSV to write.
    output_wavs_dir: Directory passed to save_and_resample for the audio files.
    ds: Mapping from split name to an iterable of examples.
    resample: Forwarded to save_and_resample.
    save_audio: Forwarded to save_and_resample.
    metadata_dir: Directory that receives the CSV. Defaults to the parent of
      `output_wavs_dir`, so the function no longer relies on a global
      `output_dir` being defined by the calling cell.
  """
  if metadata_dir is None:
    # normpath drops a trailing separator so dirname really yields the parent
    metadata_dir = os.path.dirname(os.path.normpath(output_wavs_dir))

  print(f"Processing split: {split_name}")

  with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
    futures = [executor.submit(save_and_resample, ex, output_wavs_dir, resample, save_audio) for ex in ds[split_name]]
    # .result() re-raises any worker exception, so a failed example stops the export
    results = [future.result() for future in tqdm(futures)]

  # Save metadata
  df = pd.DataFrame(results)
  df.to_csv(os.path.join(metadata_dir, csv_filename), sep="|", index=False)
  print(f"Saved {len(df)} entries to {csv_filename}")



# Destination on Google Drive: metadata CSVs at the root, audio files under wavs/
output_dir = "/content/drive/MyDrive/XTTS_Maltese_Data_20KHz"
wavs_dir = os.path.join(output_dir, "wavs")
os.makedirs(wavs_dir, exist_ok=True)

print("Loading dataset from Hugging Face...")
# decode=False keeps the audio column as raw bytes + path; decoding happens in save_and_resample
ds = load_dataset("Bluefir/MASRI_HEADSET_v2").cast_column("audio", Audio(decode=False))

print(f"Resampling to {TARGET_SAMPLE_RATE} and saving...")
# The dataset's "test" split is exported as the evaluation metadata
for split_name, csv_filename in (("train", "metadata_train.csv"), ("test", "metadata_eval.csv")):
  process_split(split_name, csv_filename, wavs_dir, ds, resample=True, save_audio=True)

print("Dataset saved!")

Loading dataset from Hugging Face...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Resampling to 22050 and saving...
Processing split: train


  0%|          | 0/3983 [00:00<?, ?it/s]

Saved 3983 entries to metadata_train.csv
Processing split: test


  0%|          | 0/996 [00:00<?, ?it/s]

Saved 996 entries to metadata_eval.csv
Dataset saved!


In [None]:
# @title Dataset repartition
import os
import io
import tempfile
import soundfile as sf
from datasets import load_dataset
from datasets import load_dataset, Audio

# Don't decode audio — keep the raw bytes so only the file header has to be inspected
ds = load_dataset("Bluefir/MASRI_HEADSET_v2")
ds = ds.cast_column("audio", Audio(decode=False))

# One entry per example, accumulated over both splits
text_lengths = []
audio_durations = []

for split in ["train", "test"]:
    print(f"Processing split: {split}")
    for example in ds[split]:
        # Text length in characters
        text_lengths.append(len(example["normalized_text"]))

        # Read the duration straight from the in-memory bytes (same approach as
        # save_and_resample) instead of writing a temporary file per example
        with sf.SoundFile(io.BytesIO(example["audio"]["bytes"])) as audio_file:
            audio_durations.append(audio_file.frames / audio_file.samplerate)

print(f"Text length range: {min(text_lengths)} - {max(text_lengths)} characters")
print(f"Audio duration range: {min(audio_durations):.2f} - {max(audio_durations):.2f} seconds")
print(f"Average text length: {sum(text_lengths)/len(text_lengths):.2f} characters")
print(f"Average audio duration: {sum(audio_durations)/len(audio_durations):.2f} seconds")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00002.parquet:   0%|          | 0.00/310M [00:00<?, ?B/s]

data/train-00001-of-00002.parquet:   0%|          | 0.00/303M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/155M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3983 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/996 [00:00<?, ? examples/s]

Processing split: train
Processing split: test
Text length range: 1 - 188 characters
Audio duration range: 0.62 - 10.89 seconds
Average text length: 65.99 characters
Average audio duration: 4.82 seconds


## Finetuning

In [2]:
# @title Output Redirection

import os
import sys

# Streams in place when this cell ran; output_redirect(False) restores them
_original_stdout = sys.stdout
_original_stderr = sys.stderr
# Currently open log file, or None while redirection is off
_log_file = None

# Log location used when output_redirect is called without an explicit log_path
_DEFAULT_LOG_PATH = "/content/drive/MyDrive/XTTS_Maltese_Training/output/full_training.log"


class _Tee(object):
  """Minimal write-only stream that duplicates everything to several streams."""

  def __init__(self, *streams):
    self.streams = streams

  def write(self, data):
    # Flush after every write so the log stays current during long runs
    for s in self.streams:
      s.write(data)
      s.flush()

  def flush(self):
    for s in self.streams:
      s.flush()

  def __getattr__(self, name):
    # Only reached for attributes not defined above (isatty, encoding, ...):
    # answer on behalf of the first (original) stream.
    if name == "streams":
      raise AttributeError(name)
    return getattr(self.streams[0], name)


def output_redirect(redirect=True, log_path=None):
  """Turn mirroring of sys.stdout / sys.stderr into a log file on or off.

  Args:
    redirect: True to start mirroring into a freshly truncated log file,
      False to restore the original streams and close the log.
    log_path: Log file to write. Defaults to the training log on Google
      Drive. Missing parent directories are created.

  Calling with redirect=True while a redirection is already active closes the
  previous log file first, so no file handle is leaked.
  """
  global _log_file

  # Always return to a clean state first; this makes repeated calls safe
  sys.stdout = _original_stdout
  sys.stderr = _original_stderr
  if _log_file:
    _log_file.close()
    _log_file = None

  if not redirect:
    return

  if log_path is None:
    log_path = _DEFAULT_LOG_PATH

  log_dir = os.path.dirname(log_path)
  if log_dir:
    os.makedirs(log_dir, exist_ok=True)

  # "w" truncates any previous log; buffering=1 keeps the file line-buffered
  _log_file = open(log_path, "w", buffering=1, encoding="utf-8")

  sys.stdout = _Tee(_original_stdout, _log_file)
  sys.stderr = _Tee(_original_stderr, _log_file)

In [8]:
# @title Training
import os
output_redirect(True)

os.makedirs(output_path, exist_ok=True)
%cd /content/Malta-TTS/FineTuning/NewLanguage

print(f"Finetuning for {language_code}")
!python new_language_training_cli.py \
    --is_download \
    --is_tokenizer_extension \
    --output_path "{output_path}" \
    --metadatas "{metadata_train_path},{metadata_eval_path},{language_code}" \
    --num_epochs 1 \
    --batch_size 1 \
    --grad_acumm 48 \
    --max_audio_length 255995 \
    --max_text_length 200 \
    --weight_decay 1e-2 \
    --lr 5e-6 \
    --save_step 50_000 \
    --version=main \
    --metadata_path "{metadata_train_path}" \
    --language "{language_code}" \
    --extended_vocab_size {extended_vocab_size_param}

# 35min/epoch on one T4 with batch_size=1, grad_acumm=48, audio_length=255995, max_text=200, weight=1e-2

# Default values are:
# batch-size: 3
# grad_acc: 84
# --multi-gpu

print("Finetuning process completed!")
output_redirect(False)

/content/Malta-TTS/FineTuning/NewLanguage
Finetuning for mt
2025-08-20 13:08:30.148017: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755695310.178064    7747 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755695310.187974    7747 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755695310.223218    7747 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755695310.223248    7747 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755695310.2232

## Inference

In [10]:
import os
xtts_checkpoint = os.path.join(output_path, "training", "GPT_XTTS_FT-August-20-2025_01+09PM-7ae5e66/best_model_3984.pth")

xtts_config = os.path.join(output_path, "config.json")
xtts_vocab = os.path.join(output_path, "vocab.json")

tts_text = "Il-kelma Maltija 'bonġu' tfisser 'good morning'."
speaker_audio_file = "/content/drive/MyDrive/XTTS_Maltese_Data_20KHz/wavs/MSRHS_M_11_P24U082_0147.wav"
lang = "mt"
output_file = "/content/drive/MyDrive/XTTS_Maltese_Data/output_maltese.wav"

# tts_text = "Hi, how are you?"
# speaker_audio_file = "/content/drive/MyDrive/english_speaker.mp3"
# lang="en"
# output_file = "/content/drive/MyDrive/XTTS_Maltese_Data/output_english.wav"

%cd /content/Malta-TTS/FineTuning/NewLanguage
!python inference.py \
    --xtts_checkpoint="{xtts_checkpoint}" \
    --xtts_config="{xtts_config}" \
    --xtts_vocab="{xtts_vocab}" \
    --tts_text="{tts_text}" \
    --speaker_audio_file="{speaker_audio_file}" \
    --lang="{lang}" \
    --output_file="{output_file}" \
    --temperature 0.7 \
    --length_penalty 1.0 \
    --repetition_penalty 10.0 \
    --top_k 50 \
    --top_p 0.8

from IPython.display import Audio
Audio(output_file, rate=24000)

/content/Malta-TTS/FineTuning/NewLanguage
2025-08-20 13:49:13.856551: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755697753.893906   18258 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755697753.904143   18258 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755697753.939435   18258 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755697753.939468   18258 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755697753.939474   18258 computa