# XTTS Finetuning for Maltese

## Requirements
- Python 3.8 or higher
- PyTorch 1.7.0 or higher (the setup cell below installs a CUDA 12.1 build from the PyTorch wheel index)

In [None]:
# Environment setup for Google Colab: mount Drive, fetch the project, and
# install its dependencies.
#
# Mount Google Drive at /content/drive so later cells can read and write there.
from google.colab import drive
drive.mount('/content/drive')

# Clone the project and switch into its new-language fine-tuning directory;
# relative paths used by the commands below resolve from there.
!git clone https://github.com/Wubpooz/Malta-TTS.git
%cd Malta-TTS/FineTuning/NewLanguage

# Upgrade pip, install torch/torchaudio from the PyTorch "cu121" wheel index,
# then install the requirements file found in the current directory.
!pip install --upgrade pip
!pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install -r requirements.txt
# Download the NLTK 'punkt' data and the spaCy English model en_core_web_sm.
!python -c "import nltk; nltk.download('punkt')"
!python -m spacy download en_core_web_sm

## Data Preparation

In [None]:
from datasets import load_dataset
import os
import pandas as pd
import shutil

# Fetch the Bluefir/MASRI_HEADSET_v2 dataset from the Hugging Face Hub.
print("Loading dataset from Hugging Face...")
ds = load_dataset("Bluefir/MASRI_HEADSET_v2")

# Everything this cell produces lives under one Drive folder; the audio clips
# go into its "wavs" subfolder. Both folders are created if missing.
output_dir = "/content/drive/MyDrive/XTTS_Maltese_Data"
wavs_dir = os.path.join(output_dir, "wavs")
for folder in (output_dir, wavs_dir):
    os.makedirs(folder, exist_ok=True)

print("Preparing and saving dataset files...")

def save_dataset_split(split, filename):
    """Copy one split's audio into ``wavs_dir`` and write its metadata file.

    Reads the notebook-level ``ds``, ``wavs_dir`` and ``output_dir``. For every
    example in ``ds[split]`` the audio file is copied into ``wavs_dir`` and a
    row ``audio_file|text|speaker_name`` is recorded, where ``audio_file`` is
    the path ``wavs/<basename>`` relative to ``output_dir``. The rows are
    written pipe-separated, without a header line and without an index column.

    Args:
        split: Key of the split to export from ``ds`` (e.g. ``"train"``).
        filename: Name of the metadata file to create inside ``output_dir``.
    """
    rows = []
    for record in ds[split]:
        # Read every field up front so a missing key fails before any copy.
        # NOTE(review): assumes record['audio']['path'] points at a readable
        # local file -- confirm for this dataset.
        source_wav = record['audio']['path']
        transcript = record['normalized_text']
        speaker = record['speaker_id']

        shutil.copy(source_wav, wavs_dir)

        rows.append({
            'audio_file': os.path.join("wavs", os.path.basename(source_wav)),
            'text': transcript,
            'speaker_name': speaker,
        })

    frame = pd.DataFrame(rows)
    destination = os.path.join(output_dir, filename)
    frame.to_csv(destination, sep="|", index=False, header=False)
    print(f"Saved {len(frame)} files to {filename}")

# Export the training split first, then the "test" split as the evaluation set.
for split_name, csv_name in (
    ("train", "metadata_train.csv"),
    ("test", "metadata_eval.csv"),
):
    save_dataset_split(split_name, csv_name)

print("Dataset preparation complete.")

## Finetuning

In [None]:
# Define paths and parameters
metadata_train_path = "/content/drive/MyDrive/XTTS_Maltese_Data/metadata_train.csv"
metadata_eval_path = "/content/drive/MyDrive/XTTS_Maltese_Data/metadata_eval.csv"
output_path = "/content/drive/MyDrive/XTTS_Maltese_Training"
language_code = "mt"
extended_vocab_size_param = 100000

os.makedirs(output_path, exist_ok=True)

!python new_language_training_cli.py \
    --is_download True \
    --is_tokenizer_extension True \
    --output_path "{output_path}" \
    --metadatas "{metadata_train_path},{metadata_eval_path},{language_code}" \
    --num_epochs 100 \
    --batch_size 3 \
    --grad_acumm 84 \
    --max_audio_length 255995 \
    --max_text_length 200 \
    --weight_decay 1e-2 \
    --lr 5e-6 \
    --save_step 10000 \
    --custom_model=custom_model_name \
    --version=main \
    --multi_gpu \
    --metadata_path "{metadata_train_path}" \
    --language "{language_code}" \
    --extended_vocab_size {extended_vocab_size_param}

print("Finetuning process initiated.")