### Install required packages

##### Minimal installs for inference (no training libs, no datasets)
##### - transformers: SpeechT5 model + processor + HiFi-GAN vocoder
##### - speechbrain: to compute the speaker x-vector (speaker embedding)
##### - soundfile: to read/write WAV files



In [None]:
# soundfile: WAV read/write; speechbrain: speaker-embedding (x-vector) encoder.
!pip install soundfile speechbrain -q
# transformers: SpeechT5 model, processor and HiFi-GAN vocoder classes.
!pip install transformers -q
# NOTE(review): accelerate is not imported by any visible cell; presumably a
# runtime dependency of transformers here — confirm it is still required.
!pip install --upgrade accelerate -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/864.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m864.1/864.1 kB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/119.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.9/119.9 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/753.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m753.1/753.1 kB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
#!pip install huggingface-hub==1.0.1 -q

In [None]:
# Mount Google Drive so the /content/drive/... paths used by later cells
# (fine-tuned checkpoint, reference WAV) resolve.
# force_remount=True asks for a fresh mount even if Drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


### Imports

In [None]:
import os
import numpy as np
import soundfile as sf
import torch

from transformers import (
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
)

# SpeechBrain 1.0 moved the pretrained interfaces to `speechbrain.inference`;
# the legacy `speechbrain.pretrained` path still redirects there but emits a
# deprecation warning on import. Prefer the new location and fall back to the
# legacy one so older SpeechBrain installs keep working.
try:
    from speechbrain.inference.speaker import EncoderClassifier
except ImportError:
    from speechbrain.pretrained import EncoderClassifier

# Audio/display are only used to render inline audio players in the example
# cells. NOTE: despite its name, _IN_COLAB is True in any environment where
# IPython's display helpers import (Colab, Jupyter, ...), not only in Colab.
try:
    from IPython.display import Audio, display
    _IN_COLAB = True
except ImportError:
    # Only a missing IPython should disable playback; any other failure is a
    # genuine error and must surface instead of being silently swallowed.
    _IN_COLAB = False

# Prefer the GPU when one is visible to torch; otherwise run on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

  available_backends = torchaudio.list_audio_backends()
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover


Using device: cuda


  from speechbrain.pretrained import EncoderClassifier


### Helper Functions

In [None]:
def load_wav_mono_16k(path: str) -> np.ndarray:
    """
    Load an audio file as a mono float32 waveform sampled at 16 kHz.

    If the file has several channels, only the first one is kept. If the file's
    sample rate differs from 16000 Hz, the signal is resampled with plain linear
    interpolation (no anti-aliasing filter; chosen to avoid extra dependencies).

    Args:
        path: Path to an audio file readable by ``soundfile``.

    Returns:
        A 1-D ``np.float32`` array at 16 kHz. The array is empty when the file
        contains no samples (or too few to yield a single 16 kHz sample).
    """
    target_sr = 16000
    waveform, sr = sf.read(path, dtype="float32", always_2d=False)
    if waveform.ndim == 2:
        waveform = waveform[:, 0]  # take first channel
    if sr != target_sr:
        n_in = len(waveform)
        n_out = int(n_in * target_sr / sr)
        if n_in == 0 or n_out == 0:
            # np.interp raises on an empty source grid; nothing to resample.
            return np.zeros(0, dtype=np.float32)
        # Positions are expressed in source-sample units and kept in float64.
        # A float32 grid over [0, 1) runs out of precision once a clip reaches
        # a few hundred thousand samples, which jitters the sample positions.
        src_pos = np.arange(n_in, dtype=np.float64)
        dst_pos = np.arange(n_out, dtype=np.float64) * (sr / target_sr)
        # Positions past the last source sample are clamped by np.interp.
        waveform = np.interp(dst_pos, src_pos, waveform).astype(np.float32)
    return waveform

def create_speaker_embedding(waveform_16k: np.ndarray, speaker_model: EncoderClassifier) -> torch.Tensor:
    """
    Build an L2-normalized speaker embedding from a 16 kHz waveform.

    The waveform is given a leading batch axis, moved to DEVICE, passed through
    ``speaker_model.encode_batch`` and normalized along axis 2. The leading
    batch axis is then dropped, giving shape (1, embedding_dim) when the
    encoder returns (1, 1, embedding_dim).
    """
    batch = torch.tensor(waveform_16k, dtype=torch.float32)[None]  # add batch axis: (1, T)
    with torch.no_grad():
        raw_embedding = speaker_model.encode_batch(batch.to(DEVICE))  # (1, 1, D) typically
        unit_embedding = torch.nn.functional.normalize(raw_embedding, dim=2)
    return unit_embedding.squeeze(0)  # (1, D)

def tts_infer(
    text: str,
    processor: SpeechT5Processor,
    model: SpeechT5ForTextToSpeech,
    vocoder: SpeechT5HifiGan,
    speaker_embeddings: torch.Tensor,
) -> torch.Tensor:
    """
    Full inference: text -> ids -> spectrogram -> vocoder -> waveform (16k).

    Args:
        text: Sentence to synthesize.
        processor: Processor used to turn ``text`` into ``input_ids``.
        model: SpeechT5 text-to-speech model that produces the spectrogram.
        vocoder: HiFi-GAN vocoder that turns the spectrogram into audio.
        speaker_embeddings: Speaker embedding passed to ``generate_speech``.
            It may live on any device; it is moved to the model's device here.

    Returns:
        A 1D torch.FloatTensor waveform on the CPU, at 16k sample rate.
    """
    inputs = processor(text=text, return_tensors="pt")
    input_ids = inputs["input_ids"].to(model.device)
    # The embedding must sit on the same device as the model; without this a
    # CPU embedding combined with a CUDA model fails inside generate_speech.
    speaker_embeddings = speaker_embeddings.to(model.device)

    with torch.no_grad():
        spectrogram = model.generate_speech(input_ids, speaker_embeddings)
        # Same reasoning for the vocoder, which may be placed independently.
        waveform = vocoder(spectrogram.to(vocoder.device))

    if waveform.dim() > 1:
        waveform = waveform.squeeze()
    return waveform.cpu().float()

def save_wav(path: str, waveform: torch.Tensor, sr: int = 16000, subtype: "str | None" = None) -> None:
    """
    Write a waveform tensor to an audio file and print the destination path.

    The samples are handed to ``soundfile`` as a float32 array, but the on-disk
    encoding is decided by ``subtype``. With the default (``None``) soundfile
    uses the container's default subtype, which for WAV is 16-bit PCM rather
    than float32; pass ``subtype="FLOAT"`` to store 32-bit float samples.

    Args:
        path: Destination file path. A missing parent directory is created.
        waveform: Audio samples; detached and copied to the CPU before writing.
        sr: Sample rate written to the file header.
        subtype: Optional soundfile subtype, forwarded to ``sf.write``.
    """
    parent_dir = os.path.dirname(path)
    if parent_dir:
        # Avoid failing on a not-yet-existing output folder.
        os.makedirs(parent_dir, exist_ok=True)
    wav_np = waveform.detach().cpu().numpy().astype(np.float32)
    sf.write(path, wav_np, sr, subtype=subtype)
    print(f"Saved: {path}")

### Load Pre-Trained Model + Prepare Speaker Embedding

In [None]:
MODEL_DIR = "/content/drive/MyDrive/Part_4/speecht5_finetuned_clasp/checkpoint-6318"  # <-- change if needed
REFERENCE_WAV = "/content/drive/MyDrive/Part_4/reference_tts.wav"  # <-- upload a 3–10s sample of the target voice

# Fail fast: check the reference recording before any model is downloaded or
# loaded, so a wrong path is reported immediately instead of minutes later.
if not os.path.exists(REFERENCE_WAV):
    raise FileNotFoundError(
        f"Reference WAV not found at {REFERENCE_WAV}. "
        "Upload a short 3–10s mono 16k WAV and update REFERENCE_WAV if needed."
    )

# The processor is loaded from the base Hub checkpoint, while the
# text-to-speech weights come from the fine-tuned checkpoint in MODEL_DIR.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained(MODEL_DIR).to(DEVICE).eval()

# HiFi-GAN vocoder that converts the generated spectrogram to a waveform.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(DEVICE).eval()

# SpeechBrain x-vector encoder used to derive the speaker embedding; its files
# are stored under savedir.
speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    run_opts={"device": DEVICE},
    savedir="/tmp/sb_xvect_voxceleb",
)

# One embedding is computed from the reference clip and reused by every example.
ref_wav = load_wav_mono_16k(REFERENCE_WAV)
speaker_embeddings = create_speaker_embedding(ref_wav, speaker_model)  # shape: (1, D)
print("Loaded model, vocoder, processor, and computed speaker embedding.")

Contents of MODEL_DIR (/content/drive/MyDrive/Part_4/speecht5_finetuned_clasp/checkpoint-6318):
config.json		rng_state.pth	    training_args.bin
generation_config.json	scheduler.pt
model.safetensors	trainer_state.json


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/232 [00:00<?, ?B/s]

spm_char.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/50.7M [00:00<?, ?B/s]

INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached


hyperparams.yaml: 0.00B [00:00, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/56895a2df401be4150a159f3a1c653f00051d477/hyperparams.yaml' -> '/tmp/sb_xvect_voxceleb/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _load
DEBUG:speechbrain.utils.checkpoints:Registered parameter transfer hook for _load
  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load_if_possible
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in /tmp/sb_xvect_voxceleb.
INFO:speechbrain.

embedding_model.ckpt:   0%|          | 0.00/16.9M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/50.6M [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/56895a2df401be4150a159f3a1c653f00051d477/embedding_model.ckpt' -> '/tmp/sb_xvect_voxceleb/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /tmp/sb_xvect_voxceleb/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached


mean_var_norm_emb.ckpt:   0%|          | 0.00/3.20k [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/56895a2df401be4150a159f3a1c653f00051d477/mean_var_norm_emb.ckpt' -> '/tmp/sb_xvect_voxceleb/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /tmp/sb_xvect_voxceleb/mean_var_norm_emb.ckpt
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached


classifier.ckpt:   0%|          | 0.00/15.9M [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/56895a2df401be4150a159f3a1c653f00051d477/classifier.ckpt' -> '/tmp/sb_xvect_voxceleb/classifier.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["classifier"] = /tmp/sb_xvect_voxceleb/classifier.ckpt
INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached


label_encoder.txt: 0.00B [00:00, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/56895a2df401be4150a159f3a1c653f00051d477/label_encoder.txt' -> '/tmp/sb_xvect_voxceleb/label_encoder.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["label_encoder"] = /tmp/sb_xvect_voxceleb/label_encoder.ckpt
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: embedding_model, mean_var_norm_emb, classifier, label_encoder
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): embedding_model -> /tmp/sb_xvect_voxceleb/embedding_model.ckpt
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): mean_var_norm_emb -> /tmp/sb_xvect_voxceleb/mean_var_norm_emb.ckpt
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): classifier -> /tmp/sb_xvect_voxceleb/classifier.ckpt
DEBUG:speechbrain.utils.parameter_trans

Loaded model, vocoder, processor, and computed speaker embedding.


# Inference Examples

### Example 1 (Model gives sub-optimal outputs)

In [None]:
# Synthesize input_text_1 with the reference speaker embedding, save the result
# as a 16 kHz WAV under /content, and play it inline when the IPython display
# helpers were importable (_IN_COLAB).
input_text_1 = "Kunt u wat langzamer praten, alstublieft?"
wave_1 = tts_infer(input_text_1, processor, model, vocoder, speaker_embeddings)
save_wav("/content/out_example_1.wav", wave_1, sr=16000)
if _IN_COLAB:
    display(Audio(wave_1.numpy(), rate=16000))

Saved: /content/out_example_1.wav


### Example 2 (Model gives sub-optimal outputs)

In [None]:
# Two-sentence input. A space is required after the first sentence's period;
# without it the model receives "Nederlands.Praat" as one run of characters.
input_text_2 = "Hallo, mijn naam is SpeechT5 en ik spreek nu Nederlands. Praat rustig met je vader."
wave_2 = tts_infer(input_text_2, processor, model, vocoder, speaker_embeddings)
save_wav("/content/out_example_2.wav", wave_2, sr=16000)
# Inline playback only when the IPython display helpers were importable.
if _IN_COLAB:
    display(Audio(wave_2.numpy(), rate=16000))

Saved: /content/out_example_2.wav


### Example 3 (Model performs very poorly)

In [None]:
# Synthesize input_text_3 with the reference speaker embedding, save the result
# as a 16 kHz WAV under /content, and play it inline when the IPython display
# helpers were importable (_IN_COLAB).
input_text_3 = "De jongen heeft urenlang de deur geverfd."
wave_3 = tts_infer(input_text_3, processor, model, vocoder, speaker_embeddings)
save_wav("/content/out_example_3.wav", wave_3, sr=16000)
if _IN_COLAB:
    display(Audio(wave_3.numpy(), rate=16000))

Saved: /content/out_example_3.wav


### Example 4 (Model performs very poorly)

In [None]:
# Synthesize input_text_4 with the reference speaker embedding, save the result
# as a 16 kHz WAV under /content, and play it inline when the IPython display
# helpers were importable (_IN_COLAB).
input_text_4 = "Wij smachten naar achtentachtig prachtige nachten bij achtentachtig prachtige grachten."
wave_4 = tts_infer(input_text_4, processor, model, vocoder, speaker_embeddings)
save_wav("/content/out_example_4.wav", wave_4, sr=16000)
if _IN_COLAB:
    display(Audio(wave_4.numpy(), rate=16000))

Saved: /content/out_example_4.wav
