In [None]:
# Transformers installation
# Installs the latest released `transformers` and `datasets` packages.
# NOTE(review): versions are not pinned, so a rerun may resolve to different releases.
! pip install transformers datasets
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m58

# Fine tuning

In [None]:
# Extra packages for the fine-tuning section: `soundfile` (used below to write
# the WAV file), `speechbrain` (speaker-embedding model) and `accelerate`;
# `datasets` is listed again here.
!pip install datasets soundfile speechbrain accelerate


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting speechbrain
  Downloading speechbrain-0.5.14-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.0/519.0 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.19.0-py3-none-any.whl (219 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting hyperpyyaml (from speechbrain)
  Downloading HyperPyYAML-1.2.0-py3-none-any.whl (16 kB)
Collecting sentencepiece (from speechbrain)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
Collecting ruamel.yaml>=0.17.8 (from hyperpyyaml->speechbrain)
  Downloading ruamel.yaml-0.17.27-py3-none-any.whl (109 kB)
[2

In [None]:
from huggingface_hub import notebook_login

# Show the Hugging Face Hub login widget inside the notebook. The training
# arguments further down set push_to_hub=True, which uploads to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

##  Dataset loading

[VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) is a large-scale multilingual speech corpus consisting of 
data sourced from 2009-2020 European Parliament event recordings. It contains labelled audio-transcription data for 15 
European languages. In this guide, we are using the Dutch language subset.

In [None]:
from datasets import load_dataset, Audio

# Load the "train" split of the Dutch ("nl") configuration of VoxPopuli.
dataset = load_dataset("facebook/voxpopuli", "nl", split="train")
# Last expression of the cell: displays the number of examples.
len(dataset)

20968 examples should be sufficient for fine-tuning. SpeechT5 expects audio data to have a sampling rate of 16 kHz, so 
make sure the examples in the dataset meet this requirement:

In [None]:
# Re-declare the "audio" column as an Audio feature at 16 kHz, the sampling
# rate SpeechT5 expects.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

## Preprocess the data

Let's begin by defining the model checkpoint to use and loading the appropriate processor:

In [None]:
from transformers import SpeechT5Processor

# Checkpoint name shared by the processor here and the model loaded in the
# training section.
checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)

### Text cleanup for SpeechT5 tokenization 

Start by cleaning up the text data. We'll need the tokenizer part of the processor to process the text:

In [None]:
# Direct handle on the processor's tokenizer, used for the vocabulary check below.
tokenizer = processor.tokenizer

The dataset examples contain `raw_text` and `normalized_text` features. When deciding which feature to use as the text input, 
we have to consider that the SpeechT5 tokenizer doesn't have any tokens for numbers. In `normalized_text` the numbers are written 
out as text. Thus, it is a better fit, and we will use `normalized_text` as input text.

Because SpeechT5 was trained on the English language, it may not recognize certain characters in the Dutch dataset. If 
left as is, these characters will be converted to `<unk>` tokens. However, in Dutch, certain characters like `à` are 
used to stress syllables. In order to preserve the meaning of the text, we can replace this character with a regular `a`.

To identify unsupported tokens, extract all unique characters in the dataset using the `SpeechT5Tokenizer` which 
works with characters as tokens. To do this, write the `extract_all_chars` mapping function that concatenates 
the transcriptions from all examples into one string and converts it to a set of characters. 
Make sure to set `batched=True` and `batch_size=-1` in `dataset.map()` so that all transcriptions are available at once for 
the mapping function.

In [None]:
def extract_all_chars(batch):
    """Collect the distinct characters used across a batch of transcriptions.

    Args:
        batch: Mapping with a "normalized_text" entry holding a list of strings.

    Returns:
        A dict with two single-element lists: "vocab" wraps the list of unique
        characters (in arbitrary order) and "all_text" wraps the transcriptions
        joined by single spaces.
    """
    joined_text = " ".join(batch["normalized_text"])
    unique_chars = [*set(joined_text)]
    return {"vocab": [unique_chars], "all_text": [joined_text]}


# batched=True together with batch_size=-1 hands the whole dataset to
# extract_all_chars in a single call; the original columns are removed so only
# the "vocab" and "all_text" columns returned by the function remain.
vocabs = dataset.map(
    extract_all_chars,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
    remove_columns=dataset.column_names,
)

# Characters that occur in the transcriptions.
dataset_vocab = set(vocabs["vocab"][0])
# Token strings known to the tokenizer.
tokenizer_vocab = {k for k, _ in tokenizer.get_vocab().items()}

Map:   0%|          | 0/20968 [00:00<?, ? examples/s]

Now we have two sets of characters: one with the vocabulary from the dataset and one with the vocabulary from the tokenizer. 
To identify any unsupported characters in the dataset, we can take the difference between these two sets. The resulting 
set will contain the characters that are in the dataset but not in the tokenizer.

In [None]:
# Characters present in the dataset but absent from the tokenizer vocabulary.
dataset_vocab - tokenizer_vocab

{' ', 'à', 'ç', 'è', 'ë', 'í', 'ï', 'ö', 'ü'}

To handle the unsupported characters identified in the previous step, we define a function that maps these characters to 
valid tokens. Note that spaces are already replaced by `▁` in the tokenizer and don't need to be handled separately.

In [None]:
# (unsupported character, replacement) pairs applied to every transcription.
replacements = [
    ("à", "a"),
    ("ç", "c"),
    ("è", "e"),
    ("ë", "e"),
    ("í", "i"),
    ("ï", "i"),
    ("ö", "o"),
    ("ü", "u"),
]


def cleanup_text(inputs):
    """Replace characters the tokenizer lacks with plain-letter equivalents.

    Args:
        inputs: Example mapping with a "normalized_text" string.

    Returns:
        The same mapping, with "normalized_text" rewritten in place.
    """
    for unsupported, substitute in replacements:
        cleaned = inputs["normalized_text"].replace(unsupported, substitute)
        inputs["normalized_text"] = cleaned
    return inputs


# Apply the character replacements to every example.
dataset = dataset.map(cleanup_text)



Now that we have dealt with special characters in the text, it's time to shift focus to the audio data.

### Speakers

The VoxPopuli dataset includes speech from multiple speakers, but how many speakers are represented in the dataset? To 
determine this, we can count the number of unique speakers and the number of examples each speaker contributes to the dataset. 
With a total of 20,968 examples in the dataset, this information will give us a better understanding of the distribution of 
speakers and examples in the data.

In [None]:
from collections import defaultdict

# Number of examples contributed by each speaker id; ids that were never seen
# read as 0 because of defaultdict(int).
speaker_counts = defaultdict(int)

for speaker_id in dataset["speaker_id"]:
    speaker_counts[speaker_id] += 1

In [None]:
def select_speaker(speaker_id):
    """Tell whether a speaker has between 100 and 400 examples (inclusive).

    Reads the per-speaker totals from the notebook-level ``speaker_counts``.
    """
    n_examples = speaker_counts[speaker_id]
    return n_examples >= 100 and n_examples <= 400


# Keep only the examples whose speaker passes the check above.
dataset = dataset.filter(select_speaker, input_columns=["speaker_id"])



Let's check how many speakers remain:

In [None]:
# Number of distinct speakers left after filtering.
len(set(dataset["speaker_id"]))

42

Let's see how many examples are left:

In [None]:
# Number of examples left after filtering.
len(dataset)

9973

### Speaker embeddings

To enable the TTS model to differentiate between multiple speakers, we'll need to create a speaker embedding for each example. 
The speaker embedding is an additional input into the model that captures a particular speaker's voice characteristics.
To generate these speaker embeddings, we use the pre-trained [spkrec-xvect-voxceleb](https://huggingface.co/speechbrain/spkrec-xvect-voxceleb) 
model from SpeechBrain. 

Create a function `create_speaker_embedding()` that takes an input audio waveform and outputs a 512-element vector 
containing the corresponding speaker embedding.

In [None]:
import os
import torch
from speechbrain.pretrained import EncoderClassifier

# SpeechBrain x-vector speaker model used to compute the speaker embeddings.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

# Run the speaker model on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    # Local directory for the model files: /tmp/<spk_model_name>.
    savedir=os.path.join("/tmp", spk_model_name),
)


def create_speaker_embedding(waveform):
    """Compute a normalized speaker embedding for one audio waveform.

    Args:
        waveform: Audio samples accepted by ``torch.tensor``.

    Returns:
        A NumPy array with all singleton dimensions squeezed out, obtained by
        encoding the waveform with the notebook-level ``speaker_model`` and
        normalizing the result along dimension 2.
    """
    with torch.no_grad():
        xvector = speaker_model.encode_batch(torch.tensor(waveform))
        xvector = torch.nn.functional.normalize(xvector, dim=2)
    return xvector.squeeze().cpu().numpy()

### Processing the dataset

Finally, let's process the data into the format the model expects. Here we will create a `prepare_dataset` function that takes in a 
single example and uses the `SpeechT5Processor` object to tokenize the input text and load the target audio into a log-mel spectrogram. 
It should also add the speaker embeddings as an additional input.

In [None]:
def prepare_dataset(example):
    """Convert one raw example into model inputs.

    Args:
        example: Mapping with a "normalized_text" string and an "audio" entry
            that provides "array" and "sampling_rate".

    Returns:
        The processor output with the batch dimension removed from "labels"
        and an added "speaker_embeddings" entry.
    """
    audio_entry = example["audio"]
    waveform = audio_entry["array"]

    features = processor(
        text=example["normalized_text"],
        audio_target=waveform,
        sampling_rate=audio_entry["sampling_rate"],
        return_attention_mask=False,
    )

    # The processor returns labels with a leading batch axis; keep the single item.
    features["labels"] = features["labels"][0]

    # x-vector computed from the same waveform via SpeechBrain.
    features["speaker_embeddings"] = create_speaker_embedding(waveform)

    return features

Now we will verify that the processing is correct by looking at a single example:

In [None]:
# Run the preprocessing on the first example and list the keys it produced.
processed_example = prepare_dataset(dataset[0])
list(processed_example.keys())

['input_ids', 'labels', 'speaker_embeddings']

Speaker embeddings should be a 512-element vector:

In [None]:
# Shape of the speaker embedding computed for that example.
processed_example["speaker_embeddings"].shape

(512,)

In [None]:
# Preprocess every example and drop the original columns, so only the fields
# returned by prepare_dataset remain.
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)



In [None]:
def is_not_too_long(input_ids, max_input_length=200):
    """Tell whether a token sequence is short enough to keep.

    Args:
        input_ids: Sequence of token ids for one example.
        max_input_length: Exclusive upper bound on the number of tokens.
            Defaults to 200, the limit this notebook filters with.

    Returns:
        True when ``len(input_ids)`` is strictly below ``max_input_length``.
    """
    return len(input_ids) < max_input_length


# Drop the examples rejected by is_not_too_long, then display how many remain.
dataset = dataset.filter(is_not_too_long, input_columns=["input_ids"])
len(dataset)



8259

Next, we will create a basic train/test split:

In [None]:
# Hold out 10% of the examples for evaluation. The fixed seed keeps the split,
# and therefore the reported validation loss, the same across reruns.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

### Data collator

In order to combine multiple examples into a batch, we need to define a custom data collator. This collator will pad shorter sequences with padding 
tokens, ensuring that all examples have the same length. For the spectrogram labels, the padded portions are replaced with the special value `-100`. This special value 
instructs the model to ignore that part of the spectrogram when calculating the spectrogram loss.

In [None]:
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Union


@dataclass
class TTSDataCollatorWithPadding:
    """Collate preprocessed text-to-speech examples into one padded batch.

    Each example must provide "input_ids", "labels" (spectrogram frames) and
    "speaker_embeddings". Calling the collator returns the padded batch from
    ``processor.pad`` with padded label positions set to -100, without the
    "decoder_attention_mask" entry, and with a "speaker_embeddings" tensor.
    """

    # Object whose ``pad`` method batches the token ids and spectrogram targets.
    processor: Any
    # Decoder reduction factor. When left as None it is read from the
    # notebook-level ``model.config.reduction_factor`` at call time, so the
    # collator can still be constructed with only ``processor``.
    reduction_factor: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
        label_features = [{"input_values": feature["labels"]} for feature in features]
        speaker_features = [feature["speaker_embeddings"] for feature in features]

        # collate the inputs and targets into a batch, using the processor this
        # collator was constructed with rather than a global of the same name
        batch = self.processor.pad(input_ids=input_ids, labels=label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        padding_mask = batch["decoder_attention_mask"].unsqueeze(-1).ne(1)
        batch["labels"] = batch["labels"].masked_fill(padding_mask, -100)

        # not used during fine-tuning
        del batch["decoder_attention_mask"]

        reduction_factor = self.reduction_factor
        if reduction_factor is None:
            reduction_factor = model.config.reduction_factor

        # round down target lengths to multiple of reduction factor
        if reduction_factor > 1:
            max_length = max(
                len(feature["input_values"]) - len(feature["input_values"]) % reduction_factor
                for feature in label_features
            )
            batch["labels"] = batch["labels"][:, :max_length]

        # also add in the speaker embeddings
        batch["speaker_embeddings"] = torch.tensor(speaker_features)

        return batch

In SpeechT5, the input to the decoder part of the model is reduced by a factor of 2. In other words, it throws away every 
other timestep from the target sequence. The decoder then predicts a sequence that is twice as long. Since the original 
target sequence length may be odd, the data collator makes sure to round the maximum length of the batch down to be a 
multiple of 2.

In [None]:
# Collator instance that the trainer below uses to build batches.
data_collator = TTSDataCollatorWithPadding(processor=processor)

## Train the model

Now we will load the pre-trained model from the same checkpoint as we used for loading the processor:

In [None]:
from transformers import SpeechT5ForTextToSpeech

# Load the pre-trained text-to-speech model from the same checkpoint as the processor.
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)

The `use_cache=True` option is incompatible with gradient checkpointing, so we will disable it for training.

In [None]:
# Turn the cache off for training; the training arguments below enable
# gradient checkpointing.
model.config.use_cache = False

Define the training arguments. Here we are not computing any evaluation metrics during the training process. Instead, we'll 
only look at the loss:

In [None]:
from transformers import Seq2SeqTrainingArguments

# Training configuration for the fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    output_dir="speecht5_finetuned_voxpopuli_nl",  # change to a repo name of your choice
    # 4 examples per device x 8 accumulation steps = 32 examples per device
    # for each optimizer update.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    gradient_checkpointing=True,
    # NOTE(review): fp16 presumably requires a GPU runtime - confirm before running on CPU.
    fp16=True,
    # Evaluate and save a checkpoint every 1000 steps; log every 25 steps.
    evaluation_strategy="steps",
    per_device_eval_batch_size=2,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    # Lower is better for the model-selection metric; no metric_for_best_model
    # is set, so this presumably falls back to the evaluation loss - TODO confirm.
    load_best_model_at_end=True,
    greater_is_better=False,
    label_names=["labels"],
    push_to_hub=True,
)

Now we will instantiate the `Seq2SeqTrainer` object and pass the model, dataset, and data collator to it.

In [None]:
from transformers import Seq2SeqTrainer

# Wire the model, the train/test splits and the custom collator into the trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    # The processor is passed in the `tokenizer` slot, presumably so it is
    # saved alongside the model checkpoints - TODO confirm.
    tokenizer=processor,
)

/content/speecht5_finetuned_voxpopuli_nl is already a clone of https://huggingface.co/SurendraKumarDhaka/speecht5_finetuned_voxpopuli_nl. Make sure you pull the latest changes with `repo.git_pull()`.


### Training

In [None]:
# Run fine-tuning; the value returned by train() is displayed as the cell output.
trainer.train()



Step,Training Loss,Validation Loss
1000,0.5235,0.479455
2000,0.5013,0.466575


Step,Training Loss,Validation Loss
1000,0.5235,0.479455
2000,0.5013,0.466575
3000,0.4933,0.46071
4000,0.4857,0.459464


TrainOutput(global_step=4000, training_loss=0.5225280311107635, metrics={'train_runtime': 8980.6135, 'train_samples_per_second': 14.253, 'train_steps_per_second': 0.445, 'total_flos': 1.6992227696948856e+16, 'train_loss': 0.5225280311107635, 'epoch': 17.21})

Now we will push the final model to the Hugging Face Hub:

In [None]:
# Upload the trained model to the Hub.
trainer.push_to_hub()

Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 1.00/558M [00:00<?, ?B/s]

Upload file runs/May25_09-24-25_36dcc00bf32b/events.out.tfevents.1685006668.36dcc00bf32b.175.4:   0%|         …

To https://huggingface.co/SurendraKumarDhaka/speecht5_finetuned_voxpopuli_nl
   54d58e0..4aeb34e  main -> main

   54d58e0..4aeb34e  main -> main

To https://huggingface.co/SurendraKumarDhaka/speecht5_finetuned_voxpopuli_nl
   4aeb34e..503898a  main -> main

   4aeb34e..503898a  main -> main



'https://huggingface.co/SurendraKumarDhaka/speecht5_finetuned_voxpopuli_nl/commit/4aeb34ed5ff194f72c5a7a6a30525de2f72efc7e'

## Inference

Now that we have fine-tuned a model, we can use it for inference!
For that we will load the model from the Hub.

In [None]:
from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan,SpeechT5ForTextToSpeech


In [None]:
from transformers import SpeechT5Processor
# Re-create the processor from the same base checkpoint used in the
# preprocessing section.
checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)

Downloading (…)rocessor_config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading spm_char.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/232 [00:00<?, ?B/s]

In [None]:
# Load the fine-tuned model from the Hub repository the training run pushed to.
model = SpeechT5ForTextToSpeech.from_pretrained("SurendraKumarDhaka/speecht5_finetuned_voxpopuli_nl")

Downloading (…)lve/main/config.json:   0%|          | 0.00/2.11k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/585M [00:00<?, ?B/s]

Obtain a speaker embedding. Here we take a pre-computed x-vector from the `Matthijs/cmu-arctic-xvectors` dataset:

In [None]:
import numpy as np
import torch
from datasets import load_dataset
# Dataset of pre-computed x-vectors ("validation" split).
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# Take the x-vector stored at index 7310 and add a leading batch dimension.
speaker_embeddings = torch.tensor(embeddings_dataset[7310]["xvector"]).unsqueeze(0)

Downloading builder script:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Downloading and preparing dataset cmu-arctic-xvectors/default to /root/.cache/huggingface/datasets/Matthijs___cmu-arctic-xvectors/default/0.0.1/a62fea1f9415e240301ea0042ffad2a3aadf4d1caa7f9a8d9512d631723e781f...


Downloading data:   0%|          | 0.00/17.9M [00:00<?, ?B/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset cmu-arctic-xvectors downloaded and prepared to /root/.cache/huggingface/datasets/Matthijs___cmu-arctic-xvectors/default/0.0.1/a62fea1f9415e240301ea0042ffad2a3aadf4d1caa7f9a8d9512d631723e781f. Subsequent calls will reuse this data.


In [None]:
import os

# Optional override: use an x-vector stored in a local .npy file instead of the
# embedding selected from the dataset above. The file is only read when it is
# present, so the notebook still runs from a fresh kernel without it.
xvector_path = "cmu_us_slt_arctic-wav-arctic_a0499.npy"
if os.path.exists(xvector_path):
    # The embedding built from the dataset is a torch tensor with a leading
    # batch dimension; give the file-based one the same form instead of
    # leaving it as a raw NumPy array.
    speaker_embeddings = torch.tensor(np.load(xvector_path))
    if speaker_embeddings.dim() == 1:
        speaker_embeddings = speaker_embeddings.unsqueeze(0)

In [None]:
# Sentence to synthesize.
# NOTE(review): this sentence is English, while the model was fine-tuned on the Dutch subset.
text = "Wikipedia is a free online encyclopedia, created and edited by volunteers around the world and hosted by the Wikimedia Foundation."

Preprocess the input text:

In [None]:
# Tokenize the text, returning PyTorch tensors.
inputs = processor(text=text, return_tensors="pt")

Create a spectrogram with your model:

In [None]:
# Generate the spectrogram for the tokenized text with the chosen speaker embedding.
spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)

Load the HiFi-GAN vocoder, which converts the spectrogram into an audio waveform:

In [None]:
from transformers import SpeechT5HifiGan
# Pre-trained HiFi-GAN vocoder, applied to the spectrogram in the next cell.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")


In [None]:
import soundfile as sf

# NOTE: this rebinds the name `Audio`, which earlier in the notebook referred
# to `datasets.Audio`.
from IPython.display import Audio

# Run the vocoder without tracking gradients.
with torch.no_grad():
    speech = vocoder(spectrogram)

# Save the waveform as a 16 kHz WAV file.
sf.write("speech.wav", speech.numpy(), samplerate=16000)

# Inline audio player; last expression of the cell.
Audio(speech.numpy(), rate=16000)

In [None]:
import soundfile as sf
# Writes the waveform to "speech.wav" at 16 kHz. The previous cell already
# performs the same write, so this cell repeats it.
sf.write("speech.wav", speech.numpy(), samplerate=16000)