# Text to speech

## Load the dataset

In [None]:
from datasets import Dataset, Audio
import json

# Load the raw records and build an in-memory Dataset from them.
# The encoding is given explicitly so non-ASCII transcripts decode the same
# way regardless of the platform's default locale.
with open("/content/data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

dataset = Dataset.from_list(data)

# Treat the "audio" column as audio data at a 16 kHz sampling rate.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
# Inspect the first record.
dataset[0]

In [None]:
# NOTE: this is the same cast that the loading cell already applies;
# repeating it here is redundant.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

## Preprocess the data

In [None]:
from transformers import SpeechT5Processor

# Pretrained checkpoint; the processor is loaded from it here and the model
# is loaded from the same name later on.
checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)


In [None]:
# Keep a handle on the processor's text tokenizer for the vocabulary check.
tokenizer = processor.tokenizer

In [None]:
def extract_all_chars(batch):
    """Collect the distinct characters used in a batch of transcripts.

    Every entry of ``batch["text"]`` is joined with single spaces. The
    result holds the joined string under ``"all_text"`` and its distinct
    characters (in arbitrary order) under ``"vocab"``, each wrapped in a
    one-element list.
    """
    joined = " ".join(batch["text"])
    unique_chars = [*set(joined)]
    return {"vocab": [unique_chars], "all_text": [joined]}


# Run the character extraction over the dataset with batch_size=-1 and drop
# every original column from the result.
vocabs = dataset.map(
    extract_all_chars,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
    remove_columns=dataset.column_names,
)

# Characters that occur in the data vs. tokens the tokenizer knows about.
dataset_vocab = set(vocabs["vocab"][0])
tokenizer_vocab = set(tokenizer.get_vocab())

### Speaker embeddings

In [None]:
# Install runtime dependencies (notebook shell command).
!pip install datasets soundfile speechbrain accelerate

In [None]:
import os
import torch
from speechbrain.pretrained import EncoderClassifier

# Identifier of the pretrained SpeechBrain speaker encoder.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    # Local directory under /tmp where the model files are stored.
    savedir=os.path.join("/tmp", spk_model_name),
)


def create_speaker_embedding(waveform):
    """Return a normalised speaker embedding for ``waveform`` as a NumPy array.

    The waveform is converted to a tensor, encoded with the module-level
    ``speaker_model``, normalised along dimension 2, stripped of all
    size-one dimensions and moved to the CPU. Gradients are not tracked.
    """
    with torch.no_grad():
        encoded = speaker_model.encode_batch(torch.tensor(waveform))
        normalised = torch.nn.functional.normalize(encoded, dim=2)
        return normalised.squeeze().cpu().numpy()

### Processing the dataset

In [None]:
def prepare_dataset(example):
    """Turn one raw record into the features used for training.

    Passes the record's text and audio through the module-level
    ``processor`` and attaches a speaker embedding computed from the same
    waveform. Returns the processor output with ``"labels"`` and
    ``"speaker_embeddings"`` set as below.
    """
    audio = example["audio"]
    waveform = audio["array"]

    features = processor(
        text=example["text"],
        audio_target=waveform,
        sampling_rate=audio["sampling_rate"],
        return_attention_mask=False,
    )

    # Keep only the first entry of the labels; presumably the processor
    # returns a batch of one here, TODO confirm.
    features["labels"] = features["labels"][0]

    features["speaker_embeddings"] = create_speaker_embedding(waveform)

    return features

In [None]:
# Number of records in the dataset at this point.
len(dataset)

In [None]:
# Try the preprocessing on the first record and list the produced fields.
processed_example = prepare_dataset(dataset[0])
list(processed_example.keys())

In [None]:
# Show the full processed record.
processed_example

In [None]:
# Shape of the speaker embedding computed for this record.
processed_example["speaker_embeddings"].shape

In [None]:
# Preprocess every record and drop the original columns from the result.
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)

Next, create a basic train/test split:

In [None]:
# Hold out 10% of the records as the "test" split.
# NOTE(review): no seed is passed, so the split presumably differs between
# runs; confirm and pass `seed=` if reproducibility matters.
dataset = dataset.train_test_split(test_size=0.1)

In [None]:
# Write every split to a single JSON file, keyed by split name.
json_path = "dataset.json"
data_dict = {name: split_data.to_dict() for name, split_data in dataset.items()}

with open(json_path, "w") as f:
    json.dump(data_dict, f, indent=4)

In [None]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union


@dataclass
class TTSDataCollatorWithPadding:
    """Pad a list of preprocessed examples into one training batch.

    Each feature must provide ``"input_ids"``, ``"labels"`` and
    ``"speaker_embeddings"`` entries (the fields read in ``__call__``).

    Attributes:
        processor: Object whose ``pad`` method collates the token ids and
            the target features into tensors.
    """

    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` into a dict of batched tensors."""
        input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
        label_features = [{"input_values": feature["labels"]} for feature in features]
        speaker_features = [feature["speaker_embeddings"] for feature in features]

        # Use the processor this collator was constructed with rather than a
        # module-level global of the same name.
        batch = self.processor.pad(input_ids=input_ids, labels=label_features, return_tensors="pt")

        # Overwrite the padded target positions with -100, the padding marker
        # that Hugging Face losses conventionally skip.
        batch["labels"] = batch["labels"].masked_fill(batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100)

        # The mask was only needed to locate the padding above.
        del batch["decoder_attention_mask"]

        # NOTE: `model` is the module-level model object; it has to exist by
        # the time the first batch is collated.
        reduction_factor = model.config.reduction_factor
        if reduction_factor > 1:
            # Round every target length down to a multiple of the reduction
            # factor and crop the batch to the longest rounded length.
            max_length = max(
                len(feature["input_values"]) // reduction_factor * reduction_factor
                for feature in label_features
            )
            batch["labels"] = batch["labels"][:, :max_length]

        batch["speaker_embeddings"] = torch.tensor(speaker_features)

        return batch

In [None]:
# Build the collator around the processor loaded earlier.
data_collator = TTSDataCollatorWithPadding(processor=processor)

## Train the model

In [None]:
from transformers import SpeechT5ForTextToSpeech

# Load the pretrained text-to-speech model from the same checkpoint name as
# the processor.
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)

In [None]:
# Turn the cache off for training.
# NOTE(review): presumably needed because gradient checkpointing is enabled
# in the training arguments; confirm.
model.config.use_cache = False

In [None]:
from transformers import Seq2SeqTrainingArguments

# Training configuration.
# NOTE(review): max_steps=5 is smaller than save_steps/eval_steps (10) and
# logging_steps (25), so this run presumably ends before any evaluation,
# checkpoint or logging step is reached. It looks like a smoke-test setup;
# raise max_steps for real fine-tuning.
training_args = Seq2SeqTrainingArguments(
    output_dir="speecht5_finetuned_mr",  # change to a repo name of your choice
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    learning_rate=1e-5,
    warmup_steps=5,
    max_steps=5,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=2,
    save_steps=10,
    eval_steps=10,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    greater_is_better=False,
    label_names=["labels"],
    push_to_hub=True,
    disable_tqdm=False,  # Explicitly enable tqdm
)

In [None]:
from transformers import Seq2SeqTrainer

# Wire the model, both dataset splits, the collator and the training
# arguments into a trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    tokenizer=processor,  # the full processor is passed in the tokenizer slot
)

In [None]:
# Run fine-tuning.
trainer.train()

In [None]:
# Upload the training result to the Hub.
trainer.push_to_hub()

In [None]:
# Save the model locally (no path given, so the trainer's default location
# is used).
trainer.save_model()

## Inference

In [None]:
# Load the fine-tuned weights from the Hub.
# NOTE: the repository id is hard-coded; replace it with your own repository
# if you trained under a different name.
model = SpeechT5ForTextToSpeech.from_pretrained("Sankalp-Bahad/speecht5_finetuned_mr")

Pick an example (here, the first one from the test dataset) and obtain its speaker embedding.

In [None]:
# Number of records in the test split.
len(dataset["test"])

In [None]:
# Take the first test record and turn its stored speaker embedding into a
# tensor with an added leading dimension.
example = dataset["test"][0]
speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)

Define some input text (it is tokenized in the next step).

In [None]:
# Input sentence to synthesize.
text = "बदल कधीच सोपा नसतो."

Preprocess the input text:

In [None]:
# Run the text through the processor, requesting PyTorch tensors.
inputs = processor(text=text, return_tensors="pt")

Create a spectrogram with your model:

In [None]:
# Generate speech features for the text, conditioned on the speaker embedding.
# NOTE(review): no vocoder is passed, so this presumably returns a
# spectrogram rather than a waveform; confirm.
spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)

In [None]:
import torch
import torchaudio
# Imported under an alias so the `Audio` feature class imported from
# `datasets` at the top of the notebook is not shadowed.
from IPython.display import Audio as AudioPlayer

# Use the GPU when present; fall back to the CPU instead of failing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The vocoder is fed (batch, mel_bins, frames). A 2-D spectrogram from
# `generate_speech` is (frames, mel_bins), so the axes must be swapped with a
# transpose; a reshape keeps the memory order and scrambles frames and bins.
# The dimension check keeps the cell safe to re-run.
if spectrogram.dim() == 2:
    spectrogram = spectrogram.transpose(0, 1).unsqueeze(0)
spectrogram = spectrogram.to(device)

# NOTE(review): this WaveGlow checkpoint is a Tacotron 2 companion vocoder and
# was presumably trained on mel features that differ from the ones SpeechT5
# produces (sampling rate, mel scaling). If the audio sounds wrong, switch to
# the matching `SpeechT5HifiGan` vocoder ("microsoft/speecht5_hifigan") via
# `model.generate_speech(..., vocoder=vocoder)`; confirm.
waveglow = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_waveglow', model_math='fp32', map_location=device)

waveglow = waveglow.to(device)
with torch.no_grad():
    audio_waveform = waveglow.infer(spectrogram)

# NOTE(review): playback assumes 16 kHz output; verify against the vocoder's
# native sampling rate.
AudioPlayer(audio_waveform.squeeze(0).cpu().numpy(), rate=16000)