<a href="https://colab.research.google.com/github/Rohit-515/tts_finetuned_model/blob/main/speechT5_finetune_hindi/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets soundfile accelerate speechbrain

In [None]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub; the training cells below push the
# fine-tuned checkpoint to the Hub (push_to_hub=True / trainer.push_to_hub()).
notebook_login()

In [None]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech

# Load the processor and the TTS model from the same "microsoft/speecht5_tts"
# checkpoint. Both names are re-assigned from that checkpoint again further
# down in the notebook.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")

In [None]:
from datasets import load_dataset, Audio

# Fetch the TTS corpus; the result is indexed by split name below.
dataset = load_dataset("1rsh/tts-rj-hi-karya")

In [None]:
# Display the loaded object to inspect its splits and columns.
dataset

In [None]:
# Keep only the 'train' split; `dataset` is a single split from here on.
dataset = dataset['train']

In [None]:
# Number of examples in the train split before sub-sampling.
len(dataset)

In [None]:
# NOTE: despite its name, `half_size` is len(dataset) // 35, so only the first
# 1/35th of the examples is kept (not half of them).
half_size = len(dataset) // 35
dataset = dataset.select(range(half_size))

print(dataset)

In [None]:
# Declare the "audio" column as 16 kHz audio; the same rate (16000) is used
# when the synthesized speech is played back and saved at the end.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
from transformers import SpeechT5Processor

# Checkpoint id reused later when the model is re-instantiated for training.
model_name = "microsoft/speecht5_tts"
# Re-creates the processor that was already loaded near the top of the notebook.
processor = SpeechT5Processor.from_pretrained(model_name)


In [None]:
# Text tokenizer of the processor; used below to compare vocabularies.
tokenizer = processor.tokenizer

In [None]:
# Peek at three raw examples (rows 2, 3 and 4).
dataset[2:5]

In [None]:
def extract_all_chars(batch):
    """Collect the distinct characters of the raw ``sentence`` column.

    Args:
        batch: mapping whose ``"sentence"`` entry is a list of strings.

    Returns:
        A dict with two single-element lists: ``"vocab"`` holds the list of
        unique characters (in arbitrary order) and ``"all_text"`` holds all
        sentences joined by single spaces.
    """
    joined = " ".join(batch["sentence"])
    return {"vocab": [list(set(joined))], "all_text": [joined]}


# Run the extractor once over the dataset (batch_size=-1) and drop the
# original columns so only the "vocab" / "all_text" outputs remain.
vocabs = dataset.map(
    extract_all_chars,
    remove_columns=dataset.column_names,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
)

# Characters that occur in the data vs. tokens the tokenizer already knows.
dataset_vocab = set(vocabs["vocab"][0])
tokenizer_vocab = set(tokenizer.get_vocab())

Map:   0%|          | 0/12074 [00:00<?, ? examples/s]

In [None]:
# Characters found in the raw sentences that are not tokenizer tokens.
dataset_vocab - tokenizer_vocab

In [None]:
import re

def normalize_text(text):
    """Keep only Devanagari characters, whitespace and apostrophes.

    Every character outside U+0900-U+097F that is neither whitespace nor an
    apostrophe is deleted (this covers punctuation, but also Latin letters and
    ASCII digits). Runs of whitespace are then collapsed to single spaces and
    leading/trailing whitespace is dropped.

    Args:
        text: input string.

    Returns:
        The filtered, whitespace-normalized string.
    """
    devanagari_only = re.sub(r'[^\u0900-\u097F\s\']', '', text)
    return ' '.join(devanagari_only.split())

# Row-level helper for dataset.map: stores the normalized sentence alongside the original
def add_normalized_text(example):
    """Add a ``normalized_text`` entry derived from ``example['sentence']``.

    The example is modified in place and the same object is returned.
    """
    cleaned = normalize_text(example['sentence'])
    example['normalized_text'] = cleaned
    return example

# Apply the function to the dataset (row by row; adds the 'normalized_text' column)
dataset = dataset.map(add_normalized_text)

# Print rows 2-4 to verify that the new column looks as expected
print(dataset[2:5])

In [None]:
def extract_all_chars(batch):
    """Collect the distinct characters of the ``normalized_text`` column.

    Same contract as the earlier definition of this name (which it replaces),
    except that it reads ``batch["normalized_text"]`` instead of the raw
    sentences.

    Returns:
        ``{"vocab": [unique characters], "all_text": [space-joined text]}``;
        the order of the unique characters is arbitrary.
    """
    corpus = " ".join(batch["normalized_text"])
    unique_chars = list(set(corpus))
    return dict(vocab=[unique_chars], all_text=[corpus])


# Recompute the character inventory, this time from the normalized text.
vocabs = dataset.map(
    extract_all_chars,
    remove_columns=dataset.column_names,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
)

# Characters present after normalization vs. tokens known to the tokenizer.
dataset_vocab = set(vocabs["vocab"][0])
tokenizer_vocab = set(tokenizer.get_vocab())

In [None]:
# Manually add the Devanagari digits 4-7 to the character inventory
# (presumably because they do not occur in the sub-sampled data — TODO confirm).
new_symbols = ['४', '५', '६','७']
dataset_vocab.update(new_symbols)

In [None]:
# Characters that still need a mapping because the tokenizer lacks them.
dataset_vocab - tokenizer_vocab

In [None]:
# Ordered (source, target) pairs used to transliterate Devanagari text into
# Latin characters before tokenization.
#
# This is a list rather than a set on purpose: the pairs are applied one after
# another with str.replace, so their order matters. A set iterates in an
# arbitrary order, which made the result vary for a consonant followed by a
# nukta (e.g. base consonant + '़'): if the bare consonant or the nukta was
# replaced first, the combined entry could never match.
replacements = [
    ('ँ', 'n'),   # Chandrabindu (nasalisation mark)
    ('ं', 'n'),   # Anusvara (nasal sound)
    ('ः', 'h'),   # Visarga (aspirated sound)
    ('अ', 'uh'),
    ('आ', 'aa'),
    ('इ', 'i'),
    ('ई', 'ee'),
    ('उ', 'u'),
    ('ऊ', 'oo'),
    ('ऋ', 'ri'),
    ('ऍ', 'ae'),
    ('ए', 'e'),
    ('ऐ', 'ai'),
    ('ऑ', 'aw'),
    ('ओ', 'o'),
    ('औ', 'au'),
    ('क', 'k'),
    ('ख', 'kh'),
    ('ग', 'g'),
    ('घ', 'gh'),
    ('च', 'ch'),
    ('छ', 'chh'),
    ('ज', 'j'),
    ('झ', 'jh'),
    ('ञ', 'ny'),
    ('ट', 't'),
    ('ठ', 'th'),
    ('ड', 'd'),
    ('ढ', 'dh'),
    ('ण', 'n'),
    ('त', 't'),
    ('थ', 'th'),
    ('द', 'd'),
    ('ध', 'dh'),
    ('न', 'n'),
    ('प', 'p'),
    ('फ', 'ph'),
    ('ब', 'b'),
    ('भ', 'bh'),
    ('म', 'm'),
    ('य', 'y'),
    ('र', 'r'),
    ('ल', 'l'),
    ('व', 'v'),
    ('श', 'sh'),
    ('ष', 'shh'),
    ('स', 's'),
    ('ह', 'h'),
    ('़', ''),    # Nukta (diacritic mark for foreign sounds)
    ('ा', 'aa'),  # Vowel sound modifier
    ('ि', 'i'),   # Vowel sound modifier
    ('ी', 'ee'),  # Vowel sound modifier
    ('ु', 'u'),   # Vowel sound modifier
    ('ू', 'oo'),  # Vowel sound modifier
    ('ृ', 'ri'),  # Vowel sound modifier
    ('े', 'e'),   # Vowel sound modifier
    ('ै', 'ai'),  # Vowel sound modifier
    ('ॉ', 'aw'),  # Vowel sound modifier
    ('ो', 'o'),   # Vowel sound modifier
    ('ौ', 'au'),  # Vowel sound modifier
    ('्', ''),    # Halant (for stopping consonant sound)
    ('क़', 'q'),
    ('ख़', 'kh'),
    ('ग़', 'gh'),
    ('ज़', 'z'),
    ('ड़', 'r'),
    ('ढ़', 'rh'),
    ('फ़', 'f'),
    ('ॠ', 'rri'),
    ('।', 'period'),  # Purnavirama (full stop)
    ('०', '0'),
    ('१', '1'),
    ('२', '2'),
    ('३', '3'),
    ('४', '4'),
    ('५', '5'),
    ('६', '6'),
    ('७', '7'),
    ('८', '8'),
    ('९', '9')
]

# Longest source first, so multi-code-point sequences are rewritten before
# their individual code points. The sort is stable, so entries of equal
# length keep the order in which they are listed above.
replacements.sort(key=lambda pair: len(pair[0]), reverse=True)

In [None]:
def cleanup_text(inputs):
    """Transliterate ``inputs["normalized_text"]`` using ``replacements``.

    Each (source, target) pair from the module-level ``replacements`` is
    applied in iteration order with ``str.replace``. The mapping is updated in
    place and returned.
    """
    transliterated = inputs["normalized_text"]
    for devanagari, latin in replacements:
        transliterated = transliterated.replace(devanagari, latin)
    inputs["normalized_text"] = transliterated
    return inputs

# Transliterate the normalized text of every row.
dataset = dataset.map(cleanup_text)

In [None]:
import os
import torch
from speechbrain.pretrained import EncoderClassifier

# Pretrained speaker-embedding model used to compute one embedding per clip.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

# Run the encoder on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    # Model files are stored under /tmp/<spk_model_name>.
    savedir=os.path.join("/tmp", spk_model_name),
)


def create_speaker_embedding(waveform):
    """Compute a normalized speaker embedding for one waveform.

    The waveform is converted to a tensor, encoded with the module-level
    ``speaker_model``, L2-normalized along dimension 2, squeezed, and returned
    as a NumPy array on the CPU. No gradients are tracked.
    """
    signal = torch.tensor(waveform)
    with torch.no_grad():
        xvector = speaker_model.encode_batch(signal)
        xvector = torch.nn.functional.normalize(xvector, dim=2)
    return xvector.squeeze().cpu().numpy()

In [None]:
def prepare_dataset(example):
    """Turn one dataset row into model-ready features.

    Args:
        example: row with a ``"normalized_text"`` string and an ``"audio"``
            dict providing ``"array"`` and ``"sampling_rate"``.

    Returns:
        The processor output for the text/audio pair, with ``"labels"``
        reduced to its first element and a ``"speaker_embeddings"`` entry
        computed from the same waveform.
    """
    clip = example["audio"]
    waveform = clip["array"]

    features = processor(
        text=example["normalized_text"],
        audio_target=waveform,
        sampling_rate=clip["sampling_rate"],
        return_attention_mask=False,
    )

    # The processor returns a batch of one; keep just that single item.
    features["labels"] = features["labels"][0]

    # Speaker embedding for this clip, computed with the SpeechBrain encoder.
    features["speaker_embeddings"] = create_speaker_embedding(waveform)

    return features

In [None]:
# Smoke-test the preprocessing on the first row and list the produced keys.
processed_example = prepare_dataset(dataset[0])
list(processed_example.keys())

In [None]:
# Check the shape of the speaker embedding computed for that row.
processed_example["speaker_embeddings"].shape

In [None]:
# Preprocess every row; the original columns are dropped so only the
# features returned by prepare_dataset remain.
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)

In [None]:
def is_not_too_long(input_ids):
    """Return True when the token sequence has fewer than 200 ids."""
    return len(input_ids) < 200

# Drop rows whose "input_ids" have 200 or more entries, then show how many remain.
dataset = dataset.filter(is_not_too_long, input_columns=["input_ids"])
len(dataset)

In [None]:
# Hold out 10% of the rows for evaluation. The seed is fixed so that the split
# (and therefore the hard-coded test index used for inference further down)
# is the same on every run.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union


@dataclass
class TTSDataCollatorWithPadding:
    """Collate preprocessed TTS examples into a single padded batch.

    Each feature dict must provide ``"input_ids"``, ``"labels"`` and
    ``"speaker_embeddings"``. Padding is delegated to ``self.processor.pad``.

    The reduction factor is read from the module-level ``model`` at call time
    (``model.config.reduction_factor``), so ``model`` must be defined before
    the collator is used.
    """

    # Object whose ``pad`` method batches the token ids and the target frames.
    processor: Any

    def __call__(
        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        """Build one batch from ``features``.

        Returns:
            The padded batch with ``"labels"`` masked to -100 at padded
            positions, no ``"decoder_attention_mask"`` entry, and an added
            ``"speaker_embeddings"`` tensor.
        """
        input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
        label_features = [{"input_values": feature["labels"]} for feature in features]
        speaker_features = [feature["speaker_embeddings"] for feature in features]

        # Collate the inputs and targets into a batch, using the processor this
        # collator was constructed with rather than a same-named global.
        batch = self.processor.pad(
            input_ids=input_ids, labels=label_features, return_tensors="pt"
        )

        # Replace padding with -100 so padded frames are ignored by the loss.
        batch["labels"] = batch["labels"].masked_fill(
            batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100
        )

        # Not used during fine-tuning.
        del batch["decoder_attention_mask"]

        # Round each target length down to a multiple of the reduction factor
        # and trim the label tensor to the longest rounded length.
        reduction_factor = model.config.reduction_factor
        if reduction_factor > 1:
            max_length = max(
                len(feature["input_values"])
                - len(feature["input_values"]) % reduction_factor
                for feature in label_features
            )
            batch["labels"] = batch["labels"][:, :max_length]

        # Also add in the speaker embeddings.
        batch["speaker_embeddings"] = torch.tensor(speaker_features)

        return batch

In [None]:
# Collator instance handed to the trainer below.
data_collator = TTSDataCollatorWithPadding(processor=processor)

In [None]:
from transformers import SpeechT5ForTextToSpeech

# Fresh copy of the base checkpoint to fine-tune (replaces the `model` loaded earlier).
model = SpeechT5ForTextToSpeech.from_pretrained(model_name)

In [None]:
from transformers import Seq2SeqTrainingArguments

# Fine-tuning configuration: 800 optimizer steps, each accumulating 8 batches
# of 4 examples per device; evaluation and checkpointing every 100 steps.
training_args = Seq2SeqTrainingArguments(
    output_dir="speecht5_finetuned_rohit_hindi",  # change to a repo name of your choice
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    learning_rate=1e-5,
    warmup_steps=100,
    max_steps=800,
    gradient_checkpointing=True,
    fp16=True,  # NOTE(review): presumably needs a CUDA GPU — confirm before running on CPU
    evaluation_strategy="steps",  # NOTE(review): newer transformers releases may expect `eval_strategy` — confirm
    per_device_eval_batch_size=2,
    save_steps=100,
    eval_steps=100,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    greater_is_better=False,  # lower metric values count as better when picking the best checkpoint
    label_names=["labels"],
    push_to_hub=True,
)

In [None]:
from transformers import Seq2SeqTrainer

# Wire the model, the train/test splits and the padding collator together.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    # The full processor is passed in the `tokenizer` slot.
    tokenizer=processor,
)

In [None]:
# Run fine-tuning with the configuration above.
trainer.train()

In [None]:
# Upload the result to the Hugging Face Hub.
trainer.push_to_hub()

In [None]:
# Load the fine-tuned weights; the path matches the trainer's `output_dir`.
model = SpeechT5ForTextToSpeech.from_pretrained(
    "speecht5_finetuned_rohit_hindi"
)

In [None]:
# Borrow the speaker embedding of test row 304 (hard-coded index) and add a
# leading batch dimension with unsqueeze(0).
example = dataset["test"][304]
speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)

In [None]:
# Sentence to synthesize; it contains Devanagari digits and punctuation.
text = "मुझे सुबह ७ बजे उठना है, मेरे पास २ किताबें हैं।"

In [None]:
!pip install indic-num2words

from num_to_words import num_to_word

import re

def replace_numbers_with_words(text):
    """Spell out every standalone number in ``text`` as Hindi words.

    A number is a run of digits delimited by word boundaries. In a ``str``
    pattern ``\\d`` matches any Unicode decimal digit, so Devanagari digits
    are converted as well as ASCII ones.

    Args:
        text: input string.

    Returns:
        ``text`` with each matched number replaced by the result of
        ``num_to_word(number, lang='hi')``.
    """
    def replace(match):
        number = int(match.group())
        # `num_to_word` is the name imported from num_to_words; the previous
        # call to `number_to_word` raised NameError on the first match.
        return num_to_word(number, lang='hi')

    return re.sub(r'\b\d+\b', replace, text)


In [None]:
# String-based transliteration helper. NOTE: this rebinds `cleanup_text`,
# replacing the earlier definition that operated on dataset rows.
def cleanup_text(text):
    """Return ``text`` with every ``replacements`` pair applied in order."""
    result = text
    for source, target in replacements:
        result = result.replace(source, target)
    return result

In [None]:
# Preprocess the input the same way the training text was prepared:
# normalize first, then transliterate. normalize_text deletes every character
# outside the Devanagari block, so running it after the transliteration (as
# before) removed all Latin output and left `final_text` empty.
converted_text = replace_numbers_with_words(text)
normalized_input = normalize_text(converted_text)
cleaned_text = cleanup_text(normalized_input)
final_text = cleaned_text
final_text

In [None]:
# Tokenize the prepared text into PyTorch tensors.
inputs = processor(text=final_text, return_tensors="pt")

In [None]:
from transformers import SpeechT5HifiGan

# Load the pretrained HiFi-GAN vocoder and synthesize speech for the token
# ids, conditioned on the speaker embedding chosen above.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

In [None]:
# NOTE: this import rebinds the name `Audio`, which was imported from
# `datasets` earlier; re-running the cast_column cell after this one would
# pick up the IPython class instead.
from IPython.display import Audio
import soundfile as sf

# Play the generated waveform inline at 16 kHz.
Audio(speech.numpy(), rate=16000)

In [None]:
# Save the audio to a file (e.g., 'output.wav') at a 16 kHz sample rate
sf.write('output.wav', speech.numpy(), 16000)