<a href="https://colab.research.google.com/github/Rohit-515/tts_finetuned_model/blob/main/speechT5_finetune_hindi/implementation_and_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
with open('requirements.txt', 'w') as f:
    f.write('numpy==1.23.5\n')
    f.write('transformers\n')
    f.write('datasets\n')
    f.write('soundfile\n')
    f.write('torch\n')
    f.write('torchaudio\n')
    f.write('sentencepiece\n')
    f.write('speechbrain==0.5.16\n')
    f.write('librosa\n')
    f.write('gradio==4.44.1\n')

In [4]:
!pip install -r requirements.txt

Collecting numpy==1.23.5 (from -r requirements.txt (line 1))
  Downloading numpy-1.23.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Collecting datasets (from -r requirements.txt (line 3))
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting speechbrain==0.5.16 (from -r requirements.txt (line 8))
  Downloading speechbrain-0.5.16-py3-none-any.whl.metadata (23 kB)
Collecting gradio==4.44.1 (from -r requirements.txt (line 10))
  Downloading gradio-4.44.1-py3-none-any.whl.metadata (15 kB)
Collecting hyperpyyaml (from speechbrain==0.5.16->-r requirements.txt (line 8))
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio==4.44.1->-r requirements.txt (line 10))
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio==4.44.1->-r requirements.txt (line 10))
  Downloading fastapi-0.115.3-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (fr

In [1]:
import gradio as gr
import torch
import soundfile as sf
import os
import numpy as np
import re
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
from datasets import load_dataset

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [5]:
def load_models_and_data():
    model_name = "microsoft/speecht5_tts"
    processor = SpeechT5Processor.from_pretrained(model_name)
    model = SpeechT5ForTextToSpeech.from_pretrained("rohit0619/speecht5_finetuned_rohit_hindi").to(device)
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

    spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
    speaker_model = EncoderClassifier.from_hparams(
        source=spk_model_name,
        run_opts={"device": device},
        savedir=os.path.join("/tmp", spk_model_name),
    )

    # Load a sample from a dataset for default embedding
    dataset = load_dataset("1rsh/tts-rj-hi-karya", split="train")
    example = dataset[10]

    return model, processor, vocoder, speaker_model, example

model, processor, vocoder, speaker_model, default_example = load_models_and_data()

  torch.load(path, map_location=device), strict=False
  stats = torch.load(path, map_location=device)


README.md:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

train-00000-of-00015.parquet:   0%|          | 0.00/495M [00:00<?, ?B/s]

train-00001-of-00015.parquet:   0%|          | 0.00/495M [00:00<?, ?B/s]

train-00002-of-00015.parquet:   0%|          | 0.00/493M [00:00<?, ?B/s]

train-00003-of-00015.parquet:   0%|          | 0.00/496M [00:00<?, ?B/s]

train-00004-of-00015.parquet:   0%|          | 0.00/495M [00:00<?, ?B/s]

train-00005-of-00015.parquet:   0%|          | 0.00/495M [00:00<?, ?B/s]

train-00006-of-00015.parquet:   0%|          | 0.00/497M [00:00<?, ?B/s]

train-00007-of-00015.parquet:   0%|          | 0.00/497M [00:00<?, ?B/s]

train-00008-of-00015.parquet:   0%|          | 0.00/494M [00:00<?, ?B/s]

train-00009-of-00015.parquet:   0%|          | 0.00/496M [00:00<?, ?B/s]

train-00010-of-00015.parquet:   0%|          | 0.00/495M [00:00<?, ?B/s]

train-00011-of-00015.parquet:   0%|          | 0.00/496M [00:00<?, ?B/s]

train-00012-of-00015.parquet:   0%|          | 0.00/495M [00:00<?, ?B/s]

train-00013-of-00015.parquet:   0%|          | 0.00/495M [00:00<?, ?B/s]

train-00014-of-00015.parquet:   0%|          | 0.00/495M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/75.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/422603 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4269 [00:00<?, ? examples/s]

In [6]:
def create_speaker_embedding(waveform):
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform).unsqueeze(0).to(device))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze()
    return speaker_embeddings

def prepare_default_embedding(example):
    audio = example["audio"]
    return create_speaker_embedding(audio["array"])

default_embedding = prepare_default_embedding(default_example)


In [7]:
replacements = {
    ('ँ', 'n'),   # Anusvara (nasal sound)
    ('ं', 'n'),   # Anusvara (nasal sound)
    ('ः', 'h'),   # Visarga (aspirated sound)
    ('अ', 'uh'),
    ('आ', 'aa'),
    ('इ', 'i'),
    ('ई', 'ee'),
    ('उ', 'u'),
    ('ऊ', 'oo'),
    ('ऋ', 'ri'),
    ('ऍ', 'ae'),
    ('ए', 'e'),
    ('ऐ', 'ai'),
    ('ऑ', 'aw'),
    ('ओ', 'o'),
    ('औ', 'au'),
    ('क', 'k'),
    ('ख', 'kh'),
    ('ग', 'g'),
    ('घ', 'gh'),
    ('च', 'ch'),
    ('छ', 'chh'),
    ('ज', 'j'),
    ('झ', 'jh'),
    ('ञ', 'ny'),
    ('ट', 't'),
    ('ठ', 'th'),
    ('ड', 'd'),
    ('ढ', 'dh'),
    ('ण', 'n'),
    ('त', 't'),
    ('थ', 'th'),
    ('द', 'd'),
    ('ध', 'dh'),
    ('न', 'n'),
    ('प', 'p'),
    ('फ', 'ph'),
    ('ब', 'b'),
    ('भ', 'bh'),
    ('म', 'm'),
    ('य', 'y'),
    ('र', 'r'),
    ('ल', 'l'),
    ('व', 'v'),
    ('श', 'sh'),
    ('ष', 'shh'),
    ('स', 's'),
    ('ह', 'h'),
    ('़', ''),    # Nukta (diacritic mark for foreign sounds)
    ('ा', 'aa'),  # Vowel sound modifier
    ('ि', 'i'),   # Vowel sound modifier
    ('ी', 'ee'),  # Vowel sound modifier
    ('ु', 'u'),   # Vowel sound modifier
    ('ू', 'oo'),  # Vowel sound modifier
    ('ृ', 'ri'),  # Vowel sound modifier
    ('े', 'e'),   # Vowel sound modifier
    ('ै', 'ai'),  # Vowel sound modifier
    ('ॉ', 'aw'),  # Vowel sound modifier
    ('ो', 'o'),   # Vowel sound modifier
    ('ौ', 'au'),  # Vowel sound modifier
    ('्', ''),    # Halant (for stopping consonant sound)
    ('क़', 'q'),
    ('ख़', 'kh'),
    ('ग़', 'gh'),
    ('ज़', 'z'),
    ('ड़', 'r'),
    ('ढ़', 'rh'),
    ('फ़', 'f'),
    ('ॠ', 'rri'),
    ('।', 'period'),  # Purnavirama (full stop)
    ('०', '0'),
    ('१', '1'),
    ('२', '2'),
    ('३', '3'),
    ('४', '4'),
    ('५', '5'),
    ('६', '6'),
    ('७', '7'),
    ('८', '8'),
    ('९', '9')
}

In [8]:
!pip install indic-num2words

from num_to_words import num_to_word

def replace_numbers_with_words(text):
    def replace(match):
        number = int(match.group())
        return number_to_word(number, lang='hi')

    result = re.sub(r'\b\d+\b', replace, text)
    return result


Collecting indic-num2words
  Downloading indic_num2words-1.3.0-py3-none-any.whl.metadata (3.6 kB)
Downloading indic_num2words-1.3.0-py3-none-any.whl (17 kB)
Installing collected packages: indic-num2words
Successfully installed indic-num2words-1.3.0


In [11]:
def normalize_text(text):

    # Replace numbers with words
    text = replace_numbers_with_words(text)

    # Apply character replacements
    for old, new in replacements:
        text = text.replace(old, new)

    # Remove punctuation
    text = re.sub(r'[^\u0900-\u097F\s\']', '', text)

    return text

In [13]:
def text_to_speech(text, audio_file=None):
    # Normalize the input text
    normalized_text = normalize_text(text)

    # Prepare the input for the model
    inputs = processor(text=normalized_text, return_tensors="pt").to(device)

    # Use the default speaker embedding
    speaker_embeddings = default_embedding

    # Generate speech
    with torch.no_grad():
        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings.unsqueeze(0), vocoder=vocoder)

    speech_np = speech.cpu().numpy()

    return (16000, speech_np)

In [14]:
iface = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Enter text to convert to speech")
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy")
    ],
    title="Hindi Language SpeechT5 Text-to-Speech Demo",
    description="Enter Your text, and listen to the generated speech."
)

In [15]:
iface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://fcfb20ea2ecfc89815.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




##Evaluation of finetuned regional model

The model is not generating anything, model has been trained once again but not getting proper result and further I did not know what to do now

In [None]:
mos_ratings = 0
naturalness = 0
intelligibility = 0
pronounciation = 0