In [None]:
from transformers import pipeline
import scipy

# Directory of the fine-tuned checkpoint that the pipeline loads.
model_id = "./Finetune/vits_mms_finetune/models/mms-tts-nova-train"

# device=0 runs the pipeline on GPU index 0; remove the argument to stay on the CPU.
synthesiser = pipeline(task="text-to-speech", model=model_id, device=0)

speech = synthesiser("ၾႃႉၾူၼ်သွမ်ႇတႃ ႁၢင်ႈလီၼႃႇၼႃႇ")

# Write element 0 of the returned audio to disk at the sampling rate the
# pipeline reports alongside it.
scipy.io.wavfile.write(
    "finetuned_output.wav",
    rate=speech["sampling_rate"],
    data=speech["audio"][0],
)

In [None]:
from transformers import VitsTokenizer

# Encode a sample sentence and decode it again; the decoded string is the
# value displayed by this cell.
tokenizer = VitsTokenizer.from_pretrained(
    "./Finetune/vits_mms_finetune/models/mms-tts-nova-train"
)
text = "ၾႃႉၾူၼ်ၵမ်ႇလမ်သွမ်ႇ ၸွမ်းၾင်ႇၼမ်ႉၾင်ႇၼွင်"

token_ids = tokenizer.encode(text)
tokenizer.decode(token_ids)

In [1]:
from transformers import VitsModel, VitsTokenizer, set_seed
import torch
from shannlp import util, word_tokenize

def preprocess_string(input_string: str) -> str:
    """Replace purely numeric tokens of ``input_string`` with their word form.

    The text is split with ``shannlp.word_tokenize``. Every token that, once
    surrounding whitespace is removed, consists only of decimal digits is
    converted with ``int`` and passed to ``shannlp.util.num_to_shanword``
    (expected to return the number spelled out in Shan -- confirm against the
    shannlp documentation). All other tokens are kept exactly as produced by
    the tokenizer, and the pieces are concatenated without a separator.

    Args:
        input_string: Raw text that may contain numerals.

    Returns:
        The re-joined text with numeric tokens replaced.
    """
    pieces = []
    for token in word_tokenize(input_string):
        candidate = token.strip()
        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as superscript digits ("²") that int() cannot parse, which would
        # raise ValueError in the middle of preprocessing.
        if candidate.isdecimal():
            pieces.append(util.num_to_shanword(int(candidate)))
        else:
            pieces.append(token)

    return ''.join(pieces)

# The fine-tuned checkpoint directory holds both the tokenizer files and the
# model weights, so the same path is used for both loads.
model_name = "./Finetune/vits_mms_finetune/models/mms-tts-nova-train"
tokenizer = VitsTokenizer.from_pretrained(model_name)
model = VitsModel.from_pretrained(model_name)

# Sample passages used as synthesis input. `text` and `text2` contain ASCII
# digits; `text3` and `text4` do not.
text = """မိူဝ်ႈပီ 1958 လိူၼ်မေႊ 21 ဝၼ်းၼၼ်ႉ ၸဝ်ႈၼွႆႉသေႃးယၼ်ႇတ ဢမ်ႇၼၼ် ၸဝ်ႈၼွႆႉ ဢွၼ်ႁူဝ် ၽူႈႁၵ်ႉၸိူဝ်ႉၸၢတ်ႈ 31 ၵေႃႉသေ တိူင်ႇၵၢဝ်ႇယၼ်ႇၸႂ် ၵိၼ်ၼမ်ႉသတ်ႉၸႃႇ တႃႇၵေႃႇတင်ႈပူၵ်းပွင် ၵၢၼ်လုၵ်ႉၽိုၼ်ႉ တီႈႁူၺ်ႈပူႉ ႁိမ်းသူပ်းၼမ်ႉၵျွတ်ႈ ၼႂ်းဢိူင်ႇမိူင်းႁၢင် ၸႄႈဝဵင်းမိူင်းတူၼ် ၸိုင်ႈတႆးပွတ်းဢွၵ်ႇၶူင်း လႅၼ်လိၼ်ၸိုင်ႈထႆး။"""
text2 = """သိုၵ်းမၢၼ်ႈဢဝ်ၶိူင်ႈမိၼ်တိုၵ်းပွႆႇမၢၵ်ႇ ဢဝ်ၵွင်ႈလူင်ယိုဝ်းလႄႈမၢၵ်ႇၾင်လိၼ် ႁဵတ်းႁႂ်ႈၵူၼ်းမိူင်းလုတၢႆ 7 ၵေႃႉ၊ မၢတ်ႇၸဵပ်း 20 ၼႂ်းၸႄႈဝဵင်းၼွင်ၶဵဝ်လႄႈ မၢတ်ႇၸဵပ်း 3 ၵေႃႉၼႂ်းၸႄႈဝဵင်းမိူင်းမိတ်ႈ ၸိုင်ႈတႆးပွတ်းႁွင်ႇ"""
text3 = """ၵူၺ်းသမ်ႉမၢၵ်ႈမီးမႃးၵွပ်ႈမၼ်းၼမ်လၢႆ ငိုၼ်းၶီႈငိုၼ်းၶွၼ်ႇ ငိုၼ်းၶွင်လၢႆၵေႃႈလႆႈၵိၼ်ပႃး တေလီဢမ်ႇလီတႃႇပိူၼ်ႈတႄႉဢမ်ႇႁူႉ"""
text4 = """ၾႃႈၾူၼ်ၵမ်ႇလမ်သႃး ႁေႃႈၵႃးၵႂႃႇလႄႇလဵၼ်ႈ တိုင်းၾႃႉတိုင်းၾူၼ် ႁူၼ်ၶႂ်ႈၼွၼ်းဝၼ်း ၾၼ်ႁၼ်ႁၢင်ႈလီ ၼၢင်းၽီၼိူဝ်ၾႃႉ"""

# Generation settings are plain attributes on the loaded model.
model.speaking_rate = 1.2
model.noise_scale = 0.8

# Replace digit tokens via preprocess_string, then tokenize into PyTorch tensors.
processed_string = preprocess_string(text)
inputs = tokenizer(processed_string, return_tensors="pt")

# Seed immediately before the forward pass so repeated runs of this cell
# start from the same random state.
set_seed(456)
with torch.no_grad():
    output = model(**inputs)

# Index 0 selects the waveform of the single input sequence.
waveform = output.waveform[0]


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
from IPython.display import Audio

# Render an inline audio player for the synthesized waveform at the model's
# configured sampling rate.
Audio(data=waveform, rate=model.config.sampling_rate)


In [3]:
# Debug aid: dump the tokenizer output (token id tensors) fed to the model.
print(inputs)

{'input_ids': tensor([[ 0, 10,  0, 15,  0, 12,  0,  5,  0,  1,  0,  3,  0, 20,  0, 31,  0,  2,
          0, 15,  0, 22,  0, 11,  0,  1,  0,  3,  0, 23,  0, 28,  0, 11,  0,  1,
          0,  7,  0,  5,  0,  1,  0,  3,  0, 20,  0, 17,  0,  7,  0,  1,  0,  8,
          0, 23,  0,  6,  0,  3,  0, 21,  0, 15,  0, 20,  0,  1,  0,  4,  0, 20,
          0, 33,  0,  9,  0,  1,  0,  8,  0, 16,  0, 15,  0, 12,  0,  2,  0,  1,
          0, 10,  0, 19,  0, 48,  0, 21,  0, 17,  0,  5,  0,  1,  0,  4,  0, 13,
          0, 28,  0,  9,  0,  1,  0,  4,  0,  5,  0,  2,  0,  1,  0,  4,  0,  2,
          0,  2,  0,  1,  0, 14,  0, 43,  0, 18,  0,  5,  0,  1,  0,  3,  0,  2,
          0, 27,  0, 24,  0, 14,  0, 21,  0, 19,  0,  6,  0,  4,  0, 29,  0,  2,
          0,  1,  0,  8,  0,  9,  0, 43,  0, 13,  0, 10,  0,  1,  0,  8,  0,  2,
          0,  2,  0,  1,  0, 43,  0, 18,  0,  5,  0,  1,  0,  3,  0,  2,  0, 27,
          0, 24,  0, 14,  0, 43,  0, 13,  0, 27,  0,  2,  0,  1,  0, 23,  0, 12,
          0,  

In [4]:
# Debug aid: dump the full model output object, including the waveform tensor.
print(output)

VitsModelOutput(waveform=tensor([[-7.1445e-06, -1.0369e-05, -4.6381e-05,  ...,  2.5176e-04,
          1.7845e-04,  2.4536e-04]]), sequence_lengths=tensor([321280]), spectrogram=tensor([[[-0.0760,  0.9632,  0.1010,  ...,  0.5149,  0.2956, -0.4877],
         [ 0.3045,  0.4140,  0.1577,  ...,  1.1720,  0.6379,  0.5424],
         [-0.0979, -2.1794, -2.5842,  ..., -1.7795, -1.2630, -2.2350],
         ...,
         [ 0.6953,  1.4124,  0.4159,  ...,  1.5285,  1.8194,  1.0474],
         [ 0.8494,  0.7464,  1.9675,  ...,  1.9624,  0.5913,  0.0773],
         [ 1.5960,  2.2913,  3.7308,  ...,  0.1266, -0.2213,  1.5017]]]), hidden_states=None, attentions=None)


In [5]:
# Sampling rate from the model config; the same value is passed to Audio() for playback.
print(model.config.sampling_rate)

16000


In [None]:
# Display the preprocessed form of `text` as the cell result.
preprocess_string(text)

In [6]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub before the push_to_hub calls below.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
# Upload the currently loaded model to the Hub repository named here.
model.push_to_hub("NorHsangPha/mms-tts-nova-train")

model.safetensors:   0%|          | 0.00/145M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/NorHsangPha/mms-tts-nova-train/commit/5b437231aaa1b52d763a9d2d61e2d8747aa2d2bf', commit_message='Upload model', commit_description='', oid='5b437231aaa1b52d763a9d2d61e2d8747aa2d2bf', pr_url=None, pr_revision=None, pr_num=None)

In [8]:
# Upload the tokenizer to the same Hub repository as the model.
tokenizer.push_to_hub("NorHsangPha/mms-tts-nova-train")

CommitInfo(commit_url='https://huggingface.co/NorHsangPha/mms-tts-nova-train/commit/e578d3dfb19189f5d12db25f87bef96d958c98c1', commit_message='Upload tokenizer', commit_description='', oid='e578d3dfb19189f5d12db25f87bef96d958c98c1', pr_url=None, pr_revision=None, pr_num=None)

In [12]:
from transformers import VitsModel, VitsTokenizer, set_seed
import torch
from IPython.display import Audio

# Load the locally fine-tuned "shn" checkpoint; model and tokenizer come from
# the same directory.
model = VitsModel.from_pretrained("./Finetune/vits_mms_finetune/models/mms-tts-shn-train")
tokenizer = VitsTokenizer.from_pretrained("./Finetune/vits_mms_finetune/models/mms-tts-shn-train")

text = """မိူဝ်ႈပီ 1958 လိူၼ်မေႊ 21 ဝၼ်းၼၼ်ႉ ၸဝ်ႈၼွႆႉသေႃးယၼ်ႇတ ဢမ်ႇၼၼ် ၸဝ်ႈၼွႆႉ ဢွၼ်ႁူဝ် ၽူႈႁၵ်ႉၸိူဝ်ႉၸၢတ်ႈ 31 ၵေႃႉသေ တိူင်ႇၵၢဝ်ႇယၼ်ႇၸႂ် ၵိၼ်ၼမ်ႉသတ်ႉၸႃႇ တႃႇၵေႃႇတင်ႈပူၵ်းပွင် ၵၢၼ်လုၵ်ႉၽိုၼ်ႉ တီႈႁူၺ်ႈပူႉ ႁိမ်းသူပ်းၼမ်ႉၵျွတ်ႈ ၼႂ်းဢိူင်ႇမိူင်းႁၢင် ၸႄႈဝဵင်းမိူင်းတူၼ် ၸိုင်ႈတႆးပွတ်းဢွၵ်ႇၶူင်း လႅၼ်လိၼ်ၸိုင်ႈထႆး။"""
# Tokenize the sentence defined in this cell so the cell does not depend on a
# variable (`text4`) that only exists if another cell has been run.
# NOTE(review): the text is not passed through preprocess_string here, so the
# numerals reach the tokenizer unchanged -- confirm that is intended.
inputs = tokenizer(text, return_tensors="pt")
set_seed(456)

# Generation settings are plain attributes on the loaded model.
model.speaking_rate = 0.9
model.noise_scale = 0.8

with torch.no_grad():
    output = model(**inputs)

# Index 0 selects the waveform of the single input sequence.
waveform = output.waveform[0]

Audio(waveform, rate=model.config.sampling_rate)

In [None]:


# Load the published checkpoint and its tokenizer from the Hugging Face Hub.
model = VitsModel.from_pretrained("NorHsangPha/mms-tts-shn-train")
processor = VitsTokenizer.from_pretrained("NorHsangPha/mms-tts-shn-train")

text = """မိူဝ်ႈပီ 1958 လိူၼ်မေႊ 21 ဝၼ်းၼၼ်ႉ ၸဝ်ႈၼွႆႉသေႃးယၼ်ႇတ ဢမ်ႇၼၼ် ၸဝ်ႈၼွႆႉ ဢွၼ်ႁူဝ် ၽူႈႁၵ်ႉၸိူဝ်ႉၸၢတ်ႈ 31 ၵေႃႉသေ တိူင်ႇၵၢဝ်ႇယၼ်ႇၸႂ် ၵိၼ်ၼမ်ႉသတ်ႉၸႃႇ တႃႇၵေႃႇတင်ႈပူၵ်းပွင် ၵၢၼ်လုၵ်ႉၽိုၼ်ႉ တီႈႁူၺ်ႈပူႉ ႁိမ်းသူပ်းၼမ်ႉၵျွတ်ႈ ၼႂ်းဢိူင်ႇမိူင်းႁၢင် ၸႄႈဝဵင်းမိူင်းတူၼ် ၸိုင်ႈတႆးပွတ်းဢွၵ်ႇၶူင်း လႅၼ်လိၼ်ၸိုင်ႈထႆး။"""
# Use the tokenizer loaded in this cell (`processor`), so the Hub model is
# exercised with the Hub tokenizer rather than whatever `tokenizer` object an
# earlier cell left in the kernel.
inputs = processor(text, return_tensors="pt")
set_seed(555)

# Generation settings are plain attributes on the loaded model.
model.speaking_rate = 0.9
model.noise_scale = 0.8

with torch.no_grad():
    output = model(**inputs)

# Index 0 selects the waveform of the single input sequence.
waveform = output.waveform[0]

Audio(waveform, rate=model.config.sampling_rate)