# Ejemplos de Large Language Models (LLM) disponibles en Hugging Face para convertir texto a audio (es decir, leer el texto en voz alta)

https://huggingface.co/models

In [1]:
#@title Instalar paquete Transformers de HuggingFace
!pip install --upgrade pip
!pip install --upgrade transformers sentencepiece datasets[audio]
#!pip install --upgrade accelerate

Collecting pip
  Downloading pip-23.3.2-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-23.3.2
Collecting transformers
  Downloading transformers-4.37.2-py3-none-any.whl.metadata (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting datasets[audio]
  Downloading datasets-2.16.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets[audio])
  Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting multiprocess (from datasets[audio])
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to

In [2]:
#@title Cargar Librerías

from transformers import pipeline
from datasets import load_dataset
import soundfile as sf
import torch

import re
import random

import textwrap
from IPython.display import HTML
from base64 import b64encode

print("Librerías cargadas.")

# Choose the compute device: the first CUDA GPU when one is available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

print("Device: ", device)

Librerías cargadas.
Device:  cpu


# Modelo Text-to-Speech (TTS)

In [3]:
#@title Cargar Modelo TTS

nombreModeloTTS = "microsoft/speecht5_tts" #@param [ "microsoft/speecht5_tts"]
speaker_embeddings_dataset_name = "Matthijs/cmu-arctic-xvectors" #@param[ "Matthijs/cmu-arctic-xvectors" ]
#@markdown otros modelos disponibles en: https://huggingface.co/models?pipeline_tag=text-to-speech&sort=trending

#@markdown También ver https://colab.research.google.com/drive/1i7I5pzBcU3WDFarDnzweIj4-sVVoIUFJ#scrollTo=W7spxtTGtmba para ejemplos sobre cómo personalizar la voz.

# Row of the speaker-embedding dataset whose "xvector" conditions the voice.
# Change this index to pick a different row (presumably a different voice;
# verify against the dataset card).
SPEAKER_EMBEDDING_INDEX = 7306

# Load the text-to-speech pipeline on the device selected in the imports cell,
# so a detected GPU is actually used instead of silently staying on the CPU.
synthesiser = pipeline("text-to-speech", model=nombreModeloTTS, device=device)

# Load the speaker embeddings and build the tensor passed to the model;
# unsqueeze(0) adds a leading dimension of size 1 to the selected vector.
embeddings_dataset = load_dataset(speaker_embeddings_dataset_name, split="validation")
speaker_embedding = torch.tensor(
    embeddings_dataset[SPEAKER_EMBEDDING_INDEX]["xvector"]
).unsqueeze(0)


def reproducir(text, archivo_procesar="speech.wav"):
  """Synthesise `text` to speech and return an inline HTML audio player.

  The module-level `synthesiser` pipeline is called with `speaker_embedding`
  as the speaker conditioning. The resulting audio is written to
  `archivo_procesar` and then embedded in the returned HTML as a base64
  data URL, so the player carries the audio itself.

  Args:
    text: Text to be read aloud.
    archivo_procesar: Path of the audio file to write; it is overwritten on
      every call. Defaults to "speech.wav". The path is expected to end in
      ".wav", because the player declares the payload as audio/wav.

  Returns:
    An `IPython.display.HTML` object containing an `<audio>` element.
  """
  speech = synthesiser(text, forward_params={"speaker_embeddings": speaker_embedding})
  sf.write(archivo_procesar, speech["audio"], samplerate=speech["sampling_rate"])

  # Read the file back inside a context manager so the handle is always closed.
  with open(archivo_procesar, "rb") as wav_file:
    wav_bytes = wav_file.read()

  # The payload is WAV audio, so both the data URL and the element say so
  # (previously the URL claimed video/mp4 and a <video> element was used).
  data_url = "data:audio/wav;base64," + b64encode(wav_bytes).decode()
  return HTML("""
  <audio controls>
        <source src="%s" type="audio/wav">
  </audio>
  """ % data_url)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/585M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/232 [00:00<?, ?B/s]

spm_char.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/50.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.3M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/7931 [00:00<?, ? examples/s]

In [7]:
#@title Probar Modelo TTS 1

texto = "hello, how are you?" #@param {type:"string"}

# Echo the prompt, wrapped at 100 columns so long inputs stay readable.
print(
    "> ",
    textwrap.fill(texto, width=100),
)

# Final expression of the cell: the returned HTML player is rendered as output.
reproducir(texto)

>  hello, how are you?


In [9]:
#@title Probar Modelo TTS 2

texto = "Artificial intelligence (AI) is the intelligence of machines or software, as opposed to the intelligence of humans or other animals. " #@param {type:"string"}

# Echo the prompt, wrapped at 100 columns so long inputs stay readable.
print(
    "> ",
    textwrap.fill(texto, width=100),
)

# Final expression of the cell: the returned HTML player is rendered as output.
reproducir(texto)

>  Artificial intelligence (AI) is the intelligence of machines or software, as opposed to the
intelligence of humans or other animals.
