In [None]:
# Kokoro is an open-source, state-of-the-art (SOTA) text-to-speech model trained on <100 hours of audio.
# The model is only 385 MB, which makes it lightweight and easy to run even on a CPU.
# You can also fine-tune the model further!

In [None]:
# Install dependencies 
!git lfs install
!git clone https://huggingface.co/hexgrad/Kokoro-82M
%cd Kokoro-82M
!apt-get -qq -y install espeak-ng > /dev/null 2>&1
!pip install -q phonemizer torch transformers scipy munch

In [None]:
# Build the model and load the default voicepack
from models import build_model
import torch

In [None]:
# Default to the CPU and switch to the GPU only when CUDA is usable.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [None]:
# Build the model from the checkpoint file in the current directory,
# targeting the device selected earlier.
MODEL = build_model(
    'kokoro-v0_19.pth',
    device,
)

In [None]:
# Candidate voice names; each is expected to have a matching voices/<name>.pt file.
# Change the index on the VOICE_NAME line to pick a different voice.
_VOICE_CHOICES = (
    'af',  # Default voice is a 50-50 mix of Bella & Sarah
    'af_bella', 'af_sarah', 'am_adam', 'am_michael',
    'bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis',
    'af_nicole', 'af_sky',
)
VOICE_NAME = _VOICE_CHOICES[0]

# Load the selected voicepack (weights only) and move it to the chosen device.
_voicepack_path = f'voices/{VOICE_NAME}.pt'
VOICEPACK = torch.load(
    _voicepack_path,
    weights_only=True,
).to(device)
print(f'Loaded voice: {VOICE_NAME}')

In [None]:
# Synthesize speech for `text`: generate returns 24 kHz audio and the phonemes used.
from kokoro import generate

# Language is determined by the first letter of the VOICE_NAME:
# 🇺🇸 'a' => American English => en-us
# 🇬🇧 'b' => British English => en-gb
text = "This is test to experiment the new Kokoru's text2speech model to see how good it is and if it can be used for Customer Support agents."
audio, out_ps = generate(
    MODEL,
    text,
    VOICEPACK,
    lang=VOICE_NAME[0],
)

In [None]:
# Render an autoplaying player for the 24 kHz audio, then print the output phonemes.
from IPython.display import display, Audio

_player = Audio(data=audio, rate=24000, autoplay=True)
display(_player)
print(out_ps)