In [8]:
import os
from scipy.io.wavfile import write

os.environ["CUDA_VISIBLE_DEVICES"] = "0"


from IPython.display import Audio
import nltk  # we'll use this to split into sentences
import numpy as np

from bark.generation import (
    generate_text_semantic,
    preload_models,
)
from bark.api import semantic_to_waveform
from bark import generate_audio, SAMPLE_RATE

In [2]:
preload_models()

No GPU being used. Careful, inference might be very slow!


In [None]:
nltk.download('all')


# Simple Long-Form Generation
We split longer text into sentences using `nltk` and generate the sentences one by one.

In [3]:
script = """
Hey, have you heard about this new text-to-audio model called "Bark"? 
Apparently, it's the most realistic and natural-sounding text-to-audio model 
out there right now. People are saying it sounds just like a real person speaking. 
I think it uses advanced machine learning algorithms to analyze and understand the 
nuances of human speech, and then replicates those nuances in its own speech output. 
It's pretty impressive, and I bet it could be used for things like audiobooks or podcasts. 
In fact, I heard that some publishers are already starting to use Bark to create audiobooks. 
It would be like having your own personal voiceover artist. I really think Bark is going to 
be a game-changer in the world of text-to-audio technology.
""".replace("\n", " ").strip()

In [5]:
sentences = nltk.sent_tokenize(script)

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /Users/ricky/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/ricky/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/ricky/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/ricky/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/ricky/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to
[nltk_data]    |     /Users/ricky/nltk_data...
[nltk_data]    | Downloading pac

In [6]:
SPEAKER = "v2/en_speaker_6"
silence = np.zeros(int(0.25 * SAMPLE_RATE))  # quarter second of silence

pieces = []
for sentence in sentences:
    audio_array = generate_audio(sentence, history_prompt=SPEAKER)
    pieces += [audio_array, silence.copy()]


100%|██████████| 283/283 [00:07<00:00, 36.16it/s]
100%|██████████| 15/15 [00:34<00:00,  2.32s/it]
100%|██████████| 450/450 [00:11<00:00, 40.12it/s]
100%|██████████| 23/23 [00:52<00:00,  2.30s/it]
100%|██████████| 250/250 [00:05<00:00, 42.69it/s]
100%|██████████| 13/13 [00:30<00:00,  2.33s/it]
100%|██████████| 653/653 [00:16<00:00, 38.63it/s]
100%|██████████| 33/33 [01:16<00:00,  2.32s/it]
100%|██████████| 381/381 [00:09<00:00, 41.68it/s]
100%|██████████| 20/20 [00:45<00:00,  2.26s/it]
100%|██████████| 570/570 [00:14<00:00, 39.81it/s]
100%|██████████| 29/29 [01:10<00:00,  2.44s/it]
100%|██████████| 173/173 [00:04<00:00, 37.40it/s]
100%|██████████| 9/9 [00:23<00:00,  2.66s/it]
100%|██████████| 334/334 [00:10<00:00, 31.28it/s]
100%|██████████| 17/17 [00:42<00:00,  2.50s/it]


In [10]:
audio_file = np.concatenate(pieces)
Audio(audio_file, rate=SAMPLE_RATE)

In [13]:
write('output_audio.wav', SAMPLE_RATE, audio_file)

# $ \\ $

# Advanced Long-Form Generation
Somtimes Bark will hallucinate a little extra audio at the end of the prompt.
We can solve this issue by lowering the threshold for bark to stop generating text. 
We use the `min_eos_p` kwarg in `generate_text_semantic`

In [None]:
GEN_TEMP = 0.6
SPEAKER = "v2/en_speaker_6"
silence = np.zeros(int(0.25 * SAMPLE_RATE))  # quarter second of silence

pieces = []
for sentence in sentences:
    semantic_tokens = generate_text_semantic(
        sentence,
        history_prompt=SPEAKER,
        temp=GEN_TEMP,
        min_eos_p=0.05,  # this controls how likely the generation is to end
    )

    audio_array = semantic_to_waveform(semantic_tokens, history_prompt=SPEAKER,)
    pieces += [audio_array, silence.copy()]



100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 38.05it/s]
100%|████████████████████████████████████████████████████████████████████████| 18/18 [00:07<00:00,  2.46it/s]
100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 32.28it/s]
100%|████████████████████████████████████████████████████████████████████████| 21/21 [00:08<00:00,  2.54it/s]
100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 55.78it/s]
100%|████████████████████████████████████████████████████████████████████████| 14/14 [00:05<00:00,  2.57it/s]
100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:06<00:00, 14.73it/s]
100%|████████████████████████████████████████████████████████████████████████| 35/35 [00:14<00:00,  2.47it/s]
100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 40.29it/s]
100%|█████

In [None]:
Audio(np.concatenate(pieces), rate=SAMPLE_RATE)

# $ \\ $

# Make a Long-Form Dialog with Bark

### Step 1: Format a script and speaker lookup

In [None]:
speaker_lookup = {"Samantha": "v2/en_speaker_9", "John": "v2/en_speaker_2"}

# Script generated by chat GPT
script = """
Samantha: Hey, have you heard about this new text-to-audio model called "Bark"?

John: No, I haven't. What's so special about it?

Samantha: Well, apparently it's the most realistic and natural-sounding text-to-audio model out there right now. People are saying it sounds just like a real person speaking.

John: Wow, that sounds amazing. How does it work?

Samantha: I think it uses advanced machine learning algorithms to analyze and understand the nuances of human speech, and then replicates those nuances in its own speech output.

John: That's pretty impressive. Do you think it could be used for things like audiobooks or podcasts?

Samantha: Definitely! In fact, I heard that some publishers are already starting to use Bark to create audiobooks. And I bet it would be great for podcasts too.

John: I can imagine. It would be like having your own personal voiceover artist.

Samantha: Exactly! I think Bark is going to be a game-changer in the world of text-to-audio technology."""
script = script.strip().split("\n")
script = [s.strip() for s in script if s]
script

['Samantha: Hey, have you heard about this new text-to-audio model called "Bark"?',
 "John: No, I haven't. What's so special about it?",
 "Samantha: Well, apparently it's the most realistic and natural-sounding text-to-audio model out there right now. People are saying it sounds just like a real person speaking.",
 'John: Wow, that sounds amazing. How does it work?',
 'Samantha: I think it uses advanced machine learning algorithms to analyze and understand the nuances of human speech, and then replicates those nuances in its own speech output.',
 "John: That's pretty impressive. Do you think it could be used for things like audiobooks or podcasts?",
 'Samantha: Definitely! In fact, I heard that some publishers are already starting to use Bark to create audiobooks. And I bet it would be great for podcasts too.',
 'John: I can imagine. It would be like having your own personal voiceover artist.',
 'Samantha: Exactly! I think Bark is going to be a game-changer in the world of text-to-audi

### Step 2: Generate the audio for every speaker turn

In [None]:
pieces = []
silence = np.zeros(int(0.5*SAMPLE_RATE))
for line in script:
    speaker, text = line.split(": ")
    audio_array = generate_audio(text, history_prompt=speaker_lookup[speaker], )
    pieces += [audio_array, silence.copy()]

100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 34.03it/s]
100%|████████████████████████████████████████████████████████████████████████| 22/22 [00:08<00:00,  2.55it/s]
100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 71.58it/s]
100%|████████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00,  2.65it/s]
100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 22.75it/s]
100%|████████████████████████████████████████████████████████████████████████| 33/33 [00:13<00:00,  2.53it/s]
100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 70.76it/s]
100%|████████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00,  2.63it/s]
100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 20.46it/s]
100%|█████

### Step 3: Concatenate all of the audio and play it

In [None]:
Audio(np.concatenate(pieces), rate=SAMPLE_RATE)