In [21]:
from openai import OpenAI
from pathlib import Path
import os
from playsound import playsound
import json

In [2]:
api_key = ""

# Step 1

OpenAI client instance.

Define characteristics of each voice.

estimate_cost method used to estimate cost of CHARACTER level sequence.

In [3]:
client = OpenAI(api_key=api_key)

VOICES = {
    "alloy": "androgynous, soft",
    "echo": "male, soft",
    "fable": "male, soft, british accent",
    "onyx": "male, grizzly",
    "nova": "female, soft, adult",
    "shimmer": "female, soft, middle aged",
}

def estimate_cost(string, per_char):
    one_K_units = len(string) / 100
    return one_K_units * per_char

# Step 2

Load snippet of a book, chosen Shadow Slave is a webnovel downloaded manually.

In [4]:
book_file = "ShadowSlave_Ch1.txt"

with open(book_file, "r") as fh:
    book_lines = fh.readlines()

book_str = "\n".join(book_lines)

snippet = book_str[:1000]

# Step 3

Define system message, and create chat completion for gpt-4.

System message instructs the model to segment up the given snippet into a script like string with special delimiters.

In [5]:
sys_msg1 = """You are a text classification expert. 
Given a section of text from a book extract the character / narrator dialogue section.
All text should be perfectly preserved without anything changed, only classifying who says what.
For example the input: \"Walking the dog Janice shouted \'\Here Boy!'. Soon after a black labrador ran through the open field\"
Provides the output in the following format marking the speaker with *!*SPEAKER*!*:
*!*Narrator*!* "Walking the dog Jance shouted"
*!*Janice!*!* "Here Boy!"
*!*Narrator*!* "Soon after a black labrador ran through the open field"
"""

completion = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "system","content": sys_msg1},
        {"role": "user"  ,"content": snippet},
    ]
)

# Step 4

Format the returned completion for easier use.

Generate characters array (The unique set of characters found via system message 1).

In [6]:
script = completion.choices[0].message.content.split("*!*")
while "" in script:
    script.remove("")

source = None
is_voice_source = True
formatted_script = []
characters = []
for section in script:
    if is_voice_source:
        source = section.strip()
        characters.append(source)
        is_voice_source = False
    else:
        formatted_script.append((source, section.strip()))
        is_voice_source = True

characters = list(set(characters))

Here we within the given snippet two voices or characters were found, one of the is the narrator and the other is named Sunny.

In [26]:
characters

['Sunny', 'Narrator']

Using the formatted chat completion we can see the sequence of text we now feed into the text-to-voice model.

In [23]:
for part in formatted_script:
    speaker, text = part
    print(repr(text))

'"A frail-looking young man with pale skin and dark circles under his eyes was sitting on a rusty bench across from the police station. He was cradling a cup of coffee in his hands — not the cheap synthetic type slum rats like him had access to, but the real deal. This cup of plant-based coffee, usually available only to higher rank citizens, had cost most of his savings. But on this particular day, Sunny decided to pamper himself. After all, his life was coming to an end. Enjoying the warmth of the luxurious drink, he raised the cup and savored the aroma. Then, tentatively, he took a small sip… and immediately grimaced."'
'"Ah! So bitter!"'
'"Giving the cup of coffee an intense look, Sunny sighed and forced himself to drink some more. Bitter or not, he was determined to get his money\'s worth — taste buds be damned."'
'"I should have bought a piece of real meat instead. Who knew actual coffee is so disgusting? Well. It\'s going to keep me awake, at least."'


# Step 5

Now we determing from the defined VOICES mapping which character/narrator should be assigned which voice.

The system message indicates to the model that given a string of the script generated in step 3 match a character to a fitting voice.

In [7]:
voices_desc = ""
for voice in VOICES:
    voices_desc += f"{voice}: {VOICES[voice]}\n"

script_str = ""
for part in formatted_script:
    speaker, text = part
    script_str += f"{speaker}: {text}"

sys_msg2 = f"""You are the director of an audio book and have to choose who gets to voice each character.
Choose from the following list of voices {list(VOICES)} and the related descriptions.
{voices_desc}

Given the above list of voices, and the script assign voices to each character and reply in json where the key is the characters name, and the value is voice name."""

completion1 = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "system","content": sys_msg2},
        {"role": "user"  ,"content": script_str},
    ]
)

For consistency sake the voice_legend is manually assigned within this step, however using json.loads we can use the returned completion1 message to fill this dictioanry.

In [27]:
voices_legend = {
    "Sunny": "alloy",
    "Narrator": "onyx"
}

# Step 6

Given each segment of the script prompt for chat completion.

Each line of the script has a text section, a speaker (who is saying the current line) and a related mapping from voices_legend to speaker.

Save the resulting file to 0 indexed mp4 file.

In [28]:
# voices_legend = json.loads(completion1.choices[0].message.content)

for i, part in enumerate(formatted_script):
    speaker, text = part

    speech_file_path = Path("./results").parent / f"_{i}_.mp4"

    response = client.audio.speech.create(
        model="tts-1",
        voice=voices_legend[speaker],
        input=text
    )
    print(f"{speaker} says: {text}")

    response.stream_to_file(speech_file_path)


Narrator says: "A frail-looking young man with pale skin and dark circles under his eyes was sitting on a rusty bench across from the police station. He was cradling a cup of coffee in his hands — not the cheap synthetic type slum rats like him had access to, but the real deal. This cup of plant-based coffee, usually available only to higher rank citizens, had cost most of his savings. But on this particular day, Sunny decided to pamper himself. After all, his life was coming to an end. Enjoying the warmth of the luxurious drink, he raised the cup and savored the aroma. Then, tentatively, he took a small sip… and immediately grimaced."
Sunny says: "Ah! So bitter!"
Narrator says: "Giving the cup of coffee an intense look, Sunny sighed and forced himself to drink some more. Bitter or not, he was determined to get his money's worth — taste buds be damned."
Sunny says: "I should have bought a piece of real meat instead. Who knew actual coffee is so disgusting? Well. It's going to keep me a

In [29]:
results = os.listdir("./results")
targets = {}
for file in results:
    segments = file.split("_")

    while "" in segments:
        segments.remove("")

    targets[segments[0]] = file
    
for i in range(0, 100):

    try:
        print(targets[str(i)])

        playsound(f"results/{targets[str(i)]}")
    
    except Exception as E:
        print(f"Done at {i} because {E}")
        break
    

_0_.mp3
_1_.mp3
_2_.mp3
_3_.mp3
Done at 4 because '4'
