In [None]:
# This notebook reads the input manuscript from ./input/story.txt,
# loads the Piper voice model (.onnx) files from the ./PiperVoiceModels/ folder,
# and writes the generated audio files to ./output/.

# Install the necessary packages (run these once in the notebook kernel's environment):
# pip install crewai
# pip install gTTS
# pip install pydub
# pip install ffmpegio

# Import Necessary Libraries
# This cell imports all required libraries for the workflow.

# ffmpeg was downloaded as described in this answer: https://stackoverflow.com/a/74658329/23656140

# Piper TTS - https://github.com/rhasspy/piper
# Piper downloaded from https://github.com/rhasspy/piper/releases
# Voice models for Piper downloaded from https://rhasspy.github.io/piper-samples/
# Use SSML Tags for better control over the voice - https://cloud.google.com/text-to-speech/docs/ssml

import os
from crewai import Agent, Task, Crew
import subprocess
from crewai import LLM
from gtts import gTTS
from pydub import AudioSegment
import pathlib

import re
from IPython.display import display, Audio

# Point pydub at the ffmpeg binaries stored under the current user's
# AppData/Local/ffmpegio/ffmpeg-downloader/ffmpeg/bin directory.
ffmpeg_path = pathlib.Path.home().joinpath(
    'AppData', 'Local', 'ffmpegio', 'ffmpeg-downloader', 'ffmpeg', 'bin'
)
AudioSegment.converter = ffmpeg_path.joinpath('ffmpeg.exe')
AudioSegment.ffmpeg = ffmpeg_path.joinpath('ffmpeg.exe')
# NOTE(review): pydub may locate ffprobe through PATH rather than through this
# attribute -- confirm that this assignment actually takes effect.
AudioSegment.ffprobe = ffmpeg_path.joinpath('ffprobe.exe')

In [None]:


# Expose the address of the local Ollama server through the environment.
os.environ['CREWAI_API_URL'] = 'http://localhost:11434'

# Cell 2: Define the local LLM served by Ollama
# Every agent defined later in this notebook shares this single LLM handle.
# NOTE(review): confirm the unit of `timeout`; 180000 is a very long wait if
# it is interpreted as seconds.
llm = LLM(
    model="llama3.1:latest",
    base_url="http://localhost:11434",
    provider="ollama",
    temperature=0.3,
    timeout=180000,
)


In [None]:
# Define Utility Functions
# These functions handle manuscript splitting, TTS conversion, and audio merging.

def read_manuscript(file_path):
    """Return the full text of the manuscript stored at ``file_path``.

    The file is decoded as UTF-8. When the file does not exist, an error
    message is printed and ``None`` is returned instead of raising.
    """
    try:
        manuscript = open(file_path, 'r', encoding='utf-8')
    except FileNotFoundError:
        print(f"Error: Manuscript file '{file_path}' not found.")
        return None

    with manuscript:
        return manuscript.read()

def _split_oversized_section(section, max_length):
    """Break one stripped section into pieces of at most ``max_length`` characters.

    Cuts are made, in order of preference, after the last sentence-ending
    punctuation mark in the window, at the last whitespace character, or --
    when the window contains neither -- exactly at ``max_length``.
    """
    pieces = []
    remaining = section
    while len(remaining) > max_length:
        window = remaining[:max_length]
        sentence_ends = [m.end() for m in re.finditer(r'[.!?](?=\s)', window)]
        if sentence_ends:
            cut = sentence_ends[-1]
        else:
            whitespace = [m.start() for m in re.finditer(r'\s', window)]
            # `remaining` never starts with whitespace, so any hit is >= 1.
            cut = whitespace[-1] if whitespace else max_length
        pieces.append(remaining[:cut].strip())
        remaining = remaining[cut:].strip()
    if remaining:
        pieces.append(remaining)
    return pieces


def split_manuscript(text, max_length=5000):
    """Split the manuscript into chunks based on natural breaks (paragraphs, chapters).

    The text is cut at blank lines and in front of lines that start with a
    ``Chapter <number>`` heading. The heading text itself is kept, so it is
    still narrated, and a "Chapter N" mention inside a sentence no longer
    causes a split or gets removed. Consecutive sections are then packed
    greedily into chunks joined by a blank line.

    Args:
        text: The full manuscript text.
        max_length: Maximum number of characters per returned chunk. A single
            section longer than this is broken up at sentence or whitespace
            boundaries so that no chunk exceeds the limit.

    Returns:
        A list of non-empty chunk strings, in manuscript order. Empty or
        whitespace-only input yields an empty list.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")
    if not text:
        return []

    # The second alternative is a zero-width lookahead: it splits *before* a
    # heading at the start of a line without consuming the heading text.
    raw_sections = re.split(r'\n{2,}|^(?=Chapter \d+\b)', text, flags=re.MULTILINE)

    sections = []
    for raw in raw_sections:
        section = raw.strip()
        if section:
            sections.extend(_split_oversized_section(section, max_length))

    chunks = []
    current_chunk = ""
    for section in sections:
        candidate = f"{current_chunk}\n\n{section}" if current_chunk else section
        if len(candidate) <= max_length:
            current_chunk = candidate
        else:
            # Every section already fits on its own, so current_chunk is
            # guaranteed to be non-empty here.
            chunks.append(current_chunk)
            current_chunk = section

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

def text_to_speech(text, output_file, language='en', accent=None,
                   piper_binary=r"C:\piperTTS\piper.exe",
                   piper_model=r"./PiperVoiceModels/en_GB-cori-high.onnx"):
    """Convert text to speech using Piper TTS and save as an MP3 file.

    Piper writes a temporary WAV file next to ``output_file``; that file is
    converted to MP3 with pydub and then removed.

    Args:
        text: The text to narrate; it is piped to Piper's stdin as UTF-8.
        output_file: Destination path of the MP3 file. Its parent directory
            is created if it does not exist.
        language: Currently unused; kept for interface compatibility.
        accent: Currently unused; kept for interface compatibility.
        piper_binary: Path to the Piper executable.
        piper_model: Path to the Piper voice model (.onnx). The default is a
            British female voice; an American female alternative is
            "./PiperVoiceModels/en_US-hfc_female-medium.onnx".

    Returns:
        ``output_file`` on success, or ``None`` if Piper could not be started
        or exited with a non-zero status (an error message is printed).
    """
    # Parameters for speed and pitch
    length_scale = "1.2"  # Slow down by 20% (increase this value to slow down more)
    # NOTE(review): the original comment said "increase pitch by 20%", but 0.8
    # is below 1 -- confirm the intended direction and that the installed
    # Piper build accepts --f0_scale.
    f0_scale = "0.8"

    # Derive the temporary WAV path from the file stem. A plain
    # str.replace('.mp3', '.wav') would also rewrite '.mp3' inside directory
    # names and leave the path unchanged for any other extension.
    stem, _ = os.path.splitext(output_file)
    wav_file = stem + '.wav'
    if os.path.normcase(wav_file) == os.path.normcase(output_file):
        # Never let the temp file collide with (and later delete) the output.
        wav_file = output_file + '.tmp.wav'

    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    piper_cmd = [
        piper_binary,
        "--model", piper_model,
        "--length_scale", length_scale,     # adjust speed
        "--f0_scale", f0_scale,             # adjust pitch
        "--sentence_silence", "0.5",         # pause between sentences
        "--noise_scale", "0.0",              # randomness and expressiveness in the voice
        "--output_file", wav_file
    ]

    # Run Piper with the text piped in. The encoding is explicit because the
    # manuscript is read as UTF-8, while text=True would use the locale
    # encoding and could fail on non-ASCII characters.
    try:
        result = subprocess.run(
            piper_cmd,
            input=text,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            encoding="utf-8",
            errors="replace",
        )
    except OSError as e:
        # Raised when the binary is missing or not executable;
        # CalledProcessError is never raised without check=True.
        print(f"Error running Piper TTS: {e}")
        return None

    if result.returncode != 0:
        print(f"Error running Piper TTS: {result.stderr}")
        if os.path.exists(wav_file):
            os.remove(wav_file)  # do not leave a partial WAV behind
        return None

    # Convert WAV to MP3; always clean up the temporary WAV file, even if
    # the conversion raises.
    try:
        audio = AudioSegment.from_wav(wav_file)
        audio.export(output_file, format="mp3")
    finally:
        if os.path.exists(wav_file):
            os.remove(wav_file)

    return output_file

def merge_audio_files(audio_files, output_file):
    """Merge multiple audio files into a single audiobook file.

    The MP3 files are concatenated in the given order and exported to
    ``output_file`` as MP3. Paths that do not exist are reported and skipped.
    The source files are deleted only after the export has succeeded, so a
    failed export does not lose the per-section audio.

    Args:
        audio_files: Iterable of MP3 file paths to concatenate.
        output_file: Destination path of the merged MP3 file.

    Returns:
        ``output_file``.
    """
    combined = AudioSegment.empty()
    merged_sources = []
    for audio_file in audio_files:
        if not os.path.exists(audio_file):
            print(f"Error: The file {audio_file} does not exist.")
            continue
        print ("*** Merging audio file ", audio_file)
        combined += AudioSegment.from_mp3(audio_file)
        merged_sources.append(audio_file)

    combined.export(output_file, format="mp3")

    # Delete the section files only now that the merged file is on disk.
    output_path = os.path.abspath(output_file)
    for audio_file in merged_sources:
        # Never delete the freshly written output, and tolerate a path that
        # was listed more than once.
        if os.path.abspath(audio_file) != output_path and os.path.exists(audio_file):
            os.remove(audio_file)

    return output_file


In [None]:

# Cell 4: Define CrewAI Agents
# This cell defines the agents responsible for different tasks in the workflow.

# Agent 1 -- divides the manuscript into sections
splitter_agent = Agent(
    llm=llm,
    verbose=True,
    role='Manuscript Splitter',
    goal='Split the novel manuscript into manageable sections for narration',
    backstory='You are an expert editor skilled at breaking down large texts into coherent, manageable sections while preserving narrative flow.',
)

# Agent 2 -- narrates each section
narration_agent = Agent(
    llm=llm,
    verbose=True,
    role='Narration Agent',
    goal='Convert text sections into audio narration using text-to-speech',
    backstory='You are a voice actor with expertise in converting written text into engaging audio narration, ensuring clarity and emotional tone.',
)

# Agent 3 -- joins the narrated sections
combiner_agent = Agent(
    llm=llm,
    verbose=True,
    role='Audio Combiner',
    goal='Combine individual audio files into a final audiobook file',
    backstory='You are an audio engineer skilled at merging audio clips seamlessly to create a polished audiobook.',
)


In [None]:

# Cell 5: Define CrewAI Tasks
# This cell defines the tasks each agent will perform.

def create_split_task(manuscript_text):
    """Build the splitter agent's Task, embedding the whole manuscript in its description."""
    task_description = f'Split the following manuscript text into manageable sections for narration:\n\n{manuscript_text}'
    return Task(
        agent=splitter_agent,
        description=task_description,
        expected_output='A list of text sections, each no longer than 5000 characters, split at natural narrative breaks.',
    )

def create_narration_task(section, index):
    """Build the narration agent's Task for one text section.

    ``index`` is only used to name the expected audio file in the task's
    expected output.
    """
    task_description = f'Convert the following text section into an audio file using text-to-speech:\n\n{section}'
    expected = f'Audio file saved as section_{index}.mp3'
    return Task(
        agent=narration_agent,
        description=task_description,
        expected_output=expected,
    )

def create_combiner_task(audio_files):
    """Build the combiner agent's Task, listing ``audio_files`` in its description."""
    task_description = f'Merge the following audio files into a single audiobook file: {audio_files}'
    return Task(
        agent=combiner_agent,
        description=task_description,
        expected_output='A single audiobook file named audiobook.mp3',
    )


In [None]:

# Cell 6: Main Workflow Execution
# This cell executes the entire workflow, from reading the manuscript to creating the audiobook.

def main():
    """Run the whole workflow: read, split, narrate, merge, then run the crew.

    Reads ./input/story.txt, converts each section to ./output/section_<i>.mp3,
    merges the sections into ./output/audiobook.mp3, kicks off the CrewAI
    crew, and finally displays the audiobook in the notebook. Sections whose
    text-to-speech conversion fails are skipped; if no section could be
    converted, the function stops before merging.
    """
    # Step 1: Read the manuscript
    manuscript_path = "./input/story.txt"  # Replace with your manuscript file path
    manuscript_text = read_manuscript(manuscript_path)
    if not manuscript_text:
        return
    if not manuscript_text.strip():
        print(f"Error: Manuscript file '{manuscript_path}' is empty.")
        return

    # The audio files are written here; create the folder on a fresh checkout.
    os.makedirs("./output", exist_ok=True)

    # Step 2: Split the manuscript into sections
    print("Splitting manuscript into sections...")
    chunks = split_manuscript(manuscript_text)
    print(f"Manuscript split into {len(chunks)} sections.")

    # Step 3: Define the splitting task (the split itself was done programmatically above)
    split_task = create_split_task(manuscript_text)

    # Step 4: Convert each section to audio
    audio_files = []
    narration_tasks = []
    for i, chunk in enumerate(chunks):
        output_file = f"./output/section_{i}.mp3"
        print(f"Converting section {i+1} to audio...")
        # text_to_speech returns None on failure; do not queue a file that
        # was never written.
        if text_to_speech(chunk, output_file) is None:
            print(f"Warning: section {i+1} could not be converted and will be skipped.")
            continue
        audio_files.append(output_file)
        narration_tasks.append(create_narration_task(chunk, i))

    if not audio_files:
        print("Error: no audio sections were produced; audiobook not created.")
        return

    # Step 5: Merge audio files into a final audiobook
    print("Merging audio files into final audiobook...")
    final_audiobook = "./output/audiobook.mp3"
    merge_audio_files(audio_files, final_audiobook)

    # Step 6: Define combiner task
    combiner_task = create_combiner_task(audio_files)

    # Step 7: Create and run the Crew
    crew = Crew(
        agents=[splitter_agent, narration_agent, combiner_agent],
        tasks=[split_task] + narration_tasks + [combiner_task],
        verbose=False  # Set to True for detailed logging
    )

    print("Starting CrewAI workflow...")
    crew.kickoff()

    # Step 8: Display the final audiobook for playback in the notebook
    print(f"Audiobook created successfully: {final_audiobook}")
    display(Audio(final_audiobook))



In [None]:
# Cell 7: Run the Workflow
# Execute the main function to start the workflow.

# Start the workflow by calling main() when the guard below holds.
# NOTE(review): presumably __name__ is "__main__" inside the notebook kernel,
# so running this cell triggers the full pipeline -- confirm.
if __name__ == "__main__":
    main()