In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Walk the Kaggle input mount and echo the full path of every file beneath it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [None]:
!pip install -q kokoro>=0.9.2 soundfile

In [None]:
# Install the espeak-ng system package quietly (all apt output is discarded).
# NOTE(review): presumably needed by Kokoro's phonemizer backend — confirm.
!apt-get -qq -y install espeak-ng > /dev/null 2>&1

In [None]:
from kokoro import KPipeline
from IPython.display import display, Audio
import soundfile as sf
import torch

In [None]:
# Shared Kokoro pipeline, built once and reused by chapter_path_to_audio below.
# NOTE(review): lang_code='a' presumably selects American English, matching the
# `af_*` voices used later — confirm against the Kokoro documentation.
pipeline = KPipeline(lang_code='a')

In [None]:
!mkdir /kaggle/working/epubs

In [None]:
!cp /kaggle/input/ebooks-epubs/sample_01.epub /kaggle/working/epubs

In [None]:
# l=[]
# for dirname, _, filenames in os.walk('/kaggle/working/epubs'):
#     for filename in filenames:
#         name=os.path.join(dirname, filename)
#         l.append(name)
#         print("name = ", name)
#         print(os.path.join(dirname, filename))
# print(l)

In [None]:
!pip install ebooklib beautifulsoup4
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
import os

def extract_chapters_from_epub(epub_path, base_txt_dir="txt"):
    """
    Extract every document item of an EPUB into numbered plain-text files.

    Files are written to ``<base_txt_dir>/<EPUB name without extension>/`` as
    ``chapter_0001.txt``, ``chapter_0002.txt``, ... Progress and failures are
    reported with ``print``; an unreadable EPUB or an unwritable chapter file
    does not raise.

    Args:
        epub_path (str): Path to the EPUB file.
        base_txt_dir (str, optional): Root directory under which the per-book
            folder is created. Defaults to "txt".

    Returns:
        None
    """
    # The per-book folder is named after the EPUB file, minus its extension
    epub_name = os.path.splitext(os.path.basename(epub_path))[0]

    # The root folder is always created so later directory listings succeed
    os.makedirs(base_txt_dir, exist_ok=True)

    # Open the EPUB file
    try:
        book = epub.read_epub(epub_path)
    except Exception as e:
        print(f"Error reading EPUB file: {e}")
        return

    # Create the book folder only after a successful read, so a broken EPUB
    # does not leave behind an empty folder that looks like an extracted book
    output_dir = os.path.join(base_txt_dir, epub_name)
    os.makedirs(output_dir, exist_ok=True)

    # Chapters are numbered in the order ebooklib yields the document items.
    # NOTE: that order comes from get_items_of_type, not from book.spine, so it
    # may differ from the reading order for some EPUBs.
    documents = book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
    for chapter_count, item in enumerate(documents, start=1):
        # Decode leniently: one malformed byte should not abort the whole book
        content = item.get_content().decode('utf-8', errors='replace')

        # Parse the XHTML and keep only the visible text, one node per line
        soup = BeautifulSoup(content, 'html.parser')
        text = soup.get_text(separator='\n', strip=True)

        # Zero-padded names (chapter_0001.txt) keep lexical and chapter order aligned
        chapter_file = os.path.join(output_dir, f"chapter_{chapter_count:04d}.txt")

        # Write text to file; a failure is reported and the next chapter is tried
        try:
            with open(chapter_file, 'w', encoding='utf-8') as f:
                f.write(text)
            print(f"Saved: {chapter_file}")
        except Exception as e:
            print(f"Error writing {chapter_file}: {e}")

def main(epub_dir='/kaggle/working/epubs', base_txt_dir='txt'):
    """
    Extract chapters from every EPUB file found under ``epub_dir``.

    Args:
        epub_dir (str, optional): Folder searched recursively for ``.epub``
            files. Defaults to the notebook's working EPUB folder.
        base_txt_dir (str, optional): Root folder handed to
            ``extract_chapters_from_epub`` for the text output.

    Returns:
        None
    """
    epub_path_list = []
    for dirname, _, filenames in os.walk(epub_dir):
        for filename in filenames:
            # Non-EPUB files would only end in a read error, so skip them here
            if not filename.lower().endswith('.epub'):
                continue
            name = os.path.join(dirname, filename)
            epub_path_list.append(name)
            print("name = ", name)
    # os.walk order depends on the filesystem; sort for reproducible runs
    epub_path_list.sort()
    print(epub_path_list)
    for epub_path in epub_path_list:
        extract_chapters_from_epub(epub_path, base_txt_dir)

# Run the extraction only when this code executes as the top-level program;
# importing it as a module skips the call.
if __name__ == "__main__":
    main()

In [None]:
# !pwd
# !ls -lh

In [None]:
# !rm -r /kaggle/working/sample_01

In [None]:
# text = '''
# [Kokoro](/kˈOkəɹO/) is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, [Kokoro](/kˈOkəɹO/) can be deployed anywhere from production environments to personal projects.
# '''

In [None]:
# dir(os)

In [None]:
# Index the extracted text: one name per book folder under `base_txt_dir`,
# plus a flat, sorted list of every chapter file across all books.
base_txt_dir = 'txt'
epub_name_list = []
txt_chapter_path_list = []
# os.listdir order is arbitrary; sort so books are processed reproducibly
for epub_name in sorted(os.listdir(base_txt_dir)):
    book_dir = os.path.join(base_txt_dir, epub_name)
    # A stray file under the root is not a book; listing it would raise
    if not os.path.isdir(book_dir):
        continue
    epub_name_list.append(epub_name)
    for chapter in os.listdir(book_dir):
        # Only the extracted chapter text files should reach the synthesizer
        if chapter.endswith('.txt'):
            txt_chapter_path_list.append(os.path.join(book_dir, chapter))
# Sort once after collection; the zero-padded chapter names sort in chapter order
txt_chapter_path_list.sort()
print(txt_chapter_path_list)
print(epub_name_list)

In [None]:
def chapter_path_to_audio(chapter_txt, epub_name, base_audio_dir="AudioBook", voice='af_heart', sample_rate=24000):
    """
    Synthesize one chapter text file into a single WAV file.

    The text is passed to the module-level ``pipeline``; the audio chunks it
    yields are concatenated and written to
    ``<base_audio_dir>/<epub_name>/<chapter file stem>.wav``.

    Args:
        chapter_txt (str): Path to the UTF-8 chapter text file.
        epub_name (str): Book name, used as the sub-folder for the audio.
        base_audio_dir (str, optional): Root output directory.
            Defaults to "AudioBook".
        voice (str, optional): Voice name forwarded to the pipeline.
        sample_rate (int, optional): Sample rate passed to ``sf.write``.
            Defaults to 24000; it must match the rate of the generated audio.

    Returns:
        str | None: Path of the WAV file written, or None when the chapter
        produced no audio (for example an empty text file) and was skipped.
    """
    # Context manager so the file handle is closed deterministically
    with open(chapter_txt, "r", encoding="utf-8") as f:
        text = f.read()

    # Honour the caller's base_audio_dir (it used to be silently overwritten)
    os.makedirs(base_audio_dir, exist_ok=True)
    print(f"Created : {base_audio_dir}")

    audio_path = os.path.join(base_audio_dir, epub_name)
    os.makedirs(audio_path, exist_ok=True)
    print(f"Created : {audio_path}")

    # splitext keeps stems that contain dots intact ("a.b.txt" -> "a.b")
    chapter = os.path.splitext(os.path.basename(chapter_txt))[0]
    chapter_path = os.path.join(audio_path, f'{chapter}.wav')

    # Each pipeline result unpacks to (graphemes, phonemes, audio); keep the audio
    chunks = [audio for _, _, audio in pipeline(text, voice=voice) if audio is not None]

    # np.concatenate raises on an empty list, which is what an empty chapter
    # (e.g. an image-only page) produces; skip it instead of aborting the run
    if not chunks:
        print(f"Skipped '{chapter_txt}': no audio was generated")
        return None

    # Concatenate all audio arrays into one and save as a single WAV file
    combined_audio = np.concatenate(chunks)
    sf.write(chapter_path, combined_audio, sample_rate)
    print(f"Saved combined audio to '{chapter_path}'")
    return chapter_path

In [None]:
# Synthesize every chapter of every book with the default voice.
for epub_name in epub_name_list:
    for chapter_txt in txt_chapter_path_list:
        # txt_chapter_path_list is flat across all books, so keep only the
        # chapters stored in this book's own folder; otherwise each book's
        # audio folder would also receive every other book's chapters.
        if os.path.basename(os.path.dirname(chapter_txt)) != epub_name:
            continue
        print(chapter_txt)
        chapter_path_to_audio(chapter_txt,epub_name)

In [None]:
!zip -r sample_01_af_heart.zip /kaggle/working/AudioBook/sample_01

In [None]:
# Remove the first voice's output so the next synthesis run starts from an
# empty folder; the following cells reuse this same directory and file names.
!rm -r /kaggle/working/AudioBook/sample_01

In [None]:
# Synthesize every chapter of every book again, this time with the af_aoede voice.
for epub_name in epub_name_list:
    for chapter_txt in txt_chapter_path_list:
        # txt_chapter_path_list is flat across all books, so keep only the
        # chapters stored in this book's own folder; otherwise each book's
        # audio folder would also receive every other book's chapters.
        if os.path.basename(os.path.dirname(chapter_txt)) != epub_name:
            continue
        print(chapter_txt)
        chapter_path_to_audio(chapter_txt,epub_name,voice='af_aoede')

In [None]:
!zip -r sample_01_af_aoede.zip /kaggle/working/AudioBook/sample_01

In [None]:
# text = open("/kaggle/working/txt/sample_01/chapter_0001.txt", "r", encoding="utf-8").read()

In [None]:
# generator = pipeline(text, voice='af_heart')

In [None]:
# combined_audio = []
# sample_rate = 24000

# filename = "chapter_0001"

# for i, (gs, ps, audio) in enumerate(generator):
#     # print(i, gs, ps)
#     # display(Audio(data=audio, rate=24000, autoplay=0))
#     combined_audio.append(audio)  # Collect audio arrays

# # Concatenate all audio arrays into one
# combined_audio = np.concatenate(combined_audio)

# # Save as a single WAV file
# sf.write(f'{filename}.wav', combined_audio, sample_rate)
# print(f"Saved combined audio as '{filename}.wav'")
# display(Audio(data=combined_audio, rate=24000, autoplay=0))

In [None]:
# !rm 1.wav 0.wav 2.wav 3.wav 4.wav

In [None]:
# display(Audio(data=combined_audio, rate=24000, autoplay=0))

In [None]:
# import kokoro as kkr
# help(kkr)
# dir(kkr)
# help(kkr.model)