<a href="https://colab.research.google.com/github/SVG-campus/77/blob/master/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages this notebook imports below:
# TTS provides the speech-synthesis API (TTS.api.TTS) and pydub provides
# AudioSegment for converting and joining audio files. -q keeps pip quiet.
!pip install -q TTS
!pip install -q pydub

import os
import re
import shutil
import glob
import numpy as np
from TTS.api import TTS
from pydub import AudioSegment
import torch
from IPython.display import Audio

def _read_text_with_fallback(path):
    """Read a text file as UTF-8, falling back to cp1252 and then latin-1."""
    # utf-8-sig decodes plain UTF-8 and also drops a leading byte-order mark.
    # latin-1 is tried last because it accepts every byte sequence, so any
    # encoding listed after it could never be reached.
    for encoding in ('utf-8-sig', 'cp1252'):
        try:
            with open(path, 'r', encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError:
            continue
    with open(path, 'r', encoding='latin-1') as f:
        return f.read()


def _split_into_chunks(text, chunk_size):
    """Group the sentences of text into chunks of at most chunk_size characters."""
    chunks = []
    current = []
    current_len = 0  # length of " ".join(current)

    # Split after sentence-ending punctuation followed by whitespace
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if not sentence:
            continue
        # One extra character for the space that joins it to the chunk
        added = len(sentence) + (1 if current else 0)
        if current and current_len + added > chunk_size:
            chunks.append(" ".join(current))
            current = []
            current_len = 0
            added = len(sentence)
        # A single sentence longer than chunk_size still becomes its own chunk
        current.append(sentence)
        current_len += added

    if current:
        chunks.append(" ".join(current))
    return chunks


def text_to_audiobook_coqui(input_file, output_dir, chunk_size=1500, model_name="tts_models/en/ljspeech/tacotron2-DDC"):
    """
    Convert a text file into an MP3 audiobook using Coqui TTS

    The text is split into sentence-aligned chunks, each chunk is synthesized
    to a WAV file and converted to MP3 inside a temporary 'temp_audio'
    directory, and the MP3 chunks are joined with combine_audio_files.
    After every 10 successfully generated chunks an interim
    "<name>_part_<k>.mp3" file is written to output_dir.

    Parameters:
    input_file (str): Path to the text file
    output_dir (str): Directory path for the output audio file
    chunk_size (int): Maximum number of characters to synthesize at once
    model_name (str): Name of the TTS model to use

    Returns:
    str: Path of the final MP3 (output_dir/<input name>.mp3). The path is
         returned even if no chunk could be synthesized, so callers should
         check that the file exists.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Derive the output name from the input file; splitext only removes the
    # final extension, so dots inside the name are preserved
    filename = os.path.splitext(os.path.basename(input_file))[0]
    output_file = os.path.join(output_dir, f"{filename}.mp3")

    # Start from an empty temp directory
    temp_dir = 'temp_audio'
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir, ignore_errors=True)
    os.makedirs(temp_dir, exist_ok=True)

    print("Reading text file...")
    text = _read_text_with_fallback(input_file)

    # Clean the text (collapse all runs of whitespace into single spaces)
    text = re.sub(r'\s+', ' ', text).strip()

    print("Loading TTS model...")

    # Initialize TTS with the specified model and run it on the GPU when
    # one is available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tts = TTS(model_name=model_name).to(device)

    # Tell the user which model is in use and how to change it
    print("\nAvailable TTS models:")
    print(f"Currently using: {model_name}")
    print("You can change the model by modifying the model_name parameter")

    # Informational only: these names are printed, not loaded
    female_models = [
        "tts_models/en/ljspeech/tacotron2-DDC",          # LJSpeech - Female voice
        "tts_models/en/ljspeech/glow-tts",               # Another female voice model
        "tts_models/en/vctk/vits",                       # Multi-speaker model with female options
    ]
    print("\nRecommended female voice models:")
    for model in female_models:
        print(f" - {model}")

    print("Processing text into chunks...")
    chunks = _split_into_chunks(text, chunk_size)

    print(f"Converting {len(chunks)} text chunks to speech...")

    # Convert each chunk to speech
    audio_files = []
    unsaved_start = 0   # index in audio_files of the first chunk not yet in an interim part
    part_number = 0
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i+1}/{len(chunks)}...")
        chunk_file = os.path.join(temp_dir, f"chunk_{i}.wav")

        try:
            # Generate speech
            tts.tts_to_file(text=chunk, file_path=chunk_file)

            # Convert wav to mp3 and add to list
            chunk_mp3_file = os.path.join(temp_dir, f"chunk_{i}.mp3")
            audio = AudioSegment.from_wav(chunk_file)
            audio.export(chunk_mp3_file, format="mp3")
            audio_files.append(chunk_mp3_file)

            # Save progress every 10 successful chunks. Tracking the first
            # unsaved index (rather than slicing the last 10 at i % 10 == 0)
            # ensures every chunk lands in exactly one part, including
            # chunk 0 and chunks that follow a failed one.
            if len(audio_files) - unsaved_start >= 10:
                part_number += 1
                print(f"Saving progress up to chunk {i+1}...")
                interim_file = os.path.join(output_dir, f"{filename}_part_{part_number}.mp3")
                combine_audio_files(audio_files[unsaved_start:], interim_file)
                unsaved_start = len(audio_files)

        except Exception as e:
            # Best effort: one bad chunk should not abort the whole book
            print(f"Error processing chunk {i+1}: {str(e)}")
            print("Continuing with next chunk...")
            continue

    # Combine all audio files into final output
    if audio_files:
        print("Combining all audio chunks...")
        combine_audio_files(audio_files, output_file)
        print(f"Audiobook creation complete! File saved to {output_file}")
    else:
        print("No audio files were created successfully.")

    # Clean up the temp directory; failure here is only worth a warning
    shutil.rmtree(temp_dir, ignore_errors=True)
    if os.path.exists(temp_dir):
        print(f"Warning: Could not remove temp directory. You may need to manually delete {temp_dir}")

    return output_file

def combine_audio_files(audio_files, output_file):
    """
    Combine multiple MP3 files, in the given order, into one MP3 file

    Parameters:
    audio_files (list of str): Paths of the MP3 files to join. Files that are
        missing or cannot be decoded are reported and skipped.
    output_file (str): Path of the MP3 file to write

    Returns:
    str or None: output_file if at least one input was combined, otherwise None
    """
    if not audio_files:
        print("No audio files to combine.")
        return None

    combined = AudioSegment.empty()
    processed_files = 0

    for audio_file in audio_files:
        if not os.path.exists(audio_file):
            print(f"Skipping missing file: {audio_file}")
            continue
        try:
            combined += AudioSegment.from_mp3(audio_file)
            processed_files += 1
        except Exception as e:
            # Best effort: keep going so one unreadable file does not lose the rest
            print(f"Error processing {audio_file}: {str(e)}")

    print(f"Successfully combined {processed_files} audio files out of {len(audio_files)}.")

    if processed_files == 0:
        print("No files were successfully processed.")
        return None

    print(f"Saving audio to {output_file}...")
    # dirname is '' for a bare file name, and os.makedirs('') raises, so only
    # create the directory when the path actually has one
    target_dir = os.path.dirname(output_file)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    combined.export(output_file, format="mp3")
    # len() of an AudioSegment is its duration in milliseconds
    print(f"Total duration: {len(combined)/1000/60:.2f} minutes")
    return output_file

def create_preview(tts, text, output_file):
    """
    Synthesize a short preview sample from the start of a text

    Parameters:
    tts: Object exposing tts_to_file(text=..., file_path=...)
    text (str): Source text; only its first 500 characters (stripped) are used
    output_file (str): Path of the audio file to write

    Returns:
    bool: True if synthesis completed, False if any error was raised
    """
    try:
        snippet = text[:500]
        snippet = snippet.strip()
        tts.tts_to_file(text=snippet, file_path=output_file)
    except Exception as err:
        print(f"Error creating preview: {str(err)}")
        return False
    else:
        return True

# Specific paths for Frankenstein
input_text_file = '/content/Books/Frankenstein/84-0.txt'
output_dir = '/content/Audiobooks/Frankenstein'

# Check for CUDA/GPU availability for faster processing; the preview models
# below are moved to this device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Read a sample of text for preview. utf-8-sig also drops a leading
# byte-order mark; only a decoding failure triggers the latin-1 fallback, so
# other errors (e.g. a missing file) surface immediately.
try:
    with open(input_text_file, 'r', encoding='utf-8-sig') as f:
        sample_text = f.read(1000)
except UnicodeDecodeError:
    with open(input_text_file, 'r', encoding='latin-1') as f:
        sample_text = f.read(1000)

# Collapse line breaks and repeated spaces the same way the full audiobook
# pipeline does, so the previews are synthesized from equivalent input
sample_text = re.sub(r'\s+', ' ', sample_text).strip()

# Create a preview with different models to let user choose
print("\nCreating previews with different female voice models...")

# Models to try (all female voices)
models_to_try = [
    "tts_models/en/ljspeech/tacotron2-DDC",
    "tts_models/en/ljspeech/glow-tts"
]

for i, model in enumerate(models_to_try):
    preview_file = f"/content/preview_model_{i+1}.wav"
    try:
        print(f"\nTrying model: {model}")
        tts = TTS(model_name=model, progress_bar=True).to(device)
        # Only the first 200 characters are spoken to keep previews short
        tts.tts_to_file(text=sample_text[:200], file_path=preview_file)
        print(f"Preview {i+1} created with {model}")
        display(Audio(preview_file, autoplay=False))
    except Exception as e:
        # A model that fails to download or synthesize should not stop the
        # remaining previews
        print(f"Error with model {model}: {str(e)}")

print("\nPlease run this cell, listen to the previews, and choose your preferred voice model.")
print("Then run the cell below with your chosen model to generate the full audiobook.")

# Print (not run) the code that generates the full audiobook with the chosen model
print("""
# Generate full audiobook with your preferred model
# Replace the model_name with your choice

selected_model = "tts_models/en/ljspeech/tacotron2-DDC"  # Change this to your preferred model

output_file = text_to_audiobook_coqui(
    input_file='/content/Books/Frankenstein/84-0.txt',
    output_dir='/content/Audiobooks/Frankenstein',
    chunk_size=1500,  # Smaller chunks for better quality
    model_name=selected_model
)

if output_file and os.path.exists(output_file):
    print(f"Preview of final audiobook:")
    display(Audio(output_file, autoplay=False))
""")

Using device: cpu

Creating previews with different female voice models...

Trying model: tts_models/en/ljspeech/tacotron2-DDC
 > Downloading model to /root/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC


 99%|█████████▉| 112M/113M [00:01<00:00, 58.8MiB/s]

 > Model's license - apache 2.0
 > Check https://choosealicense.com/licenses/apache-2.0/ for more info.
 > Downloading model to /root/.local/share/tts/vocoder_models--en--ljspeech--hifigan_v2



100%|██████████| 113M/113M [00:02<00:00, 39.0MiB/s]

 59%|█████▊    | 2.23M/3.80M [00:00<00:00, 22.3MiB/s][A

 > Model's license - apache 2.0
 > Check https://choosealicense.com/licenses/apache-2.0/ for more info.
 > Using model: Tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024


  return torch.load(f, map_location=map_location, **kwargs)


 > Model's reduction rate `r` is set to: 1
 > Vocoder Model: hifigan
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Generator Model: hifigan_generator
 > Discriminator Model: hifigan_discriminator
Removing weight norm...
 > Text splitted to sentences.
['*** START OF THE PROJECT GUTENBE


100%|██████████| 3.80M/3.80M [00:17<00:00, 22.3MiB/s][A

   > Decoder stopped with `max_decoder_steps` 10000


In [1]:
import os
import re
import csv
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Create the root directory that receives one sub-folder per downloaded book;
# exist_ok=True makes re-running this cell safe
os.makedirs('/content/Books', exist_ok=True)

# Function to sanitize filenames (remove characters not allowed in directory names)
def sanitize_filename(name):
    """Return name with the characters \\ / * ? : " < > | removed."""
    forbidden = str.maketrans('', '', '\\/*?:"<>|')
    return name.translate(forbidden)

# Function to extract book ID from URL
def extract_book_id(url):
    """
    Return the numeric book ID found in a URL, as a string

    An "/epub/<id>/" path segment is preferred; otherwise the digits after
    "ebooks/" are used. Returns None when neither form is present.
    """
    for pattern in (r'/epub/(\d+)/', r'ebooks/(\d+)'):
        found = re.search(pattern, url)
        if found is not None:
            return found.group(1)
    return None

# Function to download file
def download_file(url, destination, timeout=30):
    """
    Download a URL and save the response body to a file

    Parameters:
    url (str): Address to fetch
    destination (str): Path of the file to write (opened in binary mode)
    timeout (float): Seconds to wait for the server before giving up;
        without a timeout requests.get can block indefinitely

    Returns:
    bool: True if the server answered 200 and the file was written,
          False for any other status code

    Network errors, including timeouts, propagate as exceptions from requests.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return False
    with open(destination, 'wb') as f:
        f.write(response.content)
    return True

# Base URL
base_url = "https://www.gutenberg.org"

# URL of the top 100 page
top_100_url = f"{base_url}/browse/scores/top"

# Seconds to wait for any single HTTP request; without a timeout a stalled
# connection would hang the cell indefinitely
REQUEST_TIMEOUT = 30

# List to store all the book data
books_data = []

# Get the top 100 page
print("Accessing Top 100 page...")
response = requests.get(top_100_url, timeout=REQUEST_TIMEOUT)
# Stop on an HTTP error instead of parsing an error page
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Find the ordered list that follows the "Top 100 EBooks last 30 days" heading
top_books_section = None
for h2 in soup.find_all('h2'):
    if "Top 100 EBooks last 30 days" in h2.text:
        top_books_section = h2.find_next('ol')
        break

if top_books_section is None:
    print("Could not find the Top 100 books section.")
    # SystemExit stops the cell; the interactive exit() helper is not
    # dependable inside a notebook kernel
    raise SystemExit(1)

# Extract all book links from this section
book_links = []
for li in top_books_section.find_all('li'):
    link = li.find('a')
    if link and link.get('href'):
        book_links.append(urljoin(base_url, link.get('href')))

# Process each book; any failure skips that book and moves on to the next
for i, book_url in enumerate(book_links, 1):
    try:
        print(f"Processing book {i} of {len(book_links)}: {book_url}")

        # Get the book page
        response = requests.get(book_url, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            print(f"Failed to access {book_url}")
            continue

        book_soup = BeautifulSoup(response.text, 'html.parser')

        # Extract the title
        title_element = book_soup.find('h1', {'itemprop': 'name'})
        if not title_element:
            print(f"Could not find title for book at {book_url}")
            continue

        title = title_element.text.strip()
        print(f"Found book: {title}")

        # Look for the Plain Text UTF-8 link
        plain_text_link = None
        for link in book_soup.find_all('a', {'type': 'text/plain; charset=us-ascii', 'class': 'link'}):
            if "Plain Text UTF-8" in link.text:
                plain_text_link = urljoin(base_url, link.get('href'))
                break

        if not plain_text_link:
            print(f"Could not find Plain Text UTF-8 link for {title}")
            continue

        # Extract book ID
        book_id = extract_book_id(plain_text_link)
        if not book_id:
            print(f"Could not extract book ID from {plain_text_link}")
            continue

        # Create sanitized folder name
        folder_name = sanitize_filename(title)
        folder_path = os.path.join('/content/Books', folder_name)
        os.makedirs(folder_path, exist_ok=True)

        # Fetch the text itself; redirects are followed so that the final
        # URL reveals the real file name
        response = requests.get(plain_text_link, allow_redirects=True, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            print(f"Failed to access Plain Text UTF-8 file for {title}")
            continue

        # Get the final URL after redirects
        final_url = response.url

        # Extract the filename from the final URL
        filename = os.path.basename(final_url)

        # If the filename doesn't end with .txt, add it
        if not filename.endswith('.txt'):
            filename = f"{filename}.txt"

        # Store the actual text content
        destination = os.path.join(folder_path, filename)
        with open(destination, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded text to {destination}")

        # Store the book information
        books_data.append({
            'Title': title,
            'Number': book_id,
            'Filename': filename
        })

        # Add a small delay to avoid hammering the server
        time.sleep(1)

    except Exception as e:
        print(f"Error processing {book_url}: {str(e)}")

# Write the CSV file with one row per successfully downloaded book
csv_path = '/content/gutenberg_top100.csv'
with open(csv_path, 'w', newline='', encoding='utf-8') as csv_file:
    fieldnames = ['Title', 'Number', 'Filename']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerows(books_data)

print(f"CSV file created at {csv_path}")
print(f"Downloaded {len(books_data)} out of {len(book_links)} books")

Accessing Top 100 page...
Processing book 1 of 100: https://www.gutenberg.org/ebooks/26184
Found book: Simple Sabotage Field Manual by United States. Office of Strategic Services
Downloaded text to /content/Books/Simple Sabotage Field Manual by United States. Office of Strategic Services/pg26184.txt
Processing book 2 of 100: https://www.gutenberg.org/ebooks/84
Found book: Frankenstein; Or, The Modern Prometheus by Mary Wollstonecraft Shelley
Downloaded text to /content/Books/Frankenstein; Or, The Modern Prometheus by Mary Wollstonecraft Shelley/pg84.txt
Processing book 3 of 100: https://www.gutenberg.org/ebooks/2701
Found book: Moby Dick; Or, The Whale by Herman Melville
Downloaded text to /content/Books/Moby Dick; Or, The Whale by Herman Melville/pg2701.txt
Processing book 4 of 100: https://www.gutenberg.org/ebooks/1513
Found book: Romeo and Juliet by William Shakespeare
Downloaded text to /content/Books/Romeo and Juliet by William Shakespeare/pg1513.txt
Processing book 5 of 100: http

In [2]:
import os
import re
import csv
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Create the root directory that receives one sub-folder per downloaded book;
# exist_ok=True makes re-running this cell safe
os.makedirs('/content/Other Books', exist_ok=True)

# Function to sanitize filenames (remove characters not allowed in directory names)
def sanitize_filename(name):
    """Return name with the characters \\ / * ? : " < > | removed."""
    forbidden = '\\/*?:"<>|'
    kept = [ch for ch in name if ch not in forbidden]
    return "".join(kept)

# Function to extract book ID from URL
def extract_book_id(url):
    """Return the digits that follow "/ebooks/" in url as a string, or None if absent."""
    found = re.search(r'/ebooks/(\d+)', url)
    return found.group(1) if found else None

# Base URL
base_url = "https://www.gutenberg.org"

# URL of the last 30 days page
recent_books_url = f"{base_url}/browse/recent/last30"

# Seconds to wait for any single HTTP request; without a timeout a stalled
# connection would hang the cell indefinitely
REQUEST_TIMEOUT = 30

# List to store all the book data
books_data = []

# Get the recent books page
print("Accessing Recent Books page...")
response = requests.get(recent_books_url, timeout=REQUEST_TIMEOUT)
# Stop on an HTTP error instead of parsing an error page
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Find all links that start with /ebooks/ followed by a number
book_links = []
seen_ids = set()  # a book linked more than once is only downloaded once
for link in soup.find_all('a', href=re.compile(r'^/ebooks/\d+')):
    book_url = urljoin(base_url, link.get('href'))
    # Extract the title from the link text
    title = link.text.strip()
    # Extract the book ID from the URL
    book_id = extract_book_id(book_url)

    if book_id and title and book_id not in seen_ids:
        seen_ids.add(book_id)
        book_links.append({
            'url': book_url,
            'title': title,
            'id': book_id
        })

print(f"Found {len(book_links)} book links")

# Process each book; any failure skips that book and moves on to the next
for i, book in enumerate(book_links, 1):
    try:
        print(f"Processing book {i} of {len(book_links)}: {book['title']} (ID: {book['id']})")

        # Get the book page
        response = requests.get(book['url'], timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            print(f"Failed to access {book['url']}")
            continue

        book_soup = BeautifulSoup(response.text, 'html.parser')

        # Look for the Plain Text UTF-8 link
        plain_text_link = None
        for link in book_soup.find_all('a', {'type': 'text/plain; charset=us-ascii', 'class': 'link'}):
            if "Plain Text UTF-8" in link.text:
                plain_text_link = urljoin(base_url, link.get('href'))
                break

        if not plain_text_link:
            print(f"Could not find Plain Text UTF-8 link for {book['title']}")
            continue

        # Fetch the text itself; redirects are followed so that the final
        # URL reveals the real file name
        response = requests.get(plain_text_link, allow_redirects=True, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            print(f"Failed to access Plain Text UTF-8 file for {book['title']}")
            continue

        # Get the final URL after redirects
        final_url = response.url

        # Extract the filename from the final URL
        filename = os.path.basename(final_url)

        # If the filename doesn't end with .txt, add it
        if not filename.endswith('.txt'):
            filename = f"{filename}.txt"

        # Create sanitized folder name
        folder_name = sanitize_filename(book['title'])
        folder_path = os.path.join('/content/Other Books', folder_name)
        os.makedirs(folder_path, exist_ok=True)

        # Store the actual text content
        destination = os.path.join(folder_path, filename)
        with open(destination, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded text to {destination}")

        # Store the book information
        books_data.append({
            'Title': book['title'],
            'Number': book['id'],
            'Filename': filename
        })

        # Add a small delay to avoid hammering the server
        time.sleep(1)

    except Exception as e:
        print(f"Error processing {book['url']}: {str(e)}")

# Write the CSV file with one row per successfully downloaded book
csv_path = '/content/gutenberg_recent_books.csv'
with open(csv_path, 'w', newline='', encoding='utf-8') as csv_file:
    fieldnames = ['Title', 'Number', 'Filename']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerows(books_data)

print(f"CSV file created at {csv_path}")
print(f"Downloaded {len(books_data)} out of {len(book_links)} books")

Accessing Recent Books page...
Found 562 book links
Processing book 1 of 562: Jungen :  Vierzehn Geschichten von kleinen ganzen Kerlen (ID: 75541)
Downloaded text to /content/Other Books/Jungen   Vierzehn Geschichten von kleinen ganzen Kerlen/pg75541.txt
Processing book 2 of 562: An illustrated commentary on the Gospel according to St. John (ID: 75543)
Downloaded text to /content/Other Books/An illustrated commentary on the Gospel according to St. John/pg75543.txt
Processing book 3 of 562: Das Cistercienserstift Heiligenkreuz in Niederösterreich (ID: 75371)
Downloaded text to /content/Other Books/Das Cistercienserstift Heiligenkreuz in Niederösterreich/pg75371.txt
Processing book 4 of 562: Le Père Lebonnard :  Comédie en 4 actes, en vers, reprise à la Comédie Française le 4 août 1904 (ID: 75459)
Downloaded text to /content/Other Books/Le Père Lebonnard   Comédie en 4 actes, en vers, reprise à la Comédie Française le 4 août 1904/pg75459.txt
Processing book 5 of 562: Jessie :  or, trying