In [9]:
# Cell 1: Environment Setup
!sudo apt-get update -q
!sudo apt-get install -q python3.10 python3.10-venv python3.10-distutils
!python3.10 -m venv /content/myenv
!/content/myenv/bin/pip install --upgrade pip
!/content/myenv/bin/pip install --no-cache-dir TTS numpy torch scipy pydub pdfplumber -q  # Added pdfplumber

Hit:1 https://cli.github.com/packages stable InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:5 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:6 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:7 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Hit:8 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:9 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:10 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:11 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Reading package lists...
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lis

In [10]:
!/content/myenv/bin/python3 --version

Python 3.10.12


In [11]:
# Cell 2: Verify Python version and TTS installation
# Runs a one-liner under the virtual environment's interpreter (not the notebook
# kernel). The interpreter version is printed before TTS is imported, so a failed
# import still shows which Python was used.
!/content/myenv/bin/python -c "import sys; print('Python version:', sys.version); import TTS; print('TTS package imported successfully!')"

Python version: 3.10.12 (main, Aug 15 2025, 14:32:43) [GCC 11.4.0]
TTS package imported successfully!


In [12]:
# Mount Google Drive. This has to happen before the script cell runs, because
# the finished MP3 is exported to a path under /content/drive.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
# Cell 3: Upload PDF and voice sample in default environment
from google.colab import files
import os

print("Upload your 'Life 3.0' PDF and your voice sample (.mp3 or .wav recommended):")
uploaded = files.upload()


def _first_upload_with_suffix(suffixes):
    """Return the first uploaded file name that ends with `suffixes`, else None.

    `suffixes` is a string or a tuple of strings, exactly as accepted by
    str.endswith; matching is case-sensitive.
    """
    for filename in uploaded:
        if filename.endswith(suffixes):
            return filename
    return None


# Pick one PDF and one audio file out of everything that was uploaded
pdf_path = _first_upload_with_suffix('.pdf')
voice_path = _first_upload_with_suffix(('.wav', '.mp3'))

# Stop here if either input is missing; the later cells need both names
if pdf_path is None:
    raise Exception("No PDF file uploaded. Please upload a .pdf file.")
if voice_path is None:
    raise Exception("No audio file (.wav or .mp3) uploaded. Please upload a voice sample.")

print(f"Uploaded PDF: {pdf_path}")
print(f"Uploaded voice sample: {voice_path}")

Upload your 'Life 3.0' PDF and your voice sample (.mp3 or .wav recommended):


Saving myvoice.mp3 to myvoice (6).mp3
Saving max-tegmark-life-30-being-human-in-the-age-of-artificial-intelligence-alfred-a-knopf-2017-aTvn.pdf to max-tegmark-life-30-being-human-in-the-age-of-artificial-intelligence-alfred-a-knopf-2017-aTvn (6).pdf
Uploaded PDF: max-tegmark-life-30-being-human-in-the-age-of-artificial-intelligence-alfred-a-knopf-2017-aTvn (6).pdf
Uploaded voice sample: myvoice (6).mp3


In [14]:
# Cell 4: Save and run script with fixed regex
import os

# Set environment variables for file paths
os.environ['PDF_PATH'] = pdf_path
os.environ['VOICE_PATH'] = voice_path

with open('/content/script.py', 'w') as f:
    f.write('''
import os
import time
import torch
import torchaudio
from pydub import AudioSegment
from TTS.api import TTS
import pdfplumber
from scipy.io import wavfile
import numpy as np
from librosa import resample
import re

# Step 1: Get file paths from environment variables
pdf_path = os.environ.get('PDF_PATH')
voice_path = os.environ.get('VOICE_PATH')
if not os.path.exists(pdf_path):
    raise Exception(f"PDF file {pdf_path} does not exist.")
if not os.path.exists(voice_path):
    raise Exception(f"Voice file {voice_path} does not exist.")
print(f"'{voice_path}' and '{pdf_path}' exist and are ready for use.")

# Step 2: Extract and Clean Text from Entire PDF
def extract_text(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\\n"
    # Clean text: keep letters, spaces, and basic punctuation, remove numbers and special characters
    text = re.sub(r'[^a-zA-Z\\s.,!?\\\'"-]', '', text)
    return text

book_text = extract_text(pdf_path)
print(f"Extracted {len(book_text)} characters from the entire book.")

# Step 3: Set up YourTTS and Load Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False)
tts.to(device)  # Move model to GPU if available
print("YourTTS model loaded.")

# Step 4: Prepare Voice Sample
# Convert MP3 to WAV (mono) if necessary
if voice_path.endswith('.mp3'):
    audio = AudioSegment.from_mp3(voice_path)
    audio = audio.set_channels(1)  # Convert to mono
    voice_path_wav = voice_path.replace('.mp3', '.wav')
    audio.export(voice_path_wav, format='wav')
    voice_path = voice_path_wav

# Read WAV file and convert to floating-point
samplerate, data = wavfile.read(voice_path)
if data.dtype != np.float32:  # Convert to float32 if not already
    data = data.astype(np.float32) / np.iinfo(data.dtype).max  # Normalize to [-1.0, 1.0]

# Resample to 22050 Hz if necessary
if samplerate != 22050:  # YourTTS default sample rate
    data = resample(data, orig_sr=samplerate, target_sr=22050)
    wavfile.write("resampled_voice.wav", 22050, (data * 32767).astype(np.int16))  # Convert back to int16 for WAV
    voice_path = "resampled_voice.wav"

# Step 5: Generate Cloned Speech for Text in Chunks
output_dir = "/content/life_30_chunks/"
os.makedirs(output_dir, exist_ok=True)
chunk_size = 2000  # Reduced chunk size for stability
chunks = [book_text[i:i+chunk_size] for i in range(0, len(book_text), chunk_size)]
output_files = []

print(f"Generating audiobook in {len(chunks)} chunks...")
start_time = time.time()
for i, chunk in enumerate(chunks):
    output_path = f"{output_dir}/chunk_{i}.wav"
    tts.tts_to_file(text=chunk, file_path=output_path, speaker_wav=voice_path, language='en')
    output_files.append(output_path)
    print(f"Generated chunk {i+1}/{len(chunks)}")

# Combine chunks
combined = AudioSegment.empty()
for output_path in output_files:
    combined += AudioSegment.from_wav(output_path)
mp3_path = "/content/drive/My Drive/life_30_yourtts_cloned.mp3"
combined.export(mp3_path, format="mp3")
print(f"Combined audiobook saved to: {mp3_path} in {time.time() - start_time} seconds")
''')

# Run the script using the virtual environment's Python
!/content/myenv/bin/python /content/script.py

'myvoice (6).mp3' and 'max-tegmark-life-30-being-human-in-the-age-of-artificial-intelligence-alfred-a-knopf-2017-aTvn (6).pdf' exist and are ready for use.
Extracted 724171 characters from the entire book.
 > Downloading model to /root/.local/share/tts/tts_models--multilingual--multi-dataset--your_tts
 > Model's license - CC BY-NC-ND 4.0
 > Check https://creativecommons.org/licenses/by-nc-nd/4.0/ for more info.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm