In [1]:
import os
import subprocess
import pickle
import pdfplumber
import torch
from openvoice.api import BaseSpeakerTTS, ToneColorConverter
from openvoice import se_extractor
from pydub import AudioSegment
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload

print("Starting execution...")

# Define paths
downloads_dir = os.path.join(os.path.expanduser("~"), "Downloads")
pdf_files = [f for f in os.listdir(downloads_dir) if f.endswith(".pdf")]
audio_files = [f for f in os.listdir(downloads_dir) if f.endswith((".mp3", ".wav"))]
pdf_path = os.path.join(downloads_dir, pdf_files[0]) if pdf_files else None
reference_audio = os.path.join(downloads_dir, audio_files[0]) if audio_files else None
output_dir = r'D:\chunks'  # Temporary chunks on D drive
output_file = r'D:\Life_3.0_audiobook.mp3'  # Final MP3 on D drive
credentials_file = r'C:/Users/Admin/Desktop/client_secret_237595400842-smil8abge2j3lvvum3kf4963r64i6s46.apps.googleusercontent.com.json'
drive_folder_id = "1gh0Tk2YKxtPi8XTVR8AnH9pR_jedUF5t"

# Validate files
if not pdf_path or not reference_audio:
    raise ValueError("No PDF or audio file found in Downloads. Please add files (e.g., .pdf, .mp3, or .wav).")

print(f"Using PDF: {pdf_path}")
print(f"Using reference audio: {reference_audio}")

Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.

Starting execution...
Using PDF: C:\Users\Admin\Downloads\max-tegmark-life-30-being-human-in-the-age-of-artificial-intelligence-alfred-a-knopf-2017-aTvn.pdf
Using reference audio: C:\Users\Admin\Downloads\myvoice.mp3


In [2]:
# Step 1: Check OpenVoice directory (clone if not present)
print("Checking OpenVoice directory...")
if not os.path.exists("OpenVoice"):
    subprocess.run(["git", "clone", "https://github.com/myshell-ai/OpenVoice"], check=True)

os.chdir('OpenVoice')
print(f"Current directory: {os.getcwd()}")

Checking OpenVoice directory...
Current directory: C:\Users\Admin\Desktop\zonos\OpenVoice


In [3]:
#Attempt to download checkpoint (optional: comment out if manually downloaded)
checkpoint_url = 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/checkpoints_1226.zip'
try:
    print('Downloading checkpoint...')
    subprocess.run(['powershell', '-Command', 'Invoke-WebRequest', '-Uri', checkpoint_url, '-OutFile', 'checkpoints_1226.zip'], check=True)
    print('Extracting checkpoint...')
    subprocess.run(['powershell', '-Command', 'Expand-Archive', '-Path', 'checkpoints_1226.zip', '-DestinationPath', '.'], check=True)
    subprocess.run(['dir', 'checkpoints'], shell=True)
except Exception as e:
    print(f'Checkpoint download failed: {e}')
    print('Using manually downloaded checkpoint (place checkpoints_1226.zip in OpenVoice and extract).')
    # Ensure checkpoints folder exists manually

Downloading checkpoint...
Extracting checkpoint...


In [4]:
# Load OpenVoice model
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
base_speaker_tts = BaseSpeakerTTS('checkpoints/base_speakers/EN/config.json', device=device)
base_speaker_tts.load_ckpt('checkpoints/base_speakers/EN/checkpoint.pth')
tone_color_converter = ToneColorConverter('checkpoints/converter/config.json', device=device)
tone_color_converter.load_ckpt('checkpoints/converter/checkpoint.pth')
print('OpenVoice model loaded.')

  WeightNorm.apply(module, name, dim)


Loaded checkpoint 'checkpoints/base_speakers/EN/checkpoint.pth'
missing/unexpected keys: [] []
Loaded checkpoint 'checkpoints/converter/checkpoint.pth'
missing/unexpected keys: [] []
OpenVoice model loaded.


In [9]:
# Step 4: Extract text from PDF
with pdfplumber.open(pdf_path) as pdf:
    text = ''.join(page.extract_text() or '' for page in pdf.pages)
print(f"Extracted {len(text)} characters from PDF")

Extracted 734945 characters from PDF


In [11]:
# Step 5: Chunk the text
def chunk_text(text, max_length=1902):  # Adjusted to target ~368 chunks (e.g., for 700,000 characters)
    return [text[i:i+max_length] for i in range(0, len(text), max_length)]

text_chunks = chunk_text(text)
print(f"Created {len(text_chunks)} chunks")

Created 387 chunks


In [None]:
# Step 6: Generate cloned speech for each chunk
if os.path.exists('processed'):
    subprocess.run(['rmdir', '/S', '/Q', 'processed'], shell=True)

# Extract source and target speaker embeddings
source_se = torch.load('checkpoints/base_speakers/EN/en_default_se.pth', map_location=device)
target_se, _ = se_extractor.get_se(reference_audio, tone_color_converter, target_dir='processed', vad=True)

# Generate audio for each chunk
os.makedirs(output_dir, exist_ok=True)
for i, chunk in enumerate(text_chunks):
    src_path = f'{output_dir}\\tmp_{i}.wav'
    base_speaker_tts.tts(chunk, src_path, speaker='default', language='English', speed=1.0)
    save_path = f'{output_dir}\\part_{i}.wav'
    tone_color_converter.convert(
        audio_src_path=src_path,
        src_se=source_se,
        tgt_se=target_se,
        output_path=save_path,
        message='@MyShell'
    )
    print(f'Generated audio for chunk {i+1}/{len(text_chunks)}: {save_path}')

OpenVoice version: v1




Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to C:\Users\Admin/.cache\torch\hub\master.zip
[(1.134, 16.274)]
after vad: dur = 15.14


Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\native\SpectralOps.cpp:880.)
  return _VF.stft(  # type: ignore[attr-defined]


 > Text splitted to sentences.
Also by Max Tegmark Our Mathematical UniverseThis Is a Borzoi Book Published by Alfred A.
Knopf Copyright © 2017 by Max Tegmark All rights reserved. Published in the United States by Alfred A.
Knopf, a division of Penguin Random House LLC, New York, and distributed in Canada by Random House of Canada,
a division of Penguin Random House Canada Limited, Toronto. www. aaknopf.
com Knopf, Borzoi Books and the colophon are registered trademarks of Penguin Random House LLC.
Library of Congress Cataloging-in-Publication Data Names: Tegmark, Max, author. Title: Life 3.
0 : being human in the age of artificial intelligence / by Max Tegmark.
Other titles: Life three point zero Description: New York : Alfred A.
Knopf, 2017. | This is a Borzoi Book published by Alfred A.
Knopf. | Includes bibliographical references and index. Identifiers: LCCN 2017006248 print | LCCN 2017022912 ebook | ISBN 9781101946596 hardcover | ISBN 9781101946602 ebook Subjects: LCSH: Artificial

In [7]:
# Step 7: Merge audio chunks and save to D drive
combined = AudioSegment.empty()
for i in range(len(text_chunks)):
    audio = AudioSegment.from_wav(f'{output_dir}\\part_{i}.wav')
    combined += audio

combined.export(output_file, format='mp3')
print(f'Audiobook generated and saved to {output_file}')

FileNotFoundError: [Errno 2] No such file or directory: 'D:\\chunks\\part_0.wav'

In [None]:
# Step 8: Upload to Google Drive
print("Authenticating with Google Drive...")
flow = InstalledAppFlow.from_client_secrets_file(
    credentials_file,
    scopes=['https://www.googleapis.com/auth/drive.file']
)
creds = flow.run_local_server(port=0)
drive_service = build('drive', 'v3', credentials=creds)

file_metadata = {
    'name': 'Life_3.0_audiobook.mp3',
    'parents': [drive_folder_id]
}
media = MediaFileUpload(output_file, mimetype='audio/mpeg')
file = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute()
print(f'File uploaded to Google Drive with ID: {file.get("id")}')