<a href="https://colab.research.google.com/gist/amod-ml/5ea8d00ec177be39dcf913f5efc7f03c/tts-voice-clone-fishspeech-workflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Helper Function For Dropbox

In [1]:
import re
import subprocess
from typing import Optional

def download_from_dropbox(dropbox_url: str, output_filename: Optional[str] = None) -> None:
    """
    Download a file directly from a Dropbox shareable link.

    Args:
        dropbox_url (str): The Dropbox link to the file (can be with ?dl=0 or ?dl=1 or no query at all).
        output_filename (Optional[str]): If provided, saves the file as this name.
                                         Otherwise, keeps the original Dropbox file name.

    Raises:
        ValueError: If the input URL is not a Dropbox link.
    """
    if "dropbox.com" not in dropbox_url:
        raise ValueError("Provided URL is not a valid Dropbox link.")

    # Force direct download
    direct_url = re.sub(r"\?dl=0$", "?dl=1", dropbox_url)
    direct_url = re.sub(r"\?dl=1$", "?dl=1", direct_url)
    if "?dl=1" not in direct_url:
        direct_url += "?dl=1"

    print(f"Starting download from: {direct_url}")

    # Build the wget command
    if output_filename:
        command = ["wget", "-O", output_filename, direct_url]
    else:
        command = ["wget", direct_url]

    try:
        subprocess.run(command, check=True)
        print("Download completed successfully.")
    except subprocess.CalledProcessError as e:
        print(f"Download failed with error: {e}")

In [2]:
download_from_dropbox(dropbox_url="https://www.dropbox.com/scl/fi/ff260om4wgwkmsiibg8ek/Crash-Course-Political-Theory-Preview.mp4?rlkey=3533vc4y4oruqaeo08g1dakm7&dl=0", output_filename="video_en.mp4")

Starting download from: https://www.dropbox.com/scl/fi/ff260om4wgwkmsiibg8ek/Crash-Course-Political-Theory-Preview.mp4?rlkey=3533vc4y4oruqaeo08g1dakm7&dl=0?dl=1
Download completed successfully.


In [3]:
!pip install -qU ffmpeg-python deepgram-sdk assemblyai openai google-genai

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.9/156.9 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m765.0/765.0 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m217.7/217.7 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m165.6/165.6 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import ffmpeg

(
    ffmpeg
    .input('/content/video_en.mp4')
    .output('/content/video_en.mp3', acodec='libmp3lame')
    .run()
)


(None, None)

In [5]:
def simplify_deepgram_response(response: dict) -> dict:
    # Extract the metadata directly
    metadata = response.get("metadata", {})

    # Extract the full transcript from the first alternative
    channels = response.get("results", {}).get("channels", [])
    full_transcript = ""
    if channels and "alternatives" in channels[0] and channels[0]["alternatives"]:
        full_transcript = channels[0]["alternatives"][0].get("transcript", "")

    # Simplify the utterances
    simplified_utterances = []
    utterances = response.get("results", {}).get("utterances", [])
    for utterance in utterances:
        simplified = {
            "start": utterance["start"],
            "end": utterance["end"],
            "utterance": utterance["transcript"],
            "speaker": utterance.get("speaker", 0)
        }
        simplified_utterances.append(simplified)

    # Return the full structured object
    return {
        "metadata": metadata,
        "simplified_results": {
            "full_transcript": full_transcript,
            "utterances": simplified_utterances
        }
    }

In [9]:
import os

from deepgram import (
    DeepgramClient,
    PrerecordedOptions,
    FileSource,
)
from google.colab import userdata
DEEPGRAM_API_KEY = userdata.get('DEEPGRAM_API_KEY')

# Path to the audio file
AUDIO_FILE = "/content/video_en.mp3"

try:

    deepgram = DeepgramClient(DEEPGRAM_API_KEY)

    with open(AUDIO_FILE, "rb") as file:
        buffer_data = file.read()

    payload: FileSource = {
            "buffer": buffer_data,
    }

    options = PrerecordedOptions(
            model="nova-3",
            language="en",
            smart_format=True,
            utterances=True,
            utt_split=0.5,
            diarize=True,
        )

    response = deepgram.listen.rest.v("1").transcribe_file(payload, options)

except Exception as e:
    print(f"Exception: {e}")

In [10]:
simplify_deepgram_response(response.to_dict())

{'metadata': {'transaction_key': 'deprecated',
  'request_id': 'c294b0c9-c8b4-4fff-9764-063b932e3a14',
  'sha256': '433e30cb102a971adc02a8d764106ad6042601b2173c2a4c432ae27bd21b2147',
  'created': '2025-07-17T19:18:34.635Z',
  'duration': 157.824,
  'channels': 1,
  'models': ['3b3aabe4-608a-46ac-9585-7960a25daf1a'],
  'model_info': {'3b3aabe4-608a-46ac-9585-7960a25daf1a': {'name': 'general-nova-3',
    'version': '2024-12-20.0',
    'arch': 'nova-3'}}},
 'simplified_results': {'full_transcript': "What is justice, and who gets to decide? What is liberty, and how do we measure it? Is war ever justified? Should prisons be abolished? What does communism actually mean? Should governments even exist? Maybe in a swirl of information, some of these questions are on your mind. Or maybe the only question on your mind is who should I vote for? Or what's going to happen if that's who people vote for? Or wait, I'm supposed to be voting? No matter where you are in the information vortex, we'll meet 

In [11]:
#Initialize Google GenAI client

from google import genai

client = genai.Client(api_key=userdata.get('GOOGLE_API_KEY'))

In [12]:
import json
from google.genai import types

def translate_transcript_with_gemini_structured(
    client,
    full_transcript: str,
    utterances: list,
    target_language: str = "Spanish"
) -> str:
    """
    Translates a full transcript and utterances into the target language using Gemini 2.5 Pro.
    Returns a structured JSON with only the translated full transcript.

    Args:
        client: The already initialized Gemini client.
        full_transcript (str): The full original English transcript.
        utterances (list): The list of utterances with timestamps and text.
        target_language (str): The target language for translation (default "Spanish").

    Returns:
        str: The full translated transcript as a single string.
    """
    # Stringify the utterances to JSON text
    utterances_json = json.dumps(utterances, ensure_ascii=False, indent=2)

    # Define the structured output format in the system instruction
    system_instruction = (
        f"You are a professional translator and timing expert. "
        f"You must translate English content into {target_language} with high fidelity. "
        f"Maintain the timing alignment from utterances as much as possible. "
        f"If the translation would overflow time bounds, summarize or compress naturally. "
        f"ONLY return a structured JSON object with the following schema: "
        f"{{ 'translated_full_transcript': str }}. "
        f"No extra commentary, no explanation — only the JSON."
    )

    # Build the dynamic prompt
    prompt = f"""
You are given:
1. The full original English transcript.
2. The utterance-by-utterance segmentation data (with timestamps).

Your task:
- Translate the full transcript into {target_language}.
- Keep the translated utterances roughly aligned with the original timing (small deviations allowed).
- Summarize or compress if necessary to fit within the original utterance timing.
- Return only the full translated transcript as a single string.

Here is the Full Transcript:
\"\"\"
{full_transcript}
\"\"\"

Here are the Utterance Segments (JSON):
\"\"\"
{utterances_json}
\"\"\"
    """

    # Make the Gemini API call
    response = client.models.generate_content(
        model="gemini-2.5-pro-preview-03-25",
        contents=[prompt],
        config=types.GenerateContentConfig(
            temperature=0.3,
            system_instruction=system_instruction,
        )
    )

    raw_text = response.text.strip()

    # --- Clean up any ```json ... ``` blocks if present ---
    if raw_text.startswith("```json"):
        raw_text = re.sub(r"^```json", "", raw_text).strip()
    if raw_text.startswith("```"):
        raw_text = re.sub(r"^```", "", raw_text).strip()
    if raw_text.endswith("```"):
        raw_text = re.sub(r"```$", "", raw_text).strip()

    return raw_text

In [13]:
response_dict = response.to_dict()
simplified_response = simplify_deepgram_response(response_dict)

full_transcript = simplified_response["simplified_results"]["full_transcript"]
utterances = simplified_response["simplified_results"]["utterances"]

translated_transcript = translate_transcript_with_gemini_structured(
    client=client,
    full_transcript=full_transcript,
    utterances=utterances,
    target_language="Spanish"
)

In [14]:
json.loads(translated_transcript)["translated_full_transcript"]

'Qué es la justicia, y quién decide? ¿Qué es la libertad, y cómo la medimos? ¿Se justifica alguna vez la guerra? ¿Deberían abolirse las prisiones? ¿Qué significa realmente el comunismo? ¿Deberían siquiera existir los gobiernos? Quizás en un torbellino de información, algunas de estas preguntas rondan tu mente. O quizás la única pregunta en tu mente es ¿por quién debo votar? ¿O qué pasará si la gente vota por esa persona? O espera, ¿se supone que debo votar? No importa dónde te encuentres en el vórtice de la información, allí nos encontraremos. Soy Ellie Anderson, y esto es Crash Course Teoría Política. Soy profesora de filosofía y estoy obsesionada con encontrar formas de dar sentido al presente aprendiendo de los pensadores del pasado. He enseñado en colegios y universidades durante más de una década, y escribo tanto para otros académicos como para el público. También soy coanfitriona del podcast y canal de YouTube Overthink, donde ayudo a la gente a entender la filosofía y cómo puede

In [33]:
# ── 1️⃣  Torch first (needs the special index) ──────────────────────────
!pip install -qU torch==2.3.0+cu121 --index-url https://download.pytorch.org/whl/cu121
print("✓ torch with CUDA 12 installed")

# ── 2️⃣  All remaining runtime deps from the normal PyPI index ─────────
!pip install -qU \
    numpy==1.26.4 \
    pydantic==2.9.2\
    pyrootutils==1.0.4 \
    loguru==0.7.2 \
    vector_quantize_pytorch==1.14.24 \
    "einx[torch]==0.2.2" \
    tiktoken==0.8.0 \
    resampy==0.4.3 \
    pydub==0.25.1 \
    soundfile \
    sentencepiece \
    antlr4-python3-runtime \
    hydra-core==1.3.2 \
    "pytorch-lightning<2.2" \
    audiotools

print("✓ remaining deps installed")

✓ torch with CUDA 12 installed
[31mERROR: Ignored the following versions that require a different python version: 1.21.2 Requires-Python >=3.7,<3.11; 1.21.3 Requires-Python >=3.7,<3.11; 1.21.4 Requires-Python >=3.7,<3.11; 1.21.5 Requires-Python >=3.7,<3.11; 1.21.6 Requires-Python >=3.7,<3.11[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement audiotools (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for audiotools[0m[31m
[0m✓ remaining deps installed


In [28]:
!git clone --depth 1 https://github.com/fishaudio/fish-speech.git /content/fish-speech
!pip install -qU --force-reinstall /content/fish-speech --no-deps

fatal: destination path '/content/fish-speech' already exists and is not an empty directory.
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for fish-speech (pyproject.toml) ... [?25l[?25hdone


Patches

In [17]:
%%bash
sed -i 's/from lightning\.pytorch\.utilities /from pytorch_lightning.utilities /' \
    /content/fish-speech/fish_speech/utils/logging_utils.py
sed -i 's/from lightning\.pytorch /from pytorch_lightning /' \
    /content/fish-speech/fish_speech/utils/instantiators.py
touch /content/fish-speech/.project-root


In [18]:
from huggingface_hub import snapshot_download
MODEL_DIR = "/content/checkpoints/fish-speech-1.5"

snapshot_download(
    repo_id               = "fishaudio/fish-speech-1.5",
    local_dir             = MODEL_DIR,
    local_dir_use_symlinks=False,
    resume_download       = True,
)
print("✓ checkpoints in", MODEL_DIR)


For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

(…)fly-gan-vq-fsq-8x1024-21hz-generator.pth:   0%|          | 0.00/189M [00:00<?, ?B/s]

model.pth:   0%|          | 0.00/1.28G [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/697 [00:00<?, ?B/s]

tokenizer.tiktoken: 0.00B [00:00, ?B/s]

special_tokens.json: 0.00B [00:00, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

✓ checkpoints in /content/checkpoints/fish-speech-1.5


In [19]:
import ffmpeg, json, os, pathlib, textwrap, importlib

SRC_MP3  = "/content/video_en.mp3"        # existing audio
CLIP_MP3 = "/content/voice_ref_90s.mp3"        # 90-s clip
ENC_CKPT = f"{MODEL_DIR}/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"

# 90-second slice
(
    ffmpeg
    .input(SRC_MP3, t=90)
    .output(CLIP_MP3, acodec="libmp3lame")
    .overwrite_output()
    .run(quiet=True)
)
print("✓ reference ready:", CLIP_MP3)

# Spanish transcript you already produced with Gemini
SPANISH_TEXT = json.loads(translated_transcript)["translated_full_transcript"]


✓ reference ready: /content/voice_ref_90s.mp3


In [24]:
# ensure torchvision / torchaudio match the already-installed torch 2.3 + cu121
!pip install -qU \
  torchvision==0.18.0+cu121 \
  torchaudio==2.3.0+cu121 \
  --index-url https://download.pytorch.org/whl/cu121

print("✓ torchvision & torchaudio now match torch 2.3 / CUDA-12.1")

✓ torchvision & torchaudio now match torch 2.3 / CUDA-12.1


In [25]:
!pip install -qU "lightning>=2.2,<2.3" loralib ormsgpack datasets==2.18.0 fsspec==2025.3.2

[31mERROR: Cannot install datasets==2.18.0 and fsspec==2025.3.2 because these package versions have conflicting dependencies.[0m[31m
[0m[31mERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts[0m[31m
[0m

In [22]:
# -------------------------------------------------
# Python cell  ➜  define a helper for step 2
# -------------------------------------------------
import json, subprocess, shlex, textwrap, os, pathlib, sys, re, tempfile

# 1) paths ------------------------------------------------------------------
FISH_ROOT = "/content/fish-speech"
MODEL_DIR = "/content/checkpoints/fish-speech-1.5"
SRC_MP3   = "/content/video_en.mp3"
CLIP_MP3  = "/content/voice_ref_90s.mp3"
ENC_CKPT  = f"{MODEL_DIR}/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"

# 90-second reference (only once)
import ffmpeg, pathlib, os, warnings
pathlib.Path(CLIP_MP3).unlink(missing_ok=True)
ffmpeg.input(SRC_MP3, t=90).output(CLIP_MP3, acodec="libmp3lame").overwrite_output().run(quiet=True)

# --------------------------------------------------------------------------
# 1️⃣  reference → prompt tokens (npy) + preview wav
# --------------------------------------------------------------------------
subprocess.run(
    [
        "python", "-m", "fish_speech.models.vqgan.inference",
        "-i", CLIP_MP3,
        "--checkpoint-path", ENC_CKPT,
        "-o", "/content/voice_ref_prompt.wav"        # writes .wav AND .npy
    ],
    cwd=FISH_ROOT, check=True
)

# --------------------------------------------------------------------------
# 2️⃣  Spanish text  → semantic codes
#     (we call the CLI from Python so we can pass the long text safely)
# --------------------------------------------------------------------------
SPANISH_TEXT = json.loads(translated_transcript)["translated_full_transcript"]  # already in your notebook

cmd = [
    "python", "-m", "fish_speech.models.text2semantic.inference",
    "--text", SPANISH_TEXT,
    "--prompt-text", "90-s voice ref",
    "--prompt-tokens", "/content/voice_ref_prompt.npy",
    "--checkpoint-path", MODEL_DIR,
    "--num-samples", "1",
    "--output-dir", "/content/spanish_codes",          # ==> codes_0.npy
]
subprocess.run(cmd, cwd=FISH_ROOT, check=True)

# --------------------------------------------------------------------------
# 3️⃣  codes → final wav
# --------------------------------------------------------------------------
subprocess.run(
    [
        "python", "-m", "fish_speech.models.vqgan.inference",
        "-i", "/content/spanish_codes/codes_0.npy",
        "--checkpoint-path", ENC_CKPT,
        "-o", "/content/spanish_voiceclone.wav"
    ],
    cwd=FISH_ROOT, check=True
)

print("✓ Spanish voice-cloned audio saved → /content/spanish_voiceclone.wav")


CalledProcessError: Command '['python', '-m', 'fish_speech.models.vqgan.inference', '-i', '/content/voice_ref_90s.mp3', '--checkpoint-path', '/content/checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth', '-o', '/content/voice_ref_prompt.wav']' returned non-zero exit status 1.

In [None]:
AUDIO_FILE = "/content/spanish_voiceclone.wav"

try:

    deepgram = DeepgramClient(DEEPGRAM_API_KEY)

    with open(AUDIO_FILE, "rb") as file:
        buffer_data = file.read()

    payload: FileSource = {
            "buffer": buffer_data,
    }

    options = PrerecordedOptions(
            model="nova-2",
            language="es",
            smart_format=True,
            utterances=True,
            utt_split=0.5,
            diarize=True,
        )

    response = deepgram.listen.rest.v("1").transcribe_file(payload, options)

except Exception as e:
    print(f"Exception: {e}")

simplify_deepgram_response(response.to_dict())

In [27]:
# -------------------------------------------------
# Python cell  ➜  define a helper for step 2
# -------------------------------------------------
import json, subprocess, shlex, textwrap, os, pathlib, sys, re, tempfile

# 1) paths ------------------------------------------------------------------
FISH_ROOT = "/content/fish-speech"
MODEL_DIR = "/content/checkpoints/fish-speech-1.5"
SRC_MP3   = "/content/video_en.mp3"
CLIP_MP3  = "/content/voice_ref_90s.mp3"
ENC_CKPT  = f"{MODEL_DIR}/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"

# 90-second reference (only once)
import ffmpeg, pathlib, os, warnings
pathlib.Path(CLIP_MP3).unlink(missing_ok=True)
ffmpeg.input(SRC_MP3, t=90).output(CLIP_MP3, acodec="libmp3lame").overwrite_output().run(quiet=True)

# --------------------------------------------------------------------------
# 1️⃣  reference → prompt tokens (npy) + preview wav
# --------------------------------------------------------------------------
try:
    subprocess.run(
        [
            "python", "-m", "fish_speech.models.vqgan.inference",
            "-i", CLIP_MP3,
            "--checkpoint-path", ENC_CKPT,
            "-o", "/content/voice_ref_prompt.wav"        # writes .wav AND .npy
        ],
        cwd=FISH_ROOT, check=True, capture_output=True, text=True
    )
except subprocess.CalledProcessError as e:
    print(f"Error during reference processing (Step 1):")
    print(f"Command: {' '.join(e.cmd)}")
    print(f"Return Code: {e.returncode}")
    print(f"Stderr:\n{e.stderr}")
    raise # Re-raise the exception to stop execution

# --------------------------------------------------------------------------
# 2️⃣  Spanish text  → semantic codes
#     (we call the CLI from Python so we can pass the long text safely)
# --------------------------------------------------------------------------
SPANISH_TEXT = json.loads(translated_transcript)["translated_full_transcript"]  # already in your notebook

cmd = [
    "python", "-m", "fish_speech.models.text2semantic.inference",
    "--text", SPANISH_TEXT,
    "--prompt-text", "90-s voice ref",
    "--prompt-tokens", "/content/voice_ref_prompt.npy",
    "--checkpoint-path", MODEL_DIR,
    "--num-samples", "1",
    "--output-dir", "/content/spanish_codes",          # ==> codes_0.npy
]
try:
    subprocess.run(cmd, cwd=FISH_ROOT, check=True, capture_output=True, text=True)
except subprocess.CalledProcessError as e:
    print(f"Error during text to semantic codes conversion (Step 2):")
    print(f"Command: {' '.join(e.cmd)}")
    print(f"Return Code: {e.returncode}")
    print(f"Stderr:\n{e.stderr}")
    raise # Re-raise the exception to stop execution


# --------------------------------------------------------------------------
# 3️⃣  codes → final wav
# --------------------------------------------------------------------------
try:
    subprocess.run(
        [
            "python", "-m", "fish_speech.models.vqgan.inference",
            "-i", "/content/spanish_codes/codes_0.npy",
            "--checkpoint-path", ENC_CKPT,
            "-o", "/content/spanish_voiceclone.wav"
        ],
        cwd=FISH_ROOT, check=True, capture_output=True, text=True
    )
except subprocess.CalledProcessError as e:
    print(f"Error during codes to audio conversion (Step 3):")
    print(f"Command: {' '.join(e.cmd)}")
    print(f"Return Code: {e.returncode}")
    print(f"Stderr:\n{e.stderr}")
    raise # Re-raise the exception to stop execution


print("✓ Spanish voice-cloned audio saved → /content/spanish_voiceclone.wav")

Error during reference processing (Step 1):
Command: python -m fish_speech.models.vqgan.inference -i /content/voice_ref_90s.mp3 --checkpoint-path /content/checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth -o /content/voice_ref_prompt.wav
Return Code: 1
Stderr:
/usr/bin/python3: Error while finding module specification for 'fish_speech.models.vqgan.inference' (ModuleNotFoundError: No module named 'fish_speech.models.vqgan')



CalledProcessError: Command '['python', '-m', 'fish_speech.models.vqgan.inference', '-i', '/content/voice_ref_90s.mp3', '--checkpoint-path', '/content/checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth', '-o', '/content/voice_ref_prompt.wav']' returned non-zero exit status 1.

In [32]:
# -------------------------------------------------
# Python cell  ➜  define a helper for step 2
# -------------------------------------------------
import json, subprocess, shlex, textwrap, os, pathlib, sys, re, tempfile

# 1) paths ------------------------------------------------------------------
FISH_ROOT = "/content/fish-speech"
MODEL_DIR = "/content/checkpoints/fish-speech-1.5"
SRC_MP3   = "/content/video_en.mp3"
CLIP_MP3  = "/content/voice_ref_90s.mp3"
ENC_CKPT  = f"{MODEL_DIR}/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"

# Add fish-speech directory to Python path for subprocesses
env = os.environ.copy()
env["PYTHONPATH"] = f"{FISH_ROOT}:{env.get('PYTHONPATH', '')}"


# 90-second reference (only once)
import ffmpeg, pathlib, os, warnings
pathlib.Path(CLIP_MP3).unlink(missing_ok=True)
ffmpeg.input(SRC_MP3, t=90).output(CLIP_MP3, acodec="libmp3lame").overwrite_output().run(quiet=True)

# --------------------------------------------------------------------------
# 1️⃣  reference → prompt tokens (npy) + preview wav
# --------------------------------------------------------------------------
try:
    subprocess.run(
        [
            "python", "-m", "fish_speech.models.dac.inference",
            "-i", CLIP_MP3,
            "--checkpoint-path", ENC_CKPT,
            "-o", "/content/voice_ref_prompt.wav"        # writes .wav AND .npy
        ],
        cwd=FISH_ROOT, check=True, capture_output=True, text=True, env=env # Pass the modified environment
    )
except subprocess.CalledProcessError as e:
    print(f"Error during reference processing (Step 1):")
    print(f"Command: {' '.join(e.cmd)}")
    print(f"Return Code: {e.returncode}")
    print(f"Stderr:\n{e.stderr}")
    raise # Re-raise the exception to stop execution

# --------------------------------------------------------------------------
# 2️⃣  Spanish text  → semantic codes
#     (we call the CLI from Python so we can pass the long text safely)
# --------------------------------------------------------------------------
SPANISH_TEXT = json.loads(translated_transcript)["translated_full_transcript"]  # already in your notebook

cmd = [
    "python", "-m", "fish_speech.models.text2semantic.inference",
    "--text", SPANISH_TEXT,
    "--prompt-text", "90-s voice ref",
    "--prompt-tokens", "/content/voice_ref_prompt.npy",
    "--checkpoint-path", MODEL_DIR,
    "--num-samples", "1",
    "--output-dir", "/content/spanish_codes",          # ==> codes_0.npy
]
try:
    subprocess.run(cmd, cwd=FISH_ROOT, check=True, capture_output=True, text=True, env=env) # Pass the modified environment
except subprocess.CalledProcessError as e:
    print(f"Error during text to semantic codes conversion (Step 2):")
    print(f"Command: {' '.join(e.cmd)}")
    print(f"Return Code: {e.returncode}")
    print(f"Stderr:\n{e.stderr}")
    raise # Re-raise the exception to stop execution


# --------------------------------------------------------------------------
# 3️⃣  codes → final wav
# --------------------------------------------------------------------------
try:
    subprocess.run(
        [
            "python", "-m", "fish_speech.models.dac.inference",
            "-i", "/content/spanish_codes/codes_0.npy",
            "--checkpoint-path", ENC_CKPT,
            "-o", "/content/spanish_voiceclone.wav"
        ],
        cwd=FISH_ROOT, check=True, capture_output=True, text=True, env=env # Pass the modified environment
    )
except subprocess.CalledProcessError as e:
    print(f"Error during codes to audio conversion (Step 3):")
    print(f"Command: {' '.join(e.cmd)}")
    print(f"Return Code: {e.returncode}")
    print(f"Stderr:\n{e.stderr}")
    raise # Re-raise the exception to stop execution


print("✓ Spanish voice-cloned audio saved → /content/spanish_voiceclone.wav")

Error during reference processing (Step 1):
Command: python -m fish_speech.models.dac.inference -i /content/voice_ref_90s.mp3 --checkpoint-path /content/checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth -o /content/voice_ref_prompt.wav
Return Code: 1
Stderr:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/hydra/_internal/utils.py", line 644, in _locate
    obj = getattr(obj, part)
          ^^^^^^^^^^^^^^^^^^
AttributeError: module 'fish_speech.models.dac' has no attribute 'modded_dac'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/hydra/_internal/utils.py", line 650, in _locate
    obj = import_module(mod)
          ^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

CalledProcessError: Command '['python', '-m', 'fish_speech.models.dac.inference', '-i', '/content/voice_ref_90s.mp3', '--checkpoint-path', '/content/checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth', '-o', '/content/voice_ref_prompt.wav']' returned non-zero exit status 1.

In [35]:
# -------------------------------------------------
# Python cell  ➜  define a helper for step 2
# -------------------------------------------------
import json, subprocess, shlex, textwrap, os, pathlib, sys, re, tempfile

# 1) paths ------------------------------------------------------------------
FISH_ROOT = "/content/fish-speech"
MODEL_DIR = "/content/checkpoints/fish-speech-1.5"
SRC_MP3   = "/content/video_en.mp3"
CLIP_MP3  = "/content/voice_ref_90s.mp3"
ENC_CKPT  = f"{MODEL_DIR}/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"

# Add fish-speech directory to Python path for subprocesses
env = os.environ.copy()
env["PYTHONPATH"] = f"{FISH_ROOT}:{env.get('PYTHONPATH', '')}"


# 90-second reference (only once)
import ffmpeg, pathlib, os, warnings
pathlib.Path(CLIP_MP3).unlink(missing_ok=True)
ffmpeg.input(SRC_MP3, t=90).output(CLIP_MP3, acodec="libmp3lame").overwrite_output().run(quiet=True)

# --------------------------------------------------------------------------
# 1️⃣  reference → prompt tokens (npy) + preview wav
# --------------------------------------------------------------------------
try:
    subprocess.run(
        [
            "python", "-m", "fish_speech.models.dac.inference", # Changed from vqgan.inference
            "-i", CLIP_MP3,
            "--checkpoint-path", ENC_CKPT,
            "-o", "/content/voice_ref_prompt.wav"        # writes .wav AND .npy
        ],
        cwd=FISH_ROOT, check=True, capture_output=True, text=True, env=env # Pass the modified environment
    )
except subprocess.CalledProcessError as e:
    print(f"Error during reference processing (Step 1):")
    print(f"Command: {' '.join(e.cmd)}")
    print(f"Return Code: {e.returncode}")
    print(f"Stderr:\n{e.stderr}")
    raise # Re-raise the exception to stop execution

# --------------------------------------------------------------------------
# 2️⃣  Spanish text  → semantic codes
#     (we call the CLI from Python so we can pass the long text safely)
# --------------------------------------------------------------------------
SPANISH_TEXT = json.loads(translated_transcript)["translated_full_transcript"]  # already in your notebook

cmd = [
    "python", "-m", "fish_speech.models.text2semantic.inference",
    "--text", SPANISH_TEXT,
    "--prompt-text", "90-s voice ref",
    "--prompt-tokens", "/content/voice_ref_prompt.npy",
    "--checkpoint-path", MODEL_DIR,
    "--num-samples", "1",
    "--output-dir", "/content/spanish_codes",          # ==> codes_0.npy
]
try:
    subprocess.run(cmd, cwd=FISH_ROOT, check=True, capture_output=True, text=True, env=env) # Pass the modified environment
except subprocess.CalledProcessError as e:
    print(f"Error during text to semantic codes conversion (Step 2):")
    print(f"Command: {' '.join(e.cmd)}")
    print(f"Return Code: {e.returncode}")
    print(f"Stderr:\n{e.stderr}")
    raise # Re-raise the exception to stop execution


# --------------------------------------------------------------------------
# 3️⃣  codes → final wav
# --------------------------------------------------------------------------
try:
    subprocess.run(
        [
            "python", "-m", "fish_speech.models.dac.inference", # Changed from vqgan.inference
            "-i", "/content/spanish_codes/codes_0.npy",
            "--checkpoint-path", ENC_CKPT,
            "-o", "/content/spanish_voiceclone.wav"
        ],
        cwd=FISH_ROOT, check=True, capture_output=True, text=True, env=env # Pass the modified environment
    )
except subprocess.CalledProcessError as e:
    print(f"Error during codes to audio conversion (Step 3):")
    print(f"Command: {' '.join(e.cmd)}")
    print(f"Return Code: {e.returncode}")
    print(f"Stderr:\n{e.stderr}")
    raise # Re-raise the exception to stop execution


print("✓ Spanish voice-cloned audio saved → /content/spanish_voiceclone.wav")

Error during reference processing (Step 1):
Command: python -m fish_speech.models.dac.inference -i /content/voice_ref_90s.mp3 --checkpoint-path /content/checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth -o /content/voice_ref_prompt.wav
Return Code: 1
Stderr:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/hydra/_internal/utils.py", line 644, in _locate
    obj = getattr(obj, part)
          ^^^^^^^^^^^^^^^^^^
AttributeError: module 'fish_speech.models.dac' has no attribute 'modded_dac'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/hydra/_internal/utils.py", line 650, in _locate
    obj = import_module(mod)
          ^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

CalledProcessError: Command '['python', '-m', 'fish_speech.models.dac.inference', '-i', '/content/voice_ref_90s.mp3', '--checkpoint-path', '/content/checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth', '-o', '/content/voice_ref_prompt.wav']' returned non-zero exit status 1.

In [36]:
!pip install -qU audiotools
print("✓ audiotools installed")

[31mERROR: Could not find a version that satisfies the requirement audiotools (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for audiotools[0m[31m
[0m✓ audiotools installed


In [1]:
# -------------------------------------------------
# Python cell  ➜  define a helper for step 2
# -------------------------------------------------
import json, subprocess, shlex, textwrap, os, pathlib, sys, re, tempfile

# 1) paths ------------------------------------------------------------------
FISH_ROOT = "/content/fish-speech"
MODEL_DIR = "/content/checkpoints/fish-speech-1.5"
SRC_MP3   = "/content/video_en.mp3"
CLIP_MP3  = "/content/voice_ref_90s.mp3"
ENC_CKPT  = f"{MODEL_DIR}/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"

# Add fish-speech directory to Python path for subprocesses
env = os.environ.copy()
env["PYTHONPATH"] = f"{FISH_ROOT}:{env.get('PYTHONPATH', '')}"


# 90-second reference (only once)
import ffmpeg, pathlib, os, warnings
pathlib.Path(CLIP_MP3).unlink(missing_ok=True)
ffmpeg.input(SRC_MP3, t=90).output(CLIP_MP3, acodec="libmp3lame").overwrite_output().run(quiet=True)

# --------------------------------------------------------------------------
# 1️⃣  reference → prompt tokens (npy) + preview wav
# --------------------------------------------------------------------------
try:
    subprocess.run(
        [
            "python", "-m", "fish_speech.models.dac.inference", # Changed from vqgan.inference
            "-i", CLIP_MP3,
            "--checkpoint-path", ENC_CKPT,
            "-o", "/content/voice_ref_prompt.wav"        # writes .wav AND .npy
        ],
        cwd=FISH_ROOT, check=True, capture_output=True, text=True, env=env # Pass the modified environment
    )
except subprocess.CalledProcessError as e:
    print(f"Error during reference processing (Step 1):")
    print(f"Command: {' '.join(e.cmd)}")
    print(f"Return Code: {e.returncode}")
    print(f"Stderr:\n{e.stderr}")
    raise # Re-raise the exception to stop execution

# --------------------------------------------------------------------------
# 2️⃣  Spanish text  → semantic codes
#     (we call the CLI from Python so we can pass the long text safely)
# --------------------------------------------------------------------------
SPANISH_TEXT = json.loads(translated_transcript)["translated_full_transcript"]  # already in your notebook

cmd = [
    "python", "-m", "fish_speech.models.text2semantic.inference",
    "--text", SPANISH_TEXT,
    "--prompt-text", "90-s voice ref",
    "--prompt-tokens", "/content/voice_ref_prompt.npy",
    "--checkpoint-path", MODEL_DIR,
    "--num-samples", "1",
    "--output-dir", "/content/spanish_codes",          # ==> codes_0.npy
]
try:
    subprocess.run(cmd, cwd=FISH_ROOT, check=True, capture_output=True, text=True, env=env) # Pass the modified environment
except subprocess.CalledProcessError as e:
    print(f"Error during text to semantic codes conversion (Step 2):")
    print(f"Command: {' '.join(e.cmd)}")
    print(f"Return Code: {e.returncode}")
    print(f"Stderr:\n{e.stderr}")
    raise # Re-raise the exception to stop execution


# --------------------------------------------------------------------------
# 3️⃣  codes → final wav
# --------------------------------------------------------------------------
try:
    subprocess.run(
        [
            "python", "-m", "fish_speech.models.dac.inference", # Changed from vqgan.inference
            "-i", "/content/spanish_codes/codes_0.npy",
            "--checkpoint-path", ENC_CKPT,
            "-o", "/content/spanish_voiceclone.wav"
        ],
        cwd=FISH_ROOT, check=True, capture_output=True, text=True, env=env # Pass the modified environment
    )
except subprocess.CalledProcessError as e:
    print(f"Error during codes to audio conversion (Step 3):")
    print(f"Command: {' '.join(e.cmd)}")
    print(f"Return Code: {e.returncode}")
    print(f"Stderr:\n{e.stderr}")
    raise # Re-raise the exception to stop execution


print("✓ Spanish voice-cloned audio saved → /content/spanish_voiceclone.wav")

Error during reference processing (Step 1):
Command: python -m fish_speech.models.dac.inference -i /content/voice_ref_90s.mp3 --checkpoint-path /content/checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth -o /content/voice_ref_prompt.wav
Return Code: 1
Stderr:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/hydra/_internal/utils.py", line 644, in _locate
    obj = getattr(obj, part)
          ^^^^^^^^^^^^^^^^^^
AttributeError: module 'fish_speech.models.dac' has no attribute 'modded_dac'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/hydra/_internal/utils.py", line 650, in _locate
    obj = import_module(mod)
          ^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

CalledProcessError: Command '['python', '-m', 'fish_speech.models.dac.inference', '-i', '/content/voice_ref_90s.mp3', '--checkpoint-path', '/content/checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth', '-o', '/content/voice_ref_prompt.wav']' returned non-zero exit status 1.

**Note on current issue:** A `ModuleNotFoundError: No module named 'audiotools'` is occurring when running the `fish_speech.models.dac.inference` subprocess. This indicates that the `audiotools` library, a dependency of `fish-speech`, is not being found by the Python interpreter within the subprocess environment, despite attempts to install it and adjust the Python path. This issue requires further troubleshooting potentially with the `fish-speech` library developers.