# Zonos Model: Google Colab Notebook

This notebook will help you set up and run the Zonos model in Google Colab. The model will be automatically downloaded from [Hugging Face](https://huggingface.co/Wamp1re-Ai/Zonos-v0.1-transformer).

## Steps:
1. Clone or upload the Zonos repository and download the model from HuggingFace.
2. Install dependencies.
3. Run the model with example code.
4. (Optional) Upload your own audio files for inference.
5. Use Cloudflare subdomain for sharing your Gradio interface.

In [None]:
#@title 1. Clone the repository and download the model
import os

# Clone the Zonos repository (use the correct URL)
if not os.path.exists('Zonos'):
    !git clone https://github.com/Wamp1re-Ai/Zonos.git  # Change this to your actual GitHub username
    print("Repository cloned successfully!")
else:
    print("Repository already exists!")

%cd Zonos

# Install system dependencies first (eSpeak is required for phonemization)
!apt-get update -qq
!apt-get install -y espeak-ng git-lfs

# Initialize git LFS
!git lfs install

print("System dependencies installed successfully!")

In [None]:
#@title 2. Install dependencies
# Install required packages efficiently and avoid dependency conflicts
import subprocess
import sys
import os

def install_package(package):
    """Quietly ``pip install`` *package* with the running interpreter.

    Args:
        package: Requirement specifier handed to pip as a single argument
            (e.g. ``"numpy>=1.24.0"``).

    Returns:
        True when pip exits successfully. When pip exits with a non-zero
        status the failure is printed and False is returned instead of
        raising.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package, "--quiet"]
    try:
        subprocess.check_call(pip_command)
    except subprocess.CalledProcessError as pip_error:
        print(f"Failed to install {package}: {pip_error}")
        return False
    else:
        return True

# pip/setuptools are deliberately left alone: upgrading them in Colab causes conflicts.
print("Skipping pip upgrade to avoid dependency conflicts in Colab...")

# Detect Colab so its pre-installed PyTorch can be reused instead of reinstalled.
IN_COLAB = 'google.colab' in sys.modules
print(f"Running in Google Colab: {IN_COLAB}")

# Stays False outside Colab, or when torch/torchaudio cannot be imported.
torch_installed = False
if IN_COLAB:
    print("Using Colab's pre-installed PyTorch...")
    try:
        import torch
        import torchaudio
    except ImportError:
        print("PyTorch not found, will install...")
    else:
        print(f"‚úì PyTorch {torch.__version__} already available")
        print(f"‚úì TorchAudio {torchaudio.__version__} already available")
        torch_installed = True

# Core requirements; the torch wheels are prepended only when they are missing.
core_requirements = [
    "transformers>=4.45.0",
    "gradio>=4.0.0",
    "huggingface-hub>=0.20.0",
    "soundfile>=0.12.1",
    "phonemizer>=3.2.0",
    "numpy>=1.24.0",
    "inflect>=7.0.0",
    "scipy",
]
if torch_installed:
    packages = core_requirements
else:
    packages = ["torch>=2.0.0", "torchaudio>=2.0.0"] + core_requirements

print(f"Installing {len(packages)} core dependencies...")
failed_packages = []

for package in packages:
    print(f"Installing {package}...")
    if install_package(package):
        continue
    # Remember the failure but keep going with the remaining packages.
    failed_packages.append(package)

if len(failed_packages) > 0:
    print(f"\n‚ö†Ô∏è Failed to install: {failed_packages}")
    print("Continuing anyway - some packages might work...")

# Optional accelerators: a failed install is reported and otherwise ignored.
print("\nInstalling optional dependencies...")
optional_packages = [
    "flash-attn>=2.0.0",
    "mamba-ssm>=1.0.0",
    "causal-conv1d>=1.0.0",
]

for package in optional_packages:
    print(f"Attempting to install {package}...")
    optional_ok = install_package(package)
    if not optional_ok:
        print(f"  -> {package} installation failed (optional - continuing)")

# Install the project itself (editable install of the current directory).
print("\nInstalling Zonos package...")
try:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-e", ".", "--quiet"])
    print("‚úì Zonos package installed successfully!")
    # Remember the real outcome so the summary below does not claim success
    # when pip actually failed.
    zonos_status = "Installed"
except subprocess.CalledProcessError as e:
    print(f"‚ùå Failed to install Zonos package: {e}")
    print("Trying alternative installation...")
    # Alternative: add current directory to Python path so `import zonos`
    # can still resolve from the checkout.
    current_dir = os.getcwd()
    if current_dir not in sys.path:
        sys.path.insert(0, current_dir)
    print(f"Added {current_dir} to Python path")
    zonos_status = "pip install failed - using Python path fallback"

print("\n‚úÖ Dependency installation complete!")
print("\nüìù Installation Summary:")
print(f"  - Core packages: {len(packages) - len(failed_packages)}/{len(packages)} successful")
if failed_packages:
    print(f"  - Failed packages: {failed_packages}")
print("  - Optional packages: Attempted (failures are normal)")
print(f"  - Zonos package: {zonos_status}")

In [None]:
#@title 3. Load and run the Zonos model
import sys
import torch
import torchaudio
import os

# The checkout must be importable even if the editable install did not succeed.
if '/content/Zonos' not in sys.path:
    sys.path.insert(0, '/content/Zonos')

try:
    from zonos.model import Zonos, DEFAULT_BACKBONE_CLS
    from zonos.conditioning import make_cond_dict, supported_language_codes
    from zonos.utils import DEFAULT_DEVICE
except ImportError as e:
    print(f"‚ùå Import error: {e}")
    print("Make sure the previous installation steps completed successfully.")
    raise
else:
    print("‚úì Zonos modules imported successfully!")

# Prefer the GPU whenever CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Report the properties of CUDA device 0 when running on the GPU.
if device.type == 'cuda':
    total_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU Memory: {total_memory_gb:.1f} GB")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")

# Checkpoint identifier passed to Zonos.from_pretrained below.
model_name = "Wamp1re-Ai/Zonos-v0.1-transformer"
print(f"Loading model: {model_name}")
print("This may take a few minutes for the first time...")

try:
    model = Zonos.from_pretrained(model_name, device=device)
    # Inference only: disable gradients and switch to eval mode.
    model.requires_grad_(False).eval()
    print("‚úì Model loaded successfully!")

    # List the conditioners the loaded model exposes, when it has any.
    if hasattr(model, 'prefix_conditioner'):
        if hasattr(model.prefix_conditioner, 'conditioners'):
            print("\nAvailable conditioning options:")
            for conditioner in model.prefix_conditioner.conditioners:
                print(f"  - {conditioner.name}")

    # Show supported languages
    print(f"\nSupported languages: {supported_language_codes}")

    print("\nüéâ Setup complete! You can now use the model in the cells below.")

except Exception as e:
    # Print guidance, then re-raise so the failure is not hidden from the user.
    print(f"‚ùå Error loading model: {e}")
    print("\nTroubleshooting tips:")
    for tip in (
        "1. Make sure you have a stable internet connection",
        "2. Check if you have enough GPU/RAM memory",
        "3. Try restarting the runtime and running from the beginning",
    ):
        print(tip)
    raise

In [None]:
#@title 4. Run text-to-speech example

# Import necessary modules for conditional generation
from google.colab import files
import IPython.display as ipd
import numpy as np

#@markdown ### Text and language settings
text = "Hello, this is Zonos text-to-speech model. How can I help you today?" #@param {type:"string"}
language = "en-us" #@param ["en-us", "en-gb", "fr-fr", "es-es", "de-de", "it-it", "ja-jp", "zh-cn"]

#@markdown ### Optional: Upload your own audio for speaker cloning
use_speaker_cloning = False #@param {type:"boolean"}

# Fixed seed used by this basic example; also recorded with the generated audio.
BASIC_SEED = 42

# Optional speaker cloning: build an embedding from an uploaded reference clip.
# `speaker_embedding` stays None when cloning is disabled, nothing is uploaded,
# or the uploaded file cannot be processed.
speaker_embedding = None
if use_speaker_cloning:
    print("Upload a short audio file (5-30 seconds) of the speaker you want to clone:")
    uploaded = files.upload()

    if uploaded:
        speaker_file = list(uploaded.keys())[0]
        print(f"Processing {speaker_file}...")
        try:
            wav, sr = torchaudio.load(speaker_file)
            # Convert to mono if stereo
            if wav.shape[0] > 1:
                wav = wav.mean(0, keepdim=True)
            speaker_embedding = model.make_speaker_embedding(wav, sr)
            speaker_embedding = speaker_embedding.to(device, dtype=torch.bfloat16)
            print(f"‚úì Speaker embedding created from {speaker_file}")
        except Exception as e:
            # Best effort: fall back to generation without a cloned speaker.
            print(f"‚ùå Error processing speaker audio: {e}")
            use_speaker_cloning = False

# Set a random seed for reproducibility
torch.manual_seed(BASIC_SEED)

# Create conditioning dictionary
print("Creating conditioning...")
try:
    # NOTE(review): make_cond_dict may reject language codes that are not in
    # `supported_language_codes` (printed by the model-loading cell) — confirm
    # that every entry of the dropdown above is listed there.
    cond_dict = make_cond_dict(
        text=text,
        language=language,
        speaker=speaker_embedding,
        device=device,
        # Emotion is left unconditional in this basic example, with or without
        # speaker cloning (the original conditional had two identical branches).
        unconditional_keys=["emotion"]
    )

    # Prepare conditioning
    conditioning = model.prepare_conditioning(cond_dict)

    # Generate audio
    print("üéµ Generating audio...")
    print("This may take 30-60 seconds depending on text length...")

    codes = model.generate(
        prefix_conditioning=conditioning,
        max_new_tokens=min(86 * 30, len(text) * 20),  # Adaptive based on text length
        cfg_scale=2.0,
        batch_size=1,
        progress_bar=True
    )

    # Decode the audio
    print("üîä Decoding audio...")
    wav_out = model.autoencoder.decode(codes).cpu().detach()
    sr_out = model.autoencoder.sampling_rate

    # Keep only the first row when a 2-D result has several rows.
    if wav_out.dim() == 2 and wav_out.size(0) > 1:
        wav_out = wav_out[0:1, :]

    wav_numpy = wav_out.squeeze().numpy()

    # Record this result as the most recent generation so the save cell exports
    # it instead of stale audio left over from an earlier advanced run.
    last_generated_audio = (wav_numpy, sr_out, BASIC_SEED)

    print(f"‚úì Audio generated successfully!")
    print(f"Sample rate: {sr_out} Hz, Duration: {len(wav_numpy)/sr_out:.2f} seconds")

    # Display audio player
    ipd.display(ipd.Audio(wav_numpy, rate=sr_out))

except Exception as e:
    # Deliberately not re-raised: print guidance and keep the notebook usable.
    print(f"‚ùå Error during audio generation: {e}")
    print("\nTroubleshooting:")
    print("- Try shorter text (under 100 characters)")
    print("- Check GPU memory usage")
    print("- Restart runtime if needed")

In [None]:
#@title 4.1 Advanced Text-to-Speech Options

# Imported here as well so this cell does not depend on the basic example cell
# having been run first (it previously failed with NameError on `ipd`).
import IPython.display as ipd

#@markdown ### Adjust model parameters for generation

#@markdown #### Text and language
text = "I can speak with different emotions and characteristics. This is an example of advanced text-to-speech synthesis." #@param {type:"string"}
language = "en-us" #@param ["en-us", "en-gb", "fr-fr", "es-es", "de-de", "it-it", "ja-jp", "zh-cn"]

#@markdown #### Emotion controls (0-1 scale)
happiness = 0.7 #@param {type:"slider", min:0, max:1, step:0.05}
sadness = 0.1 #@param {type:"slider", min:0, max:1, step:0.05}
anger = 0.1 #@param {type:"slider", min:0, max:1, step:0.05}
fear = 0.05 #@param {type:"slider", min:0, max:1, step:0.05}
surprise = 0.05 #@param {type:"slider", min:0, max:1, step:0.05}
disgust = 0.05 #@param {type:"slider", min:0, max:1, step:0.05}
other = 0.1 #@param {type:"slider", min:0, max:1, step:0.05}
neutral = 0.3 #@param {type:"slider", min:0, max:1, step:0.05}

#@markdown #### Voice characteristics  
speaking_rate = 15.0 #@param {type:"slider", min:5, max:30, step:0.5}
pitch_std = 45.0 #@param {type:"slider", min:0, max:300, step:5}
fmax = 24000 #@param {type:"slider", min:8000, max:24000, step:1000}
vq_score = 0.78 #@param {type:"slider", min:0.5, max:0.8, step:0.01}
dnsmos_ovrl = 4.0 #@param {type:"slider", min:1, max:5, step:0.1}

#@markdown #### Generation settings
cfg_scale = 2.0 #@param {type:"slider", min:1, max:5, step:0.1}
randomize_seed = True #@param {type:"boolean"}
seed = 42 #@param {type:"integer"}
max_length_multiplier = 20 #@param {type:"slider", min:10, max:50, step:5}

# Seed handling: use the form value, or draw a fresh seed and report it so the
# run can be reproduced later.
if randomize_seed:
    used_seed = torch.randint(0, 2**32 - 1, (1,)).item()
else:
    used_seed = seed
torch.manual_seed(used_seed)

print(f"Using seed: {used_seed}")

# Validate emotion values sum (should be close to 1.0 for best results)
emotion_sum = happiness + sadness + anger + fear + surprise + disgust + other + neutral
if emotion_sum > 1.2 or emotion_sum < 0.8:
    print(f"‚ö†Ô∏è Warning: Emotion values sum to {emotion_sum:.2f}, consider adjusting for better results")

try:
    # Emotion vector in the order the labels below state (note: not the same
    # order as the sliders above).
    emotion_tensor = torch.tensor([
        float(happiness),  # Happiness
        float(sadness),    # Sadness  
        float(disgust),    # Disgust
        float(fear),       # Fear
        float(surprise),   # Surprise
        float(anger),      # Anger
        float(other),      # Other
        float(neutral)     # Neutral
    ], device=device)

    # Create VQ score tensor (8 values for 8 codebooks)
    vq_tensor = torch.tensor([float(vq_score)] * 8, device=device).unsqueeze(0)

    # Reuse a speaker embedding left behind by the basic example cell, if any.
    cloned_speaker = globals().get('speaker_embedding')

    # NOTE(review): without a speaker embedding, "emotion" is passed as an
    # unconditional key, which presumably means the emotion sliders above are
    # not applied in that case — confirm against make_cond_dict.
    unconditional_keys = ["emotion"] if cloned_speaker is None else []

    # Create conditioning dictionary with more parameters
    print("Creating advanced conditioning...")
    cond_dict = make_cond_dict(
        text=text,
        language=language,
        speaker=cloned_speaker,
        emotion=emotion_tensor,
        speaking_rate=speaking_rate,
        pitch_std=pitch_std,
        fmax=fmax,
        vqscore_8=vq_tensor,
        dnsmos_ovrl=dnsmos_ovrl,
        device=device,
        unconditional_keys=unconditional_keys
    )

    # Prepare conditioning
    conditioning = model.prepare_conditioning(cond_dict)

    # Token budget grows with the text length and is capped at 86 * 30.
    estimated_tokens = min(86 * 30, len(text) * max_length_multiplier)

    # Generate audio
    print(f"üéµ Generating audio with advanced settings...")
    print(f"Text length: {len(text)} chars, Estimated tokens: {estimated_tokens}")

    codes = model.generate(
        prefix_conditioning=conditioning,
        max_new_tokens=estimated_tokens,
        cfg_scale=cfg_scale,
        batch_size=1,
        progress_bar=True,
        sampling_params=dict(min_p=0.1, top_k=0, top_p=0.0)  # Use min_p sampling
    )

    # Decode the audio
    print("üîä Decoding audio...")
    wav_out = model.autoencoder.decode(codes).cpu().detach()
    sr_out = model.autoencoder.sampling_rate
    # Keep only the first row when a 2-D result has several rows.
    if wav_out.dim() == 2 and wav_out.size(0) > 1:
        wav_out = wav_out[0:1, :]

    wav_numpy = wav_out.squeeze().numpy()
    duration = len(wav_numpy) / sr_out

    # Store for the save cell before displaying, so a display problem cannot
    # lose the generated audio.
    last_generated_audio = (wav_numpy, sr_out, used_seed)

    print(f"‚úì Advanced audio generated successfully!")
    print(f"Sample rate: {sr_out} Hz, Duration: {duration:.2f} seconds")
    print(f"Settings used: CFG={cfg_scale}, Emotions=[H:{happiness}, S:{sadness}, A:{anger}, N:{neutral}]")
    ipd.display(ipd.Audio(wav_numpy, rate=sr_out))

except Exception as e:
    # Deliberately not re-raised: print guidance and keep the notebook usable.
    print(f"‚ùå Error during advanced audio generation: {e}")
    print("\nTroubleshooting:")
    print("- Try simpler emotion settings (closer to default values)")
    print("- Reduce text length")
    print("- Lower CFG scale (try 1.5-2.0)")
    print("- Check GPU memory usage")

In [None]:
#@title 4.2 Save Generated Audio

#@markdown ### Save and download the generated audio
import os
import scipy.io.wavfile
from datetime import datetime

#@markdown Choose what to save
save_last_generated = True #@param {type:"boolean"}
filename_prefix = "zonos_audio" #@param {type:"string"}
include_timestamp = True #@param {type:"boolean"}
include_settings = True #@param {type:"boolean"}
# NOTE(review): `save_last_generated` is not consulted anywhere in this cell;
# its intended effect is unclear, so behavior is left unchanged — confirm.


def build_audio_filename(prefix, timestamp="", settings_suffix=""):
    """Compose the output ``.wav`` file name.

    Args:
        prefix: Base name of the file.
        timestamp: Optional timestamp text; when non-empty it is appended to
            the prefix after an underscore.
        settings_suffix: Optional suffix appended verbatim (the caller passes
            it with its own leading underscore, e.g. ``"_seed42"``).

    Returns:
        The file name, always ending in ``.wav``.
    """
    stamp_part = f"_{timestamp}" if timestamp else ""
    return f"{prefix}{stamp_part}{settings_suffix}.wav"


def download_from_colab(path):
    """Start a browser download of *path* through ``google.colab.files``.

    The import is local so that every branch that downloads has `files`
    available, regardless of which earlier cells were run.
    """
    from google.colab import files
    files.download(path)


# Check if we have generated audio from previous cells
try:
    if 'last_generated_audio' in globals():
        wav_numpy, sr_out, used_seed = last_generated_audio

        # Create filename
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") if include_timestamp else ""
        settings_suffix = f"_seed{used_seed}" if include_settings else ""
        filename = build_audio_filename(filename_prefix, timestamp, settings_suffix)

        # Save the audio file
        print(f"üíæ Saving audio as: {filename}")
        scipy.io.wavfile.write(filename, sr_out, wav_numpy)

        # Show file info
        file_size = os.path.getsize(filename) / (1024 * 1024)  # MB
        duration = len(wav_numpy) / sr_out
        print(f"‚úì Audio saved successfully!")
        print(f"  File: {filename}")
        print(f"  Size: {file_size:.2f} MB")
        print(f"  Duration: {duration:.2f} seconds")
        print(f"  Sample rate: {sr_out} Hz")

        # Provide download link
        print("\nüì• Starting download...")
        download_from_colab(filename)

        print("üéâ Audio file ready for download!")

    elif 'wav_numpy' in globals() and 'sr_out' in globals():
        # Fallback to basic variables if available
        filename = f"{filename_prefix}_basic.wav"
        scipy.io.wavfile.write(filename, sr_out, wav_numpy)
        print(f"‚úì Audio saved as {filename}")
        # Previously this branch relied on `files` having been imported by
        # another cell and failed with NameError when it had not been.
        download_from_colab(filename)

    else:
        print("‚ùå No audio has been generated yet.")
        print("Run one of the audio generation cells above first.")

except Exception as e:
    # Deliberately not re-raised: report the problem and keep the notebook usable.
    print(f"‚ùå Error saving audio: {e}")
    print("Make sure audio generation completed successfully in previous cells.")

---
**Note:**
- Adjust the import paths and model usage according to your codebase.
- If you encounter issues with dependencies, check the `pyproject.toml` or manually install missing packages.

## Additional Notes and Troubleshooting

### ✅ What This Notebook Does
- Automatically installs all required dependencies including system packages
- Downloads the Zonos transformer model from HuggingFace (2-3 GB)
- Provides both simple and advanced text-to-speech generation
- Supports speaker cloning with uploaded audio files
- Includes a full Gradio web interface for interactive use
- Handles error checking and provides helpful feedback

### 🎯 Performance Tips
- **GPU Runtime**: Use GPU runtime for best performance (Runtime → Change runtime type → Hardware accelerator → GPU)
- **Text Length**: Shorter texts (under 200 characters) generate faster
- **Memory**: The model uses ~2-4 GB of GPU memory when loaded
- **First Run**: Initial model download may take 5-10 minutes depending on connection

### 🔧 Common Issues and Solutions

**Model Loading Fails:**
- Check internet connection stability
- Ensure sufficient GPU/RAM memory (restart runtime if needed)
- Try the transformer model if hybrid fails

**Audio Generation Errors:**
- Reduce text length (try under 100 characters)
- Lower CFG scale (try 1.5 instead of 2.0)
- Simplify emotion settings (use defaults)
- Check GPU memory usage

**Import Errors:**
- Restart runtime and run all cells from the beginning
- Check that installation completed without errors
- Ensure you're using a GPU runtime

**Speaker Cloning Issues:**
- Use audio files 5-30 seconds long
- Ensure audio is clear and contains speech
- Supported formats: WAV, MP3, FLAC
- Try mono audio if stereo doesn't work

### 📚 Model Information
- **Model**: Wamp1re-Ai/Zonos-v0.1-transformer
- **Languages**: English, Japanese, Chinese, French, German
- **Sample Rate**: 44.1 kHz
- **Architecture**: Transformer-based with DAC autoencoder
- **Training Data**: 200k+ hours of multilingual speech

### 🌐 Using Custom Subdomains
When running the Gradio interface, you can optionally use a custom Cloudflare subdomain:
1. Set `use_custom_subdomain = True` in the Gradio cell
2. Choose a unique subdomain name
3. Your interface will be available at `https://your-name.gradio.app`

### 💡 Advanced Usage
For production use or custom applications, consider:
- Using the hybrid model for better quality (requires mamba-ssm)
- Implementing custom conditioning parameters
- Fine-tuning for specific voices or languages
- Using the API programmatically

### 🔗 Useful Links
- [Zonos GitHub Repository](https://github.com/Wamp1re-Ai/Zonos)
- [Model on HuggingFace](https://huggingface.co/Wamp1re-Ai/Zonos-v0.1-transformer)
- [Zyphra Blog Post](https://www.zyphra.com/post/beta-release-of-zonos-v0-1)
- [Online Playground](https://playground.zyphra.com/audio)

## Using Cloudflare Subdomain for Gradio Interface

When running the Gradio interface, you can use a custom Cloudflare subdomain to make your interface accessible via a consistent URL. This is especially useful for sharing your model with others.

**To use a custom subdomain:**

1. Set the `GRADIO_SUBDOMAIN` environment variable to your desired subdomain name.
2. Set `GRADIO_SHARE=True` to enable sharing.

For example:

In [None]:
#@title 5. Run Gradio Interface (Interactive Web UI)
import os
import subprocess
import sys
import threading
import time

#@markdown ### Gradio Interface Settings
subdomain_name = "my-zonos-app" #@param {type:"string"}
use_custom_subdomain = False #@param {type:"boolean"}
share_publicly = True #@param {type:"boolean"}


def print_gradio_troubleshooting():
    """Print the troubleshooting hints shown when the interface fails."""
    print("\nTroubleshooting:")
    print("- Make sure all dependencies are installed correctly")
    print("- Check that the model loaded successfully in previous cells")
    print("- Try restarting the runtime if needed")


# Pass the settings to the interface script through environment variables,
# which the child process started below inherits.
if use_custom_subdomain and subdomain_name:
    os.environ["GRADIO_SUBDOMAIN"] = subdomain_name
    print(f"üåê Will attempt to use subdomain: {subdomain_name}.gradio.app")
else:
    # Remove subdomain if previously set
    os.environ.pop("GRADIO_SUBDOMAIN", None)

os.environ["GRADIO_SHARE"] = "True" if share_publicly else "False"

print("üöÄ Starting Gradio interface...")
print("This may take a moment to initialize...")

# Check if gradio_interface.py exists
if not os.path.exists("gradio_interface.py"):
    print("‚ùå gradio_interface.py not found!")
    print("Make sure you're in the correct directory and the file exists.")
else:
    try:
        print("üì± Launching Gradio interface...")
        print("Click on the public URL below to access the web interface")
        print("‚ö†Ô∏è Note: The interface will run until you stop this cell\n")

        # The script runs in a separate Python process, so it does not share
        # variables (such as a model loaded earlier) with this notebook kernel.
        # This call blocks until that process exits.
        result = subprocess.run([sys.executable, "gradio_interface.py"])

        # subprocess.run does not raise when the script fails, so a crash would
        # otherwise go completely unreported; check the exit status explicitly.
        if result.returncode != 0:
            print(f"‚ùå Gradio interface exited with code {result.returncode}")
            print_gradio_troubleshooting()

    except KeyboardInterrupt:
        print("\nüõë Gradio interface stopped by user")
    except Exception as e:
        print(f"‚ùå Error running Gradio interface: {e}")
        print_gradio_troubleshooting()

In [None]:
#@title üß™ Quick Test - Verify Everything Works
#@markdown Run this cell to verify that Zonos is properly installed and working

print("Running comprehensive test of Zonos installation...")
print("This will check dependencies and try to load the model.")
print()

exec(open('colab_quick_test.py').read())