# 🎤 Enhanced Voice Cloning with Zonos TTS

This interactive notebook provides an easy-to-use interface to the enhanced voice cloning system, which fixes common issues such as:
- ❌ Long pauses and unnatural timing
- ❌ Speed variations (fast/slow speech)
- ❌ Gibberish generation
- ❌ Inconsistent voice characteristics

## ✅ Enhanced Features:
- 🔧 **Advanced Audio Preprocessing**: Automatic silence removal and normalization
- 📊 **Voice Quality Analysis**: SNR estimation, quality scoring
- ⚙️ **Optimized Parameters**: Conservative sampling, better timing control
- 🎯 **Adaptive Settings**: Parameters adjust based on voice quality
- 🔄 **Reproducible Results**: Seed support for consistent generation

## 📦 Setup and Installation

Run the next two cells first: the first installs any missing dependencies, and the second imports the required modules:

In [None]:
# Install required packages if needed
import subprocess
import sys

def install_package(package):
    """Ensure ``package`` is importable, installing it with pip if it is not.

    Args:
        package: Name used both as the import name and as the pip
            requirement, so it must be valid for both.

    Raises:
        subprocess.CalledProcessError: If the pip command exits with a
            non-zero status.
    """
    try:
        __import__(package)
    except ImportError:
        import importlib

        print(f"Installing {package}...")
        # Run pip through the current interpreter so the install targets the
        # same Python that is executing this code.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        # The import system caches finder state; refresh it so the freshly
        # installed package can be imported later in this same session.
        importlib.invalidate_caches()

# Check and install required packages.
# Each entry is handed to install_package(), which uses the same string as the
# import name and as the pip requirement.
packages = ['torch', 'torchaudio', 'IPython', 'ipywidgets']
for package in packages:
    install_package(package)

# Reached only when none of the install_package() calls raised.
print("✅ All packages installed successfully!")

In [None]:
# Import all required modules
import torch
import torchaudio
import os
import warnings
import time
from typing import Optional, Dict, Any, Tuple
from IPython.display import Audio, display, HTML, clear_output
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual

# Load the project-local voice cloning helpers. They are treated as optional:
# if any of them cannot be imported, ENHANCED_AVAILABLE is set to False and a
# hint is printed instead of letting the ImportError propagate.
try:
    from enhanced_voice_cloning import (
        EnhancedVoiceCloner,
        create_enhanced_voice_cloner,
        quick_voice_clone,
    )
    from zonos.speaker_cloning import (
        preprocess_audio_for_cloning,
        analyze_voice_quality,
        get_voice_cloning_conditioning_params,
        get_voice_cloning_sampling_params,
    )
    from zonos.utils import DEFAULT_DEVICE
except ImportError as e:
    ENHANCED_AVAILABLE = False
    print(f"❌ Enhanced modules not found: {e}")
    print("Please ensure all enhanced voice cloning files are in the correct directory.")
else:
    # Every import above succeeded.
    ENHANCED_AVAILABLE = True
    print("🚀 Enhanced Voice Cloning modules loaded successfully!")

# Choose the compute device: the project's DEFAULT_DEVICE when the enhanced
# modules were imported, otherwise CUDA if torch reports it, otherwise the CPU.
# DEFAULT_DEVICE is only read on the branch where its import succeeded.
if ENHANCED_AVAILABLE:
    device = DEFAULT_DEVICE
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"🖥️ Using device: {device}")

## 🎯 Quick Start: One-Click Voice Cloning

The easiest way to clone a voice and generate speech:

In [None]:
# Quick voice cloning function
def quick_clone_interface():
    """Build and display the one-click voice cloning form.

    The form collects the text to speak, the path of the reference voice
    recording, a language code, a seed and an output filename. Pressing the
    button validates those values, passes them to ``quick_voice_clone`` and
    prints the reported metrics followed by an audio player for the result.

    Prints a notice and returns without building any widgets when
    ``ENHANCED_AVAILABLE`` is false.
    """
    if not ENHANCED_AVAILABLE:
        print("❌ Enhanced voice cloning not available. Please check installation.")
        return

    # Create interactive widgets
    text_input = widgets.Textarea(
        value="Hello! This is an enhanced voice cloning demonstration. The new system provides much better consistency and naturalness with no more gibberish or unnatural pauses.",
        description="Text to speak:",
        layout=widgets.Layout(width='100%', height='100px')
    )

    audio_path = widgets.Text(
        value="assets/exampleaudio.mp3",
        description="Voice audio path:",
        layout=widgets.Layout(width='100%')
    )

    language = widgets.Dropdown(
        options=['en-us', 'en-gb', 'fr-fr', 'es-es', 'de-de', 'it-it', 'ja-jp', 'zh-cn'],
        value='en-us',
        description='Language:'
    )

    seed = widgets.IntText(
        value=42,
        description='Seed (for reproducibility):'
    )

    output_name = widgets.Text(
        value="quick_clone_output.wav",
        description="Output filename:"
    )

    generate_button = widgets.Button(
        description="🎤 Generate Voice Clone",
        button_style='success',
        layout=widgets.Layout(width='200px', height='40px')
    )

    output_area = widgets.Output()

    def on_generate_click(b):
        """Validate the form, run the clone and render the outcome."""
        with output_area:
            clear_output(wait=True)

            # Pasted paths often carry stray surrounding whitespace.
            voice_path = audio_path.value.strip()
            target_path = output_name.value.strip()

            # Reject unusable input before starting generation.
            if not text_input.value.strip():
                print("❌ Please enter some text to speak.")
                return

            if not target_path:
                print("❌ Please enter an output filename.")
                return

            # isfile (not exists) so that a directory is rejected as well.
            if not os.path.isfile(voice_path):
                print(f"❌ Audio file not found: {voice_path}")
                return

            # Disable the button while generating so that repeated clicks do
            # not queue additional runs; it is re-enabled in ``finally``.
            generate_button.disabled = True
            try:
                print("🚀 Starting enhanced voice cloning...")
                start_time = time.time()

                result = quick_voice_clone(
                    text=text_input.value,
                    voice_audio_path=voice_path,
                    output_path=target_path,
                    language=language.value,
                    seed=seed.value
                )

                generation_time = time.time() - start_time

                print(f"✅ Voice cloning completed in {generation_time:.2f} seconds!")
                print(f"📁 Output saved: {result['output_path']}")
                print(f"⏱️ Duration: {result['duration']:.2f} seconds")
                print(f"📊 Quality Score: {result['quality_metrics']['quality_score']:.3f}")
                print(f"📊 SNR Estimate: {result['quality_metrics']['snr_estimate']:.1f} dB")

                # Display audio player
                print("\n🔊 Generated Audio:")
                display(Audio(result['output_path']))

            except Exception as e:
                # UI boundary: report the failure inside the output area
                # rather than letting it escape the widget callback.
                print(f"❌ Error during generation: {e}")
                import traceback
                traceback.print_exc()
            finally:
                generate_button.disabled = False

    generate_button.on_click(on_generate_click)

    # Display interface
    display(widgets.VBox([
        widgets.HTML("<h3>🎤 Quick Voice Cloning Interface</h3>"),
        text_input,
        audio_path,
        widgets.HBox([language, seed]),
        output_name,
        generate_button,
        output_area
    ]))

# Run the quick clone interface: displays the form below this cell, or prints
# a notice instead when ENHANCED_AVAILABLE is false.
quick_clone_interface()