# 🎙️ VoiceForge - Qwen3-TTS Voice Cloning Server

This notebook sets up a voice cloning server using the **Qwen3-TTS-12Hz-0.6B-Base** model.

## Instructions:
1. Run all cells in order
2. Add your ngrok auth token when prompted
3. Copy the public URL and add it to your backend `.env` file

**Important:** Make sure a GPU runtime is enabled (Runtime → Change runtime type → GPU)

In [None]:
# ============================================
# Cell 1: Install Dependencies (T4 GPU Compatible)
# ============================================
# Install PyTorch with CUDA support
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

# Install the official qwen-tts package (includes all required dependencies)
!pip install -q qwen-tts

# NOTE: FlashAttention is NOT compatible with T4 GPU (Turing architecture)
# T4 = Compute 7.5, FlashAttention requires Ampere (8.0+)
# We will use standard PyTorch attention instead

# Install FastAPI dependencies
!pip install -q fastapi uvicorn python-multipart pyngrok nest-asyncio

print("‚úÖ Dependencies installed successfully!")
print("‚ö†Ô∏è  T4 GPU Detected - Using standard attention (FlashAttention disabled)")
print("   This is normal and expected for Turing GPUs")

In [None]:
# ============================================
# Cell 2: Check GPU Availability & Architecture
# ============================================
import torch

# Report the CUDA device (name, memory, compute capability) and say whether
# FlashAttention can be used on it; otherwise tell the user to enable a GPU.
if not torch.cuda.is_available():
    print("‚ùå No GPU detected! Please enable GPU runtime.")
    print("   Go to: Runtime ‚Üí Change runtime type ‚Üí GPU")
else:
    gpu_name = torch.cuda.get_device_name(0)
    # total_memory is divided by 1e9 to report decimal gigabytes.
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    compute_capability = torch.cuda.get_device_capability(0)
    cc_major = compute_capability[0]
    cc_minor = compute_capability[1]
    # Fold (major, minor) into one number, e.g. (7, 5) -> 7.5.
    compute_version = cc_major + cc_minor / 10

    print(f"‚úÖ GPU Available: {gpu_name}")
    print(f"   Memory: {gpu_memory:.2f} GB")
    print(f"   Compute Capability: {cc_major}.{cc_minor}")

    # FlashAttention needs compute capability 8.0 or higher.
    flash_supported = compute_version >= 8.0
    if flash_supported:
        print(f"\n‚úÖ FlashAttention supported (Ampere or newer)")
    else:
        print(f"\n‚ö†Ô∏è  FlashAttention NOT supported (requires 8.0+, you have {compute_version})")
        print("   Using standard PyTorch attention - this is fine!")

In [None]:
# ============================================
# Cell 3: Load Qwen3-TTS Model (T4 GPU Compatible)
# ============================================
import torch
from qwen_tts import Qwen3TTSModel
import soundfile as sf
import numpy as np

import importlib.util

print("Loading Qwen3-TTS model... This may take a few minutes.")

MODEL_ID = "Qwen/Qwen3-TTS-12Hz-0.6B-Base"
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
# NOTE(review): bfloat16 is selected on every CUDA device, including
# pre-Ampere GPUs such as the T4 - confirm speed/accuracy is acceptable there.
DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32

# Choose the attention implementation. FlashAttention needs BOTH an Ampere or
# newer GPU (compute capability 8.0+; the T4 is Turing, 7.5) AND the flash_attn
# package to be importable. The install cell never installs flash-attn, so
# checking the GPU alone would request "flash_attention_2" on an Ampere GPU
# and make the model load fail.
if torch.cuda.is_available():
    compute_capability = torch.cuda.get_device_capability(0)
    compute_version = compute_capability[0] + compute_capability[1] / 10
    flash_capable_gpu = compute_version >= 8.0
    flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
    use_flash = flash_capable_gpu and flash_attn_installed
    if flash_capable_gpu and not flash_attn_installed:
        print("flash-attn package not installed - falling back to standard attention")
else:
    use_flash = False

ATTN_IMPL = "flash_attention_2" if use_flash else "eager"
print(f"Using attention: {ATTN_IMPL}")

try:
    # Load through Qwen3TTSModel from the qwen-tts package.
    model = Qwen3TTSModel.from_pretrained(
        MODEL_ID,
        device_map=DEVICE,
        dtype=DTYPE,
        attn_implementation=ATTN_IMPL,  # "eager" for T4, "flash_attention_2" for Ampere or newer (e.g. A100)
    )

    print(f"‚úÖ Model loaded successfully on {DEVICE}!")
    print(f"   Model dtype: {model.dtype if hasattr(model, 'dtype') else DTYPE}")
    print(f"   Attention: {ATTN_IMPL}")
    print(f"   Model type: {type(model).__name__}")

except Exception as e:
    # Best-effort load: report the failure and leave `model` as None so that
    # later cells can detect the missing model instead of crashing here.
    print(f"‚ùå Failed to load model: {str(e)}")
    print("\nüí° Troubleshooting:")
    print("   1. Ensure you ran Cell 1 successfully")
    print("   2. Try restarting the runtime")
    print("   3. Check GPU availability in Cell 2")
    model = None

In [None]:
# ============================================
# Cell 4: Voice Cloning Function (CORRECT METHOD)
# ============================================
import torch
import numpy as np
import soundfile as sf
import io

# Nominal sample rate (presumably Hz). clone_voice does not read this value:
# it writes audio at the rate returned by model.generate_voice_clone().
SAMPLE_RATE = 24000

@torch.no_grad()
def clone_voice(text: str, reference_audio_bytes: bytes) -> bytes:
    """Generate speech for *text* in the voice of the reference audio.

    The reference audio is written to a temporary ``.wav`` file, passed to
    ``model.generate_voice_clone`` in speaker-embedding-only mode, and the
    first returned waveform is encoded as WAV.

    Args:
        text: Text to synthesize.
        reference_audio_bytes: Raw bytes of the reference audio file.

    Returns:
        WAV-encoded bytes of the generated speech. If anything fails during
        generation or encoding, the error is printed and
        ``reference_audio_bytes`` is returned unchanged as a fallback.

    Raises:
        RuntimeError: If the global ``model`` is ``None`` (not loaded).
    """
    import os
    import tempfile

    if model is None:
        raise RuntimeError("Model not loaded! Please run Cell 3 first.")

    temp_audio_path = None
    try:
        # Save reference audio to a temporary file. delete=False keeps it on
        # disk after the handle closes so the model can open it by path.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_audio:
            # Record the path before writing so cleanup runs even if the
            # write itself fails.
            temp_audio_path = temp_audio.name
            temp_audio.write(reference_audio_bytes)

        # Placeholder transcript. The call below sets x_vector_only_mode=True,
        # which uses only the speaker embedding (no ref_text needed).
        ref_text = "This is a sample reference audio."

        wavs, sr = model.generate_voice_clone(
            text=text,
            language="Auto",  # Auto-detect language
            ref_audio=temp_audio_path,
            ref_text=ref_text,
            x_vector_only_mode=True,  # Use only speaker embedding (no ref_text needed)
        )

        # Encode the first waveform as WAV at the rate the model returned.
        audio_buffer = io.BytesIO()
        sf.write(audio_buffer, wavs[0], sr, format='WAV')
        return audio_buffer.getvalue()

    except Exception as e:
        print(f"‚ö†Ô∏è Voice cloning error: {str(e)}")
        # Fallback: return the reference audio
        print("   Returning reference audio as fallback")
        return reference_audio_bytes

    finally:
        # Runs on success AND failure, so the temp file is never leaked and
        # cached GPU memory is released after a failed generation too.
        if temp_audio_path is not None:
            try:
                os.unlink(temp_audio_path)
            except OSError as cleanup_error:
                print(f"Could not remove temp file {temp_audio_path}: {cleanup_error}")
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# Confirmation banner: printed unconditionally once this cell has run.
print("‚úÖ Voice cloning function ready!")
print("   Using: generate_voice_clone() from Qwen3TTSModel")

In [None]:
# ============================================
# Cell 5: Setup ngrok
# ============================================
from pyngrok import ngrok, conf
import getpass

print("üîê Enter your ngrok auth token")
print("   Get it from: https://dashboard.ngrok.com/get-started/your-authtoken")
print()

# getpass keeps the token out of the notebook output. Surrounding whitespace
# is stripped so that a whitespace-only entry is treated as missing and a
# padded token is not handed to ngrok verbatim.
NGROK_TOKEN = getpass.getpass("ngrok auth token: ").strip()

if NGROK_TOKEN:
    ngrok.set_auth_token(NGROK_TOKEN)
    print("‚úÖ ngrok auth token configured!")
else:
    print("‚ùå No token provided. ngrok may not work properly.")

In [None]:
# ============================================
# Cell 6: Create FastAPI Server
# ============================================
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import Response
import nest_asyncio
import uvicorn
from threading import Thread

# Patch asyncio so the server's event loop can run inside the notebook's
# already-running loop.
nest_asyncio.apply()

# Create FastAPI app
app = FastAPI(
    title="VoiceForge Colab Server",
    description="Voice cloning API powered by Qwen3-TTS",
    version="1.0.0"
)

# Enable CORS for any origin. Credentials are disabled because a wildcard
# origin cannot be combined with allow_credentials=True: the FastAPI CORS
# docs require explicit origins/methods/headers in that case, and this API
# uses neither cookies nor auth headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.get("/")
async def root():
    """Return a short service descriptor, including whether CUDA is available."""
    cuda_available = torch.cuda.is_available()
    service_info = {
        "service": "VoiceForge Colab Server",
        "status": "running",
        "model": "Qwen3-TTS-12Hz-0.6B-Base",
        "gpu": cuda_available,
    }
    return service_info

@app.get("/health")
async def health():
    """Liveness probe: always answers with a fixed "healthy" status payload."""
    payload = dict(status="healthy")
    return payload

@app.post("/clone")
async def clone_endpoint(
    text: str = Form(..., min_length=1, max_length=1000),
    reference_audio: UploadFile = File(...)
):
    """Synthesize *text* in the voice of the uploaded reference audio.

    Args:
        text: Form field, 1-1000 characters, to be spoken.
        reference_audio: Uploaded reference audio file (at most 10 MB).

    Returns:
        An ``audio/wav`` attachment response containing whatever
        ``clone_voice`` returns for the given text and audio.

    Raises:
        HTTPException: 400 if the upload is empty or larger than 10 MB;
            500 for any other failure, with the error text as detail.
    """
    try:
        # Read audio file
        audio_bytes = await reference_audio.read()

        if not audio_bytes:
            raise HTTPException(status_code=400, detail="Reference audio is empty")

        if len(audio_bytes) > 10 * 1024 * 1024:
            raise HTTPException(status_code=400, detail="Audio file too large (max 10MB)")

        print(f"üìù Processing: '{text[:50]}...'")
        print(f"üéµ Reference audio: {reference_audio.filename} ({len(audio_bytes)} bytes)")

        # Generate cloned voice
        generated_audio = clone_voice(text, audio_bytes)

        print("‚úÖ Voice generation complete!")

        return Response(
            content=generated_audio,
            media_type="audio/wav",
            headers={
                "Content-Disposition": "attachment; filename=cloned_voice.wav"
            }
        )

    except HTTPException:
        # Validation errors raised above must keep their own status code
        # instead of being rewrapped as a 500 by the generic handler below.
        raise
    except Exception as e:
        print(f"‚ùå Error: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e)) from e

# Confirmation banner: printed once the app and its routes have been defined.
print("‚úÖ FastAPI server configured!")

In [None]:
# ============================================
# Cell 7: Start Server with ngrok
# ============================================
from pyngrok import ngrok
import uvicorn
import asyncio
import socket

# Preferred local port for the API server. It is reassigned further down from
# get_available_port(PORT), which picks another port if this one cannot be bound.
PORT = 8080

# Kill any existing ngrok tunnels (presumably left over from an earlier run
# of this cell) before a new tunnel is opened.
ngrok.kill()

# Find an available port if 8080 is in use
def get_available_port(preferred_port):
    """Return *preferred_port* if it can be bound, else an OS-assigned free port.

    Each probe uses its own socket inside a ``with`` block, so the socket is
    closed on every path, including when the fallback bind itself fails.

    Args:
        preferred_port: TCP port number to try first on all interfaces.

    Returns:
        ``preferred_port`` when binding it succeeds; otherwise a port chosen
        by the operating system (bind to port 0).

    Raises:
        OSError: If the fallback bind also fails.

    Note:
        The probe socket is closed before returning, so another process may
        still claim the port before the caller binds it.
    """
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
            probe.bind(('0.0.0.0', preferred_port))
        return preferred_port
    except OSError:
        # Preferred port could not be bound: let the OS pick one (port 0).
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as fallback:
            fallback.bind(('0.0.0.0', 0))
            return fallback.getsockname()[1]

PORT = get_available_port(PORT)
print(f"Using port: {PORT}")

# Start ngrok tunnel
tunnel = ngrok.connect(PORT, "http")
public_url = tunnel.public_url  # Extract the actual URL string

print("\n" + "="*60)
print("üöÄ VoiceForge Colab Server is running!")
print("="*60)
print(f"\nüì° Public URL: {public_url}")
print(f"\nüëÜ Copy this URL and add it to your backend .env file:")
print(f"   COLAB_URL={public_url}")
print("\n" + "="*60)
print("\n‚ö†Ô∏è  Keep this notebook running!")
print("    The server will stop when the notebook disconnects.")
print("\nüí° Tips:")
print(f"    - Test the server: GET {public_url}/")
print(f"    - Health check: GET {public_url}/health")
print(f"    - Clone voice: POST {public_url}/clone")
print("="*60 + "\n")

# Run server (Colab-compatible method)
config = uvicorn.Config(app, host="0.0.0.0", port=PORT, log_level="info")
server = uvicorn.Server(config)
await server.serve()