In [23]:
# @title 1. Install Dependencies & Setup
import base64
import getpass
import io
import json
import os
import pathlib
import struct
import subprocess
import tempfile
import threading
import time

import requests
import soundfile as sf

# Ensure every working folder exists before later cells read or write audio.
# exist_ok=True keeps the cell safe to re-run.
for folder in (
    "audio",
    "output/audio",
    "audio_wav",  # For converted files
):
    os.makedirs(folder, exist_ok=True)

print("‚úÖ Dependencies loaded and folders created.")

‚úÖ Dependencies loaded and folders created.


In [27]:
# @title 2. Configuration
import dashscope
from dashscope.audio.qwen_tts_realtime import QwenTtsRealtime, QwenTtsRealtimeCallback, AudioFormat

# Alibaba Cloud (DashScope) API key.
# The key is deliberately NOT stored in the notebook: notebooks and their saved
# outputs are routinely shared and committed. It is read from the
# DASHSCOPE_API_KEY environment variable, falling back to an interactive prompt.
# NOTE(review): the key that was previously hard-coded here should be treated
# as leaked and rotated.
API_KEY = (os.environ.get("DASHSCOPE_API_KEY") or getpass.getpass("DashScope API key: ")).strip()
if not API_KEY:
    raise RuntimeError(
        "No DashScope API key provided. Set the DASHSCOPE_API_KEY environment "
        "variable or enter the key at the prompt."
    )

# API endpoints
BASE_URL = "https://dashscope-intl.aliyuncs.com/api/v1"
WS_URL = "wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime"

# Set up DashScope (reuse BASE_URL so the SDK and the raw HTTP calls cannot drift apart)
dashscope.api_key = API_KEY
dashscope.base_http_api_url = BASE_URL

# Model for voice cloning
MODEL_VOICE_CLONE = "qwen3-tts-vc-realtime-2026-01-15"

# Input manifest and output layout used by the processing cell
INPUT_JSON_FILE = 'manifest.json'
OUTPUT_DIR = 'output'
OUTPUT_AUDIO_SUBDIR = 'audio'
OUTPUT_TS_FILENAME = 'speakers.ts'

# Fixed sentence synthesized with every cloned voice, in addition to the speaker's own text
FOX_TEXT = "The quick brown fox jumps over the lazy dog. But what about the man who watched over them? He barely noticed."

print(f"‚úÖ Configuration set.")
# Only echo the non-secret prefix so saved cell output never contains key material
print(f"   API Key: {API_KEY[:3]}...****")
print(f"   Voice Clone Model: {MODEL_VOICE_CLONE}")

‚úÖ Configuration set.
   API Key: sk-1188e...****
   Voice Clone Model: qwen3-tts-vc-realtime-2026-01-15


In [28]:
# @title 3. Audio Conversion (webm/m4a/ogg -> WAV)

def convert_to_wav(input_path: str) -> str:
    """
    Convert an audio file to WAV using ffmpeg and return the WAV path.

    Behaviour:
      * A path whose suffix is already ``.wav`` (case-insensitive) is returned
        unchanged, without touching the file system.
      * Otherwise the result is written to ``audio_wav/<stem>.wav`` as
        16 kHz, mono, 16-bit PCM. If that file already exists it is reused.

    The conversion is written to a temporary sibling file and only renamed to
    the final name once ffmpeg succeeds. Without this, a failed or interrupted
    run would leave a partial file at the cache location and every later call
    would return that broken file.

    Args:
        input_path: Path to the source audio file.

    Returns:
        Path to the WAV file, as a string.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    input_path = pathlib.Path(input_path)

    # If already WAV, return as-is
    if input_path.suffix.lower() == '.wav':
        return str(input_path)

    # Output path in audio_wav folder
    output_path = pathlib.Path('audio_wav') / f"{input_path.stem}.wav"

    # Skip if already converted
    if output_path.exists():
        return str(output_path)

    # Do not rely on another cell having created the cache folder
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # ffmpeg infers the container from the extension, so the temp name must end in ".wav"
    partial_path = output_path.with_name(f".{output_path.stem}.partial.wav")

    # Convert using ffmpeg
    cmd = [
        'ffmpeg', '-y', '-i', str(input_path),
        '-ar', '16000',  # 16kHz sample rate
        '-ac', '1',      # Mono
        '-acodec', 'pcm_s16le',  # 16-bit PCM
        str(partial_path)
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode != 0:
            raise RuntimeError(f"ffmpeg failed: {result.stderr}")

        # Publish the finished file under its final name in one step
        partial_path.replace(output_path)
    finally:
        # After a successful rename this is a no-op; after a failure it removes the leftovers
        if partial_path.exists():
            partial_path.unlink()

    return str(output_path)


# Visible confirmation that the cell ran and convert_to_wav is now defined
print("‚úÖ Audio conversion function defined.")

‚úÖ Audio conversion function defined.


In [29]:
# @title 4. Voice Enrollment Function

def create_voice(file_path: str, audio_text: str = None, preferred_name: str = None) -> str:
    """
    Register a reference recording with the voice-enrollment endpoint.

    The recording is first passed through ``convert_to_wav``; the resulting
    file is sent inline as a base64 ``data:`` URI.

    Args:
        file_path: Path to the reference recording.
        audio_text: Optional transcript of the recording; only sent when truthy.
        preferred_name: Name to request for the voice; "clone" is used when falsy.

    Returns:
        The value found at ``output.voice`` in the JSON response.

    Raises:
        FileNotFoundError: If the (converted) WAV file does not exist.
        RuntimeError: If the endpoint answers with a status other than 200.
    """
    # Make sure we are uploading a WAV file
    converted = convert_to_wav(file_path)
    wav_file = pathlib.Path(converted)
    if not wav_file.exists():
        raise FileNotFoundError(f"Audio file not found: {converted}")

    # Inline the audio as a base64 data URI
    encoded_audio = base64.b64encode(wav_file.read_bytes()).decode()

    enrollment = {
        "action": "create",
        "target_model": MODEL_VOICE_CLONE,
        "preferred_name": preferred_name if preferred_name else "clone",
        "audio": {"data": f"data:audio/wav;base64,{encoded_audio}"},
    }
    # The transcript is optional and is only attached when provided
    if audio_text:
        enrollment["text"] = audio_text

    response = requests.post(
        f"{BASE_URL}/services/audio/tts/customization",
        headers={
            "Authorization": f"Bearer {API_KEY}",
            "Content-Type": "application/json",
        },
        json={"model": "qwen-voice-enrollment", "input": enrollment},
        timeout=60,
    )

    if response.status_code == 200:
        return response.json()["output"]["voice"]

    raise RuntimeError(f"Voice enrollment failed: {response.status_code}, {response.text}")


# Visible confirmation that the cell ran and create_voice is now defined
print("‚úÖ create_voice() defined")

‚úÖ create_voice() defined


In [30]:
# @title 5. Speech Synthesis Function (WebSocket)

def pcm_to_wav(pcm: bytes, sample_rate: int = 24000, channels: int = 1, sample_width: int = 2) -> bytes:
    """
    Wrap raw PCM samples in a 44-byte RIFF/WAVE header.

    The defaults (24 kHz, mono, 2 bytes per sample) reproduce the header this
    function previously hard-coded, so existing callers get identical bytes.

    Args:
        pcm: Raw little-endian PCM sample data.
        sample_rate: Samples per second per channel.
        channels: Number of interleaved channels.
        sample_width: Bytes per sample (2 means 16-bit).

    Returns:
        The header followed by ``pcm``.
    """
    size = len(pcm)
    block_align = channels * sample_width   # bytes per frame (one sample for every channel)
    byte_rate = sample_rate * block_align   # bytes of audio per second
    header = struct.pack(
        '<4sI4s4sIHHIIHH4sI',
        b'RIFF', 36 + size,      # RIFF chunk size = remaining header bytes (36) + data
        b'WAVE',
        b'fmt ', 16,             # fmt chunk is 16 bytes long
        1,                       # audio format tag 1 = PCM
        channels,
        sample_rate,
        byte_rate,
        block_align,
        sample_width * 8,        # bits per sample
        b'data', size,
    )
    return header + pcm


class TTSCallback(QwenTtsRealtimeCallback):
    """
    Collects the audio of one realtime TTS response.

    Audio fragments are appended to ``chunks`` as they arrive. ``done`` is set
    as soon as the caller should stop waiting: on completion, on an error, or
    when the connection closes. ``error`` is ``None`` on success and a message
    otherwise.

    A connection that closes before the server reported completion is treated
    as a failure. Previously such a close only set ``done``, so a dropped
    connection was indistinguishable from success and produced truncated audio.
    """

    def __init__(self):
        self.chunks = []                # decoded audio fragments, in arrival order
        self.error = None               # failure description, or None
        self.completed = False          # True once a completion event was received
        self.done = threading.Event()   # signals the waiting caller

    def on_open(self):
        pass

    def on_close(self, *_):
        # Closing after completion is the normal shutdown path; closing earlier means
        # the audio stream was cut short.
        if not self.completed and self.error is None:
            self.error = "Connection closed before synthesis completed"
        self.done.set()

    def on_error(self, e):
        self.error = str(e)
        self.done.set()

    def on_event(self, r):
        t = r.get('type', '')
        if t == 'response.audio.delta':
            # Each delta carries a base64-encoded audio fragment
            self.chunks.append(base64.b64decode(r.get('delta', '')))
        elif t in ('response.done', 'session.finished'):
            self.completed = True
            self.done.set()
        elif t == 'error':
            self.error = r.get('error', {}).get('message', 'Error')
            self.done.set()


def synthesize_speech(text: str, voice: str, timeout: float = 120) -> tuple:
    """
    Synthesize speech via WebSocket using a cloned voice.

    Args:
        text: Text to speak. Must contain at least one non-whitespace character.
        voice: Voice identifier to pass to the session (e.g. from ``create_voice``).
        timeout: Maximum number of seconds to wait for the response to finish.

    Returns:
        ``(audio_data, sample_rate)`` exactly as returned by ``sf.read`` on the
        WAV-wrapped response audio.

    Raises:
        ValueError: If ``text`` is empty or whitespace only.
        RuntimeError: If the service reported an error or returned no audio.
        TimeoutError: If the response did not finish within ``timeout`` seconds.
    """
    if not text or not text.strip():
        raise ValueError("text must be a non-empty string")

    cb = TTSCallback()
    tts = QwenTtsRealtime(model=MODEL_VOICE_CLONE, callback=cb, url=WS_URL)

    tts.connect()
    try:
        tts.update_session(voice=voice, response_format=AudioFormat.PCM_24000HZ_MONO_16BIT, mode='commit')
        tts.append_text(text)
        tts.commit()

        # wait() returns False on timeout; ignoring it would treat a stalled
        # response as success and return whatever partial audio had arrived.
        finished = cb.done.wait(timeout=timeout)

        if cb.error:
            raise RuntimeError(cb.error)
        if not finished:
            raise TimeoutError(f"Speech synthesis did not finish within {timeout} seconds")

        pcm = b''.join(cb.chunks)
        if not pcm:
            # An empty result would otherwise be saved as a valid-looking, silent file
            raise RuntimeError("Speech synthesis returned no audio data")

        tts.finish()
    except Exception:
        # Best-effort cleanup so a failed request does not leave the socket open.
        # NOTE(review): assumes the SDK client exposes close(); any failure here is
        # intentionally ignored so the original error is the one that propagates.
        try:
            tts.close()
        except Exception:
            pass
        raise

    # The session was configured for 24 kHz mono 16-bit PCM, matching pcm_to_wav's defaults
    wav_bytes = pcm_to_wav(pcm)
    audio_io = io.BytesIO(wav_bytes)
    audio_data, sample_rate = sf.read(audio_io)
    return audio_data, sample_rate


# Visible confirmation that the cell ran and synthesize_speech is now defined
print("‚úÖ synthesize_speech() defined")

‚úÖ synthesize_speech() defined


In [31]:
# @title 6. Run Processing

def generate_ts_file(metadata):
    """
    Write ``metadata`` to ``OUTPUT_DIR/OUTPUT_TS_FILENAME`` as a TypeScript export.

    The file contains a single statement assigning the JSON serialization of
    ``metadata`` (2-space indent) to ``speakers``, typed as ``Speaker[]``.
    The ``Speaker`` type itself is not emitted here.
    """
    target = os.path.join(OUTPUT_DIR, OUTPUT_TS_FILENAME)
    serialized = json.dumps(metadata, indent=2)

    with open(target, 'w', encoding='utf-8') as ts_file:
        ts_file.write(f"export const speakers: Speaker[] = {serialized};\n")

    print(f"\nüìÑ Generated: {target}")


def process_samples():
    """
    Clone every speaker listed in the manifest and write the results.

    Reads ``INPUT_JSON_FILE``, which must contain a ``samples`` list whose
    entries provide ``user_id``, ``text`` and ``audio_file``. For each sample
    two clips are synthesized with the speaker's cloned voice (the speaker's
    own text and ``FOX_TEXT``) and saved under
    ``OUTPUT_DIR/OUTPUT_AUDIO_SUBDIR``. Finally the collected metadata is
    written with ``generate_ts_file``.

    Clips that already exist on disk are not regenerated, and a voice is only
    enrolled when at least one clip for that speaker is actually missing. This
    keeps a resumed run from re-enrolling voices for speakers that are already
    finished.

    A failure on one sample is reported and does not stop the run; that
    speaker is still listed in the metadata, with an empty ``models`` list.
    """
    if not os.path.exists(INPUT_JSON_FILE):
        print(f"‚ùå Error: '{INPUT_JSON_FILE}' not found.")
        return

    with open(INPUT_JSON_FILE, 'r', encoding='utf-8') as f:
        data = json.load(f)

    samples = data['samples']
    total = len(samples)

    full_output_audio_path = os.path.join(OUTPUT_DIR, OUTPUT_AUDIO_SUBDIR)
    os.makedirs(full_output_audio_path, exist_ok=True)
    speakers_metadata = []
    enrolled_voices = {}  # Cache: user_id -> voice_id
    failed_users = []     # user_ids whose enrollment or synthesis raised

    print(f"\nüöÄ Processing {total} samples...")
    print(f"   Using model: {MODEL_VOICE_CLONE}\n")

    for i, sample in enumerate(samples):
        user_id = sample['user_id']
        original_text = sample['text']

        # Accept the manifest path as given, or fall back to the bare file name
        # in the current directory.
        json_audio_path = sample['audio_file']
        if os.path.exists(json_audio_path):
            ref_audio_path = json_audio_path
        elif os.path.exists(os.path.basename(json_audio_path)):
            ref_audio_path = os.path.basename(json_audio_path)
        else:
            print(f"‚ö†Ô∏è Skipping {user_id}: Audio not found ({json_audio_path})")
            continue

        print(f"[{i+1}/{total}] üéôÔ∏è Processing {user_id}...")

        speaker_entry = {
            "id": f"spk-{user_id}",
            "name": f"User {user_id}",
            "originalText": original_text,
            "originalAudio": f"/audio/{os.path.basename(ref_audio_path)}",
            "models": []
        }

        try:
            base_name = f"spk-{user_id}-qwen"
            orig_filename = f"{base_name}-original.wav"
            new_filename = f"{base_name}-new.wav"
            out_path_orig = os.path.join(full_output_audio_path, orig_filename)
            out_path_new = os.path.join(full_output_audio_path, new_filename)

            # (destination, display name, text) for every clip that still has to be generated
            pending = [
                job for job in (
                    (out_path_orig, orig_filename, original_text),
                    (out_path_new, new_filename, FOX_TEXT),
                )
                if not os.path.exists(job[0])
            ]

            if pending:
                # Enroll voice if not cached (only reached when something must be synthesized)
                if user_id not in enrolled_voices:
                    print(f"   ‚òÅÔ∏è Enrolling voice...")
                    enrolled_voices[user_id] = create_voice(
                        file_path=ref_audio_path,
                        audio_text=original_text,
                        preferred_name=f"user_{user_id}"
                    )
                    print(f"   ‚úÖ Voice: {enrolled_voices[user_id]}")
                voice_id = enrolled_voices[user_id]

                for out_path, filename, text in pending:
                    audio, sample_rate = synthesize_speech(text, voice_id)
                    sf.write(out_path, audio, sample_rate)
                    print(f"   ‚úÖ {filename}")

            speaker_entry["models"].append({
                "modelId": "qwen",
                "modelName": "Qwen3 Engine",
                "clonedOriginalAudio": f"/{OUTPUT_AUDIO_SUBDIR}/{orig_filename}",
                "clonedOriginalText": original_text,
                "clonedNewAudio": f"/{OUTPUT_AUDIO_SUBDIR}/{new_filename}",
                "clonedNewText": FOX_TEXT
            })

        except Exception as e:
            # Keep going: one bad sample should not abort the whole batch
            failed_users.append(user_id)
            print(f"   ‚ùå Error: {e}")

        speakers_metadata.append(speaker_entry)
        time.sleep(0.3)  # short pause between samples

    generate_ts_file(speakers_metadata)
    if failed_users:
        print(f"\nWARNING: {len(failed_users)} sample(s) failed: {', '.join(map(str, failed_users))}")
    print("\n‚úÖ All processing complete!")

# RUN: execute the full pipeline (reads the manifest, enrolls voices,
# synthesizes clips and writes the TypeScript metadata file)
process_samples()


üöÄ Processing 30 samples...
   Using model: qwen3-tts-vc-realtime-2026-01-15

[1/30] üéôÔ∏è Processing 153293...
   ‚òÅÔ∏è Enrolling voice...
   ‚úÖ Voice: qwen-tts-vc-user_153293-voice-20260211124227975-a85e
   ‚úÖ spk-153293-qwen-original.wav
   ‚úÖ spk-153293-qwen-new.wav
[2/30] üéôÔ∏è Processing 185885...
   ‚òÅÔ∏è Enrolling voice...
   ‚úÖ Voice: qwen-tts-vc-user_185885-voice-20260211124244440-7fb7
   ‚úÖ spk-185885-qwen-original.wav
   ‚úÖ spk-185885-qwen-new.wav
[3/30] üéôÔ∏è Processing 173685...
   ‚òÅÔ∏è Enrolling voice...
   ‚úÖ Voice: qwen-tts-vc-user_173685-voice-20260211124301738-aa54
   ‚úÖ spk-173685-qwen-original.wav
   ‚úÖ spk-173685-qwen-new.wav
[4/30] üéôÔ∏è Processing 112547...
   ‚òÅÔ∏è Enrolling voice...
   ‚úÖ Voice: qwen-tts-vc-user_112547-voice-20260211124316083-f45c
   ‚úÖ spk-112547-qwen-original.wav
   ‚úÖ spk-112547-qwen-new.wav
[5/30] üéôÔ∏è Processing 158899...
   ‚òÅÔ∏è Enrolling voice...
   ‚úÖ Voice: qwen-tts-vc-user_158899-voice-20260211124328

websocket closed due to Connection to remote host was lost.
websocket closed due to Connection to remote host was lost.
websocket closed due to fin=1 opcode=8 data=b'\x03\xe8Bye'
websocket closed due to fin=1 opcode=8 data=b'\x03\xe8Bye'
websocket closed due to fin=1 opcode=8 data=b'\x03\xe8Bye'
websocket closed due to fin=1 opcode=8 data=b'\x03\xe8Bye'
websocket closed due to Connection to remote host was lost.
websocket closed due to fin=1 opcode=8 data=b'\x03\xe8Bye'
websocket closed due to fin=1 opcode=8 data=b'\x03\xe8Bye'
websocket closed due to Connection to remote host was lost.
websocket closed due to Connection to remote host was lost.
websocket closed due to fin=1 opcode=8 data=b'\x03\xe8Bye'
websocket closed due to fin=1 opcode=8 data=b'\x03\xe8Bye'
websocket closed due to fin=1 opcode=8 data=b'\x03\xe8Bye'
websocket closed due to fin=1 opcode=8 data=b'\x03\xe8Bye'
websocket closed due to Connection to remote host was lost.
websocket closed due to fin=1 opcode=8 data=b'\x03