In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import hf_hub_download
from distilcodec import DistilCodec
import asyncio
import aiohttp
import torch
import logging
import json
import re

In [None]:
# Download the DistilCodec config and checkpoint from the Hugging Face Hub
# (config first, then checkpoint -- the generator is consumed in order).
codec_model_config_path, codec_ckpt_path = (
    hf_hub_download(repo_id="IDEA-Emdoor/DistilCodec-v1.0", filename=artifact)
    for artifact in ("model_config.json", "g_00204000")
)

# Build the codec from the downloaded files, then call `.eval()` on it;
# the object returned by `.eval()` is what the rest of the notebook uses.
codec = DistilCodec.from_pretrained(
    config_path=codec_model_config_path,
    model_path=codec_ckpt_path,
    use_generator=True,
    is_debug=False,
)
codec = codec.eval()

In [None]:
def decode_speech_token(speech_token):
    """Decode a string of ``speech_<id>`` tokens into audio samples.

    Every ``speech_<digits>`` occurrence in ``speech_token`` is extracted and
    converted to ``int``; the resulting list is handed to
    ``codec.decode_from_codes`` (with ``minus_token_offset=False``) while
    gradient tracking is disabled.

    Returns:
        A ``(24000, samples)`` tuple, where ``samples`` is element ``[0, 0]``
        of the decoder output moved to CPU and converted to a NumPy array.
        Callers pass the first element to the audio player as the rate.
    """
    code_ids = [
        int(match.group(1))
        for match in re.finditer(r'speech_(\d+)', speech_token)
    ]
    with torch.no_grad():
        decoded = codec.decode_from_codes(code_ids, minus_token_offset=False)
        samples = decoded[0, 0].cpu().numpy()
    return (24000, samples)

In [None]:
# Request configuration for the TTS completion endpoint.
speaker = 'male'
# Named `text` (not `input`) so the builtin `input()` is not shadowed for the
# rest of the kernel session.
text = 'Hello, saya adalah pembantu agent AI anda, apa yang boleh saya bantu anda?'
# Prompt layout: speaker tag and text after `<|im_start|>`, terminated by
# `<|speech_start|>`.
prompt = f'<|im_start|>{speaker}: {text}<|speech_start|>'
headers = {
    'accept': 'application/json',
    'Content-Type': 'application/json',
}
json_data = {
    'model': "scicom-ai-enterprise/TTS-1.7B-Stage2",
    'prompt': prompt,
    'max_tokens': 2048,
    'temperature': 0.7,
    # The response is consumed line by line as `data: ...` events downstream.
    'stream': True,
}

In [None]:
# Hand-off channel between the HTTP producer and the consumer coroutine.
# A `None` item marks end-of-stream.
queue = asyncio.Queue()

async def generate_audio_stream():
    """Stream completion text from the TTS backend into ``queue``.

    POSTs ``json_data`` (with ``headers``) to the local completions endpoint
    and reads the response line by line. For every line starting with
    ``data: `` whose JSON payload has a ``text`` key in its first choice, that
    text is put on ``queue``. Reading stops at the ``[DONE]`` marker; lines
    that are not valid JSON are skipped.

    A ``None`` sentinel is enqueued exactly once when the coroutine finishes,
    whether it ended normally, got a non-200 response, or raised. Without it a
    consumer waiting for the sentinel would never terminate. Failures are
    logged rather than re-raised.
    """
    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(
                'http://localhost:9090/v1/completions',
                headers=headers,
                json=json_data,
            ) as resp:
                if resp.status != 200:
                    error_text = await resp.text()
                    logging.error(f"TTS backend error: {resp.status} - {error_text}")
                    return

                async for line in resp.content:
                    if not line.startswith(b"data: "):
                        continue
                    # Drop the 6-character "data: " prefix.
                    data_str = line.decode("utf-8").strip()[6:]
                    if data_str == "[DONE]":
                        break
                    try:
                        data_json = json.loads(data_str)
                    except json.JSONDecodeError:
                        continue
                    choice = data_json["choices"][0]
                    if "text" in choice:
                        await queue.put(choice["text"])
    except Exception:
        logging.error("Audio generation failed", exc_info=True)
    finally:
        # Always signal end-of-stream, including on the error paths above.
        await queue.put(None)

# Keep a reference to the producer task: the event loop only holds weak
# references to tasks, so an unreferenced task can be garbage-collected
# before it finishes.
producer_task = asyncio.create_task(generate_audio_stream())
outputs = []  # text pieces received from the stream, in arrival order
count = 0     # number of pieces consumed so far
async def audio_stream():
    """Drain ``queue`` into ``outputs`` until the ``None`` sentinel arrives.

    Each non-``None`` item is appended to the module-level ``outputs`` list
    and the module-level ``count`` is incremented. The coroutine suspends on
    ``queue.get()`` while the queue is empty instead of polling it, so it does
    not spin the event loop while waiting for the producer.
    """
    global count
    while True:
        output = await queue.get()
        if output is None:
            break

        outputs.append(output)
        count += 1

# Consume the stream until the end-of-stream `None` item is seen.
# NOTE(review): top-level `await` relies on the notebook's already-running event loop.
await audio_stream()

In [None]:
# Scratch check: the same expression that is assigned to `overlap` in the
# chunked-decoding cell (0.05 * 93 = 4.65, truncated to 4).
int(0.05 * 93)

In [None]:
# Chunked ("streaming-style") decode of the collected token pieces.
#
# Every `chunk_size` pieces the buffer is decoded. After the first chunk the
# buffer holds the previous chunk (kept as context) followed by the new one.
# From each decode only the samples between `to_yield` and the last
# `overlap_chunk` samples are emitted; the held-back tail is emitted by the
# next decode, and the final flush emits everything that remains.
TOKENS_PER_SECOND = 93  # presumably the codec token rate, as implied by the tokens -> samples conversion below
SAMPLE_RATE = 24000     # same rate that decode_speech_token returns

chunk_size = int(0.3 * TOKENS_PER_SECOND)
overlap = int(0.05 * TOKENS_PER_SECOND)
overlap_chunk = int((overlap / TOKENS_PER_SECOND) * SAMPLE_RATE)

ys = []
buffer = []
count = 0
to_yield = 0
for count, token in enumerate(outputs, start=1):
    buffer.append(token)
    if count % chunk_size == 0:
        sr, y = decode_speech_token(''.join(buffer))

        # Use an explicit end index: `y[a:-overlap_chunk]` would become
        # `y[a:0]` (empty) if the overlap were ever configured as 0 samples.
        end = max(len(y) - overlap_chunk, 0)
        y_ = y[to_yield:end]
        if to_yield == 0:
            # Fixed once from the first decode: where new audio starts in
            # every later (context + new chunk) decode.
            to_yield = end

        # Keep only the most recent chunk as context for the next decode.
        buffer = buffer[-chunk_size:]
        ys.append(y_)

if buffer:
    # Final flush: emit the held-back overlap plus any trailing partial chunk.
    sr, y = decode_speech_token(''.join(buffer))
    y_ = y[to_yield:]
    ys.append(y_)

In [None]:
# Inspect the length of the first emitted audio chunk.
len(ys[0])

In [None]:
import IPython.display as ipd
import numpy as np

# Play the chunk-decoded pieces stitched back together end to end.
ipd.Audio(data=np.concatenate(ys, axis=0), rate=sr)

In [None]:
# Reference: decode the whole token sequence in a single pass, for comparison
# with the chunk-decoded audio.
sr, y = decode_speech_token("".join(outputs))
ipd.Audio(data=y, rate=sr)