In [1]:
import torch
from torch.serialization import add_safe_globals
from TTS.tts.configs.xtts_config import XttsConfig
# Register XttsConfig with torch's safe-globals allowlist before TTS.api is
# imported. Presumably this lets checkpoints that reference this config class
# be unpickled under torch's restricted loading — TODO confirm it is still
# needed for the VITS models used in this notebook.
add_safe_globals([XttsConfig])
from TTS.api import TTS
# NOTE: this binds `display` to the IPython.display *module* (used below as
# `display.Audio(...)`). The client cells further down run
# `from IPython.display import Audio, display`, which rebinds `display` to the
# function for the rest of the kernel session.
from IPython import display

  from .autonotebook import tqdm as notebook_tqdm
  import pkg_resources


In [15]:
# Single-speaker model, so no language/speaker argument is needed at synthesis time.
tts = TTS(model_name="tts_models/et/cv/vits", gpu=False)

 > tts_models/et/cv/vits is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > initialization of speaker-embedding layers.
 > initialization of language-embedding layers.


In [13]:
%%time
tts.tts_to_file(text="tere tulemast maailma, mina olen villu", file_path="output.wav")

 > Text splitted to sentences.
['tere tulemast maailma, mina olen villu']
 > Processing time: 0.3070552349090576
 > Real-time factor: 0.12960505225391883
CPU times: total: 734 ms
Wall time: 317 ms


'output.wav'

In [14]:
# Inline audio player for output.wav (written by the tts_to_file cell above).
# NOTE: `display` must still be the IPython.display module here. The client
# cells further down re-import `display` as a function, so re-running this
# cell after them fails until the first cell is run again.
display.Audio('output.wav')

In [19]:
# tts = TTS("tts_models/en/ljspeech/vits")

## From local file

In [2]:
# Load the checkpoint and its config from the local `model/` directory (CPU only).
tts = TTS(
    model_path="model/model_file.pth.tar",
    config_path="model/config.json",
    gpu=False,
)

 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > initialization of speaker-embedding layers.
 > initialization of language-embedding layers.


In [5]:
import types

def _patched_check_arguments(self, **kwargs):
    # Skip checks; assume single-speaker, single-language
    return

# Bind the no-op check to this `tts` instance only (types.MethodType makes it
# a bound method), leaving the TTS class itself untouched. Presumably this is
# what lets the `language='et'` calls below go through — TODO confirm against
# the installed TTS version.
tts._check_arguments = types.MethodType(_patched_check_arguments, tts)

In [10]:
%%time
tts.tts_to_file(text="oh sa vana raks see on igavene jamaa", file_path="output.wav", language='et')

 > Text splitted to sentences.
['oh sa vana raks see on igavene jamaa']
 > Processing time: 0.28420591354370117
 > Real-time factor: 0.12235903611446834
CPU times: total: 703 ms
Wall time: 292 ms


'output.wav'

In [None]:
# Second sample sentence; overwrites output.wav from the previous cell.
tts.tts_to_file(
    text="igavene pekimagu ollakse igal pool",
    file_path="output.wav",
    language="et",
)

In [11]:
# Inline audio player for output.wav (written by the tts_to_file cells above).
# NOTE: relies on `display` being the IPython.display module; the client cells
# below rebind `display` to a function, after which this cell no longer runs.
display.Audio('output.wav')

## WebSocket client

In [1]:
import asyncio
import json
import base64
import io
import wave
import numpy as np
import websockets
from IPython.display import Audio, display

async def tts_request(
    text="Tere maailm! See on test Coqui TTS teenusega.",
    host="localhost",
    port=8080,
    output_path="output.wav",
    max_message_bytes=100 * 1024 * 1024,
):
    """Send text to the TTS WebSocket server, save the reply as WAV and play it.

    The request is the JSON object ``{"text": text}`` sent to
    ``ws://{host}:{port}/synthesize``. The reply is parsed as a JSON object
    from which these keys are read: ``success``, ``message``, ``audio_data``
    (base64-encoded audio), ``sampling_rate`` and ``format``.

    Args:
        text: Text to synthesize.
        host: Server host name.
        port: Server port.
        output_path: Path the resulting WAV file is written to.
        max_message_bytes: Largest reply the client will accept, in bytes
            (``None`` disables the limit). The default matches the 100 MiB
            limit used by the gRPC client in this notebook.

    Returns:
        ``output_path`` on success, or ``None`` when the server reports a
        failure or the reply carries no audio.

    Raises:
        OSError: If the connection to the server cannot be established.
    """
    uri = f"ws://{host}:{port}/synthesize"
    print(f"🔌 Connecting to {uri}")
    # websockets rejects incoming messages above 1 MiB unless told otherwise;
    # a base64 audio reply exceeds that after only a few seconds of speech.
    async with websockets.connect(uri, max_size=max_message_bytes) as websocket:
        # Send synthesis request
        await websocket.send(json.dumps({"text": text}))
        print(f"📤 Sent request: {text[:60]}")

        # Receive response
        response = await websocket.recv()
        result = json.loads(response)

        if not result.get("success"):
            print("❌ Error:", result.get("message"))
            return None

        audio_b64 = result.get("audio_data")
        if not audio_b64:
            print("❌ No audio_data in response.")
            return None

        audio_bytes = base64.b64decode(audio_b64)
        # `or` (not a .get default) so an explicit null/0/"" from the server
        # also falls back instead of crashing in int() / .lower().
        sr = int(result.get("sampling_rate") or 22050)
        fmt = (result.get("format") or "").lower()
        # A RIFF magic number identifies a WAV container even if `format` is absent.
        is_wav = audio_bytes[:4] == b"RIFF"

        if is_wav or fmt == "wav":
            with open(output_path, "wb") as f:
                f.write(audio_bytes)
            print(f"✅ Saved WAV directly → {output_path}")
            data = audio_bytes
        else:
            # Wrap raw PCM float32 as WAV
            print("⚙️  Wrapping raw PCM into WAV...")
            wav = np.frombuffer(audio_bytes, dtype="<f4")
            # Clip before scaling so out-of-range samples cannot wrap around in int16.
            wav = np.clip(wav, -1.0, 1.0)
            pcm16 = (wav * 32767.0).astype("<i2")

            buf = io.BytesIO()
            with wave.open(buf, "wb") as wf:
                wf.setnchannels(1)   # mono
                wf.setsampwidth(2)   # 2 bytes per sample = int16
                wf.setframerate(sr)
                wf.writeframes(pcm16.tobytes())

            data = buf.getvalue()
            with open(output_path, "wb") as f:
                f.write(data)
            print(f"✅ Converted and saved playable WAV → {output_path}")

        print("🎧 Playing audio inline...")
        display(Audio(data=data, rate=sr))
        return output_path


# ⏩ Example use inside a notebook cell:
await tts_request("mis asi see on", host="localhost", port=8080)


🔌 Connecting to ws://localhost:8080/synthesize


OSError: Multiple exceptions: [Errno 10061] Connect call failed ('::1', 8080, 0, 0), [Errno 10061] Connect call failed ('127.0.0.1', 8080)

## gRPC client

In [3]:
# gRPC notebook client for your Coqui TTS server
# - Calls Synthesize(text)
# - Handles WAV bytes directly, or wraps raw PCM float32 into WAV
# - Saves to output.wav and plays inline

import io
import wave
import numpy as np
from IPython.display import Audio, display

import grpc
import tts_pb2
import tts_pb2_grpc


def tts_request_grpc(
    text="Tere maailm! See on test Coqui TTS teenusega.",
    host="localhost",
    port=8081,
    output_path="output_grpc.wav",
    default_sr=22050,
):
    """Call ``Synthesize`` on the gRPC TTS server, save the audio as WAV and play it.

    Args:
        text: Text to synthesize.
        host: Server host name.
        port: Server port.
        output_path: Path the resulting WAV file is written to.
        default_sr: Sampling rate used when the response's ``sampling_rate``
            is unset (zero).

    Returns:
        ``output_path`` on success, or ``None`` when the response has
        ``success`` unset (its ``message`` is printed).

    Errors raised by the gRPC call itself are not caught here and propagate
    to the caller.
    """
    # allow large responses just in case (100 MiB in each direction)
    channel_options = [
        ("grpc.max_receive_message_length", 100 * 1024 * 1024),
        ("grpc.max_send_message_length", 100 * 1024 * 1024),
    ]
    # Use the channel as a context manager so it is closed once the call has
    # returned (or raised) instead of being left open after every invocation.
    with grpc.insecure_channel(f"{host}:{port}", options=channel_options) as channel:
        stub = tts_pb2_grpc.TTSServiceStub(channel)

        # Synthesize
        req = tts_pb2.SynthesizeRequest(text=text)
        resp = stub.Synthesize(req)

    if not resp.success:
        print("❌ Error:", resp.message)
        return None

    audio_bytes = bytes(resp.audio_data)  # raw bytes from gRPC
    sr = int(resp.sampling_rate or default_sr)
    # getattr: tolerate a response message that has no `format` field at all.
    fmt = (getattr(resp, "format", "") or "").lower()
    # A RIFF magic number identifies a WAV container even if `format` is empty.
    is_wav = audio_bytes[:4] == b"RIFF"

    if is_wav or fmt == "wav":
        # Already a WAV container – save directly
        with open(output_path, "wb") as f:
            f.write(audio_bytes)
        data = audio_bytes
        print(f"✅ Saved WAV directly → {output_path} @ {sr} Hz")
    else:
        # Server sent raw PCM float32 – wrap as WAV
        print("⚙️  Wrapping raw PCM float32 into WAV…")
        wav = np.frombuffer(audio_bytes, dtype="<f4")
        # Clip before scaling so out-of-range samples cannot wrap around in int16.
        wav = np.clip(wav, -1.0, 1.0)
        pcm16 = (wav * 32767.0).astype("<i2")

        buf = io.BytesIO()
        with wave.open(buf, "wb") as wf:
            wf.setnchannels(1)        # mono
            wf.setsampwidth(2)        # int16
            wf.setframerate(sr)
            wf.writeframes(pcm16.tobytes())
        data = buf.getvalue()
        with open(output_path, "wb") as f:
            f.write(data)
        print(f"✅ Converted and saved playable WAV → {output_path} @ {sr} Hz")

    # Inline playback
    print("🎧 Playing inline…")
    display(Audio(data=data, rate=sr))
    return output_path


def health_check_grpc(host="localhost", port=8081):
    """Call ``HealthCheck`` on the gRPC TTS server and print the outcome.

    Prints the response's ``status`` and ``message`` on success. Any exception
    raised by the call is caught and printed rather than propagated, so this
    function always returns ``None``.

    Args:
        host: Server host name.
        port: Server port.
    """
    # Context manager closes the channel afterwards instead of leaking it.
    with grpc.insecure_channel(f"{host}:{port}") as channel:
        stub = tts_pb2_grpc.TTSServiceStub(channel)
        try:
            resp = stub.HealthCheck(tts_pb2.HealthCheckRequest(service="tts"))
            print("Health:", resp.status, "-", resp.message)
        except Exception as e:
            # Deliberately best-effort: report the failure, never raise.
            print("Health check failed:", e)


# Example call; the gRPC stub is synchronous, so no `await` is needed.
# NOTE(review): the commented-out health check targets port 8081 (the function
# default) while the synthesis call uses port 9000 — confirm which port the
# server actually listens on.
# health_check_grpc(host="localhost", port=8081)
tts_request_grpc(
    text="mis asi see on",
    host="localhost",
    port=9000,
    output_path="output_grpc.wav",
)

✅ Saved WAV directly → output_grpc.wav @ 22050 Hz
🎧 Playing inline…


'output_grpc.wav'