In [None]:
%pip install TTS bark soundfile torch whisperspeech


In [None]:
import os
import torch
import time
import psutil

# Location of the text corpus handed to get_corpus().
corpus_path = "corpus/en_corpus_10.txt"


def get_corpus(path: str):
    """Load the benchmark corpus, one text sample per line.

    Args:
        path: Location of a plain-text corpus file.

    Returns:
        list[str]: the whitespace-stripped, non-empty lines of the file,
        in file order.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError("corpus not found")

    # Iterate over the file object itself; `file.readlines` without a call is
    # a bound method and cannot be iterated.
    with open(path, "r", encoding="utf-8") as file:
        stripped = (line.strip() for line in file)
        # Blank lines would reach the TTS engines as empty prompts; drop them.
        return [sample for sample in stripped if sample]
    


corpus = get_corpus(corpus_path)
device = "cuda" if torch.cuda.is_available() else "cpu"


def _empty_stats():
    """Return a fresh, unshared stats record for one model."""
    return {
        "report": {},
        "deltas": [],
    }


# Template kept so existing references to `dummy` keep working; it is never
# stored inside `stats`.
dummy = _empty_stats()

# Every model gets its OWN record. Storing one dict object under every key
# would make all models share a single "deltas" list and a single "report".
stats = {
    model_name: _empty_stats()
    for model_name in ("bark", "fastspeech", "vits", "xtts", "whisper-speech")
}





def evaluate_tts_model(name, synth_fn, warmup_text="Hello", test_text="Тестовая фраза для синтеза."):
    """Benchmark one synthesis call and snapshot resource usage.

    Args:
        name: Model name; used as a log prefix and stored in the report.
        synth_fn: Callable invoked as ``synth_fn(text)`` to synthesize audio.
        warmup_text: Short phrase synthesized once before timing (warm-up).
        test_text: Phrase whose synthesis is timed. The default is a Russian
            sentence; pass a phrase in the model's language when needed.

    Returns:
        dict with the keys:
            "model"            -- ``name`` as given.
            "synthesis_time_s" -- seconds spent in ``synth_fn(test_text)``,
                                  or None if that call raised.
            "model_size_mb"    -- total size in MiB of the tensors in
                                  ``synth_fn.__self__.state_dict()``, or None
                                  when ``synth_fn`` is not bound to an object
                                  exposing ``state_dict``.
            "cpu_load_percent" -- CPU utilisation reported by psutil for the
                                  interval covering the timed call.
            "ram_usage_mb"     -- resident set size of this process in MiB.
            "gpu_usage_mb"     -- peak CUDA memory allocated in MiB, or None
                                  when CUDA is unavailable.

    Errors raised by ``synth_fn`` are printed and never propagated.
    """
    cuda_available = torch.cuda.is_available()

    # -------- Warm-up --------
    try:
        synth_fn(warmup_text)
    except Exception as e:
        print(f"[{name}] Ошибка при прогреве:", e)

    # Open the measurement windows right before the timed call:
    # - cpu_percent(interval=None) reports utilisation since the previous
    #   call, so this call only sets the reference point (result discarded);
    # - resetting the CUDA peak counter keeps earlier models and the warm-up
    #   out of this model's peak-memory figure.
    psutil.cpu_percent(interval=None)
    if cuda_available and torch.cuda.is_initialized():
        torch.cuda.reset_peak_memory_stats()

    # -------- Synthesis time --------
    # perf_counter is monotonic and high resolution, unlike the wall clock.
    start_time = time.perf_counter()
    try:
        synth_fn(test_text)
    except Exception as e:
        print(f"[{name}] Ошибка при синтезе:", e)
        synth_time = None
    else:
        synth_time = time.perf_counter() - start_time

    # -------- CPU load over the timed call --------
    cpu_percent = psutil.cpu_percent(interval=None)

    # -------- Model size --------
    model_size_mb = None
    try:
        owner = getattr(synth_fn, "__self__", None)
        if owner is not None and hasattr(owner, "state_dict"):
            state = owner.state_dict()
            total_bytes = sum(t.nelement() * t.element_size() for t in state.values())
            model_size_mb = total_bytes / (1024 * 1024)
    except Exception as e:
        # Best effort: a missing size must not abort the benchmark, but the
        # reason should be visible instead of silently swallowed.
        print(f"[{name}] Could not determine model size:", e)

    # -------- RAM --------
    process = psutil.Process(os.getpid())
    mem_mb = process.memory_info().rss / 1024 / 1024

    # -------- GPU --------
    gpu_mem = None
    if cuda_available:
        gpu_mem = torch.cuda.max_memory_allocated() / 1024 / 1024

    return {
        "model": name,
        "synthesis_time_s": synth_time,
        "model_size_mb": model_size_mb,
        "cpu_load_percent": cpu_percent,
        "ram_usage_mb": mem_mb,
        "gpu_usage_mb": gpu_mem,
    }


def timer(func, *args, **kwargs):
    """Run ``func(*args, **kwargs)`` once and return its duration in seconds.

    The return value of ``func`` is discarded; exceptions it raises propagate.
    """
    # perf_counter is monotonic and high resolution, which makes it the right
    # clock for measuring intervals (the wall clock can jump).
    start = time.perf_counter()
    func(*args, **kwargs)
    return time.perf_counter() - start


In [None]:

# Fast Speech 

import os
from TTS.api import TTS

# Fail fast: check the output directory before the slow model load.
if not os.path.exists("models/fastspeech"):
    raise FileNotFoundError("directory fastspeech does not exists")

# NOTE(review): confirm this model id is available in the installed TTS release.
fast_speech_tts = TTS("tts_models/en/ljspeech/fastspeech2")
fast_speech_tts = fast_speech_tts.to(device)

# Time the synthesis of every corpus sample. Timings are collected locally so
# that re-running this cell does not append to a previous run's results.
fastspeech_deltas = []
for i, sequence in enumerate(corpus):
    dt = timer(
        fast_speech_tts.tts_to_file,
        sequence,
        # Must be a keyword: the second positional parameter of tts_to_file
        # is `speaker`, not the output path.
        file_path=f"models/fastspeech/{i}.wav",
    )
    fastspeech_deltas.append(dt)

# Publish a fresh record for this model (not shared with any other entry).
stats["fastspeech"] = {
    "report": evaluate_tts_model("fastspeech", fast_speech_tts.tts_to_file),
    "deltas": fastspeech_deltas,
}

    


In [None]:
# VITS


# Fail fast: check the output directory before the slow model load.
if not os.path.exists("models/vits"):
    raise FileNotFoundError("directory vits does not exist")

vits_tts = TTS("tts_models/en/ljspeech/vits")
vits_tts = vits_tts.to(device)

# Time the synthesis of every corpus sample. Timings are collected locally so
# that re-running this cell does not append to a previous run's results.
vits_deltas = []
for i, sequence in enumerate(corpus):
    dt = timer(
        vits_tts.tts_to_file,
        sequence,
        # Must be a keyword: the second positional parameter of tts_to_file
        # is `speaker`, not the output path.
        file_path=f"models/vits/{i}.wav",
    )
    vits_deltas.append(dt)

# Publish a fresh record for this model (not shared with any other entry).
stats["vits"] = {
    "report": evaluate_tts_model("vits", vits_tts.tts_to_file),
    "deltas": vits_deltas,
}


In [None]:
# Bark

from bark import generate_audio, preload_models, SAMPLE_RATE
import soundfile as sf


if not os.path.exists("models/bark"):
    raise FileNotFoundError("bark directory is not exists")

# Load the Bark models up front so the first timed sample does not include
# the one-off model download/load.
preload_models()


def bark_to_file(text, file_path):
    """Synthesize ``text`` with Bark and write the audio to ``file_path``."""
    sf.write(file_path, generate_audio(text), SAMPLE_RATE)


bark_deltas = []
for i, sequence in enumerate(corpus):
    # Time synthesis and file write together. Passing generate_audio(sequence)
    # as an argument to timer() would run the synthesis before the clock
    # starts, so only the write would be measured.
    dt = timer(bark_to_file, sequence, f"models/bark/{i}.wav")
    bark_deltas.append(dt)

# Publish a fresh record for this model (not shared with any other entry).
stats["bark"] = {
    # generate_audio takes the text directly, which is the call shape
    # evaluate_tts_model uses (synth_fn(text)).
    "report": evaluate_tts_model("bark", generate_audio),
    "deltas": bark_deltas,
}

In [None]:
# xtts
# Fail fast: check the output directory before the slow model load.
if not os.path.exists("models/xtts"):
    raise FileNotFoundError("xtts directory does not exist")

xtts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
xtts = xtts.to(device)

# XTTS v2 is multi-speaker and multilingual: every call needs a voice and a
# language. NOTE(review): "Ana Florence" is taken from the built-in voices
# shown in the Coqui docs; swap in speaker_wav=... to clone a reference voice.
XTTS_SPEAKER = "Ana Florence"
XTTS_LANGUAGE = "en"


def xtts_to_file(text, file_path="output.wav"):
    """Synthesize ``text`` with XTTS and write the audio to ``file_path``."""
    xtts.tts_to_file(
        text=text,
        file_path=file_path,
        speaker=XTTS_SPEAKER,
        language=XTTS_LANGUAGE,
    )


# evaluate_tts_model sizes the model through synth_fn.__self__, which a plain
# function lacks; expose the model there so the report keeps its size.
xtts_to_file.__self__ = xtts

xtts_deltas = []
for i, sequence in enumerate(corpus):
    dt = timer(xtts_to_file, sequence, f"models/xtts/{i}.wav")
    xtts_deltas.append(dt)

# Publish a fresh record for this model (not shared with any other entry).
stats["xtts"] = {
    "report": evaluate_tts_model("xtts", xtts_to_file),
    "deltas": xtts_deltas,
}

In [None]:
# Whisper Speech

from whisperspeech.pipeline import Pipeline
import soundfile as sf

# Fail fast: check the output directory before building the pipeline.
if not os.path.exists("models/whisper-speech"):
    raise FileNotFoundError("whisper direction is not exists")

pipe = Pipeline()


def whisper_to_file(text, file_path):
    """Synthesize ``text`` with WhisperSpeech and write it to ``file_path``."""
    # NOTE(review): confirm that pipe.generate_audio returns data soundfile
    # accepts as-is and that 16000 Hz matches the pipeline's output rate.
    sf.write(file_path, pipe.generate_audio(text), 16000)


whisper_deltas = []
for i, sequence in enumerate(corpus):
    # Time synthesis and file write together; generating the audio outside
    # the timed call would measure only the write.
    dt = timer(whisper_to_file, sequence, f"models/whisper-speech/{i}.wav")
    whisper_deltas.append(dt)

# Publish a fresh record for this model (not shared with any other entry).
stats["whisper-speech"] = {
    "report": evaluate_tts_model("whisper-speech", pipe.generate_audio),
    "deltas": whisper_deltas,
}
