In [1]:
# %pip install TTS bark soundfile torch whisperspeech psutil
%pip install torch
%pip install whisperspeech
%pip install psutil
%pip install TTS bark soundfile
%pip install speechbrain torchaudio


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Collecting whisperspeech
  Obtaining dependency information for whisperspeech from https://files.pythonhosted.org/packages/86/d2/79287f07e8c5fce780b68ac0e01c195c8e4e5a8874d4d8c3a1dc65da71a2/WhisperSpeech-0.8.9-py3-none-any.whl.metadata
  Downloading WhisperSpeech-0.8.9-py3-none-any.whl.metadata (11 kB)
Collecting vocos (from whisperspeech)
  Obtaining dependency information for vocos from https://files.pythonhosted.org/packages/0a/45/82fe9b5696eb5dd4f84632f75b549b48bed0c33a5920b6309fbafd7e3477/vocos-0.1.0-py3-none-any.whl.metadata
  Downloading vocos-0.1.0-py3-none-any.whl.metadata (4.8 kB)
Collecting speechbrain<1.0 (from whisperspeech)
  Obtaining dependency 

In [None]:

import csv
import os
import tempfile
import time
from pathlib import Path
import statistics

import psutil
import torch

corpus_path = "corpus/en_corpus_10.txt"
WARMUP_TEXT = "Hello"
BENCHMARK_TEXT = "Тестовая фраза для синтеза."


def get_corpus(path: str):
    if not os.path.exists(path):
        raise FileNotFoundError("corpus not found")

    with open(path, "r") as file:
        samples = [x.strip() for x in file.readlines()]

    return [sample for sample in samples if sample]


corpus = get_corpus(corpus_path)
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device: ", device)
model_names = ["bark", "glow_tts", "vits", "xtts", "whisper-speech"]
stats = {name: {"report": {}, "deltas": []} for name in model_names}

reports_dir = Path("reports")
reports_dir.mkdir(parents=True, exist_ok=True)
report_csv_path = reports_dir / "lab4_report.csv"
report_rows = []


def create_temp_wav(prefix: str) -> str:
    tmp = tempfile.NamedTemporaryFile(prefix=f"{prefix}_", suffix=".wav", delete=False)
    path = tmp.name
    tmp.close()
    return path


def summarize_deltas(deltas):
    if not deltas:
        return {
            "num_runs": 0,
            "mean_runtime_s": None,
            "median_runtime_s": None,
            "std_runtime_s": None,
            "min_runtime_s": None,
            "max_runtime_s": None,
        }

    summary = {
        "num_runs": len(deltas),
        "mean_runtime_s": statistics.mean(deltas),
        "median_runtime_s": statistics.median(deltas),
        "std_runtime_s": statistics.pstdev(deltas) if len(deltas) > 1 else 0.0,
        "min_runtime_s": min(deltas),
        "max_runtime_s": max(deltas),
    }

    return summary


def record_run(model_name, audio_file, duration, model_report):
    row = {
        "audio_file": audio_file,
        "model": model_name,
        "inference_time_s": duration,
    }

    for key, value in (model_report or {}).items():
        if key == "model_name":
            continue
        row[key] = value

    report_rows.append(row)


def save_report(rows, path):
    if not rows:
        print("Report was not generated: no successful synthesis runs.")
        return

    fieldnames = ["audio_file", "model", "inference_time_s"]
    for row in rows:
        for key in row.keys():
            if key not in fieldnames:
                fieldnames.append(key)

    with open(path, "w", newline="") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    print(f"Saved report to {path} ({len(rows)} rows).")


def evaluate_tts_model(
    name,
    synth_fn,
    *,
    warmup_kwargs=None,
    benchmark_kwargs=None,
    model_obj=None,
):
    warmup_kwargs = dict(warmup_kwargs or {})
    benchmark_kwargs = dict(benchmark_kwargs or {})

    def _run(call_kwargs):
        kwargs = dict(call_kwargs) if call_kwargs else {}
        tmp_path = kwargs.get("file_path")
        try:
            synth_fn(**kwargs)
        finally:
            if tmp_path and os.path.exists(tmp_path):
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass

    if torch.cuda.is_available() and hasattr(torch.cuda, "reset_peak_memory_stats"):
        torch.cuda.reset_peak_memory_stats()

    try:
        _run(warmup_kwargs)
    except Exception as exc:
        print(f"[{name}] Warmup error: {exc}")

    synth_time = None
    start_time = time.time()
    try:
        _run(benchmark_kwargs)
    except Exception as exc:
        print(f"[{name}] Benchmark error: {exc}")
    else:
        synth_time = time.time() - start_time

    model_ref = model_obj or getattr(synth_fn, "__self__", None)

    model_size_mb = None
    if model_ref is not None and hasattr(model_ref, "state_dict"):
        state = model_ref.state_dict()
        total_bytes = sum(t.nelement() * t.element_size() for t in state.values())
        model_size_mb = total_bytes / (1024 * 1024)

    process = psutil.Process(os.getpid())
    mem_mb = process.memory_info().rss / (1024 * 1024)
    cpu_percent = psutil.cpu_percent(interval=0.1)

    gpu_mem = None
    if torch.cuda.is_available():
        gpu_mem = torch.cuda.max_memory_allocated() / (1024 * 1024)

    return {
        "model_name": name,
        "benchmark_synthesis_time_s": synth_time,
        "model_size_mb": model_size_mb,
        "cpu_load_percent": cpu_percent,
        "ram_usage_mb": mem_mb,
        "gpu_usage_mb": gpu_mem,
    }


def timer(func, *args, **kwargs):
    start = time.time()
    func(*args, **kwargs)
    end = time.time()
    return end - start


In [6]:
# Glow-TTS вместо FastSpeech2
from pathlib import Path
from TTS.api import TTS
import torch

glow_tts_model_name = "glow_tts"   # можешь назвать как хочешь
output_dir = Path("models") / glow_tts_model_name
output_dir.mkdir(parents=True, exist_ok=True)

# инициализация модели (GPU если доступен)
glow_tts = TTS(
    "tts_models/en/ljspeech/glow-tts",
    gpu=torch.cuda.is_available()
)

# отчёт по метрикам (evaluate_tts_model, WARMUP_TEXT, BENCHMARK_TEXT, create_temp_wav должны быть уже определены выше)
glow_report = evaluate_tts_model(
    glow_tts_model_name,
    glow_tts.tts_to_file,
    warmup_kwargs={
        "text": WARMUP_TEXT,
        "file_path": create_temp_wav(f"{glow_tts_model_name}_warmup"),
    },
    benchmark_kwargs={
        "text": BENCHMARK_TEXT,
        "file_path": create_temp_wav(f"{glow_tts_model_name}_bench"),
    },
    model_obj=glow_tts,
)

stats[glow_tts_model_name]["report"] = dict(glow_report)

# прогон по корпусу
for i, sequence in enumerate(corpus):
    output_path = output_dir / f"{i}.wav"
    dt = timer(
        glow_tts.tts_to_file,
        text=sequence,
        file_path=str(output_path),
    )

    stats[glow_tts_model_name]["deltas"].append(dt)
    record_run(glow_tts_model_name, str(output_path), dt, glow_report)

# добавляем агрегированные статистики по времени
stats[glow_tts_model_name]["report"].update(summarize_deltas(stats[glow_tts_model_name]["deltas"]))


 > Downloading model to /Users/velimirhlebnikov/Library/Application Support/tts/tts_models--en--ljspeech--glow-tts


 17%|█▋        | 58.2M/344M [00:07<00:56, 5.05MiB/s]

KeyboardInterrupt: 

In [None]:

# VITS
from TTS.api import TTS


vits_tts = TTS("tts_models/en/ljspeech/vits")
vits_tts = vits_tts.to(device)

glow_tts_model_name = "vits"
output_dir = Path("models") / glow_tts_model_name
output_dir.mkdir(parents=True, exist_ok=True)

vits_report = evaluate_tts_model(
    glow_tts_model_name,
    vits_tts.tts_to_file,
    warmup_kwargs={
        "text": WARMUP_TEXT,
        "file_path": create_temp_wav(f"{glow_tts_model_name}_warmup"),
    },
    benchmark_kwargs={
        "text": BENCHMARK_TEXT,
        "file_path": create_temp_wav(f"{glow_tts_model_name}_bench"),
    },
    model_obj=vits_tts,
)

stats[glow_tts_model_name]["report"] = dict(vits_report)

for i, sequence in enumerate(corpus):
    output_path = output_dir / f"{i}.wav"
    dt = timer(
        vits_tts.tts_to_file,
        text=sequence,
        file_path=str(output_path)
    )

    stats[glow_tts_model_name]["deltas"].append(dt)
    record_run(glow_tts_model_name, str(output_path), dt, vits_report)

stats[glow_tts_model_name]["report"].update(summarize_deltas(stats[glow_tts_model_name]["deltas"]))


 > tts_models/en/ljspeech/vits is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Text splitted to sentences.
['Hello']
 > Processing time: 0.1639270782470703
 > Real-time factor: 0.1785865649875445
 > Text splitted to sentences.
['Тестовая фраза для синтеза.

In [None]:

# Bark

from bark import generate_audio, SAMPLE_RATE
import soundfile as sf

glow_tts_model_name = "bark"
output_dir = Path("models") / glow_tts_model_name
output_dir.mkdir(parents=True, exist_ok=True)


def bark_to_file(text, file_path):
    audio_arr = generate_audio(text)
    sf.write(file_path, audio_arr, SAMPLE_RATE)


bark_report = evaluate_tts_model(
    glow_tts_model_name,
    bark_to_file,
    warmup_kwargs={
        "text": WARMUP_TEXT,
        "file_path": create_temp_wav(f"{glow_tts_model_name}_warmup"),
    },
    benchmark_kwargs={
        "text": BENCHMARK_TEXT,
        "file_path": create_temp_wav(f"{glow_tts_model_name}_bench"),
    },
)

stats[glow_tts_model_name]["report"] = dict(bark_report)

for i, sequence in enumerate(corpus):
    output_path = output_dir / f"{i}.wav"
    dt = timer(
        bark_to_file,
        text=sequence,
        file_path=str(output_path),
    )

    stats[glow_tts_model_name]["deltas"].append(dt)
    record_run(glow_tts_model_name, str(output_path), dt, bark_report)

stats[glow_tts_model_name]["report"].update(summarize_deltas(stats[glow_tts_model_name]["deltas"]))




text_2.pt:   0%|          | 0.00/5.35G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:

# xtts
xtts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
xtts = xtts.to(device)

glow_tts_model_name = "xtts"
output_dir = Path("models") / glow_tts_model_name
output_dir.mkdir(parents=True, exist_ok=True)

xtts_report = evaluate_tts_model(
    glow_tts_model_name,
    xtts.tts_to_file,
    warmup_kwargs={
        "text": WARMUP_TEXT,
        "language": "en",
        "file_path": create_temp_wav(f"{glow_tts_model_name}_warmup"),
    },
    benchmark_kwargs={
        "text": BENCHMARK_TEXT,
        "language": "en",
        "file_path": create_temp_wav(f"{glow_tts_model_name}_bench"),
    },
    model_obj=xtts,
)

stats[glow_tts_model_name]["report"] = dict(xtts_report)

for i, sequence in enumerate(corpus):
    output_path = output_dir / f"{i}.wav"
    dt = timer(
        xtts.tts_to_file,
        text=sequence,
        file_path=str(output_path),
        language="en",
    )

    stats[glow_tts_model_name]["deltas"].append(dt)
    record_run(glow_tts_model_name, str(output_path), dt, xtts_report)

stats[glow_tts_model_name]["report"].update(summarize_deltas(stats[glow_tts_model_name]["deltas"]))


In [None]:

# Whisper Speech

from whisperspeech.pipeline import Pipeline
import soundfile as sf

pipe = Pipeline()

glow_tts_model_name = "whisper-speech"
output_dir = Path("models") / glow_tts_model_name
output_dir.mkdir(parents=True, exist_ok=True)


def whisper_to_file(text, file_path):
    audio = pipe.generate_audio(text)
    sf.write(file_path, audio, 16000)


whisper_report = evaluate_tts_model(
    glow_tts_model_name,
    whisper_to_file,
    warmup_kwargs={
        "text": WARMUP_TEXT,
        "file_path": create_temp_wav(f"{glow_tts_model_name}_warmup"),
    },
    benchmark_kwargs={
        "text": BENCHMARK_TEXT,
        "file_path": create_temp_wav(f"{glow_tts_model_name}_bench"),
    },
)

stats[glow_tts_model_name]["report"] = dict(whisper_report)

for i, sequence in enumerate(corpus):
    output_path = output_dir / f"{i}.wav"

    dt = timer(
        whisper_to_file,
        text=sequence,
        file_path=str(output_path),
    )

    stats[glow_tts_model_name]["deltas"].append(dt)
    record_run(glow_tts_model_name, str(output_path), dt, whisper_report)

stats[glow_tts_model_name]["report"].update(summarize_deltas(stats[glow_tts_model_name]["deltas"]))


In [5]:

# Persist metrics report
save_report(report_rows, report_csv_path)
stats


Saved report to reports/lab4_report.csv (10 rows).


{'bark': {'report': {}, 'deltas': []},
 'fastspeech': {'report': {}, 'deltas': []},
 'vits': {'report': {'model_name': 'vits',
   'benchmark_synthesis_time_s': 0.7924511432647705,
   'model_size_mb': 138.55462646484375,
   'cpu_load_percent': 97.1,
   'ram_usage_mb': 1348.84375,
   'gpu_usage_mb': None,
   'num_runs': 10,
   'mean_runtime_s': 0.7027400493621826,
   'median_runtime_s': 0.6386858224868774,
   'std_runtime_s': 0.17438568981140987,
   'min_runtime_s': 0.5424678325653076,
   'max_runtime_s': 1.123274326324463},
  'deltas': [0.556908130645752,
   0.7674808502197266,
   0.8372459411621094,
   0.5424678325653076,
   0.5814900398254395,
   0.6918158531188965,
   1.123274326324463,
   0.5855557918548584,
   0.5535428524017334,
   0.78761887550354]},
 'xtts': {'report': {}, 'deltas': []},
 'whisper-speech': {'report': {}, 'deltas': []}}