In [1]:
# %pip install TTS bark soundfile torch whisperspeech psutil
%pip install torch
%pip install whisperspeech
%pip install psutil
%pip install TTS bark soundfile
%pip install speechbrain torchaudio
%pip install transformers datasets soundfile hf_transfer



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[3

In [2]:

import csv
import os
import tempfile
import time
from pathlib import Path
import statistics

import psutil
import torch

corpus_path = "corpus/en_corpus_10.txt"
WARMUP_TEXT = "Hello"
BENCHMARK_TEXT = "Тестовая фраза для синтеза."


def get_corpus(path: str):
    if not os.path.exists(path):
        raise FileNotFoundError("corpus not found")

    with open(path, "r") as file:
        samples = [x.strip() for x in file.readlines()]

    return [sample for sample in samples if sample]


corpus = get_corpus(corpus_path)
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device: ", device)
model_names = ["tacotron2_DCA", "speecht5", "vits", "fast_pitch", "whisper-speech"]
stats = {name: {"report": {}, "deltas": []} for name in model_names}

reports_dir = Path("reports")
reports_dir.mkdir(parents=True, exist_ok=True)
report_csv_path = reports_dir / "lab4_report.csv"
report_rows = []


def create_temp_wav(prefix: str) -> str:
    tmp = tempfile.NamedTemporaryFile(prefix=f"{prefix}_", suffix=".wav", delete=False)
    path = tmp.name
    tmp.close()
    return path


def summarize_deltas(deltas):
    if not deltas:
        return {
            "num_runs": 0,
            "mean_runtime_s": None,
            "median_runtime_s": None,
            "std_runtime_s": None,
            "min_runtime_s": None,
            "max_runtime_s": None,
        }

    summary = {
        "num_runs": len(deltas),
        "mean_runtime_s": statistics.mean(deltas),
        "median_runtime_s": statistics.median(deltas),
        "std_runtime_s": statistics.pstdev(deltas) if len(deltas) > 1 else 0.0,
        "min_runtime_s": min(deltas),
        "max_runtime_s": max(deltas),
    }

    return summary


def record_run(model_name, audio_file, duration, model_report):
    row = {
        "audio_file": audio_file,
        "model": model_name,
        "inference_time_s": duration,
    }

    for key, value in (model_report or {}).items():
        if key == "model_name":
            continue
        row[key] = value

    report_rows.append(row)


def save_report(rows, path):
    if not rows:
        print("Report was not generated: no successful synthesis runs.")
        return

    fieldnames = ["audio_file", "model", "inference_time_s"]
    for row in rows:
        for key in row.keys():
            if key not in fieldnames:
                fieldnames.append(key)

    with open(path, "w", newline="") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    print(f"Saved report to {path} ({len(rows)} rows).")


def evaluate_tts_model(
    name,
    synth_fn,
    *,
    warmup_kwargs=None,
    benchmark_kwargs=None,
    model_obj=None,
):
    warmup_kwargs = dict(warmup_kwargs or {})
    benchmark_kwargs = dict(benchmark_kwargs or {})

    def _run(call_kwargs):
        kwargs = dict(call_kwargs) if call_kwargs else {}
        tmp_path = kwargs.get("file_path")
        try:
            synth_fn(**kwargs)
        finally:
            if tmp_path and os.path.exists(tmp_path):
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass

    if torch.cuda.is_available() and hasattr(torch.cuda, "reset_peak_memory_stats"):
        torch.cuda.reset_peak_memory_stats()

    try:
        _run(warmup_kwargs)
    except Exception as exc:
        print(f"[{name}] Warmup error: {exc}")

    synth_time = None
    start_time = time.time()
    try:
        _run(benchmark_kwargs)
    except Exception as exc:
        print(f"[{name}] Benchmark error: {exc}")
    else:
        synth_time = time.time() - start_time

    model_ref = model_obj or getattr(synth_fn, "__self__", None)

    model_size_mb = None
    if model_ref is not None and hasattr(model_ref, "state_dict"):
        state = model_ref.state_dict()
        total_bytes = sum(t.nelement() * t.element_size() for t in state.values())
        model_size_mb = total_bytes / (1024 * 1024)

    process = psutil.Process(os.getpid())
    mem_mb = process.memory_info().rss / (1024 * 1024)
    cpu_percent = psutil.cpu_percent(interval=0.1)

    gpu_mem = None
    if torch.cuda.is_available():
        gpu_mem = torch.cuda.max_memory_allocated() / (1024 * 1024)

    return {
        "model_name": name,
        "benchmark_synthesis_time_s": synth_time,
        "model_size_mb": model_size_mb,
        "cpu_load_percent": cpu_percent,
        "ram_usage_mb": mem_mb,
        "gpu_usage_mb": gpu_mem,
    }


def timer(func, *args, **kwargs):
    start = time.time()
    func(*args, **kwargs)
    end = time.time()
    return end - start


Device:  cuda


In [3]:
# # Glow-TTS вместо FastSpeech2
# from pathlib import Path
# from TTS.api import TTS
# import torch

# glow_tts_model_name = "glow_tts"   # можешь назвать как хочешь
# output_dir = Path("models") / glow_tts_model_name
# output_dir.mkdir(parents=True, exist_ok=True)

# # инициализация модели (GPU если доступен)
# glow_tts = TTS(
#     "tts_models/en/ljspeech/glow-tts",
#     gpu=torch.cuda.is_available()
# )

# # отчёт по метрикам (evaluate_tts_model, WARMUP_TEXT, BENCHMARK_TEXT, create_temp_wav должны быть уже определены выше)
# glow_report = evaluate_tts_model(
#     glow_tts_model_name,
#     glow_tts.tts_to_file,
#     warmup_kwargs={
#         "text": WARMUP_TEXT,
#         "file_path": create_temp_wav(f"{glow_tts_model_name}_warmup"),
#     },
#     benchmark_kwargs={
#         "text": BENCHMARK_TEXT,
#         "file_path": create_temp_wav(f"{glow_tts_model_name}_bench"),
#     },
#     model_obj=glow_tts,
# )

# stats[glow_tts_model_name]["report"] = dict(glow_report)

# # прогон по корпусу
# for i, sequence in enumerate(corpus):
#     output_path = output_dir / f"{i}.wav"
#     dt = timer(
#         glow_tts.tts_to_file,
#         text=sequence,
#         file_path=str(output_path),
#     )

#     stats[glow_tts_model_name]["deltas"].append(dt)
#     record_run(glow_tts_model_name, str(output_path), dt, glow_report)

# # добавляем агрегированные статистики по времени
# stats[glow_tts_model_name]["report"].update(summarize_deltas(stats[glow_tts_model_name]["deltas"]))



import torch
import soundfile as sf
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

device = "cuda" if torch.cuda.is_available() else "cpu"

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# Вместо загрузки настоящего спикера — фиксированный случайный вектор
torch.manual_seed(0)  # чтобы голос был стабильным между запусками
speaker_embedding = torch.randn(1, 512, device=device)

def speecht5_tts_to_file(text: str, file_path: str):
    inputs = processor(text=text, return_tensors="pt").to(device)
    with torch.no_grad():
        speech = model.generate_speech(
            inputs["input_ids"],
            speaker_embedding,
            vocoder=vocoder,
        )
    sf.write(file_path, speech.cpu().numpy(), samplerate=16000)


speech_t5_model_name = "speecht5"
output_dir = Path("models") / speech_t5_model_name
output_dir.mkdir(parents=True, exist_ok=True)

# отчёт (evaluate_tts_model у тебя уже есть)
speech_t5_report = evaluate_tts_model(
    speech_t5_model_name,
    speecht5_tts_to_file,
    warmup_kwargs={
        "text": WARMUP_TEXT,
        "file_path": create_temp_wav(f"{speech_t5_model_name}_warmup"),
    },
    benchmark_kwargs={
        "text": BENCHMARK_TEXT,
        "file_path": create_temp_wav(f"{speech_t5_model_name}_bench"),
    },
    model_obj=model,   # именно torch-модель, а не функция
)

stats[speech_t5_model_name]["report"] = dict(speech_t5_report)

# прогон по корпусу
for i, sequence in enumerate(corpus):
    out_path = output_dir / f"{i}.wav"
    dt = timer(
        speecht5_tts_to_file,
        text=sequence,
        file_path=str(out_path),
    )

    stats[speech_t5_model_name]["deltas"].append(dt)
    record_run(speech_t5_model_name, str(out_path), dt, speech_t5_report)

# агрегация статистики по времени
stats[speech_t5_model_name]["report"].update(
    summarize_deltas(stats[speech_t5_model_name]["deltas"])
)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:

# VITS
from TTS.api import TTS


vits_tts = TTS("tts_models/en/ljspeech/vits")
vits_tts = vits_tts.to(device)

glow_tts_model_name = "vits"
output_dir = Path("models") / glow_tts_model_name
output_dir.mkdir(parents=True, exist_ok=True)

vits_report = evaluate_tts_model(
    glow_tts_model_name,
    vits_tts.tts_to_file,
    warmup_kwargs={
        "text": WARMUP_TEXT,
        "file_path": create_temp_wav(f"{glow_tts_model_name}_warmup"),
    },
    benchmark_kwargs={
        "text": BENCHMARK_TEXT,
        "file_path": create_temp_wav(f"{glow_tts_model_name}_bench"),
    },
    model_obj=vits_tts,
)

stats[glow_tts_model_name]["report"] = dict(vits_report)

for i, sequence in enumerate(corpus):
    output_path = output_dir / f"{i}.wav"
    dt = timer(
        vits_tts.tts_to_file,
        text=sequence,
        file_path=str(output_path)
    )

    stats[glow_tts_model_name]["deltas"].append(dt)
    record_run(glow_tts_model_name, str(output_path), dt, vits_report)

stats[glow_tts_model_name]["report"].update(summarize_deltas(stats[glow_tts_model_name]["deltas"]))


  from pkg_resources import resource_filename


 > tts_models/en/ljspeech/vits is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Text splitted to sentences.
['Hello']
 > Processing time: 0.2443995475769043
 > Real-time factor: 0.2731655527205363
 > Text splitted to sentences.
['Тестовая фраза для синтеза.

In [5]:
from pathlib import Path
import torch
from TTS.api import TTS

tacotron_model_name = "tacotron2_DCA"  # так назовём её в stats и отчёте
output_dir = Path("models") / tacotron_model_name
output_dir.mkdir(parents=True, exist_ok=True)

use_gpu = torch.cuda.is_available()

# тут выбираешь реальную Tacotron2-модель из Coqui
# пример: английская LJSpeech
tacotron_tts = TTS(
    "tts_models/en/ljspeech/tacotron2-DDC",  # <<< реально существующая модель
    gpu=use_gpu,
)

# --- бенчмарк / метрики ---
tacotron_report = evaluate_tts_model(
    tacotron_model_name,
    tacotron_tts.tts_to_file,
    warmup_kwargs={
        "text": WARMUP_TEXT,
        "file_path": create_temp_wav(f"{tacotron_model_name}_warmup"),
    },
    benchmark_kwargs={
        "text": BENCHMARK_TEXT,
        "file_path": create_temp_wav(f"{tacotron_model_name}_bench"),
    },
    model_obj=tacotron_tts,   # сам объект TTS, у него внутри torch-модель
)

stats[tacotron_model_name]["report"] = dict(tacotron_report)

# --- прогон по корпусу ---
for i, sequence in enumerate(corpus):
    out_path = output_dir / f"{i}.wav"
    dt = timer(
        tacotron_tts.tts_to_file,
        text=sequence,
        file_path=str(out_path),
    )

    stats[tacotron_model_name]["deltas"].append(dt)
    record_run(tacotron_model_name, str(out_path), dt, tacotron_report)

# --- агрегация времени ---
stats[tacotron_model_name]["report"].update(
    summarize_deltas(stats[tacotron_model_name]["deltas"])
)


 > tts_models/en/ljspeech/tacotron2-DDC is already downloaded.
 > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.
 > Using model: Tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024




 > Model's reduction rate `r` is set to: 1
 > Vocoder Model: hifigan
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Generator Model: hifigan_generator
 > Discriminator Model: hifigan_discriminator
Removing weight norm...
 > Text splitted to sentences.
['Hello']
   > Decoder stopped wit

In [6]:
# === FastPitch (Coqui TTS) ===
from pathlib import Path
import torch
from TTS.api import TTS

fast_pitch_name = "fast_pitch"
output_dir = Path("models") / fast_pitch_name
output_dir.mkdir(parents=True, exist_ok=True)

use_gpu = torch.cuda.is_available()

# инициализация fast_pitch модели
fast_pitch_tts = TTS(
    "tts_models/en/ljspeech/fast_pitch",
    gpu=use_gpu,
)

# --- Базовый отчёт по метрикам (warmup + benchmark) ---
fast_pitch_report = evaluate_tts_model(
    fast_pitch_name,
    fast_pitch_tts.tts_to_file,
    warmup_kwargs={
        "text": WARMUP_TEXT,
        "file_path": create_temp_wav(f"{fast_pitch_name}_warmup"),
    },
    benchmark_kwargs={
        "text": BENCHMARK_TEXT,
        "file_path": create_temp_wav(f"{fast_pitch_name}_bench"),
    },
    model_obj=fast_pitch_tts,  # объект TTS, внутри которого torch-модель
)

stats[fast_pitch_name]["report"] = dict(fast_pitch_report)

# --- Прогон по корпусу текстов ---
for i, sequence in enumerate(corpus):
    out_path = output_dir / f"{i}.wav"
    dt = timer(
        fast_pitch_tts.tts_to_file,
        text=sequence,
        file_path=str(out_path),
    )

    stats[fast_pitch_name]["deltas"].append(dt)
    record_run(fast_pitch_name, str(out_path), dt, fast_pitch_report)

# --- Агрегированные статистики по времени синтеза ---
stats[fast_pitch_name]["report"].update(
    summarize_deltas(stats[fast_pitch_name]["deltas"])
)


 > tts_models/en/ljspeech/fast_pitch is already downloaded.
 > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.
 > Using model: fast_pitch
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Vocoder Model: hifigan
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resam

In [7]:
%pip install webdataset torchcodec 


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [8]:

# Whisper Speech

from whisperspeech.pipeline import Pipeline
import soundfile as sf

pipe = Pipeline()

glow_tts_model_name = "whisper-speech"
output_dir = Path("models") / glow_tts_model_name
output_dir.mkdir(parents=True, exist_ok=True)


def whisper_to_file(text, file_path):
    # audio = pipe.generate_audio(text)
    # sf.write(file_path, audio, 16000)
    pipe.generate_to_file(file_path, text)

whisper_report = evaluate_tts_model(
    glow_tts_model_name,
    whisper_to_file,
    warmup_kwargs={
        "text": WARMUP_TEXT,
        "file_path": create_temp_wav(f"{glow_tts_model_name}_warmup"),
    },
    benchmark_kwargs={
        "text": BENCHMARK_TEXT,
        "file_path": create_temp_wav(f"{glow_tts_model_name}_bench"),
    },
)

stats[glow_tts_model_name]["report"] = dict(whisper_report)

for i, sequence in enumerate(corpus):
    output_path = output_dir / f"{i}.wav"

    dt = timer(
        whisper_to_file,
        text=sequence,
        file_path=str(output_path),
    )

    stats[glow_tts_model_name]["deltas"].append(dt)
    record_run(glow_tts_model_name, str(output_path), dt, whisper_report)

stats[glow_tts_model_name]["report"].update(summarize_deltas(stats[glow_tts_model_name]["deltas"]))


  WeightNorm.apply(module, name, dim)
  self.gen = func(*args, **kwds)


In [9]:

# Persist metrics report
save_report(report_rows, report_csv_path)
stats


Saved report to reports/lab4_report.csv (50 rows).


{'tacotron2_DCA': {'report': {'model_name': 'tacotron2_DCA',
   'benchmark_synthesis_time_s': 0.08579468727111816,
   'model_size_mb': 380.8066177368164,
   'cpu_load_percent': 2.5,
   'ram_usage_mb': 2462.44921875,
   'gpu_usage_mb': 1715.89697265625,
   'num_runs': 10,
   'mean_runtime_s': 0.5267518043518067,
   'median_runtime_s': 0.4128316640853882,
   'std_runtime_s': 0.3876749952350756,
   'min_runtime_s': 0.33842945098876953,
   'max_runtime_s': 1.6815605163574219},
  'deltas': [0.35874509811401367,
   0.33842945098876953,
   0.4434845447540283,
   0.3476574420928955,
   0.42641258239746094,
   0.42685532569885254,
   1.6815605163574219,
   0.39925074577331543,
   0.3569071292877197,
   0.48821520805358887]},
 'speecht5': {'report': {'model_name': 'speecht5',
   'benchmark_synthesis_time_s': 0.07980227470397949,
   'model_size_mb': 550.9716110229492,
   'cpu_load_percent': 13.0,
   'ram_usage_mb': 1500.73046875,
   'gpu_usage_mb': 626.12109375,
   'num_runs': 10,
   'mean_runtim