# Домашнее задание №2: Синтез речи


In [None]:
%pip install piper-tts
%pip install tensorboard
%pip install onnx
%pip install onnxruntime


# Импорты


In [None]:
import os
import json
import glob
import random
import re
from pathlib import Path
from typing import List, Dict, Tuple

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

import librosa
import soundfile as sf
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Seed the Python, NumPy and PyTorch random generators with the same fixed value.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)


# Подготовка данных


In [None]:
# Location of the source corpus and the root folder for everything written here.
DATA_PATH = "/share/audio_data/sova/ytub/raid/nanosemantics/nextcloud/sova_done"
OUTPUT_DIR = "./tts_data"

# One sub-folder per split, all rooted at OUTPUT_DIR.
TRAIN_DIR, VAL_DIR, TEST_DIR = (
    os.path.join(OUTPUT_DIR, split_name) for split_name in ("train", "val", "test")
)

# Create the split folders up front; exist_ok makes re-running the cell harmless.
for _split_dir in (TRAIN_DIR, VAL_DIR, TEST_DIR):
    os.makedirs(_split_dir, exist_ok=True)

# Target sample rate (Hz) and the accepted clip duration window (seconds).
SAMPLE_RATE = 16000
MIN_DURATION, MAX_DURATION = 0.5, 10.0


In [None]:
def normalize_text(text):
    """Normalize a transcript for TTS training.

    The text is lower-cased, every character other than Russian/Latin
    letters, digits and whitespace is removed, and runs of whitespace are
    collapsed to a single space.

    Args:
        text: Raw transcript string.

    Returns:
        The normalized string without leading or trailing whitespace
        (possibly empty).

    >>> normalize_text("Привет - мир!")
    'привет мир'
    """
    text = text.lower()
    # Strip disallowed characters BEFORE collapsing whitespace: removing a
    # free-standing symbol such as " - " would otherwise leave a double space.
    text = re.sub(r'[^а-яёa-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def load_audio_info(audio_path):
    """Decode an audio file and report its length.

    The file is loaded with ``sr=None`` (no resampling requested).

    Returns:
        ``(duration, sample_rate, samples)`` where ``duration`` is the number
        of samples divided by the sample rate. If anything goes wrong the
        error is printed and ``(None, None, None)`` is returned instead.
    """
    failure = (None, None, None)
    try:
        samples, native_rate = librosa.load(audio_path, sr=None)
        return len(samples) / native_rate, native_rate, samples
    except Exception as err:
        print(f"Ошибка при загрузке {audio_path}: {err}")
        return failure

def prepare_tts_dataset(base_path, parts=('part_0', 'part_1', 'part_2'), max_files=None):
    """Collect (audio, transcript) pairs from the corpus into memory.

    Audio files (wav/mp3/flac/ogg) are searched recursively under each
    ``base_path/<part>`` folder. A clip is kept when a sibling ``.txt`` file
    with the same stem exists, its normalized transcript is non-empty and its
    duration lies within ``[MIN_DURATION, MAX_DURATION]``. Kept clips are
    resampled to ``SAMPLE_RATE`` when their rate differs.

    Args:
        base_path: Root directory of the corpus.
        parts: Iterable of sub-directory names to scan; missing ones are
            reported and skipped.
        max_files: If truthy, only the first ``max_files`` discovered audio
            files (in sorted path order) are processed.

    Returns:
        A list of dicts with keys ``"audio"`` (waveform), ``"text"``
        (normalized transcript), ``"duration"`` (seconds, measured before
        resampling) and ``"path"`` (source audio path). All waveforms are
        held in memory.
    """
    audio_extensions = ('*.wav', '*.mp3', '*.flac', '*.ogg')
    audio_files = []

    for part in parts:
        part_path = os.path.join(base_path, part)
        if not os.path.exists(part_path):
            print(f"Предупреждение: часть {part} не найдена")
            continue
        for ext in audio_extensions:
            audio_files.extend(glob.glob(os.path.join(part_path, '**', ext), recursive=True))

    # glob returns paths in arbitrary order; sorting makes both the max_files
    # cut below and any seeded shuffle done by the caller reproducible.
    audio_files.sort()

    if max_files:
        audio_files = audio_files[:max_files]

    print(f"Найдено {len(audio_files)} аудиофайлов")
    print("Начинаю обработку...")

    dataset = []
    for audio_path in tqdm(audio_files, desc="Подготовка данных", unit="файл"):
        # Cheap transcript checks come first so that clips without usable
        # text are never decoded.
        text_path = audio_path.rsplit('.', 1)[0] + '.txt'
        if not os.path.exists(text_path):
            continue

        with open(text_path, 'r', encoding='utf-8') as f:
            text = normalize_text(f.read())
        if not text:
            continue

        duration, sr, y = load_audio_info(audio_path)
        if duration is None:
            continue
        if duration < MIN_DURATION or duration > MAX_DURATION:
            continue

        if sr != SAMPLE_RATE:
            y = librosa.resample(y, orig_sr=sr, target_sr=SAMPLE_RATE)

        dataset.append({
            "audio": y,
            "text": text,
            "duration": duration,
            "path": audio_path
        })

    print(f"Подготовлено {len(dataset)} записей")
    return dataset


In [None]:
# Load the whole corpus into memory, then shuffle it with the seeded RNG.
dataset = prepare_tts_dataset(DATA_PATH, parts=['part_0', 'part_1', 'part_2'])

random.shuffle(dataset)

# 80% train, 10% validation, the remainder (including rounding leftovers) test.
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))

train_data = dataset[:train_size]
val_data = dataset[train_size:train_size + val_size]
test_data = dataset[train_size + val_size:]

print("Разделение данных:")
print(f"  Train: {len(train_data)} записей")
print(f"  Val: {len(val_data)} записей")
print(f"  Test: {len(test_data)} записей")

durations = [item['duration'] for item in dataset]
print("\nСтатистика по длительности:")
if durations:
    print(f"  Минимум: {min(durations):.2f} сек")
    print(f"  Максимум: {max(durations):.2f} сек")
    print(f"  Среднее: {np.mean(durations):.2f} сек")
    print(f"  Медиана: {np.median(durations):.2f} сек")
else:
    # min()/max() raise ValueError on an empty sequence; report it instead.
    print("  Нет данных: датасет пуст")


# Сохранение подготовленных данных


In [None]:
def save_dataset(data, output_dir):
    """Write one split to disk as WAV files plus a JSON manifest.

    Each item's ``'audio'`` is written to ``audio_<index>.wav`` inside
    ``output_dir`` at ``SAMPLE_RATE``. The manifest lists, per clip, the
    written file path together with the item's ``'text'`` and ``'duration'``.

    Args:
        data: Sequence of dicts with ``'audio'``, ``'text'`` and ``'duration'``.
        output_dir: Destination folder (created if missing).

    Returns:
        Path of the written ``manifest.json``.
    """
    os.makedirs(output_dir, exist_ok=True)
    entries = []

    progress = tqdm(data, desc="Сохранение", unit="файл")
    for index, record in enumerate(progress):
        wav_path = os.path.join(output_dir, f"audio_{index:06d}.wav")
        sf.write(wav_path, record['audio'], SAMPLE_RATE)
        entries.append(dict(
            audio_filepath=wav_path,
            text=record['text'],
            duration=record['duration'],
        ))

    manifest_path = os.path.join(output_dir, "manifest.json")
    with open(manifest_path, 'w', encoding='utf-8') as manifest_file:
        json.dump(entries, manifest_file, ensure_ascii=False, indent=2)

    return manifest_path

# Persist each split into its own folder and keep the manifest paths around.
train_manifest = save_dataset(data=train_data, output_dir=TRAIN_DIR)
val_manifest = save_dataset(data=val_data, output_dir=VAL_DIR)
test_manifest = save_dataset(data=test_data, output_dir=TEST_DIR)

# Report where every manifest ended up.
print(f"\nМанифесты сохранены:")
for _label, _manifest in (
    ("Train", train_manifest),
    ("Val", val_manifest),
    ("Test", test_manifest),
):
    print(f"  {_label}: {_manifest}")


# Настройка Piper TTS


In [None]:
import subprocess
import sys

def install_piper():
    """Install Piper TTS into the running interpreter's environment if absent.

    The presence check and the installation both go through
    ``sys.executable -m pip`` so they target the same environment as the
    running kernel (a bare ``pip`` on PATH may belong to another Python).
    Failures are reported on stdout and never raised.
    """
    try:
        result = subprocess.run(
            [sys.executable, '-m', 'pip', 'show', 'piper-tts'],
            capture_output=True,
            text=True,
        )
        if result.returncode == 0:
            print("Piper TTS уже установлен")
            return

        print("Установка Piper TTS...")
        subprocess.run(
            [sys.executable, '-m', 'pip', 'install',
             'git+https://github.com/OHF-Voice/piper1-gpl.git'],
            check=True,
        )
    except (OSError, subprocess.SubprocessError) as e:
        # OSError: interpreter could not be spawned; SubprocessError covers
        # the CalledProcessError raised by check=True when pip fails.
        print(f"Ошибка при установке: {e}")

# Make sure Piper is installed before the `piper` imports that follow.
install_piper()

from piper import PiperVoice
from piper.download import ensure_voice_exists, find_voice


In [None]:
# Folders for downloaded/exported models and for training checkpoints.
MODEL_DIR, CHECKPOINT_DIR = (
    os.path.join(OUTPUT_DIR, sub_dir) for sub_dir in ("models", "checkpoints")
)
for _dir in (MODEL_DIR, CHECKPOINT_DIR):
    os.makedirs(_dir, exist_ok=True)

voice_name = "ru_RU-dmitri-medium"
print(f"Используем базовый чекпоинт: {voice_name}")

# Best effort: fetch the pretrained voice; on any failure continue without it.
# NOTE(review): the argument lists of `ensure_voice_exists` / `find_voice` and
# the shape of `find_voice`'s return value depend on the installed Piper
# version — confirm them against the package actually in use.
try:
    ensure_voice_exists(voice_name, MODEL_DIR)
    voice_path = find_voice(voice_name, [MODEL_DIR])
    print(f"Чекпоинт найден: {voice_path}")
except Exception as err:
    print(f"Ошибка при загрузке чекпоинта: {err}")
    print("Продолжаем без предобученного чекпоинта")
    voice_path = None


# Обучение модели


In [None]:
from piper.train import train

# TensorBoard event files for this run go under OUTPUT_DIR/logs.
log_dir = os.path.join(OUTPUT_DIR, "logs")
os.makedirs(log_dir, exist_ok=True)

writer = SummaryWriter(log_dir=log_dir)

print("Настройка обучения...")

# Training configuration, grouped by concern and serialized to JSON below.
# NOTE(review): this schema is presumably what the training entry point
# expects — confirm the key names against the Piper training documentation.
config = dict(
    dataset=dict(
        train=train_manifest,
        val=val_manifest,
    ),
    model=dict(
        speaker_embedding_dim=256,
        num_mels=80,
        sample_rate=SAMPLE_RATE,
    ),
    training=dict(
        batch_size=16,
        learning_rate=1e-4,
        num_epochs=100,
        checkpoint_interval=10,
        log_interval=100,
    ),
    output=dict(
        checkpoint_dir=CHECKPOINT_DIR,
        model_dir=MODEL_DIR,
    ),
)

config_path = os.path.join(OUTPUT_DIR, "config.json")
with open(config_path, 'w', encoding='utf-8') as config_file:
    json.dump(config, config_file, indent=2, ensure_ascii=False)

print(f"Конфигурация сохранена: {config_path}")


In [None]:
# Announce the run and show the command-line equivalent.
print(
    "Запуск обучения...",
    "Примечание: Для полного обучения используйте команду:",
    f"piper-train --config {config_path}",
    sep="\n",
)

# Start training, resuming from the base voice when one was found.
# NOTE(review): confirm that `piper.train.train` accepts these keyword
# arguments and that `voice_path` is a format it can resume from.
train(
    resume_from=voice_path,
    checkpoint_dir=CHECKPOINT_DIR,
    config_path=config_path,
)


# Визуализация метрик обучения


In [None]:
def plot_training_metrics(log_dir):
    """Plot loss curves recorded in TensorBoard event files.

    Two panels are drawn side by side: total loss (``train/loss``,
    ``val/loss``) and mel loss (``train/mel_loss``, ``val/mel_loss``). Tags
    missing from the logs are skipped. The figure is saved as
    ``training_metrics.png`` in ``OUTPUT_DIR`` and then shown.

    Args:
        log_dir: Directory containing the TensorBoard event files.

    Returns:
        None. Any error (missing tensorboard, unreadable logs, plotting
        failure) is printed rather than raised.
    """
    # (y-axis label, panel title, ((scalar tag, legend label), ...))
    panels = (
        ('Loss', 'Training and Validation Loss',
         (('train/loss', 'Train Loss'), ('val/loss', 'Val Loss'))),
        ('Mel Loss', 'Mel Spectrogram Loss',
         (('train/mel_loss', 'Train Mel Loss'), ('val/mel_loss', 'Val Mel Loss'))),
    )

    try:
        from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

        event_acc = EventAccumulator(log_dir)
        event_acc.Reload()
        scalar_tags = event_acc.Tags()['scalars']

        # One row with exactly as many axes as panels, so no blank subplots
        # end up in the saved figure.
        fig, axes = plt.subplots(1, len(panels), figsize=(15, 5))

        for ax, (ylabel, title, series) in zip(axes, panels):
            plotted = False
            for tag, label in series:
                if tag not in scalar_tags:
                    continue
                events = event_acc.Scalars(tag)
                ax.plot([s.step for s in events], [s.value for s in events], label=label)
                plotted = True

            ax.set_xlabel('Step')
            ax.set_ylabel(ylabel)
            ax.set_title(title)
            if plotted:
                # legend() with no labelled curves only produces a warning.
                ax.legend()
            ax.grid(True)

        fig.tight_layout()
        fig.savefig(os.path.join(OUTPUT_DIR, 'training_metrics.png'), dpi=300)
        plt.show()

    except Exception as e:
        print(f"Ошибка при загрузке метрик: {e}")

# Render and save the curves for the log directory of this run.
plot_training_metrics(log_dir)


# Загрузка обученной модели и генерация примеров


In [None]:
# Checkpoint preference: an explicit "best" file, else the most recently
# modified *.pt in CHECKPOINT_DIR, else the base voice.
best_checkpoint = os.path.join(CHECKPOINT_DIR, "best_model.pt")
if not os.path.exists(best_checkpoint):
    checkpoints = glob.glob(os.path.join(CHECKPOINT_DIR, "*.pt"))
    if not checkpoints:
        print("Чекпоинты не найдены, используем базовую модель")
        best_checkpoint = voice_path
    else:
        best_checkpoint = max(checkpoints, key=os.path.getmtime)
        print(f"Используем последний чекпоинт: {best_checkpoint}")

# NOTE(review): confirm that `PiperVoice.load` can read the checkpoint format
# produced by training (the lookup above targets *.pt files).
if not best_checkpoint or not os.path.exists(best_checkpoint):
    print("Используем базовую модель")
    if voice_path:
        voice = PiperVoice.load(voice_path)
    else:
        voice = None
else:
    voice = PiperVoice.load(best_checkpoint)
    print("Модель загружена")


In [None]:
EXAMPLES_DIR = os.path.join(OUTPUT_DIR, "examples")
os.makedirs(EXAMPLES_DIR, exist_ok=True)

# Three fixed prompts plus one transcript from the test split (or a fallback
# sentence when the split is empty).
test_texts = [
    "Привет, как дела?",
    "Распознавание и синтез речи это интересная область.",
    "Сегодня хорошая погода.",
]
test_texts.append(test_data[0]['text'] if test_data else "Тестовый текст для синтеза речи.")

print("Генерация примеров...")
for i, text in enumerate(test_texts):
    if not voice:
        print(f"Модель не загружена, пропускаем: {text[:50]}...")
        continue

    # NOTE(review): clips are written at SAMPLE_RATE — confirm this matches the
    # rate the loaded voice produces and that `synthesize` returns data that
    # `sf.write` accepts.
    audio = voice.synthesize(text)
    output_path = os.path.join(EXAMPLES_DIR, f"example_{i:02d}.wav")
    sf.write(output_path, audio, SAMPLE_RATE)
    print(f"Сохранено: {output_path} - {text[:50]}...")


# Экспорт модели в ONNX


In [None]:
# Export is attempted only when a voice is loaded and exposes a `.model`.
if not voice or not hasattr(voice, 'model'):
    print("Модель не доступна для экспорта")
else:
    onnx_path = os.path.join(MODEL_DIR, "model.onnx")

    try:
        # NOTE(review): the tracing input is a random float tensor of shape
        # (1, 100) — confirm this is the input type/shape the model expects.
        dummy_input = torch.randn(1, 100)
        torch.onnx.export(
            voice.model,
            dummy_input,
            onnx_path,
            opset_version=11,
            input_names=['text'],
            output_names=['audio'],
            dynamic_axes={'text': {0: 'batch'}, 'audio': {0: 'batch'}},
        )
        print(f"Модель экспортирована в ONNX: {onnx_path}")
    except Exception as e:
        print(f"Ошибка при экспорте в ONNX: {e}")
        print("Возможно, требуется дополнительная настройка модели")


# Вычисление метрик качества


In [None]:
def calculate_wer(true_text, predicted_text):
    """Compute the word error rate of ``predicted_text`` against ``true_text``.

    ``jiwer.wer`` is used when the package is installed. Otherwise a built-in
    fallback computes the word-level Levenshtein distance (substitutions,
    deletions and insertions) on lower-cased text, divided by the number of
    reference words.

    Args:
        true_text: Reference transcript.
        predicted_text: Hypothesis transcript.

    Returns:
        WER as a float (may exceed 1.0). For an empty reference: 0.0 when the
        hypothesis is empty too, 1.0 otherwise.
    """
    # Settle the empty-reference case up front so the result does not depend
    # on how the optional backend treats it.
    if not true_text.split():
        return 1.0 if predicted_text.split() else 0.0

    # Only a missing package selects the fallback; real errors from jiwer
    # are no longer masked.
    try:
        from jiwer import wer
    except ImportError:
        wer = None
    if wer is not None:
        return wer(true_text, predicted_text)

    reference = true_text.lower().split()
    hypothesis = predicted_text.lower().split()

    # Two-row dynamic programme: previous[j] is the edit distance between the
    # reference words consumed so far and the first j hypothesis words.
    previous = list(range(len(hypothesis) + 1))
    for i, ref_word in enumerate(reference, start=1):
        current = [i]
        for j, hyp_word in enumerate(hypothesis, start=1):
            substitution = previous[j - 1] + (ref_word != hyp_word)
            current.append(min(previous[j] + 1, current[j - 1] + 1, substitution))
        previous = current

    return previous[-1] / len(reference)

# Cache for the speaker encoder so it is loaded at most once per session.
_SPEAKER_ENCODER = None


def _get_speaker_encoder():
    """Return the ECAPA speaker encoder, loading it on first use."""
    global _SPEAKER_ENCODER
    if _SPEAKER_ENCODER is None:
        from speechbrain.inference.speaker import EncoderClassifier
        _SPEAKER_ENCODER = EncoderClassifier.from_hparams(
            source="speechbrain/spkrec-ecapa-voxceleb",
            savedir="pretrained_models/spkrec-ecapa-voxceleb"
        )
    return _SPEAKER_ENCODER


def calculate_speaker_similarity(audio1, audio2):
    """Cosine similarity between the speaker embeddings of two waveforms.

    Args:
        audio1: 1-D sequence/array of audio samples.
        audio2: 1-D sequence/array of audio samples.

    Returns:
        The cosine similarity as a float, or 0.0 when the embedding could not
        be computed (the reason is printed).

    NOTE(review): both inputs are presumably expected at the sample rate the
    pretrained encoder was trained on — confirm before comparing audio from
    different sources.
    """
    try:
        classifier = _get_speaker_encoder()

        embeddings = []
        for audio in (audio1, audio2):
            # float32 avoids a dtype mismatch when samples arrive as float64.
            waveform = torch.as_tensor(audio, dtype=torch.float32).unsqueeze(0)
            # Flatten to a plain vector: encode_batch output carries extra
            # leading axes (presumably (batch, 1, dim) — TODO confirm), and
            # the similarity must run over the embedding axis.
            embeddings.append(classifier.encode_batch(waveform).flatten())

        similarity = torch.nn.functional.cosine_similarity(
            embeddings[0], embeddings[1], dim=0
        )
        return similarity.item()
    except Exception as e:
        # Best effort by design: report the failure and return 0.0.
        print(f"Ошибка при вычислении speaker similarity: {e}")
        return 0.0

def evaluate_synthesis(voice, samples, transcribe=None):
    """Synthesize each sample's text and score it against the reference.

    Args:
        voice: Loaded voice exposing ``synthesize(text)``; when falsy nothing
            is evaluated.
        samples: Iterable of dicts with ``'text'`` and ``'audio'``.
        transcribe: Optional callable mapping synthesized audio to recognized
            text. WER is only meaningful when the hypothesis comes from
            recognizing the synthesized audio, so without this callable no
            WER is collected.

    Returns:
        ``(wers, similarities)`` — two lists of floats.
    """
    wer_scores, similarity_scores = [], []
    if not voice:
        return wer_scores, similarity_scores

    for i, item in enumerate(tqdm(samples, desc="Оценка качества", unit="пример")):
        true_text = item['text']
        reference_audio = item['audio']

        try:
            synthesized_audio = voice.synthesize(true_text)

            if transcribe is not None:
                recognized_text = transcribe(synthesized_audio)
                wer_scores.append(calculate_wer(true_text, recognized_text))

            if len(synthesized_audio) > 0 and len(reference_audio) > 0:
                # Compare equally long prefixes of both signals.
                min_len = min(len(synthesized_audio), len(reference_audio))
                similarity_scores.append(calculate_speaker_similarity(
                    synthesized_audio[:min_len],
                    reference_audio[:min_len]
                ))
        except Exception as e:
            print(f"Ошибка при обработке примера {i}: {e}")

    return wer_scores, similarity_scores


print("Вычисление метрик на 100 примерах...")
# Slicing already copes with splits shorter than 100 items.
sample_data = test_data[:100]

# Optional ASR hook: define `asr_transcribe(audio) -> str` in an earlier cell
# to enable WER. Comparing the reference text with itself would always give 0.
asr_transcribe = globals().get("asr_transcribe")

wers, similarities = evaluate_synthesis(voice, sample_data, transcribe=asr_transcribe)

if wers:
    print(f"\nРезультаты WER:")
    print(f"  Средний: {np.mean(wers):.4f}")
    print(f"  Медианный: {np.median(wers):.4f}")
elif voice and asr_transcribe is None:
    print("\nWER не вычислен: не задана функция распознавания речи (asr_transcribe)")

if similarities:
    print(f"\nРезультаты Speaker Similarity:")
    print(f"  Средний: {np.mean(similarities):.4f}")
    print(f"  Медианный: {np.median(similarities):.4f}")


In [None]:
# Store the model weights together with the training config in one file.
final_model_path = os.path.join(MODEL_DIR, "final_model.pt")
if not voice or not hasattr(voice, 'model'):
    print("Модель не доступна для сохранения")
else:
    torch.save(
        dict(model_state_dict=voice.model.state_dict(), config=config),
        final_model_path,
    )
    print(f"Финальная модель сохранена: {final_model_path}")
