In [1]:
from pathlib import Path
from typing import Tuple, Union
from tqdm.notebook import tqdm
import numpy as np
from models.embed.inference import embed_utterance
from models.embed.inference import is_loaded as is_embed_loaded
from models.embed.inference import load_model as load_embed_model
from models.embed.audio import preprocess_wav
from utils.f0_utils import compute_f0, compute_mean_std, f02lf0

In [2]:
data_root = r"E:\PKU\Lecture_AI\aidatatang_200zh\aidatatang_200zh\corpus\dev"
files = sorted(Path(data_root).rglob("*.wav"))
print(f"Found {len(files)} files.")

Found 24216 files.


In [3]:
load_embed_model(Path(r"E:\PKU\Lecture_AI\TTL-VC\pre_trained\embed\pretrained.pt"))

Loaded encoder "pretrained.pt" trained to step 1594501


SpeakerEncoder(
  (lstm): LSTM(40, 256, num_layers=3, batch_first=True)
  (linear): Linear(in_features=256, out_features=256, bias=True)
  (relu): ReLU()
  (loss_fn): CrossEntropyLoss()
)

In [4]:
def speaker_embed(wav: Union[str, Path, np.ndarray]) -> Tuple[np.ndarray, np.ndarray]:
    """说话人风格嵌入。"""
    if not is_embed_loaded():
        raise RuntimeError("请先初始化 Embed 模型。")
    wav_preprocessed = preprocess_wav(wav)
    return wav_preprocessed, embed_utterance(wav_preprocessed)

In [5]:
embeds = np.empty((len(files), 256), dtype=np.float32)
lf0_mean_stds = np.empty((len(files), 2), dtype=np.float32)

for i, file in enumerate(tqdm(files)):
    wav, embed = speaker_embed(file)
    ref_lf0_mean, ref_lf0_std = compute_mean_std(f02lf0(compute_f0(wav)))
    embeds[i] = embed
    lf0_mean_stds[i] = ref_lf0_mean, ref_lf0_std

    

  0%|          | 0/24216 [00:00<?, ?it/s]

FileNotFoundError: [Errno 2] No such file or directory: 'preprocessed/embeds.npy'

In [6]:
np.save("preprocessed/embeds.npy", embeds, allow_pickle=False)
np.save("preprocessed/lf0_mean_stds.npy", lf0_mean_stds, allow_pickle=False)