# ДЗ 5. Обучение поющей speech-to-speech модели so-VITS-svc
### Пухкало Влада

## Подготовка данных

In [None]:
!pip install numpy
!pip install librosa
!pip install soundfile

In [None]:
# Fetch the audio-slicer repository; the slicing cell below imports Slicer from its slicer2 module.
!git clone https://github.com/openvpi/audio-slicer.git

In [3]:
import sys
import os

# Enter the cloned audio-slicer checkout ...
%cd /content/audio-slicer

# ... and put that directory on sys.path so later cells can import modules from it.
sys.path.append(os.getcwd())

/content/audio-slicer


In [None]:
import os
import librosa
import soundfile as sf
from slicer2 import Slicer

# Destination for the sliced clips; created up front so writing never hits a missing folder.
output_dir = '/content/dataset_raw/skrip'
os.makedirs(output_dir, exist_ok=True)

# Folder holding the source vocal recordings.
vocal_scrip_dir = '/content/vocal_scrip'

# Running index that gives every chunk a unique, zero-padded file name.
file_counter = 1

# Sort the listing: os.listdir() returns entries in arbitrary order, which would
# make the chunk numbering differ from one run to the next.
for filename in sorted(os.listdir(vocal_scrip_dir)):
    # Compare the extension case-insensitively so files named '*.WAV' are not skipped.
    if not filename.lower().endswith('.wav'):
        continue

    file_path = os.path.join(vocal_scrip_dir, filename)

    # sr=None keeps the file's native sample rate; mono=False keeps all channels.
    audio, sr = librosa.load(file_path, sr=None, mono=False)

    # Built per file because the sample rate is taken from the file just loaded.
    # NOTE(review): length/interval values are presumably milliseconds, per the
    # audio-slicer README — confirm.
    slicer = Slicer(
        sr=sr,
        threshold=-40,
        min_length=15000,
        min_interval=100,
        hop_size=10,
        max_sil_kept=500
    )
    chunks = slicer.slice(audio)

    # Write every chunk under the next free sequential name.
    for chunk in chunks:
        if len(chunk.shape) > 1:
            chunk = chunk.T  # Swap axes if the audio is stereo.
        output_path = os.path.join(output_dir, f'{file_counter:06}.wav')
        print(f"Saving chunk to {output_path}")
        sf.write(output_path, chunk, sr)
        file_counter += 1

In [5]:
# Return to /content, clone the whisper-vits-svc repository and make it the working
# directory, so the relative paths used by the following cells resolve inside the checkout.
%cd /content
!git clone https://github.com/PlayVoice/whisper-vits-svc.git
%cd whisper-vits-svc

/content
Cloning into 'whisper-vits-svc'...
remote: Enumerating objects: 2389, done.[K
remote: Counting objects: 100% (1127/1127), done.[K
remote: Compressing objects: 100% (413/413), done.[K
remote: Total 2389 (delta 803), reused 901 (delta 711), pack-reused 1262[K
Receiving objects: 100% (2389/2389), 42.35 MiB | 36.97 MiB/s, done.
Resolving deltas: 100% (1343/1343), done.
/content/whisper-vits-svc


In [None]:
# Step 1: Import PyTorch. This only checks that torch is importable; it does not install it.
import torch

# Step 2: Install project dependencies (resolved through the PyPI mirror passed with -i)
!pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt

# Step 3: Download and place models in the correct directories

# Timbre Encoder
# Disabled here: the next cell downloads this same Google Drive file with gdown instead.
# !mkdir -p speaker_pretrain
# !wget -O speaker_pretrain/best_model.pth.tar https://drive.google.com/uc?id=1UPjQ2LVSIt3o-9QMKMJcdzT8aZRZCI-E


# Whisper model (large-v2 checkpoint), saved to whisper_pretrain/
!mkdir -p whisper_pretrain
!wget -O whisper_pretrain/large-v2.pt https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt

# HuBERT model (hubert-soft checkpoint), saved to hubert_pretrain/
!mkdir -p hubert_pretrain
!wget -O hubert_pretrain/hubert-soft-0d54a1f4.pt https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt

# Pitch extractor (torchcrepe "full" weights), saved to crepe/assets/
!mkdir -p crepe/assets
!wget -O crepe/assets/full.pth https://github.com/maxrmorrison/torchcrepe/raw/master/torchcrepe/assets/full.pth

In [None]:
!mkdir -p speaker_pretrain
# !wget -O speaker_pretrain/best_model.pth.tar https://drive.google.com/uc?id=1UPjQ2LVSIt3o-9QMKMJcdzT8aZRZCI-E

!pip install gdown
!gdown --id 1UPjQ2LVSIt3o-9QMKMJcdzT8aZRZCI-E -O speaker_pretrain/best_model.pth.tar

!file speaker_pretrain/best_model.pth.tar

In [None]:
# Pretrained model: download the sovits5.0.pretrain.pth release asset into vits_pretrain/
!mkdir -p vits_pretrain
!wget -O vits_pretrain/sovits5.0.pretrain.pth https://github.com/PlayVoice/whisper-vits-svc/releases/download/5.0/sovits5.0.pretrain.pth

In [9]:
# Smoke test before any training: run inference on test.wav (expected in the working
# directory) with the downloaded pretrained weights and the singer0001 speaker file from configs/singers.
!python svc_inference.py --config configs/base.yaml --model ./vits_pretrain/sovits5.0.pretrain.pth --spk ./configs/singers/singer0001.npy --wave test.wav

Auto run : python whisper/inference.py -w test.wav -p svc_tmp.ppg.npy
test.wav
svc_tmp.ppg.npy
Auto run : python hubert/inference.py -w test.wav -v svc_tmp.vec.npy
test.wav
svc_tmp.vec.npy
Auto run : python pitch/inference.py -w test.wav -p svc_tmp.pit.csv
test.wav
svc_tmp.pit.csv
  return F.conv2d(input, weight, bias, self.stride,
  return F.conv2d(input, weight, bias, self.stride,
INFO:__main__:infer without retrival
pitch shift:  0
  return F.conv1d(input, weight, bias, self.stride,


## Обучение модели

In [None]:
# Move the sliced dataset into the repository checkout; the preprocessing commands
# in the next cell read it from ./dataset_raw.
!mv /content/dataset_raw /content/whisper-vits-svc
%cd /content/whisper-vits-svc
# Run the project's preprocessing entry point (-t 2 is presumably the worker/thread count — confirm in svc_preprocessing.py).
# NOTE(review): the next cell also invokes the individual prepare/ scripts; check whether both are needed.
!python svc_preprocessing.py -t 2

In [11]:
# Resample the raw audio into 16 kHz and 32 kHz copies
!python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-16k -s 16000
!python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-32k -s 32000

# Extract pitch (crepe) from the 16 kHz audio
!python prepare/preprocess_crepe.py -w data_svc/waves-16k/ -p data_svc/pitch

# Extract PPG features from the 16 kHz audio (written to data_svc/whisper)
!python prepare/preprocess_ppg.py -w data_svc/waves-16k/ -p data_svc/whisper

# Extract HuBERT features from the 16 kHz audio
!python prepare/preprocess_hubert.py -w data_svc/waves-16k/ -v data_svc/hubert

# Extract the per-file timbre (speaker) code
!python prepare/preprocess_speaker.py data_svc/waves-16k/ data_svc/speaker

# Average the timbre codes (written to data_svc/singer)
!python prepare/preprocess_speaker_ave.py data_svc/speaker/ data_svc/singer

# Extract the linear spectrogram from the 32 kHz audio
!python prepare/preprocess_spec.py -w data_svc/waves-32k/ -s data_svc/specs

# Generate the training index
!python prepare/preprocess_train.py

# Debug/check the training files
!python prepare/preprocess_zzz.py

./dataset_raw
./data_svc/waves-16k
16000
Processing 16000 skrip: 100% 72/72 [00:04<00:00, 17.95it/s]
./dataset_raw
./data_svc/waves-32k
32000
Processing 32000 skrip: 100% 72/72 [00:05<00:00, 13.00it/s]
data_svc/waves-16k/
data_svc/pitch
  return F.conv2d(input, weight, bias, self.stride,
Processing crepe skrip: 100% 72/72 [03:21<00:00,  2.80s/it]
data_svc/waves-16k/
data_svc/whisper
ModelDimensions(n_mels=80, n_audio_ctx=1500, n_audio_state=1280, n_audio_head=20, n_audio_layer=32, n_vocab=51865, n_text_ctx=448, n_text_state=1280, n_text_head=20, n_text_layer=32)
Processing ppg skrip: 100% 72/72 [00:00<00:00, 203497.23it/s]
data_svc/waves-16k/
data_svc/hubert
Processing vec skrip: 100% 72/72 [00:06<00:00, 11.02it/s]
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.98
 | > gri

In [12]:
# Move the pretrained checkpoint from vits_pretrain/ to the repository root; the
# config cell below sets `pretrain` to this bare file name.
current_path = '/content/whisper-vits-svc/vits_pretrain/sovits5.0.pretrain.pth'
target_path = '/content/whisper-vits-svc/sovits5.0.pretrain.pth'

!mv {current_path} {target_path}

In [14]:
import yaml

# Training config consumed by svc_trainer.py / svc_inference.py in the cells below.
config_file_path = '/content/whisper-vits-svc/configs/base.yaml'

# File name of the pretrained checkpoint, relative to the repository root.
new_pretrain_value = "sovits5.0.pretrain.pth"

# Explicit encoding so reading/writing does not depend on the platform default.
with open(config_file_path, 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# Overrides applied on top of the values shipped in base.yaml.
config['pretrain'] = new_pretrain_value
config['train']['batch_size'] = 6
config['train']['epochs'] = 50

# sort_keys=False keeps the sections in their original order; by default PyYAML
# sorts keys alphabetically, which reshuffles the whole file on every rewrite.
# Note: round-tripping through PyYAML drops any comments present in the file.
with open(config_file_path, 'w', encoding='utf-8') as file:
    yaml.safe_dump(config, file, sort_keys=False)

print(f"Файл {config_file_path} успешно обновлен.")

Файл /content/whisper-vits-svc/configs/base.yaml успешно обновлен.


In [15]:
# Start training with the updated config; -n sovits5.0 is presumably the run name
# (the checkpoint used in later cells lives under chkpt/sovits5.0/).
!python svc_trainer.py -c configs/base.yaml -n sovits5.0

2024-06-18 10:23:52.331659: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-18 10:23:52.331708: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-18 10:23:52.469788: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-18 10:23:52.730073: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Batch size per GPU : 6
----------10----------
2024-06

In [17]:
# Launch the trainer again, passing the sovits5.0_0045.pt checkpoint via -p
# (presumably to resume training from it — confirm in svc_trainer.py).
!python svc_trainer.py -c configs/base.yaml -n sovits5.0 -p chkpt/sovits5.0/sovits5.0_0045.pt

2024-06-18 10:39:48.593483: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-18 10:39:48.593537: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-18 10:39:48.594872: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-18 10:39:48.602110: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Batch size per GPU : 6
----------10----------
2024-06

## Инференс

In [18]:
# Export the sovits5.0_0045.pt training checkpoint for inference
# (presumably this produces the sovits5.0.pth file loaded by the next cell — confirm in svc_export.py).
!python svc_export.py --config configs/base.yaml --checkpoint_path chkpt/sovits5.0/sovits5.0_0045.pt



In [20]:
# Convert test.wav with the exported model and the averaged speaker code of the
# 'skrip' dataset (data_svc/singer); --shift 0 requests a pitch shift of 0.
!python svc_inference.py --config configs/base.yaml --model sovits5.0.pth --spk ./data_svc/singer/skrip.spk.npy --wave test.wav --shift 0

Auto run : python whisper/inference.py -w test.wav -p svc_tmp.ppg.npy
test.wav
svc_tmp.ppg.npy
Auto run : python hubert/inference.py -w test.wav -v svc_tmp.vec.npy
test.wav
svc_tmp.vec.npy
Auto run : python pitch/inference.py -w test.wav -p svc_tmp.pit.csv
test.wav
svc_tmp.pit.csv
  return F.conv2d(input, weight, bias, self.stride,
INFO:__main__:infer without retrival
pitch shift:  0
  return F.conv1d(input, weight, bias, self.stride,
