## Configure Colab

- 找到上方菜单栏“代码执行程序”——>弹窗底部“更改运行时类型”，选择GPU作为硬件加速器



- 防止断开连接：按住 Ctrl+Shift 再按下 I 呼出弹窗，于控制台内输入以下内容并回车：
```javascript
// Click Colab's "Connect" button; scheduled once a minute below to keep the
// session from disconnecting.
function ConnectButton()
{
    // Resolve the button step by step: if any element is missing (for example
    // because the page structure differs), report it instead of throwing a
    // TypeError on every tick.
    const host = document.querySelector("#top-toolbar > colab-connect-button");
    const button = host && host.shadowRoot && host.shadowRoot.querySelector("#connect");
    if (!button) {
        console.warn("Connect button not found");
        return;
    }
    button.click();
    console.log("Connect pushed");
}
// 60000 ms = one click attempt per minute.
setInterval(ConnectButton, 60000);
```



## Mount Google Drive for Colab

In [None]:
from google.colab import drive
drive.mount('/content/drive')

## Setup Environment for Windows

### Step1.（无Conda则直接执行第三步）

### Step2.

切换Jupyter当前的内核到虚拟环境的内核

### Step3.

In [None]:
# 升级pip版本
!python -m pip --default-timeout=900 install --upgrade pip
# 安装pytorch（需从官网复制命令）
!pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117
# 安装项目依赖
!pip install -r requirements.txt

## Setup Environment for Linux

In [None]:
!apt-get update``
!sudo apt install portaudio19-dev
%cd drive/MyDrive/Easy Voice Toolkit
!pip install -r requirements.txt

## Setup FFmpeg（Experimental）

```已经安装了FFmpeg的用户可直接跳过```

In [None]:
# 快速部署FFmpeg（实验性）
!static_ffmpeg -i file.mp4 ...

如果报错，请手动安装FFmpeg

## Set Dir

In [None]:
# Optional: uncomment to make the project folder the working directory.
#%cd /content/drive/MyDrive/Easy Voice Toolkit

import os
from typing import Optional

# Project root folder. The tool cells below append '/Tool_...' to this path
# and add the result to sys.path before importing from each tool.
Dir_Current: str = '/content/drive/MyDrive/Easy Voice Toolkit'

## Tool_AudioProcessor

In [None]:
import sys
sys.path.append(Dir_Current + '/Tool_AudioProcessor')

from Process import Audio_Processing

class Execute_Audio_Processing:
    '''
    Change media format to WAV and cut off the silent parts

    All settings are class attributes. Assign to them (for example
    ``Execute_Audio_Processing.Media_Dir_Input = '...'``) before calling
    ``Run()``; they are forwarded to ``Audio_Processing`` positionally, in
    the order in which they are declared here.
    '''
    Media_Dir_Input: str = ''   # Empty by default; set before calling Run()
    Media_Dir_Output: str = ''   # e.g. os.path.join(os.path.dirname(Media_Dir_Input), 'Output')
    Media_Format_Output: str = 'wav'   # choices = ['mp3', 'wav', ...]
    RMS_Threshold: float = -40.   # Default
    Audio_Length_Min: int = 5000   # Default
    Silent_Interval_Min: int = 300   # Default
    Hop_Size: int = 10   # Default
    Silence_Kept_Max: int = 1000   # Default

    @staticmethod
    def Run() -> None:
        '''
        Build an ``Audio_Processing`` object from the current class attributes,
        then call ``Convert_Media()`` followed by ``Slice_Audio()`` on it.

        Declared as a static method so the call works on the class as well as
        on an instance.
        '''
        Settings = Execute_Audio_Processing
        AudioConvertandSlice = Audio_Processing(
            Settings.Media_Dir_Input,
            Settings.Media_Dir_Output,
            Settings.Media_Format_Output,
            Settings.RMS_Threshold,
            Settings.Audio_Length_Min,
            Settings.Silent_Interval_Min,
            Settings.Hop_Size,
            Settings.Silence_Kept_Max
        )
        # Conversion runs first, slicing second.
        AudioConvertandSlice.Convert_Media()
        AudioConvertandSlice.Slice_Audio()

# Run the audio-processing step with the settings currently stored on the class.
Execute_Audio_Processing.Run()

## Tool_VoiceIdentifier

In [None]:
import sys
sys.path.append(Dir_Current + '/Tool_VoiceIdentifier')

from Identify import Voice_Identifying

class Execute_Voice_Identifying:
    '''
    Contrast the voice and filter out the similar ones

    All settings are class attributes. Assign to them before calling
    ``Run()``; they are forwarded to ``Voice_Identifying`` positionally, in
    the order in which they are declared here.
    '''
    Audio_Path_Std: str = ''   # Empty by default; set before calling Run()
    Audio_Dir_Input: str = ''   # e.g. Execute_Audio_Processing.Media_Dir_Output
    Audio_Dir_Output: str = ''
    Model_Dir: str = ''
    Model_Type: str = 'Ecapa-Tdnn'   # choices = ['Ecapa-Tdnn']
    Model_Name: str = 'small'   # choices = ['small']
    Feature_Method: str = 'spectrogram'   # choices = ['spectrogram', 'melspectrogram']
    DecisionThreshold: float = 0.84   # Recommended
    Duration_of_Audio: float = 4.20   # Recommended

    @staticmethod
    def Run() -> None:
        '''
        Build a ``Voice_Identifying`` object from the current class attributes,
        then call ``GetModel()`` followed by ``Inference()`` on it.

        Declared as a static method so the call works on the class as well as
        on an instance.
        '''
        Settings = Execute_Voice_Identifying
        AudioContrastInference = Voice_Identifying(
            Settings.Audio_Path_Std,
            Settings.Audio_Dir_Input,
            Settings.Audio_Dir_Output,
            Settings.Model_Dir,
            Settings.Model_Type,
            Settings.Model_Name,
            Settings.Feature_Method,
            Settings.DecisionThreshold,
            Settings.Duration_of_Audio
        )
        # The model is obtained first, inference runs second.
        AudioContrastInference.GetModel()
        AudioContrastInference.Inference()

# Run the voice-identification step with the settings currently stored on the class.
Execute_Voice_Identifying.Run()

## Tool_VoiceTranscriber

In [None]:
import sys
sys.path.append(Dir_Current + '/Tool_VoiceTranscriber')

from Transcribe import Voice_Transcribing

class Execute_Voice_Transcribing:
    '''
    Transcribe WAV content to SRT

    All settings are class attributes. Assign to them before calling
    ``Run()``; they are forwarded to ``Voice_Transcribing`` positionally, in
    the order in which they are declared here.
    '''
    Model_Name: str = 'small'   # choices = ['tiny', 'base', 'small', 'medium', 'large']
    Model_Dir: str = ''   # e.g. Execute_Voice_Identifying.Model_Dir
    WAV_Dir: str = ''   # e.g. Execute_Voice_Identifying.Audio_Dir_Output
    SRT_Dir: str = ''
    Verbose: bool = True   # Default
    Language: Optional[str] = None   # choices = ['zh', 'en', ..., None]
    Condition_on_Previous_Text: bool = True   # Default

    @staticmethod
    def Run() -> None:
        '''
        Build a ``Voice_Transcribing`` object from the current class attributes
        and call ``Transcriber()`` on it.

        Declared as a static method so the call works on the class as well as
        on an instance.
        '''
        Settings = Execute_Voice_Transcribing
        WAVtoSRT = Voice_Transcribing(
            Settings.Model_Name,
            Settings.Model_Dir,
            Settings.WAV_Dir,
            Settings.SRT_Dir,
            Settings.Verbose,
            Settings.Language,
            Settings.Condition_on_Previous_Text
        )
        WAVtoSRT.Transcriber()

# Run the transcription step with the settings currently stored on the class.
Execute_Voice_Transcribing.Run()

## Tool_DatasetCreator

In [None]:
import sys
sys.path.append(Dir_Current + '/Tool_DatasetCreator')

from Create import Dataset_Creating

class Execute_Dataset_Creating:
    '''
    Convert the whisper-generated SRT to CSV and split the WAV

    All settings are class attributes. Assign to them before calling
    ``Run()``; they are forwarded to ``Dataset_Creating`` positionally, in
    the order in which they are declared here.
    '''
    SRT_Dir: str = ''   # e.g. Execute_Voice_Transcribing.SRT_Dir
    WAV_Dir: str = ''   # e.g. Execute_Voice_Transcribing.WAV_Dir
    Sample_Rate: int = 22050   # For vits
    Subtype: str = 'PCM_16'   # For vits
    WAV_Dir_Split: str = ''
    Encoder: str = 'VITS'   # choices = ['VITS']
    IsSpeakerMultiple: bool = False   # Default
    FileList_Path_Training: str = ''   # e.g. './FileLists/Train_FileList.txt'
    FileList_Path_Validation: str = ''   # e.g. './FileLists/Val_FileList.txt'

    @staticmethod
    def Run() -> None:
        '''
        Build a ``Dataset_Creating`` object from the current class attributes
        and call ``CallingFunctions()`` on it.

        Declared as a static method so the call works on the class as well as
        on an instance.
        '''
        Settings = Execute_Dataset_Creating
        SRTtoCSVandSplitAudio = Dataset_Creating(
            Settings.SRT_Dir,
            Settings.WAV_Dir,
            Settings.Sample_Rate,
            Settings.Subtype,
            Settings.WAV_Dir_Split,
            Settings.Encoder,
            Settings.IsSpeakerMultiple,
            Settings.FileList_Path_Training,
            Settings.FileList_Path_Validation
        )
        SRTtoCSVandSplitAudio.CallingFunctions()

# Run the dataset-creation step with the settings currently stored on the class.
Execute_Dataset_Creating.Run()

## Tool_VoiceEncoder

In [None]:
import sys
sys.path.append(Dir_Current + '/Tool_VoiceEncoder')

from Encode import Voice_Encoding

class Execute_Voice_Encoding:
    '''
    Preprocess and then start training

    All settings are class attributes. Assign to them before calling
    ``Run()``; they are forwarded to ``Voice_Encoding`` positionally, in
    the order in which they are declared here.
    '''
    FileList_Path_Validation: str = ''   # e.g. Execute_Dataset_Creating.FileList_Path_Validation
    FileList_Path_Training: str = ''   # e.g. Execute_Dataset_Creating.FileList_Path_Training
    Language: str = 'chinese'   # Default
    Config_Path_Load: Optional[str] = None   # choices = ['%CustomConfigPath%', None]
    Config_Dir_Save: str = ''
    Eval_Interval: int = 1000   # Recommended
    Epochs: int = 10000   # Recommended
    Batch_Size: int = 8   # choices = [Power of 2]
    FP16_Run: bool = True   # Recommended
    # Boolean flag with the same type and default as
    # Execute_Dataset_Creating.IsSpeakerMultiple, whose value it should carry.
    IsSpeakerMultiple: bool = False   # e.g. Execute_Dataset_Creating.IsSpeakerMultiple
    N_Speakers: int = 0   # Default
    Speakers: str = ''   # choices = ['%SpeakerName1%', '%SpeakerName2%', ...]
    Num_Workers: int = 8   # Default
    Model_Dir: str = ''
    Model_Name: str = ''

    @staticmethod
    def Run() -> None:
        '''
        Build a ``Voice_Encoding`` object from the current class attributes
        and call ``Preprocessing_and_Training()`` on it.

        Declared as a static method so the call works on the class as well as
        on an instance.
        '''
        Settings = Execute_Voice_Encoding
        PreprocessandTrain = Voice_Encoding(
            Settings.FileList_Path_Validation,
            Settings.FileList_Path_Training,
            Settings.Language,
            Settings.Config_Path_Load,
            Settings.Config_Dir_Save,
            Settings.Eval_Interval,
            Settings.Epochs,
            Settings.Batch_Size,
            Settings.FP16_Run,
            Settings.IsSpeakerMultiple,
            Settings.N_Speakers,
            Settings.Speakers,
            Settings.Num_Workers,
            Settings.Model_Dir,
            Settings.Model_Name
        )
        PreprocessandTrain.Preprocessing_and_Training()

# Run preprocessing and training with the settings currently stored on the class.
Execute_Voice_Encoding.Run()