## Configure Colab

- 找到上方菜单栏“代码执行程序”——>弹窗底部“更改运行时类型”，选择GPU作为硬件加速器



- 防止断开连接：按住 Ctrl+Shift 再按下 I 呼出弹窗，于控制台内输入以下内容并回车：
```javascript
function ConnectButton()
{
    console.log("Connect pushed"); 
    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click()
}
setInterval(ConnectButton,60000);
```



## Mount Google Drive for Colab

In [None]:
# Mount Google Drive at /content/drive using the Colab-provided helper module.
from google.colab import drive
drive.mount('/content/drive')

## Setup Environment

In [None]:
!apt-get update``
!sudo apt install portaudio19-dev
%cd drive/MyDrive/Easy Voice Toolkit
!pip install -r requirements.txt

## Set Dir

In [None]:
#%cd /content/drive/MyDrive/Easy Voice Toolkit

#import os
from typing import Optional

## Tool_AudioProcessor
该工具会将媒体文件批量转换为音频文件然后自动切除音频的静音部分

In [None]:
#import sys
#sys.path.append(os.path.join(os.getcwd(), "Tool_AudioProcessor"))

from Tool_AudioProcessor.Process import Audio_Processing

class Execute_Audio_Processing:
    '''
    Change media format to WAV and cut off the silent parts

    The class attributes are the user-editable settings. Run() forwards them
    positionally, in declaration order, to Audio_Processing.
    '''
    Media_Dir_Input: str = ''   # media input directory
    Media_Dir_Output: str = ''   # media output directory
    Media_Format_Output: str = 'wav'   # media output format, choices = ['mp3', 'wav', ...]
    RMS_Threshold: float = -40.   # RMS threshold (dB)
    Audio_Length_Min: int = 5000   # minimum audio length (ms)
    Silent_Interval_Min: int = 300   # minimum silence interval (ms)
    Hop_Size: int = 10   # hop size (ms)
    Silence_Kept_Max: int = 1000   # maximum silence length kept (ms)

    # Declared static because Run() takes no instance: without the decorator
    # it only works when called on the class and fails on an instance.
    @staticmethod
    def Run():
        '''
        Build an Audio_Processing from the class-level settings, then call
        Convert_Media() followed by Slice_Audio() on it.
        '''
        Settings = Execute_Audio_Processing
        Processor = Audio_Processing(
            Settings.Media_Dir_Input,
            Settings.Media_Dir_Output,
            Settings.Media_Format_Output,
            Settings.RMS_Threshold,
            Settings.Audio_Length_Min,
            Settings.Silent_Interval_Min,
            Settings.Hop_Size,
            Settings.Silence_Kept_Max
        )
        Processor.Convert_Media()
        Processor.Slice_Audio()

# Run the audio-processing tool with the class-level settings of Execute_Audio_Processing.
Execute_Audio_Processing.Run()

## Tool_VoiceIdentifier
该工具会在不同说话人的音频中批量筛选出属于同一说话人的音频

In [None]:
#import sys
#sys.path.append(os.path.join(os.getcwd(), 'Tool_VoiceIdentifier'))

from Tool_VoiceIdentifier.Identify import Voice_Identifying

class Execute_Voice_Identifying:
    '''
    Contrast the voice and filter out the similar ones

    The class attributes are the user-editable settings. Run() forwards them
    positionally, in declaration order, to Voice_Identifying.
    '''
    Audio_Path_Std: str = ''   # path of the reference (standard) audio
    Audio_Dir_Input: str = ''   # audio input directory
    # Previously missing although Run() reads it, which raised AttributeError.
    # NOTE(review): assumed to be a directory path like Audio_Dir_Input -
    # confirm against the Voice_Identifying signature.
    Audio_Dir_Output: str = ''   # audio output directory
    Model_Dir: str = ''   # directory where the model is stored
    Model_Type: str = 'Ecapa-Tdnn'   # model type, choices = ['Ecapa-Tdnn']
    Model_Name: str = 'small'   # model name, choices = ['small']
    Feature_Method: str = 'spectrogram'   # feature extraction method, choices = ['spectrogram', 'melspectrogram']
    DecisionThreshold: float = 0.84   # decision threshold (recommended value)
    Duration_of_Audio: float = 3.00   # audio duration (recommended value)

    # Declared static because Run() takes no instance: without the decorator
    # it only works when called on the class and fails on an instance.
    @staticmethod
    def Run():
        '''
        Build a Voice_Identifying from the class-level settings, then call
        GetModel() followed by Inference() on it.
        '''
        Settings = Execute_Voice_Identifying
        Identifier = Voice_Identifying(
            Settings.Audio_Path_Std,
            Settings.Audio_Dir_Input,
            Settings.Audio_Dir_Output,
            Settings.Model_Dir,
            Settings.Model_Type,
            Settings.Model_Name,
            Settings.Feature_Method,
            Settings.DecisionThreshold,
            Settings.Duration_of_Audio
        )
        Identifier.GetModel()
        Identifier.Inference()

# Run the voice-identification tool with the class-level settings of Execute_Voice_Identifying.
Execute_Voice_Identifying.Run()

## Tool_VoiceTranscriber
该工具会将语音文件的内容批量转换为带时间戳的文本并以字幕文件的形式保存

In [None]:
#import sys
#sys.path.append(os.path.join(os.getcwd(), 'Tool_VoiceTranscriber'))

from Tool_VoiceTranscriber.Transcribe import Voice_Transcribing

class Execute_Voice_Transcribing:
    '''
    Transcribe WAV content to SRT

    The class attributes are the user-editable settings. Run() forwards them
    positionally, in declaration order, to Voice_Transcribing.
    '''
    Model_Name: str = 'small'   # model name, choices = ['tiny', 'base', 'small', 'medium', 'large']
    Model_Dir: str = ''   # directory where the model is stored
    WAV_Dir: str = ''   # audio directory
    SRT_Dir: str = ''   # subtitle output directory
    Verbose: bool = True   # enable log output
    Language: Optional[str] = None   # language used, choices = ['zh', 'en', ..., None]
    Condition_on_Previous_Text: bool = True   # keep consistency with the preceding text
    fp16: bool = True   # use half precision (fp16)

    # Declared static because Run() takes no instance: without the decorator
    # it only works when called on the class and fails on an instance.
    @staticmethod
    def Run():
        '''
        Build a Voice_Transcribing from the class-level settings and call
        Transcriber() on it.
        '''
        Settings = Execute_Voice_Transcribing
        Transcription = Voice_Transcribing(
            Settings.Model_Name,
            Settings.Model_Dir,
            Settings.WAV_Dir,
            Settings.SRT_Dir,
            Settings.Verbose,
            Settings.Language,
            Settings.Condition_on_Previous_Text,
            Settings.fp16
        )
        Transcription.Transcriber()

# Run the voice-transcription tool with the class-level settings of Execute_Voice_Transcribing.
Execute_Voice_Transcribing.Run()

## Tool_DatasetCreator
该工具会生成适用于语音模型训练的数据集

In [None]:
#import sys
#sys.path.append(os.path.join(os.getcwd(), 'Tool_DatasetCreator'))

from Tool_DatasetCreator.Create import Dataset_Creating

class Execute_Dataset_Creating:
    '''
    Convert the whisper-generated SRT to CSV and split the WAV

    The class attributes are the user-editable settings. Run() forwards them
    positionally, in declaration order, to Dataset_Creating.
    '''
    SRT_Dir: str = ''   # subtitle input directory, e.g. Execute_Voice_Transcribing.SRT_Dir
    WAV_Dir: str = ''   # audio input directory
    Sample_Rate: int = 22050   # sample rate (Hz)
    Subtype: str = 'PCM_16'   # sample format
    WAV_Dir_Split: str = ''   # audio output directory
    Encoder: str = 'VITS'   # encoder, choices = ['VITS']
    IsSpeakerMultiple: bool = False   # whether there are multiple speakers
    FileList_Path_Training: str = ''   # path of the training-set text file
    FileList_Path_Validation: str = ''   # path of the validation-set text file

    # Declared static because Run() takes no instance: without the decorator
    # it only works when called on the class and fails on an instance.
    @staticmethod
    def Run():
        '''
        Build a Dataset_Creating from the class-level settings and call
        CallingFunctions() on it.
        '''
        Settings = Execute_Dataset_Creating
        Creator = Dataset_Creating(
            Settings.SRT_Dir,
            Settings.WAV_Dir,
            Settings.Sample_Rate,
            Settings.Subtype,
            Settings.WAV_Dir_Split,
            Settings.Encoder,
            Settings.IsSpeakerMultiple,
            Settings.FileList_Path_Training,
            Settings.FileList_Path_Validation
        )
        Creator.CallingFunctions()

# Run the dataset-creation tool with the class-level settings of Execute_Dataset_Creating.
Execute_Dataset_Creating.Run()

## Tool_VoiceEncoder
该工具会训练出适用于语音合成的模型文件

In [None]:
#import sys
#sys.path.append(os.path.join(os.getcwd(), 'Tool_VoiceEncoder'))

from Tool_VoiceEncoder.Encode import Voice_Encoding

class Execute_Voice_Encoding:
    '''
    Preprocess and then start training

    The class attributes are the user-editable settings. Run() forwards them
    positionally, in declaration order, to Voice_Encoding.
    '''
    # NOTE(review): validation is passed before training here, the reverse of
    # Execute_Dataset_Creating. The order is kept as-is; confirm it matches
    # the Voice_Encoding signature.
    FileList_Path_Validation: str = ''   # path of the validation-set text file
    FileList_Path_Training: str = ''   # path of the training-set text file
    Language: str = 'chinese'   # language used
    Config_Path_Load: Optional[str] = None   # path to load the config from
    Config_Dir_Save: str = ''   # directory to save the config to
    Eval_Interval: int = 1000   # save interval
    Epochs: int = 10000   # number of epochs
    Batch_Size: int = 8   # batch size, choices = [Power of 2]
    FP16_Run: bool = True   # half-precision training
    IsSpeakerMultiple: bool = False   # whether there are multiple speakers
    N_Speakers: int = 0   # number of speakers
    Speakers: list = ['']   # speaker names, format = ['%SpeakerName1%', '%SpeakerName2%', ...]
    Num_Workers: int = 8   # number of worker processes
    Model_Path_Pretrained_G: Optional[str] = None   # path of the pretrained G model
    Model_Path_Pretrained_D: Optional[str] = None   # path of the pretrained D model
    Model_Dir_Save: str = ''   # directory to save the model to
    Model_Name_Save: str = ''   # name to save the model under

    # Declared static because Run() takes no instance: without the decorator
    # it only works when called on the class and fails on an instance.
    @staticmethod
    def Run():
        '''
        Build a Voice_Encoding from the class-level settings and call
        Preprocessing_and_Training() on it.
        '''
        Settings = Execute_Voice_Encoding
        Trainer = Voice_Encoding(
            Settings.FileList_Path_Validation,
            Settings.FileList_Path_Training,
            Settings.Language,
            Settings.Config_Path_Load,
            Settings.Config_Dir_Save,
            Settings.Eval_Interval,
            Settings.Epochs,
            Settings.Batch_Size,
            Settings.FP16_Run,
            Settings.IsSpeakerMultiple,
            Settings.N_Speakers,
            Settings.Speakers,
            Settings.Num_Workers,
            Settings.Model_Path_Pretrained_G,
            Settings.Model_Path_Pretrained_D,
            Settings.Model_Dir_Save,
            Settings.Model_Name_Save
        )
        Trainer.Preprocessing_and_Training()

# Run the voice-encoder preprocessing and training with the class-level settings of Execute_Voice_Encoding.
Execute_Voice_Encoding.Run()