### Copy clips into per-speaker directories, trim non-speech, and save them as WAV files resampled to 24000 Hz

In [2]:
import numpy as np
import pandas as pd
import os
import torchaudio
import soundfile as sf
import silero_vad
from tqdm import tqdm
from pathlib import Path
from pydub import AudioSegment


In [3]:
# Root folder of the Common Voice voice-conversion dataset on this machine.
dataset_root = Path("C:/Users/quang/Desktop/Dataset/Voice_Conversion/Data_common_voice")

# Output folder that receives one sub-directory per speaker.
data_dir = dataset_root / "data"
# Folder holding the source audio clips.
data_clips_dir = dataset_root / "clips"
# CSV read by the processing cell (uses its "files" and "count" columns).
speaker_path = dataset_root / "speakers.csv"

In [4]:
def clean_data(waveform, model, sampling_rate=32000, tail_padding=6000):
    """Trim leading and trailing non-speech from a waveform using Silero VAD.

    The waveform is cut from the start of the first detected speech segment
    to the end of the last one, plus ``tail_padding`` extra samples so the
    end of the utterance is not clipped.

    Args:
        waveform: Audio tensor indexed as ``(channels, samples)``; the
            second axis is the one that gets sliced.  Presumably mono, as
            produced by ``torchaudio.load`` on the dataset clips -- confirm
            for other inputs.
        model: Silero VAD model, e.g. from ``silero_vad.load_silero_vad()``.
        sampling_rate: Sample rate of ``waveform`` in Hz, forwarded to the
            VAD.  Silero VAD only supports 8000, 16000 or a multiple of
            16000 and raises ``ValueError`` otherwise.
        tail_padding: Number of samples kept after the end of the last
            speech segment (clamped to the end of the waveform).

    Returns:
        The trimmed waveform, or ``waveform`` unchanged when the VAD detects
        no speech at all.
    """
    speech_timestamps = silero_vad.get_speech_timestamps(
        waveform,
        model,
        return_seconds=False,
        sampling_rate=sampling_rate,
    )

    # No speech detected: there is nothing to anchor the cut on, so keep the
    # clip as it is instead of failing on speech_timestamps[0].
    if not speech_timestamps:
        return waveform

    num_samples = waveform.shape[1]

    start_sample = int(speech_timestamps[0]["start"])
    if start_sample >= num_samples:
        start_sample = 0

    # Never let the padded end run past the end of the audio.
    end_sample = min(int(speech_timestamps[-1]["end"]) + tail_padding, num_samples)

    return waveform[:, start_sample:end_sample]

In [None]:
# clean_data (as called here, without an explicit rate) runs the VAD assuming
# 32 kHz audio, so every clip is brought to that rate before trimming.
# Otherwise the VAD would interpret clips recorded at another rate incorrectly.
VAD_SAMPLE_RATE = 32000
# Sample rate of the WAV files written to the speaker directories.
TARGET_SAMPLE_RATE = 24000

speaker_csv = pd.read_csv(speaker_path)
model = silero_vad.load_silero_vad()

# The 32 kHz -> 24 kHz conversion is identical for every clip: build it once
# instead of once per file.
to_target_rate = torchaudio.transforms.Resample(
    orig_freq=VAD_SAMPLE_RATE, new_freq=TARGET_SAMPLE_RATE
)

# The "files" column stores a stringified list such as "['a.mp3', 'b.mp3']";
# these are the characters removed to recover the bare, comma-separated names.
list_syntax = str.maketrans("", "", "[]' ")

for index, row in speaker_csv.iterrows():
    # Drop empty names so an empty list ("[]") does not yield a bogus "" entry.
    files = [name for name in row["files"].translate(list_syntax).split(",") if name]
    count = row["count"]
    speaker_count = index + 1
    speaker_string = "{:03d}".format(speaker_count)

    # One zero-padded directory per speaker (001, 002, ...).
    speaker_dir = Path(data_dir) / speaker_string
    speaker_dir.mkdir(parents=True, exist_ok=True)

    for file in files:
        file_source = os.path.join(data_clips_dir, file)
        # Only swap the file extension; never touch the rest of the path.
        file_target = str((speaker_dir / file).with_suffix(".wav"))

        # open the source audio file
        waveform, sample_rate = torchaudio.load(file_source)
        # bring the clip to the rate the VAD is run at
        if sample_rate != VAD_SAMPLE_RATE:
            waveform = torchaudio.transforms.Resample(
                orig_freq=sample_rate, new_freq=VAD_SAMPLE_RATE
            )(waveform)
        # cut leading/trailing non-speech
        waveform = clean_data(waveform, model)
        # resample audio to 24000 Hz
        waveform = to_target_rate(waveform)
        # save audio file in wav format
        torchaudio.save(file_target, waveform, sample_rate=TARGET_SAMPLE_RATE, format="wav")

    # NOTE(review): the speaker whose count is <= 10 is still processed before
    # the loop stops; presumably speakers.csv is sorted by descending count --
    # confirm that this is the intended cut-off.
    if count <= 10:
        break

ValueError: Currently silero VAD models support 8000 and 16000 (or multiply of 16000) sample rates